/* Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
   for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
   <http://www.gnu.org/licenses/>. */

#ifndef _VMX2SPU_H_
#define _VMX2SPU_H_ 1

#ifdef __cplusplus

#ifdef __SPU__

#include <spu_intrinsics.h>
#include <vec_types.h>

/* This file maps generic VMX intrinsics and predicates to the SPU using
 * overloaded C++ functions.
 */
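
/* Illustrative note: VMX source such as
 *
 *   vec_float4 x = vec_madd(a, b, c);
 *
 * compiles for the SPU unchanged, since vec_madd below simply forwards to
 * spu_madd. Operations with no single SPU equivalent are emulated with
 * short sequences of SPU intrinsics.
 */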

/************************************************************************
 *                            INTRINSICS
 ************************************************************************/

/* vec_abs (vector absolute value)
 * =======
 */
static inline vec_char16 vec_abs(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abs(vec_short8 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abs(vec_int4 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_float4 vec_abs(vec_float4 a)
{
  return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
}
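
/* Note (illustrative): the byte variant computes -a as ((~a) & 0x7F) + 1,
 * done within halfwords (add of 0x101) so no carry crosses a byte
 * boundary, and the float variant clears the sign bit by shifting it out
 * and back (spu_sl then spu_rlmask) rather than comparing. For
 * a = {-1, 2, -3, 4}, vec_abs(a) yields {1, 2, 3, 4}.
 */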

/* vec_abss (vector absolute value saturate)
 * ========
 */
static inline vec_char16 vec_abss(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
                                (vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abss(vec_short8 a)
{
  vec_short8 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abss(vec_int4 a)
{
  vec_int4 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}
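
/* Note (illustrative): the saturating form differs from vec_abs only for
 * the most negative element; e.g. a short element of 0x8000 produces
 * 0x7FFF here instead of wrapping back to 0x8000.
 */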

/* vec_add (vector add)
 * =======
 */
static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
                                spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
                                spu_splats((unsigned short)(0xFF00)))));
}

static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
{
  return (spu_add((vec_short8)(a), b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
{
  return (spu_add(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
{
  return (spu_add((vec_int4)(a), b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
{
  return (spu_add(a, (vec_int4)(b)));
}

static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
{
  return (spu_add(a, b));
}

/* vec_addc (vector add carryout unsigned word)
 * ========
 */
#define vec_addc(_a, _b) spu_genc(_a, _b)

/* vec_adds (vector add saturated)
 * ========
 */
static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s = spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
                                         8, 24, 10, 26, 12, 28, 14, 30}));
  d = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                         9, 25, 11, 27, 13, 29, 15, 31}));
  return (spu_or(d, spu_cmpeq(s, 1)));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                         9, 25, 11, 27, 13, 29, 15, 31}));
  d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
  d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
  return ((vec_char16)(d));
}

static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
{
  return (vec_adds((vec_char16)(a), b));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
{
  return (vec_adds(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 s, d;

  s = spu_add(a, b);
  d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
  return (d);
}

static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
{
  vec_short8 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
  return (d);
}

static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
{
  return (vec_adds((vec_short8)(a), b));
}

static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
{
  return (vec_adds(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
{
  vec_int4 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
  return (d);
}

static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
{
  return (vec_adds((vec_int4)(a), b));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
{
  return (vec_adds(a, (vec_int4)(b)));
}
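
/* Note (illustrative): the signed forms detect overflow from sign bits
 * alone: (s & ~a & ~b) < 0 flags positive overflow and (~s & a & b) < 0
 * flags negative overflow, so e.g. vec_adds on int elements
 * 0x7FFFFFFF + 1 yields 0x7FFFFFFF rather than wrapping.
 */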

/* vec_and (vector logical and)
 * =======
 */
static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
{
  return (spu_and((vec_char16)(a), b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
{
  return (spu_and(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
{
  return (spu_and((vec_short8)(a), b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
{
  return (spu_and(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
{
  return (spu_and((vec_int4)(a), b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_int4)(b)));
}

static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
{
  return (spu_and(a, b));
}

static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
{
  return (spu_and((vec_float4)(a), b));
}

static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_float4)(b)));
}


/* vec_andc (vector logical and with complement)
 * ========
 */
static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
{
  return (spu_andc((vec_char16)(a), b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
{
  return (spu_andc(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
{
  return (spu_andc((vec_short8)(a), b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
{
  return (spu_andc(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
{
  return (spu_andc((vec_int4)(a), b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_int4)(b)));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
{
  return (spu_andc(a, b));
}

static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
{
  return (spu_andc((vec_float4)(a), b));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_float4)(b)));
}

/* vec_avg (vector average)
 * =======
 */
static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_avg(a, b));
}

static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
                               (vec_uchar16)(spu_and(spu_xor(a, b), 0x80)))));
}

static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}
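
/* Note (illustrative): (a >> 1) + (b >> 1) + ((a | b) & 1) computes the
 * rounded average (a + b + 1) >> 1 without overflowing the intermediate
 * sum; e.g. vec_avg of unsigned int elements 3 and 4 is 4.
 */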

/* vec_ceil (vector ceiling)
 * ========
 */
static inline vec_float4 vec_ceil(vec_float4 a)
{
  vec_int4 exp;
  vec_uint4 mask;

  a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}
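
/* Note (illustrative): adding 0x3F7FFFFF (just under 1.0f) to
 * non-negative inputs biases them so that truncation rounds upward; the
 * mask, derived from the biased exponent, then clears every fraction bit
 * below the units place. Values of magnitude 2^23 or more are already
 * integral and pass through unchanged.
 */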

/* vec_cmpb (vector compare bounds floating-point)
 * ========
 */
static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
{
  vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
  vec_int4 b1 = (vec_int4)spu_splats(0x40000000);

  return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
                 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
}

/* vec_cmpeq (vector compare equal)
 * =========
 */
#define vec_cmpeq(_a, _b) spu_cmpeq(_a, _b)


/* vec_cmpge (vector compare greater than or equal)
 * =========
 */
static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(b, a), -1));
}


/* vec_cmpgt (vector compare greater than)
 * =========
 */
#define vec_cmpgt(_a, _b) spu_cmpgt(_a, _b)


/* vec_cmple (vector compare less than or equal)
 * =========
 */
static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(a, b), -1));
}


/* vec_cmplt (vector compare less than)
 * =========
 */
#define vec_cmplt(_a, _b) spu_cmpgt(_b, _a)


/* vec_ctf (vector convert from fixed-point word)
 * =======
 */
#define vec_ctf(_a, _b) spu_convtf(_a, _b)


/* vec_cts (vector convert to signed fixed-point word saturate)
 * =======
 */
#define vec_cts(_a, _b) spu_convts(_a, _b)


/* vec_ctu (vector convert to unsigned fixed-point word saturate)
 * =======
 */
#define vec_ctu(_a, _b) spu_convtu(_a, _b)


/* vec_dss (vector data stream stop)
 * =======
 */
#define vec_dss(_a)


/* vec_dssall (vector data stream stop all)
 * ==========
 */
#define vec_dssall()


/* vec_dst (vector data stream touch)
 * =======
 */
#define vec_dst(_a, _b, _c)


/* vec_dstst (vector data stream touch for store)
 * =========
 */
#define vec_dstst(_a, _b, _c)


/* vec_dststt (vector data stream touch for store transient)
 * ==========
 */
#define vec_dststt(_a, _b, _c)


/* vec_dstt (vector data stream touch transient)
 * ========
 */
#define vec_dstt(_a, _b, _c)

/* vec_expte (vector 2 raised to the exponent estimate floating-point)
 * =========
 */
static inline vec_float4 vec_expte(vec_float4 a)
{
  vec_float4 bias, frac, exp;
  vec_int4 ia;

  bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
  ia   = spu_convts(spu_add(a, bias), 0);
  frac = spu_sub(spu_convtf(ia, 0), a);
  exp  = (vec_float4)(spu_sl(spu_add(ia, 127), 23));

  return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
                           frac, spu_splats(1.0f)), exp));
}


/* vec_floor (vector floor)
 * =========
 */
static inline vec_float4 vec_floor(vec_float4 a)
{
  vec_int4 exp;
  vec_uint4 mask;

  a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}


/* vec_ld (vector load indexed)
 * ======
 */
static inline vec_uchar16 vec_ld(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
{
  return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
}

static inline vec_char16 vec_ld(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_char16 vec_ld(int a, vec_char16 *b)
{
  return (*((vec_char16 *)((signed char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, vec_short8 *b)
{
  return (*((vec_short8 *)((signed char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, vec_int4 *b)
{
  return (*((vec_int4 *)((signed char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, vec_float4 *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}
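
/* Note (illustrative): SPU quadword loads ignore the four least
 * significant address bits, so these plain dereferences produce the same
 * 16-byte-aligned load that VMX vec_ld performs.
 */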

/* vec_lde (vector load element indexed)
 * =======
 */
static inline vec_uchar16 vec_lde(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_char16 vec_lde(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_ushort8 vec_lde(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_lde(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}


static inline vec_uint4 vec_lde(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_lde(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}


static inline vec_float4 vec_lde(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_ldl (vector load indexed LRU)
 * =======
 */
#define vec_ldl(_a, _b) vec_ld(_a, _b)


/* vec_loge (vector log2 estimate floating-point)
 * ========
 */
static inline vec_float4 vec_loge(vec_float4 a)
{
  vec_int4 exp;
  vec_float4 frac;

  exp  = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
  frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));

  return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
                   frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
}


/* vec_lvsl (vector load for shift left)
 * ========
 */
static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
{
  return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
                               ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
                                              0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
}

static inline vec_uchar16 vec_lvsl(int a, signed char *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, float *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}
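
/* Note (illustrative): splatting the low address nibble and adding the
 * byte pattern {0x00, 0x01, ..., 0x0F} (held as halfword pairs, with no
 * carry possible between bytes) reproduces the VMX permute control; for
 * an address ending in 3 the result is {3, 4, 5, ..., 18}.
 */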

/* vec_lvsr (vector load for shift right)
 * ========
 */
static inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
{
  return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
                                               0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
                                (vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
}

static inline vec_uchar16 vec_lvsr(int a, signed char *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, float *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

/* vec_madd (vector multiply add)
 * ========
 */
#define vec_madd(_a, _b, _c) spu_madd(_a, _b, _c)


/* vec_madds (vector multiply add saturate)
 * =========
 */
static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
                              (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
                              ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
}
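
/* Note (illustrative): the even- and odd-lane 32-bit products are each
 * reduced to (a*b) >> 15, merged with the 0/0xFFFF halfword selector, and
 * added to c with saturation via vec_adds, giving the VMX semantics of
 * ((a*b) >> 15) + c saturated per halfword.
 */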

/* vec_max (vector maximum)
 * =======
 */
static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
{
  return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
{
  return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}


/* vec_mergeh (vector merge high)
 * ==========
 */
static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

/* vec_mergel (vector merge low)
 * ==========
 */
static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

/* vec_mfvscr (vector move from vector status and control register)
 * ==========
 */
static inline vec_ushort8 vec_mfvscr()
{
  return ((vec_ushort8)spu_splats(0));		/* not supported */
}


/* vec_min (vector minimum)
 * =======
 */
static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
{
  return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
{
  return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

/* vec_mladd (vector multiply low and add unsigned half word)
 * =========
 */
static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
                                            (vec_short8)(spu_rl((vec_uint4)(b), -16)),
                                            (vec_int4)(spu_rl((vec_uint4)(c), -16))),
                                   spu_madd(a, b, spu_extend(c)),
                                   ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                                  10, 11, 26, 27, 14, 15, 30, 31}))));
}


static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
{
  return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
}

static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
{
  return (vec_mladd((vec_short8)(a), b, c));
}

static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
{
  return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
}


/* vec_mradds (vector multiply round and add saturate)
 * ==========
 */
static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  vec_int4 round = (vec_int4)spu_splats(0x4000);
  vec_short8 hi, lo;

  hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
  lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));

  return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
}


/* vec_msum (vector multiply sum)
 * ========
 */
static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
{
  vec_ushort8 a1, a2, b1, b2;
  vec_uint4 p1, p2;

  a1 = spu_and((vec_ushort8)(a), 0xFF);
  a2 = spu_rlmask((vec_ushort8)(a), -8);
  b1 = spu_and((vec_ushort8)(b), 0xFF);
  b2 = spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
{
  vec_short8 a1, a2, b1, b2;
  vec_int4 p1, p2;

  a1 = (vec_short8)(spu_extend(a));
  a2 = spu_rlmaska((vec_short8)(a), -8);
  b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
  b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}


/* vec_msums (vector multiply sum saturate)
 * =========
 */
static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  vec_uint4 p1, p2;

  p1 = spu_mulo(a, b);
  p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));

  return (vec_adds(p2, vec_adds(p1, c)));
}

static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

/* vec_mtvscr (vector move to vector status and control register)
 * ==========
 */
#define vec_mtvscr(_a)		/* not supported */


/* vec_mule (vector multiply even)
 * ========
 */
static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
                             (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
  lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
                             (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
{
  vec_short8 hi, lo;

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
  lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
                            (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
                   (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
}


static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
{
  return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
                   (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
}
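
/* Note (illustrative): the SPU multiplier only sees the odd (low)
 * halfword of each word, so the even-lane products above are formed by
 * first shifting the even halfwords down into the odd position before
 * spu_mulo.
 */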

/* vec_mulo (vector multiply odd)
 * ========
 */
static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
                             (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
  lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
{
  vec_short8 aa, bb, hi, lo;

  aa = spu_extend(a);
  bb = spu_extend(b);

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
  lo = (vec_short8)spu_mulo(aa, bb);
  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo(a, b));
}


static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
{
  return (spu_mulo(a, b));
}


/* vec_nmsub (vector negative multiply subtract)
 * =========
 */
#define vec_nmsub(_a, _b, _c) spu_nmsub(_a, _b, _c)


/* vec_nor (vector logical nor)
 * =======
 */
#define vec_nor(_a, _b) spu_nor(_a, _b)


/* vec_or (vector logical or)
 * ======
 */
static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
{
  return (spu_or((vec_char16)(a), b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
{
  return (spu_or(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
{
  return (spu_or((vec_short8)(a), b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
{
  return (spu_or(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
{
  return (spu_or((vec_int4)(a), b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_int4)(b)));
}

static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
{
  return (spu_or(a, b));
}

static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
{
  return (spu_or((vec_float4)(a), b));
}

static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_float4)(b)));
}


/* vec_pack (vector pack)
 * ========
 */
static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                        17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
{
  return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                       17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
{
  return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                        18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
{
  return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                       18, 19, 22, 23, 26, 27, 30, 31})));
}


/* vec_packpx (vector pack pixel)
 * ==========
 */
static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
  vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));

  return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
                                           spu_sl(a, 13), x001F),
                                   spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
                                           spu_sl(b, 13), x001F),
                                   ((vec_uchar16){ 0,  1,  4,  5,  8,  9, 12, 13,
                                                  16, 17, 20, 21, 24, 25, 28, 29}))));
}
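
/* Note (illustrative): each 8:8:8:8 word packs to a 1:5:5:5 halfword.
 * The low bit of the first byte supplies the one-bit field and each
 * colour byte contributes its five most significant bits; the three
 * shifted copies are merged under the 0x03FF/0x001F halfword masks and
 * the final shuffle keeps the high halfword of every word.
 */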

/* vec_packs (vector pack saturate)
 * =========
 */
static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 max = spu_splats((unsigned short)0x00FF);

  return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
                                    spu_sel(b, max, spu_cmpgt(b, 255)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x007F);
  vec_short8 min = spu_splats((signed short)0xFF80);

  return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);

  return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
                                    spu_sel(b, max, spu_cmpgt(b, max)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}

static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x00007FFF);
  vec_int4 min = spu_splats((signed int)0xFFFF8000);

  return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31}))));
}


/* vec_packsu (vector pack saturate unsigned)
 * ==========
 */
static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
                                   spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x00FF);
  vec_short8 min = spu_splats((signed short)0x0000);

  return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0xFFFF);

  return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
                                   spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x0000FFFF);
  vec_int4 min = spu_splats((signed int)0x00000000);

  return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}


/* vec_perm (vector permute)
 * ========
 */
static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
{
  return (spu_shuffle(a, b, spu_and(c, 0x1F)));
}

static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
{
  return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
{
  return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
{
  return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
{
  return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
{
  return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
{
  return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}
1546 1.1 mrg
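/* Note on vec_perm above: the control vector is masked with 0x1F because
 * spu_shuffle assigns special meanings to control bytes with the top bits
 * set (producing constant 0x00, 0xFF or 0x80 result bytes), while VMX
 * vperm honors only the low five bits.  Illustrative usage sketch, not
 * part of the original header; the helper name is hypothetical:
 */
static inline vec_uchar16 example_byte_reverse(vec_uchar16 v)
{
  return (vec_perm(v, v, ((vec_uchar16){15, 14, 13, 12, 11, 10, 9, 8,
                                         7,  6,  5,  4,  3,  2, 1, 0})));
}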
1547 1.1 mrg
1548 1.1 mrg /* vec_re (vector reciprocal estimate)
1549 1.1 mrg * ======
1550 1.1 mrg */
1551 1.1 mrg #define vec_re(_a) spu_re(_a)
1552 1.1 mrg
1553 1.1 mrg
1554 1.1 mrg /* vec_rl (vector rotate left)
1555 1.1 mrg * ======
1556 1.1 mrg */
1557 1.1 mrg static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
1558 1.1 mrg {
1559 1.1 mrg vec_ushort8 r1, r2;
1560 1.1 mrg
1561 1.1 mrg r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
1562 1.1 mrg r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1563 1.1 mrg return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)), spu_splats((unsigned short)0xFF))));
1564 1.1 mrg }
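
/* Note on the byte variant above: the SPU has no byte-granularity rotate,
 * so each byte lane is rotated within a halfword and the bits that wrap
 * across the byte boundary are folded back with a second shift before the
 * low- and high-byte results are merged.
 */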
1565 1.1 mrg
1566 1.1 mrg static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
1567 1.1 mrg {
1568 1.1 mrg return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
1569 1.1 mrg }
1570 1.1 mrg
1571 1.1 mrg static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
1572 1.1 mrg {
1573 1.1 mrg return (spu_rl(a, (vec_short8)(b)));
1574 1.1 mrg }
1575 1.1 mrg
1576 1.1 mrg static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
1577 1.1 mrg {
1578 1.1 mrg return (spu_rl(a, (vec_short8)(b)));
1579 1.1 mrg }
1580 1.1 mrg
1581 1.1 mrg static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
1582 1.1 mrg {
1583 1.1 mrg return (spu_rl(a, (vec_int4)(b)));
1584 1.1 mrg }
1585 1.1 mrg
1586 1.1 mrg static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
1587 1.1 mrg {
1588 1.1 mrg return (spu_rl(a, (vec_int4)(b)));
1589 1.1 mrg }
1590 1.1 mrg
1591 1.1 mrg
1592 1.1 mrg /* vec_round (vector round)
1593 1.1 mrg * =========
1594 1.1 mrg */
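/* The sequence below rounds to the nearest integral value, with halfway
 * cases going to the even neighbor: 0.5 (with the sign of each element) is
 * added, the fraction bits below the binary point are masked off, and one
 * (s_one, with matching sign) is subtracted again when the biased value
 * was exactly integral and odd, i.e. the element was a halfway case that
 * would otherwise round to an odd integer.
 */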
1595 1.1 mrg static inline vec_float4 vec_round(vec_float4 a)
1596 1.1 mrg {
1597 1.1 mrg vec_float4 s_half, s_one, d;
1598 1.1 mrg vec_uint4 odd;
1599 1.1 mrg vec_uint4 msb = spu_splats((unsigned int)0x80000000);
1600 1.1 mrg vec_float4 half = spu_splats(0.5f);
1601 1.1 mrg vec_int4 exp;
1602 1.1 mrg vec_uint4 mask;
1603 1.1 mrg
1604 1.1 mrg s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
1605 1.1 mrg a = spu_add(a, s_half);
1606 1.1 mrg s_one = spu_add(s_half, s_half);
1607 1.1 mrg exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
1608 1.1 mrg mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
1609 1.1 mrg mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
1610 1.1 mrg mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
1611 1.1 mrg
1612 1.1 mrg odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
1613 1.1 mrg s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
1614 1.1 mrg s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
1615 1.1 mrg (vec_float4)spu_cmpeq(odd, 1)));
1616 1.1 mrg d = spu_andc(a, (vec_float4)(mask));
1617 1.1 mrg d = spu_sub(d, s_one);
1618 1.1 mrg return (d);
1619 1.1 mrg }
1620 1.1 mrg
1621 1.1 mrg /* vec_rsqrte (vector reciprocal square root estimate)
1622 1.1 mrg * ==========
1623 1.1 mrg */
1624 1.1 mrg #define vec_rsqrte(_a) spu_rsqrte(_a)
1625 1.1 mrg
1626 1.1 mrg
1627 1.1 mrg /* vec_sel (vector select)
1628 1.1 mrg * =======
1629 1.1 mrg */
1630 1.1 mrg #define vec_sel(_a, _b, _c) spu_sel(_a, _b, _c)
1631 1.1 mrg
1632 1.1 mrg
1633 1.1 mrg /* vec_sl (vector shift left)
1634 1.1 mrg * ======
1635 1.1 mrg */
1636 1.1 mrg static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
1637 1.1 mrg {
1638 1.1 mrg vec_ushort8 hi, lo;
1639 1.1 mrg
1640 1.1 mrg lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
1641 1.1 mrg hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
1642 1.1 mrg
1643 1.1 mrg return ((vec_uchar16)(spu_or(hi, lo)));
1644 1.1 mrg }
1645 1.1 mrg
1646 1.1 mrg static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
1647 1.1 mrg {
1648 1.1 mrg return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
1649 1.1 mrg }
1650 1.1 mrg
1651 1.1 mrg static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
1652 1.1 mrg {
1653 1.1 mrg return (spu_sl(a, spu_and(b, 15)));
1654 1.1 mrg }
1655 1.1 mrg
1656 1.1 mrg static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
1657 1.1 mrg {
1658 1.1 mrg return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
1659 1.1 mrg }
1660 1.1 mrg
1661 1.1 mrg static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
1662 1.1 mrg {
1663 1.1 mrg return (spu_sl(a, spu_and(b, 31)));
1664 1.1 mrg }
1665 1.1 mrg
1666 1.1 mrg static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
1667 1.1 mrg {
1668 1.1 mrg return (spu_sl(a, spu_and(b, 31)));
1669 1.1 mrg }
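
/* Note on the halfword and word variants above: the shift counts are
 * masked (& 15, & 31) because VMX shifts use the count modulo the element
 * size, while the SPU shift instructions shift the value out entirely for
 * oversized counts.
 */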
1670 1.1 mrg
1671 1.1 mrg
1672 1.1 mrg /* vec_sld (vector shift left double)
1673 1.1 mrg * =======
1674 1.1 mrg */
1675 1.1 mrg #define vec_sld(_a, _b, _c) spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c), 1+(_c), 2+(_c), 3+(_c), \
1676 1.1 mrg 4+(_c), 5+(_c), 6+(_c), 7+(_c), \
1677 1.1 mrg 8+(_c), 9+(_c), 10+(_c), 11+(_c), \
1678 1.1 mrg 12+(_c), 13+(_c), 14+(_c), 15+(_c)}))
1679 1.1 mrg
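/* Illustrative usage sketch, not part of the original header: vec_sld
 * concatenates a and b and extracts 16 bytes starting _c bytes in, so the
 * shift amount should be a compile-time constant in 0..15.  A hypothetical
 * helper that shifts one word out of a and one word of b in:
 */
static inline vec_uint4 example_shift_in_word(vec_uint4 a, vec_uint4 b)
{
  return (vec_sld(a, b, 4));
}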
1680 1.1 mrg
1681 1.1 mrg /* vec_sll (vector shift left long)
1682 1.1 mrg * =======
1683 1.1 mrg */
1684 1.1 mrg #define vec_sll(_a, _b) spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))
1685 1.1 mrg
1686 1.1 mrg
1687 1.1 mrg /* vec_slo (vector shift left by octet)
1688 1.1 mrg * =======
1689 1.1 mrg */
1690 1.1 mrg #define vec_slo(_a, _b) spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)
1691 1.1 mrg
1692 1.1 mrg
1693 1.1 mrg /* vec_splat (vector splat)
1694 1.1 mrg * =========
1695 1.1 mrg */
1696 1.1 mrg #define vec_splat(_a, _b) spu_splats(spu_extract(_a, _b))
1697 1.1 mrg
1698 1.1 mrg
1699 1.1 mrg /* vec_splat_s8 (vector splat signed byte)
1700 1.1 mrg * ============
1701 1.1 mrg */
1702 1.1 mrg #define vec_splat_s8(_a) spu_splats((signed char)(_a))
1703 1.1 mrg
1704 1.1 mrg
1705 1.1 mrg /* vec_splat_s16 (vector splat signed half-word)
1706 1.1 mrg * =============
1707 1.1 mrg */
1708 1.1 mrg #define vec_splat_s16(_a) spu_splats((signed short)(_a))
1709 1.1 mrg
1710 1.1 mrg
1711 1.1 mrg /* vec_splat_s32 (vector splat signed word)
1712 1.1 mrg * =============
1713 1.1 mrg */
1714 1.1 mrg #define vec_splat_s32(_a) spu_splats((signed int)(_a))
1715 1.1 mrg
1716 1.1 mrg
1717 1.1 mrg /* vec_splat_u8 (vector splat unsigned byte)
1718 1.1 mrg * ============
1719 1.1 mrg */
1720 1.1 mrg #define vec_splat_u8(_a) spu_splats((unsigned char)(_a))
1721 1.1 mrg
1722 1.1 mrg
1723 1.1 mrg /* vec_splat_u16 (vector splat unsigned half-word)
1724 1.1 mrg * =============
1725 1.1 mrg */
1726 1.1 mrg #define vec_splat_u16(_a) spu_splats((unsigned short)(_a))
1727 1.1 mrg
1728 1.1 mrg
1729 1.1 mrg /* vec_splat_u32 (vector splat unsigned word)
1730 1.1 mrg * =============
1731 1.1 mrg */
1732 1.1 mrg #define vec_splat_u32(_a) spu_splats((unsigned int)(_a))
1733 1.1 mrg
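/* Illustrative usage sketch, not part of the original header: vec_splat
 * replicates a single element across the whole vector; as with the VMX
 * intrinsic, the element index should be a compile-time constant.  The
 * helper name is hypothetical:
 */
static inline vec_uint4 example_broadcast_word2(vec_uint4 v)
{
  return (vec_splat(v, 2));
}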
1734 1.1 mrg
1735 1.1 mrg /* vec_sr (vector shift right)
1736 1.1 mrg * ======
1737 1.1 mrg */
1738 1.1 mrg static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
1739 1.1 mrg {
1740 1.1 mrg vec_ushort8 hi, lo;
1741 1.1 mrg
1742 1.1 mrg lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
1743 1.1 mrg hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1744 1.1 mrg
1745 1.1 mrg return ((vec_uchar16)(spu_or(hi, lo)));
1746 1.1 mrg }
1747 1.1 mrg
1748 1.1 mrg static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
1749 1.1 mrg {
1750 1.1 mrg return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
1751 1.1 mrg }
1752 1.1 mrg
1753 1.1 mrg static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
1754 1.1 mrg {
1755 1.1 mrg return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1756 1.1 mrg }
1757 1.1 mrg
1758 1.1 mrg static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
1759 1.1 mrg {
1760 1.1 mrg return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
1761 1.1 mrg }
1762 1.1 mrg
1763 1.1 mrg static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
1764 1.1 mrg {
1765 1.1 mrg return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1766 1.1 mrg }
1767 1.1 mrg
1768 1.1 mrg static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
1769 1.1 mrg {
1770 1.1 mrg return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
1771 1.1 mrg }
1772 1.1 mrg
1773 1.1 mrg
1774 1.1 mrg /* vec_sra (vector shift right algebraic)
1775 1.1 mrg * =======
1776 1.1 mrg */
1777 1.1 mrg static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
1778 1.1 mrg {
1779 1.1 mrg vec_short8 hi, lo;
1780 1.1 mrg
1781 1.1 mrg lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
1782 1.1 mrg hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);
1783 1.1 mrg
1784 1.1 mrg return ((vec_char16)(spu_or(hi, lo)));
1785 1.1 mrg }
1786 1.1 mrg
1787 1.1 mrg static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
1788 1.1 mrg {
1789 1.1 mrg return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
1790 1.1 mrg }
1791 1.1 mrg
1792 1.1 mrg static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
1793 1.1 mrg {
1794 1.1 mrg return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
1795 1.1 mrg }
1796 1.1 mrg
1797 1.1 mrg static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
1798 1.1 mrg {
1799 1.1 mrg return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
1800 1.1 mrg }
1801 1.1 mrg
1802 1.1 mrg static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
1803 1.1 mrg {
1804 1.1 mrg return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
1805 1.1 mrg }
1806 1.1 mrg
1807 1.1 mrg static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
1808 1.1 mrg {
1809 1.1 mrg return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
1810 1.1 mrg }
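
/* Note on vec_sr and vec_sra above: the SPU has no native right-shift
 * instructions, so both are built from rotate-and-mask operations
 * (spu_rlmask / spu_rlmaska) with negated shift counts.  Illustrative
 * usage sketch, not part of the original header (hypothetical helper
 * dividing each signed word by 16, rounding toward minus infinity):
 */
static inline vec_int4 example_div16_floor(vec_int4 v)
{
  return (vec_sra(v, spu_splats((unsigned int)4)));
}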
1811 1.1 mrg
1812 1.1 mrg
1813 1.1 mrg /* vec_srl (vector shift right long)
1814 1.1 mrg * =======
1815 1.1 mrg */
1816 1.1 mrg #define vec_srl(_a, _b) spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))
1817 1.1 mrg
1818 1.1 mrg
1819 1.1 mrg /* vec_sro (vector shift right by octet)
1820 1.1 mrg * =======
1821 1.1 mrg */
1822 1.1 mrg #define vec_sro(_a, _b) spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))
1823 1.1 mrg
1824 1.1 mrg /* vec_st (vector store indexed)
1825 1.1 mrg * ======
1826 1.1 mrg */
1827 1.1 mrg static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
1828 1.1 mrg {
1829 1.1 mrg *((vec_uchar16 *)(c+b)) = a;
1830 1.1 mrg }
1831 1.1 mrg
1832 1.1 mrg static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
1833 1.1 mrg {
1834 1.1 mrg *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
1835 1.1 mrg }
1836 1.1 mrg
1837 1.1 mrg static inline void vec_st(vec_char16 a, int b, signed char *c)
1838 1.1 mrg {
1839 1.1 mrg *((vec_char16 *)(c+b)) = a;
1840 1.1 mrg }
1841 1.1 mrg
1842 1.1 mrg static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
1843 1.1 mrg {
1844 1.1 mrg *((vec_char16 *)((signed char *)(c)+b)) = a;
1845 1.1 mrg }
1846 1.1 mrg
1847 1.1 mrg static inline void vec_st(vec_bchar16 a, int b, signed char *c)
1848 1.1 mrg {
1849 1.1 mrg *((vec_bchar16 *)((signed char *)(c)+b)) = a;
1850 1.1 mrg }
1851 1.1 mrg
1852 1.1 mrg static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
1853 1.1 mrg {
1854 1.1 mrg *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1855 1.1 mrg }
1856 1.1 mrg
1857 1.1 mrg static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
1858 1.1 mrg {
1859 1.1 mrg *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
1860 1.1 mrg }
1861 1.1 mrg
1862 1.1 mrg static inline void vec_st(vec_short8 a, int b, signed short *c)
1863 1.1 mrg {
1864 1.1 mrg *((vec_short8 *)((unsigned char *)(c)+b)) = a;
1865 1.1 mrg }
1866 1.1 mrg
1867 1.1 mrg static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
1868 1.1 mrg {
1869 1.1 mrg *((vec_short8 *)((signed char *)(c)+b)) = a;
1870 1.1 mrg }
1871 1.1 mrg
1872 1.1 mrg static inline void vec_st(vec_bshort8 a, int b, signed short *c)
1873 1.1 mrg {
1874 1.1 mrg *((vec_bshort8 *)((signed char *)(c)+b)) = a;
1875 1.1 mrg }
1876 1.1 mrg
1877 1.1 mrg static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
1878 1.1 mrg {
1879 1.1 mrg *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1880 1.1 mrg }
1881 1.1 mrg
1882 1.1 mrg static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
1883 1.1 mrg {
1884 1.1 mrg *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
1885 1.1 mrg }
1886 1.1 mrg
1887 1.1 mrg static inline void vec_st(vec_int4 a, int b, signed int *c)
1888 1.1 mrg {
1889 1.1 mrg *((vec_int4 *)((unsigned char *)(c)+b)) = a;
1890 1.1 mrg }
1891 1.1 mrg
1892 1.1 mrg static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
1893 1.1 mrg {
1894 1.1 mrg *((vec_int4 *)((signed char *)(c)+b)) = a;
1895 1.1 mrg }
1896 1.1 mrg
1897 1.1 mrg static inline void vec_st(vec_bint4 a, int b, signed int *c)
1898 1.1 mrg {
1899 1.1 mrg *((vec_bint4 *)((signed char *)(c)+b)) = a;
1900 1.1 mrg }
1901 1.1 mrg
1902 1.1 mrg static inline void vec_st(vec_float4 a, int b, float *c)
1903 1.1 mrg {
1904 1.1 mrg *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1905 1.1 mrg }
1906 1.1 mrg
1907 1.1 mrg static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
1908 1.1 mrg {
1909 1.1 mrg *((vec_float4 *)((unsigned char *)(c)+b)) = a;
1910 1.1 mrg }
1911 1.1 mrg
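/* Note on vec_st above: no explicit address masking is needed because SPU
 * quadword stores ignore the low four bits of the effective address, which
 * matches the VMX behavior of storing to the enclosing aligned quadword.
 */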
1912 1.1 mrg
1913 1.1 mrg /* vec_ste (vector store element indexed)
1914 1.1 mrg * =======
1915 1.1 mrg */
1916 1.1 mrg static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
1917 1.1 mrg {
1918 1.1 mrg unsigned char *ptr;
1919 1.1 mrg
1920 1.1 mrg ptr = c + b;
1921 1.1 mrg *ptr = spu_extract(a, (int)(ptr) & 15);
1922 1.1 mrg }
1923 1.1 mrg
1924 1.1 mrg static inline void vec_ste(vec_char16 a, int b, signed char *c)
1925 1.1 mrg {
1926 1.1 mrg vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1927 1.1 mrg }
1928 1.1 mrg
1929 1.1 mrg static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
1930 1.1 mrg {
1931 1.1 mrg vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
1932 1.1 mrg }
1933 1.1 mrg
1934 1.1 mrg static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
1935 1.1 mrg {
1936 1.1 mrg unsigned short *ptr;
1937 1.1 mrg
1938 1.1 mrg ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
1939 1.1 mrg *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
1940 1.1 mrg }
1941 1.1 mrg
1942 1.1 mrg static inline void vec_ste(vec_short8 a, int b, signed short *c)
1943 1.1 mrg {
1944 1.1 mrg vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1945 1.1 mrg }
1946 1.1 mrg
1947 1.1 mrg static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
1948 1.1 mrg {
1949 1.1 mrg vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
1950 1.1 mrg }
1951 1.1 mrg
1952 1.1 mrg static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
1953 1.1 mrg {
1954 1.1 mrg unsigned int *ptr;
1955 1.1 mrg
1956 1.1 mrg ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
1957 1.1 mrg *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
1958 1.1 mrg }
1959 1.1 mrg
1960 1.1 mrg static inline void vec_ste(vec_int4 a, int b, signed int *c)
1961 1.1 mrg {
1962 1.1 mrg vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1963 1.1 mrg }
1964 1.1 mrg
1965 1.1 mrg static inline void vec_ste(vec_bint4 a, int b, signed int *c)
1966 1.1 mrg {
1967 1.1 mrg vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1968 1.1 mrg }
1969 1.1 mrg
1970 1.1 mrg static inline void vec_ste(vec_float4 a, int b, float *c)
1971 1.1 mrg {
1972 1.1 mrg vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
1973 1.1 mrg }
1974 1.1 mrg
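/* Illustrative usage sketch, not part of the original header: vec_ste
 * stores only the element whose position corresponds to the naturally
 * aligned effective address, as VMX defines.  The helper name is
 * hypothetical:
 */
static inline void example_store_lane(vec_float4 v, int byte_offset, float *out)
{
  vec_ste(v, byte_offset, out);
}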
1975 1.1 mrg
1976 1.1 mrg /* vec_stl (vector store indexed LRU)
1977 1.1 mrg * =======
1978 1.1 mrg */
1979 1.1 mrg #define vec_stl(_a, _b, _c) vec_st(_a, _b, _c)
1980 1.1 mrg
1981 1.1 mrg
1982 1.1 mrg /* vec_sub (vector subtract)
1983 1.1 mrg * =======
1984 1.1 mrg */
1985 1.1 mrg static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
1986 1.1 mrg {
1987 1.1 mrg return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
1988 1.1 mrg spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
1989 1.1 mrg spu_splats((unsigned short)0xFF00))));
1990 1.1 mrg }
1991 1.1 mrg
1992 1.1 mrg static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
1993 1.1 mrg {
1994 1.1 mrg return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
1995 1.1 mrg }
1996 1.1 mrg
1997 1.1 mrg static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
1998 1.1 mrg {
1999 1.1 mrg return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2000 1.1 mrg }
2001 1.1 mrg
2002 1.1 mrg static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
2003 1.1 mrg {
2004 1.1 mrg return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
2005 1.1 mrg }
2006 1.1 mrg
2007 1.1 mrg static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
2008 1.1 mrg {
2009 1.1 mrg return (spu_sub(a, b));
2010 1.1 mrg }
2011 1.1 mrg
2012 1.1 mrg static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
2013 1.1 mrg {
2014 1.1 mrg return (spu_sub(a, b));
2015 1.1 mrg }
2016 1.1 mrg
2017 1.1 mrg static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
2018 1.1 mrg {
2019 1.1 mrg return (spu_sub((vec_short8)(a), b));
2020 1.1 mrg }
2021 1.1 mrg
2022 1.1 mrg static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
2023 1.1 mrg {
2024 1.1 mrg return (spu_sub(a, (vec_short8)(b)));
2025 1.1 mrg }
2026 1.1 mrg
2027 1.1 mrg static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
2028 1.1 mrg {
2029 1.1 mrg return (spu_sub(a, b));
2030 1.1 mrg }
2031 1.1 mrg
2032 1.1 mrg static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
2033 1.1 mrg {
2034 1.1 mrg return (spu_sub(a, b));
2035 1.1 mrg }
2036 1.1 mrg
2037 1.1 mrg static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
2038 1.1 mrg {
2039 1.1 mrg return (spu_sub((vec_int4)(a), b));
2040 1.1 mrg }
2041 1.1 mrg
2042 1.1 mrg static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
2043 1.1 mrg {
2044 1.1 mrg return (spu_sub(a, (vec_int4)(b)));
2045 1.1 mrg }
2046 1.1 mrg
2047 1.1 mrg static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
2048 1.1 mrg {
2049 1.1 mrg return (spu_sub(a, b));
2050 1.1 mrg }
2051 1.1 mrg
2052 1.1 mrg
2053 1.1 mrg /* vec_subc (vector subtract carryout)
2054 1.1 mrg * ========
2055 1.1 mrg */
2056 1.1 mrg #define vec_subc(_a, _b) spu_genb(_a, _b)
2057 1.1 mrg
2058 1.1 mrg
2059 1.1 mrg /* vec_subs (vector subtract saturate)
2060 1.1 mrg * ========
2061 1.1 mrg */
2062 1.1 mrg static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
2063 1.1 mrg {
2064 1.1 mrg vec_ushort8 s1, s2;
2065 1.1 mrg vec_uchar16 s, d;
2066 1.1 mrg
2067 1.1 mrg s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2068 1.1 mrg s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2069 1.1 mrg s = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
2070 1.1 mrg 8, 24, 10, 26, 12, 28, 14, 30})));
2071 1.1 mrg d = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
2072 1.1 mrg 9, 25, 11, 27, 13, 29, 15, 31})));
2073 1.1 mrg return (spu_andc(d, s));
2074 1.1 mrg }
2075 1.1 mrg
2076 1.1 mrg static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
2077 1.1 mrg {
2078 1.1 mrg vec_ushort8 s1, s2;
2079 1.1 mrg vec_uchar16 s, d;
2080 1.1 mrg
2081 1.1 mrg s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
2082 1.1 mrg s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
2083 1.1 mrg s = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
2084 1.1 mrg 9, 25, 11, 27, 13, 29, 15, 31})));
2085 1.1 mrg d = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
2086 1.1 mrg d = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));
2087 1.1 mrg
2088 1.1 mrg return ((vec_char16)(d));
2089 1.1 mrg }
2090 1.1 mrg
2091 1.1 mrg static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
2092 1.1 mrg {
2093 1.1 mrg return (vec_subs((vec_char16)(a), b));
2094 1.1 mrg }
2095 1.1 mrg
2096 1.1 mrg static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
2097 1.1 mrg {
2098 1.1 mrg return (vec_subs(a, (vec_char16)(b)));
2099 1.1 mrg }
2100 1.1 mrg
2101 1.1 mrg static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
2102 1.1 mrg {
2103 1.1 mrg return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2104 1.1 mrg }
2105 1.1 mrg
2106 1.1 mrg static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
2107 1.1 mrg {
2108 1.1 mrg vec_short8 s;
2109 1.1 mrg vec_short8 d;
2110 1.1 mrg
2111 1.1 mrg s = spu_sub(a, b);
2112 1.1 mrg d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
2113 1.1 mrg d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));
2114 1.1 mrg
2115 1.1 mrg return (d);
2116 1.1 mrg }
2117 1.1 mrg
2118 1.1 mrg static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
2119 1.1 mrg {
2120 1.1 mrg return ((vec_short8)(vec_subs((vec_short8)(a), b)));
2121 1.1 mrg }
2122 1.1 mrg
2123 1.1 mrg static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
2124 1.1 mrg {
2125 1.1 mrg return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
2126 1.1 mrg }
2127 1.1 mrg
2128 1.1 mrg static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
2129 1.1 mrg {
2130 1.1 mrg return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
2131 1.1 mrg }
2132 1.1 mrg
2133 1.1 mrg static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
2134 1.1 mrg {
2135 1.1 mrg vec_int4 s;
2136 1.1 mrg vec_int4 d;
2137 1.1 mrg
2138 1.1 mrg s = spu_sub(a, b);
2139 1.1 mrg d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
2140 1.1 mrg d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));
2141 1.1 mrg
2142 1.1 mrg return (d);
2143 1.1 mrg }
2144 1.1 mrg
2145 1.1 mrg static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
2146 1.1 mrg {
2147 1.1 mrg return ((vec_int4)(vec_subs((vec_int4)(a), b)));
2148 1.1 mrg }
2149 1.1 mrg
2150 1.1 mrg static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
2151 1.1 mrg {
2152 1.1 mrg return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
2153 1.1 mrg }
2154 1.1 mrg
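/* Note on the signed vec_subs variants above: overflow of s = a - b is
 * detected from sign bits alone.  spu_nor(a, spu_nand(s, b)) has its sign
 * bit set exactly when a >= 0, b < 0 and s < 0 (positive overflow, clamp
 * to the maximum), and spu_and(a, spu_nor(s, b)) exactly when a < 0,
 * b >= 0 and s >= 0 (negative overflow, clamp to the minimum).  The
 * unsigned variants instead clear any element whose subtraction borrowed.
 */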
2155 1.1 mrg
2156 1.1 mrg /* vec_sum4s (vector sum across partial (1/4) saturated)
2157 1.1 mrg * =========
2158 1.1 mrg */
2159 1.1 mrg static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
2160 1.1 mrg {
2161 1.1 mrg vec_uint4 a01_23, a0123;
2162 1.1 mrg
2163 1.1 mrg a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
2164 1.1 mrg spu_and((vec_ushort8)(a), 0xFF)));
2165 1.1 mrg a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
2166 1.1 mrg return (vec_adds(a0123, b));
2167 1.1 mrg }
2168 1.1 mrg
2169 1.1 mrg static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
2170 1.1 mrg {
2171 1.1 mrg vec_int4 a01_23, a0123;
2172 1.1 mrg
2173 1.1 mrg a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
2174 1.1 mrg spu_extend(a)));
2175 1.1 mrg a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
2176 1.1 mrg return (vec_adds(a0123, b));
2177 1.1 mrg }
2178 1.1 mrg
2179 1.1 mrg static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
2180 1.1 mrg {
2181 1.1 mrg vec_int4 a0123;
2182 1.1 mrg
2183 1.1 mrg a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
2184 1.1 mrg return (vec_adds(a0123, b));
2185 1.1 mrg }
2186 1.1 mrg
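/* Illustrative usage sketch, not part of the original header: vec_sum4s
 * adds each aligned group of four bytes (or two halfwords) of a into the
 * corresponding word of b with saturation.  The helper name is
 * hypothetical:
 */
static inline vec_uint4 example_accumulate_bytes(vec_uchar16 bytes, vec_uint4 acc)
{
  return (vec_sum4s(bytes, acc));
}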
2187 1.1 mrg
2188 1.1 mrg /* vec_sum2s (vector sum across partial (1/2) saturated)
2189 1.1 mrg * =========
2190 1.1 mrg */
2191 1.1 mrg static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
2192 1.1 mrg {
2193 1.1 mrg vec_int4 c, d;
2194 1.1 mrg vec_int4 sign1, sign2, sign3;
2195 1.1 mrg vec_int4 carry, sum_l, sum_h, sat, sat_val;
2196 1.1 mrg
2197 1.1 mrg sign1 = spu_rlmaska(a, -31);
2198 1.1 mrg sign2 = spu_rlmaska(b, -31);
2199 1.1 mrg
2200 1.1 mrg c = spu_rlqwbyte(a, -4);
2201 1.1 mrg sign3 = spu_rlqwbyte(sign1, -4);
2202 1.1 mrg
2203 1.1 mrg carry = spu_genc(a, b);
2204 1.1 mrg sum_l = spu_add(a, b);
2205 1.1 mrg sum_h = spu_addx(sign1, sign2, carry);
2206 1.1 mrg
2207 1.1 mrg carry = spu_genc(sum_l, c);
2208 1.1 mrg sum_l = spu_add(sum_l, c);
2209 1.1 mrg sum_h = spu_addx(sum_h, sign3, carry);
2210 1.1 mrg
2211 1.1 mrg sign1 = spu_rlmaska(sum_l, -31);
2212 1.1 mrg sign2 = spu_rlmaska(sum_h, -31);
2213 1.1 mrg
2214 1.1 mrg sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));
2215 1.1 mrg
2216 1.1 mrg sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));
2217 1.1 mrg
2218 1.1 mrg d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});
2219 1.1 mrg
2220 1.1 mrg return (d);
2221 1.1 mrg }
2222 1.1 mrg
2223 1.1 mrg
2224 1.1 mrg /* vec_sums (vector sum saturated)
2225 1.1 mrg * ========
2226 1.1 mrg */
2227 1.1 mrg static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
2228 1.1 mrg {
2229 1.1 mrg vec_int4 a0, a1, a2, c0, c1, c2, d;
2230 1.1 mrg vec_int4 sign_a, sign_b, sign_l, sign_h;
2231 1.1 mrg vec_int4 sum_l, sum_h, sat, sat_val;
2232 1.1 mrg
2233 1.1 mrg sign_a = spu_rlmaska(a, -31);
2234 1.1 mrg sign_b = spu_rlmaska(b, -31);
2235 1.1 mrg
2236 1.1 mrg a0 = spu_rlqwbyte(a, -12);
2237 1.1 mrg a1 = spu_rlqwbyte(a, -8);
2238 1.1 mrg a2 = spu_rlqwbyte(a, -4);
2239 1.1 mrg
2240 1.1 mrg sum_l = spu_add(a, b);
2241 1.1 mrg sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));
2242 1.1 mrg
2243 1.1 mrg c2 = spu_genc(sum_l, a2);
2244 1.1 mrg sum_l = spu_add(sum_l, a2);
2245 1.1 mrg sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);
2246 1.1 mrg
2247 1.1 mrg c1 = spu_genc(sum_l, a1);
2248 1.1 mrg sum_l = spu_add(sum_l, a1);
2249 1.1 mrg sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);
2250 1.1 mrg
2251 1.1 mrg c0 = spu_genc(sum_l, a0);
2252 1.1 mrg sum_l = spu_add(sum_l, a0);
2253 1.1 mrg sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);
2254 1.1 mrg
2255 1.1 mrg sign_l = spu_rlmaska(sum_l, -31);
2256 1.1 mrg sign_h = spu_rlmaska(sum_h, -31);
2257 1.1 mrg
2258 1.1 mrg sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));
2259 1.1 mrg
2260 1.1 mrg sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));
2261 1.1 mrg
2262 1.1 mrg d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));
2263 1.1 mrg
2264 1.1 mrg return (d);
2265 1.1 mrg }
2266 1.1 mrg
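/* Note on vec_sum2s and vec_sums above: partial sums are accumulated in a
 * 64-bit-wide (sum_h:sum_l) form built from explicit carries (spu_genc /
 * spu_addx) and replicated sign words, then saturated back to 32 bits.
 * vec_sums leaves its result in element 3 only, and vec_sum2s in elements
 * 1 and 3, as VMX defines.
 */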
2267 1.1 mrg
2268 1.1 mrg /* vec_trunc (vector truncate)
2269 1.1 mrg * =========
2270 1.1 mrg */
2271 1.1 mrg static inline vec_float4 vec_trunc(vec_float4 a)
2272 1.1 mrg {
2273 1.1 mrg vec_int4 exp;
2274 1.1 mrg vec_uint4 mask;
2275 1.1 mrg
2276 1.1 mrg exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
2277 1.1 mrg mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
2278 1.1 mrg mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
2279 1.1 mrg mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
2280 1.1 mrg return (spu_andc(a, (vec_float4)(mask)));
2281 1.1 mrg }
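
/* Note on vec_trunc above: exp counts the fraction bits below the binary
 * point, and the computed mask clears exactly those bits (all 32 bits when
 * |a| < 1, none once |a| >= 2^23), truncating toward zero.  Because the
 * sign bit is cleared along with the rest for |a| < 1, negative fractions
 * truncate to +0.0.
 */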
2282 1.1 mrg
2283 1.1 mrg /* vec_unpackh (vector unpack high element)
2284 1.1 mrg * ===========
2285 1.1 mrg */
2286 1.1 mrg static inline vec_short8 vec_unpackh(vec_char16 a)
2287 1.1 mrg {
2288 1.1 mrg return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
2289 1.1 mrg 4, 4, 5, 5, 6, 6, 7, 7}))));
2290 1.1 mrg }
2291 1.1 mrg
2292 1.1 mrg static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
2293 1.1 mrg {
2294 1.1 mrg return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
2295 1.1 mrg }
2296 1.1 mrg
2297 1.1 mrg static inline vec_int4 vec_unpackh(vec_short8 a)
2298 1.1 mrg {
2299 1.1 mrg return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
2300 1.1 mrg 0, 0, 4, 5, 0, 0, 6, 7}))));
2301 1.1 mrg }
2302 1.1 mrg
2303 1.1 mrg #ifdef SUPPORT_UNPACK_PIXEL
2304 1.1 mrg /* Due to type conflicts, unpacking of pixel types and boolean shorts
2305 1.1 mrg  * cannot be supported simultaneously. By default, the boolean short is
2306 1.1 mrg * supported.
2307 1.1 mrg */
2308 1.1 mrg static inline vec_uint4 vec_unpackh(vec_pixel8 a)
2309 1.1 mrg {
2310 1.1 mrg vec_ushort8 p1, p2;
2311 1.1 mrg
2312 1.1 mrg p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2313 1.1 mrg spu_and((vec_ushort8)(a.p), 0x1F),
2314 1.1 mrg ((vec_uchar16){ 0, 128, 128, 17, 2, 128, 128, 19,
2315 1.1 mrg 4, 128, 128, 21, 6, 128, 128, 23}));
2316 1.1 mrg p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2317 1.1 mrg spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2318 1.1 mrg ((vec_uchar16){ 128, 17, 1, 128, 128, 19, 3, 128,
2319 1.1 mrg 128, 21, 5, 128, 128, 23, 7, 128}));
2320 1.1 mrg return ((vec_uint4)(spu_or(p1, p2)));
2321 1.1 mrg }
2322 1.1 mrg
2323 1.1 mrg #else
2324 1.1 mrg
2325 1.1 mrg static inline vec_bint4 vec_unpackh(vec_bshort8 a)
2326 1.1 mrg {
2327 1.1 mrg return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
2328 1.1 mrg }
2329 1.1 mrg #endif
2330 1.1 mrg
2331 1.1 mrg
2335 1.1 mrg /* vec_unpackl (vector unpack low element)
2336 1.1 mrg * ===========
2337 1.1 mrg */
2338 1.1 mrg static inline vec_short8 vec_unpackl(vec_char16 a)
2339 1.1 mrg {
2340 1.1 mrg return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){8, 8, 9, 9, 10, 10, 11, 11,
2341 1.1 mrg 12, 12, 13, 13, 14, 14, 15, 15}))));
2342 1.1 mrg }
2343 1.1 mrg
2344 1.1 mrg static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
2345 1.1 mrg {
2346 1.1 mrg return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
2347 1.1 mrg }
2348 1.1 mrg
2350 1.1 mrg static inline vec_int4 vec_unpackl(vec_short8 a)
2351 1.1 mrg {
2352 1.1 mrg return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 8, 9, 0, 0, 10, 11,
2353 1.1 mrg 0, 0,12,13, 0, 0, 14, 15}))));
2354 1.1 mrg }
2355 1.1 mrg
2357 1.1 mrg #ifdef SUPPORT_UNPACK_PIXEL
2358 1.1 mrg /* Due to type conflicts, unpacking of pixel types and boolean shorts
2359 1.1 mrg  * cannot be supported simultaneously. By default, the boolean short is
2360 1.1 mrg * supported.
2361 1.1 mrg */
2362 1.1 mrg static inline vec_uint4 vec_unpackl(vec_pixel8 a)
2363 1.1 mrg {
2364 1.1 mrg vec_ushort8 p1, p2;
2365 1.1 mrg
2366 1.1 mrg   p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
2367 1.1 mrg 		   spu_and((vec_ushort8)(a.p), 0x1F),
2368 1.1 mrg ((vec_uchar16){ 8, 128, 128, 25, 10, 128, 128, 27,
2369 1.1 mrg 12, 128, 128, 29, 14, 128, 128, 31}));
2370 1.1 mrg   p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
2371 1.1 mrg 		   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
2372 1.1 mrg ((vec_uchar16){ 128, 25, 9, 128, 128, 27, 11, 128,
2373 1.1 mrg 128, 29, 13, 128, 128, 31, 15, 128}));
2374 1.1 mrg return ((vec_uint4)(spu_or(p1, p2)));
2375 1.1 mrg }
2376 1.1 mrg
2377 1.1 mrg #else
2378 1.1 mrg
2379 1.1 mrg static inline vec_bint4 vec_unpackl(vec_bshort8 a)
2380 1.1 mrg {
2381 1.1 mrg return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
2383 1.1 mrg }
2384 1.1 mrg #endif
2385 1.1 mrg
2386 1.1 mrg
2388 1.1 mrg /* vec_xor (vector logical xor)
2389 1.1 mrg * ======
2390 1.1 mrg  * =======
2391 1.1 mrg static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
2392 1.1 mrg {
2393 1.1 mrg return (spu_xor(a, b));
2394 1.1 mrg }
2395 1.1 mrg
2396 1.1 mrg static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
2397 1.1 mrg {
2398 1.1 mrg return (spu_xor(a, b));
2399 1.1 mrg }
2400 1.1 mrg
2401 1.1 mrg static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
2402 1.1 mrg {
2403 1.1 mrg return (spu_xor((vec_char16)(a), b));
2404 1.1 mrg }
2405 1.1 mrg
2406 1.1 mrg static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
2407 1.1 mrg {
2408 1.1 mrg return (spu_xor(a, (vec_char16)(b)));
2409 1.1 mrg }
2410 1.1 mrg
2411 1.1 mrg static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
2412 1.1 mrg {
2413 1.1 mrg return (spu_xor(a, b));
2414 1.1 mrg }
2415 1.1 mrg
2416 1.1 mrg static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
2417 1.1 mrg {
2418 1.1 mrg return (spu_xor(a, b));
2419 1.1 mrg }
2420 1.1 mrg
2421 1.1 mrg static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
2422 1.1 mrg {
2423 1.1 mrg return (spu_xor((vec_short8)(a), b));
2424 1.1 mrg }
2425 1.1 mrg
2426 1.1 mrg static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
2427 1.1 mrg {
2428 1.1 mrg return (spu_xor(a, (vec_short8)(b)));
2429 1.1 mrg }
2430 1.1 mrg
2431 1.1 mrg static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
2432 1.1 mrg {
2433 1.1 mrg return (spu_xor(a, b));
2434 1.1 mrg }
2435 1.1 mrg
2436 1.1 mrg static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
2437 1.1 mrg {
2438 1.1 mrg return (spu_xor(a, b));
2439 1.1 mrg }
2440 1.1 mrg
2441 1.1 mrg static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
2442 1.1 mrg {
2443 1.1 mrg return (spu_xor((vec_int4)(a), b));
2444 1.1 mrg }
2445 1.1 mrg
2446 1.1 mrg static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
2447 1.1 mrg {
2448 1.1 mrg return (spu_xor(a, (vec_int4)(b)));
2449 1.1 mrg }
2450 1.1 mrg
2451 1.1 mrg static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
2452 1.1 mrg {
2453 1.1 mrg return (spu_xor(a, b));
2454 1.1 mrg }
2455 1.1 mrg
2456 1.1 mrg static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
2457 1.1 mrg {
2458 1.1 mrg return (spu_xor((vec_float4)(a),b));
2459 1.1 mrg }
2460 1.1 mrg
2461 1.1 mrg static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
2462 1.1 mrg {
2463 1.1 mrg return (spu_xor(a, (vec_float4)(b)));
2464 1.1 mrg }
2465 1.1 mrg
2466 1.1 mrg /************************************************************************
2467 1.1 mrg * PREDICATES
2468 1.1 mrg ************************************************************************/
2469 1.1 mrg
2470 1.1 mrg /* vec_all_eq (all elements equal)
2471 1.1 mrg * ==========
2472 1.1 mrg */
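/* Note on the predicate idiom used below: spu_gather packs the most
 * significant bit of every element into the low bits of the preferred
 * word slot, so an all-true byte compare yields 0xFFFF, a halfword
 * compare 0xFF, and a word compare 0xF.
 */
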
2473 1.1 mrg static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
2474 1.1 mrg {
2475 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2476 1.1 mrg }
2477 1.1 mrg
2478 1.1 mrg static inline int vec_all_eq(vec_char16 a, vec_char16 b)
2479 1.1 mrg {
2480 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
2481 1.1 mrg }
2482 1.1 mrg
2483 1.1 mrg static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
2484 1.1 mrg {
2485 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
2486 1.1 mrg }
2487 1.1 mrg
2488 1.1 mrg static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
2489 1.1 mrg {
2490 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
2491 1.1 mrg }
2492 1.1 mrg
2493 1.1 mrg static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
2494 1.1 mrg {
2495 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2496 1.1 mrg }
2497 1.1 mrg
2498 1.1 mrg static inline int vec_all_eq(vec_short8 a, vec_short8 b)
2499 1.1 mrg {
2500 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
2501 1.1 mrg }
2502 1.1 mrg
2503 1.1 mrg static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
2504 1.1 mrg {
2505 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
2506 1.1 mrg }
2507 1.1 mrg
2508 1.1 mrg static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
2509 1.1 mrg {
2510 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
2511 1.1 mrg }
2512 1.1 mrg
2513 1.1 mrg static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
2514 1.1 mrg {
2515 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2516 1.1 mrg }
2517 1.1 mrg
2518 1.1 mrg static inline int vec_all_eq(vec_int4 a, vec_int4 b)
2519 1.1 mrg {
2520 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2521 1.1 mrg }
2522 1.1 mrg
2523 1.1 mrg static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
2524 1.1 mrg {
2525 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
2526 1.1 mrg }
2527 1.1 mrg
2528 1.1 mrg static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
2529 1.1 mrg {
2530 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
2531 1.1 mrg }
2532 1.1 mrg
2533 1.1 mrg static inline int vec_all_eq(vec_float4 a, vec_float4 b)
2534 1.1 mrg {
2535 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
2536 1.1 mrg }
2537 1.1 mrg
2538 1.1 mrg
2539 1.1 mrg /* vec_all_ge (all elements greater than or equal)
2540 1.1 mrg * ==========
2541 1.1 mrg */
2542 1.1 mrg static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
2543 1.1 mrg {
2544 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2545 1.1 mrg }
2546 1.1 mrg
2547 1.1 mrg static inline int vec_all_ge(vec_char16 a, vec_char16 b)
2548 1.1 mrg {
2549 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2550 1.1 mrg }
2551 1.1 mrg
2552 1.1 mrg static inline int vec_all_ge(vec_bchar16 a, vec_char16 b)
2553 1.1 mrg {
2554 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
2555 1.1 mrg }
2556 1.1 mrg
2557 1.1 mrg static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
2558 1.1 mrg {
2559 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
2560 1.1 mrg }
2561 1.1 mrg
2562 1.1 mrg static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
2563 1.1 mrg {
2564 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2565 1.1 mrg }
2566 1.1 mrg
2567 1.1 mrg static inline int vec_all_ge(vec_short8 a, vec_short8 b)
2568 1.1 mrg {
2569 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2570 1.1 mrg }
2571 1.1 mrg
2572 1.1 mrg static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
2573 1.1 mrg {
2574 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
2575 1.1 mrg }
2576 1.1 mrg
2577 1.1 mrg static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
2578 1.1 mrg {
2579 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
2580 1.1 mrg }
2581 1.1 mrg
2582 1.1 mrg static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
2583 1.1 mrg {
2584 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2585 1.1 mrg }
2586 1.1 mrg
2587 1.1 mrg static inline int vec_all_ge(vec_int4 a, vec_int4 b)
2588 1.1 mrg {
2589 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2590 1.1 mrg }
2591 1.1 mrg
2592 1.1 mrg static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
2593 1.1 mrg {
2594 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
2595 1.1 mrg }
2596 1.1 mrg
2597 1.1 mrg static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
2598 1.1 mrg {
2599 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
2600 1.1 mrg }
2601 1.1 mrg
2602 1.1 mrg static inline int vec_all_ge(vec_float4 a, vec_float4 b)
2603 1.1 mrg {
2604 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2605 1.1 mrg }
2606 1.1 mrg
2607 1.1 mrg
2608 1.1 mrg /* vec_all_gt (all elements greater than)
2609 1.1 mrg * ==========
2610 1.1 mrg */
2611 1.1 mrg static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
2612 1.1 mrg {
2613 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2614 1.1 mrg }
2615 1.1 mrg
2616 1.1 mrg static inline int vec_all_gt(vec_char16 a, vec_char16 b)
2617 1.1 mrg {
2618 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
2619 1.1 mrg }
2620 1.1 mrg
2621 1.1 mrg static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
2622 1.1 mrg {
2623 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
2624 1.1 mrg }
2625 1.1 mrg
2626 1.1 mrg static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
2627 1.1 mrg {
2628 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
2629 1.1 mrg }
2630 1.1 mrg
2631 1.1 mrg static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
2632 1.1 mrg {
2633 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2634 1.1 mrg }
2635 1.1 mrg
2636 1.1 mrg static inline int vec_all_gt(vec_short8 a, vec_short8 b)
2637 1.1 mrg {
2638 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
2639 1.1 mrg }
2640 1.1 mrg
2641 1.1 mrg static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
2642 1.1 mrg {
2643 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
2644 1.1 mrg }
2645 1.1 mrg
2646 1.1 mrg static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
2647 1.1 mrg {
2648 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
2649 1.1 mrg }
2650 1.1 mrg
2651 1.1 mrg static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
2652 1.1 mrg {
2653 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2654 1.1 mrg }
2655 1.1 mrg
2656 1.1 mrg static inline int vec_all_gt(vec_int4 a, vec_int4 b)
2657 1.1 mrg {
2658 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2659 1.1 mrg }
2660 1.1 mrg
2661 1.1 mrg static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
2662 1.1 mrg {
2663 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
2664 1.1 mrg }
2665 1.1 mrg
2666 1.1 mrg static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
2667 1.1 mrg {
2668 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
2669 1.1 mrg }
2670 1.1 mrg
2671 1.1 mrg static inline int vec_all_gt(vec_float4 a, vec_float4 b)
2672 1.1 mrg {
2673 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2674 1.1 mrg }
2675 1.1 mrg
2676 1.1 mrg
2677 1.1 mrg /* vec_all_in (all elements in bounds)
2678 1.1 mrg * ==========
2679 1.1 mrg */
2680 1.1 mrg static inline int vec_all_in(vec_float4 a, vec_float4 b)
2681 1.1 mrg {
2682 1.1 mrg return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
2683 1.1 mrg }
2684 1.1 mrg
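/* Note on vec_all_in above: an element is in bounds when |a| <= b and b is
 * non-negative; spu_cmpabsgt(a, b) flags |a| > b, the sign-replicated b
 * term flags negative bounds, and spu_nor accepts an element only when
 * both flags are clear.
 */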
2685 1.1 mrg
2686 1.1 mrg /* vec_all_le (all elements less than or equal)
2687 1.1 mrg * ==========
2688 1.1 mrg */
2689 1.1 mrg static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
2690 1.1 mrg {
2691 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2692 1.1 mrg }
2693 1.1 mrg
2694 1.1 mrg static inline int vec_all_le(vec_char16 a, vec_char16 b)
2695 1.1 mrg {
2696 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2697 1.1 mrg }
2698 1.1 mrg
2699 1.1 mrg static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
2700 1.1 mrg {
2701 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
2702 1.1 mrg }
2703 1.1 mrg
2704 1.1 mrg static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
2705 1.1 mrg {
2706 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
2707 1.1 mrg }
2708 1.1 mrg
2709 1.1 mrg static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
2710 1.1 mrg {
2711 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2712 1.1 mrg }
2713 1.1 mrg
2714 1.1 mrg static inline int vec_all_le(vec_short8 a, vec_short8 b)
2715 1.1 mrg {
2716 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2717 1.1 mrg }
2718 1.1 mrg
2719 1.1 mrg static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
2720 1.1 mrg {
2721 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
2722 1.1 mrg }
2723 1.1 mrg
2724 1.1 mrg static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
2725 1.1 mrg {
2726 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
2727 1.1 mrg }
2728 1.1 mrg
2729 1.1 mrg static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
2730 1.1 mrg {
2731 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2732 1.1 mrg }
2733 1.1 mrg
2734 1.1 mrg static inline int vec_all_le(vec_int4 a, vec_int4 b)
2735 1.1 mrg {
2736 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2737 1.1 mrg }
2738 1.1 mrg
2739 1.1 mrg static inline int vec_all_le(vec_bint4 a, vec_int4 b)
2740 1.1 mrg {
2741 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
2742 1.1 mrg }
2743 1.1 mrg
2744 1.1 mrg static inline int vec_all_le(vec_int4 a, vec_bint4 b)
2745 1.1 mrg {
2746 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
2747 1.1 mrg }
2748 1.1 mrg
2749 1.1 mrg static inline int vec_all_le(vec_float4 a, vec_float4 b)
2750 1.1 mrg {
2751 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2752 1.1 mrg }
2753 1.1 mrg
2754 1.1 mrg
2755 1.1 mrg /* vec_all_lt (all elements less than)
2756 1.1 mrg * ==========
2757 1.1 mrg */
2758 1.1 mrg static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
2759 1.1 mrg {
2760 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2761 1.1 mrg }
2762 1.1 mrg
2763 1.1 mrg static inline int vec_all_lt(vec_char16 a, vec_char16 b)
2764 1.1 mrg {
2765 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
2766 1.1 mrg }
2767 1.1 mrg
2768 1.1 mrg static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
2769 1.1 mrg {
2770 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
2771 1.1 mrg }
2772 1.1 mrg
2773 1.1 mrg static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
2774 1.1 mrg {
2775 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
2776 1.1 mrg }
2777 1.1 mrg
2778 1.1 mrg static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
2779 1.1 mrg {
2780 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2781 1.1 mrg }
2782 1.1 mrg
2783 1.1 mrg static inline int vec_all_lt(vec_short8 a, vec_short8 b)
2784 1.1 mrg {
2785 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
2786 1.1 mrg }
2787 1.1 mrg
2788 1.1 mrg static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
2789 1.1 mrg {
2790 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
2791 1.1 mrg }
2792 1.1 mrg
2793 1.1 mrg static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
2794 1.1 mrg {
2795 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
2796 1.1 mrg }
2797 1.1 mrg
2798 1.1 mrg static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
2799 1.1 mrg {
2800 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2801 1.1 mrg }
2802 1.1 mrg
2803 1.1 mrg static inline int vec_all_lt(vec_int4 a, vec_int4 b)
2804 1.1 mrg {
2805 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2806 1.1 mrg }
2807 1.1 mrg
2808 1.1 mrg static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
2809 1.1 mrg {
2810 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
2811 1.1 mrg }
2812 1.1 mrg
2813 1.1 mrg static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
2814 1.1 mrg {
2815 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
2816 1.1 mrg }
2817 1.1 mrg
2818 1.1 mrg static inline int vec_all_lt(vec_float4 a, vec_float4 b)
2819 1.1 mrg {
2820 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2821 1.1 mrg }
2822 1.1 mrg
2823 1.1 mrg
2824 1.1 mrg /* vec_all_nan (all elements not a number)
2825 1.1 mrg * ===========
2826 1.1 mrg */
2827 1.1 mrg static inline int vec_all_nan(vec_float4 a)
2828 1.1 mrg {
2829 1.1 mrg vec_uint4 exp, man;
2830 1.1 mrg vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);
2831 1.1 mrg
2832 1.1 mrg exp = spu_and((vec_uint4)(a), exp_mask);
2833 1.1 mrg man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
2834 1.1 mrg return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
2835 1.1 mrg spu_cmpeq(man, 0))), 0) == 0xF));
2836 1.1 mrg }
2837 1.1 mrg
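/* The SPU single-precision format has no NaN (or infinity) encodings, so
 * the macro below shadows the function above and folds the predicate to
 * constant false.
 */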
2838 1.1 mrg #define vec_all_nan(_a) (0)
2839 1.1 mrg
2840 1.1 mrg
2841 1.1 mrg /* vec_all_ne (all elements not equal)
2842 1.1 mrg * ==========
2843 1.1 mrg */
2844 1.1 mrg static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
2845 1.1 mrg {
2846 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2847 1.1 mrg }
2848 1.1 mrg
2849 1.1 mrg static inline int vec_all_ne(vec_char16 a, vec_char16 b)
2850 1.1 mrg {
2851 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2852 1.1 mrg }
2853 1.1 mrg
2854 1.1 mrg static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
2855 1.1 mrg {
2856 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
2857 1.1 mrg }
2858 1.1 mrg
2859 1.1 mrg static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
2860 1.1 mrg {
2861 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
2862 1.1 mrg }
2863 1.1 mrg
2864 1.1 mrg static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
2865 1.1 mrg {
2866 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2867 1.1 mrg }
2868 1.1 mrg
2869 1.1 mrg static inline int vec_all_ne(vec_short8 a, vec_short8 b)
2870 1.1 mrg {
2871 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2872 1.1 mrg }
2873 1.1 mrg
2874 1.1 mrg static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
2875 1.1 mrg {
2876 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
2877 1.1 mrg }
2878 1.1 mrg
2879 1.1 mrg static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
2880 1.1 mrg {
2881 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
2882 1.1 mrg }
2883 1.1 mrg
2884 1.1 mrg static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
2885 1.1 mrg {
2886 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2887 1.1 mrg }
2888 1.1 mrg
2889 1.1 mrg static inline int vec_all_ne(vec_int4 a, vec_int4 b)
2890 1.1 mrg {
2891 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2892 1.1 mrg }
2893 1.1 mrg
2894 1.1 mrg static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
2895 1.1 mrg {
2896 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
2897 1.1 mrg }
2898 1.1 mrg
2899 1.1 mrg static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
2900 1.1 mrg {
2901 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
2902 1.1 mrg }
2903 1.1 mrg
2904 1.1 mrg static inline int vec_all_ne(vec_float4 a, vec_float4 b)
2905 1.1 mrg {
2906 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
2907 1.1 mrg }
2908 1.1 mrg
2909 1.1 mrg
2910 1.1 mrg /* vec_all_nge (all elements not greater than or equal)
2911 1.1 mrg * ===========
2912 1.1 mrg */
2913 1.1 mrg static inline int vec_all_nge(vec_float4 a, vec_float4 b)
2914 1.1 mrg {
2915 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
2916 1.1 mrg }
2917 1.1 mrg
2918 1.1 mrg
2919 1.1 mrg /* vec_all_ngt (all elements not greater than)
2920 1.1 mrg * ===========
2921 1.1 mrg */
2922 1.1 mrg static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
2923 1.1 mrg {
2924 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
2925 1.1 mrg }
2926 1.1 mrg
2927 1.1 mrg
2928 1.1 mrg /* vec_all_nle (all elements not less than or equal)
2929 1.1 mrg * ===========
2930 1.1 mrg */
2931 1.1 mrg static inline int vec_all_nle(vec_float4 a, vec_float4 b)
2932 1.1 mrg {
2933 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
2934 1.1 mrg }
2935 1.1 mrg
2936 1.1 mrg
2937 1.1 mrg /* vec_all_nlt (all elements not less than)
2938 1.1 mrg * ===========
2939 1.1 mrg */
2940 1.1 mrg static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
2941 1.1 mrg {
2942 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
2943 1.1 mrg }
2944 1.1 mrg
2945 1.1 mrg
2946 1.1 mrg /* vec_all_numeric (all elements numeric)
2947 1.1 mrg  * ===============
2948 1.1 mrg */
2949 1.1 mrg static inline int vec_all_numeric(vec_float4 a)
2950 1.1 mrg {
2951 1.1 mrg vec_uint4 exp;
2952 1.1 mrg
2953 1.1 mrg exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
2954 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
2955 1.1 mrg }
2956 1.1 mrg
2957 1.1 mrg
2959 1.1 mrg /* vec_any_eq (any elements equal)
2960 1.1 mrg * ==========
2961 1.1 mrg */
2962 1.1 mrg static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
2963 1.1 mrg {
2964 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2965 1.1 mrg }
2966 1.1 mrg
2967 1.1 mrg static inline int vec_any_eq(vec_char16 a, vec_char16 b)
2968 1.1 mrg {
2969 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2970 1.1 mrg }
2971 1.1 mrg
2972 1.1 mrg static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
2973 1.1 mrg {
2974 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
2975 1.1 mrg }
2976 1.1 mrg
2977 1.1 mrg static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
2978 1.1 mrg {
2979 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
2980 1.1 mrg }
2981 1.1 mrg
2982 1.1 mrg static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
2983 1.1 mrg {
2984 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2985 1.1 mrg }
2986 1.1 mrg
2987 1.1 mrg static inline int vec_any_eq(vec_short8 a, vec_short8 b)
2988 1.1 mrg {
2989 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
2990 1.1 mrg }
2991 1.1 mrg
2992 1.1 mrg static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
2993 1.1 mrg {
2994 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
2995 1.1 mrg }
2996 1.1 mrg
2997 1.1 mrg static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
2998 1.1 mrg {
2999 1.1 mrg return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
3000 1.1 mrg }
3001 1.1 mrg
3002 1.1 mrg static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
3003 1.1 mrg {
3004 1.1 mrg return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3005 1.1 mrg }
3006 1.1 mrg
3007 1.1 mrg static inline int vec_any_eq(vec_int4 a, vec_int4 b)
3008 1.1 mrg {
3009 1.1 mrg return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3010 1.1 mrg }
3011 1.1 mrg
3012 1.1 mrg static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
3013 1.1 mrg {
3014 1.1 mrg return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
3015 1.1 mrg }
3016 1.1 mrg
3017 1.1 mrg static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
3018 1.1 mrg {
3019 1.1 mrg return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
3020 1.1 mrg }
3021 1.1 mrg
3022 1.1 mrg static inline int vec_any_eq(vec_float4 a, vec_float4 b)
3023 1.1 mrg {
3024 1.1 mrg return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
3025 1.1 mrg }
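
/* Note on the word-sized vec_any_eq overloads above: instead of
 * spu_gather, the compare results are reduced with spu_orx (OR across all
 * four words), which is nonzero exactly when some element compared equal.
 */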
3026 1.1 mrg
3027 1.1 mrg /* vec_any_ge (any elements greater than or equal)
3028 1.1 mrg * ==========
3029 1.1 mrg */
static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
}

static inline int vec_any_ge(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
}

static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
}

static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
}

static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
}

static inline int vec_any_ge(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
}

static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
}

static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
}

static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}

static inline int vec_any_ge(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}

static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
}

static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
}

static inline int vec_any_ge(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}

/* vec_any_gt (any elements greater than)
 * ==========
 */
static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}

static inline int vec_any_gt(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}

static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
}

static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
}

static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}

static inline int vec_any_gt(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}

static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
}

static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
}

static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
}

static inline int vec_any_gt(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
}

static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
}

static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
}

static inline int vec_any_gt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
}

/* vec_any_le (any elements less than or equal)
 * ==========
 */
static inline int vec_any_le(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_le(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_le(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0xFFFF));
}

static inline int vec_any_le(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0xFFFF));
}

static inline int vec_any_le(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
}

static inline int vec_any_le(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xFF));
}

static inline int vec_any_le(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0xFF));
}

static inline int vec_any_le(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0xFF));
}

static inline int vec_any_le(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}

static inline int vec_any_le(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}

static inline int vec_any_le(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) != 0xF));
}

static inline int vec_any_le(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) != 0xF));
}

static inline int vec_any_le(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}

/* vec_any_lt (any elements less than)
 * ==========
 */
static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
}

static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
}

static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
}

static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
}

static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

static inline int vec_any_lt(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
}

static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
}

static inline int vec_any_lt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

/* vec_any_nan (any elements not a number)
 * ===========
 */
static inline int vec_any_nan(vec_float4 a)
{
  vec_uint4 exp, man;
  vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);

  exp = spu_and((vec_uint4)(a), exp_mask);
  man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
  return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
                                                spu_cmpeq(man, 0))), 0) != 0));
}
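
/* Usage sketch (illustrative; the helper name is hypothetical): SPU
 * single-precision arithmetic does not generate NaNs, so this predicate is
 * mainly useful for validating data produced elsewhere (e.g. on the PPU).
 */
static inline int example_buffer_has_nan(const vec_float4 *buf, int quads)
{
  int i;

  for (i = 0; i < quads; i++)
    if (vec_any_nan(buf[i]))
      return (1);
  return (0);
}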

/* vec_any_ne (any elements not equal)
 * ==========
 */
static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
}

static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
}

static inline int vec_any_ne(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}

/* vec_any_nge (any elements not greater than or equal)
 * ===========
 */
static inline int vec_any_nge(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

/* vec_any_ngt (any elements not greater than)
 * ===========
 */
static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}

/* vec_any_nle (any elements not less than or equal)
 * ===========
 */
static inline int vec_any_nle(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}

/* vec_any_nlt (any elements not less than)
 * ===========
 */
static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}

/* vec_any_numeric (any elements numeric)
 * ===============
 */
static inline int vec_any_numeric(vec_float4 a)
{
  vec_uint4 exp;

  exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
  return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
}

/* vec_any_out (any elements out of bounds)
 * ===========
 */
static inline int vec_any_out(vec_float4 a, vec_float4 b)
{
  return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
}
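
/* Usage sketch (illustrative; the helper name is hypothetical): given a
 * non-negative bound b, vec_any_out(a, b) reports whether some lane of a
 * lies outside [-b, b].
 */
static inline int example_outside_unit_range(vec_float4 v)
{
  /* Nonzero if some lane of v is below -1.0f or above +1.0f. */
  return (vec_any_out(v, spu_splats(1.0f)));
}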

/* CBE Language Extension Intrinsics
 */

/* vec_extract (extract element from vector)
 * ===========
 */
#define vec_extract(_a, _element) spu_extract(_a, _element)

/* vec_insert (insert scalar into specified vector element)
 * ==========
 */
#define vec_insert(_a, _b, _element) spu_insert(_a, _b, _element)
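
/* Usage sketch (illustrative; the helper name is hypothetical): lane
 * numbering follows spu_extract/spu_insert, so element 0 is the leftmost
 * (lowest-address) lane of the quadword.
 */
static inline float example_first_lane(vec_float4 v)
{
  return (vec_extract(v, 0));
}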

/* vec_lvlx (load vector left indexed)
 * ========
 */
static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_char16 vec_lvlx(int a, signed char *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_short8 vec_lvlx(int a, signed short *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_int4 vec_lvlx(int a, signed int *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_float4 vec_lvlx(int a, float *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return (spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

/* vec_lvlxl (load vector left indexed last)
 * =========
 * The SPU local store is not cached, so this and the other "last"
 * (cache-hint) variants map directly to their plain counterparts.
 */
#define vec_lvlxl(_a, _b) vec_lvlx(_a, _b)

/* vec_lvrx (load vector right indexed)
 * ========
 */
static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_char16 vec_lvrx(int a, signed char *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_short8 vec_lvrx(int a, signed short *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_int4 vec_lvrx(int a, signed int *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_float4 vec_lvrx(int a, float *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}

static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return (spu_rlmaskqwbyte(*p, ((int)p & 0xF) - 16));
}
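
/* Usage sketch (illustrative; the helper name is hypothetical): as on VMX,
 * a potentially unaligned quadword can be assembled by OR-ing the left- and
 * right-indexed loads of the two quadwords that straddle the address.  For
 * an already aligned pointer the right-indexed half contributes zeros, and
 * local-store addressing wraps rather than faults, so touching the
 * following quadword is harmless.
 */
static inline vec_uchar16 example_load_unaligned(unsigned char *p)
{
  return (spu_or(vec_lvlx(0, p), vec_lvrx(16, p)));
}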

/* vec_lvrxl (load vector right indexed last)
 * =========
 */
#define vec_lvrxl(_a, _b) vec_lvrx(_a, _b)

/* vec_promote (promote scalar to a vector)
 * ===========
 */
#define vec_promote(_a, _element) spu_promote(_a, _element)

/* vec_splats (splat scalar to a vector)
 * ==========
 */
#define vec_splats(_a) spu_splats(_a)

/* vec_stvlx (store vector left indexed)
 * =========
 */
static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvlxl (store vector left indexed last)
 * ==========
 */
#define vec_stvlxl(_a, _b, _c) vec_stvlx(_a, _b, _c)

/* vec_stvrx (store vector right indexed)
 * =========
 */
static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16 - ((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvrxl (store vector right indexed last)
 * ==========
 */
#define vec_stvrxl(_a, _b, _c) vec_stvrx(_a, _b, _c)
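
/* Usage sketch (illustrative; the helper name is hypothetical): the
 * complementary store idiom to the lvlx/lvrx load above.  vec_stvlx writes
 * the bytes of v that fall within the first quadword and vec_stvrx writes
 * the remainder into the second; both are read-modify-write selects, so the
 * surrounding bytes are preserved (and an already aligned store simply
 * rewrites the following quadword with its own contents).
 */
static inline void example_store_unaligned(vec_uchar16 v, unsigned char *p)
{
  vec_stvlx(v, 0, p);
  vec_stvrx(v, 16, p);
}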

#endif /* __SPU__ */
#endif /* __cplusplus */
#endif /* !_VMX2SPU_H_ */