si2vmx.h revision 1.1.1.8 1 1.1 mrg /* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
2 1.1.1.8 mrg Copyright (C) 2007-2020 Free Software Foundation, Inc.
3 1.1 mrg
4 1.1 mrg This file is free software; you can redistribute it and/or modify it under
5 1.1 mrg the terms of the GNU General Public License as published by the Free
6 1.1 mrg Software Foundation; either version 3 of the License, or (at your option)
7 1.1 mrg any later version.
8 1.1 mrg
9 1.1 mrg This file is distributed in the hope that it will be useful, but WITHOUT
10 1.1 mrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 1.1 mrg FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 1.1 mrg for more details.
13 1.1 mrg
14 1.1 mrg Under Section 7 of GPL version 3, you are granted additional
15 1.1 mrg permissions described in the GCC Runtime Library Exception, version
16 1.1 mrg 3.1, as published by the Free Software Foundation.
17 1.1 mrg
18 1.1 mrg You should have received a copy of the GNU General Public License and
19 1.1 mrg a copy of the GCC Runtime Library Exception along with this program;
20 1.1 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
21 1.1 mrg <http://www.gnu.org/licenses/>. */
22 1.1 mrg
23 1.1 mrg #ifndef _SI2VMX_H_
24 1.1 mrg #define _SI2VMX_H_ 1
25 1.1 mrg
26 1.1 mrg #ifndef __SPU__
27 1.1 mrg
28 1.1 mrg #include <stdlib.h>
29 1.1 mrg #include <vec_types.h>
30 1.1 mrg
31 1.1 mrg
/* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics.
 * Users can override the action by defining it prior to including this
 * header file.  (abort() is available via the <stdlib.h> include above.)
 */
#ifndef SPU_HALT_ACTION
#define SPU_HALT_ACTION abort()
#endif

/* Specify a default stop action for the spu_stop intrinsic.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_STOP_ACTION
#define SPU_STOP_ACTION abort()
#endif


/* Specify a default action for unsupported intrinsic.
 * Users can override the action by defining it prior to including this
 * header file.
 */
#ifndef SPU_UNSUPPORTED_ACTION
#define SPU_UNSUPPORTED_ACTION abort()
#endif
56 1.1 mrg
57 1.1 mrg
58 1.1 mrg /* Casting intrinsics - from scalar to quadword
59 1.1 mrg */
60 1.1 mrg
61 1.1 mrg static __inline qword si_from_uchar(unsigned char c) {
62 1.1 mrg union {
63 1.1 mrg qword q;
64 1.1 mrg unsigned char c[16];
65 1.1 mrg } x;
66 1.1 mrg x.c[3] = c;
67 1.1 mrg return (x.q);
68 1.1 mrg }
69 1.1 mrg
70 1.1 mrg static __inline qword si_from_char(signed char c) {
71 1.1 mrg union {
72 1.1 mrg qword q;
73 1.1 mrg signed char c[16];
74 1.1 mrg } x;
75 1.1 mrg x.c[3] = c;
76 1.1 mrg return (x.q);
77 1.1 mrg }
78 1.1 mrg
79 1.1 mrg static __inline qword si_from_ushort(unsigned short s) {
80 1.1 mrg union {
81 1.1 mrg qword q;
82 1.1 mrg unsigned short s[8];
83 1.1 mrg } x;
84 1.1 mrg x.s[1] = s;
85 1.1 mrg return (x.q);
86 1.1 mrg }
87 1.1 mrg
88 1.1 mrg static __inline qword si_from_short(short s) {
89 1.1 mrg union {
90 1.1 mrg qword q;
91 1.1 mrg short s[8];
92 1.1 mrg } x;
93 1.1 mrg x.s[1] = s;
94 1.1 mrg return (x.q);
95 1.1 mrg }
96 1.1 mrg
97 1.1 mrg
98 1.1 mrg static __inline qword si_from_uint(unsigned int i) {
99 1.1 mrg union {
100 1.1 mrg qword q;
101 1.1 mrg unsigned int i[4];
102 1.1 mrg } x;
103 1.1 mrg x.i[0] = i;
104 1.1 mrg return (x.q);
105 1.1 mrg }
106 1.1 mrg
107 1.1 mrg static __inline qword si_from_int(int i) {
108 1.1 mrg union {
109 1.1 mrg qword q;
110 1.1 mrg int i[4];
111 1.1 mrg } x;
112 1.1 mrg x.i[0] = i;
113 1.1 mrg return (x.q);
114 1.1 mrg }
115 1.1 mrg
116 1.1 mrg static __inline qword si_from_ullong(unsigned long long l) {
117 1.1 mrg union {
118 1.1 mrg qword q;
119 1.1 mrg unsigned long long l[2];
120 1.1 mrg } x;
121 1.1 mrg x.l[0] = l;
122 1.1 mrg return (x.q);
123 1.1 mrg }
124 1.1 mrg
125 1.1 mrg static __inline qword si_from_llong(long long l) {
126 1.1 mrg union {
127 1.1 mrg qword q;
128 1.1 mrg long long l[2];
129 1.1 mrg } x;
130 1.1 mrg x.l[0] = l;
131 1.1 mrg return (x.q);
132 1.1 mrg }
133 1.1 mrg
134 1.1 mrg static __inline qword si_from_float(float f) {
135 1.1 mrg union {
136 1.1 mrg qword q;
137 1.1 mrg float f[4];
138 1.1 mrg } x;
139 1.1 mrg x.f[0] = f;
140 1.1 mrg return (x.q);
141 1.1 mrg }
142 1.1 mrg
143 1.1 mrg static __inline qword si_from_double(double d) {
144 1.1 mrg union {
145 1.1 mrg qword q;
146 1.1 mrg double d[2];
147 1.1 mrg } x;
148 1.1 mrg x.d[0] = d;
149 1.1 mrg return (x.q);
150 1.1 mrg }
151 1.1 mrg
152 1.1 mrg static __inline qword si_from_ptr(void *ptr) {
153 1.1 mrg union {
154 1.1 mrg qword q;
155 1.1 mrg void *p;
156 1.1 mrg } x;
157 1.1 mrg x.p = ptr;
158 1.1 mrg return (x.q);
159 1.1 mrg }
160 1.1 mrg
161 1.1 mrg
162 1.1 mrg /* Casting intrinsics - from quadword to scalar
163 1.1 mrg */
164 1.1 mrg static __inline unsigned char si_to_uchar(qword q) {
165 1.1 mrg union {
166 1.1 mrg qword q;
167 1.1 mrg unsigned char c[16];
168 1.1 mrg } x;
169 1.1 mrg x.q = q;
170 1.1 mrg return (x.c[3]);
171 1.1 mrg }
172 1.1 mrg
173 1.1 mrg static __inline signed char si_to_char(qword q) {
174 1.1 mrg union {
175 1.1 mrg qword q;
176 1.1 mrg signed char c[16];
177 1.1 mrg } x;
178 1.1 mrg x.q = q;
179 1.1 mrg return (x.c[3]);
180 1.1 mrg }
181 1.1 mrg
182 1.1 mrg static __inline unsigned short si_to_ushort(qword q) {
183 1.1 mrg union {
184 1.1 mrg qword q;
185 1.1 mrg unsigned short s[8];
186 1.1 mrg } x;
187 1.1 mrg x.q = q;
188 1.1 mrg return (x.s[1]);
189 1.1 mrg }
190 1.1 mrg
191 1.1 mrg static __inline short si_to_short(qword q) {
192 1.1 mrg union {
193 1.1 mrg qword q;
194 1.1 mrg short s[8];
195 1.1 mrg } x;
196 1.1 mrg x.q = q;
197 1.1 mrg return (x.s[1]);
198 1.1 mrg }
199 1.1 mrg
200 1.1 mrg static __inline unsigned int si_to_uint(qword q) {
201 1.1 mrg union {
202 1.1 mrg qword q;
203 1.1 mrg unsigned int i[4];
204 1.1 mrg } x;
205 1.1 mrg x.q = q;
206 1.1 mrg return (x.i[0]);
207 1.1 mrg }
208 1.1 mrg
209 1.1 mrg static __inline int si_to_int(qword q) {
210 1.1 mrg union {
211 1.1 mrg qword q;
212 1.1 mrg int i[4];
213 1.1 mrg } x;
214 1.1 mrg x.q = q;
215 1.1 mrg return (x.i[0]);
216 1.1 mrg }
217 1.1 mrg
218 1.1 mrg static __inline unsigned long long si_to_ullong(qword q) {
219 1.1 mrg union {
220 1.1 mrg qword q;
221 1.1 mrg unsigned long long l[2];
222 1.1 mrg } x;
223 1.1 mrg x.q = q;
224 1.1 mrg return (x.l[0]);
225 1.1 mrg }
226 1.1 mrg
227 1.1 mrg static __inline long long si_to_llong(qword q) {
228 1.1 mrg union {
229 1.1 mrg qword q;
230 1.1 mrg long long l[2];
231 1.1 mrg } x;
232 1.1 mrg x.q = q;
233 1.1 mrg return (x.l[0]);
234 1.1 mrg }
235 1.1 mrg
236 1.1 mrg static __inline float si_to_float(qword q) {
237 1.1 mrg union {
238 1.1 mrg qword q;
239 1.1 mrg float f[4];
240 1.1 mrg } x;
241 1.1 mrg x.q = q;
242 1.1 mrg return (x.f[0]);
243 1.1 mrg }
244 1.1 mrg
245 1.1 mrg static __inline double si_to_double(qword q) {
246 1.1 mrg union {
247 1.1 mrg qword q;
248 1.1 mrg double d[2];
249 1.1 mrg } x;
250 1.1 mrg x.q = q;
251 1.1 mrg return (x.d[0]);
252 1.1 mrg }
253 1.1 mrg
254 1.1 mrg static __inline void * si_to_ptr(qword q) {
255 1.1 mrg union {
256 1.1 mrg qword q;
257 1.1 mrg void *p;
258 1.1 mrg } x;
259 1.1 mrg x.q = q;
260 1.1 mrg return (x.p);
261 1.1 mrg }
262 1.1 mrg
263 1.1 mrg
264 1.1 mrg /* Absolute difference
265 1.1 mrg */
266 1.1 mrg static __inline qword si_absdb(qword a, qword b)
267 1.1 mrg {
268 1.1 mrg vec_uchar16 ac, bc, dc;
269 1.1 mrg
270 1.1 mrg ac = (vec_uchar16)(a);
271 1.1 mrg bc = (vec_uchar16)(b);
272 1.1 mrg dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));
273 1.1 mrg
274 1.1 mrg return ((qword)(dc));
275 1.1 mrg }
276 1.1 mrg
/* Add intrinsics
 */
/* si_a: element-wise 32-bit add (modulo arithmetic).  */
#define si_a(_a, _b) ((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))

/* si_ah: element-wise 16-bit add.  */
#define si_ah(_a, _b) ((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))
282 1.1 mrg
283 1.1 mrg static __inline qword si_ai(qword a, int b)
284 1.1 mrg {
285 1.1 mrg return ((qword)(vec_add((vec_int4)(a),
286 1.1 mrg vec_splat((vec_int4)(si_from_int(b)), 0))));
287 1.1 mrg }
288 1.1 mrg
289 1.1 mrg
290 1.1 mrg static __inline qword si_ahi(qword a, short b)
291 1.1 mrg {
292 1.1 mrg return ((qword)(vec_add((vec_short8)(a),
293 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1))));
294 1.1 mrg }
295 1.1 mrg

/* si_fa: element-wise single-precision float add.  */
#define si_fa(_a, _b) ((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
298 1.1 mrg
299 1.1 mrg
300 1.1 mrg static __inline qword si_dfa(qword a, qword b)
301 1.1 mrg {
302 1.1 mrg union {
303 1.1 mrg vec_double2 v;
304 1.1 mrg double d[2];
305 1.1 mrg } ad, bd, dd;
306 1.1 mrg
307 1.1 mrg ad.v = (vec_double2)(a);
308 1.1 mrg bd.v = (vec_double2)(b);
309 1.1 mrg dd.d[0] = ad.d[0] + bd.d[0];
310 1.1 mrg dd.d[1] = ad.d[1] + bd.d[1];
311 1.1 mrg
312 1.1 mrg return ((qword)(dd.v));
313 1.1 mrg }
314 1.1 mrg
/* Add word extended: _a + _b + carry, where only the least-significant
 * bit of each word of _c is used as the carry-in.
 */
#define si_addx(_a, _b, _c) ((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \
                                             vec_and((vec_uint4)(_c), vec_splat_u32(1)))))


/* Bit-wise AND
 */
#define si_and(_a, _b) ((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
324 1.1 mrg
325 1.1 mrg
326 1.1 mrg static __inline qword si_andbi(qword a, signed char b)
327 1.1 mrg {
328 1.1 mrg return ((qword)(vec_and((vec_char16)(a),
329 1.1 mrg vec_splat((vec_char16)(si_from_char(b)), 3))));
330 1.1 mrg }
331 1.1 mrg
332 1.1 mrg static __inline qword si_andhi(qword a, signed short b)
333 1.1 mrg {
334 1.1 mrg return ((qword)(vec_and((vec_short8)(a),
335 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1))));
336 1.1 mrg }
337 1.1 mrg
338 1.1 mrg
339 1.1 mrg static __inline qword si_andi(qword a, signed int b)
340 1.1 mrg {
341 1.1 mrg return ((qword)(vec_and((vec_int4)(a),
342 1.1 mrg vec_splat((vec_int4)(si_from_int(b)), 0))));
343 1.1 mrg }
344 1.1 mrg
345 1.1 mrg
/* Bit-wise AND with complement: _a & ~_b.
 */
#define si_andc(_a, _b) ((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))


/* Average byte vectors (vec_avg: rounded unsigned byte average).
 */
#define si_avgb(_a, _b) ((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))


/* Branch indirect and set link on external data.
 * These are SPU control-flow instructions with no PPU/VMX equivalent,
 * so they expand to nothing.
 */
#define si_bisled(_func) /* not mappable */
#define si_bisledd(_func) /* not mappable */
#define si_bislede(_func) /* not mappable */


/* Borrow generate: per-word borrow-out of _a - _b (via vec_subc).
 */
#define si_bg(_a, _b) ((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))

/* Borrow generate extended: borrow of _a - _b - (1 - (_c & 1)),
 * i.e. 1 when _b > _a, or when _b == _a and the carry-in bit of _c is set.
 */
#define si_bgx(_a, _b, _c) ((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)), \
                                                   vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)), \
                                                           (vec_uint4)(_c))), vec_splat_u32(1))))
370 1.1 mrg
371 1.1 mrg /* Compare absolute equal
372 1.1 mrg */
373 1.1 mrg static __inline qword si_fcmeq(qword a, qword b)
374 1.1 mrg {
375 1.1 mrg vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
376 1.1 mrg
377 1.1 mrg return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb),
378 1.1 mrg vec_andc((vec_float4)(b), msb))));
379 1.1 mrg }
380 1.1 mrg
/* Double-precision "compare absolute equal" built from 32-bit VMX ops.
 * Each 64-bit slot of the result is all ones when |a| == |b| bit-wise
 * and a is not a NaN, else all zeros.  The doubles are processed as
 * high/low 32-bit word pairs; final results live in the high words and
 * are then promoted to full 64-bit masks.  */
static __inline qword si_dfcmeq(qword a, qword b)
{
  vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  /* Permute pattern that copies each double's high-word result into
     both halves of its 64-bit slot.  */
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};

  vec_uint4 biteq;
  vec_uint4 aabs;
  vec_uint4 babs;
  vec_uint4 a_gt;
  vec_uint4 ahi_inf;
  vec_uint4 anan;
  vec_uint4 result;

  /* vec_slo takes its shift count (in bits) from byte 15 of the
     control vector, hence the int/uchar16 union.  */
  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (32 bits): brings each low word up to its high slot.  */
  x.i[3] = 4 << 3;

  /* Mask out sign bits */
  aabs = vec_and((vec_uint4)a,sign_mask);
  babs = vec_and((vec_uint4)b,sign_mask);

  /* A) Check for bit equality, store in high word */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs);
  biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));

  /*
     B) Check if a is NaN, store in high word

     B1) If the high word is greater than max_exp (indicates a NaN)
     B2) If the low word is greater than 0
  */
  a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);

  /* B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);

  /* anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));

  /* result = A and not B */
  result = vec_andc(biteq, anan);

  /* Promote high words to 64 bits and return */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
}
431 1.1 mrg
432 1.1 mrg
433 1.1 mrg /* Compare absolute greater than
434 1.1 mrg */
435 1.1 mrg static __inline qword si_fcmgt(qword a, qword b)
436 1.1 mrg {
437 1.1 mrg vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
438 1.1 mrg
439 1.1 mrg return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb),
440 1.1 mrg vec_andc((vec_float4)(b), msb))));
441 1.1 mrg }
442 1.1 mrg
/* Double-precision "compare absolute greater than" built from 32-bit
 * VMX ops.  Each 64-bit result slot is all ones when |a| > |b| and
 * neither input is a NaN.  Magnitudes compare correctly as unsigned
 * 64-bit integers once the sign bit is cleared, so the test is an
 * unsigned high-word compare plus an equal-high/greater-low case.  */
static __inline qword si_dfcmgt(qword a, qword b)
{
  /* Splat each double's high word across its 64-bit slot.  */
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  /* vec_slo reads its bit shift count from byte 15 of the control
     vector, hence the int/uchar16 union.  */
  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (32 bits): moves each low word into its high slot.  */
  x.i[3] = 4 << 3;

  // absolute value of a,b
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check if a is nan: exponent above inf, or inf exponent with nonzero low word
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check if b is nan
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // A) Check if the high words differ (unsigned compare of magnitudes)
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs);

  // B) Check if high word equal, and low word greater
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
  vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));

  // If either A or B is true, return true (unless NaNs detected)
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
}
490 1.1 mrg
491 1.1 mrg
492 1.1 mrg /* Compare equal
493 1.1 mrg */
494 1.1 mrg static __inline qword si_ceqb(qword a, qword b)
495 1.1 mrg {
496 1.1 mrg return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b))));
497 1.1 mrg }
498 1.1 mrg
499 1.1 mrg static __inline qword si_ceqh(qword a, qword b)
500 1.1 mrg {
501 1.1 mrg return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b))));
502 1.1 mrg }
503 1.1 mrg
504 1.1 mrg static __inline qword si_ceq(qword a, qword b)
505 1.1 mrg {
506 1.1 mrg return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b))));
507 1.1 mrg }
508 1.1 mrg
509 1.1 mrg static __inline qword si_fceq(qword a, qword b)
510 1.1 mrg {
511 1.1 mrg return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b))));
512 1.1 mrg }
513 1.1 mrg
514 1.1 mrg static __inline qword si_ceqbi(qword a, signed char b)
515 1.1 mrg {
516 1.1 mrg return ((qword)(vec_cmpeq((vec_char16)(a),
517 1.1 mrg vec_splat((vec_char16)(si_from_char(b)), 3))));
518 1.1 mrg }
519 1.1 mrg
520 1.1 mrg static __inline qword si_ceqhi(qword a, signed short b)
521 1.1 mrg {
522 1.1 mrg return ((qword)(vec_cmpeq((vec_short8)(a),
523 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1))));
524 1.1 mrg }
525 1.1 mrg
526 1.1 mrg static __inline qword si_ceqi(qword a, signed int b)
527 1.1 mrg {
528 1.1 mrg return ((qword)(vec_cmpeq((vec_int4)(a),
529 1.1 mrg vec_splat((vec_int4)(si_from_int(b)), 0))));
530 1.1 mrg }
531 1.1 mrg
/* Double-precision compare equal built from 32-bit VMX ops.  Each
 * 64-bit result slot is all ones when a == b, honoring +0 == -0, and
 * all zeros otherwise (including when a is a NaN).  */
static __inline qword si_dfceq(qword a, qword b)
{
  vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  /* Permute pattern that copies each double's high-word result into
     both halves of its 64-bit slot.  */
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27};

  vec_uint4 biteq;
  vec_uint4 aabs;
  vec_uint4 babs;
  vec_uint4 a_gt;
  vec_uint4 ahi_inf;
  vec_uint4 anan;
  vec_uint4 iszero;
  vec_uint4 result;

  /* vec_slo takes its bit shift count from byte 15 of the control
     vector, hence the int/uchar16 union.  */
  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (32 bits): brings each low word up to its high slot.  */
  x.i[3] = 4 << 3;

  /* A) Check for bit equality, store in high word */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b);
  biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));

  /* Mask out sign bits */
  aabs = vec_and((vec_uint4)a,sign_mask);
  babs = vec_and((vec_uint4)b,sign_mask);

  /*
     B) Check if a is NaN, store in high word

     B1) If the high word is greater than max_exp (indicates a NaN)
     B2) If the low word is greater than 0
  */
  a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);

  /* B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);

  /* anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));

  /* C) Check for 0 = -0 special case */
  iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0));
  iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));

  /* result = (A or C) and not B */
  result = vec_or(biteq,iszero);
  result = vec_andc(result, anan);

  /* Promote high words to 64 bits and return */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
}
588 1.1 mrg
589 1.1 mrg
590 1.1 mrg /* Compare greater than
591 1.1 mrg */
592 1.1 mrg static __inline qword si_cgtb(qword a, qword b)
593 1.1 mrg {
594 1.1 mrg return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b))));
595 1.1 mrg }
596 1.1 mrg
597 1.1 mrg static __inline qword si_cgth(qword a, qword b)
598 1.1 mrg {
599 1.1 mrg return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b))));
600 1.1 mrg }
601 1.1 mrg
602 1.1 mrg static __inline qword si_cgt(qword a, qword b)
603 1.1 mrg {
604 1.1 mrg return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b))));
605 1.1 mrg }
606 1.1 mrg
607 1.1 mrg static __inline qword si_clgtb(qword a, qword b)
608 1.1 mrg {
609 1.1 mrg return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b))));
610 1.1 mrg }
611 1.1 mrg
612 1.1 mrg static __inline qword si_clgth(qword a, qword b)
613 1.1 mrg {
614 1.1 mrg return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b))));
615 1.1 mrg }
616 1.1 mrg
617 1.1 mrg static __inline qword si_clgt(qword a, qword b)
618 1.1 mrg {
619 1.1 mrg return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b))));
620 1.1 mrg }
621 1.1 mrg
622 1.1 mrg static __inline qword si_fcgt(qword a, qword b)
623 1.1 mrg {
624 1.1 mrg return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b))));
625 1.1 mrg }
626 1.1 mrg
/* Double-precision compare greater than built from 32-bit VMX ops.
 * Each 64-bit result slot is all ones when a > b and neither input is
 * a NaN.  IEEE doubles compare like sign-magnitude integers, so each
 * input is converted to a two's-complement ordering key (magnitude for
 * positives, negated magnitude for negatives) and compared as a pair
 * of 32-bit words.  */
static __inline qword si_dfcgt(qword a, qword b)
{
  /* Splat each double's high word across its 64-bit slot.  */
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  /* Moves each low word's borrow up to the high slot; 192 selects
     bytes that are later overwritten via 'pat'.  */
  vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  /* vec_slo reads its bit shift count from byte 15 of the control
     vector, hence the int/uchar16 union.  */
  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (32 bits).  */
  x.i[3] = 4 << 3;

  // absolute value of a,b
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check if a is nan: exponent above inf, or inf exponent with nonzero low word
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check if b is nan
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // sign of a (splatted across the whole 64-bit slot as a select mask)
  vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi);

  // sign of b
  vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi);

  // negative a: 64-bit two's-complement negate of |a|, done as
  // 32-bit complement-and-add with the low word's borrow propagated
  // into the high word through the shuffle below.
  vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
  vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
  abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
  vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));

  // pick the one we want: magnitude for positive a, negated for negative a
  vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);

  // negative b
  vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
  bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
  vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));

  // pick the one we want
  vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);

  // A) Check if the high words differ (signed compare)
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval);

  // B) Check if high word equal, and low word greater (unsigned compare)
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
  vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));

  // If either A or B is true, return true (unless NaNs detected)
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
}
700 1.1 mrg
701 1.1 mrg static __inline qword si_cgtbi(qword a, signed char b)
702 1.1 mrg {
703 1.1 mrg return ((qword)(vec_cmpgt((vec_char16)(a),
704 1.1 mrg vec_splat((vec_char16)(si_from_char(b)), 3))));
705 1.1 mrg }
706 1.1 mrg
707 1.1 mrg static __inline qword si_cgthi(qword a, signed short b)
708 1.1 mrg {
709 1.1 mrg return ((qword)(vec_cmpgt((vec_short8)(a),
710 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1))));
711 1.1 mrg }
712 1.1 mrg
713 1.1 mrg static __inline qword si_cgti(qword a, signed int b)
714 1.1 mrg {
715 1.1 mrg return ((qword)(vec_cmpgt((vec_int4)(a),
716 1.1 mrg vec_splat((vec_int4)(si_from_int(b)), 0))));
717 1.1 mrg }
718 1.1 mrg
719 1.1 mrg static __inline qword si_clgtbi(qword a, unsigned char b)
720 1.1 mrg {
721 1.1 mrg return ((qword)(vec_cmpgt((vec_uchar16)(a),
722 1.1 mrg vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
723 1.1 mrg }
724 1.1 mrg
725 1.1 mrg static __inline qword si_clgthi(qword a, unsigned short b)
726 1.1 mrg {
727 1.1 mrg return ((qword)(vec_cmpgt((vec_ushort8)(a),
728 1.1 mrg vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
729 1.1 mrg }
730 1.1 mrg
731 1.1 mrg static __inline qword si_clgti(qword a, unsigned int b)
732 1.1 mrg {
733 1.1 mrg return ((qword)(vec_cmpgt((vec_uint4)(a),
734 1.1 mrg vec_splat((vec_uint4)(si_from_uint(b)), 0))));
735 1.1 mrg }
736 1.1 mrg
/* Double test special value.  Classifies each double of 'a' against
 * the categories selected by bit mask 'b' (per the branches below:
 * 0x40 NaN, 0x20 +inf, 0x10 -inf, 0x08 +0, 0x04 -0, 0x02 +denorm,
 * 0x01 -denorm) and returns an all-ones 64-bit mask where any selected
 * category matched.  Work is skipped for category groups not requested.  */
static __inline qword si_dftsv(qword a, char b)
{
  /* Splat each double's high word across its 64-bit slot.  */
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 result = (vec_uint4){0};
  /* Sign of each double, splatted over its 64 bits as a select mask.  */
  vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi);
  vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask);

  /* vec_slo reads its bit shift count from byte 15 of the control
     vector, hence the int/uchar16 union.  */
  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes (32 bits).  */
  x.i[3] = 4 << 3;

  /* Nan or +inf or -inf  */
  if (b & 0x70)
    {
      vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
      vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
      /* NaN  */
      if (b & 0x40)
        {
          vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
          a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
          a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);
          result = vec_or(result, a_nan);
        }
      /* inf  */
      if (b & 0x30)
        {
          /* True inf: inf exponent in high word AND zero low word.  */
          a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf);
          a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi);
          /* +inf  */
          if (b & 0x20)
            result = vec_or(vec_andc(a_inf, sign), result);
          /* -inf  */
          if (b & 0x10)
            result = vec_or(vec_and(a_inf, sign), result);
        }
    }
  /* 0 or denorm  */
  if (b & 0xF)
    {
      vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0));
      iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
      /* denorm: magnitude at most 0x000FFFFF_FFFFFFFF, but not zero.  */
      if (b & 0x3)
        {
          vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
          vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
          isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);
          /* +denorm  */
          if (b & 0x2)
            result = vec_or(vec_andc(isdenorm, sign), result);
          /* -denorm  */
          if (b & 0x1)
            result = vec_or(vec_and(isdenorm, sign), result);
        }
      /* 0  */
      if (b & 0xC)
        {
          iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);
          /* +0  */
          if (b & 0x8)
            result = vec_or(vec_andc(iszero, sign), result);
          /* -0  */
          if (b & 0x4)
            result = vec_or(vec_and(iszero, sign), result);
        }
    }
  return ((qword)result);
}
812 1.1 mrg
813 1.1 mrg
/* Carry generate: per-word carry-out of _a + _b.
 */
#define si_cg(_a, _b) ((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))

/* Carry generate extended: carry-out of _a + _b + (_c & 1), i.e. the
 * carry from the base add OR'd with the carry from adding the carry-in.
 */
#define si_cgx(_a, _b, _c) ((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)), \
                                           vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \
                                                    vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
821 1.1 mrg
822 1.1 mrg
823 1.1 mrg /* Count ones for bytes
824 1.1 mrg */
825 1.1 mrg static __inline qword si_cntb(qword a)
826 1.1 mrg {
827 1.1 mrg vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
828 1.1 mrg vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
829 1.1 mrg vec_uchar16 av;
830 1.1 mrg
831 1.1 mrg av = (vec_uchar16)(a);
832 1.1 mrg
833 1.1 mrg return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av),
834 1.1 mrg vec_perm(nib_cnt, nib_cnt, vec_sr (av, four)))));
835 1.1 mrg }
836 1.1 mrg
/* Count leading zeros: per-word leading-zero count (result 0..32 in
 * each 32-bit element).  Builds per-byte counts from a nibble lookup,
 * then cascades across the bytes of each word: a byte's count is added
 * only when all bytes to its left were all-zero.
 */
static __inline qword si_clz(qword a)
{
  vec_uchar16 av;
  vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
  vec_uchar16 four = vec_splat_u8(4);
  /* Leading-zero count of a nibble value 0..15.  */
  vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  vec_uchar16 eight = vec_splat_u8(8);
  vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
  vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};

  av = (vec_uchar16)(a);

  cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
  cnt_lo = vec_perm(nib_cnt, nib_cnt, av);

  /* Per-byte count: low nibble only contributes when the high nibble
     is entirely zero (cnt_hi == 4).  */
  cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));

  /* Within each word, line up the counts of bytes 1, 2, 3 under byte 0.  */
  tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
  tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
  tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));

  /* Accumulate the next byte's count only while everything so far was
     all-zero (running total of 8, then 16, then 24).  */
  cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
  cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
  cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));

  /* The word result has accumulated in byte 0; shift it down.  */
  return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
}
866 1.1 mrg
867 1.1 mrg /* Convert to float
868 1.1 mrg */
869 1.1 mrg #define si_cuflt(_a, _b) ((qword)(vec_ctf((vec_uint4)(_a), _b)))
870 1.1 mrg #define si_csflt(_a, _b) ((qword)(vec_ctf((vec_int4)(_a), _b)))
871 1.1 mrg
872 1.1 mrg /* Convert to signed int
873 1.1 mrg */
874 1.1 mrg #define si_cflts(_a, _b) ((qword)(vec_cts((vec_float4)(_a), _b)))
875 1.1 mrg
876 1.1 mrg /* Convert to unsigned int
877 1.1 mrg */
878 1.1 mrg #define si_cfltu(_a, _b) ((qword)(vec_ctu((vec_float4)(_a), _b)))
879 1.1 mrg
880 1.1 mrg /* Synchronize
881 1.1 mrg */
882 1.1 mrg #define si_dsync() /* do nothing */
883 1.1 mrg #define si_sync() /* do nothing */
884 1.1 mrg #define si_syncc() /* do nothing */
885 1.1 mrg
886 1.1 mrg
887 1.1 mrg /* Equivalence
888 1.1 mrg */
889 1.1 mrg static __inline qword si_eqv(qword a, qword b)
890 1.1 mrg {
891 1.1 mrg vec_uchar16 d;
892 1.1 mrg
893 1.1 mrg d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));
894 1.1 mrg return ((qword)(vec_nor(d, d)));
895 1.1 mrg }
896 1.1 mrg
897 1.1 mrg /* Extend
898 1.1 mrg */
899 1.1 mrg static __inline qword si_xsbh(qword a)
900 1.1 mrg {
901 1.1 mrg vec_char16 av;
902 1.1 mrg
903 1.1 mrg av = (vec_char16)(a);
904 1.1 mrg return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15,
905 1.1 mrg 0, 0, 0, 0, 0, 0, 0, 0})))));
906 1.1 mrg }
907 1.1 mrg
908 1.1 mrg static __inline qword si_xshw(qword a)
909 1.1 mrg {
910 1.1 mrg vec_short8 av;
911 1.1 mrg
912 1.1 mrg av = (vec_short8)(a);
913 1.1 mrg return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7,
914 1.1 mrg 10,11,14,15,
915 1.1 mrg 0, 0, 0, 0,
916 1.1 mrg 0, 0, 0, 0})))));
917 1.1 mrg }
918 1.1 mrg
919 1.1 mrg static __inline qword si_xswd(qword a)
920 1.1 mrg {
921 1.1 mrg vec_int4 av;
922 1.1 mrg
923 1.1 mrg av = (vec_int4)(a);
924 1.1 mrg return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})),
925 1.1 mrg ((vec_uchar16){20, 21, 22, 23,
926 1.1 mrg 4, 5, 6, 7,
927 1.1 mrg 28, 29, 30, 31,
928 1.1 mrg 12, 13, 14, 15}))));
929 1.1 mrg }
930 1.1 mrg
931 1.1 mrg static __inline qword si_fesd(qword a)
932 1.1 mrg {
933 1.1 mrg union {
934 1.1 mrg double d[2];
935 1.1 mrg vec_double2 vd;
936 1.1 mrg } out;
937 1.1 mrg union {
938 1.1 mrg float f[4];
939 1.1 mrg vec_float4 vf;
940 1.1 mrg } in;
941 1.1 mrg
942 1.1 mrg in.vf = (vec_float4)(a);
943 1.1 mrg out.d[0] = (double)(in.f[0]);
944 1.1 mrg out.d[1] = (double)(in.f[2]);
945 1.1 mrg return ((qword)(out.vd));
946 1.1 mrg }
947 1.1 mrg
948 1.1 mrg /* Gather
949 1.1 mrg */
950 1.1 mrg static __inline qword si_gbb(qword a)
951 1.1 mrg {
952 1.1 mrg vec_uchar16 bits;
953 1.1 mrg vec_uint4 bytes;
954 1.1 mrg
955 1.1 mrg bits = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
956 1.1 mrg 7, 6, 5, 4, 3, 2, 1, 0}));
957 1.1 mrg bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));
958 1.1 mrg
959 1.1 mrg return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
960 1.1 mrg 0, 0, 0, 0, 0, 0, 0, 0}))));
961 1.1 mrg }
962 1.1 mrg
963 1.1 mrg
964 1.1 mrg static __inline qword si_gbh(qword a)
965 1.1 mrg {
966 1.1 mrg vec_ushort8 bits;
967 1.1 mrg vec_uint4 bytes;
968 1.1 mrg
969 1.1 mrg bits = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));
970 1.1 mrg
971 1.1 mrg bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});
972 1.1 mrg
973 1.1 mrg return ((qword)(vec_sld(bytes, bytes, 12)));
974 1.1 mrg }
975 1.1 mrg
976 1.1 mrg static __inline qword si_gb(qword a)
977 1.1 mrg {
978 1.1 mrg vec_uint4 bits;
979 1.1 mrg vec_uint4 bytes;
980 1.1 mrg
981 1.1 mrg bits = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0}));
982 1.1 mrg bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0}));
983 1.1 mrg return ((qword)(vec_sld(bytes, bytes, 12)));
984 1.1 mrg }
985 1.1 mrg
986 1.1 mrg
987 1.1 mrg /* Compare and halt
988 1.1 mrg */
989 1.1 mrg static __inline void si_heq(qword a, qword b)
990 1.1 mrg {
991 1.1 mrg union {
992 1.1 mrg vector unsigned int v;
993 1.1 mrg unsigned int i[4];
994 1.1 mrg } aa, bb;
995 1.1 mrg
996 1.1 mrg aa.v = (vector unsigned int)(a);
997 1.1 mrg bb.v = (vector unsigned int)(b);
998 1.1 mrg
999 1.1 mrg if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
1000 1.1 mrg }
1001 1.1 mrg
1002 1.1 mrg static __inline void si_heqi(qword a, unsigned int b)
1003 1.1 mrg {
1004 1.1 mrg union {
1005 1.1 mrg vector unsigned int v;
1006 1.1 mrg unsigned int i[4];
1007 1.1 mrg } aa;
1008 1.1 mrg
1009 1.1 mrg aa.v = (vector unsigned int)(a);
1010 1.1 mrg
1011 1.1 mrg if (aa.i[0] == b) { SPU_HALT_ACTION; };
1012 1.1 mrg }
1013 1.1 mrg
1014 1.1 mrg static __inline void si_hgt(qword a, qword b)
1015 1.1 mrg {
1016 1.1 mrg union {
1017 1.1 mrg vector signed int v;
1018 1.1 mrg signed int i[4];
1019 1.1 mrg } aa, bb;
1020 1.1 mrg
1021 1.1 mrg aa.v = (vector signed int)(a);
1022 1.1 mrg bb.v = (vector signed int)(b);
1023 1.1 mrg
1024 1.1 mrg if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1025 1.1 mrg }
1026 1.1 mrg
1027 1.1 mrg static __inline void si_hgti(qword a, signed int b)
1028 1.1 mrg {
1029 1.1 mrg union {
1030 1.1 mrg vector signed int v;
1031 1.1 mrg signed int i[4];
1032 1.1 mrg } aa;
1033 1.1 mrg
1034 1.1 mrg aa.v = (vector signed int)(a);
1035 1.1 mrg
1036 1.1 mrg if (aa.i[0] > b) { SPU_HALT_ACTION; };
1037 1.1 mrg }
1038 1.1 mrg
1039 1.1 mrg static __inline void si_hlgt(qword a, qword b)
1040 1.1 mrg {
1041 1.1 mrg union {
1042 1.1 mrg vector unsigned int v;
1043 1.1 mrg unsigned int i[4];
1044 1.1 mrg } aa, bb;
1045 1.1 mrg
1046 1.1 mrg aa.v = (vector unsigned int)(a);
1047 1.1 mrg bb.v = (vector unsigned int)(b);
1048 1.1 mrg
1049 1.1 mrg if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
1050 1.1 mrg }
1051 1.1 mrg
1052 1.1 mrg static __inline void si_hlgti(qword a, unsigned int b)
1053 1.1 mrg {
1054 1.1 mrg union {
1055 1.1 mrg vector unsigned int v;
1056 1.1 mrg unsigned int i[4];
1057 1.1 mrg } aa;
1058 1.1 mrg
1059 1.1 mrg aa.v = (vector unsigned int)(a);
1060 1.1 mrg
1061 1.1 mrg if (aa.i[0] > b) { SPU_HALT_ACTION; };
1062 1.1 mrg }
1063 1.1 mrg
1064 1.1 mrg
1065 1.1 mrg /* Multiply and Add
1066 1.1 mrg */
1067 1.1 mrg static __inline qword si_mpya(qword a, qword b, qword c)
1068 1.1 mrg {
1069 1.1 mrg return ((qword)(vec_msum(vec_and((vec_short8)(a),
1070 1.1 mrg ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})),
1071 1.1 mrg (vec_short8)(b), (vec_int4)(c))));
1072 1.1 mrg }
1073 1.1 mrg
1074 1.1 mrg static __inline qword si_fma(qword a, qword b, qword c)
1075 1.1 mrg {
1076 1.1 mrg return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1077 1.1 mrg }
1078 1.1 mrg
1079 1.1 mrg static __inline qword si_dfma(qword a, qword b, qword c)
1080 1.1 mrg {
1081 1.1 mrg union {
1082 1.1 mrg vec_double2 v;
1083 1.1 mrg double d[2];
1084 1.1 mrg } aa, bb, cc, dd;
1085 1.1 mrg
1086 1.1 mrg aa.v = (vec_double2)(a);
1087 1.1 mrg bb.v = (vec_double2)(b);
1088 1.1 mrg cc.v = (vec_double2)(c);
1089 1.1 mrg dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
1090 1.1 mrg dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
1091 1.1 mrg return ((qword)(dd.v));
1092 1.1 mrg }
1093 1.1 mrg
1094 1.1 mrg /* Form Mask
1095 1.1 mrg */
1096 1.1 mrg #define si_fsmbi(_a) si_fsmb(si_from_int(_a))
1097 1.1 mrg
1098 1.1 mrg static __inline qword si_fsmb(qword a)
1099 1.1 mrg {
1100 1.1 mrg vec_char16 mask;
1101 1.1 mrg vec_ushort8 in;
1102 1.1 mrg
1103 1.1 mrg in = (vec_ushort8)(a);
1104 1.1 mrg mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
1105 1.1 mrg 3, 3, 3, 3, 3, 3, 3, 3})));
1106 1.1 mrg return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
1107 1.1 mrg 0, 1, 2, 3, 4, 5, 6, 7})),
1108 1.1 mrg vec_splat_u8(7))));
1109 1.1 mrg }
1110 1.1 mrg
1111 1.1 mrg
1112 1.1 mrg static __inline qword si_fsmh(qword a)
1113 1.1 mrg {
1114 1.1 mrg vec_uchar16 in;
1115 1.1 mrg vec_short8 mask;
1116 1.1 mrg
1117 1.1 mrg in = (vec_uchar16)(a);
1118 1.1 mrg mask = (vec_short8)(vec_splat(in, 3));
1119 1.1 mrg return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})),
1120 1.1 mrg vec_splat_u16(15))));
1121 1.1 mrg }
1122 1.1 mrg
1123 1.1 mrg static __inline qword si_fsm(qword a)
1124 1.1 mrg {
1125 1.1 mrg vec_uchar16 in;
1126 1.1 mrg vec_int4 mask;
1127 1.1 mrg
1128 1.1 mrg in = (vec_uchar16)(a);
1129 1.1 mrg mask = (vec_int4)(vec_splat(in, 3));
1130 1.1 mrg return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
1131 1.1 mrg ((vec_uint4){31,31,31,31}))));
1132 1.1 mrg }
1133 1.1 mrg
1134 1.1 mrg /* Move from/to registers
1135 1.1 mrg */
1136 1.1 mrg #define si_fscrrd() ((qword)((vec_uint4){0}))
1137 1.1 mrg #define si_fscrwr(_a)
1138 1.1 mrg
1139 1.1 mrg #define si_mfspr(_reg) ((qword)((vec_uint4){0}))
1140 1.1 mrg #define si_mtspr(_reg, _a)
1141 1.1 mrg
1142 1.1 mrg /* Multiply High High Add
1143 1.1 mrg */
1144 1.1 mrg static __inline qword si_mpyhha(qword a, qword b, qword c)
1145 1.1 mrg {
1146 1.1 mrg return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
1147 1.1 mrg }
1148 1.1 mrg
1149 1.1 mrg static __inline qword si_mpyhhau(qword a, qword b, qword c)
1150 1.1 mrg {
1151 1.1 mrg return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
1152 1.1 mrg }
1153 1.1 mrg
1154 1.1 mrg /* Multiply Subtract
1155 1.1 mrg */
1156 1.1 mrg static __inline qword si_fms(qword a, qword b, qword c)
1157 1.1 mrg {
1158 1.1 mrg return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b),
1159 1.1 mrg vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
1160 1.1 mrg }
1161 1.1 mrg
1162 1.1 mrg static __inline qword si_dfms(qword a, qword b, qword c)
1163 1.1 mrg {
1164 1.1 mrg union {
1165 1.1 mrg vec_double2 v;
1166 1.1 mrg double d[2];
1167 1.1 mrg } aa, bb, cc, dd;
1168 1.1 mrg
1169 1.1 mrg aa.v = (vec_double2)(a);
1170 1.1 mrg bb.v = (vec_double2)(b);
1171 1.1 mrg cc.v = (vec_double2)(c);
1172 1.1 mrg dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
1173 1.1 mrg dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
1174 1.1 mrg return ((qword)(dd.v));
1175 1.1 mrg }
1176 1.1 mrg
1177 1.1 mrg /* Multiply
1178 1.1 mrg */
1179 1.1 mrg static __inline qword si_fm(qword a, qword b)
1180 1.1 mrg {
1181 1.1 mrg return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
1182 1.1 mrg }
1183 1.1 mrg
1184 1.1 mrg static __inline qword si_dfm(qword a, qword b)
1185 1.1 mrg {
1186 1.1 mrg union {
1187 1.1 mrg vec_double2 v;
1188 1.1 mrg double d[2];
1189 1.1 mrg } aa, bb, dd;
1190 1.1 mrg
1191 1.1 mrg aa.v = (vec_double2)(a);
1192 1.1 mrg bb.v = (vec_double2)(b);
1193 1.1 mrg dd.d[0] = aa.d[0] * bb.d[0];
1194 1.1 mrg dd.d[1] = aa.d[1] * bb.d[1];
1195 1.1 mrg return ((qword)(dd.v));
1196 1.1 mrg }
1197 1.1 mrg
1198 1.1 mrg /* Multiply High
1199 1.1 mrg */
1200 1.1 mrg static __inline qword si_mpyh(qword a, qword b)
1201 1.1 mrg {
1202 1.1 mrg vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};
1203 1.1 mrg
1204 1.1 mrg return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
1205 1.1 mrg }
1206 1.1 mrg
1207 1.1 mrg
1208 1.1 mrg /* Multiply High High
1209 1.1 mrg */
1210 1.1 mrg static __inline qword si_mpyhh(qword a, qword b)
1211 1.1 mrg {
1212 1.1 mrg return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
1213 1.1 mrg }
1214 1.1 mrg
1215 1.1 mrg static __inline qword si_mpyhhu(qword a, qword b)
1216 1.1 mrg {
1217 1.1 mrg return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
1218 1.1 mrg }
1219 1.1 mrg
1220 1.1 mrg /* Multiply Odd
1221 1.1 mrg */
1222 1.1 mrg static __inline qword si_mpy(qword a, qword b)
1223 1.1 mrg {
1224 1.1 mrg return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
1225 1.1 mrg }
1226 1.1 mrg
1227 1.1 mrg static __inline qword si_mpyu(qword a, qword b)
1228 1.1 mrg {
1229 1.1 mrg return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
1230 1.1 mrg }
1231 1.1 mrg
1232 1.1 mrg static __inline qword si_mpyi(qword a, short b)
1233 1.1 mrg {
1234 1.1 mrg return ((qword)(vec_mulo((vec_short8)(a),
1235 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1))));
1236 1.1 mrg }
1237 1.1 mrg
1238 1.1 mrg static __inline qword si_mpyui(qword a, unsigned short b)
1239 1.1 mrg {
1240 1.1 mrg return ((qword)(vec_mulo((vec_ushort8)(a),
1241 1.1 mrg vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1242 1.1 mrg }
1243 1.1 mrg
1244 1.1 mrg /* Multiply and Shift Right
1245 1.1 mrg */
1246 1.1 mrg static __inline qword si_mpys(qword a, qword b)
1247 1.1 mrg {
1248 1.1 mrg return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
1249 1.1 mrg }
1250 1.1 mrg
1251 1.1 mrg /* Nand
1252 1.1 mrg */
1253 1.1 mrg static __inline qword si_nand(qword a, qword b)
1254 1.1 mrg {
1255 1.1 mrg vec_uchar16 d;
1256 1.1 mrg
1257 1.1 mrg d = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
1258 1.1 mrg return ((qword)(vec_nor(d, d)));
1259 1.1 mrg }
1260 1.1 mrg
1261 1.1 mrg /* Negative Multiply Add
1262 1.1 mrg */
1263 1.1 mrg static __inline qword si_dfnma(qword a, qword b, qword c)
1264 1.1 mrg {
1265 1.1 mrg union {
1266 1.1 mrg vec_double2 v;
1267 1.1 mrg double d[2];
1268 1.1 mrg } aa, bb, cc, dd;
1269 1.1 mrg
1270 1.1 mrg aa.v = (vec_double2)(a);
1271 1.1 mrg bb.v = (vec_double2)(b);
1272 1.1 mrg cc.v = (vec_double2)(c);
1273 1.1 mrg dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
1274 1.1 mrg dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
1275 1.1 mrg return ((qword)(dd.v));
1276 1.1 mrg }
1277 1.1 mrg
1278 1.1 mrg /* Negative Multiply and Subtract
1279 1.1 mrg */
1280 1.1 mrg static __inline qword si_fnms(qword a, qword b, qword c)
1281 1.1 mrg {
1282 1.1 mrg return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
1283 1.1 mrg }
1284 1.1 mrg
1285 1.1 mrg static __inline qword si_dfnms(qword a, qword b, qword c)
1286 1.1 mrg {
1287 1.1 mrg union {
1288 1.1 mrg vec_double2 v;
1289 1.1 mrg double d[2];
1290 1.1 mrg } aa, bb, cc, dd;
1291 1.1 mrg
1292 1.1 mrg aa.v = (vec_double2)(a);
1293 1.1 mrg bb.v = (vec_double2)(b);
1294 1.1 mrg cc.v = (vec_double2)(c);
1295 1.1 mrg dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
1296 1.1 mrg dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
1297 1.1 mrg return ((qword)(dd.v));
1298 1.1 mrg }
1299 1.1 mrg
1300 1.1 mrg /* Nor
1301 1.1 mrg */
1302 1.1 mrg static __inline qword si_nor(qword a, qword b)
1303 1.1 mrg {
1304 1.1 mrg return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
1305 1.1 mrg }
1306 1.1 mrg
1307 1.1 mrg /* Or
1308 1.1 mrg */
1309 1.1 mrg static __inline qword si_or(qword a, qword b)
1310 1.1 mrg {
1311 1.1 mrg return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
1312 1.1 mrg }
1313 1.1 mrg
1314 1.1 mrg static __inline qword si_orbi(qword a, unsigned char b)
1315 1.1 mrg {
1316 1.1 mrg return ((qword)(vec_or((vec_uchar16)(a),
1317 1.1 mrg vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1318 1.1 mrg }
1319 1.1 mrg
1320 1.1 mrg static __inline qword si_orhi(qword a, unsigned short b)
1321 1.1 mrg {
1322 1.1 mrg return ((qword)(vec_or((vec_ushort8)(a),
1323 1.1 mrg vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1324 1.1 mrg }
1325 1.1 mrg
1326 1.1 mrg static __inline qword si_ori(qword a, unsigned int b)
1327 1.1 mrg {
1328 1.1 mrg return ((qword)(vec_or((vec_uint4)(a),
1329 1.1 mrg vec_splat((vec_uint4)(si_from_uint(b)), 0))));
1330 1.1 mrg }
1331 1.1 mrg
1332 1.1 mrg /* Or Complement
1333 1.1 mrg */
1334 1.1 mrg static __inline qword si_orc(qword a, qword b)
1335 1.1 mrg {
1336 1.1 mrg return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
1337 1.1 mrg }
1338 1.1 mrg
1339 1.1 mrg
1340 1.1 mrg /* Or Across
1341 1.1 mrg */
1342 1.1 mrg static __inline qword si_orx(qword a)
1343 1.1 mrg {
1344 1.1 mrg vec_uchar16 tmp;
1345 1.1 mrg tmp = (vec_uchar16)(a);
1346 1.1 mrg tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
1347 1.1 mrg tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
1348 1.1 mrg return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
1349 1.1 mrg 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
1350 1.1 mrg }
1351 1.1 mrg
1352 1.1 mrg
1353 1.1 mrg /* Estimates
1354 1.1 mrg */
1355 1.1 mrg static __inline qword si_frest(qword a)
1356 1.1 mrg {
1357 1.1 mrg return ((qword)(vec_re((vec_float4)(a))));
1358 1.1 mrg }
1359 1.1 mrg
1360 1.1 mrg static __inline qword si_frsqest(qword a)
1361 1.1 mrg {
1362 1.1 mrg return ((qword)(vec_rsqrte((vec_float4)(a))));
1363 1.1 mrg }
1364 1.1 mrg
1365 1.1 mrg #define si_fi(_a, _d) (_d)
1366 1.1 mrg
1367 1.1 mrg /* Channel Read and Write
1368 1.1 mrg */
1369 1.1 mrg #define si_rdch(_channel) ((qword)(vec_splat_u8(0))) /* not mappable */
1370 1.1 mrg #define si_rchcnt(_channel) ((qword)(vec_splat_u8(0))) /* not mappable */
1371 1.1 mrg #define si_wrch(_channel, _a) /* not mappable */
1372 1.1 mrg
1373 1.1 mrg /* Rotate Left
1374 1.1 mrg */
1375 1.1 mrg static __inline qword si_roth(qword a, qword b)
1376 1.1 mrg {
1377 1.1 mrg return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
1378 1.1 mrg }
1379 1.1 mrg
1380 1.1 mrg static __inline qword si_rot(qword a, qword b)
1381 1.1 mrg {
1382 1.1 mrg return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
1383 1.1 mrg }
1384 1.1 mrg
1385 1.1 mrg static __inline qword si_rothi(qword a, int b)
1386 1.1 mrg {
1387 1.1 mrg return ((qword)(vec_rl((vec_ushort8)(a),
1388 1.1 mrg vec_splat((vec_ushort8)(si_from_int(b)), 1))));
1389 1.1 mrg }
1390 1.1 mrg
1391 1.1 mrg static __inline qword si_roti(qword a, int b)
1392 1.1 mrg {
1393 1.1 mrg return ((qword)(vec_rl((vec_uint4)(a),
1394 1.1 mrg vec_splat((vec_uint4)(si_from_int(b)), 0))));
1395 1.1 mrg }
1396 1.1 mrg
1397 1.1 mrg /* Rotate Left with Mask
1398 1.1 mrg */
1399 1.1 mrg static __inline qword si_rothm(qword a, qword b)
1400 1.1 mrg {
1401 1.1 mrg vec_ushort8 neg_b;
1402 1.1 mrg vec_ushort8 mask;
1403 1.1 mrg
1404 1.1 mrg neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
1405 1.1 mrg mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1406 1.1 mrg return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
1407 1.1 mrg }
1408 1.1 mrg
1409 1.1 mrg static __inline qword si_rotm(qword a, qword b)
1410 1.1 mrg {
1411 1.1 mrg vec_uint4 neg_b;
1412 1.1 mrg vec_uint4 mask;
1413 1.1 mrg
1414 1.1 mrg neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
1415 1.1 mrg mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1416 1.1 mrg return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
1417 1.1 mrg }
1418 1.1 mrg
1419 1.1 mrg static __inline qword si_rothmi(qword a, int b)
1420 1.1 mrg {
1421 1.1 mrg vec_ushort8 neg_b;
1422 1.1 mrg vec_ushort8 mask;
1423 1.1 mrg
1424 1.1 mrg neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
1425 1.1 mrg mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1426 1.1 mrg return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
1427 1.1 mrg }
1428 1.1 mrg
1429 1.1 mrg static __inline qword si_rotmi(qword a, int b)
1430 1.1 mrg {
1431 1.1 mrg vec_uint4 neg_b;
1432 1.1 mrg vec_uint4 mask;
1433 1.1 mrg
1434 1.1 mrg neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
1435 1.1 mrg mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1436 1.1 mrg return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
1437 1.1 mrg }
1438 1.1 mrg
1439 1.1 mrg
1440 1.1 mrg /* Rotate Left Algebraic with Mask
1441 1.1 mrg */
1442 1.1 mrg static __inline qword si_rotmah(qword a, qword b)
1443 1.1 mrg {
1444 1.1 mrg vec_ushort8 neg_b;
1445 1.1 mrg vec_ushort8 mask;
1446 1.1 mrg
1447 1.1 mrg neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
1448 1.1 mrg mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1449 1.1 mrg return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
1450 1.1 mrg }
1451 1.1 mrg
1452 1.1 mrg static __inline qword si_rotma(qword a, qword b)
1453 1.1 mrg {
1454 1.1 mrg vec_uint4 neg_b;
1455 1.1 mrg vec_uint4 mask;
1456 1.1 mrg
1457 1.1 mrg neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
1458 1.1 mrg mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1459 1.1 mrg return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
1460 1.1 mrg }
1461 1.1 mrg
1462 1.1 mrg
1463 1.1 mrg static __inline qword si_rotmahi(qword a, int b)
1464 1.1 mrg {
1465 1.1 mrg vec_ushort8 neg_b;
1466 1.1 mrg vec_ushort8 mask;
1467 1.1 mrg
1468 1.1 mrg neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
1469 1.1 mrg mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
1470 1.1 mrg return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
1471 1.1 mrg }
1472 1.1 mrg
1473 1.1 mrg static __inline qword si_rotmai(qword a, int b)
1474 1.1 mrg {
1475 1.1 mrg vec_uint4 neg_b;
1476 1.1 mrg vec_uint4 mask;
1477 1.1 mrg
1478 1.1 mrg neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
1479 1.1 mrg mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1480 1.1 mrg return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
1481 1.1 mrg }
1482 1.1 mrg
1483 1.1 mrg
1484 1.1 mrg /* Rotate Left Quadword by Bytes with Mask
1485 1.1 mrg */
1486 1.1 mrg static __inline qword si_rotqmbyi(qword a, int count)
1487 1.1 mrg {
1488 1.1 mrg union {
1489 1.1 mrg vec_uchar16 v;
1490 1.1 mrg int i[4];
1491 1.1 mrg } x;
1492 1.1 mrg vec_uchar16 mask;
1493 1.1 mrg
1494 1.1 mrg count = 0 - count;
1495 1.1 mrg x.i[3] = count << 3;
1496 1.1 mrg mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1497 1.1 mrg
1498 1.1 mrg return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1499 1.1 mrg }
1500 1.1 mrg
1501 1.1 mrg
1502 1.1 mrg static __inline qword si_rotqmby(qword a, qword count)
1503 1.1 mrg {
1504 1.1 mrg union {
1505 1.1 mrg vec_uchar16 v;
1506 1.1 mrg int i[4];
1507 1.1 mrg } x;
1508 1.1 mrg int cnt;
1509 1.1 mrg vec_uchar16 mask;
1510 1.1 mrg
1511 1.1 mrg x.v = (vec_uchar16)(count);
1512 1.1 mrg x.i[0] = cnt = (0 - x.i[0]) << 3;
1513 1.1 mrg
1514 1.1 mrg x.v = vec_splat(x.v, 3);
1515 1.1 mrg mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1516 1.1 mrg
1517 1.1 mrg return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1518 1.1 mrg }
1519 1.1 mrg
1520 1.1 mrg
1521 1.1 mrg /* Rotate Left Quadword by Bytes
1522 1.1 mrg */
1523 1.1 mrg static __inline qword si_rotqbyi(qword a, int count)
1524 1.1 mrg {
1525 1.1 mrg union {
1526 1.1 mrg vec_uchar16 v;
1527 1.1 mrg int i[4];
1528 1.1 mrg } left, right;
1529 1.1 mrg
1530 1.1 mrg count <<= 3;
1531 1.1 mrg left.i[3] = count;
1532 1.1 mrg right.i[3] = 0 - count;
1533 1.1 mrg return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
1534 1.1 mrg }
1535 1.1 mrg
1536 1.1 mrg static __inline qword si_rotqby(qword a, qword count)
1537 1.1 mrg {
1538 1.1 mrg vec_uchar16 left, right;
1539 1.1 mrg
1540 1.1 mrg left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
1541 1.1 mrg right = vec_sub(vec_splat_u8(0), left);
1542 1.1 mrg return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
1543 1.1 mrg }
1544 1.1 mrg
1545 1.1 mrg /* Rotate Left Quadword by Bytes Bit Count
1546 1.1 mrg */
1547 1.1 mrg static __inline qword si_rotqbybi(qword a, qword count)
1548 1.1 mrg {
1549 1.1 mrg vec_uchar16 left, right;
1550 1.1 mrg
1551 1.1 mrg left = vec_splat((vec_uchar16)(count), 3);
1552 1.1 mrg right = vec_sub(vec_splat_u8(7), left);
1553 1.1 mrg return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
1554 1.1 mrg }
1555 1.1 mrg
1556 1.1 mrg
1557 1.1 mrg /* Rotate Left Quadword by Bytes Bit Count
1558 1.1 mrg */
1559 1.1 mrg static __inline qword si_rotqbii(qword a, int count)
1560 1.1 mrg {
1561 1.1 mrg vec_uchar16 x, y;
1562 1.1 mrg vec_uchar16 result;
1563 1.1 mrg
1564 1.1 mrg x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
1565 1.1 mrg y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
1566 1.1 mrg (vec_uint4)vec_sub(vec_splat_u8(8), x)));
1567 1.1 mrg result = vec_or(vec_sll((qword)(a), x), y);
1568 1.1 mrg return ((qword)(result));
1569 1.1 mrg }
1570 1.1 mrg
1571 1.1 mrg static __inline qword si_rotqbi(qword a, qword count)
1572 1.1 mrg {
1573 1.1 mrg vec_uchar16 x, y;
1574 1.1 mrg vec_uchar16 result;
1575 1.1 mrg
1576 1.1 mrg x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
1577 1.1 mrg y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
1578 1.1 mrg (vec_uint4)vec_sub(vec_splat_u8(8), x)));
1579 1.1 mrg
1580 1.1 mrg result = vec_or(vec_sll((qword)(a), x), y);
1581 1.1 mrg return ((qword)(result));
1582 1.1 mrg }
1583 1.1 mrg
1584 1.1 mrg
1585 1.1 mrg /* Rotate Left Quadword and Mask by Bits
1586 1.1 mrg */
1587 1.1 mrg static __inline qword si_rotqmbii(qword a, int count)
1588 1.1 mrg {
1589 1.1 mrg return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
1590 1.1 mrg }
1591 1.1 mrg
1592 1.1 mrg static __inline qword si_rotqmbi(qword a, qword count)
1593 1.1 mrg {
1594 1.1 mrg return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
1595 1.1 mrg }
1596 1.1 mrg
1597 1.1 mrg
1598 1.1 mrg /* Rotate Left Quadword and Mask by Bytes with Bit Count
1599 1.1 mrg */
1600 1.1 mrg static __inline qword si_rotqmbybi(qword a, qword count)
1601 1.1 mrg {
1602 1.1 mrg union {
1603 1.1 mrg vec_uchar16 v;
1604 1.1 mrg int i[4];
1605 1.1 mrg } x;
1606 1.1 mrg int cnt;
1607 1.1 mrg vec_uchar16 mask;
1608 1.1 mrg
1609 1.1 mrg x.v = (vec_uchar16)(count);
1610 1.1 mrg x.i[0] = cnt = 0 - (x.i[0] & ~7);
1611 1.1 mrg x.v = vec_splat(x.v, 3);
1612 1.1 mrg mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1613 1.1 mrg
1614 1.1 mrg return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
1615 1.1 mrg }
1616 1.1 mrg
1617 1.1 mrg
1618 1.1 mrg
1619 1.1 mrg
1620 1.1 mrg /* Round Double to Float
1621 1.1 mrg */
1622 1.1 mrg static __inline qword si_frds(qword a)
1623 1.1 mrg {
1624 1.1 mrg union {
1625 1.1 mrg vec_float4 v;
1626 1.1 mrg float f[4];
1627 1.1 mrg } d;
1628 1.1 mrg union {
1629 1.1 mrg vec_double2 v;
1630 1.1 mrg double d[2];
1631 1.1 mrg } in;
1632 1.1 mrg
1633 1.1 mrg in.v = (vec_double2)(a);
1634 1.1 mrg d.v = (vec_float4){0.0f};
1635 1.1 mrg d.f[0] = (float)in.d[0];
1636 1.1 mrg d.f[2] = (float)in.d[1];
1637 1.1 mrg
1638 1.1 mrg return ((qword)(d.v));
1639 1.1 mrg }
1640 1.1 mrg
1641 1.1 mrg /* Select Bits
1642 1.1 mrg */
1643 1.1 mrg static __inline qword si_selb(qword a, qword b, qword c)
1644 1.1 mrg {
1645 1.1 mrg return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c))));
1646 1.1 mrg }
1647 1.1 mrg
1648 1.1 mrg
1649 1.1 mrg /* Shuffle Bytes
1650 1.1 mrg */
1651 1.1 mrg static __inline qword si_shufb(qword a, qword b, qword pattern)
1652 1.1 mrg {
1653 1.1 mrg vec_uchar16 pat;
1654 1.1 mrg
1655 1.1 mrg pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}),
1656 1.1 mrg vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
1657 1.1 mrg vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
1658 1.1 mrg return ((qword)(vec_perm(vec_perm(a, b, pattern),
1659 1.1 mrg ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
1660 1.1 mrg 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
1661 1.1 mrg pat)));
1662 1.1 mrg }
1663 1.1 mrg
1664 1.1 mrg
1665 1.1 mrg /* Shift Left
1666 1.1 mrg */
1667 1.1 mrg static __inline qword si_shlh(qword a, qword b)
1668 1.1 mrg {
1669 1.1 mrg vec_ushort8 mask;
1670 1.1 mrg
1671 1.1 mrg mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
1672 1.1 mrg return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
1673 1.1 mrg }
1674 1.1 mrg
1675 1.1 mrg static __inline qword si_shl(qword a, qword b)
1676 1.1 mrg {
1677 1.1 mrg vec_uint4 mask;
1678 1.1 mrg
1679 1.1 mrg mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1680 1.1 mrg return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
1681 1.1 mrg }
1682 1.1 mrg
1683 1.1 mrg
1684 1.1 mrg static __inline qword si_shlhi(qword a, unsigned int b)
1685 1.1 mrg {
1686 1.1 mrg vec_ushort8 mask;
1687 1.1 mrg vec_ushort8 bv;
1688 1.1 mrg
1689 1.1 mrg bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
1690 1.1 mrg mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
1691 1.1 mrg return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
1692 1.1 mrg }
1693 1.1 mrg
1694 1.1 mrg static __inline qword si_shli(qword a, unsigned int b)
1695 1.1 mrg {
1696 1.1 mrg vec_uint4 bv;
1697 1.1 mrg vec_uint4 mask;
1698 1.1 mrg
1699 1.1 mrg bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
1700 1.1 mrg mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
1701 1.1 mrg return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
1702 1.1 mrg }
1703 1.1 mrg
1704 1.1 mrg
1705 1.1 mrg /* Shift Left Quadword
1706 1.1 mrg */
1707 1.1 mrg static __inline qword si_shlqbii(qword a, unsigned int count)
1708 1.1 mrg {
1709 1.1 mrg vec_uchar16 x;
1710 1.1 mrg
1711 1.1 mrg x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
1712 1.1 mrg return ((qword)(vec_sll((vec_uchar16)(a), x)));
1713 1.1 mrg }
1714 1.1 mrg
1715 1.1 mrg static __inline qword si_shlqbi(qword a, qword count)
1716 1.1 mrg {
1717 1.1 mrg vec_uchar16 x;
1718 1.1 mrg
1719 1.1 mrg x = vec_splat((vec_uchar16)(count), 3);
1720 1.1 mrg return ((qword)(vec_sll((vec_uchar16)(a), x)));
1721 1.1 mrg }
1722 1.1 mrg
1723 1.1 mrg
1724 1.1 mrg /* Shift Left Quadword by Bytes
1725 1.1 mrg */
1726 1.1 mrg static __inline qword si_shlqbyi(qword a, unsigned int count)
1727 1.1 mrg {
1728 1.1 mrg union {
1729 1.1 mrg vec_uchar16 v;
1730 1.1 mrg int i[4];
1731 1.1 mrg } x;
1732 1.1 mrg vec_uchar16 mask;
1733 1.1 mrg
1734 1.1 mrg x.i[3] = count << 3;
1735 1.1 mrg mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
1736 1.1 mrg return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1737 1.1 mrg }
1738 1.1 mrg
1739 1.1 mrg static __inline qword si_shlqby(qword a, qword count)
1740 1.1 mrg {
1741 1.1 mrg union {
1742 1.1 mrg vec_uchar16 v;
1743 1.1 mrg unsigned int i[4];
1744 1.1 mrg } x;
1745 1.1 mrg unsigned int cnt;
1746 1.1 mrg vec_uchar16 mask;
1747 1.1 mrg
1748 1.1 mrg x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
1749 1.1 mrg cnt = x.i[0];
1750 1.1 mrg mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1751 1.1 mrg return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1752 1.1 mrg }
1753 1.1 mrg
1754 1.1 mrg /* Shift Left Quadword by Bytes with Bit Count
1755 1.1 mrg */
1756 1.1 mrg static __inline qword si_shlqbybi(qword a, qword count)
1757 1.1 mrg {
1758 1.1 mrg union {
1759 1.1 mrg vec_uchar16 v;
1760 1.1 mrg int i[4];
1761 1.1 mrg } x;
1762 1.1 mrg unsigned int cnt;
1763 1.1 mrg vec_uchar16 mask;
1764 1.1 mrg
1765 1.1 mrg x.v = vec_splat((vec_uchar16)(count), 3);
1766 1.1 mrg cnt = x.i[0];
1767 1.1 mrg mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
1768 1.1 mrg return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
1769 1.1 mrg }
1770 1.1 mrg
1771 1.1 mrg
1772 1.1 mrg /* Stop and Signal
1773 1.1 mrg */
1774 1.1 mrg #define si_stop(_type) SPU_STOP_ACTION
1775 1.1 mrg #define si_stopd(a, b, c) SPU_STOP_ACTION
1776 1.1 mrg
1777 1.1 mrg
1778 1.1 mrg /* Subtract
1779 1.1 mrg */
1780 1.1 mrg static __inline qword si_sfh(qword a, qword b)
1781 1.1 mrg {
1782 1.1 mrg return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
1783 1.1 mrg }
1784 1.1 mrg
1785 1.1 mrg static __inline qword si_sf(qword a, qword b)
1786 1.1 mrg {
1787 1.1 mrg return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
1788 1.1 mrg }
1789 1.1 mrg
1790 1.1 mrg static __inline qword si_fs(qword a, qword b)
1791 1.1 mrg {
1792 1.1 mrg return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
1793 1.1 mrg }
1794 1.1 mrg
1795 1.1 mrg static __inline qword si_dfs(qword a, qword b)
1796 1.1 mrg {
1797 1.1 mrg union {
1798 1.1 mrg vec_double2 v;
1799 1.1 mrg double d[2];
1800 1.1 mrg } aa, bb, dd;
1801 1.1 mrg
1802 1.1 mrg aa.v = (vec_double2)(a);
1803 1.1 mrg bb.v = (vec_double2)(b);
1804 1.1 mrg dd.d[0] = aa.d[0] - bb.d[0];
1805 1.1 mrg dd.d[1] = aa.d[1] - bb.d[1];
1806 1.1 mrg return ((qword)(dd.v));
1807 1.1 mrg }
1808 1.1 mrg
1809 1.1 mrg static __inline qword si_sfhi(qword a, short b)
1810 1.1 mrg {
1811 1.1 mrg return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
1812 1.1 mrg (vec_short8)(a))));
1813 1.1 mrg }
1814 1.1 mrg
1815 1.1 mrg static __inline qword si_sfi(qword a, int b)
1816 1.1 mrg {
1817 1.1 mrg return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
1818 1.1 mrg (vec_int4)(a))));
1819 1.1 mrg }
1820 1.1 mrg
/* Subtract word extended:  computes _b + ~_a + (_c & 1) per word,
 * i.e. a subtract that consumes the borrow held in the least
 * significant bit of each word of _c.
 */
#define si_sfx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_b),	\
						 vec_nor((vec_uint4)(_a), (vec_uint4)(_a))), \
					 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
1826 1.1 mrg
1827 1.1 mrg
1828 1.1 mrg /* Sum Bytes into Shorts
1829 1.1 mrg */
/* Sum Bytes into Shorts
 *
 * For each 4-byte group, the result word holds two halfword sums:  the
 * upper halfword is the sum of the corresponding bytes of B, the lower
 * halfword the sum of the corresponding bytes of A.
 */
static __inline qword si_sumb(qword a, qword b)
{
  vec_uint4 zero = (vec_uint4){0};
  vec_ushort8 sum_a, sum_b;

  /* vec_sum4s yields one 32-bit sum per 4-byte group; each sum fits in
     16 bits (4 * 255 < 65536), so only the low halfword of every word
     is kept by the permute below.  */
  sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
  sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);

  /* Interleave the low halfwords:  indices 18-19, 22-23, ... select
     B's sums (second operand), 2-3, 6-7, ... select A's sums.  */
  return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19, 2, 3, 22, 23, 6, 7,
                                                        26, 27, 10, 11, 30, 31, 14, 15}))));
}
1841 1.1 mrg
1842 1.1 mrg /* Exclusive OR
1843 1.1 mrg */
1844 1.1 mrg static __inline qword si_xor(qword a, qword b)
1845 1.1 mrg {
1846 1.1 mrg return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
1847 1.1 mrg }
1848 1.1 mrg
1849 1.1 mrg static __inline qword si_xorbi(qword a, unsigned char b)
1850 1.1 mrg {
1851 1.1 mrg return ((qword)(vec_xor((vec_uchar16)(a),
1852 1.1 mrg vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
1853 1.1 mrg }
1854 1.1 mrg
1855 1.1 mrg static __inline qword si_xorhi(qword a, unsigned short b)
1856 1.1 mrg {
1857 1.1 mrg return ((qword)(vec_xor((vec_ushort8)(a),
1858 1.1 mrg vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
1859 1.1 mrg }
1860 1.1 mrg
1861 1.1 mrg static __inline qword si_xori(qword a, unsigned int b)
1862 1.1 mrg {
1863 1.1 mrg return ((qword)(vec_xor((vec_uint4)(a),
1864 1.1 mrg vec_splat((vec_uint4)(si_from_uint(b)), 0))));
1865 1.1 mrg }
1866 1.1 mrg
1867 1.1 mrg
1868 1.1 mrg /* Generate Controls for Sub-Quadword Insertion
1869 1.1 mrg */
1870 1.1 mrg static __inline qword si_cbd(qword a, int imm)
1871 1.1 mrg {
1872 1.1 mrg union {
1873 1.1 mrg vec_uint4 v;
1874 1.1 mrg unsigned char c[16];
1875 1.1 mrg } shmask;
1876 1.1 mrg
1877 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1878 1.1 mrg shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
1879 1.1 mrg return ((qword)(shmask.v));
1880 1.1 mrg }
1881 1.1 mrg
1882 1.1 mrg static __inline qword si_cdd(qword a, int imm)
1883 1.1 mrg {
1884 1.1 mrg union {
1885 1.1 mrg vec_uint4 v;
1886 1.1 mrg unsigned long long ll[2];
1887 1.1 mrg } shmask;
1888 1.1 mrg
1889 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1890 1.1 mrg shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
1891 1.1 mrg return ((qword)(shmask.v));
1892 1.1 mrg }
1893 1.1 mrg
1894 1.1 mrg static __inline qword si_chd(qword a, int imm)
1895 1.1 mrg {
1896 1.1 mrg union {
1897 1.1 mrg vec_uint4 v;
1898 1.1 mrg unsigned short s[8];
1899 1.1 mrg } shmask;
1900 1.1 mrg
1901 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1902 1.1 mrg shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
1903 1.1 mrg return ((qword)(shmask.v));
1904 1.1 mrg }
1905 1.1 mrg
1906 1.1 mrg static __inline qword si_cwd(qword a, int imm)
1907 1.1 mrg {
1908 1.1 mrg union {
1909 1.1 mrg vec_uint4 v;
1910 1.1 mrg unsigned int i[4];
1911 1.1 mrg } shmask;
1912 1.1 mrg
1913 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1914 1.1 mrg shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
1915 1.1 mrg return ((qword)(shmask.v));
1916 1.1 mrg }
1917 1.1 mrg
1918 1.1 mrg static __inline qword si_cbx(qword a, qword b)
1919 1.1 mrg {
1920 1.1 mrg union {
1921 1.1 mrg vec_uint4 v;
1922 1.1 mrg unsigned char c[16];
1923 1.1 mrg } shmask;
1924 1.1 mrg
1925 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1926 1.1 mrg shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
1927 1.1 mrg return ((qword)(shmask.v));
1928 1.1 mrg }
1929 1.1 mrg
1930 1.1 mrg
1931 1.1 mrg static __inline qword si_cdx(qword a, qword b)
1932 1.1 mrg {
1933 1.1 mrg union {
1934 1.1 mrg vec_uint4 v;
1935 1.1 mrg unsigned long long ll[2];
1936 1.1 mrg } shmask;
1937 1.1 mrg
1938 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1939 1.1 mrg shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
1940 1.1 mrg return ((qword)(shmask.v));
1941 1.1 mrg }
1942 1.1 mrg
1943 1.1 mrg static __inline qword si_chx(qword a, qword b)
1944 1.1 mrg {
1945 1.1 mrg union {
1946 1.1 mrg vec_uint4 v;
1947 1.1 mrg unsigned short s[8];
1948 1.1 mrg } shmask;
1949 1.1 mrg
1950 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1951 1.1 mrg shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
1952 1.1 mrg return ((qword)(shmask.v));
1953 1.1 mrg }
1954 1.1 mrg
1955 1.1 mrg static __inline qword si_cwx(qword a, qword b)
1956 1.1 mrg {
1957 1.1 mrg union {
1958 1.1 mrg vec_uint4 v;
1959 1.1 mrg unsigned int i[4];
1960 1.1 mrg } shmask;
1961 1.1 mrg
1962 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
1963 1.1 mrg shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
1964 1.1 mrg return ((qword)(shmask.v));
1965 1.1 mrg }
1966 1.1 mrg
1967 1.1 mrg
1968 1.1 mrg /* Constant Formation
1969 1.1 mrg */
1970 1.1 mrg static __inline qword si_il(signed short imm)
1971 1.1 mrg {
1972 1.1 mrg return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
1973 1.1 mrg }
1974 1.1 mrg
1975 1.1 mrg
1976 1.1 mrg static __inline qword si_ila(unsigned int imm)
1977 1.1 mrg {
1978 1.1 mrg return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
1979 1.1 mrg }
1980 1.1 mrg
1981 1.1 mrg static __inline qword si_ilh(signed short imm)
1982 1.1 mrg {
1983 1.1 mrg return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
1984 1.1 mrg }
1985 1.1 mrg
1986 1.1 mrg static __inline qword si_ilhu(signed short imm)
1987 1.1 mrg {
1988 1.1 mrg return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
1989 1.1 mrg }
1990 1.1 mrg
1991 1.1 mrg static __inline qword si_iohl(qword a, unsigned short imm)
1992 1.1 mrg {
1993 1.1 mrg return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
1994 1.1 mrg }
1995 1.1 mrg
/* No Operation
 *
 * The SPU (l)nop instructions exist only for pipeline scheduling, so
 * they expand to nothing on the PPU/VMX side.
 */
#define si_lnop()		/* do nothing */
#define si_nop()		/* do nothing */
2000 1.1 mrg
2001 1.1 mrg
2002 1.1 mrg /* Memory Load and Store
2003 1.1 mrg */
2004 1.1 mrg static __inline qword si_lqa(unsigned int imm)
2005 1.1 mrg {
2006 1.1 mrg return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2007 1.1 mrg }
2008 1.1 mrg
/* Load quadword from the address A + IMM, with A forced to 16-byte
   alignment.  NOTE(review): vec_ld simply sums its base and offset
   operands, so IMM is passed as the "pointer" and A as the offset —
   intentional, but confirm against vec_ld's effective-address rules.  */
static __inline qword si_lqd(qword a, unsigned int imm)
{
  return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
}
2013 1.1 mrg
2014 1.1 mrg static __inline qword si_lqr(unsigned int imm)
2015 1.1 mrg {
2016 1.1 mrg return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
2017 1.1 mrg }
2018 1.1 mrg
2019 1.1 mrg static __inline qword si_lqx(qword a, qword b)
2020 1.1 mrg {
2021 1.1 mrg return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
2022 1.1 mrg }
2023 1.1 mrg
2024 1.1 mrg static __inline void si_stqa(qword a, unsigned int imm)
2025 1.1 mrg {
2026 1.1 mrg vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2027 1.1 mrg }
2028 1.1 mrg
/* Store quadword to the address B + IMM, with B forced to 16-byte
   alignment.  NOTE(review): as with si_lqd, vec_st sums base and
   offset, so IMM acts as the "pointer" and B as the offset.  */
static __inline void si_stqd(qword a, qword b, unsigned int imm)
{
  vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
}
2033 1.1 mrg
2034 1.1 mrg static __inline void si_stqr(qword a, unsigned int imm)
2035 1.1 mrg {
2036 1.1 mrg vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
2037 1.1 mrg }
2038 1.1 mrg
2039 1.1 mrg static __inline void si_stqx(qword a, qword b, qword c)
2040 1.1 mrg {
2041 1.1 mrg vec_st((vec_uchar16)(a),
2042 1.1 mrg si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
2043 1.1 mrg (vector unsigned char *)(0));
2044 1.1 mrg }
2045 1.1 mrg
2046 1.1 mrg #endif /* !__SPU__ */
2047 1.1 mrg #endif /* !_SI2VMX_H_ */
2048 1.1 mrg
2049