1 1.1 mrg /* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics 2 1.12 mrg Copyright (C) 2007-2022 Free Software Foundation, Inc. 3 1.1 mrg 4 1.1 mrg This file is free software; you can redistribute it and/or modify it under 5 1.1 mrg the terms of the GNU General Public License as published by the Free 6 1.1 mrg Software Foundation; either version 3 of the License, or (at your option) 7 1.1 mrg any later version. 8 1.1 mrg 9 1.1 mrg This file is distributed in the hope that it will be useful, but WITHOUT 10 1.1 mrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 1.1 mrg FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 1.1 mrg for more details. 13 1.1 mrg 14 1.1 mrg Under Section 7 of GPL version 3, you are granted additional 15 1.1 mrg permissions described in the GCC Runtime Library Exception, version 16 1.1 mrg 3.1, as published by the Free Software Foundation. 17 1.1 mrg 18 1.1 mrg You should have received a copy of the GNU General Public License and 19 1.1 mrg a copy of the GCC Runtime Library Exception along with this program; 20 1.1 mrg see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 21 1.1 mrg <http://www.gnu.org/licenses/>. */ 22 1.1 mrg 23 1.1 mrg #ifndef _SI2VMX_H_ 24 1.1 mrg #define _SI2VMX_H_ 1 25 1.1 mrg 26 1.1 mrg #ifndef __SPU__ 27 1.1 mrg 28 1.1 mrg #include <stdlib.h> 29 1.1 mrg #include <vec_types.h> 30 1.1 mrg 31 1.1 mrg 32 1.1 mrg /* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics. 33 1.1 mrg * Users can override the action by defining it prior to including this 34 1.1 mrg * header file. 35 1.1 mrg */ 36 1.1 mrg #ifndef SPU_HALT_ACTION 37 1.1 mrg #define SPU_HALT_ACTION abort() 38 1.1 mrg #endif 39 1.1 mrg 40 1.1 mrg /* Specify a default stop action for the spu_stop intrinsic. 41 1.1 mrg * Users can override the action by defining it prior to including this 42 1.1 mrg * header file. 
43 1.1 mrg */ 44 1.1 mrg #ifndef SPU_STOP_ACTION 45 1.1 mrg #define SPU_STOP_ACTION abort() 46 1.1 mrg #endif 47 1.1 mrg 48 1.1 mrg 49 1.1 mrg /* Specify a default action for unsupported intrinsic. 50 1.1 mrg * Users can override the action by defining it prior to including this 51 1.1 mrg * header file. 52 1.1 mrg */ 53 1.1 mrg #ifndef SPU_UNSUPPORTED_ACTION 54 1.1 mrg #define SPU_UNSUPPORTED_ACTION abort() 55 1.1 mrg #endif 56 1.1 mrg 57 1.1 mrg 58 1.1 mrg /* Casting intrinsics - from scalar to quadword 59 1.1 mrg */ 60 1.1 mrg 61 1.1 mrg static __inline qword si_from_uchar(unsigned char c) { 62 1.1 mrg union { 63 1.1 mrg qword q; 64 1.1 mrg unsigned char c[16]; 65 1.1 mrg } x; 66 1.1 mrg x.c[3] = c; 67 1.1 mrg return (x.q); 68 1.1 mrg } 69 1.1 mrg 70 1.1 mrg static __inline qword si_from_char(signed char c) { 71 1.1 mrg union { 72 1.1 mrg qword q; 73 1.1 mrg signed char c[16]; 74 1.1 mrg } x; 75 1.1 mrg x.c[3] = c; 76 1.1 mrg return (x.q); 77 1.1 mrg } 78 1.1 mrg 79 1.1 mrg static __inline qword si_from_ushort(unsigned short s) { 80 1.1 mrg union { 81 1.1 mrg qword q; 82 1.1 mrg unsigned short s[8]; 83 1.1 mrg } x; 84 1.1 mrg x.s[1] = s; 85 1.1 mrg return (x.q); 86 1.1 mrg } 87 1.1 mrg 88 1.1 mrg static __inline qword si_from_short(short s) { 89 1.1 mrg union { 90 1.1 mrg qword q; 91 1.1 mrg short s[8]; 92 1.1 mrg } x; 93 1.1 mrg x.s[1] = s; 94 1.1 mrg return (x.q); 95 1.1 mrg } 96 1.1 mrg 97 1.1 mrg 98 1.1 mrg static __inline qword si_from_uint(unsigned int i) { 99 1.1 mrg union { 100 1.1 mrg qword q; 101 1.1 mrg unsigned int i[4]; 102 1.1 mrg } x; 103 1.1 mrg x.i[0] = i; 104 1.1 mrg return (x.q); 105 1.1 mrg } 106 1.1 mrg 107 1.1 mrg static __inline qword si_from_int(int i) { 108 1.1 mrg union { 109 1.1 mrg qword q; 110 1.1 mrg int i[4]; 111 1.1 mrg } x; 112 1.1 mrg x.i[0] = i; 113 1.1 mrg return (x.q); 114 1.1 mrg } 115 1.1 mrg 116 1.1 mrg static __inline qword si_from_ullong(unsigned long long l) { 117 1.1 mrg union { 118 1.1 mrg qword q; 119 1.1 mrg 
unsigned long long l[2]; 120 1.1 mrg } x; 121 1.1 mrg x.l[0] = l; 122 1.1 mrg return (x.q); 123 1.1 mrg } 124 1.1 mrg 125 1.1 mrg static __inline qword si_from_llong(long long l) { 126 1.1 mrg union { 127 1.1 mrg qword q; 128 1.1 mrg long long l[2]; 129 1.1 mrg } x; 130 1.1 mrg x.l[0] = l; 131 1.1 mrg return (x.q); 132 1.1 mrg } 133 1.1 mrg 134 1.1 mrg static __inline qword si_from_float(float f) { 135 1.1 mrg union { 136 1.1 mrg qword q; 137 1.1 mrg float f[4]; 138 1.1 mrg } x; 139 1.1 mrg x.f[0] = f; 140 1.1 mrg return (x.q); 141 1.1 mrg } 142 1.1 mrg 143 1.1 mrg static __inline qword si_from_double(double d) { 144 1.1 mrg union { 145 1.1 mrg qword q; 146 1.1 mrg double d[2]; 147 1.1 mrg } x; 148 1.1 mrg x.d[0] = d; 149 1.1 mrg return (x.q); 150 1.1 mrg } 151 1.1 mrg 152 1.1 mrg static __inline qword si_from_ptr(void *ptr) { 153 1.1 mrg union { 154 1.1 mrg qword q; 155 1.1 mrg void *p; 156 1.1 mrg } x; 157 1.1 mrg x.p = ptr; 158 1.1 mrg return (x.q); 159 1.1 mrg } 160 1.1 mrg 161 1.1 mrg 162 1.1 mrg /* Casting intrinsics - from quadword to scalar 163 1.1 mrg */ 164 1.1 mrg static __inline unsigned char si_to_uchar(qword q) { 165 1.1 mrg union { 166 1.1 mrg qword q; 167 1.1 mrg unsigned char c[16]; 168 1.1 mrg } x; 169 1.1 mrg x.q = q; 170 1.1 mrg return (x.c[3]); 171 1.1 mrg } 172 1.1 mrg 173 1.1 mrg static __inline signed char si_to_char(qword q) { 174 1.1 mrg union { 175 1.1 mrg qword q; 176 1.1 mrg signed char c[16]; 177 1.1 mrg } x; 178 1.1 mrg x.q = q; 179 1.1 mrg return (x.c[3]); 180 1.1 mrg } 181 1.1 mrg 182 1.1 mrg static __inline unsigned short si_to_ushort(qword q) { 183 1.1 mrg union { 184 1.1 mrg qword q; 185 1.1 mrg unsigned short s[8]; 186 1.1 mrg } x; 187 1.1 mrg x.q = q; 188 1.1 mrg return (x.s[1]); 189 1.1 mrg } 190 1.1 mrg 191 1.1 mrg static __inline short si_to_short(qword q) { 192 1.1 mrg union { 193 1.1 mrg qword q; 194 1.1 mrg short s[8]; 195 1.1 mrg } x; 196 1.1 mrg x.q = q; 197 1.1 mrg return (x.s[1]); 198 1.1 mrg } 199 1.1 mrg 200 1.1 mrg 
static __inline unsigned int si_to_uint(qword q) {
  union {
    qword q;
    unsigned int i[4];
  } x;
  x.q = q;
  return (x.i[0]);
}

static __inline int si_to_int(qword q) {
  union {
    qword q;
    int i[4];
  } x;
  x.q = q;
  return (x.i[0]);
}

static __inline unsigned long long si_to_ullong(qword q) {
  union {
    qword q;
    unsigned long long l[2];
  } x;
  x.q = q;
  return (x.l[0]);
}

static __inline long long si_to_llong(qword q) {
  union {
    qword q;
    long long l[2];
  } x;
  x.q = q;
  return (x.l[0]);
}

static __inline float si_to_float(qword q) {
  union {
    qword q;
    float f[4];
  } x;
  x.q = q;
  return (x.f[0]);
}

static __inline double si_to_double(qword q) {
  union {
    qword q;
    double d[2];
  } x;
  x.q = q;
  return (x.d[0]);
}

static __inline void * si_to_ptr(qword q) {
  union {
    qword q;
    void *p;
  } x;
  x.q = q;
  return (x.p);
}


/* Absolute difference of each unsigned byte: |a[i] - b[i]|.
 * Computed by selecting (a-b) where a>b, else (b-a).
 */
static __inline qword si_absdb(qword a, qword b)
{
  vec_uchar16 ac, bc, dc;

  ac = (vec_uchar16)(a);
  bc = (vec_uchar16)(b);
  dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));

  return ((qword)(dc));
}

/* Add intrinsics
 */

#define si_a(_a, _b) ((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b)))) 280 1.1 mrg 281 1.1 mrg #define si_ah(_a, _b) ((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b)))) 282 1.1 mrg 283 1.1 mrg static __inline qword si_ai(qword a, int b) 284 1.1 mrg { 285 1.1 mrg return ((qword)(vec_add((vec_int4)(a), 286 1.1 mrg vec_splat((vec_int4)(si_from_int(b)), 0)))); 287 1.1 mrg } 288 1.1 mrg 289 1.1 mrg 290 1.1 mrg static __inline qword si_ahi(qword a, short b) 291 1.1 mrg { 292 1.1 mrg return ((qword)(vec_add((vec_short8)(a), 293 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1)))); 294 1.1 mrg } 295 1.1 mrg 296 1.1 mrg 297 1.1 mrg #define si_fa(_a, _b) ((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b)))) 298 1.1 mrg 299 1.1 mrg 300 1.1 mrg static __inline qword si_dfa(qword a, qword b) 301 1.1 mrg { 302 1.1 mrg union { 303 1.1 mrg vec_double2 v; 304 1.1 mrg double d[2]; 305 1.1 mrg } ad, bd, dd; 306 1.1 mrg 307 1.1 mrg ad.v = (vec_double2)(a); 308 1.1 mrg bd.v = (vec_double2)(b); 309 1.1 mrg dd.d[0] = ad.d[0] + bd.d[0]; 310 1.1 mrg dd.d[1] = ad.d[1] + bd.d[1]; 311 1.1 mrg 312 1.1 mrg return ((qword)(dd.v)); 313 1.1 mrg } 314 1.1 mrg 315 1.1 mrg /* Add word extended 316 1.1 mrg */ 317 1.1 mrg #define si_addx(_a, _b, _c) ((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \ 318 1.1 mrg vec_and((vec_uint4)(_c), vec_splat_u32(1))))) 319 1.1 mrg 320 1.1 mrg 321 1.1 mrg /* Bit-wise AND 322 1.1 mrg */ 323 1.1 mrg #define si_and(_a, _b) ((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b)))) 324 1.1 mrg 325 1.1 mrg 326 1.1 mrg static __inline qword si_andbi(qword a, signed char b) 327 1.1 mrg { 328 1.1 mrg return ((qword)(vec_and((vec_char16)(a), 329 1.1 mrg vec_splat((vec_char16)(si_from_char(b)), 3)))); 330 1.1 mrg } 331 1.1 mrg 332 1.1 mrg static __inline qword si_andhi(qword a, signed short b) 333 1.1 mrg { 334 1.1 mrg return ((qword)(vec_and((vec_short8)(a), 335 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1)))); 336 1.1 mrg } 337 1.1 mrg 338 1.1 mrg 
339 1.1 mrg static __inline qword si_andi(qword a, signed int b) 340 1.1 mrg { 341 1.1 mrg return ((qword)(vec_and((vec_int4)(a), 342 1.1 mrg vec_splat((vec_int4)(si_from_int(b)), 0)))); 343 1.1 mrg } 344 1.1 mrg 345 1.1 mrg 346 1.1 mrg /* Bit-wise AND with complement 347 1.1 mrg */ 348 1.1 mrg #define si_andc(_a, _b) ((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b)))) 349 1.1 mrg 350 1.1 mrg 351 1.1 mrg /* Average byte vectors 352 1.1 mrg */ 353 1.1 mrg #define si_avgb(_a, _b) ((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b)))) 354 1.1 mrg 355 1.1 mrg 356 1.1 mrg /* Branch indirect and set link on external data 357 1.1 mrg */ 358 1.1 mrg #define si_bisled(_func) /* not mappable */ 359 1.1 mrg #define si_bisledd(_func) /* not mappable */ 360 1.1 mrg #define si_bislede(_func) /* not mappable */ 361 1.1 mrg 362 1.1 mrg 363 1.1 mrg /* Borrow generate 364 1.1 mrg */ 365 1.1 mrg #define si_bg(_a, _b) ((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a)))) 366 1.1 mrg 367 1.1 mrg #define si_bgx(_a, _b, _c) ((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)), \ 368 1.1 mrg vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)), \ 369 1.1 mrg (vec_uint4)(_c))), vec_splat_u32(1)))) 370 1.1 mrg 371 1.1 mrg /* Compare absolute equal 372 1.1 mrg */ 373 1.1 mrg static __inline qword si_fcmeq(qword a, qword b) 374 1.1 mrg { 375 1.1 mrg vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000}); 376 1.1 mrg 377 1.1 mrg return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb), 378 1.1 mrg vec_andc((vec_float4)(b), msb)))); 379 1.1 mrg } 380 1.1 mrg 381 1.1 mrg static __inline qword si_dfcmeq(qword a, qword b) 382 1.1 mrg { 383 1.1 mrg vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF }; 384 1.1 mrg vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 }; 385 1.1 mrg vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27}; 386 1.1 mrg 
387 1.1 mrg vec_uint4 biteq; 388 1.1 mrg vec_uint4 aabs; 389 1.1 mrg vec_uint4 babs; 390 1.1 mrg vec_uint4 a_gt; 391 1.1 mrg vec_uint4 ahi_inf; 392 1.1 mrg vec_uint4 anan; 393 1.1 mrg vec_uint4 result; 394 1.1 mrg 395 1.1 mrg union { 396 1.1 mrg vec_uchar16 v; 397 1.1 mrg int i[4]; 398 1.1 mrg } x; 399 1.1 mrg 400 1.1 mrg /* Shift 4 bytes */ 401 1.1 mrg x.i[3] = 4 << 3; 402 1.1 mrg 403 1.1 mrg /* Mask out sign bits */ 404 1.1 mrg aabs = vec_and((vec_uint4)a,sign_mask); 405 1.1 mrg babs = vec_and((vec_uint4)b,sign_mask); 406 1.1 mrg 407 1.1 mrg /* A) Check for bit equality, store in high word */ 408 1.1 mrg biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs); 409 1.1 mrg biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v)); 410 1.1 mrg 411 1.1 mrg /* 412 1.1 mrg B) Check if a is NaN, store in high word 413 1.1 mrg 414 1.1 mrg B1) If the high word is greater than max_exp (indicates a NaN) 415 1.1 mrg B2) If the low word is greater than 0 416 1.1 mrg */ 417 1.1 mrg a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask); 418 1.1 mrg 419 1.1 mrg /* B3) Check if the high word is equal to the inf exponent */ 420 1.1 mrg ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask); 421 1.1 mrg 422 1.1 mrg /* anan = B1[hi] or (B2[lo] and B3[hi]) */ 423 1.1 mrg anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf)); 424 1.1 mrg 425 1.1 mrg /* result = A and not B */ 426 1.1 mrg result = vec_andc(biteq, anan); 427 1.1 mrg 428 1.1 mrg /* Promote high words to 64 bits and return */ 429 1.1 mrg return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote))); 430 1.1 mrg } 431 1.1 mrg 432 1.1 mrg 433 1.1 mrg /* Compare absolute greater than 434 1.1 mrg */ 435 1.1 mrg static __inline qword si_fcmgt(qword a, qword b) 436 1.1 mrg { 437 1.1 mrg vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000}); 438 1.1 mrg 439 1.1 mrg return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb), 440 1.1 mrg 
vec_andc((vec_float4)(b), msb)))); 441 1.1 mrg } 442 1.1 mrg 443 1.1 mrg static __inline qword si_dfcmgt(qword a, qword b) 444 1.1 mrg { 445 1.1 mrg vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 446 1.1 mrg vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 }; 447 1.1 mrg vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF }; 448 1.1 mrg 449 1.1 mrg union { 450 1.1 mrg vec_uchar16 v; 451 1.1 mrg int i[4]; 452 1.1 mrg } x; 453 1.1 mrg 454 1.1 mrg /* Shift 4 bytes */ 455 1.1 mrg x.i[3] = 4 << 3; 456 1.1 mrg 457 1.1 mrg // absolute value of a,b 458 1.1 mrg vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask); 459 1.1 mrg vec_uint4 babs = vec_and((vec_uint4)b, sign_mask); 460 1.1 mrg 461 1.1 mrg // check if a is nan 462 1.1 mrg vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask); 463 1.1 mrg vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask); 464 1.1 mrg a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf)); 465 1.1 mrg a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi); 466 1.1 mrg 467 1.1 mrg // check if b is nan 468 1.1 mrg vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask); 469 1.1 mrg vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask); 470 1.1 mrg b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf)); 471 1.1 mrg b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi); 472 1.1 mrg 473 1.1 mrg // A) Check if the exponents are different 474 1.1 mrg vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs); 475 1.1 mrg 476 1.1 mrg // B) Check if high word equal, and low word greater 477 1.1 mrg vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs); 478 1.1 mrg vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs); 479 1.1 mrg vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v)); 480 1.1 mrg 481 1.1 mrg // If either A or B is true, return true (unless NaNs detected) 482 1.1 
mrg vec_uint4 r = vec_or(gt_hi, eqgt); 483 1.1 mrg 484 1.1 mrg // splat the high words of the comparison step 485 1.1 mrg r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi); 486 1.1 mrg 487 1.1 mrg // correct for NaNs in input 488 1.1 mrg return ((qword)vec_andc(r,vec_or(a_nan,b_nan))); 489 1.1 mrg } 490 1.1 mrg 491 1.1 mrg 492 1.1 mrg /* Compare equal 493 1.1 mrg */ 494 1.1 mrg static __inline qword si_ceqb(qword a, qword b) 495 1.1 mrg { 496 1.1 mrg return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b)))); 497 1.1 mrg } 498 1.1 mrg 499 1.1 mrg static __inline qword si_ceqh(qword a, qword b) 500 1.1 mrg { 501 1.1 mrg return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b)))); 502 1.1 mrg } 503 1.1 mrg 504 1.1 mrg static __inline qword si_ceq(qword a, qword b) 505 1.1 mrg { 506 1.1 mrg return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b)))); 507 1.1 mrg } 508 1.1 mrg 509 1.1 mrg static __inline qword si_fceq(qword a, qword b) 510 1.1 mrg { 511 1.1 mrg return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b)))); 512 1.1 mrg } 513 1.1 mrg 514 1.1 mrg static __inline qword si_ceqbi(qword a, signed char b) 515 1.1 mrg { 516 1.1 mrg return ((qword)(vec_cmpeq((vec_char16)(a), 517 1.1 mrg vec_splat((vec_char16)(si_from_char(b)), 3)))); 518 1.1 mrg } 519 1.1 mrg 520 1.1 mrg static __inline qword si_ceqhi(qword a, signed short b) 521 1.1 mrg { 522 1.1 mrg return ((qword)(vec_cmpeq((vec_short8)(a), 523 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1)))); 524 1.1 mrg } 525 1.1 mrg 526 1.1 mrg static __inline qword si_ceqi(qword a, signed int b) 527 1.1 mrg { 528 1.1 mrg return ((qword)(vec_cmpeq((vec_int4)(a), 529 1.1 mrg vec_splat((vec_int4)(si_from_int(b)), 0)))); 530 1.1 mrg } 531 1.1 mrg 532 1.1 mrg static __inline qword si_dfceq(qword a, qword b) 533 1.1 mrg { 534 1.1 mrg vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF }; 535 1.1 mrg vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 
0x00000000 }; 536 1.1 mrg vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3, 16,17,18,19, 8,9,10,11, 24,25,26,27}; 537 1.1 mrg 538 1.1 mrg vec_uint4 biteq; 539 1.1 mrg vec_uint4 aabs; 540 1.1 mrg vec_uint4 babs; 541 1.1 mrg vec_uint4 a_gt; 542 1.1 mrg vec_uint4 ahi_inf; 543 1.1 mrg vec_uint4 anan; 544 1.1 mrg vec_uint4 iszero; 545 1.1 mrg vec_uint4 result; 546 1.1 mrg 547 1.1 mrg union { 548 1.1 mrg vec_uchar16 v; 549 1.1 mrg int i[4]; 550 1.1 mrg } x; 551 1.1 mrg 552 1.1 mrg /* Shift 4 bytes */ 553 1.1 mrg x.i[3] = 4 << 3; 554 1.1 mrg 555 1.1 mrg /* A) Check for bit equality, store in high word */ 556 1.1 mrg biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b); 557 1.1 mrg biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v)); 558 1.1 mrg 559 1.1 mrg /* Mask out sign bits */ 560 1.1 mrg aabs = vec_and((vec_uint4)a,sign_mask); 561 1.1 mrg babs = vec_and((vec_uint4)b,sign_mask); 562 1.1 mrg 563 1.1 mrg /* 564 1.1 mrg B) Check if a is NaN, store in high word 565 1.1 mrg 566 1.1 mrg B1) If the high word is greater than max_exp (indicates a NaN) 567 1.1 mrg B2) If the low word is greater than 0 568 1.1 mrg */ 569 1.1 mrg a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask); 570 1.1 mrg 571 1.1 mrg /* B3) Check if the high word is equal to the inf exponent */ 572 1.1 mrg ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask); 573 1.1 mrg 574 1.1 mrg /* anan = B1[hi] or (B2[lo] and B3[hi]) */ 575 1.1 mrg anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf)); 576 1.1 mrg 577 1.1 mrg /* C) Check for 0 = -0 special case */ 578 1.1 mrg iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0)); 579 1.1 mrg iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v)); 580 1.1 mrg 581 1.1 mrg /* result = (A or C) and not B */ 582 1.1 mrg result = vec_or(biteq,iszero); 583 1.1 mrg result = vec_andc(result, anan); 584 1.1 mrg 585 1.1 mrg /* Promote high words to 64 bits and return */ 586 1.1 mrg return 
((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote))); 587 1.1 mrg } 588 1.1 mrg 589 1.1 mrg 590 1.1 mrg /* Compare greater than 591 1.1 mrg */ 592 1.1 mrg static __inline qword si_cgtb(qword a, qword b) 593 1.1 mrg { 594 1.1 mrg return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b)))); 595 1.1 mrg } 596 1.1 mrg 597 1.1 mrg static __inline qword si_cgth(qword a, qword b) 598 1.1 mrg { 599 1.1 mrg return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b)))); 600 1.1 mrg } 601 1.1 mrg 602 1.1 mrg static __inline qword si_cgt(qword a, qword b) 603 1.1 mrg { 604 1.1 mrg return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b)))); 605 1.1 mrg } 606 1.1 mrg 607 1.1 mrg static __inline qword si_clgtb(qword a, qword b) 608 1.1 mrg { 609 1.1 mrg return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b)))); 610 1.1 mrg } 611 1.1 mrg 612 1.1 mrg static __inline qword si_clgth(qword a, qword b) 613 1.1 mrg { 614 1.1 mrg return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b)))); 615 1.1 mrg } 616 1.1 mrg 617 1.1 mrg static __inline qword si_clgt(qword a, qword b) 618 1.1 mrg { 619 1.1 mrg return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b)))); 620 1.1 mrg } 621 1.1 mrg 622 1.1 mrg static __inline qword si_fcgt(qword a, qword b) 623 1.1 mrg { 624 1.1 mrg return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b)))); 625 1.1 mrg } 626 1.1 mrg 627 1.1 mrg static __inline qword si_dfcgt(qword a, qword b) 628 1.1 mrg { 629 1.1 mrg vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 630 1.1 mrg vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 }; 631 1.1 mrg vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 }; 632 1.1 mrg vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF }; 633 1.1 mrg 634 1.1 mrg union { 635 1.1 mrg vec_uchar16 v; 636 1.1 mrg int i[4]; 637 1.1 mrg } x; 638 1.1 mrg 639 1.1 mrg /* Shift 4 bytes */ 640 
1.1 mrg x.i[3] = 4 << 3; 641 1.1 mrg 642 1.1 mrg // absolute value of a,b 643 1.1 mrg vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask); 644 1.1 mrg vec_uint4 babs = vec_and((vec_uint4)b, sign_mask); 645 1.1 mrg 646 1.1 mrg // check if a is nan 647 1.1 mrg vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask); 648 1.1 mrg vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask); 649 1.1 mrg a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf)); 650 1.1 mrg a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi); 651 1.1 mrg 652 1.1 mrg // check if b is nan 653 1.1 mrg vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask); 654 1.1 mrg vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask); 655 1.1 mrg b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf)); 656 1.1 mrg b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi); 657 1.1 mrg 658 1.1 mrg // sign of a 659 1.1 mrg vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0)); 660 1.1 mrg asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi); 661 1.1 mrg 662 1.1 mrg // sign of b 663 1.1 mrg vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0)); 664 1.1 mrg bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi); 665 1.1 mrg 666 1.1 mrg // negative a 667 1.1 mrg vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs); 668 1.1 mrg vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7))); 669 1.1 mrg abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat)); 670 1.1 mrg vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), 
vec_and(abor, vec_splat_u32(1))); 671 1.1 mrg 672 1.1 mrg // pick the one we want 673 1.1 mrg vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel); 674 1.1 mrg 675 1.1 mrg // negative b 676 1.1 mrg vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs); 677 1.1 mrg bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat)); 678 1.1 mrg vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1))); 679 1.1 mrg 680 1.1 mrg // pick the one we want 681 1.1 mrg vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel); 682 1.1 mrg 683 1.1 mrg // A) Check if the exponents are different 684 1.1 mrg vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval); 685 1.1 mrg 686 1.1 mrg // B) Check if high word equal, and low word greater 687 1.1 mrg vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval); 688 1.1 mrg vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval); 689 1.1 mrg vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v)); 690 1.1 mrg 691 1.1 mrg // If either A or B is true, return true (unless NaNs detected) 692 1.1 mrg vec_uint4 r = vec_or(gt_hi, eqgt); 693 1.1 mrg 694 1.1 mrg // splat the high words of the comparison step 695 1.1 mrg r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi); 696 1.1 mrg 697 1.1 mrg // correct for NaNs in input 698 1.1 mrg return ((qword)vec_andc(r,vec_or(a_nan,b_nan))); 699 1.1 mrg } 700 1.1 mrg 701 1.1 mrg static __inline qword si_cgtbi(qword a, signed char b) 702 1.1 mrg { 703 1.1 mrg return ((qword)(vec_cmpgt((vec_char16)(a), 704 1.1 mrg vec_splat((vec_char16)(si_from_char(b)), 3)))); 705 1.1 mrg } 706 1.1 mrg 707 1.1 mrg static __inline qword si_cgthi(qword a, signed short b) 708 1.1 mrg { 709 1.1 mrg return ((qword)(vec_cmpgt((vec_short8)(a), 710 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1)))); 711 
1.1 mrg } 712 1.1 mrg 713 1.1 mrg static __inline qword si_cgti(qword a, signed int b) 714 1.1 mrg { 715 1.1 mrg return ((qword)(vec_cmpgt((vec_int4)(a), 716 1.1 mrg vec_splat((vec_int4)(si_from_int(b)), 0)))); 717 1.1 mrg } 718 1.1 mrg 719 1.1 mrg static __inline qword si_clgtbi(qword a, unsigned char b) 720 1.1 mrg { 721 1.1 mrg return ((qword)(vec_cmpgt((vec_uchar16)(a), 722 1.1 mrg vec_splat((vec_uchar16)(si_from_uchar(b)), 3)))); 723 1.1 mrg } 724 1.1 mrg 725 1.1 mrg static __inline qword si_clgthi(qword a, unsigned short b) 726 1.1 mrg { 727 1.1 mrg return ((qword)(vec_cmpgt((vec_ushort8)(a), 728 1.1 mrg vec_splat((vec_ushort8)(si_from_ushort(b)), 1)))); 729 1.1 mrg } 730 1.1 mrg 731 1.1 mrg static __inline qword si_clgti(qword a, unsigned int b) 732 1.1 mrg { 733 1.1 mrg return ((qword)(vec_cmpgt((vec_uint4)(a), 734 1.1 mrg vec_splat((vec_uint4)(si_from_uint(b)), 0)))); 735 1.1 mrg } 736 1.1 mrg 737 1.1 mrg static __inline qword si_dftsv(qword a, char b) 738 1.1 mrg { 739 1.1 mrg vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 740 1.1 mrg vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF }; 741 1.1 mrg vec_uint4 result = (vec_uint4){0}; 742 1.1 mrg vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0)); 743 1.1 mrg sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi); 744 1.1 mrg vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask); 745 1.1 mrg 746 1.1 mrg union { 747 1.1 mrg vec_uchar16 v; 748 1.1 mrg int i[4]; 749 1.1 mrg } x; 750 1.1 mrg 751 1.1 mrg /* Shift 4 bytes */ 752 1.1 mrg x.i[3] = 4 << 3; 753 1.1 mrg 754 1.1 mrg /* Nan or +inf or -inf */ 755 1.1 mrg if (b & 0x70) 756 1.1 mrg { 757 1.1 mrg vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 }; 758 1.1 mrg vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask); 759 1.1 mrg /* NaN */ 760 1.1 mrg if (b & 0x40) 761 1.1 mrg { 762 1.1 mrg vec_uint4 a_nan = 
(vec_uint4)vec_cmpgt(aabs, nan_mask); 763 1.1 mrg a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf)); 764 1.1 mrg a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi); 765 1.1 mrg result = vec_or(result, a_nan); 766 1.1 mrg } 767 1.1 mrg /* inf */ 768 1.1 mrg if (b & 0x30) 769 1.1 mrg { 770 1.1 mrg a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf); 771 1.1 mrg a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi); 772 1.1 mrg /* +inf */ 773 1.1 mrg if (b & 0x20) 774 1.1 mrg result = vec_or(vec_andc(a_inf, sign), result); 775 1.1 mrg /* -inf */ 776 1.1 mrg if (b & 0x10) 777 1.1 mrg result = vec_or(vec_and(a_inf, sign), result); 778 1.1 mrg } 779 1.1 mrg } 780 1.1 mrg /* 0 or denorm */ 781 1.1 mrg if (b & 0xF) 782 1.1 mrg { 783 1.1 mrg vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0)); 784 1.1 mrg iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v)); 785 1.1 mrg /* denorm */ 786 1.1 mrg if (b & 0x3) 787 1.1 mrg { 788 1.1 mrg vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF}; 789 1.1 mrg vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero); 790 1.1 mrg isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi); 791 1.1 mrg /* +denorm */ 792 1.1 mrg if (b & 0x2) 793 1.1 mrg result = vec_or(vec_andc(isdenorm, sign), result); 794 1.1 mrg /* -denorm */ 795 1.1 mrg if (b & 0x1) 796 1.1 mrg result = vec_or(vec_and(isdenorm, sign), result); 797 1.1 mrg } 798 1.1 mrg /* 0 */ 799 1.1 mrg if (b & 0xC) 800 1.1 mrg { 801 1.1 mrg iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi); 802 1.1 mrg /* +0 */ 803 1.1 mrg if (b & 0x8) 804 1.1 mrg result = vec_or(vec_andc(iszero, sign), result); 805 1.1 mrg /* -0 */ 806 1.1 mrg if (b & 0x4) 807 1.1 mrg result = vec_or(vec_and(iszero, sign), result); 808 1.1 mrg } 809 1.1 mrg } 810 1.1 mrg return 
((qword)result); 811 1.1 mrg } 812 1.1 mrg 813 1.1 mrg 814 1.1 mrg /* Carry generate 815 1.1 mrg */ 816 1.1 mrg #define si_cg(_a, _b) ((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)))) 817 1.1 mrg 818 1.1 mrg #define si_cgx(_a, _b, _c) ((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)), \ 819 1.1 mrg vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), \ 820 1.1 mrg vec_and((vec_uint4)(_c), vec_splat_u32(1)))))) 821 1.1 mrg 822 1.1 mrg 823 1.1 mrg /* Count ones for bytes 824 1.1 mrg */ 825 1.1 mrg static __inline qword si_cntb(qword a) 826 1.1 mrg { 827 1.1 mrg vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; 828 1.1 mrg vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; 829 1.1 mrg vec_uchar16 av; 830 1.1 mrg 831 1.1 mrg av = (vec_uchar16)(a); 832 1.1 mrg 833 1.1 mrg return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av), 834 1.1 mrg vec_perm(nib_cnt, nib_cnt, vec_sr (av, four))))); 835 1.1 mrg } 836 1.1 mrg 837 1.1 mrg /* Count ones for bytes 838 1.1 mrg */ 839 1.1 mrg static __inline qword si_clz(qword a) 840 1.1 mrg { 841 1.1 mrg vec_uchar16 av; 842 1.1 mrg vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3; 843 1.1 mrg vec_uchar16 four = vec_splat_u8(4); 844 1.1 mrg vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}; 845 1.1 mrg vec_uchar16 eight = vec_splat_u8(8); 846 1.1 mrg vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16}; 847 1.1 mrg vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24}; 848 1.1 mrg 849 1.1 mrg av = (vec_uchar16)(a); 850 1.1 mrg 851 1.1 mrg cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four)); 852 1.1 mrg cnt_lo = vec_perm(nib_cnt, nib_cnt, av); 853 1.1 mrg 854 1.1 mrg cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four))); 855 1.1 mrg 856 1.1 mrg tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight)); 857 1.1 mrg tmp2 = 
(vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen)); 858 1.1 mrg tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour)); 859 1.1 mrg 860 1.1 mrg cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight))); 861 1.1 mrg cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen))); 862 1.1 mrg cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour))); 863 1.1 mrg 864 1.1 mrg return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour)))); 865 1.1 mrg } 866 1.1 mrg 867 1.1 mrg /* Convert to float 868 1.1 mrg */ 869 1.1 mrg #define si_cuflt(_a, _b) ((qword)(vec_ctf((vec_uint4)(_a), _b))) 870 1.1 mrg #define si_csflt(_a, _b) ((qword)(vec_ctf((vec_int4)(_a), _b))) 871 1.1 mrg 872 1.1 mrg /* Convert to signed int 873 1.1 mrg */ 874 1.1 mrg #define si_cflts(_a, _b) ((qword)(vec_cts((vec_float4)(_a), _b))) 875 1.1 mrg 876 1.1 mrg /* Convert to unsigned int 877 1.1 mrg */ 878 1.1 mrg #define si_cfltu(_a, _b) ((qword)(vec_ctu((vec_float4)(_a), _b))) 879 1.1 mrg 880 1.1 mrg /* Synchronize 881 1.1 mrg */ 882 1.1 mrg #define si_dsync() /* do nothing */ 883 1.1 mrg #define si_sync() /* do nothing */ 884 1.1 mrg #define si_syncc() /* do nothing */ 885 1.1 mrg 886 1.1 mrg 887 1.1 mrg /* Equivalence 888 1.1 mrg */ 889 1.1 mrg static __inline qword si_eqv(qword a, qword b) 890 1.1 mrg { 891 1.1 mrg vec_uchar16 d; 892 1.1 mrg 893 1.1 mrg d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b)); 894 1.1 mrg return ((qword)(vec_nor(d, d))); 895 1.1 mrg } 896 1.1 mrg 897 1.1 mrg /* Extend 898 1.1 mrg */ 899 1.1 mrg static __inline qword si_xsbh(qword a) 900 1.1 mrg { 901 1.1 mrg vec_char16 av; 902 1.1 mrg 903 1.1 mrg av = (vec_char16)(a); 904 1.1 mrg return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15, 905 1.1 mrg 0, 0, 0, 0, 0, 0, 0, 0}))))); 906 1.1 mrg } 907 1.1 mrg 908 1.1 mrg static __inline qword si_xshw(qword a) 909 1.1 mrg { 910 1.1 mrg vec_short8 av; 911 1.1 mrg 912 1.1 mrg av = (vec_short8)(a); 913 1.1 mrg return 
((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7, 914 1.1 mrg 10,11,14,15, 915 1.1 mrg 0, 0, 0, 0, 916 1.1 mrg 0, 0, 0, 0}))))); 917 1.1 mrg } 918 1.1 mrg 919 1.1 mrg static __inline qword si_xswd(qword a) 920 1.1 mrg { 921 1.1 mrg vec_int4 av; 922 1.1 mrg 923 1.1 mrg av = (vec_int4)(a); 924 1.1 mrg return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})), 925 1.1 mrg ((vec_uchar16){20, 21, 22, 23, 926 1.1 mrg 4, 5, 6, 7, 927 1.1 mrg 28, 29, 30, 31, 928 1.1 mrg 12, 13, 14, 15})))); 929 1.1 mrg } 930 1.1 mrg 931 1.1 mrg static __inline qword si_fesd(qword a) 932 1.1 mrg { 933 1.1 mrg union { 934 1.1 mrg double d[2]; 935 1.1 mrg vec_double2 vd; 936 1.1 mrg } out; 937 1.1 mrg union { 938 1.1 mrg float f[4]; 939 1.1 mrg vec_float4 vf; 940 1.1 mrg } in; 941 1.1 mrg 942 1.1 mrg in.vf = (vec_float4)(a); 943 1.1 mrg out.d[0] = (double)(in.f[0]); 944 1.1 mrg out.d[1] = (double)(in.f[2]); 945 1.1 mrg return ((qword)(out.vd)); 946 1.1 mrg } 947 1.1 mrg 948 1.1 mrg /* Gather 949 1.1 mrg */ 950 1.1 mrg static __inline qword si_gbb(qword a) 951 1.1 mrg { 952 1.1 mrg vec_uchar16 bits; 953 1.1 mrg vec_uint4 bytes; 954 1.1 mrg 955 1.1 mrg bits = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0, 956 1.1 mrg 7, 6, 5, 4, 3, 2, 1, 0})); 957 1.1 mrg bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0})); 958 1.1 mrg 959 1.1 mrg return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0, 960 1.1 mrg 0, 0, 0, 0, 0, 0, 0, 0})))); 961 1.1 mrg } 962 1.1 mrg 963 1.1 mrg 964 1.1 mrg static __inline qword si_gbh(qword a) 965 1.1 mrg { 966 1.1 mrg vec_ushort8 bits; 967 1.1 mrg vec_uint4 bytes; 968 1.1 mrg 969 1.1 mrg bits = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0})); 970 1.1 mrg 971 1.1 mrg bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0}); 972 1.1 mrg 973 1.1 mrg return 
((qword)(vec_sld(bytes, bytes, 12))); 974 1.1 mrg } 975 1.1 mrg 976 1.1 mrg static __inline qword si_gb(qword a) 977 1.1 mrg { 978 1.1 mrg vec_uint4 bits; 979 1.1 mrg vec_uint4 bytes; 980 1.1 mrg 981 1.1 mrg bits = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0})); 982 1.1 mrg bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0})); 983 1.1 mrg return ((qword)(vec_sld(bytes, bytes, 12))); 984 1.1 mrg } 985 1.1 mrg 986 1.1 mrg 987 1.1 mrg /* Compare and halt 988 1.1 mrg */ 989 1.1 mrg static __inline void si_heq(qword a, qword b) 990 1.1 mrg { 991 1.1 mrg union { 992 1.1 mrg vector unsigned int v; 993 1.1 mrg unsigned int i[4]; 994 1.1 mrg } aa, bb; 995 1.1 mrg 996 1.1 mrg aa.v = (vector unsigned int)(a); 997 1.1 mrg bb.v = (vector unsigned int)(b); 998 1.1 mrg 999 1.1 mrg if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; }; 1000 1.1 mrg } 1001 1.1 mrg 1002 1.1 mrg static __inline void si_heqi(qword a, unsigned int b) 1003 1.1 mrg { 1004 1.1 mrg union { 1005 1.1 mrg vector unsigned int v; 1006 1.1 mrg unsigned int i[4]; 1007 1.1 mrg } aa; 1008 1.1 mrg 1009 1.1 mrg aa.v = (vector unsigned int)(a); 1010 1.1 mrg 1011 1.1 mrg if (aa.i[0] == b) { SPU_HALT_ACTION; }; 1012 1.1 mrg } 1013 1.1 mrg 1014 1.1 mrg static __inline void si_hgt(qword a, qword b) 1015 1.1 mrg { 1016 1.1 mrg union { 1017 1.1 mrg vector signed int v; 1018 1.1 mrg signed int i[4]; 1019 1.1 mrg } aa, bb; 1020 1.1 mrg 1021 1.1 mrg aa.v = (vector signed int)(a); 1022 1.1 mrg bb.v = (vector signed int)(b); 1023 1.1 mrg 1024 1.1 mrg if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; }; 1025 1.1 mrg } 1026 1.1 mrg 1027 1.1 mrg static __inline void si_hgti(qword a, signed int b) 1028 1.1 mrg { 1029 1.1 mrg union { 1030 1.1 mrg vector signed int v; 1031 1.1 mrg signed int i[4]; 1032 1.1 mrg } aa; 1033 1.1 mrg 1034 1.1 mrg aa.v = (vector signed int)(a); 1035 1.1 mrg 1036 1.1 mrg if (aa.i[0] > b) { SPU_HALT_ACTION; }; 1037 1.1 mrg } 1038 1.1 mrg 1039 1.1 mrg static __inline void si_hlgt(qword 
a, qword b) 1040 1.1 mrg { 1041 1.1 mrg union { 1042 1.1 mrg vector unsigned int v; 1043 1.1 mrg unsigned int i[4]; 1044 1.1 mrg } aa, bb; 1045 1.1 mrg 1046 1.1 mrg aa.v = (vector unsigned int)(a); 1047 1.1 mrg bb.v = (vector unsigned int)(b); 1048 1.1 mrg 1049 1.1 mrg if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; }; 1050 1.1 mrg } 1051 1.1 mrg 1052 1.1 mrg static __inline void si_hlgti(qword a, unsigned int b) 1053 1.1 mrg { 1054 1.1 mrg union { 1055 1.1 mrg vector unsigned int v; 1056 1.1 mrg unsigned int i[4]; 1057 1.1 mrg } aa; 1058 1.1 mrg 1059 1.1 mrg aa.v = (vector unsigned int)(a); 1060 1.1 mrg 1061 1.1 mrg if (aa.i[0] > b) { SPU_HALT_ACTION; }; 1062 1.1 mrg } 1063 1.1 mrg 1064 1.1 mrg 1065 1.1 mrg /* Multiply and Add 1066 1.1 mrg */ 1067 1.1 mrg static __inline qword si_mpya(qword a, qword b, qword c) 1068 1.1 mrg { 1069 1.1 mrg return ((qword)(vec_msum(vec_and((vec_short8)(a), 1070 1.1 mrg ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})), 1071 1.1 mrg (vec_short8)(b), (vec_int4)(c)))); 1072 1.1 mrg } 1073 1.1 mrg 1074 1.1 mrg static __inline qword si_fma(qword a, qword b, qword c) 1075 1.1 mrg { 1076 1.1 mrg return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c)))); 1077 1.1 mrg } 1078 1.1 mrg 1079 1.1 mrg static __inline qword si_dfma(qword a, qword b, qword c) 1080 1.1 mrg { 1081 1.1 mrg union { 1082 1.1 mrg vec_double2 v; 1083 1.1 mrg double d[2]; 1084 1.1 mrg } aa, bb, cc, dd; 1085 1.1 mrg 1086 1.1 mrg aa.v = (vec_double2)(a); 1087 1.1 mrg bb.v = (vec_double2)(b); 1088 1.1 mrg cc.v = (vec_double2)(c); 1089 1.1 mrg dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0]; 1090 1.1 mrg dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1]; 1091 1.1 mrg return ((qword)(dd.v)); 1092 1.1 mrg } 1093 1.1 mrg 1094 1.1 mrg /* Form Mask 1095 1.1 mrg */ 1096 1.1 mrg #define si_fsmbi(_a) si_fsmb(si_from_int(_a)) 1097 1.1 mrg 1098 1.1 mrg static __inline qword si_fsmb(qword a) 1099 1.1 mrg { 1100 1.1 mrg vec_char16 mask; 1101 1.1 mrg vec_ushort8 in; 1102 1.1 mrg 1103 1.1 mrg in = 
(vec_ushort8)(a); 1104 1.1 mrg mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2, 1105 1.1 mrg 3, 3, 3, 3, 3, 3, 3, 3}))); 1106 1.1 mrg return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 1107 1.1 mrg 0, 1, 2, 3, 4, 5, 6, 7})), 1108 1.1 mrg vec_splat_u8(7)))); 1109 1.1 mrg } 1110 1.1 mrg 1111 1.1 mrg 1112 1.1 mrg static __inline qword si_fsmh(qword a) 1113 1.1 mrg { 1114 1.1 mrg vec_uchar16 in; 1115 1.1 mrg vec_short8 mask; 1116 1.1 mrg 1117 1.1 mrg in = (vec_uchar16)(a); 1118 1.1 mrg mask = (vec_short8)(vec_splat(in, 3)); 1119 1.1 mrg return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})), 1120 1.1 mrg vec_splat_u16(15)))); 1121 1.1 mrg } 1122 1.1 mrg 1123 1.1 mrg static __inline qword si_fsm(qword a) 1124 1.1 mrg { 1125 1.1 mrg vec_uchar16 in; 1126 1.1 mrg vec_int4 mask; 1127 1.1 mrg 1128 1.1 mrg in = (vec_uchar16)(a); 1129 1.1 mrg mask = (vec_int4)(vec_splat(in, 3)); 1130 1.1 mrg return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})), 1131 1.1 mrg ((vec_uint4){31,31,31,31})))); 1132 1.1 mrg } 1133 1.1 mrg 1134 1.1 mrg /* Move from/to registers 1135 1.1 mrg */ 1136 1.1 mrg #define si_fscrrd() ((qword)((vec_uint4){0})) 1137 1.1 mrg #define si_fscrwr(_a) 1138 1.1 mrg 1139 1.1 mrg #define si_mfspr(_reg) ((qword)((vec_uint4){0})) 1140 1.1 mrg #define si_mtspr(_reg, _a) 1141 1.1 mrg 1142 1.1 mrg /* Multiply High High Add 1143 1.1 mrg */ 1144 1.1 mrg static __inline qword si_mpyhha(qword a, qword b, qword c) 1145 1.1 mrg { 1146 1.1 mrg return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c)))); 1147 1.1 mrg } 1148 1.1 mrg 1149 1.1 mrg static __inline qword si_mpyhhau(qword a, qword b, qword c) 1150 1.1 mrg { 1151 1.1 mrg return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c)))); 1152 1.1 mrg } 1153 1.1 mrg 1154 1.1 mrg /* Multiply Subtract 1155 1.1 mrg */ 1156 1.1 mrg static __inline qword si_fms(qword a, qword b, qword c) 1157 
1.1 mrg { 1158 1.1 mrg return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), 1159 1.1 mrg vec_sub(((vec_float4){0.0f}), (vec_float4)(c))))); 1160 1.1 mrg } 1161 1.1 mrg 1162 1.1 mrg static __inline qword si_dfms(qword a, qword b, qword c) 1163 1.1 mrg { 1164 1.1 mrg union { 1165 1.1 mrg vec_double2 v; 1166 1.1 mrg double d[2]; 1167 1.1 mrg } aa, bb, cc, dd; 1168 1.1 mrg 1169 1.1 mrg aa.v = (vec_double2)(a); 1170 1.1 mrg bb.v = (vec_double2)(b); 1171 1.1 mrg cc.v = (vec_double2)(c); 1172 1.1 mrg dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0]; 1173 1.1 mrg dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1]; 1174 1.1 mrg return ((qword)(dd.v)); 1175 1.1 mrg } 1176 1.1 mrg 1177 1.1 mrg /* Multiply 1178 1.1 mrg */ 1179 1.1 mrg static __inline qword si_fm(qword a, qword b) 1180 1.1 mrg { 1181 1.1 mrg return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f})))); 1182 1.1 mrg } 1183 1.1 mrg 1184 1.1 mrg static __inline qword si_dfm(qword a, qword b) 1185 1.1 mrg { 1186 1.1 mrg union { 1187 1.1 mrg vec_double2 v; 1188 1.1 mrg double d[2]; 1189 1.1 mrg } aa, bb, dd; 1190 1.1 mrg 1191 1.1 mrg aa.v = (vec_double2)(a); 1192 1.1 mrg bb.v = (vec_double2)(b); 1193 1.1 mrg dd.d[0] = aa.d[0] * bb.d[0]; 1194 1.1 mrg dd.d[1] = aa.d[1] * bb.d[1]; 1195 1.1 mrg return ((qword)(dd.v)); 1196 1.1 mrg } 1197 1.1 mrg 1198 1.1 mrg /* Multiply High 1199 1.1 mrg */ 1200 1.1 mrg static __inline qword si_mpyh(qword a, qword b) 1201 1.1 mrg { 1202 1.1 mrg vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16}; 1203 1.1 mrg 1204 1.1 mrg return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen))); 1205 1.1 mrg } 1206 1.1 mrg 1207 1.1 mrg 1208 1.1 mrg /* Multiply High High 1209 1.1 mrg */ 1210 1.1 mrg static __inline qword si_mpyhh(qword a, qword b) 1211 1.1 mrg { 1212 1.1 mrg return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b)))); 1213 1.1 mrg } 1214 1.1 mrg 1215 1.1 mrg static __inline qword si_mpyhhu(qword a, qword b) 1216 1.1 mrg { 1217 1.1 mrg 
return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)))); 1218 1.1 mrg } 1219 1.1 mrg 1220 1.1 mrg /* Multiply Odd 1221 1.1 mrg */ 1222 1.1 mrg static __inline qword si_mpy(qword a, qword b) 1223 1.1 mrg { 1224 1.1 mrg return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b)))); 1225 1.1 mrg } 1226 1.1 mrg 1227 1.1 mrg static __inline qword si_mpyu(qword a, qword b) 1228 1.1 mrg { 1229 1.1 mrg return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b)))); 1230 1.1 mrg } 1231 1.1 mrg 1232 1.1 mrg static __inline qword si_mpyi(qword a, short b) 1233 1.1 mrg { 1234 1.1 mrg return ((qword)(vec_mulo((vec_short8)(a), 1235 1.1 mrg vec_splat((vec_short8)(si_from_short(b)), 1)))); 1236 1.1 mrg } 1237 1.1 mrg 1238 1.1 mrg static __inline qword si_mpyui(qword a, unsigned short b) 1239 1.1 mrg { 1240 1.1 mrg return ((qword)(vec_mulo((vec_ushort8)(a), 1241 1.1 mrg vec_splat((vec_ushort8)(si_from_ushort(b)), 1)))); 1242 1.1 mrg } 1243 1.1 mrg 1244 1.1 mrg /* Multiply and Shift Right 1245 1.1 mrg */ 1246 1.1 mrg static __inline qword si_mpys(qword a, qword b) 1247 1.1 mrg { 1248 1.1 mrg return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16})))); 1249 1.1 mrg } 1250 1.1 mrg 1251 1.1 mrg /* Nand 1252 1.1 mrg */ 1253 1.1 mrg static __inline qword si_nand(qword a, qword b) 1254 1.1 mrg { 1255 1.1 mrg vec_uchar16 d; 1256 1.1 mrg 1257 1.1 mrg d = vec_and((vec_uchar16)(a), (vec_uchar16)(b)); 1258 1.1 mrg return ((qword)(vec_nor(d, d))); 1259 1.1 mrg } 1260 1.1 mrg 1261 1.1 mrg /* Negative Multiply Add 1262 1.1 mrg */ 1263 1.1 mrg static __inline qword si_dfnma(qword a, qword b, qword c) 1264 1.1 mrg { 1265 1.1 mrg union { 1266 1.1 mrg vec_double2 v; 1267 1.1 mrg double d[2]; 1268 1.1 mrg } aa, bb, cc, dd; 1269 1.1 mrg 1270 1.1 mrg aa.v = (vec_double2)(a); 1271 1.1 mrg bb.v = (vec_double2)(b); 1272 1.1 mrg cc.v = (vec_double2)(c); 1273 1.1 mrg dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0]; 1274 1.1 mrg dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1]; 1275 
1.1 mrg return ((qword)(dd.v)); 1276 1.1 mrg } 1277 1.1 mrg 1278 1.1 mrg /* Negative Multiply and Subtract 1279 1.1 mrg */ 1280 1.1 mrg static __inline qword si_fnms(qword a, qword b, qword c) 1281 1.1 mrg { 1282 1.1 mrg return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c)))); 1283 1.1 mrg } 1284 1.1 mrg 1285 1.1 mrg static __inline qword si_dfnms(qword a, qword b, qword c) 1286 1.1 mrg { 1287 1.1 mrg union { 1288 1.1 mrg vec_double2 v; 1289 1.1 mrg double d[2]; 1290 1.1 mrg } aa, bb, cc, dd; 1291 1.1 mrg 1292 1.1 mrg aa.v = (vec_double2)(a); 1293 1.1 mrg bb.v = (vec_double2)(b); 1294 1.1 mrg cc.v = (vec_double2)(c); 1295 1.1 mrg dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0]; 1296 1.1 mrg dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1]; 1297 1.1 mrg return ((qword)(dd.v)); 1298 1.1 mrg } 1299 1.1 mrg 1300 1.1 mrg /* Nor 1301 1.1 mrg */ 1302 1.1 mrg static __inline qword si_nor(qword a, qword b) 1303 1.1 mrg { 1304 1.1 mrg return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b)))); 1305 1.1 mrg } 1306 1.1 mrg 1307 1.1 mrg /* Or 1308 1.1 mrg */ 1309 1.1 mrg static __inline qword si_or(qword a, qword b) 1310 1.1 mrg { 1311 1.1 mrg return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b)))); 1312 1.1 mrg } 1313 1.1 mrg 1314 1.1 mrg static __inline qword si_orbi(qword a, unsigned char b) 1315 1.1 mrg { 1316 1.1 mrg return ((qword)(vec_or((vec_uchar16)(a), 1317 1.1 mrg vec_splat((vec_uchar16)(si_from_uchar(b)), 3)))); 1318 1.1 mrg } 1319 1.1 mrg 1320 1.1 mrg static __inline qword si_orhi(qword a, unsigned short b) 1321 1.1 mrg { 1322 1.1 mrg return ((qword)(vec_or((vec_ushort8)(a), 1323 1.1 mrg vec_splat((vec_ushort8)(si_from_ushort(b)), 1)))); 1324 1.1 mrg } 1325 1.1 mrg 1326 1.1 mrg static __inline qword si_ori(qword a, unsigned int b) 1327 1.1 mrg { 1328 1.1 mrg return ((qword)(vec_or((vec_uint4)(a), 1329 1.1 mrg vec_splat((vec_uint4)(si_from_uint(b)), 0)))); 1330 1.1 mrg } 1331 1.1 mrg 1332 1.1 mrg /* Or Complement 1333 1.1 mrg */ 1334 1.1 mrg static 
__inline qword si_orc(qword a, qword b) 1335 1.1 mrg { 1336 1.1 mrg return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b))))); 1337 1.1 mrg } 1338 1.1 mrg 1339 1.1 mrg 1340 1.1 mrg /* Or Across 1341 1.1 mrg */ 1342 1.1 mrg static __inline qword si_orx(qword a) 1343 1.1 mrg { 1344 1.1 mrg vec_uchar16 tmp; 1345 1.1 mrg tmp = (vec_uchar16)(a); 1346 1.1 mrg tmp = vec_or(tmp, vec_sld(tmp, tmp, 8)); 1347 1.1 mrg tmp = vec_or(tmp, vec_sld(tmp, tmp, 4)); 1348 1.1 mrg return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00, 1349 1.1 mrg 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00})))); 1350 1.1 mrg } 1351 1.1 mrg 1352 1.1 mrg 1353 1.1 mrg /* Estimates 1354 1.1 mrg */ 1355 1.1 mrg static __inline qword si_frest(qword a) 1356 1.1 mrg { 1357 1.1 mrg return ((qword)(vec_re((vec_float4)(a)))); 1358 1.1 mrg } 1359 1.1 mrg 1360 1.1 mrg static __inline qword si_frsqest(qword a) 1361 1.1 mrg { 1362 1.1 mrg return ((qword)(vec_rsqrte((vec_float4)(a)))); 1363 1.1 mrg } 1364 1.1 mrg 1365 1.1 mrg #define si_fi(_a, _d) (_d) 1366 1.1 mrg 1367 1.1 mrg /* Channel Read and Write 1368 1.1 mrg */ 1369 1.1 mrg #define si_rdch(_channel) ((qword)(vec_splat_u8(0))) /* not mappable */ 1370 1.1 mrg #define si_rchcnt(_channel) ((qword)(vec_splat_u8(0))) /* not mappable */ 1371 1.1 mrg #define si_wrch(_channel, _a) /* not mappable */ 1372 1.1 mrg 1373 1.1 mrg /* Rotate Left 1374 1.1 mrg */ 1375 1.1 mrg static __inline qword si_roth(qword a, qword b) 1376 1.1 mrg { 1377 1.1 mrg return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b)))); 1378 1.1 mrg } 1379 1.1 mrg 1380 1.1 mrg static __inline qword si_rot(qword a, qword b) 1381 1.1 mrg { 1382 1.1 mrg return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b)))); 1383 1.1 mrg } 1384 1.1 mrg 1385 1.1 mrg static __inline qword si_rothi(qword a, int b) 1386 1.1 mrg { 1387 1.1 mrg return ((qword)(vec_rl((vec_ushort8)(a), 1388 1.1 mrg vec_splat((vec_ushort8)(si_from_int(b)), 1)))); 1389 1.1 mrg } 1390 1.1 
mrg 1391 1.1 mrg static __inline qword si_roti(qword a, int b) 1392 1.1 mrg { 1393 1.1 mrg return ((qword)(vec_rl((vec_uint4)(a), 1394 1.1 mrg vec_splat((vec_uint4)(si_from_int(b)), 0)))); 1395 1.1 mrg } 1396 1.1 mrg 1397 1.1 mrg /* Rotate Left with Mask 1398 1.1 mrg */ 1399 1.1 mrg static __inline qword si_rothm(qword a, qword b) 1400 1.1 mrg { 1401 1.1 mrg vec_ushort8 neg_b; 1402 1.1 mrg vec_ushort8 mask; 1403 1.1 mrg 1404 1.1 mrg neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b)); 1405 1.1 mrg mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15)); 1406 1.1 mrg return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask))); 1407 1.1 mrg } 1408 1.1 mrg 1409 1.1 mrg static __inline qword si_rotm(qword a, qword b) 1410 1.1 mrg { 1411 1.1 mrg vec_uint4 neg_b; 1412 1.1 mrg vec_uint4 mask; 1413 1.1 mrg 1414 1.1 mrg neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b)); 1415 1.1 mrg mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31})); 1416 1.1 mrg return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask))); 1417 1.1 mrg } 1418 1.1 mrg 1419 1.1 mrg static __inline qword si_rothmi(qword a, int b) 1420 1.1 mrg { 1421 1.1 mrg vec_ushort8 neg_b; 1422 1.1 mrg vec_ushort8 mask; 1423 1.1 mrg 1424 1.1 mrg neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1); 1425 1.1 mrg mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15)); 1426 1.1 mrg return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask))); 1427 1.1 mrg } 1428 1.1 mrg 1429 1.1 mrg static __inline qword si_rotmi(qword a, int b) 1430 1.1 mrg { 1431 1.1 mrg vec_uint4 neg_b; 1432 1.1 mrg vec_uint4 mask; 1433 1.1 mrg 1434 1.1 mrg neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0); 1435 1.1 mrg mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31})); 1436 1.1 mrg return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask))); 1437 1.1 mrg } 1438 1.1 mrg 1439 1.1 mrg 1440 1.1 mrg /* Rotate Left Algebraic 
with Mask 1441 1.1 mrg */ 1442 1.1 mrg static __inline qword si_rotmah(qword a, qword b) 1443 1.1 mrg { 1444 1.1 mrg vec_ushort8 neg_b; 1445 1.1 mrg vec_ushort8 mask; 1446 1.1 mrg 1447 1.1 mrg neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b)); 1448 1.1 mrg mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15)); 1449 1.1 mrg return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask)))); 1450 1.1 mrg } 1451 1.1 mrg 1452 1.1 mrg static __inline qword si_rotma(qword a, qword b) 1453 1.1 mrg { 1454 1.1 mrg vec_uint4 neg_b; 1455 1.1 mrg vec_uint4 mask; 1456 1.1 mrg 1457 1.1 mrg neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b)); 1458 1.1 mrg mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31})); 1459 1.1 mrg return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask)))); 1460 1.1 mrg } 1461 1.1 mrg 1462 1.1 mrg 1463 1.1 mrg static __inline qword si_rotmahi(qword a, int b) 1464 1.1 mrg { 1465 1.1 mrg vec_ushort8 neg_b; 1466 1.1 mrg vec_ushort8 mask; 1467 1.1 mrg 1468 1.1 mrg neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1); 1469 1.1 mrg mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15)); 1470 1.1 mrg return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask)))); 1471 1.1 mrg } 1472 1.1 mrg 1473 1.1 mrg static __inline qword si_rotmai(qword a, int b) 1474 1.1 mrg { 1475 1.1 mrg vec_uint4 neg_b; 1476 1.1 mrg vec_uint4 mask; 1477 1.1 mrg 1478 1.1 mrg neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0); 1479 1.1 mrg mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31})); 1480 1.1 mrg return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask)))); 1481 1.1 mrg } 1482 1.1 mrg 1483 1.1 mrg 1484 1.1 mrg /* Rotate Left Quadword by Bytes with Mask 1485 1.1 mrg */ 1486 1.1 mrg static __inline qword si_rotqmbyi(qword a, int count) 1487 1.1 mrg { 1488 1.1 mrg union { 1489 1.1 mrg vec_uchar16 v; 1490 1.1 mrg int i[4]; 
1491 1.1 mrg } x; 1492 1.1 mrg vec_uchar16 mask; 1493 1.1 mrg 1494 1.1 mrg count = 0 - count; 1495 1.1 mrg x.i[3] = count << 3; 1496 1.1 mrg mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1); 1497 1.1 mrg 1498 1.1 mrg return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask))); 1499 1.1 mrg } 1500 1.1 mrg 1501 1.1 mrg 1502 1.1 mrg static __inline qword si_rotqmby(qword a, qword count) 1503 1.1 mrg { 1504 1.1 mrg union { 1505 1.1 mrg vec_uchar16 v; 1506 1.1 mrg int i[4]; 1507 1.1 mrg } x; 1508 1.1 mrg int cnt; 1509 1.1 mrg vec_uchar16 mask; 1510 1.1 mrg 1511 1.1 mrg x.v = (vec_uchar16)(count); 1512 1.1 mrg x.i[0] = cnt = (0 - x.i[0]) << 3; 1513 1.1 mrg 1514 1.1 mrg x.v = vec_splat(x.v, 3); 1515 1.1 mrg mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1); 1516 1.1 mrg 1517 1.1 mrg return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask))); 1518 1.1 mrg } 1519 1.1 mrg 1520 1.1 mrg 1521 1.1 mrg /* Rotate Left Quadword by Bytes 1522 1.1 mrg */ 1523 1.1 mrg static __inline qword si_rotqbyi(qword a, int count) 1524 1.1 mrg { 1525 1.1 mrg union { 1526 1.1 mrg vec_uchar16 v; 1527 1.1 mrg int i[4]; 1528 1.1 mrg } left, right; 1529 1.1 mrg 1530 1.1 mrg count <<= 3; 1531 1.1 mrg left.i[3] = count; 1532 1.1 mrg right.i[3] = 0 - count; 1533 1.1 mrg return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v)))); 1534 1.1 mrg } 1535 1.1 mrg 1536 1.1 mrg static __inline qword si_rotqby(qword a, qword count) 1537 1.1 mrg { 1538 1.1 mrg vec_uchar16 left, right; 1539 1.1 mrg 1540 1.1 mrg left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3)); 1541 1.1 mrg right = vec_sub(vec_splat_u8(0), left); 1542 1.1 mrg return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right)))); 1543 1.1 mrg } 1544 1.1 mrg 1545 1.1 mrg /* Rotate Left Quadword by Bytes Bit Count 1546 1.1 mrg */ 1547 1.1 mrg static __inline qword si_rotqbybi(qword a, qword count) 1548 1.1 mrg { 1549 1.1 mrg vec_uchar16 left, right; 
1550 1.1 mrg 1551 1.1 mrg left = vec_splat((vec_uchar16)(count), 3); 1552 1.1 mrg right = vec_sub(vec_splat_u8(7), left); 1553 1.1 mrg return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right)))); 1554 1.1 mrg } 1555 1.1 mrg 1556 1.1 mrg 1557 1.1 mrg /* Rotate Left Quadword by Bytes Bit Count 1558 1.1 mrg */ 1559 1.1 mrg static __inline qword si_rotqbii(qword a, int count) 1560 1.1 mrg { 1561 1.1 mrg vec_uchar16 x, y; 1562 1.1 mrg vec_uchar16 result; 1563 1.1 mrg 1564 1.1 mrg x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3); 1565 1.1 mrg y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))), 1566 1.1 mrg (vec_uint4)vec_sub(vec_splat_u8(8), x))); 1567 1.1 mrg result = vec_or(vec_sll((qword)(a), x), y); 1568 1.1 mrg return ((qword)(result)); 1569 1.1 mrg } 1570 1.1 mrg 1571 1.1 mrg static __inline qword si_rotqbi(qword a, qword count) 1572 1.1 mrg { 1573 1.1 mrg vec_uchar16 x, y; 1574 1.1 mrg vec_uchar16 result; 1575 1.1 mrg 1576 1.1 mrg x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7)); 1577 1.1 mrg y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))), 1578 1.1 mrg (vec_uint4)vec_sub(vec_splat_u8(8), x))); 1579 1.1 mrg 1580 1.1 mrg result = vec_or(vec_sll((qword)(a), x), y); 1581 1.1 mrg return ((qword)(result)); 1582 1.1 mrg } 1583 1.1 mrg 1584 1.1 mrg 1585 1.1 mrg /* Rotate Left Quadword and Mask by Bits 1586 1.1 mrg */ 1587 1.1 mrg static __inline qword si_rotqmbii(qword a, int count) 1588 1.1 mrg { 1589 1.1 mrg return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3)))); 1590 1.1 mrg } 1591 1.1 mrg 1592 1.1 mrg static __inline qword si_rotqmbi(qword a, qword count) 1593 1.1 mrg { 1594 1.1 mrg return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3))))); 1595 1.1 mrg } 1596 1.1 mrg 1597 1.1 mrg 1598 1.1 mrg /* Rotate Left Quadword 
and Mask by Bytes with Bit Count 1599 1.1 mrg */ 1600 1.1 mrg static __inline qword si_rotqmbybi(qword a, qword count) 1601 1.1 mrg { 1602 1.1 mrg union { 1603 1.1 mrg vec_uchar16 v; 1604 1.1 mrg int i[4]; 1605 1.1 mrg } x; 1606 1.1 mrg int cnt; 1607 1.1 mrg vec_uchar16 mask; 1608 1.1 mrg 1609 1.1 mrg x.v = (vec_uchar16)(count); 1610 1.1 mrg x.i[0] = cnt = 0 - (x.i[0] & ~7); 1611 1.1 mrg x.v = vec_splat(x.v, 3); 1612 1.1 mrg mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1); 1613 1.1 mrg 1614 1.1 mrg return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask))); 1615 1.1 mrg } 1616 1.1 mrg 1617 1.1 mrg 1618 1.1 mrg 1619 1.1 mrg 1620 1.1 mrg /* Round Double to Float 1621 1.1 mrg */ 1622 1.1 mrg static __inline qword si_frds(qword a) 1623 1.1 mrg { 1624 1.1 mrg union { 1625 1.1 mrg vec_float4 v; 1626 1.1 mrg float f[4]; 1627 1.1 mrg } d; 1628 1.1 mrg union { 1629 1.1 mrg vec_double2 v; 1630 1.1 mrg double d[2]; 1631 1.1 mrg } in; 1632 1.1 mrg 1633 1.1 mrg in.v = (vec_double2)(a); 1634 1.1 mrg d.v = (vec_float4){0.0f}; 1635 1.1 mrg d.f[0] = (float)in.d[0]; 1636 1.1 mrg d.f[2] = (float)in.d[1]; 1637 1.1 mrg 1638 1.1 mrg return ((qword)(d.v)); 1639 1.1 mrg } 1640 1.1 mrg 1641 1.1 mrg /* Select Bits 1642 1.1 mrg */ 1643 1.1 mrg static __inline qword si_selb(qword a, qword b, qword c) 1644 1.1 mrg { 1645 1.1 mrg return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c)))); 1646 1.1 mrg } 1647 1.1 mrg 1648 1.1 mrg 1649 1.1 mrg /* Shuffle Bytes 1650 1.1 mrg */ 1651 1.1 mrg static __inline qword si_shufb(qword a, qword b, qword pattern) 1652 1.1 mrg { 1653 1.1 mrg vec_uchar16 pat; 1654 1.1 mrg 1655 1.1 mrg pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), 1656 1.1 mrg vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)), 1657 1.1 mrg vec_sra((vec_uchar16)(pattern), vec_splat_u8(7))); 1658 1.1 mrg return ((qword)(vec_perm(vec_perm(a, b, pattern), 1659 1.1 mrg ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 1660 1.1 mrg 0xFF, 0xFF, 
0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}), 1661 1.1 mrg pat))); 1662 1.1 mrg } 1663 1.1 mrg 1664 1.1 mrg 1665 1.1 mrg /* Shift Left 1666 1.1 mrg */ 1667 1.1 mrg static __inline qword si_shlh(qword a, qword b) 1668 1.1 mrg { 1669 1.1 mrg vec_ushort8 mask; 1670 1.1 mrg 1671 1.1 mrg mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15)); 1672 1.1 mrg return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask))); 1673 1.1 mrg } 1674 1.1 mrg 1675 1.1 mrg static __inline qword si_shl(qword a, qword b) 1676 1.1 mrg { 1677 1.1 mrg vec_uint4 mask; 1678 1.1 mrg 1679 1.1 mrg mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31})); 1680 1.1 mrg return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask))); 1681 1.1 mrg } 1682 1.1 mrg 1683 1.1 mrg 1684 1.1 mrg static __inline qword si_shlhi(qword a, unsigned int b) 1685 1.1 mrg { 1686 1.1 mrg vec_ushort8 mask; 1687 1.1 mrg vec_ushort8 bv; 1688 1.1 mrg 1689 1.1 mrg bv = vec_splat((vec_ushort8)(si_from_int(b)), 1); 1690 1.1 mrg mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15)); 1691 1.1 mrg return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask))); 1692 1.1 mrg } 1693 1.1 mrg 1694 1.1 mrg static __inline qword si_shli(qword a, unsigned int b) 1695 1.1 mrg { 1696 1.1 mrg vec_uint4 bv; 1697 1.1 mrg vec_uint4 mask; 1698 1.1 mrg 1699 1.1 mrg bv = vec_splat((vec_uint4)(si_from_uint(b)), 0); 1700 1.1 mrg mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31})); 1701 1.1 mrg return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask))); 1702 1.1 mrg } 1703 1.1 mrg 1704 1.1 mrg 1705 1.1 mrg /* Shift Left Quadword 1706 1.1 mrg */ 1707 1.1 mrg static __inline qword si_shlqbii(qword a, unsigned int count) 1708 1.1 mrg { 1709 1.1 mrg vec_uchar16 x; 1710 1.1 mrg 1711 1.1 mrg x = vec_splat((vec_uchar16)(si_from_uint(count)), 3); 1712 1.1 mrg return 
((qword)(vec_sll((vec_uchar16)(a), x))); 1713 1.1 mrg } 1714 1.1 mrg 1715 1.1 mrg static __inline qword si_shlqbi(qword a, qword count) 1716 1.1 mrg { 1717 1.1 mrg vec_uchar16 x; 1718 1.1 mrg 1719 1.1 mrg x = vec_splat((vec_uchar16)(count), 3); 1720 1.1 mrg return ((qword)(vec_sll((vec_uchar16)(a), x))); 1721 1.1 mrg } 1722 1.1 mrg 1723 1.1 mrg 1724 1.1 mrg /* Shift Left Quadword by Bytes 1725 1.1 mrg */ 1726 1.1 mrg static __inline qword si_shlqbyi(qword a, unsigned int count) 1727 1.1 mrg { 1728 1.1 mrg union { 1729 1.1 mrg vec_uchar16 v; 1730 1.1 mrg int i[4]; 1731 1.1 mrg } x; 1732 1.1 mrg vec_uchar16 mask; 1733 1.1 mrg 1734 1.1 mrg x.i[3] = count << 3; 1735 1.1 mrg mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1); 1736 1.1 mrg return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask))); 1737 1.1 mrg } 1738 1.1 mrg 1739 1.1 mrg static __inline qword si_shlqby(qword a, qword count) 1740 1.1 mrg { 1741 1.1 mrg union { 1742 1.1 mrg vec_uchar16 v; 1743 1.1 mrg unsigned int i[4]; 1744 1.1 mrg } x; 1745 1.1 mrg unsigned int cnt; 1746 1.1 mrg vec_uchar16 mask; 1747 1.1 mrg 1748 1.1 mrg x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3)); 1749 1.1 mrg cnt = x.i[0]; 1750 1.1 mrg mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1); 1751 1.1 mrg return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask))); 1752 1.1 mrg } 1753 1.1 mrg 1754 1.1 mrg /* Shift Left Quadword by Bytes with Bit Count 1755 1.1 mrg */ 1756 1.1 mrg static __inline qword si_shlqbybi(qword a, qword count) 1757 1.1 mrg { 1758 1.1 mrg union { 1759 1.1 mrg vec_uchar16 v; 1760 1.1 mrg int i[4]; 1761 1.1 mrg } x; 1762 1.1 mrg unsigned int cnt; 1763 1.1 mrg vec_uchar16 mask; 1764 1.1 mrg 1765 1.1 mrg x.v = vec_splat((vec_uchar16)(count), 3); 1766 1.1 mrg cnt = x.i[0]; 1767 1.1 mrg mask = (cnt & 0x80) ? 
vec_splat_u8(0) : vec_splat_u8(-1); 1768 1.1 mrg return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask))); 1769 1.1 mrg } 1770 1.1 mrg 1771 1.1 mrg 1772 1.1 mrg /* Stop and Signal 1773 1.1 mrg */ 1774 1.1 mrg #define si_stop(_type) SPU_STOP_ACTION 1775 1.1 mrg #define si_stopd(a, b, c) SPU_STOP_ACTION 1776 1.1 mrg 1777 1.1 mrg 1778 1.1 mrg /* Subtract 1779 1.1 mrg */ 1780 1.1 mrg static __inline qword si_sfh(qword a, qword b) 1781 1.1 mrg { 1782 1.1 mrg return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a)))); 1783 1.1 mrg } 1784 1.1 mrg 1785 1.1 mrg static __inline qword si_sf(qword a, qword b) 1786 1.1 mrg { 1787 1.1 mrg return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a)))); 1788 1.1 mrg } 1789 1.1 mrg 1790 1.1 mrg static __inline qword si_fs(qword a, qword b) 1791 1.1 mrg { 1792 1.1 mrg return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b)))); 1793 1.1 mrg } 1794 1.1 mrg 1795 1.1 mrg static __inline qword si_dfs(qword a, qword b) 1796 1.1 mrg { 1797 1.1 mrg union { 1798 1.1 mrg vec_double2 v; 1799 1.1 mrg double d[2]; 1800 1.1 mrg } aa, bb, dd; 1801 1.1 mrg 1802 1.1 mrg aa.v = (vec_double2)(a); 1803 1.1 mrg bb.v = (vec_double2)(b); 1804 1.1 mrg dd.d[0] = aa.d[0] - bb.d[0]; 1805 1.1 mrg dd.d[1] = aa.d[1] - bb.d[1]; 1806 1.1 mrg return ((qword)(dd.v)); 1807 1.1 mrg } 1808 1.1 mrg 1809 1.1 mrg static __inline qword si_sfhi(qword a, short b) 1810 1.1 mrg { 1811 1.1 mrg return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1), 1812 1.1 mrg (vec_short8)(a)))); 1813 1.1 mrg } 1814 1.1 mrg 1815 1.1 mrg static __inline qword si_sfi(qword a, int b) 1816 1.1 mrg { 1817 1.1 mrg return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0), 1818 1.1 mrg (vec_int4)(a)))); 1819 1.1 mrg } 1820 1.1 mrg 1821 1.1 mrg /* Subtract word extended 1822 1.1 mrg */ 1823 1.1 mrg #define si_sfx(_a, _b, _c) ((qword)(vec_add(vec_add((vec_uint4)(_b), \ 1824 1.1 mrg vec_nor((vec_uint4)(_a), (vec_uint4)(_a))), \ 1825 1.1 mrg vec_and((vec_uint4)(_c), 
vec_splat_u32(1))))) 1826 1.1 mrg 1827 1.1 mrg 1828 1.1 mrg /* Sum Bytes into Shorts 1829 1.1 mrg */ 1830 1.1 mrg static __inline qword si_sumb(qword a, qword b) 1831 1.1 mrg { 1832 1.1 mrg vec_uint4 zero = (vec_uint4){0}; 1833 1.1 mrg vec_ushort8 sum_a, sum_b; 1834 1.1 mrg 1835 1.1 mrg sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero); 1836 1.1 mrg sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero); 1837 1.1 mrg 1838 1.1 mrg return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19, 2, 3, 22, 23, 6, 7, 1839 1.1 mrg 26, 27, 10, 11, 30, 31, 14, 15})))); 1840 1.1 mrg } 1841 1.1 mrg 1842 1.1 mrg /* Exclusive OR 1843 1.1 mrg */ 1844 1.1 mrg static __inline qword si_xor(qword a, qword b) 1845 1.1 mrg { 1846 1.1 mrg return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b)))); 1847 1.1 mrg } 1848 1.1 mrg 1849 1.1 mrg static __inline qword si_xorbi(qword a, unsigned char b) 1850 1.1 mrg { 1851 1.1 mrg return ((qword)(vec_xor((vec_uchar16)(a), 1852 1.1 mrg vec_splat((vec_uchar16)(si_from_uchar(b)), 3)))); 1853 1.1 mrg } 1854 1.1 mrg 1855 1.1 mrg static __inline qword si_xorhi(qword a, unsigned short b) 1856 1.1 mrg { 1857 1.1 mrg return ((qword)(vec_xor((vec_ushort8)(a), 1858 1.1 mrg vec_splat((vec_ushort8)(si_from_ushort(b)), 1)))); 1859 1.1 mrg } 1860 1.1 mrg 1861 1.1 mrg static __inline qword si_xori(qword a, unsigned int b) 1862 1.1 mrg { 1863 1.1 mrg return ((qword)(vec_xor((vec_uint4)(a), 1864 1.1 mrg vec_splat((vec_uint4)(si_from_uint(b)), 0)))); 1865 1.1 mrg } 1866 1.1 mrg 1867 1.1 mrg 1868 1.1 mrg /* Generate Controls for Sub-Quadword Insertion 1869 1.1 mrg */ 1870 1.1 mrg static __inline qword si_cbd(qword a, int imm) 1871 1.1 mrg { 1872 1.1 mrg union { 1873 1.1 mrg vec_uint4 v; 1874 1.1 mrg unsigned char c[16]; 1875 1.1 mrg } shmask; 1876 1.1 mrg 1877 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F}); 1878 1.1 mrg shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03; 1879 1.1 mrg return ((qword)(shmask.v)); 
1880 1.1 mrg } 1881 1.1 mrg 1882 1.1 mrg static __inline qword si_cdd(qword a, int imm) 1883 1.1 mrg { 1884 1.1 mrg union { 1885 1.1 mrg vec_uint4 v; 1886 1.1 mrg unsigned long long ll[2]; 1887 1.1 mrg } shmask; 1888 1.1 mrg 1889 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F}); 1890 1.1 mrg shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL; 1891 1.1 mrg return ((qword)(shmask.v)); 1892 1.1 mrg } 1893 1.1 mrg 1894 1.1 mrg static __inline qword si_chd(qword a, int imm) 1895 1.1 mrg { 1896 1.1 mrg union { 1897 1.1 mrg vec_uint4 v; 1898 1.1 mrg unsigned short s[8]; 1899 1.1 mrg } shmask; 1900 1.1 mrg 1901 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F}); 1902 1.1 mrg shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203; 1903 1.1 mrg return ((qword)(shmask.v)); 1904 1.1 mrg } 1905 1.1 mrg 1906 1.1 mrg static __inline qword si_cwd(qword a, int imm) 1907 1.1 mrg { 1908 1.1 mrg union { 1909 1.1 mrg vec_uint4 v; 1910 1.1 mrg unsigned int i[4]; 1911 1.1 mrg } shmask; 1912 1.1 mrg 1913 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F}); 1914 1.1 mrg shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203; 1915 1.1 mrg return ((qword)(shmask.v)); 1916 1.1 mrg } 1917 1.1 mrg 1918 1.1 mrg static __inline qword si_cbx(qword a, qword b) 1919 1.1 mrg { 1920 1.1 mrg union { 1921 1.1 mrg vec_uint4 v; 1922 1.1 mrg unsigned char c[16]; 1923 1.1 mrg } shmask; 1924 1.1 mrg 1925 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F}); 1926 1.1 mrg shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03; 1927 1.1 mrg return ((qword)(shmask.v)); 1928 1.1 mrg } 1929 1.1 mrg 1930 1.1 mrg 1931 1.1 mrg static __inline qword si_cdx(qword a, qword b) 1932 1.1 mrg { 1933 1.1 mrg union { 1934 1.1 mrg vec_uint4 v; 1935 1.1 mrg unsigned long long ll[2]; 1936 1.1 mrg } shmask; 1937 1.1 
mrg 1938 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F}); 1939 1.1 mrg shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL; 1940 1.1 mrg return ((qword)(shmask.v)); 1941 1.1 mrg } 1942 1.1 mrg 1943 1.1 mrg static __inline qword si_chx(qword a, qword b) 1944 1.1 mrg { 1945 1.1 mrg union { 1946 1.1 mrg vec_uint4 v; 1947 1.1 mrg unsigned short s[8]; 1948 1.1 mrg } shmask; 1949 1.1 mrg 1950 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F}); 1951 1.1 mrg shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203; 1952 1.1 mrg return ((qword)(shmask.v)); 1953 1.1 mrg } 1954 1.1 mrg 1955 1.1 mrg static __inline qword si_cwx(qword a, qword b) 1956 1.1 mrg { 1957 1.1 mrg union { 1958 1.1 mrg vec_uint4 v; 1959 1.1 mrg unsigned int i[4]; 1960 1.1 mrg } shmask; 1961 1.1 mrg 1962 1.1 mrg shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F}); 1963 1.1 mrg shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203; 1964 1.1 mrg return ((qword)(shmask.v)); 1965 1.1 mrg } 1966 1.1 mrg 1967 1.1 mrg 1968 1.1 mrg /* Constant Formation 1969 1.1 mrg */ 1970 1.1 mrg static __inline qword si_il(signed short imm) 1971 1.1 mrg { 1972 1.1 mrg return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0))); 1973 1.1 mrg } 1974 1.1 mrg 1975 1.1 mrg 1976 1.1 mrg static __inline qword si_ila(unsigned int imm) 1977 1.1 mrg { 1978 1.1 mrg return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0))); 1979 1.1 mrg } 1980 1.1 mrg 1981 1.1 mrg static __inline qword si_ilh(signed short imm) 1982 1.1 mrg { 1983 1.1 mrg return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1))); 1984 1.1 mrg } 1985 1.1 mrg 1986 1.1 mrg static __inline qword si_ilhu(signed short imm) 1987 1.1 mrg { 1988 1.1 mrg return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0))); 
1989 1.1 mrg } 1990 1.1 mrg 1991 1.1 mrg static __inline qword si_iohl(qword a, unsigned short imm) 1992 1.1 mrg { 1993 1.1 mrg return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0)))); 1994 1.1 mrg } 1995 1.1 mrg 1996 1.1 mrg /* No Operation 1997 1.1 mrg */ 1998 1.1 mrg #define si_lnop() /* do nothing */ 1999 1.1 mrg #define si_nop() /* do nothing */ 2000 1.1 mrg 2001 1.1 mrg 2002 1.1 mrg /* Memory Load and Store 2003 1.1 mrg */ 2004 1.1 mrg static __inline qword si_lqa(unsigned int imm) 2005 1.1 mrg { 2006 1.1 mrg return ((qword)(vec_ld(0, (vector unsigned char *)(imm)))); 2007 1.1 mrg } 2008 1.1 mrg 2009 1.1 mrg static __inline qword si_lqd(qword a, unsigned int imm) 2010 1.1 mrg { 2011 1.1 mrg return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm)))); 2012 1.1 mrg } 2013 1.1 mrg 2014 1.1 mrg static __inline qword si_lqr(unsigned int imm) 2015 1.1 mrg { 2016 1.1 mrg return ((qword)(vec_ld(0, (vector unsigned char *)(imm)))); 2017 1.1 mrg } 2018 1.1 mrg 2019 1.1 mrg static __inline qword si_lqx(qword a, qword b) 2020 1.1 mrg { 2021 1.1 mrg return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0)))); 2022 1.1 mrg } 2023 1.1 mrg 2024 1.1 mrg static __inline void si_stqa(qword a, unsigned int imm) 2025 1.1 mrg { 2026 1.1 mrg vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm)); 2027 1.1 mrg } 2028 1.1 mrg 2029 1.1 mrg static __inline void si_stqd(qword a, qword b, unsigned int imm) 2030 1.1 mrg { 2031 1.1 mrg vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm)); 2032 1.1 mrg } 2033 1.1 mrg 2034 1.1 mrg static __inline void si_stqr(qword a, unsigned int imm) 2035 1.1 mrg { 2036 1.1 mrg vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm)); 2037 1.1 mrg } 2038 1.1 mrg 2039 1.1 mrg static __inline void si_stqx(qword a, qword b, qword c) 2040 1.1 mrg { 2041 1.1 mrg vec_st((vec_uchar16)(a), 2042 1.1 mrg 
si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))), 2043 1.1 mrg (vector unsigned char *)(0)); 2044 1.1 mrg } 2045 1.1 mrg 2046 1.1 mrg #endif /* !__SPU__ */ 2047 1.1 mrg #endif /* !_SI2VMX_H_ */ 2048 1.1 mrg 2049