/* Copyright (C) 2006-2019 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef _VMX2SPU_H_
#define _VMX2SPU_H_ 1

#ifdef __cplusplus

#ifdef __SPU__

#include <spu_intrinsics.h>
#include <vec_types.h>

/* This file maps generic VMX intrinsics and predicates to the SPU using
 * overloaded C++ functions.
 */

/************************************************************************
 *                           INTRINSICS
 ************************************************************************/
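/* Usage sketch (editorial illustration, not part of the original header):
 * VMX/AltiVec source can call these overloads unchanged when compiled
 * for the SPU, e.g.:
 *
 *   vec_float4 saxpy(vec_float4 a, vec_float4 x, vec_float4 y)
 *   {
 *     return vec_madd(a, x, y);   // dispatches to spu_madd
 *   }
 *
 * The vec_* types come from <vec_types.h>; ordinary C++ overload
 * resolution picks the variant matching the VMX operand types.
 */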
/* vec_abs (vector absolute value)
 * =======
 */
static inline vec_char16 vec_abs(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)(spu_add((vec_ushort8)(spu_and(spu_xor(a, 0xFF), 0x7F)), 0x101));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abs(vec_short8 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abs(vec_int4 a)
{
  return (spu_sel(spu_sub(0, a), a, spu_cmpgt(a, -1)));
}

static inline vec_float4 vec_abs(vec_float4 a)
{
  return ((vec_float4)(spu_rlmask(spu_sl((vec_uint4)(a), 1), -1)));
}

/* vec_abss (vector absolute value saturate)
 * ========
 */
static inline vec_char16 vec_abss(vec_char16 a)
{
  vec_char16 minus_a;

  minus_a = (vec_char16)spu_add((vec_short8)(spu_xor(a, -1)),
                                (vec_short8)(spu_and(spu_cmpgt((vec_uchar16)(a), 0x80), 1)));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_short8 vec_abss(vec_short8 a)
{
  vec_short8 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_short8)(spu_cmpeq(a, ((vec_short8){0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}

static inline vec_int4 vec_abss(vec_int4 a)
{
  vec_int4 minus_a;

  minus_a = spu_add(spu_sub(0, a), (vec_int4)(spu_cmpeq(a, ((vec_int4){0x80000000,0x80000000,0x80000000,0x80000000}))));
  return (spu_sel(minus_a, a, spu_cmpgt(a, -1)));
}


/* vec_add (vector add)
 * =======
 */
static inline vec_uchar16 vec_add(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_add((vec_ushort8)(a), (vec_ushort8)(b)),
                                spu_add(spu_and((vec_ushort8)(a), 0xFF00), spu_and((vec_ushort8)(b), 0xFF00)),
                                spu_splats((unsigned short)(0xFF00)))));
}

static inline vec_char16 vec_add(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_char16 vec_add(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)vec_add((vec_uchar16)(a), (vec_uchar16)(b)));
}

static inline vec_ushort8 vec_add(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_short8 b)
{
  return (spu_add(a, b));
}

static inline vec_short8 vec_add(vec_bshort8 a, vec_short8 b)
{
  return (spu_add((vec_short8)(a), b));
}

static inline vec_short8 vec_add(vec_short8 a, vec_bshort8 b)
{
  return (spu_add(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_add(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_int4 b)
{
  return (spu_add(a, b));
}

static inline vec_int4 vec_add(vec_bint4 a, vec_int4 b)
{
  return (spu_add((vec_int4)(a), b));
}

static inline vec_int4 vec_add(vec_int4 a, vec_bint4 b)
{
  return (spu_add(a, (vec_int4)(b)));
}

static inline vec_float4 vec_add(vec_float4 a, vec_float4 b)
{
  return (spu_add(a, b));
}

/* vec_addc (vector add carryout unsigned word)
 * ========
 */
#define vec_addc(_a, _b)	spu_genc(_a, _b)

/* vec_adds (vector add saturated)
 * ========
 */
static inline vec_uchar16 vec_adds(vec_uchar16 a, vec_uchar16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){0, 16, 2, 18, 4, 20, 6, 22,
                                          8, 24, 10, 26, 12, 28, 14, 30}));
  d  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                          9, 25, 11, 27, 13, 29, 15, 31}));
  return (spu_or(d, spu_cmpeq(s, 1)));
}
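/* Note (editorial): the SPU has no byte adder, so the variant above splits
 * each input into even/odd bytes widened to halfwords, adds those, and
 * recombines the low bytes of the sums; wherever the gathered carry byte
 * equals 1 the result byte is forced to 0xFF (unsigned saturation).
 */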
static inline vec_char16 vec_adds(vec_char16 a, vec_char16 b)
{
  vec_uchar16 s1, s2, s, d;

  s1 = (vec_uchar16)(spu_add(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8)));
  s2 = (vec_uchar16)(spu_add(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF)));
  s  = spu_shuffle(s1, s2, ((vec_uchar16){1, 17, 3, 19, 5, 21, 7, 23,
                                          9, 25, 11, 27, 13, 29, 15, 31}));
  d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_and(s, (vec_uchar16)(spu_nor(a, b))), 0x7F));
  d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_nor(s, (vec_uchar16)(spu_nand(a, b))), 0x7F));
  return ((vec_char16)(d));
}

static inline vec_char16 vec_adds(vec_bchar16 a, vec_char16 b)
{
  return (vec_adds((vec_char16)(a), b));
}

static inline vec_char16 vec_adds(vec_char16 a, vec_bchar16 b)
{
  return (vec_adds(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_adds(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 s, d;

  s = spu_add(a, b);
  d = spu_or(s, spu_rlmaska(spu_sel(spu_xor(s, -1), a, spu_eqv(a, b)), -15));
  return (d);
}

static inline vec_short8 vec_adds(vec_short8 a, vec_short8 b)
{
  vec_short8 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_and(s, spu_nor(a, b)), -15)));
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_nor(s, spu_nand(a, b)), -15)));
  return (d);
}

static inline vec_short8 vec_adds(vec_bshort8 a, vec_short8 b)
{
  return (vec_adds((vec_short8)(a), b));
}

static inline vec_short8 vec_adds(vec_short8 a, vec_bshort8 b)
{
  return (vec_adds(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_adds(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(spu_add(a, b), spu_rlmaska(spu_sl(spu_genc(a, b), 31), -31)));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_int4 b)
{
  vec_int4 s, d;

  s = spu_add(a, b);
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)spu_rlmaska(spu_and(s, spu_nor(a, b)), -31));
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)spu_rlmaska(spu_nor(s, spu_nand(a, b)), -31));
  return (d);
}

static inline vec_int4 vec_adds(vec_bint4 a, vec_int4 b)
{
  return (vec_adds((vec_int4)(a), b));
}

static inline vec_int4 vec_adds(vec_int4 a, vec_bint4 b)
{
  return (vec_adds(a, (vec_int4)(b)));
}
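/* Note (editorial): the signed saturating adds above detect overflow from
 * sign bits alone: spu_nor(a, b) has its sign bit set only when both
 * addends are non-negative, so a negative (s & nor(a, b)) flags positive
 * overflow (clamp to MAX), while a negative nor(s, nand(a, b)) flags
 * negative overflow (clamp to MIN); spu_rlmaska(..., -15/-31) replicates
 * the flag bit across the whole element to form the select mask.
 */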
/* vec_and (vector logical and)
 * =======
 */
static inline vec_uchar16 vec_and(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_char16 b)
{
  return (spu_and(a, b));
}

static inline vec_char16 vec_and(vec_bchar16 a, vec_char16 b)
{
  return (spu_and((vec_char16)(a), b));
}

static inline vec_char16 vec_and(vec_char16 a, vec_bchar16 b)
{
  return (spu_and(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_and(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_short8 b)
{
  return (spu_and(a, b));
}

static inline vec_short8 vec_and(vec_bshort8 a, vec_short8 b)
{
  return (spu_and((vec_short8)(a), b));
}

static inline vec_short8 vec_and(vec_short8 a, vec_bshort8 b)
{
  return (spu_and(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_and(vec_uint4 a, vec_uint4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_int4 b)
{
  return (spu_and(a, b));
}

static inline vec_int4 vec_and(vec_bint4 a, vec_int4 b)
{
  return (spu_and((vec_int4)(a), b));
}

static inline vec_int4 vec_and(vec_int4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_int4)(b)));
}

static inline vec_float4 vec_and(vec_float4 a, vec_float4 b)
{
  return (spu_and(a, b));
}

static inline vec_float4 vec_and(vec_bint4 a, vec_float4 b)
{
  return (spu_and((vec_float4)(a), b));
}

static inline vec_float4 vec_and(vec_float4 a, vec_bint4 b)
{
  return (spu_and(a, (vec_float4)(b)));
}
/* vec_andc (vector logical and with complement)
 * ========
 */
static inline vec_uchar16 vec_andc(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_char16 b)
{
  return (spu_andc(a, b));
}

static inline vec_char16 vec_andc(vec_bchar16 a, vec_char16 b)
{
  return (spu_andc((vec_char16)(a), b));
}

static inline vec_char16 vec_andc(vec_char16 a, vec_bchar16 b)
{
  return (spu_andc(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_andc(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_short8 b)
{
  return (spu_andc(a, b));
}

static inline vec_short8 vec_andc(vec_bshort8 a, vec_short8 b)
{
  return (spu_andc((vec_short8)(a), b));
}

static inline vec_short8 vec_andc(vec_short8 a, vec_bshort8 b)
{
  return (spu_andc(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_andc(vec_uint4 a, vec_uint4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_int4 b)
{
  return (spu_andc(a, b));
}

static inline vec_int4 vec_andc(vec_bint4 a, vec_int4 b)
{
  return (spu_andc((vec_int4)(a), b));
}

static inline vec_int4 vec_andc(vec_int4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_int4)(b)));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_float4 b)
{
  return (spu_andc(a, b));
}

static inline vec_float4 vec_andc(vec_bint4 a, vec_float4 b)
{
  return (spu_andc((vec_float4)(a), b));
}

static inline vec_float4 vec_andc(vec_float4 a, vec_bint4 b)
{
  return (spu_andc(a, (vec_float4)(b)));
}

/* vec_avg (vector average)
 * =======
 */
static inline vec_uchar16 vec_avg(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_avg(a, b));
}

static inline vec_char16 vec_avg(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)(spu_xor(spu_avg((vec_uchar16)(a), (vec_uchar16)(b)),
                               (vec_uchar16)(spu_and(spu_xor(a, b), 0x80)))));
}

static inline vec_ushort8 vec_avg(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_short8 vec_avg(vec_short8 a, vec_short8 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_uint4 vec_avg(vec_uint4 a, vec_uint4 b)
{
  return (spu_add(spu_add(spu_rlmask(a, -1), spu_rlmask(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}

static inline vec_int4 vec_avg(vec_int4 a, vec_int4 b)
{
  return (spu_add(spu_add(spu_rlmaska(a, -1), spu_rlmaska(b, -1)),
                  spu_and(spu_or(a, b), 1)));
}


/* vec_ceil (vector ceiling)
 * ========
 */
static inline vec_float4 vec_ceil(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

  a = spu_add(a, (vec_float4)(spu_and(spu_xor(spu_rlmaska((vec_int4)a, -31), -1), spu_splats((signed int)0x3F7FFFFF))));
  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}
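/* Note (editorial): vec_ceil above rounds up by adding just-under-one
 * (0x3F7FFFFF) to non-negative inputs only, then truncating toward zero:
 * the bits below the binary point are cleared with a mask derived from
 * the biased exponent. vec_floor further below reuses the same mask
 * construction, subtracting the bias from negative inputs instead.
 */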
/* vec_cmpb (vector compare bounds floating-point)
 * ========
 */
static inline vec_int4 vec_cmpb(vec_float4 a, vec_float4 b)
{
  vec_int4 b0 = (vec_int4)spu_splats(0x80000000);
  vec_int4 b1 = (vec_int4)spu_splats(0x40000000);

  return (spu_or(spu_and((vec_int4)spu_cmpgt(a, b), b0),
                 spu_and((vec_int4)spu_cmpgt(spu_xor(b, (vec_float4)(b0)), a), b1)));
}

/* vec_cmpeq (vector compare equal)
 * =========
 */
#define vec_cmpeq(_a, _b)	spu_cmpeq(_a, _b)


/* vec_cmpge (vector compare greater than or equal)
 * =========
 */
static inline vec_bint4 vec_cmpge(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(b, a), -1));
}


/* vec_cmpgt (vector compare greater than)
 * =========
 */
#define vec_cmpgt(_a, _b)	spu_cmpgt(_a, _b)


/* vec_cmple (vector compare less than or equal)
 * =========
 */
static inline vec_bint4 vec_cmple(vec_float4 a, vec_float4 b)
{
  return (spu_xor(spu_cmpgt(a, b), -1));
}


/* vec_cmplt (vector compare less than)
 * =========
 */
#define vec_cmplt(_a, _b)	spu_cmpgt(_b, _a)


/* vec_ctf (vector convert from fixed-point word)
 * =======
 */
#define vec_ctf(_a, _b)		spu_convtf(_a, _b)


/* vec_cts (vector convert to signed fixed-point word saturate)
 * =======
 */
#define vec_cts(_a, _b)		spu_convts(_a, _b)


/* vec_ctu (vector convert to unsigned fixed-point word saturate)
 * =======
 */
#define vec_ctu(_a, _b)		spu_convtu(_a, _b)


/* vec_dss (vector data stream stop)
 * =======
 */
#define vec_dss(_a)


/* vec_dssall (vector data stream stop all)
 * ==========
 */
#define vec_dssall()


/* vec_dst (vector data stream touch)
 * =======
 */
#define vec_dst(_a, _b, _c)


/* vec_dstst (vector data stream touch for store)
 * =========
 */
#define vec_dstst(_a, _b, _c)


/* vec_dststt (vector data stream touch for store transient)
 * ==========
 */
#define vec_dststt(_a, _b, _c)


/* vec_dstt (vector data stream touch transient)
 * ========
 */
#define vec_dstt(_a, _b, _c)


/* vec_expte (vector 2 raised to the exponent estimate floating-point)
 * =========
 */
static inline vec_float4 vec_expte(vec_float4 a)
{
  vec_float4 bias, frac, exp;
  vec_int4 ia;

  bias = (vec_float4)(spu_andc(spu_splats((signed int)0x3F7FFFFF), spu_rlmaska((vec_int4)(a), -31)));
  ia   = spu_convts(spu_add(a, bias), 0);
  frac = spu_sub(spu_convtf(ia, 0), a);
  exp  = (vec_float4)(spu_sl(spu_add(ia, 127), 23));

  return (spu_mul(spu_madd(spu_madd(spu_splats(0.17157287f), frac, spu_splats(-0.67157287f)),
                           frac, spu_splats(1.0f)), exp));
}
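/* Note (editorial): vec_expte above splits the input into an integer part,
 * which is placed directly into the result's exponent field, and a
 * fractional remainder, which is folded in through a small quadratic
 * polynomial; like the VMX instruction it emulates, the result is only
 * an estimate.
 */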
/* vec_floor (vector floor)
 * =========
 */
static inline vec_float4 vec_floor(vec_float4 a)
{
  vec_int4  exp;
  vec_uint4 mask;

  a = spu_sub(a, (vec_float4)(spu_and(spu_rlmaska((vec_int4)a, -31), spu_splats((signed int)0x3F7FFFFF))));
  exp  = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  return ((vec_float4)(spu_andc((vec_uint4)(a), mask)));
}


/* vec_ld (vector load indexed)
 * ======
 */
static inline vec_uchar16 vec_ld(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_uchar16 vec_ld(int a, vec_uchar16 *b)
{
  return (*((vec_uchar16 *)((unsigned char *)(b)+a)));
}

static inline vec_char16 vec_ld(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_char16 vec_ld(int a, vec_char16 *b)
{
  return (*((vec_char16 *)((signed char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_ushort8 vec_ld(int a, vec_ushort8 *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_ld(int a, vec_short8 *b)
{
  return (*((vec_short8 *)((signed char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_uint4 vec_ld(int a, vec_uint4 *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_ld(int a, vec_int4 *b)
{
  return (*((vec_int4 *)((signed char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

static inline vec_float4 vec_ld(int a, vec_float4 *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}
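/* Note (editorial): on the SPU every local-store access is a 16-byte
 * quadword load whose low four address bits are ignored by the hardware,
 * so the plain pointer dereferences above fetch the same aligned quadword
 * that VMX's lvx would; no explicit masking of the address is needed.
 */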
/* vec_lde (vector load element indexed)
 * =======
 */
static inline vec_uchar16 vec_lde(int a, unsigned char *b)
{
  return (*((vec_uchar16 *)(b+a)));
}

static inline vec_char16 vec_lde(int a, signed char *b)
{
  return (*((vec_char16 *)(b+a)));
}

static inline vec_ushort8 vec_lde(int a, unsigned short *b)
{
  return (*((vec_ushort8 *)((unsigned char *)(b)+a)));
}

static inline vec_short8 vec_lde(int a, signed short *b)
{
  return (*((vec_short8 *)((unsigned char *)(b)+a)));
}

static inline vec_uint4 vec_lde(int a, unsigned int *b)
{
  return (*((vec_uint4 *)((unsigned char *)(b)+a)));
}

static inline vec_int4 vec_lde(int a, signed int *b)
{
  return (*((vec_int4 *)((unsigned char *)(b)+a)));
}

static inline vec_float4 vec_lde(int a, float *b)
{
  return (*((vec_float4 *)((unsigned char *)(b)+a)));
}

/* vec_ldl (vector load indexed LRU)
 * =======
 */
#define vec_ldl(_a, _b)		vec_ld(_a, _b)


/* vec_loge (vector log2 estimate floating-point)
 * ========
 */
static inline vec_float4 vec_loge(vec_float4 a)
{
  vec_int4 exp;
  vec_float4 frac;

  exp  = spu_add((vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)), -127);
  frac = (vec_float4)(spu_sub((vec_int4)(a), spu_sl(exp, 23)));

  return (spu_madd(spu_madd(spu_splats(-0.33985f), frac, spu_splats(2.01955f)),
                   frac, spu_sub(spu_convtf(exp, 0), spu_splats(1.6797f))));
}


/* vec_lvsl (vector load for shift left)
 * ========
 */
static inline vec_uchar16 vec_lvsl(int a, unsigned char *b)
{
  return ((vec_uchar16)spu_add((vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))),
                               ((vec_ushort8){0x0001, 0x0203, 0x0405, 0x0607,
                                              0x0809, 0x0A0B, 0x0C0D, 0x0E0F})));
}

static inline vec_uchar16 vec_lvsl(int a, signed char *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, short *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, unsigned int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, int *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsl(int a, float *b)
{
  return (vec_lvsl(a, (unsigned char *)b));
}
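/* Note (editorial): vec_lvsl above splats the 4-bit misalignment of the
 * effective address into every byte and adds the byte ramp 0x00..0x0F
 * (encoded as halfword pairs), reproducing the VMX permute vector
 * {n, n+1, ..., n+15} for use with vec_perm; vec_lvsr below subtracts
 * the splat from a 0x10..0x1F ramp instead.
 */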
/* vec_lvsr (vector load for shift right)
 * ========
 */
static inline vec_uchar16 vec_lvsr(int a, unsigned char *b)
{
  return ((vec_uchar16)(spu_sub(((vec_ushort8){0x1011, 0x1213, 0x1415, 0x1617,
                                               0x1819, 0x1A1B, 0x1C1D, 0x1E1F}),
                                (vec_ushort8)(spu_splats((unsigned char)((a + (int)(b)) & 0xF))))));
}

static inline vec_uchar16 vec_lvsr(int a, signed char *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, short *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, unsigned int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, int *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

static inline vec_uchar16 vec_lvsr(int a, float *b)
{
  return (vec_lvsr(a, (unsigned char *)b));
}

/* vec_madd (vector multiply add)
 * ========
 */
#define vec_madd(_a, _b, _c)	spu_madd(_a, _b, _c)


/* vec_madds (vector multiply add saturate)
 * =========
 */
static inline vec_short8 vec_madds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return (vec_adds(c, spu_sel((vec_short8)(spu_sl(spu_mule(a, b), 1)),
                              (vec_short8)(spu_rlmask(spu_mulo(a, b), -15)),
                              ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF}))));
}
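/* Note (editorial): vec_madds above forms full 32-bit products with the
 * SPU's even/odd halfword multiplies, extracts bits 30..15 of each
 * product (the VMX (a*b)>>15 positioning), interleaves the even and odd
 * results with the select mask, and then reuses the saturating vec_adds
 * defined earlier in this file.
 */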
/* vec_max (vector maximum)
 * =======
 */
static inline vec_uchar16 vec_max(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_char16 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_max(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel(b, (vec_char16)(a), spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_max(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel((vec_char16)(b), a, spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_max(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_short8 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_max(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel(b, (vec_short8)(a), spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_max(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel((vec_short8)(b), a, spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_max(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_int4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_max(vec_bint4 a, vec_int4 b)
{
  return (spu_sel(b, (vec_int4)(a), spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_max(vec_int4 a, vec_bint4 b)
{
  return (spu_sel((vec_int4)(b), a, spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_max(vec_float4 a, vec_float4 b)
{
  return (spu_sel(b, a, spu_cmpgt(a, b)));
}


/* vec_mergeh (vector merge high)
 * ==========
 */
static inline vec_uchar16 vec_mergeh(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_char16 vec_mergeh(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 16, 1, 17, 2, 18, 3, 19,
                                           4, 20, 5, 21, 6, 22, 7, 23})));
}

static inline vec_ushort8 vec_mergeh(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_short8 vec_mergeh(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 16, 17, 2, 3, 18, 19,
                                           4, 5, 20, 21, 6, 7, 22, 23})));
}

static inline vec_uint4 vec_mergeh(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_int4 vec_mergeh(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}

static inline vec_float4 vec_mergeh(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){0, 1, 2, 3, 16, 17, 18, 19,
                                           4, 5, 6, 7, 20, 21, 22, 23})));
}
/* vec_mergel (vector merge low)
 * ==========
 */
static inline vec_uchar16 vec_mergel(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_char16 vec_mergel(vec_char16 a, vec_char16 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8, 24,  9, 25, 10, 26, 11, 27,
                                           12, 28, 13, 29, 14, 30, 15, 31})));
}

static inline vec_ushort8 vec_mergel(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mergel(vec_short8 a, vec_short8 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 24, 25, 10, 11, 26, 27,
                                           12, 13, 28, 29, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mergel(vec_uint4 a, vec_uint4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_int4 vec_mergel(vec_int4 a, vec_int4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

static inline vec_float4 vec_mergel(vec_float4 a, vec_float4 b)
{
  return (spu_shuffle(a, b, ((vec_uchar16){ 8,  9, 10, 11, 24, 25, 26, 27,
                                           12, 13, 14, 15, 28, 29, 30, 31})));
}

/* vec_mfvscr (vector move from vector status and control register)
 * ==========
 */
static inline vec_ushort8 vec_mfvscr()
{
  return ((vec_ushort8)spu_splats(0));		/* not supported */
}


/* vec_min (vector minimum)
 * =======
 */
static inline vec_uchar16 vec_min(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_char16 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_char16 vec_min(vec_bchar16 a, vec_char16 b)
{
  return (spu_sel((vec_char16)(a), b, spu_cmpgt((vec_char16)(a), b)));
}

static inline vec_char16 vec_min(vec_char16 a, vec_bchar16 b)
{
  return (spu_sel(a, (vec_char16)(b), spu_cmpgt(a, (vec_char16)(b))));
}

static inline vec_ushort8 vec_min(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_short8 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_short8 vec_min(vec_bshort8 a, vec_short8 b)
{
  return (spu_sel((vec_short8)(a), b, spu_cmpgt((vec_short8)(a), b)));
}

static inline vec_short8 vec_min(vec_short8 a, vec_bshort8 b)
{
  return (spu_sel(a, (vec_short8)(b), spu_cmpgt(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_min(vec_uint4 a, vec_uint4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_int4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}

static inline vec_int4 vec_min(vec_bint4 a, vec_int4 b)
{
  return (spu_sel((vec_int4)(a), b, spu_cmpgt((vec_int4)(a), b)));
}

static inline vec_int4 vec_min(vec_int4 a, vec_bint4 b)
{
  return (spu_sel(a, (vec_int4)(b), spu_cmpgt(a, (vec_int4)(b))));
}

static inline vec_float4 vec_min(vec_float4 a, vec_float4 b)
{
  return (spu_sel(a, b, spu_cmpgt(a, b)));
}
/* vec_mladd (vector multiply low and add unsigned half word)
 * =========
 */
static inline vec_short8 vec_mladd(vec_short8 a, vec_short8 b, vec_short8 c)
{
  return ((vec_short8)(spu_shuffle(spu_madd((vec_short8)(spu_rl((vec_uint4)(a), -16)),
                                            (vec_short8)(spu_rl((vec_uint4)(b), -16)),
                                            (vec_int4)(spu_rl((vec_uint4)(c), -16))),
                                   spu_madd(a, b, spu_extend(c)),
                                   ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                                  10, 11, 26, 27, 14, 15, 30, 31}))));
}

static inline vec_ushort8 vec_mladd(vec_ushort8 a, vec_ushort8 b, vec_ushort8 c)
{
  return ((vec_ushort8)(vec_mladd((vec_short8)(a), (vec_short8)(b), (vec_short8)(c))));
}

static inline vec_short8 vec_mladd(vec_ushort8 a, vec_short8 b, vec_short8 c)
{
  return (vec_mladd((vec_short8)(a), b, c));
}

static inline vec_short8 vec_mladd(vec_short8 a, vec_ushort8 b, vec_ushort8 c)
{
  return (vec_mladd(a, (vec_short8)(b), (vec_short8)(c)));
}
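/* Note (editorial): spu_madd multiplies only the odd halfword of each
 * word, so vec_mladd above runs it twice, once on the inputs rotated by a
 * halfword to reach the even elements, and shuffles the low 16 bits of
 * each 32-bit result back into place; only the low halfword of each
 * product survives, matching VMX's modulo vec_mladd semantics.
 */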
/* vec_mradds (vector multiply round and add saturate)
 * ==========
 */
static inline vec_short8 vec_mradds(vec_short8 a, vec_short8 b, vec_short8 c)
{
  vec_int4 round = (vec_int4)spu_splats(0x4000);
  vec_short8 hi, lo;

  hi = (vec_short8)(spu_sl(spu_add(spu_mule(a, b), round), 1));
  lo = (vec_short8)(spu_rlmask(spu_add(spu_mulo(a, b), round), -15));

  return (vec_adds(spu_sel(hi, lo, ((vec_ushort8){0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF, 0, 0xFFFF})), c));
}


/* vec_msum (vector multiply sum)
 * ========
 */
static inline vec_uint4 vec_msum(vec_uchar16 a, vec_uchar16 b, vec_uint4 c)
{
  vec_ushort8 a1, a2, b1, b2;
  vec_uint4 p1, p2;

  a1 = spu_and((vec_ushort8)(a), 0xFF);
  a2 = spu_rlmask((vec_ushort8)(a), -8);
  b1 = spu_and((vec_ushort8)(b), 0xFF);
  b2 = spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_int4 vec_msum(vec_char16 a, vec_uchar16 b, vec_int4 c)
{
  vec_short8 a1, a2, b1, b2;
  vec_int4 p1, p2;

  a1 = (vec_short8)(spu_extend(a));
  a2 = spu_rlmaska((vec_short8)(a), -8);
  b1 = (vec_short8)(spu_and((vec_ushort8)(b), 0xFF));
  b2 = (vec_short8)spu_rlmask((vec_ushort8)(b), -8);

  p1 = spu_add(spu_mulo(a1, b1), spu_mulo(spu_rlqwbyte(a1, -2), spu_rlqwbyte(b1, -2)));
  p2 = spu_add(spu_mulo(a2, b2), spu_mulo(spu_rlqwbyte(a2, -2), spu_rlqwbyte(b2, -2)));
  return (spu_add(p2, spu_add(p1, c)));
}

static inline vec_uint4 vec_msum(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

static inline vec_int4 vec_msum(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (spu_add(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}
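/* Note (editorial): the SPU has no multiply-sum instruction, so the
 * vec_msum variants above do everything with spu_mulo (odd halfwords):
 * rotating the quadword right two bytes (spu_rlqwbyte(..., -2)) moves the
 * even halfwords into odd position, so two multiplies plus adds
 * accumulate all the partial products of each word.
 */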
/* vec_msums (vector multiply sum saturate)
 * =========
 */
static inline vec_uint4 vec_msums(vec_ushort8 a, vec_ushort8 b, vec_uint4 c)
{
  vec_uint4 p1, p2;

  p1 = spu_mulo(a, b);
  p2 = spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2));

  return (vec_adds(p2, vec_adds(p1, c)));
}

static inline vec_int4 vec_msums(vec_short8 a, vec_short8 b, vec_int4 c)
{
  return (vec_adds(spu_add(spu_mulo(a, b), spu_mulo(spu_rlqwbyte(a, -2), spu_rlqwbyte(b, -2))), c));
}

/* vec_mtvscr (vector move to vector status and control register)
 * ==========
 */
#define vec_mtvscr(_a)		/* not supported */


/* vec_mule (vector multiply even)
 * ========
 */
static inline vec_ushort8 vec_mule(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_uint4)(a), -24)),
                             (vec_ushort8)(spu_rlmask((vec_uint4)(b), -24)));
  lo = (vec_ushort8)spu_mulo((vec_ushort8)(spu_rlmask((vec_short8)(a), -8)),
                             (vec_ushort8)(spu_rlmask((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mule(vec_char16 a, vec_char16 b)
{
  vec_short8 hi, lo;

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(a), -24)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(b), -24)));
  lo = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_short8)(a), -8)),
                            (vec_short8)(spu_rlmaska((vec_short8)(b), -8)));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mule(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo((vec_ushort8)spu_rlmask((vec_uint4)(a), -16),
                   (vec_ushort8)spu_rlmask((vec_uint4)(b), -16)));
}

static inline vec_int4 vec_mule(vec_short8 a, vec_short8 b)
{
  return (spu_mulo((vec_short8)spu_rlmaska((vec_int4)(a), -16),
                   (vec_short8)spu_rlmaska((vec_int4)(b), -16)));
}


/* vec_mulo (vector multiply odd)
 * ========
 */
static inline vec_ushort8 vec_mulo(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  hi = (vec_ushort8)spu_mulo((vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(a), -16), 0xFF)),
                             (vec_ushort8)(spu_and(spu_rlmask((vec_uint4)(b), -16), 0xFF)));
  lo = (vec_ushort8)spu_mulo(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));

  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_short8 vec_mulo(vec_char16 a, vec_char16 b)
{
  vec_short8 aa, bb, hi, lo;

  aa = spu_extend(a);
  bb = spu_extend(b);

  hi = (vec_short8)spu_mulo((vec_short8)(spu_rlmaska((vec_uint4)(aa), -16)),
                            (vec_short8)(spu_rlmaska((vec_uint4)(bb), -16)));
  lo = (vec_short8)spu_mulo(aa, bb);
  return (spu_shuffle(hi, lo, ((vec_uchar16){ 2,  3, 18, 19,  6,  7, 22, 23,
                                             10, 11, 26, 27, 14, 15, 30, 31})));
}

static inline vec_uint4 vec_mulo(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_mulo(a, b));
}

static inline vec_int4 vec_mulo(vec_short8 a, vec_short8 b)
{
  return (spu_mulo(a, b));
}
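/* Note (editorial): the SPU multiplies only the odd halfword of each
 * word, so the vec_mule/vec_mulo variants above first shift or mask the
 * desired (even or odd) elements into that position, widen bytes to
 * halfwords where needed, and interleave the two product sets with a
 * final shuffle.
 */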
/* vec_nmsub (vector negative multiply subtract)
 * =========
 */
#define vec_nmsub(_a, _b, _c)	spu_nmsub(_a, _b, _c)


/* vec_nor (vector logical nor)
 * =======
 */
#define vec_nor(_a, _b)		spu_nor(_a, _b)


/* vec_or (vector logical or)
 * ======
 */
static inline vec_uchar16 vec_or(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_char16 b)
{
  return (spu_or(a, b));
}

static inline vec_char16 vec_or(vec_bchar16 a, vec_char16 b)
{
  return (spu_or((vec_char16)(a), b));
}

static inline vec_char16 vec_or(vec_char16 a, vec_bchar16 b)
{
  return (spu_or(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_or(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_short8 b)
{
  return (spu_or(a, b));
}

static inline vec_short8 vec_or(vec_bshort8 a, vec_short8 b)
{
  return (spu_or((vec_short8)(a), b));
}

static inline vec_short8 vec_or(vec_short8 a, vec_bshort8 b)
{
  return (spu_or(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_or(vec_uint4 a, vec_uint4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_int4 b)
{
  return (spu_or(a, b));
}

static inline vec_int4 vec_or(vec_bint4 a, vec_int4 b)
{
  return (spu_or((vec_int4)(a), b));
}

static inline vec_int4 vec_or(vec_int4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_int4)(b)));
}

static inline vec_float4 vec_or(vec_float4 a, vec_float4 b)
{
  return (spu_or(a, b));
}

static inline vec_float4 vec_or(vec_bint4 a, vec_float4 b)
{
  return (spu_or((vec_float4)(a), b));
}

static inline vec_float4 vec_or(vec_float4 a, vec_bint4 b)
{
  return (spu_or(a, (vec_float4)(b)));
}
/* vec_pack (vector pack)
 * ========
 */
static inline vec_uchar16 vec_pack(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                        17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_char16 vec_pack(vec_short8 a, vec_short8 b)
{
  return ((vec_char16)spu_shuffle(a, b, ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                       17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_ushort8 vec_pack(vec_uint4 a, vec_uint4 b)
{
  return ((vec_ushort8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                        18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_short8 vec_pack(vec_int4 a, vec_int4 b)
{
  return ((vec_short8)spu_shuffle(a, b, ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                       18, 19, 22, 23, 26, 27, 30, 31})));
}


/* vec_packpx (vector pack pixel)
 * ==========
 */
static inline vec_pixel8 vec_packpx(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 x03FF = (vec_uint4)(spu_splats((unsigned short)0x03FF));
  vec_uint4 x001F = (vec_uint4)(spu_splats((unsigned short)0x001F));

  return ((vec_pixel8)(spu_shuffle(spu_sel(spu_sel(spu_sl(a, 7), spu_sl(a, 10), x03FF),
                                           spu_sl(a, 13), x001F),
                                   spu_sel(spu_sel(spu_sl(b, 7), spu_sl(b, 10), x03FF),
                                           spu_sl(b, 13), x001F),
                                   ((vec_uchar16){ 0,  1,  4,  5,  8,  9, 12, 13,
                                                  16, 17, 20, 21, 24, 25, 28, 29}))));
}
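/* Note (editorial): vec_packpx above converts each 8:8:8 channel word to
 * a 1:5:5:5 pixel by left-shifting the word 7, 10 and 13 bits so the top
 * five bits of each channel line up, merging the three shifted copies
 * with the 0x03FF/0x001F select masks, and keeping the high halfword of
 * every word in the final shuffle.
 */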
/* vec_packs (vector pack saturate)
 * =========
 */
static inline vec_uchar16 vec_packs(vec_ushort8 a, vec_ushort8 b)
{
  vec_ushort8 max = spu_splats((unsigned short)0x00FF);

  return ((vec_uchar16)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, 255)),
                                    spu_sel(b, max, spu_cmpgt(b, 255)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_char16 vec_packs(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x007F);
  vec_short8 min = spu_splats((signed short)0xFF80);

  return ((vec_char16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 127)), spu_cmpgt(a, -128)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 127)), spu_cmpgt(b, -128)),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packs(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0x0000FFFF);

  return ((vec_ushort8)(spu_shuffle(spu_sel(a, max, spu_cmpgt(a, max)),
                                    spu_sel(b, max, spu_cmpgt(b, max)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}

static inline vec_short8 vec_packs(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x00007FFF);
  vec_int4 min = spu_splats((signed int)0xFFFF8000);

  return ((vec_short8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                   spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31}))));
}


/* vec_packsu (vector pack saturate unsigned)
 * ==========
 */
static inline vec_uchar16 vec_packsu(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_uchar16)spu_shuffle(spu_or(a, (vec_ushort8)(spu_cmpgt(a, 255))),
                                   spu_or(b, (vec_ushort8)(spu_cmpgt(b, 255))),
                                   ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                  17, 19, 21, 23, 25, 27, 29, 31})));
}

static inline vec_uchar16 vec_packsu(vec_short8 a, vec_short8 b)
{
  vec_short8 max = spu_splats((signed short)0x00FF);
  vec_short8 min = spu_splats((signed short)0x0000);

  return ((vec_uchar16)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, 255)), spu_cmpgt(a, 0)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, 255)), spu_cmpgt(b, 0)),
                                    ((vec_uchar16){ 1,  3,  5,  7,  9, 11, 13, 15,
                                                   17, 19, 21, 23, 25, 27, 29, 31}))));
}

static inline vec_ushort8 vec_packsu(vec_uint4 a, vec_uint4 b)
{
  vec_uint4 max = spu_splats((unsigned int)0xFFFF);

  return ((vec_ushort8)spu_shuffle(spu_or(a, (vec_uint4)(spu_cmpgt(a, max))),
                                   spu_or(b, (vec_uint4)(spu_cmpgt(b, max))),
                                   ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                  18, 19, 22, 23, 26, 27, 30, 31})));
}

static inline vec_ushort8 vec_packsu(vec_int4 a, vec_int4 b)
{
  vec_int4 max = spu_splats((signed int)0x0000FFFF);
  vec_int4 min = spu_splats((signed int)0x00000000);

  return ((vec_ushort8)(spu_shuffle(spu_sel(min, spu_sel(a, max, spu_cmpgt(a, max)), spu_cmpgt(a, min)),
                                    spu_sel(min, spu_sel(b, max, spu_cmpgt(b, max)), spu_cmpgt(b, min)),
                                    ((vec_uchar16){ 2,  3,  6,  7, 10, 11, 14, 15,
                                                   18, 19, 22, 23, 26, 27, 30, 31}))));
}


/* vec_perm (vector permute)
 * ========
 */
static inline vec_uchar16 vec_perm(vec_uchar16 a, vec_uchar16 b, vec_uchar16 c)
{
  return (spu_shuffle(a, b, spu_and(c, 0x1F)));
}

static inline vec_char16 vec_perm(vec_char16 a, vec_char16 b, vec_uchar16 c)
{
  return ((vec_char16)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_ushort8 vec_perm(vec_ushort8 a, vec_ushort8 b, vec_uchar16 c)
{
  return ((vec_ushort8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_short8 vec_perm(vec_short8 a, vec_short8 b, vec_uchar16 c)
{
  return ((vec_short8)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_uint4 vec_perm(vec_uint4 a, vec_uint4 b, vec_uchar16 c)
{
  return ((vec_uint4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_int4 vec_perm(vec_int4 a, vec_int4 b, vec_uchar16 c)
{
  return ((vec_int4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}

static inline vec_float4 vec_perm(vec_float4 a, vec_float4 b, vec_uchar16 c)
{
  return ((vec_float4)(vec_perm((vec_uchar16)(a), (vec_uchar16)(b), c)));
}


/* vec_re (vector reciprocal estimate)
 * ======
 */
#define vec_re(_a)		spu_re(_a)
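/* Note (editorial): vec_perm masks the selector with 0x1F because SPU's
 * spu_shuffle is not a pure modulo-32 byte select: selector bytes with
 * high bit patterns 10x, 110 and 111 generate the constants 0x00, 0xFF
 * and 0x80 instead. Clearing those bits restores VMX's wrap-around
 * permute semantics.
 */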

/* vec_re (vector reciprocal estimate)
 * ======
 */
#define vec_re(_a)      spu_re(_a)


/* vec_rl (vector rotate left)
 * ======
 */
static inline vec_uchar16 vec_rl(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 r1, r2;

  r1 = spu_rl(spu_and((vec_ushort8)(a), 0xFF), (vec_short8)spu_and((vec_ushort8)(b), 7));
  r2 = spu_rl(spu_and((vec_ushort8)(a), -256), (vec_short8)spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));
  return ((vec_uchar16)(spu_sel(spu_or(r2, spu_sl(r2, 8)), spu_or(r1, spu_rlmask(r1, -8)),
                                spu_splats((unsigned short)0xFF))));
}

static inline vec_char16 vec_rl(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_rl((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_rl(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_rl(a, (vec_short8)(b)));
}

static inline vec_short8 vec_rl(vec_short8 a, vec_ushort8 b)
{
  return (spu_rl(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_rl(vec_uint4 a, vec_uint4 b)
{
  return (spu_rl(a, (vec_int4)(b)));
}

static inline vec_int4 vec_rl(vec_int4 a, vec_uint4 b)
{
  return (spu_rl(a, (vec_int4)(b)));
}


/* vec_round (vector round)
 * =========
 */
static inline vec_float4 vec_round(vec_float4 a)
{
  vec_float4 s_half, s_one, d;
  vec_uint4 odd;
  vec_uint4 msb = spu_splats((unsigned int)0x80000000);
  vec_float4 half = spu_splats(0.5f);
  vec_int4 exp;
  vec_uint4 mask;

  s_half = (vec_float4)(spu_sel((vec_uint4)(half), (vec_uint4)(a), msb));
  a = spu_add(a, s_half);
  s_one = spu_add(s_half, s_half);
  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));

  odd = spu_and((vec_uint4)(spu_convts(a, 0)), 1);
  s_one = spu_andc(s_one, (vec_float4)spu_cmpeq(mask, 0));
  s_one = spu_and(s_one, spu_and((vec_float4)spu_cmpeq(spu_and((vec_uint4)(a), mask), 0),
                                 (vec_float4)spu_cmpeq(odd, 1)));
  d = spu_andc(a, (vec_float4)(mask));
  d = spu_sub(d, s_one);
  return (d);
}
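
/* Reading aid for vec_round (illustrative values): 0.5 carrying the sign
 * of each element is added, "mask" selects the fraction bits below 1.0
 * for that element's exponent, and clearing them truncates.  When the
 * add lands exactly on an odd integer -- the tie case -- s_one backs the
 * result off by 1.0, giving round-to-nearest with ties to even:
 *
 *   vec_round((vec_float4){ 0.5f, 1.5f, 2.5f, -2.5f })
 *   // -> { 0.0f, 2.0f, 2.0f, -2.0f }
 */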

/* vec_rsqrte (vector reciprocal square root estimate)
 * ==========
 */
#define vec_rsqrte(_a)  spu_rsqrte(_a)


/* vec_sel (vector select)
 * =======
 */
#define vec_sel(_a, _b, _c)     spu_sel(_a, _b, _c)


/* vec_sl (vector shift left)
 * ======
 */
static inline vec_uchar16 vec_sl(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  lo = spu_and(spu_sl((vec_ushort8)(a), spu_and((vec_ushort8)(b), 7)), 0xFF);
  hi = spu_sl(spu_and((vec_ushort8)(a), -256), spu_and(spu_rlmask((vec_ushort8)(b), -8), 7));

  return ((vec_uchar16)(spu_or(hi, lo)));
}

static inline vec_char16 vec_sl(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_sl((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_sl(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sl(a, spu_and(b, 15)));
}

static inline vec_short8 vec_sl(vec_short8 a, vec_ushort8 b)
{
  return (spu_sl(a, spu_and((vec_ushort8)(b), 15)));
}

static inline vec_uint4 vec_sl(vec_uint4 a, vec_uint4 b)
{
  return (spu_sl(a, spu_and(b, 31)));
}

static inline vec_int4 vec_sl(vec_int4 a, vec_uint4 b)
{
  return (spu_sl(a, spu_and(b, 31)));
}


/* vec_sld (vector shift left double)
 * =======
 */
#define vec_sld(_a, _b, _c)     spu_shuffle(_a, _b, ((vec_uchar16){ 0+(_c),  1+(_c),  2+(_c),  3+(_c), \
                                                                    4+(_c),  5+(_c),  6+(_c),  7+(_c), \
                                                                    8+(_c),  9+(_c), 10+(_c), 11+(_c), \
                                                                   12+(_c), 13+(_c), 14+(_c), 15+(_c)}))


/* vec_sll (vector shift left long)
 * =======
 */
#define vec_sll(_a, _b) spu_slqw(_a, spu_extract((vec_uint4)(_b), 0))


/* vec_slo (vector shift left by octet)
 * =======
 */
#define vec_slo(_a, _b) spu_slqwbytebc(_a, spu_extract((vec_uint4)(_b), 3) & 0x7F)


/* vec_splat (vector splat)
 * =========
 */
#define vec_splat(_a, _b)       spu_splats(spu_extract(_a, _b))


/* vec_splat_s8 (vector splat signed byte)
 * ============
 */
#define vec_splat_s8(_a)        spu_splats((signed char)(_a))


/* vec_splat_s16 (vector splat signed half-word)
 * =============
 */
#define vec_splat_s16(_a)       spu_splats((signed short)(_a))


/* vec_splat_s32 (vector splat signed word)
 * =============
 */
#define vec_splat_s32(_a)       spu_splats((signed int)(_a))


/* vec_splat_u8 (vector splat unsigned byte)
 * ============
 */
#define vec_splat_u8(_a)        spu_splats((unsigned char)(_a))


/* vec_splat_u16 (vector splat unsigned half-word)
 * =============
 */
#define vec_splat_u16(_a)       spu_splats((unsigned short)(_a))


/* vec_splat_u32 (vector splat unsigned word)
 * =============
 */
#define vec_splat_u32(_a)       spu_splats((unsigned int)(_a))
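
/* A quick sketch of the quadword shifts (illustrative values): vec_sld
 * returns 16 consecutive bytes of the 32-byte concatenation a:b starting
 * at byte offset _c (0-15), e.g.
 *
 *   vec_uchar16 a = { 0, 1, ..., 15 };    // bytes 0-15
 *   vec_uchar16 b = { 16, 17, ..., 31 };  // bytes 16-31
 *   vec_sld(a, b, 4);                     // -> bytes { 4, 5, ..., 19 }
 *
 * vec_sll shifts the entire quadword left by a bit count and vec_slo by
 * an octet count, both taken from _b.
 */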

/* vec_sr (vector shift right)
 * ======
 */
static inline vec_uchar16 vec_sr(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 hi, lo;

  lo = spu_rlmask(spu_and((vec_ushort8)(a), 0xFF), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7))));
  hi = spu_and(spu_rlmask((vec_ushort8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);

  return ((vec_uchar16)(spu_or(hi, lo)));
}

static inline vec_char16 vec_sr(vec_char16 a, vec_uchar16 b)
{
  return ((vec_char16)(vec_sr((vec_uchar16)(a), b)));
}

static inline vec_ushort8 vec_sr(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_rlmask(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
}

static inline vec_short8 vec_sr(vec_short8 a, vec_ushort8 b)
{
  return ((vec_short8)(vec_sr((vec_ushort8)(a), b)));
}

static inline vec_uint4 vec_sr(vec_uint4 a, vec_uint4 b)
{
  return (spu_rlmask(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
}

static inline vec_int4 vec_sr(vec_int4 a, vec_uint4 b)
{
  return ((vec_int4)(vec_sr((vec_uint4)(a), b)));
}


/* vec_sra (vector shift right algebraic)
 * =======
 */
static inline vec_char16 vec_sra(vec_char16 a, vec_uchar16 b)
{
  vec_short8 hi, lo;

  lo = spu_and(spu_rlmaska(spu_extend(a), spu_sub(0, (vec_short8)(spu_and((vec_ushort8)(b), 7)))), 0xFF);
  hi = spu_and(spu_rlmaska((vec_short8)(a), spu_sub(0, (vec_short8)(spu_and(spu_rlmask((vec_ushort8)(b), -8), 7)))), -256);

  return ((vec_char16)(spu_or(hi, lo)));
}

static inline vec_uchar16 vec_sra(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(vec_sra((vec_char16)(a), b)));
}

static inline vec_short8 vec_sra(vec_short8 a, vec_ushort8 b)
{
  return (spu_rlmaska(a, spu_sub(0, (vec_short8)(spu_and(b, 15)))));
}

static inline vec_ushort8 vec_sra(vec_ushort8 a, vec_ushort8 b)
{
  return ((vec_ushort8)(vec_sra((vec_short8)(a), b)));
}

static inline vec_int4 vec_sra(vec_int4 a, vec_uint4 b)
{
  return (spu_rlmaska(a, spu_sub(0, (vec_int4)(spu_and(b, 31)))));
}

static inline vec_uint4 vec_sra(vec_uint4 a, vec_uint4 b)
{
  return ((vec_uint4)(vec_sra((vec_int4)(a), b)));
}


/* vec_srl (vector shift right long)
 * =======
 */
#define vec_srl(_a, _b) spu_rlmaskqw(_a, 0-spu_extract((vec_int4)(_b), 3))


/* vec_sro (vector shift right by octet)
 * =======
 */
#define vec_sro(_a, _b) spu_rlmaskqwbyte(_a, 0 - ((spu_extract((vec_int4)(_b), 3) >> 3) & 0xF))


/* vec_st (vector store indexed)
 * ======
 */
static inline void vec_st(vec_uchar16 a, int b, unsigned char *c)
{
  *((vec_uchar16 *)(c+b)) = a;
}

static inline void vec_st(vec_uchar16 a, int b, vec_uchar16 *c)
{
  *((vec_uchar16 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_char16 a, int b, signed char *c)
{
  *((vec_char16 *)(c+b)) = a;
}

static inline void vec_st(vec_char16 a, int b, vec_char16 *c)
{
  *((vec_char16 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bchar16 a, int b, signed char *c)
{
  *((vec_bchar16 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_ushort8 a, int b, unsigned short *c)
{
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_ushort8 a, int b, vec_ushort8 *c)
{
  *((vec_ushort8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_short8 a, int b, signed short *c)
{
  *((vec_short8 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_short8 a, int b, vec_short8 *c)
{
  *((vec_short8 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bshort8 a, int b, signed short *c)
{
  *((vec_bshort8 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_uint4 a, int b, unsigned int *c)
{
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_uint4 a, int b, vec_uint4 *c)
{
  *((vec_uint4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_int4 a, int b, signed int *c)
{
  *((vec_int4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_int4 a, int b, vec_int4 *c)
{
  *((vec_int4 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_bint4 a, int b, signed int *c)
{
  *((vec_bint4 *)((signed char *)(c)+b)) = a;
}

static inline void vec_st(vec_float4 a, int b, float *c)
{
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
}

static inline void vec_st(vec_float4 a, int b, vec_float4 *c)
{
  *((vec_float4 *)((unsigned char *)(c)+b)) = a;
}
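
/* Alignment note: no explicit address masking is needed in these stores.
 * SPU local-store accesses always move full quadwords and ignore the low
 * four bits of the address, which matches the VMX vec_st behavior of
 * truncating the effective address to a 16-byte boundary.
 */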

/* vec_ste (vector store element indexed)
 * =======
 */
static inline void vec_ste(vec_uchar16 a, int b, unsigned char *c)
{
  unsigned char *ptr;

  ptr = c + b;
  *ptr = spu_extract(a, (int)(ptr) & 15);
}

static inline void vec_ste(vec_char16 a, int b, signed char *c)
{
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
}

static inline void vec_ste(vec_bchar16 a, int b, signed char *c)
{
  vec_ste((vec_uchar16)(a), b, (unsigned char *)(c));
}

static inline void vec_ste(vec_ushort8 a, int b, unsigned short *c)
{
  unsigned short *ptr;

  ptr = (unsigned short *)(((unsigned int)(c) + b) & ~1);
  *ptr = spu_extract(a, ((int)(ptr) >> 1) & 7);
}

static inline void vec_ste(vec_short8 a, int b, signed short *c)
{
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
}

static inline void vec_ste(vec_bshort8 a, int b, signed short *c)
{
  vec_ste((vec_ushort8)(a), b, (unsigned short *)(c));
}

static inline void vec_ste(vec_uint4 a, int b, unsigned int *c)
{
  unsigned int *ptr;

  ptr = (unsigned int *)(((unsigned int)(c) + b) & ~3);
  *ptr = spu_extract(a, ((int)(ptr) >> 2) & 3);
}

static inline void vec_ste(vec_int4 a, int b, signed int *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

static inline void vec_ste(vec_bint4 a, int b, signed int *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}

static inline void vec_ste(vec_float4 a, int b, float *c)
{
  vec_ste((vec_uint4)(a), b, (unsigned int *)(c));
}


/* vec_stl (vector store indexed LRU)
 * =======
 */
#define vec_stl(_a, _b, _c)     vec_st(_a, _b, _c)
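
/* Worked example (hypothetical address): storing a word element with
 *
 *   unsigned int *p = (unsigned int *)0x1008;
 *   vec_ste(v, 0, p);
 *
 * masks the address down to a word boundary and selects element
 * ((0x1008 >> 2) & 3) == 2 -- the element a full quadword store of v
 * would have placed at that address, as VMX vec_ste requires.  vec_stl's
 * LRU cache hint has no analogue in the SPU local store, so it simply
 * aliases vec_st.
 */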

/* vec_sub (vector subtract)
 * =======
 */
static inline vec_uchar16 vec_sub(vec_uchar16 a, vec_uchar16 b)
{
  return ((vec_uchar16)(spu_sel(spu_sub((vec_ushort8)(a), (vec_ushort8)(b)),
                                spu_sub(spu_and((vec_ushort8)(a), -256), spu_and((vec_ushort8)(b), -256)),
                                spu_splats((unsigned short)0xFF00))));
}

static inline vec_char16 vec_sub(vec_char16 a, vec_char16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_char16 vec_sub(vec_bchar16 a, vec_char16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_char16 vec_sub(vec_char16 a, vec_bchar16 b)
{
  return ((vec_char16)(vec_sub((vec_uchar16)(a), (vec_uchar16)(b))));
}

static inline vec_ushort8 vec_sub(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_sub(a, b));
}

static inline vec_short8 vec_sub(vec_short8 a, vec_short8 b)
{
  return (spu_sub(a, b));
}

static inline vec_short8 vec_sub(vec_bshort8 a, vec_short8 b)
{
  return (spu_sub((vec_short8)(a), b));
}

static inline vec_short8 vec_sub(vec_short8 a, vec_bshort8 b)
{
  return (spu_sub(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_sub(vec_uint4 a, vec_uint4 b)
{
  return (spu_sub(a, b));
}

static inline vec_int4 vec_sub(vec_int4 a, vec_int4 b)
{
  return (spu_sub(a, b));
}

static inline vec_int4 vec_sub(vec_bint4 a, vec_int4 b)
{
  return (spu_sub((vec_int4)(a), b));
}

static inline vec_int4 vec_sub(vec_int4 a, vec_bint4 b)
{
  return (spu_sub(a, (vec_int4)(b)));
}

static inline vec_float4 vec_sub(vec_float4 a, vec_float4 b)
{
  return (spu_sub(a, b));
}


/* vec_subc (vector subtract carryout)
 * ========
 */
#define vec_subc(_a, _b)        spu_genb(_a, _b)

/* vec_subs (vector subtract saturate)
 * ========
 */
static inline vec_uchar16 vec_subs(vec_uchar16 a, vec_uchar16 b)
{
  vec_ushort8 s1, s2;
  vec_uchar16 s, d;

  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){0, 16,  2, 18,  4, 20,  6, 22,
                                                        8, 24, 10, 26, 12, 28, 14, 30})));
  d  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
                                                        9, 25, 11, 27, 13, 29, 15, 31})));
  return (spu_andc(d, s));
}

static inline vec_char16 vec_subs(vec_char16 a, vec_char16 b)
{
  vec_ushort8 s1, s2;
  vec_uchar16 s, d;

  s1 = spu_sub(spu_rlmask((vec_ushort8)(a), -8), spu_rlmask((vec_ushort8)(b), -8));
  s2 = spu_sub(spu_and((vec_ushort8)(a), 0xFF), spu_and((vec_ushort8)(b), 0xFF));
  s  = (vec_uchar16)(spu_shuffle(s1, s2, ((vec_uchar16){1, 17,  3, 19,  5, 21,  7, 23,
                                                        9, 25, 11, 27, 13, 29, 15, 31})));
  d  = spu_sel(s, spu_splats((unsigned char)0x7F), spu_cmpgt(spu_nor((vec_uchar16)(a), spu_nand(s, (vec_uchar16)(b))), 0x7F));
  d  = spu_sel(d, spu_splats((unsigned char)0x80), spu_cmpgt(spu_and((vec_uchar16)(a), spu_nor(s, (vec_uchar16)(b))), 0x7F));

  return ((vec_char16)(d));
}

static inline vec_char16 vec_subs(vec_bchar16 a, vec_char16 b)
{
  return (vec_subs((vec_char16)(a), b));
}

static inline vec_char16 vec_subs(vec_char16 a, vec_bchar16 b)
{
  return (vec_subs(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_subs(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
}

static inline vec_short8 vec_subs(vec_short8 a, vec_short8 b)
{
  vec_short8 s;
  vec_short8 d;

  s = spu_sub(a, b);
  d = spu_sel(s, spu_splats((signed short)0x7FFF), (vec_ushort8)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -15)));
  d = spu_sel(d, spu_splats((signed short)0x8000), (vec_ushort8)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -15)));

  return (d);
}

static inline vec_short8 vec_subs(vec_bshort8 a, vec_short8 b)
{
  return ((vec_short8)(vec_subs((vec_short8)(a), b)));
}

static inline vec_short8 vec_subs(vec_short8 a, vec_bshort8 b)
{
  return ((vec_short8)(vec_subs(a, (vec_short8)(b))));
}

static inline vec_uint4 vec_subs(vec_uint4 a, vec_uint4 b)
{
  return (spu_andc(spu_sub(a, b), spu_cmpgt(b, a)));
}

static inline vec_int4 vec_subs(vec_int4 a, vec_int4 b)
{
  vec_int4 s;
  vec_int4 d;

  s = spu_sub(a, b);
  d = spu_sel(s, spu_splats((signed int)0x7FFFFFFF), (vec_uint4)(spu_rlmaska(spu_nor(a, spu_nand(s, b)), -31)));
  d = spu_sel(d, spu_splats((signed int)0x80000000), (vec_uint4)(spu_rlmaska(spu_and(a, spu_nor(s, b)), -31)));

  return (d);
}

static inline vec_int4 vec_subs(vec_bint4 a, vec_int4 b)
{
  return ((vec_int4)(vec_subs((vec_int4)(a), b)));
}

static inline vec_int4 vec_subs(vec_int4 a, vec_bint4 b)
{
  return ((vec_int4)(vec_subs(a, (vec_int4)(b))));
}
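
/* Saturation logic, in brief: for the signed cases the overflow tests
 * work purely on the sign bits of s = a - b.  nor(a, nand(s, b)) has its
 * sign bit set only when a >= 0, b < 0 and s < 0 (positive overflow,
 * clamp to MAX), while and(a, nor(s, b)) is negative only when a < 0,
 * b >= 0 and s >= 0 (negative overflow, clamp to MIN).  The unsigned
 * cases simply clear the difference wherever b > a would have borrowed.
 */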

/* vec_sum4s (vector sum across partial (1/4) saturated)
 * =========
 */
static inline vec_uint4 vec_sum4s(vec_uchar16 a, vec_uint4 b)
{
  vec_uint4 a01_23, a0123;

  a01_23 = (vec_uint4)(spu_add(spu_rlmask((vec_ushort8)(a), -8),
                               spu_and((vec_ushort8)(a), 0xFF)));
  a0123 = spu_add(spu_rlmask(a01_23, -16), spu_and(a01_23, 0x1FF));
  return (vec_adds(a0123, b));
}

static inline vec_int4 vec_sum4s(vec_char16 a, vec_int4 b)
{
  vec_int4 a01_23, a0123;

  a01_23 = (vec_int4)(spu_add(spu_rlmaska((vec_short8)(a), -8),
                              spu_extend(a)));
  a0123 = spu_add(spu_rlmaska(a01_23, -16), spu_extend((vec_short8)(a01_23)));
  return (vec_adds(a0123, b));
}

static inline vec_int4 vec_sum4s(vec_short8 a, vec_int4 b)
{
  vec_int4 a0123;

  a0123 = spu_add(spu_rlmaska((vec_int4)(a), -16), spu_extend(a));
  return (vec_adds(a0123, b));
}


/* vec_sum2s (vector sum across partial (1/2) saturated)
 * =========
 */
static inline vec_int4 vec_sum2s(vec_int4 a, vec_int4 b)
{
  vec_int4 c, d;
  vec_int4 sign1, sign2, sign3;
  vec_int4 carry, sum_l, sum_h, sat, sat_val;

  sign1 = spu_rlmaska(a, -31);
  sign2 = spu_rlmaska(b, -31);

  c = spu_rlqwbyte(a, -4);
  sign3 = spu_rlqwbyte(sign1, -4);

  carry = spu_genc(a, b);
  sum_l = spu_add(a, b);
  sum_h = spu_addx(sign1, sign2, carry);

  carry = spu_genc(sum_l, c);
  sum_l = spu_add(sum_l, c);
  sum_h = spu_addx(sum_h, sign3, carry);

  sign1 = spu_rlmaska(sum_l, -31);
  sign2 = spu_rlmaska(sum_h, -31);

  sat_val = spu_xor(sign2, spu_splats((signed int)0x7FFFFFFF));

  sat = spu_orc(spu_xor(sign1, sign2), (vec_int4)spu_cmpeq(sum_h, sign2));

  d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), (vec_int4){0, -1, 0, -1});

  return (d);
}
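
/* The sums here are effectively 64-bit: each 32-bit addend is widened to
 * a (sign word, low word) pair, spu_genc/spu_addx propagate the carries,
 * and the result saturates whenever the high word is not simply the sign
 * extension of the low word.  The final mask keeps only the odd word
 * slots, matching the VMX vsum2sws result layout.
 */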

/* vec_sums (vector sum saturated)
 * ========
 */
static inline vec_int4 vec_sums(vec_int4 a, vec_int4 b)
{
  vec_int4 a0, a1, a2, c0, c1, c2, d;
  vec_int4 sign_a, sign_b, sign_l, sign_h;
  vec_int4 sum_l, sum_h, sat, sat_val;

  sign_a = spu_rlmaska(a, -31);
  sign_b = spu_rlmaska(b, -31);

  a0 = spu_rlqwbyte(a, -12);
  a1 = spu_rlqwbyte(a, -8);
  a2 = spu_rlqwbyte(a, -4);

  sum_l = spu_add(a, b);
  sum_h = spu_addx(sign_a, sign_b, spu_genc(a, b));

  c2 = spu_genc(sum_l, a2);
  sum_l = spu_add(sum_l, a2);
  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -4), c2);

  c1 = spu_genc(sum_l, a1);
  sum_l = spu_add(sum_l, a1);
  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -8), c1);

  c0 = spu_genc(sum_l, a0);
  sum_l = spu_add(sum_l, a0);
  sum_h = spu_addx(sum_h, spu_rlqwbyte(sign_a, -12), c0);

  sign_l = spu_rlmaska(sum_l, -31);
  sign_h = spu_rlmaska(sum_h, -31);

  sat_val = spu_xor(sign_h, spu_splats((signed int)0x7FFFFFFF));

  sat = spu_orc(spu_xor(sign_l, sign_h), (vec_int4)spu_cmpeq(sum_h, sign_h));

  d = spu_and(spu_sel(sum_l, sat_val, (vec_uint4)(sat)), ((vec_int4){0, 0, 0, -1}));

  return (d);
}


/* vec_trunc (vector truncate)
 * =========
 */
static inline vec_float4 vec_trunc(vec_float4 a)
{
  vec_int4 exp;
  vec_uint4 mask;

  exp = spu_sub(127, (vec_int4)(spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF)));
  mask = spu_rlmask(spu_splats((unsigned int)0x7FFFFF), exp);
  mask = spu_sel(spu_splats((unsigned int)0), mask, spu_cmpgt(exp, -31));
  mask = spu_or(mask, spu_xor((vec_uint4)(spu_rlmaska(spu_add(exp, -1), -31)), -1));
  return (spu_andc(a, (vec_float4)(mask)));
}
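
/* vec_trunc clears exactly the fraction bits that sit below the binary
 * point for each element's exponent, which rounds toward zero; elements
 * with magnitude below 1.0 get an all-ones mask and collapse to (signed)
 * zero.  E.g. { 2.75f, -1.5f, 0.9f, 8.0f } -> { 2.0f, -1.0f, 0.0f, 8.0f }.
 */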

/* vec_unpackh (vector unpack high element)
 * ===========
 */
static inline vec_short8 vec_unpackh(vec_char16 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 1, 1, 2, 2, 3, 3,
                                                      4, 4, 5, 5, 6, 6, 7, 7}))));
}

static inline vec_bshort8 vec_unpackh(vec_bchar16 a)
{
  return ((vec_bshort8)(vec_unpackh((vec_char16)(a))));
}

static inline vec_int4 vec_unpackh(vec_short8 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0, 0, 1, 0, 0, 2, 3,
                                                      0, 0, 4, 5, 0, 0, 6, 7}))));
}

#ifdef SUPPORT_UNPACK_PIXEL
/* Due to type conflicts, unpacking of pixel types and boolean shorts
 * cannot simultaneously be supported.  By default, the boolean short is
 * supported.
 */
static inline vec_uint4 vec_unpackh(vec_pixel8 a)
{
  vec_ushort8 p1, p2;

  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
                   spu_and((vec_ushort8)(a.p), 0x1F),
                   ((vec_uchar16){ 0, 128, 128, 17,  2, 128, 128, 19,
                                   4, 128, 128, 21,  6, 128, 128, 23}));
  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
                   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
                   ((vec_uchar16){ 128, 17,  1, 128, 128, 19,  3, 128,
                                   128, 21,  5, 128, 128, 23,  7, 128}));
  return ((vec_uint4)(spu_or(p1, p2)));
}

#else

static inline vec_bint4 vec_unpackh(vec_bshort8 a)
{
  return ((vec_bint4)(vec_unpackh((vec_short8)(a))));
}
#endif


/* vec_unpackl (vector unpack low element)
 * ===========
 */
static inline vec_short8 vec_unpackl(vec_char16 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){ 8,  8,  9,  9, 10, 10, 11, 11,
                                                      12, 12, 13, 13, 14, 14, 15, 15}))));
}

static inline vec_bshort8 vec_unpackl(vec_bchar16 a)
{
  return ((vec_bshort8)(vec_unpackl((vec_char16)(a))));
}

static inline vec_int4 vec_unpackl(vec_short8 a)
{
  return (spu_extend(spu_shuffle(a, a, ((vec_uchar16){0, 0,  8,  9, 0, 0, 10, 11,
                                                      0, 0, 12, 13, 0, 0, 14, 15}))));
}

#ifdef SUPPORT_UNPACK_PIXEL
/* Due to type conflicts, unpacking of pixel types and boolean shorts
 * cannot simultaneously be supported.  By default, the boolean short is
 * supported.
 */
static inline vec_uint4 vec_unpackl(vec_pixel8 a)
{
  vec_ushort8 p1, p2;

  /* As in vec_unpackh above, the pixel payload is reached through a.p;
   * the vec_pixel8 struct itself cannot be cast to a vector type.  */
  p1 = spu_shuffle((vec_ushort8)(spu_rlmaska((vec_short8)(a.p), -7)),
                   spu_and((vec_ushort8)(a.p), 0x1F),
                   ((vec_uchar16){ 8, 128, 128, 25, 10, 128, 128, 27,
                                  12, 128, 128, 29, 14, 128, 128, 31}));
  p2 = spu_shuffle(spu_and(spu_rlmask((vec_ushort8)(a.p), -5), 0x1F),
                   spu_and(spu_rlmask((vec_ushort8)(a.p), -10), 0x1F),
                   ((vec_uchar16){ 128, 25,  9, 128, 128, 27, 11, 128,
                                   128, 29, 13, 128, 128, 31, 15, 128}));
  return ((vec_uint4)(spu_or(p1, p2)));
}

#else

static inline vec_bint4 vec_unpackl(vec_bshort8 a)
{
  return ((vec_bint4)(vec_unpackl((vec_short8)(a))));
}
#endif


/* vec_xor (vector logical xor)
 * =======
 */
static inline vec_uchar16 vec_xor(vec_uchar16 a, vec_uchar16 b)
{
  return (spu_xor(a, b));
}

static inline vec_char16 vec_xor(vec_char16 a, vec_char16 b)
{
  return (spu_xor(a, b));
}

static inline vec_char16 vec_xor(vec_bchar16 a, vec_char16 b)
{
  return (spu_xor((vec_char16)(a), b));
}

static inline vec_char16 vec_xor(vec_char16 a, vec_bchar16 b)
{
  return (spu_xor(a, (vec_char16)(b)));
}

static inline vec_ushort8 vec_xor(vec_ushort8 a, vec_ushort8 b)
{
  return (spu_xor(a, b));
}

static inline vec_short8 vec_xor(vec_short8 a, vec_short8 b)
{
  return (spu_xor(a, b));
}

static inline vec_short8 vec_xor(vec_bshort8 a, vec_short8 b)
{
  return (spu_xor((vec_short8)(a), b));
}

static inline vec_short8 vec_xor(vec_short8 a, vec_bshort8 b)
{
  return (spu_xor(a, (vec_short8)(b)));
}

static inline vec_uint4 vec_xor(vec_uint4 a, vec_uint4 b)
{
  return (spu_xor(a, b));
}

static inline vec_int4 vec_xor(vec_int4 a, vec_int4 b)
{
  return (spu_xor(a, b));
}

static inline vec_int4 vec_xor(vec_bint4 a, vec_int4 b)
{
  return (spu_xor((vec_int4)(a), b));
}

static inline vec_int4 vec_xor(vec_int4 a, vec_bint4 b)
{
  return (spu_xor(a, (vec_int4)(b)));
}

static inline vec_float4 vec_xor(vec_float4 a, vec_float4 b)
{
  return (spu_xor(a, b));
}

static inline vec_float4 vec_xor(vec_bint4 a, vec_float4 b)
{
  return (spu_xor((vec_float4)(a), b));
}

static inline vec_float4 vec_xor(vec_float4 a, vec_bint4 b)
{
  return (spu_xor(a, (vec_float4)(b)));
}

/************************************************************************
 *                            PREDICATES
 ************************************************************************/

/* vec_all_eq (all elements equal)
 * ==========
 */
static inline int vec_all_eq(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
}

static inline int vec_all_eq(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFFFF));
}

static inline int vec_all_eq(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0xFFFF));
}

static inline int vec_all_eq(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0xFFFF));
}

static inline int vec_all_eq(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
}

static inline int vec_all_eq(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xFF));
}

static inline int vec_all_eq(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0xFF));
}

static inline int vec_all_eq(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0xFF));
}

static inline int vec_all_eq(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
}

static inline int vec_all_eq(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
}

static inline int vec_all_eq(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0xF));
}

static inline int vec_all_eq(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0xF));
}

static inline int vec_all_eq(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0xF));
}
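
/* Predicate pattern: spu_gather packs the most-significant bit of every
 * element into the preferred word slot, so an all-true comparison reads
 * back as 0xFFFF for byte elements, 0xFF for halfwords and 0xF for
 * words.  The "all" predicates compare against those constants; the
 * "any" predicates merely test for a non-zero (or non-saturated) gather.
 */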

/* vec_all_ge (all elements greater than or equal)
 * ==========
 */
static inline int vec_all_ge(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
}

static inline int vec_all_ge(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
}

static inline int vec_all_ge(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0));
}

static inline int vec_all_ge(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0));
}

static inline int vec_all_ge(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
}

static inline int vec_all_ge(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
}

static inline int vec_all_ge(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0));
}

static inline int vec_all_ge(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0));
}

static inline int vec_all_ge(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
}

static inline int vec_all_ge(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
}

static inline int vec_all_ge(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0));
}

static inline int vec_all_ge(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0));
}

static inline int vec_all_ge(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
}

/* vec_all_gt (all elements greater than)
 * ==========
 */
static inline int vec_all_gt(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
}

static inline int vec_all_gt(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFFFF));
}

static inline int vec_all_gt(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0xFFFF));
}

static inline int vec_all_gt(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0xFFFF));
}

static inline int vec_all_gt(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
}

static inline int vec_all_gt(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xFF));
}

static inline int vec_all_gt(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0xFF));
}

static inline int vec_all_gt(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0xFF));
}

static inline int vec_all_gt(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
}

static inline int vec_all_gt(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
}

static inline int vec_all_gt(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0xF));
}

static inline int vec_all_gt(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0xF));
}

static inline int vec_all_gt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
}


/* vec_all_in (all elements in bounds)
 * ==========
 */
static inline int vec_all_in(vec_float4 a, vec_float4 b)
{
  return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) == 0xF);
}
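
/* vec_all_in reports whether every element of a lies in [-b, b]: the
 * spu_cmpabsgt term fails elements with |a| > b, and the sign-mask term
 * fails elements whose bound b is itself negative.
 */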

/* vec_all_le (all elements less than or equal)
 * ==========
 */
static inline int vec_all_le(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
}

static inline int vec_all_le(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
}

static inline int vec_all_le(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) == 0));
}

static inline int vec_all_le(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) == 0));
}

static inline int vec_all_le(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
}

static inline int vec_all_le(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
}

static inline int vec_all_le(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) == 0));
}

static inline int vec_all_le(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) == 0));
}

static inline int vec_all_le(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
}

static inline int vec_all_le(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
}

static inline int vec_all_le(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(a), b)), 0) == 0));
}

static inline int vec_all_le(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_int4)(b))), 0) == 0));
}

static inline int vec_all_le(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
}


/* vec_all_lt (all elements less than)
 * ==========
 */
static inline int vec_all_lt(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
}

static inline int vec_all_lt(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFFFF));
}

static inline int vec_all_lt(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) == 0xFFFF));
}

static inline int vec_all_lt(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) == 0xFFFF));
}

static inline int vec_all_lt(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
}

static inline int vec_all_lt(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xFF));
}

static inline int vec_all_lt(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) == 0xFF));
}

static inline int vec_all_lt(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) == 0xFF));
}

static inline int vec_all_lt(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
}

static inline int vec_all_lt(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
}

static inline int vec_all_lt(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) == 0xF));
}

static inline int vec_all_lt(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) == 0xF));
}

static inline int vec_all_lt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
}

/* vec_all_nan (all elements not a number)
 * ===========
 */
static inline int vec_all_nan(vec_float4 a)
{
  vec_uint4 exp, man;
  vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);

  exp = spu_and((vec_uint4)(a), exp_mask);
  man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
  return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
                                                spu_cmpeq(man, 0))), 0) == 0xF));
}

#define vec_all_nan(_a)         (0)
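
/* The macro shadows the inline function above: SPU single-precision
 * arithmetic has no NaN representation, so "all elements NaN" can be
 * folded to the constant 0.  The function body remains to document what
 * the bit-pattern test would look like on an IEEE-style encoding.
 */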

/* vec_all_ne (all elements not equal)
 * ==========
 */
static inline int vec_all_ne(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
}

static inline int vec_all_ne(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
}

static inline int vec_all_ne(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) == 0));
}

static inline int vec_all_ne(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) == 0));
}

static inline int vec_all_ne(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
}

static inline int vec_all_ne(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
}

static inline int vec_all_ne(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) == 0));
}

static inline int vec_all_ne(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) == 0));
}

static inline int vec_all_ne(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
}

static inline int vec_all_ne(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
}

static inline int vec_all_ne(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) == 0));
}

static inline int vec_all_ne(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) == 0));
}

static inline int vec_all_ne(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) == 0));
}


/* vec_all_nge (all elements not greater than or equal)
 * ===========
 */
static inline int vec_all_nge(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0xF));
}


/* vec_all_ngt (all elements not greater than)
 * ===========
 */
static inline int vec_all_ngt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0));
}


/* vec_all_nle (all elements not less than or equal)
 * ===========
 */
static inline int vec_all_nle(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) == 0xF));
}


/* vec_all_nlt (all elements not less than)
 * ===========
 */
static inline int vec_all_nlt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) == 0));
}


/* vec_all_numeric (all elements numeric)
 * ===============
 */
static inline int vec_all_numeric(vec_float4 a)
{
  vec_uint4 exp;

  exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
  return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) == 0));
}
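
/* With no NaNs on the SPU, the "not" predicates lose their unordered
 * case and reduce to exact complements of the ordered tests:
 * vec_all_nge(a, b) is simply "every element of a is less than b", and
 * likewise for ngt/nle/nlt.  vec_all_numeric checks that no element has
 * the maximum exponent field (0xFF).
 */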

/* vec_any_eq (any elements equal)
 * ==========
 */
static inline int vec_any_eq(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
}

static inline int vec_any_eq(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
}

static inline int vec_any_eq(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0));
}

static inline int vec_any_eq(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0));
}

static inline int vec_any_eq(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
}

static inline int vec_any_eq(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0));
}

static inline int vec_any_eq(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0));
}

static inline int vec_any_eq(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0));
}

static inline int vec_any_eq(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
}

static inline int vec_any_eq(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
}

static inline int vec_any_eq(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq((vec_int4)(a), b), -31)), 0)));
}

static inline int vec_any_eq(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, (vec_int4)(b)), -31)), 0)));
}

static inline int vec_any_eq(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpeq(a, b), -31)), 0)));
}


/* vec_any_ge (any elements greater than or equal)
 * ==========
 */
static inline int vec_any_ge(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
}

static inline int vec_any_ge(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFFFF));
}

static inline int vec_any_ge(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0xFFFF));
}

static inline int vec_any_ge(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0xFFFF));
}

static inline int vec_any_ge(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
}

static inline int vec_any_ge(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xFF));
}

static inline int vec_any_ge(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0xFF));
}

static inline int vec_any_ge(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0xFF));
}

static inline int vec_any_ge(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}

static inline int vec_any_ge(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}

static inline int vec_any_ge(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_int4)(a))), 0) != 0xF));
}

static inline int vec_any_ge(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_int4)(b), a)), 0) != 0xF));
}

static inline int vec_any_ge(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}

/* vec_any_gt (any elements greater than)
 * ==========
 */
static inline int vec_any_gt(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}

static inline int vec_any_gt(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}

static inline int vec_any_gt(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(a), b)), 0) != 0));
}

static inline int vec_any_gt(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_char16)(b))), 0) != 0));
}

static inline int vec_any_gt(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}

static inline int vec_any_gt(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}

static inline int vec_any_gt(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(a), b)), 0) != 0));
}

static inline int vec_any_gt(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, (vec_short8)(b))), 0) != 0));
}

static inline int vec_any_gt(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
}

static inline int vec_any_gt(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
}

static inline int vec_any_gt(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(a), b), -31)), 0)));
}

static inline int vec_any_gt(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, (vec_int4)(b)), -31)), 0)));
}

static inline int vec_any_gt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(a, b), -31)), 0)));
}
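
/* For word elements the "any" tests avoid the gather: spu_orx ORs the
 * four comparison results into the preferred slot, so a single non-zero
 * word means some element compared true.  The logical right shift by 31
 * first reduces each all-ones result to 1, giving a clean 0/1 answer.
 */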

/* vec_any_lt (any elements less than)
 * ==========
 */
static inline int vec_any_lt(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_char16)(a))), 0) != 0));
}

static inline int vec_any_lt(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_char16)(b), a)), 0) != 0));
}

static inline int vec_any_lt(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0));
}

static inline int vec_any_lt(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, (vec_short8)(a))), 0) != 0));
}

static inline int vec_any_lt(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt((vec_short8)(b), a)), 0) != 0));
}

static inline int vec_any_lt(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

static inline int vec_any_lt(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

static inline int vec_any_lt(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, (vec_int4)(a)), -31)), 0)));
}

static inline int vec_any_lt(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt((vec_int4)(b), a), -31)), 0)));
}

static inline int vec_any_lt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}
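
/* Example (illustrative): a hypothetical threshold test with
 * vec_any_lt over single-precision data.
 */
static inline int example_any_below(vec_float4 v, float threshold)
{
  /* Nonzero when any element of v is < threshold. */
  return vec_any_lt(v, spu_splats(threshold));
}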

/* vec_any_nan (any elements not a number)
 * ===========
 */
static inline int vec_any_nan(vec_float4 a)
{
  vec_uint4 exp, man;
  vec_uint4 exp_mask = spu_splats((unsigned int)0x7F800000);

  exp = spu_and((vec_uint4)(a), exp_mask);
  man = spu_and((vec_uint4)(a), spu_splats((unsigned int)0x007FFFFF));
  return ((int)(spu_extract(spu_gather(spu_andc(spu_cmpeq(exp, exp_mask),
                                                spu_cmpeq(man, 0))), 0) != 0));
}


/* vec_any_ne (any elements not equal)
 * ==========
 */
static inline int vec_any_ne(vec_uchar16 a, vec_uchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_char16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_bchar16 a, vec_char16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_char16)(a), b)), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_char16 a, vec_bchar16 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_char16)(b))), 0) != 0xFFFF));
}

static inline int vec_any_ne(vec_ushort8 a, vec_ushort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_short8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_bshort8 a, vec_short8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_short8)(a), b)), 0) != 0xFF));
}

static inline int vec_any_ne(vec_short8 a, vec_bshort8 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_short8)(b))), 0) != 0xFF));
}

static inline int vec_any_ne(vec_uint4 a, vec_uint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_int4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}
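
/* Example (illustrative): vec_any_nan decodes IEEE 754 single
 * precision directly -- an element is a NaN when its exponent field is
 * all ones and its mantissa is nonzero.  SPU single-precision
 * arithmetic does not itself produce NaNs, but bit patterns imported
 * from elsewhere may contain them.  A hypothetical validity check:
 */
static inline int example_input_valid(vec_float4 v, vec_float4 expected)
{
  /* Nonzero when v is NaN-free and matches expected in every element. */
  return !(vec_any_nan(v) || vec_any_ne(v, expected));
}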

static inline int vec_any_ne(vec_bint4 a, vec_int4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq((vec_int4)(a), b)), 0) != 0xF));
}

static inline int vec_any_ne(vec_int4 a, vec_bint4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, (vec_int4)(b))), 0) != 0xF));
}

static inline int vec_any_ne(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpeq(a, b)), 0) != 0xF));
}


/* vec_any_nge (any elements not greater than or equal)
 * ===========
 */
static inline int vec_any_nge(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_orx(spu_rlmask(spu_cmpgt(b, a), -31)), 0)));
}

/* vec_any_ngt (any elements not greater than)
 * ===========
 */
static inline int vec_any_ngt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0xF));
}


/* vec_any_nle (any elements not less than or equal)
 * ===========
 */
static inline int vec_any_nle(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(a, b)), 0) != 0));
}


/* vec_any_nlt (any elements not less than)
 * ===========
 */
static inline int vec_any_nlt(vec_float4 a, vec_float4 b)
{
  return ((int)(spu_extract(spu_gather(spu_cmpgt(b, a)), 0) != 0xF));
}


/* vec_any_numeric (any elements numeric)
 * ===============
 */
static inline int vec_any_numeric(vec_float4 a)
{
  vec_uint4 exp;

  exp = spu_and(spu_rlmask((vec_uint4)(a), -23), 0xFF);
  return ((int)(spu_extract(spu_gather(spu_cmpeq(exp, 255)), 0) != 0xF));
}


/* vec_any_out (any elements out of bounds)
 * ===========
 */
static inline int vec_any_out(vec_float4 a, vec_float4 b)
{
  return (spu_extract(spu_gather(spu_nor(spu_cmpabsgt(a, b), (vec_uint4)(spu_rlmaska((vec_int4)(b), -31)))), 0) != 0xF);
}


/* CBE Language Extension Intrinsics
 */

/* vec_extract (extract element from vector)
 * ===========
 */
#define vec_extract(_a, _element)	spu_extract(_a, _element)


/* vec_insert (insert scalar into specified vector element)
 * ==========
 */
#define vec_insert(_a, _b, _element)	spu_insert(_a, _b, _element)
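
/* Example (illustrative): the CBE scalar/vector conversions compose
 * naturally.  This hypothetical helper pulls two elements out of one
 * vector and writes their sum into element 2 of another.
 */
static inline vec_int4 example_scalar_mix(vec_int4 v, vec_int4 w)
{
  int sum = vec_extract(w, 0) + vec_extract(w, 1);  /* vector -> scalar */
  return vec_insert(sum, v, 2);                     /* scalar -> vector */
}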

/* vec_lvlx (load vector left indexed)
 * ========
 */
static inline vec_uchar16 vec_lvlx(int a, unsigned char *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uchar16 vec_lvlx(int a, vec_uchar16 *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_char16 vec_lvlx(int a, signed char *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_char16 vec_lvlx(int a, vec_char16 *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_ushort8 vec_lvlx(int a, unsigned short *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_ushort8 vec_lvlx(int a, vec_ushort8 *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_short8 vec_lvlx(int a, signed short *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_short8 vec_lvlx(int a, vec_short8 *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uint4 vec_lvlx(int a, unsigned int *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_uint4 vec_lvlx(int a, vec_uint4 *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_int4 vec_lvlx(int a, signed int *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_int4 vec_lvlx(int a, vec_int4 *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_float4 vec_lvlx(int a, float *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}

static inline vec_float4 vec_lvlx(int a, vec_float4 *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_slqwbyte(*p, (unsigned int)p & 0xF));
}


/* vec_lvlxl (load vector left indexed last)
 * =========
 */
#define vec_lvlxl(_a, _b)	vec_lvlx(_a, _b)
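
/* Note: on VMX the trailing "l" (last) forms carry an LRU cache hint.
 * The SPU local store has no cache to hint, so vec_lvlxl (and the
 * vec_lvrxl/vec_stvlxl/vec_stvrxl forms below) simply alias their
 * plain counterparts.
 */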

/* vec_lvrx (load vector right indexed)
 * ========
 */
static inline vec_uchar16 vec_lvrx(int a, unsigned char *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uchar16 vec_lvrx(int a, vec_uchar16 *b)
{
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_char16 vec_lvrx(int a, signed char *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_char16 vec_lvrx(int a, vec_char16 *b)
{
  vec_char16 *p = (vec_char16 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_ushort8 vec_lvrx(int a, unsigned short *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_ushort8 vec_lvrx(int a, vec_ushort8 *b)
{
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_short8 vec_lvrx(int a, signed short *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_short8 vec_lvrx(int a, vec_short8 *b)
{
  vec_short8 *p = (vec_short8 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uint4 vec_lvrx(int a, unsigned int *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_uint4 vec_lvrx(int a, vec_uint4 *b)
{
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_int4 vec_lvrx(int a, signed int *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_int4 vec_lvrx(int a, vec_int4 *b)
{
  vec_int4 *p = (vec_int4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_float4 vec_lvrx(int a, float *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}

static inline vec_float4 vec_lvrx(int a, vec_float4 *b)
{
  vec_float4 *p = (vec_float4 *)((unsigned char *)(b) + a);
  return(spu_rlmaskqwbyte(*p, ((int)p & 0xF)-16));
}


/* vec_lvrxl (load vector right indexed last)
 * =========
 */
#define vec_lvrxl(_a, _b)	vec_lvrx(_a, _b)
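
/* Example (illustrative): the classic unaligned-load idiom.  vec_lvlx
 * fetches the quadword containing the address, shifted left so the
 * addressed byte lands in byte 0 (zero fill on the right); vec_lvrx of
 * the address plus 16 supplies the remaining bytes (zero fill on the
 * left).  ORing the two yields the 16 bytes starting at an arbitrary
 * address.  A hypothetical helper; note that it touches the quadword
 * after the data, which is harmless in local store.
 */
static inline vec_float4 example_load_unaligned(float *p)
{
  return spu_or(vec_lvlx(0, p), vec_lvrx(16, p));
}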

/* vec_promote (promote scalar to a vector)
 * ===========
 */
#define vec_promote(_a, _element)	spu_promote(_a, _element)


/* vec_splats (splat scalar to a vector)
 * ==========
 */
#define vec_splats(_a)	spu_splats(_a)
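
/* Example (illustrative): vec_promote/vec_extract move a scalar into
 * and out of the preferred slot, while vec_splats replicates it across
 * all elements.  A hypothetical vector-by-scalar multiply:
 */
static inline vec_float4 example_scale(vec_float4 v, float s)
{
  return spu_mul(v, vec_splats(s));
}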

/* vec_stvlx (store vector left indexed)
 * =========
 */
static inline void vec_stvlx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvlx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = -((int)p & 0xF);
  *p = spu_sel(*p,
               spu_rlmaskqwbyte(a, shift),
               spu_rlmaskqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvlxl (store vector left indexed last)
 * ==========
 */
#define vec_stvlxl(_a, _b, _c)	vec_stvlx(_a, _b, _c)
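
/* Note: unlike the VMX instructions, which store only the selected
 * bytes, these mappings implement the partial store as a
 * read-modify-write of the whole enclosing quadword via spu_sel, so
 * the bytes that are nominally untouched are still rewritten.
 */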

/* vec_stvrx (store vector right indexed)
 * =========
 */
static inline void vec_stvrx(vec_uchar16 a, int b, unsigned char *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_uchar16 a, int b, vec_uchar16 *c)
{
  int shift;
  vec_uchar16 *p = (vec_uchar16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, signed char *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_char16 a, int b, vec_char16 *c)
{
  int shift;
  vec_char16 *p = (vec_char16 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned char)0xFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, unsigned short *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_ushort8 a, int b, vec_ushort8 *c)
{
  int shift;
  vec_ushort8 *p = (vec_ushort8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, signed short *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_short8 a, int b, vec_short8 *c)
{
  int shift;
  vec_short8 *p = (vec_short8 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned short)0xFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, unsigned int *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_uint4 a, int b, vec_uint4 *c)
{
  int shift;
  vec_uint4 *p = (vec_uint4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, signed int *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_int4 a, int b, vec_int4 *c)
{
  int shift;
  vec_int4 *p = (vec_int4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, float *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

static inline void vec_stvrx(vec_float4 a, int b, vec_float4 *c)
{
  int shift;
  vec_float4 *p = (vec_float4 *)((unsigned char *)(c) + b);

  shift = 16-((int)p & 0xF);
  *p = spu_sel(*p,
               spu_slqwbyte(a, shift),
               spu_slqwbyte(spu_splats((unsigned int)0xFFFFFFFF), shift));
}

/* vec_stvrxl (store vector right indexed last)
 * ==========
 */
#define vec_stvrxl(_a, _b, _c)	vec_stvrx(_a, _b, _c)
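
/* Example (illustrative): the store counterpart of the unaligned-load
 * idiom.  vec_stvlx writes the leading bytes of the vector into the
 * quadword containing the address, and vec_stvrx of the address plus
 * 16 writes the trailing bytes into the next quadword; together they
 * store 16 bytes to an arbitrary address.  A hypothetical helper:
 */
static inline void example_store_unaligned(vec_float4 v, float *p)
{
  vec_stvlx(v, 0, p);
  vec_stvrx(v, 16, p);
}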

#endif /* __SPU__ */
#endif /* __cplusplus */
#endif /* !_VMX2SPU_H_ */