/* Copyright (C) 2002-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
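/* A minimal sketch of the rewrite recommended above, assuming only GNU C
   vector-extension syntax: sixteen byte additions expressed as a generic
   vector operation instead of MMX intrinsics.  The type and helper names
   are illustrative only and not part of this API.  */
typedef signed char __mmintrin_example_v16qi
  __attribute__ ((__vector_size__ (16)));

extern __inline __mmintrin_example_v16qi __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__mmintrin_example_add_bytes (__mmintrin_example_v16qi __a,
			      __mmintrin_example_v16qi __b)
{
  /* Element-wise addition; on VMX-capable targets GCC typically emits
     a single vaddubm for this.  */
  return __a + __b;
}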
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8),
			__may_alias__)) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
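/* Usage sketch (illustrative only, not part of the API): _mm_cvtsi32_si64
   zero-extends, so the upper word of the result is zero even for a
   negative input, and _mm_cvtsi64_si32 recovers the original low 32 bits.
   The helper name is hypothetical.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__mmintrin_example_cvt_roundtrip (void)
{
  __m64 __v = _mm_cvtsi32_si64 (-1);	/* __v == 0x00000000ffffffffULL.  */
  return _mm_cvtsi64_si32 (__v);	/* Returns -1.  */
}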
/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}
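/* Worked example (illustrative only): signed-saturating pack clamps each
   16-bit element to the signed 8-bit range, so 300 becomes 127 and -300
   becomes -128.  The operand fields (low to high) are 1, -1, 300, -300;
   the helper name is hypothetical.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__mmintrin_example_packs (void)
{
  __m64 __a = (__m64) 0xfed4012cffff0001ULL;
  /* Result bytes (low to high): 1, -1, 127, -128, then the same four
     again from the second operand: 0x807fff01807fff01.  */
  return _mm_packs_pi16 (__a, __a);
}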
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (__vm1, __zero);
  __r = vec_packs ((__vector unsigned short) __vm1,
		   (__vector unsigned short) __vm1);
  __vector __bool char __packsel = vec_pack (__select, __select);
  __r = vec_sel (__r, (const __vector unsigned char) __zero, __packsel);
  return (__m64) ((__vector long long) __r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}
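/* Worked example (illustrative only): interleaving the high halves of
   0x0706050403020100 and 0xf7f6f5f4f3f2f1f0 yields bytes
   04 f4 05 f5 06 f6 07 f7 (low to high).  The helper name is
   hypothetical.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__mmintrin_example_unpackhi (void)
{
  return _mm_unpackhi_pi8 ((__m64) 0x0706050403020100ULL,
			   (__m64) 0xf7f6f5f4f3f2f1f0ULL);
  /* Result: 0xf707f606f505f404.  */
}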
/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}
/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}
/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}
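/* Usage sketch (illustrative only): because __m64 is a plain 64-bit
   scalar on this target, the _si64 operations compile to single integer
   instructions, e.g. a doubleword shift here.  The helper name is
   hypothetical.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__mmintrin_example_shift (__m64 __m)
{
  return _mm_slli_si64 (__m, 8);	/* Same as __m << 8.  */
}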
/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (__res)
      : "r" (__m1),
	"r" (__m2)
      : );
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = (__vector signed char)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}
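/* Worked example (illustrative only): each result byte is 0xff where the
   operand bytes match and 0x00 where they differ; the operands below
   differ only in the lowest byte.  The helper name is hypothetical.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__mmintrin_example_cmpeq (void)
{
  return _mm_cmpeq_pi8 ((__m64) 0x1122334455667700ULL,
			(__m64) 0x11223344556677ffULL);
  /* Result: 0xffffffffffffff00.  */
}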
/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}
/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
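/* Worked example (illustrative only): signed saturating addition clamps
   instead of wrapping, so 0x70 + 0x70 (112 + 112) yields 0x7f (127) in
   every byte rather than the wrapped 0xe0.  The helper name is
   hypothetical.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__mmintrin_example_adds (void)
{
  return _mm_adds_pi8 ((__m64) 0x7070707070707070ULL,
		       (__m64) 0x7070707070707070ULL);
  /* Result: 0x7f7f7f7f7f7f7f7f.  */
}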
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_vmsumshm (__a, __b, __zero);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
    };

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);

  __w0 = vec_vmulesh (__a, __b);
  __w1 = vec_vmulosh (__a, __b);
  __c = (__vector signed short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = __a * __b;
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
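/* Worked example (illustrative only): with M1 = (1, 2, 3, 4) and
   M2 = (10, 20, 30, 40) as 16-bit fields (low to high), the pairwise
   multiply-add gives 1*10 + 2*20 = 50 in the low 32-bit field and
   3*30 + 4*40 = 250 in the high one.  The helper name is hypothetical.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__mmintrin_example_madd (void)
{
  __m64 __a = (__m64) 0x0004000300020001ULL;	/* 1, 2, 3, 4.  */
  __m64 __b = (__m64) 0x0028001e0014000aULL;	/* 10, 20, 30, 40.  */
  return _mm_madd_pi16 (__a, __b);	/* 0x000000fa00000032.  */
}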
/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sl (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sra (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector unsigned short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sr (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}
/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}
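/* Usage sketch (illustrative only): _mm_set_pi16 takes arguments from
   most to least significant element, while _mm_setr_pi16 takes them in
   reversed (memory) order, so the two calls below build the same value.
   The helper name is hypothetical.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__mmintrin_example_set_order (void)
{
  __m64 __a = _mm_set_pi16 (4, 3, 2, 1);
  __m64 __b = _mm_setr_pi16 (1, 2, 3, 4);
  return __a == __b;	/* 1: both are 0x0004000300020001.  */
}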
/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short __v;

  __v = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) __v)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) __res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */