/* Copyright (C) 2003-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif
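
/* Illustrative sketch only, not part of this header's interface: the
   "GNU C extensions" mentioned above are the generic vector extensions,
   for example

     typedef signed char __sketch_v16qi
       __attribute__ ((__vector_size__ (16)));

     __sketch_v16qi
     __sketch_add (__sketch_v16qi __a, __sketch_v16qi __b)
     {
       return __a + __b;
     }

   The __sketch_* names are placeholders.  Element-wise operators on
   such types typically compile to native vector instructions (VSX,
   SSE/AVX, NEON, ...) with no target-specific intrinsics.  */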

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }
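
  /* __count was not a compile-time constant below 16; fall back to
     run-time handling: 0 returns __B unchanged, 32 or more gives zero,
     16..31 reduces to a byte shift of __A alone, and 1..15 ORs the
     shifted halves of __A and __B together.  */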
  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu __zero = { 0 };
          return (__m128i) __zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}
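
/* Horizontal add of adjacent signed halfword pairs with saturation:
   vec_sum4s adds each pair into a 32-bit sum, and vec_packs narrows
   the sums back to halfwords with signed saturation.  */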
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}
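
/* Byte shuffle (x86 pshufb semantics): each byte of __B selects a byte
   of __A by its low four bits, except that a byte of __B with its high
   bit set yields zero in the result.  */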
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif
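
/* Multiply-add (x86 pmaddubsw semantics): multiply unsigned bytes of
   __A by the corresponding signed bytes of __B, then add adjacent
   pairs of the 16-bit products with signed saturation.  */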
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}
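
/* Rounding multiply high (x86 pmulhrsw semantics): form the 32-bit
   product of each pair of signed halfwords, shift it right by 14, add
   1 and shift right once more, then keep the low 16 bits.  */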
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#endif