/* Copyright (C) 2003-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
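
/* For example, a pshufb-style byte reverse that would otherwise call
   _mm_shuffle_epi8 can be written with GNU C's __builtin_shuffle
   extension, letting GCC pick a good sequence on each target.  The
   type and function names in this sketch are illustrative only:

     typedef unsigned char v16qu __attribute__ ((vector_size (16)));

     static inline v16qu
     reverse_bytes (v16qu v)
     {
       const v16qu r = { 15, 14, 13, 12, 11, 10, 9, 8,
			 7, 6, 5, 4, 3, 2, 1, 0 };
       return __builtin_shuffle (v, r);
     }
*/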

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

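/* Each SSSE3 absolute-value operation maps directly onto the AltiVec/VSX
   vec_abs operation at the matching element width.  */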
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

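/* There is no 64-bit VMX unit, so the __m64 (_pi*) variants splat the
   operand into both doublewords of a 128-bit vector, run the full-width
   operation, and return doubleword 0 of the result.  The other _pi*
   functions below follow the same pattern.  */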
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

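/* _mm_alignr_epi8 concatenates __A:__B and extracts 16 bytes starting
   __count bytes into the pair.  When __count is a compile-time constant
   below 16, a single vec_sld suffices; on little endian the inputs and
   the result are byte-reversed because vec_sld shifts in big-endian
   element order.  Otherwise: a count of 0 returns __B, a count of 32 or
   more returns zero, counts 16-31 reduce to an octet shift of __A, and
   the remaining counts shift both inputs and OR the halves together.  */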
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
	{
	  const __v16qu __zero = { 0 };
	  return (__m128i) __zero;
	}
      else
	{
	  const __v16qu __shift =
	    vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
	  return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
	  return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
	}
    }
  else
    {
      const __v16qu __shiftA =
	vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

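/* The __m64 variant places __B:__A in a single vector register and
   performs one whole-register octet shift, returning doubleword 0.  */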
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

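/* The horizontal add/subtract operations below are built from two
   permutes: __P gathers the first element of each input pair and __Q
   gathers the second, after which an ordinary element-wise add or
   subtract produces the packed result.  */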
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

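/* For the saturating horizontal add of halfwords, vec_sum4s adds each
   pair of adjacent halfwords into a full 32-bit sum, so the intermediate
   sums cannot overflow, and vec_packs then saturates them back down to
   halfwords.  */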
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

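/* Horizontal subtract reuses the pair-gathering permutes shown above,
   with vec_sub (or vec_subs for the saturating forms) in place of
   vec_add.  */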
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

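/* pshufb semantics: each byte of __B selects a byte of __A, except that
   a selector byte with its high bit set produces zero.  vec_perm only
   decodes the low 5 bits of each selector byte, and since both of its
   data inputs are __A here, bit 4 is harmless; the zeroing of high-bit
   selectors is done separately with a vec_cmplt mask and vec_sel.  */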
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

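/* _mm_sign_* multiplies each element of __A by -1, 0, or +1 according
   to the sign of the corresponding element of __B: vec_cmplt yields -1
   where __B is negative, the negated vec_cmpgt yields +1 where __B is
   positive, and their sum is 0 where __B is zero.  These functions are
   only provided when compiling for Power8 or later (_ARCH_PWR8).  */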
#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}
#endif

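/* _mm_maddubs_epi16 multiplies unsigned bytes of __A by signed bytes of
   __B and adds adjacent products with signed saturation.  vec_unpackh/l
   sign-extend the bytes to halfwords; masking the __A halves with 0x00ff
   then recovers the unsigned byte values.  The __odds/__evens permutes
   (named for ordinal position) gather the first and second product of
   each pair for the final saturating vec_adds.  */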
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

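/* _mm_mulhrs_epi16 returns the high 16 bits of the rounded 32-bit
   products: each product is shifted right 14, incremented, and shifted
   right once more, which in the retained low 16 bits is equivalent to
   (__A * __B + 0x4000) >> 15.  */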
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

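/* The __m64 variant needs only four products, produced from one
   vec_unpackh of each splatted operand; __D is passed to vec_pack
   unmultiplied merely to fill the half of the result vector that the
   returned doubleword discards.  */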
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#endif