/* Copyright (C) 2003-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif
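
/* For example (illustrative only): an SSE2 expression such as
   _mm_add_epi32 (__a, __b) can often be rewritten with GNU C vector
   extensions, which GCC optimizes directly for both x86_64 and
   powerpc64le:

     typedef int __v4si_alt __attribute__ ((vector_size (16)));

     __v4si_alt
     add_epi32_alt (__v4si_alt __a, __v4si_alt __b)
     {
       return __a + __b;
     }

   The typedef and function names above are placeholders, not part of
   this API.  */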

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#include <altivec.h>
#include <assert.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

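/* Per-element absolute value (SSSE3 pabsb/pabsw/pabsd): each element of
   the result is the absolute value of the corresponding element of __A.
   The _pi variants operate on 64-bit MMX-style operands.  */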
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

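/* Byte alignment (SSSE3 palignr): concatenate the two operands, shift the
   combined value right by __count bytes, and return the low half.  Shift
   counts large enough to move past both operands yield zero.  */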
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu __zero = { 0 };
          return (__m128i) __zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

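/* Horizontal addition (SSSE3 phadd*): adjacent pairs of elements within
   each operand are summed; sums from __A fill the low half of the result
   and sums from __B fill the high half.  The _hadds forms saturate the
   signed 16-bit sums.  */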
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

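/* Horizontal subtraction (SSSE3 phsub*): within each adjacent pair the
   higher-indexed element is subtracted from the lower-indexed one;
   differences from __A fill the low half of the result and differences
   from __B fill the high half.  The _hsubs forms saturate the signed
   16-bit differences.  */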
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
  const __v16qu __Q =
    {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
  const __v16qu __Q =
    {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

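/* Byte shuffle (SSSE3 pshufb): each byte of the result is selected from
   __A by the low four bits of the corresponding control byte in __B, or
   forced to zero when that control byte has its most significant bit
   set.  */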
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

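/* Conditional sign (SSSE3 psign*): each element of __A is negated where
   the corresponding element of __B is negative, zeroed where it is zero,
   and copied unchanged where it is positive.  */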
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

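/* Multiply-and-add (SSSE3 pmaddubsw): multiply the unsigned bytes of __A
   by the corresponding signed bytes of __B, then add adjacent pairs of
   the 16-bit products with signed saturation.  */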
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds  =
    {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

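/* Rounded high multiply (SSSE3 pmulhrsw): multiply the signed 16-bit
   elements, shift each 32-bit product right by 14, add 1, shift right by
   1 again, and keep the low 16 bits of each result.  */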
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#endif