/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

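/* As a sketch of the transfer pattern described above (illustrative
   only, not part of the API; the function name is hypothetical): the
   64-bit value is splatted into a 128-bit vector register, the
   operation is performed there, and the 64-bit result is extracted
   again.

     static inline unsigned long long
     add_bytes_via_vector (unsigned long long a, unsigned long long b)
     {
       __vector signed char va = (__vector signed char) vec_splats (a);
       __vector signed char vb = (__vector signed char) vec_splats (b);
       __vector signed char vc = vec_add (va, vb);
       return __builtin_unpack_vector_int128 ((__vector __int128) vc, 0);
     }

   On power8 and later the splats and the extraction each map to direct
   register moves between the GPRs and the vector unit, which is why
   the _ARCH_PWR8 conditionals below gate the vector implementations.  */
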
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char[8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

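/* E.g.: _mm_cvtsi32_si64 (-1) yields 0x00000000ffffffffULL; the sign bit
   of the source is not propagated into the upper word.  */
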
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

#ifdef __powerpc64__
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshss (vm1, vm1);
  return (__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkswss (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector unsigned char vresult;

  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshus (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

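/* Illustration of the saturating pack (values assumed): with 16-bit lanes
   M1 = {1000, -1000, 200, -200}, _mm_packs_pi16 (M1, M1) clamps each lane
   to the signed 8-bit range [-128, 127], giving the byte lanes
   {127, -128, 127, -128, 127, -128, 127, -128}.  */
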
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 1));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

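/* Worked example (little-endian lane numbering assumed, as in the union
   above): with __m1 = 0x0706050403020100 and __m2 = 0x0f0e0d0c0b0a0908,
   _mm_unpacklo_pi8 gives 0x0b030a0209010800 and _mm_unpackhi_pi8 gives
   0x0f070e060d050c04.  */
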
/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

/* Add the 64-bit value in M1 to the 64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

/* Subtract the 64-bit value in M2 from the 64-bit value in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

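/* Note the operand order: the first operand is the one complemented, so
   e.g. _mm_andnot_si64 (0x00ffULL, 0x0fffULL) == 0x0f00ULL.  */
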
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#ifdef _ARCH_PWR6
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

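/* E.g. (values assumed): cmpb compares byte by byte, so
   _mm_cmpeq_pi8 (0x1122334455667788, 0x1100330055007700) yields
   0xff00ff00ff00ff00.  */
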
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

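/* E.g. (values assumed): with 16-bit lanes M1 = {1, 2, 3, 4} and
   M2 = {10, 20, 30, 40}, the intermediate products {10, 40, 90, 160}
   are summed by pairs to give the 32-bit results {50, 250}.  */
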
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

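/* E.g.: 0x4000 * 0x4000 = 0x10000000, whose high 16 bits are 0x1000, so
   lanes of 0x4000 in both operands produce lanes of 0x1000.  */
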
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

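/* E.g.: _mm_sll_pi16 (0x0001000200030004, 4) gives 0x0010002000300040;
   counts above 15 clear the result, matching the x86 behavior.  */
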
   1145  1.1  mrg extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1146  1.1  mrg _m_psllw (__m64 __m, __m64 __count)
   1147  1.1  mrg {
   1148  1.1  mrg   return _mm_sll_pi16 (__m, __count);
   1149  1.1  mrg }

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}
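
/* Usage sketch (illustrative).  Each 16-bit lane shifts independently,
   and, matching the x86 semantics, a count greater than 15 produces
   zero:

     __m64 v  = _mm_set1_pi16 (3);
     __m64 r1 = _mm_slli_pi16 (v, 2);
     __m64 r2 = _mm_slli_pi16 (v, 16);

   R1 holds 12 in every lane; R2 is all zeros.  */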

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}
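
/* Caveat: unlike the 16-bit variant above, this 32-bit path shifts
   each lane as a plain C scalar and does not clamp out-of-range
   counts, so as written a count of 32 or more is undefined behavior in
   C rather than yielding zero as the x86 pslld instruction does.  A
   small illustrative example:

     __m64 v = _mm_set_pi32 (1, 2);
     __m64 r = _mm_slli_pi32 (v, 4);

   R is equivalent to _mm_set_pi32 (16, 32).  */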

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}
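
/* Usage sketch (illustrative).  The arithmetic shift replicates the
   sign bit, so negative lanes remain negative:

     __m64 v = _mm_set_pi16 (-16, -8, 8, 16);
     __m64 r = _mm_srai_pi16 (v, 2);

   R is equivalent to _mm_set_pi16 (-4, -2, 2, 4).  */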

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}
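
/* Note: this 32-bit variant applies C's >> to a signed int, which GCC
   defines as an arithmetic (sign-extending) shift; as with the 32-bit
   left shift, counts of 32 or more are not clamped.  For example:

     __m64 v = _mm_set_pi32 (-32, 32);
     __m64 r = _mm_srai_pi32 (v, 3);

   R is equivalent to _mm_set_pi32 (-4, 4).  */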

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}
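
/* Usage sketch (illustrative).  The logical shift fills with zeros, so
   a negative 16-bit pattern becomes a large positive one:

     __m64 v = _mm_set1_pi16 (-2);
     __m64 r = _mm_srli_pi16 (v, 1);

   Each lane of V is 0xFFFE; each lane of R is 0x7FFF.  */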

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
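
/* Usage sketch (illustrative).  _mm_set_* takes the most significant
   element first, while _mm_setr_* takes the least significant first,
   so these two vectors are identical:

     __m64 a = _mm_set_pi16 (4, 3, 2, 1);
     __m64 b = _mm_setr_pi16 (1, 2, 3, 4);

   Both place 1 in the least significant lane and 4 in the most.  */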

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)w, 0));
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)b, 0));
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
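
/* Usage sketch (illustrative).  The _mm_set1_* forms replicate one
   scalar across every lane, which is why the power8/power9 paths above
   can use vec_splats directly:

     __m64 a = _mm_set1_pi8 (7);
     __m64 b = _mm_set_pi8 (7, 7, 7, 7, 7, 7, 7, 7);

   A and B hold identical values.  */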
#endif /* __powerpc64__ */
#endif /* _MMINTRIN_H_INCLUDED */