/* Copyright (C) 2002-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
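
/* As a usage sketch (the file name is hypothetical), code that has been
   reviewed against the caveats above can be built with the warning
   disabled and the power8 direct register moves enabled:

       gcc -mcpu=power8 -DNO_WARN_X86_INTRINSICS -c legacy_mmx.c
*/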

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char[8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
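
/* A minimal sketch of how __m64_union is used throughout this header:
   store the __m64, touch individual lanes through the matching array
   member, then read the __m64 back out.

       __m64_union u;
       u.as_m64 = m;
       u.as_short[0] += 1;    (lane 0 is the low 16 bits on little-endian)
       m = u.as_m64;
*/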

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}
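
/* For example, _mm_cvtsi32_si64 (-1) yields 0x00000000FFFFFFFF: the
   cast through unsigned int zero-extends instead of sign-extending.  */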

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}
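
/* For example, if both __m1 and __m2 hold the 16-bit lanes
   { 300, -200, 5, -5 }, each half of the result holds the 8-bit lanes
   { 127, -128, 5, -5 }: values outside [-128, 127] saturate.  */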

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char r;
  __vector signed short vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (vm1, __zero);
  r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1);
  __vector __bool char packsel = vec_pack (__select, __select);
  r = vec_sel (r, (const __vector unsigned char) __zero, packsel);
  return (__m64) ((__vector long long) r)[0];
}
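
/* For example, if both operands hold the signed 16-bit lanes
   { 300, -200, 5, 0 }, each half of the result holds { 255, 0, 5, 0 }:
   values above 255 clamp to 255, and the vec_sel above forces lanes
   that were negative before the unsigned pack to 0.  */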

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}
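
/* For example, on a little-endian target, with __m1 = 0x0706050403020100
   and __m2 = 0x0F0E0D0C0B0A0908 the result bytes, low to high, are
   04 0C 05 0D 06 0E 07 0F: the high four bytes of each operand,
   interleaved.  */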

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}
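
/* Note that the _si64 shifts above are plain C shifts on a 64-bit
   scalar, so counts of 64 or more are undefined behavior here, whereas
   the x86 instructions would produce zero; ported code should keep
   shift counts in the 0-63 range.  */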

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
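  /* The POWER6 cmpb instruction compares the two doublewords byte by
     byte, producing 0xFF for each pair of equal bytes and 0x00
     otherwise, which is exactly the pcmpeqb semantic.  */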
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__m64) ((__vector long long) c)[0];
}
   1089      1.1  mrg 
   1090      1.1  mrg extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1091      1.1  mrg _m_pmaddwd (__m64 __m1, __m64 __m2)
   1092      1.1  mrg {
   1093      1.1  mrg   return _mm_madd_pi16 (__m1, __m2);
   1094      1.1  mrg }
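
/* Usage sketch (added example, not part of the original header).  With
   lanes numbered from the least significant end, the result packs
   a0*b0 + a1*b1 into the low 32 bits and a2*b2 + a3*b3 into the high
   32 bits:

     __m64 a = _mm_set_pi16 (1, 2, 3, 4);
     __m64 b = _mm_set_pi16 (5, 6, 7, 8);
     __m64 r = _mm_madd_pi16 (a, b);

   r equals _mm_set_pi32 (1*5 + 2*6, 3*7 + 4*8), i.e. (17, 53).  */
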
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
#endif
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}
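
/* Usage sketch (added example, not part of the original header):

     __m64 a = _mm_set1_pi16 (0x4000);
     __m64 r = _mm_mulhi_pi16 (a, a);

   Each lane computes 0x4000 * 0x4000 = 0x10000000 and keeps only the
   high 16 bits, so every lane of r is 0x1000.  */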

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__m64) ((__vector long long) c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
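
/* Usage sketch (added example, not part of the original header).  The
   low 16 bits wrap modulo 65536:

     __m64 a = _mm_set1_pi16 (300);
     __m64 r = _mm_mullo_pi16 (a, a);

   300 * 300 = 90000 = 0x15F90, so every lane of r is 0x5F90 (24464).  */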

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}
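
/* Usage sketch (added example, not part of the original header).
   Counts greater than 15 clear the result, matching the MMX
   semantics:

     __m64 a = _mm_set1_pi16 (1);
     __m64 r1 = _mm_slli_pi16 (a, 3);   in range: every lane is 8
     __m64 r2 = _mm_slli_pi16 (a, 16);  out of range: r2 is 0  */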

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}
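
/* Usage sketch (added example, not part of the original header).  The
   arithmetic shift replicates the sign bit:

     __m64 a = _mm_set1_pi16 (-8);
     __m64 r = _mm_srai_pi16 (a, 1);

   Every lane of r is -4; a logical shift of the same input would have
   produced 0x7FFC instead.  */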

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__m64) ((__vector long long) r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke _mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
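
/* Usage sketch (added example, not part of the original header).
   Contrast between the arithmetic and logical 32-bit right shifts on
   a negative input:

     __m64 a = _mm_set1_pi32 (-4);
     __m64 r1 = _mm_srai_pi32 (a, 1);  arithmetic: each lane is -2
     __m64 r2 = _mm_srli_pi32 (a, 1);  logical: each lane is 0x7FFFFFFE  */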
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
             char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
              char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
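
/* Self-contained example (added; not part of the original header).
   The _MM_EXAMPLES guard is hypothetical -- define it only when this
   illustrative helper should be compiled.  _mm_set_* takes the most
   significant element first, _mm_setr_* the least significant first,
   so the two calls below build the same value.  */
#ifdef _MM_EXAMPLES
static __inline int
__mm_example_set_vs_setr (void)
{
  __m64 a = _mm_set_pi16 (3, 2, 1, 0);
  __m64 b = _mm_setr_pi16 (0, 1, 2, 3);
  /* Returns 1: both vectors have lane 0 == 0 through lane 3 == 3.  */
  return _mm_cvtm64_si64 (a) == _mm_cvtm64_si64 (b);
}
#endif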

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) w)[0];
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) b)[0];
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
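
/* Self-contained example (added; not part of the original header; the
   _MM_EXAMPLES guard is hypothetical).  A splat is just the set form
   with every element repeated.  */
#ifdef _MM_EXAMPLES
static __inline int
__mm_example_set1 (void)
{
  __m64 a = _mm_set1_pi8 (7);
  __m64 b = _mm_set_pi8 (7, 7, 7, 7, 7, 7, 7, 7);
  /* Returns 1: the splat and the explicit set are identical.  */
  return _mm_cvtm64_si64 (a) == _mm_cvtm64_si64 (b);
}
#endif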
#endif /* _MMINTRIN_H_INCLUDED */