/* Copyright (C) 2002-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
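
/* As a sketch of typical usage (the exact -mcpu level is up to the
   build, and mmx_kernel.c is a placeholder name), existing MMX code
   might be compiled with:

     gcc -DNO_WARN_X86_INTRINSICS -mcpu=power8 -c mmx_kernel.c

   which silences the error above and enables the power8 direct register
   moves that the vector paths below rely on.  */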

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8),
			__may_alias__)) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
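
/* A minimal sketch of how __m64_union is used throughout this header:
   load an __m64 through as_m64, operate on individual lanes, and read
   the result back out.  For example (__x is a hypothetical variable):

     __m64_union __u;
     __u.as_m64 = __x;
     __u.as_short[0] += 1;
     __x = __u.as_m64;  */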

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
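
/* For example (a sketch): _mm_cvtsi32_si64 (-1) yields
   0x00000000ffffffffULL because the 32-bit argument is zero-extended,
   and _mm_cvtsi64_si32 of that value truncates back to the low 32 bits,
   recovering -1.  */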

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}
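
/* An implementation note on the pattern above, which recurs throughout
   this header: the two __m64 operands are placed in a single 128-bit
   vector register (operand order depends on endianness), the AltiVec
   operation is applied, and one element of the result viewed as a vector
   of two long longs is moved back to a scalar __m64.  With -mcpu=power8
   these transfers compile to direct GPR/VSR register moves (mtvsrd and
   mfvsrd) rather than round trips through memory.  */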

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
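/* Implementation note: vec_packs on unsigned shorts saturates against the
   unsigned range only, so a negative 16-bit input is seen as a large
   unsigned value and would saturate to 0xFF instead of clamping to zero.
   The code below therefore records which input elements are negative
   (vec_cmplt against zero), packs that mask down alongside the data, and
   uses vec_sel to force the corresponding result bytes to zero, matching
   the x86 packuswb behavior.  */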
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (__vm1, __zero);
  __r = vec_packs ((__vector unsigned short) __vm1, (__vector unsigned short) __vm1);
  __vector __bool char __packsel = vec_pack (__select, __select);
  __r = vec_sel (__r, (const __vector unsigned char) __zero, __packsel);
  return (__m64) ((__vector long long) __r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
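/* Implementation note: vec_splats replicates the 64-bit operand into both
   halves of the 128-bit register, so a single vec_mergel of the two
   splatted values yields both byte interleavings at once; the doubleword
   extract then selects the high-half interleave here (element [1]) and
   the low-half interleave in _mm_unpacklo_pi8 below (element [0]).  */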
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
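/* Note: as with the x86 paddb/paddw/paddd family, the element-wise add
   and subtract operations below wrap on overflow (modulo 2^8, 2^16 or
   2^32 per lane); the saturating variants are the _mm_adds_ and
   _mm_subs_ families further below.  */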
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
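/* Note: the shift helpers below operate on the 64-bit scalar with plain C
   shifts, so a COUNT outside 0..63 is undefined behavior in C, whereas
   the x86 psllq/psrlq instructions return zero for counts of 64 or more.
   Ported code that may pass large counts should clamp them first.  */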
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
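/* Implementation note: on 64-bit POWER6 and later, the equality case can
   use the cmpb instruction, which compares the two doublewords byte by
   byte and sets each result byte to 0xFF where the bytes are equal and
   to 0x00 otherwise, which is exactly the pcmpeqb semantics.  */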
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (__res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = (__vector signed char)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0])? -1: 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1])? -1: 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2])? -1: 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3])? -1: 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4])? -1: 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5])? -1: 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6])? -1: 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0])? -1: 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1])? -1: 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2])? -1: 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0])? -1: 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1])? -1: 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
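/* For example, with signed saturation 0x70 + 0x70 yields 0x7F rather
   than wrapping to 0xE0, and (-128) + (-1) stays at -128.  The AltiVec
   vec_adds and vec_subs operations provide exactly this per-element
   behavior.  */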
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
   1078      1.1  mrg extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1079      1.1  mrg _mm_madd_pi16 (__m64 __m1, __m64 __m2)
   1080      1.1  mrg {
   1081  1.1.1.3  mrg   __vector signed short __a, __b;
   1082  1.1.1.3  mrg   __vector signed int __c;
   1083  1.1.1.3  mrg   __vector signed int __zero = {0, 0, 0, 0};
   1084  1.1.1.3  mrg 
   1085  1.1.1.3  mrg   __a = (__vector signed short)vec_splats (__m1);
   1086  1.1.1.3  mrg   __b = (__vector signed short)vec_splats (__m2);
   1087  1.1.1.3  mrg   __c = vec_vmsumshm (__a, __b, __zero);
   1088  1.1.1.3  mrg   return (__m64) ((__vector long long) __c)[0];
   1089      1.1  mrg }
   1090      1.1  mrg 
   1091      1.1  mrg extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1092      1.1  mrg _m_pmaddwd (__m64 __m1, __m64 __m2)
   1093      1.1  mrg {
   1094      1.1  mrg   return _mm_madd_pi16 (__m1, __m2);
   1095      1.1  mrg }
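
/* Usage sketch (illustrative values only): adjacent 16-bit products are
   summed pairwise into 32-bit lanes.

     __m64 __a = _mm_set_pi16 (4, 3, 2, 1);
     __m64 __b = _mm_set_pi16 (8, 7, 6, 5);
     __m64 __r = _mm_madd_pi16 (__a, __b);

   The low lane is 1*5 + 2*6 = 17 and the high lane is 3*7 + 4*8 = 53,
   so __r equals _mm_set_pi32 (53, 17).  */
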
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
#endif
    };

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);

  __w0 = vec_vmulesh (__a, __b);
  __w1 = vec_vmulosh (__a, __b);
  __c = (__vector signed short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

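/* Usage sketch (illustrative values only): only the high halves of the
   signed 32-bit products are kept.

     __m64 __r = _mm_mulhi_pi16 (_mm_set1_pi16 (4096), _mm_set1_pi16 (16));

   Each product is 4096 * 16 = 65536 (0x00010000), so every 16-bit lane
   of __r is 1.  */
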
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = __a * __b;
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

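/* Usage sketch (illustrative values only): the low halves of the
   products wrap modulo 2**16.

     __m64 __r = _mm_mullo_pi16 (_mm_set1_pi16 (300), _mm_set1_pi16 (300));

   300 * 300 = 90000 (0x00015F90), so every lane of __r holds the
   truncated value 0x5F90 (24464).  */
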
/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sl (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

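/* Usage sketch (illustrative values only): counts of 15 or less shift
   each lane; larger counts fall through to the else branch and yield 0,
   matching the x86 behavior for out-of-range shift counts.

     __m64 __r0 = _mm_slli_pi16 (_mm_set1_pi16 (3), 2);
     __m64 __r1 = _mm_slli_pi16 (_mm_set1_pi16 (3), 16);

   Each lane of __r0 is 12; every lane of __r1 is 0.  */
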
/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

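/* Usage sketch (illustrative values only):

     __m64 __r = _mm_slli_pi32 (_mm_set1_pi32 (1), 4);

   Each 32-bit lane of __r is 16.  Note that, unlike the 16-bit path
   above, this implementation shifts the lanes with plain C shifts and
   does not guard counts greater than 31.  */
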
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sra (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

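/* Usage sketch (illustrative values only): the sign bit is replicated
   into the vacated high bits.

     __m64 __r = _mm_srai_pi16 (_mm_set1_pi16 (-32768), 8);

   Each lane of __r is -128 (0xFF80): 0x8000 shifted right
   arithmetically by 8 keeps the sign.  */
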
/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector unsigned short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sr (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

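/* Usage sketch (illustrative values only): in contrast to the
   arithmetic shift, zeros are shifted in from the left.

     __m64 __r = _mm_srli_pi16 (_mm_set1_pi16 (-32768), 8);

   Each lane of __r is 128 (0x0080): 0x8000 shifted right logically by 8
   discards the sign.  */
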
/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

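/* Usage sketch: _mm_set_* takes the most significant element first,
   while _mm_setr_* takes the least significant element first, so the
   two calls below construct the same value.

     __m64 __x = _mm_set_pi16 (4, 3, 2, 1);
     __m64 __y = _mm_setr_pi16 (1, 2, 3, 4);

   Both place 1 in the least significant 16-bit lane and 4 in the most
   significant one.  */
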
/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short __res;

  __res = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) __res)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) __res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}
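
/* Usage sketch: the _mm_set1_* forms replicate one scalar across all
   lanes, so _mm_set1_pi16 (7) is equivalent to _mm_set_pi16 (7, 7, 7, 7)
   and _mm_set1_pi8 (9) to _mm_setr_pi8 (9, 9, 9, 9, 9, 9, 9, 9).  */
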
#endif /* _MMINTRIN_H_INCLUDED */