Home | History | Annotate | Line # | Download | only in rs6000
mmintrin.h revision 1.1.1.2
      1 /* Copyright (C) 2002-2019 Free Software Foundation, Inc.
      2 
      3    This file is part of GCC.
      4 
      5    GCC is free software; you can redistribute it and/or modify
      6    it under the terms of the GNU General Public License as published by
      7    the Free Software Foundation; either version 3, or (at your option)
      8    any later version.
      9 
     10    GCC is distributed in the hope that it will be useful,
     11    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13    GNU General Public License for more details.
     14 
     15    Under Section 7 of GPL version 3, you are granted additional
     16    permissions described in the GCC Runtime Library Exception, version
     17    3.1, as published by the Free Software Foundation.
     18 
     19    You should have received a copy of the GNU General Public License and
     20    a copy of the GCC Runtime Library Exception along with this program;
     21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     22    <http://www.gnu.org/licenses/>.  */
     23 
     24 /* Implemented from the specification included in the Intel C++ Compiler
     25    User Guide and Reference, version 9.0.  */
     26 
     27 #ifndef NO_WARN_X86_INTRINSICS
     28 /* This header is distributed to simplify porting x86_64 code that
     29    makes explicit use of Intel intrinsics to powerpc64le.
     30    It is the user's responsibility to determine if the results are
     31    acceptable and make additional changes as necessary.
     32    Note that much code that uses Intel intrinsics can be rewritten in
     33    standard C or GNU C extensions, which are more portable and better
     34    optimized across multiple targets.
     35 
     36    In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
     37    target does not support a native __vector_size__ (8) type.  Instead
     38    we typedef __m64 to a 64-bit unsigned long long, which is natively
     39    supported in 64-bit mode.  This works well for the _si64 and some
     40    _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
     42    smaller code) to transfer __m64 data to the PowerPC vector 128-bit
     43    unit, perform the operation, and then transfer the result back to
     44    the __m64 type. This implies that the direct register move
     45    instructions, introduced with power8, are available for efficient
     46    implementation of these transfers.
     47 
     48    Most MMX intrinsic operations can be performed efficiently as
     49    C language 64-bit scalar operation or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
     51    applications.  */
     52 #error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
     53 #endif
     54 
     55 #ifndef _MMINTRIN_H_INCLUDED
     56 #define _MMINTRIN_H_INCLUDED
     57 
     58 #include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
/* On PowerPC there is no native __vector_size__ (8) type, so __m64 is a
   plain 64-bit unsigned integer (see the porting note at the top of this
   file).  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

/* Union used by the scalar fallback paths below to access the individual
   8-, 16-, 32- and 64-bit lanes of an __m64 value.  */
typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
     75 
/* Empty the multimedia state.  On x86 this (EMMS) resets the FP/MMX
   register tag word; __m64 lives in ordinary registers here, so there is
   no state to reset.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Original MMX name for _mm_empty.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}
     88 
     89 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
     90 extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     91 _mm_cvtsi32_si64 (int __i)
     92 {
     93   return (__m64) (unsigned int) __i;
     94 }
     95 
     96 extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     97 _m_from_int (int __i)
     98 {
     99   return _mm_cvtsi32_si64 (__i);
    100 }
    101 
/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  /* Truncating cast: the upper 32 bits are discarded.  */
  return ((int) __i);
}

/* Original MMX name for _mm_cvtsi64_si32.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
    114 
/* Convert I to a __m64 object.  Because __m64 is a 64-bit integer on
   this target, all four spellings below are value-preserving casts.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
    142 
/* Convert the __m64 object to a 64bit integer.  All three spellings are
   value-preserving casts on this target.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}
    164 
    165 #ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  /* Place both 64-bit operands in one 128-bit vector; the doubleword
     order is swapped between endiannesses so the packed bytes come out
     in the order the x86 intrinsic specifies.  */
  vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  /* vec_packs narrows with signed saturation.  Both arguments are the
     same vector, so both halves of the result are identical and
     doubleword element 0 holds the eight packed bytes.  */
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

/* Original MMX mnemonic (packsswb) for _mm_packs_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}
    190 
/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  /* Both operands in one vector; doubleword order swapped by endianness
     so the result matches the x86 layout.  */
  vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  /* Signed-saturating narrow; both result halves are identical, so
     doubleword 0 is the packed result.  */
  vresult = vec_packs (vm1, vm1);
  return (__m64) ((__vector long long) vresult)[0];
}

/* Original MMX mnemonic (packssdw) for _mm_packs_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}
    215 
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char r;
  /* Both operands in one vector; doubleword order swapped by endianness
     so the result matches the x86 layout.  */
  __vector signed short vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (vm1, __zero);
  /* Packing the lanes as unsigned would saturate negative inputs to
     0xFF (they look like huge unsigned values), so record which lanes
     were negative and force those result bytes back to 0.  */
  r = vec_packs ((__vector unsigned short) vm1, (__vector unsigned short) vm1);
  __vector __bool char packsel = vec_pack (__select, __select);
  r = vec_sel (r, (const __vector unsigned char) __zero, packsel);
  return (__m64) ((__vector long long) r)[0];
}

/* Original MMX mnemonic (packuswb) for _mm_packs_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
    242 #endif /* end ARCH_PWR8 */
    243 
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  /* Splat each 64-bit operand into both halves of a 128-bit vector,
     interleave bytes with vec_mergel, and extract the doubleword that
     holds the high-half interleave (element 1 here; the low-half
     variant below extracts element 0).  */
  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[1];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: alternate bytes 4..7 of M1 and M2.  */
  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (punpckhbw) for _mm_unpackhi_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}
    280 
    281 /* Interleave the two 16-bit values from the high half of M1 with the two
    282    16-bit values from the high half of M2.  */
    283 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    284 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
    285 {
    286   __m64_union m1, m2, res;
    287 
    288   m1.as_m64 = __m1;
    289   m2.as_m64 = __m2;
    290 
    291   res.as_short[0] = m1.as_short[2];
    292   res.as_short[1] = m2.as_short[2];
    293   res.as_short[2] = m1.as_short[3];
    294   res.as_short[3] = m2.as_short[3];
    295 
    296   return (__m64) res.as_m64;
    297 }
    298 
    299 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    300 _m_punpckhwd (__m64 __m1, __m64 __m2)
    301 {
    302   return _mm_unpackhi_pi16 (__m1, __m2);
    303 }
    304 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
    305    value from the high half of M2.  */
    306 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    307 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
    308 {
    309   __m64_union m1, m2, res;
    310 
    311   m1.as_m64 = __m1;
    312   m2.as_m64 = __m2;
    313 
    314   res.as_int[0] = m1.as_int[1];
    315   res.as_int[1] = m2.as_int[1];
    316 
    317   return (__m64) res.as_m64;
    318 }
    319 
    320 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    321 _m_punpckhdq (__m64 __m1, __m64 __m2)
    322 {
    323   return _mm_unpackhi_pi32 (__m1, __m2);
    324 }
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  /* Same splat/merge trick as _mm_unpackhi_pi8, but the low-half
     interleave lives in doubleword element 0.  */
  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: alternate bytes 0..3 of M1 and M2.  */
  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (punpcklbw) for _mm_unpacklo_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}
    361 /* Interleave the two 16-bit values from the low half of M1 with the two
    362    16-bit values from the low half of M2.  */
    363 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    364 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
    365 {
    366   __m64_union m1, m2, res;
    367 
    368   m1.as_m64 = __m1;
    369   m2.as_m64 = __m2;
    370 
    371   res.as_short[0] = m1.as_short[0];
    372   res.as_short[1] = m2.as_short[0];
    373   res.as_short[2] = m1.as_short[1];
    374   res.as_short[3] = m2.as_short[1];
    375 
    376   return (__m64) res.as_m64;
    377 }
    378 
    379 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    380 _m_punpcklwd (__m64 __m1, __m64 __m2)
    381 {
    382   return _mm_unpacklo_pi16 (__m1, __m2);
    383 }
    384 
    385 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
    386    value from the low half of M2.  */
    387 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    388 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
    389 {
    390   __m64_union m1, m2, res;
    391 
    392   m1.as_m64 = __m1;
    393   m2.as_m64 = __m2;
    394 
    395   res.as_int[0] = m1.as_int[0];
    396   res.as_int[1] = m2.as_int[0];
    397 
    398   return (__m64) res.as_m64;
    399 }
    400 
    401 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    402 _m_punpckldq (__m64 __m1, __m64 __m2)
    403 {
    404   return _mm_unpacklo_pi32 (__m1, __m2);
    405 }
    406 
/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  /* Move both operands to the vector unit (direct moves are available
     from power8 on), add all byte lanes at once, and move doubleword 0
     back.  */
  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: each byte lane is added independently; carries do
     not propagate between lanes.  */
  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (paddb) for _mm_add_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}
    442 
/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  /* Vector path: splat, add all halfword lanes, extract doubleword 0.  */
  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: lanes are added independently.  */
  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (paddw) for _mm_add_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}
    474 
/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  /* Note: unlike the _pi8/_pi16 variants this vector path is gated on
     power9, not power8.  */
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: the two word lanes are added independently.  */
  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (paddd) for _mm_add_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}
    504 
/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  /* Vector path: splat, subtract all byte lanes, extract doubleword 0.  */
  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: borrows do not propagate between lanes.  */
  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (psubb) for _mm_sub_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}
    540 
/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  /* Vector path: splat, subtract all halfword lanes, extract
     doubleword 0.  */
  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: lanes are subtracted independently.  */
  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (psubw) for _mm_sub_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}
    572 
/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  /* Note: as with _mm_add_pi32, the vector path is gated on power9.  */
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: the two word lanes are subtracted independently.  */
  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (psubd) for _mm_sub_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}
    602 
    603 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    604 _mm_add_si64 (__m64 __m1, __m64 __m2)
    605 {
    606   return (__m1 + __m2);
    607 }
    608 
    609 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    610 _mm_sub_si64 (__m64 __m1, __m64 __m2)
    611 {
    612   return (__m1 - __m2);
    613 }
    614 
/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  /* NOTE(review): a COUNT >= 64 is undefined behavior in C, whereas the
     x86 psllq instruction yields 0 — confirm callers keep the count in
     range.  */
  return (__m << __count);
}

/* Original MMX mnemonic (psllq) for _mm_sll_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

/* Shift left by an immediate (int) count; same range caveat as above.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

/* Original MMX mnemonic (psllqi) for _mm_slli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}
    639 
/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  /* __m64 is unsigned, so this is a logical (zero-filling) right shift.
     NOTE(review): a COUNT >= 64 is undefined behavior in C, whereas the
     x86 psrlq instruction yields 0.  */
  return (__m >> __count);
}

/* Original MMX mnemonic (psrlq) for _mm_srl_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

/* Shift right by an immediate (int) count; same range caveat as above.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

/* Original MMX mnemonic (psrlqi) for _mm_srli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}
    664 
    665 /* Bit-wise AND the 64-bit values in M1 and M2.  */
    666 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    667 _mm_and_si64 (__m64 __m1, __m64 __m2)
    668 {
    669   return (__m1 & __m2);
    670 }
    671 
    672 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    673 _m_pand (__m64 __m1, __m64 __m2)
    674 {
    675   return _mm_and_si64 (__m1, __m2);
    676 }
    677 
    678 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
    679    64-bit value in M2.  */
    680 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    681 _mm_andnot_si64 (__m64 __m1, __m64 __m2)
    682 {
    683   return (~__m1 & __m2);
    684 }
    685 
    686 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    687 _m_pandn (__m64 __m1, __m64 __m2)
    688 {
    689   return _mm_andnot_si64 (__m1, __m2);
    690 }
    691 
    692 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
    693 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    694 _mm_or_si64 (__m64 __m1, __m64 __m2)
    695 {
    696   return (__m1 | __m2);
    697 }
    698 
    699 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    700 _m_por (__m64 __m1, __m64 __m2)
    701 {
    702   return _mm_or_si64 (__m1, __m2);
    703 }
    704 
    705 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
    706 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    707 _mm_xor_si64 (__m64 __m1, __m64 __m2)
    708 {
    709   return  (__m1 ^ __m2);
    710 }
    711 
    712 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    713 _m_pxor (__m64 __m1, __m64 __m2)
    714 {
    715   return _mm_xor_si64 (__m1, __m2);
    716 }
    717 
    718 /* Creates a 64-bit zero.  */
    719 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    720 _mm_setzero_si64 (void)
    721 {
    722   return (__m64) 0;
    723 }
    724 
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 res;
  /* The POWER6 cmpb instruction compares two GPRs byte by byte, setting
     each result byte to all-ones on equality and zero otherwise — the
     same semantics as pcmpeqb, in a single instruction.  */
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
	"r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: -1 (all-ones byte) where equal, 0 otherwise.  */
  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (pcmpeqb) for _mm_cmpeq_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}
    763 
/* Compare eight signed 8-bit values for greater-than; each result byte is
   0xFF if M1's byte is greater, else 0.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  /* Vector path: splat, signed compare of all byte lanes, extract
     doubleword 0 (vec_cmpgt yields all-ones/zero lanes).  */
  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: -1 (all-ones byte) where greater, 0 otherwise.  */
  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (pcmpgtb) for _mm_cmpgt_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}
    798 
/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  /* Vector path: splat, compare all halfword lanes, extract
     doubleword 0.  */
  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__m64) ((__vector long long) c)[0];
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  /* Scalar fallback: -1 (all-ones halfword) where equal, 0 otherwise.  */
  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

/* Original MMX mnemonic (pcmpeqw) for _mm_cmpeq_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}
    831 
    832 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    833 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
    834 {
    835 #if _ARCH_PWR8
    836   __vector signed short a, b, c;
    837 
    838   a = (__vector signed short)vec_splats (__m1);
    839   b = (__vector signed short)vec_splats (__m2);
    840   c = (__vector signed short)vec_cmpgt (a, b);
    841   return (__m64) ((__vector long long) c)[0];
    842 #else
    843   __m64_union m1, m2, res;
    844 
    845   m1.as_m64 = __m1;
    846   m2.as_m64 = __m2;
    847 
    848   res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
    849   res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
    850   res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
    851   res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;
    852 
    853   return (__m64) res.as_m64;
    854 #endif
    855 }
    856 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_cmpgt_pi16.  */
  return _mm_cmpgt_pi16 (__m1, __m2);
}
    862 
    863 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
    864    the test is true and zero if false.  */
    865 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    866 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
    867 {
    868 #if _ARCH_PWR9
    869   __vector signed int a, b, c;
    870 
    871   a = (__vector signed int)vec_splats (__m1);
    872   b = (__vector signed int)vec_splats (__m2);
    873   c = (__vector signed int)vec_cmpeq (a, b);
    874   return (__m64) ((__vector long long) c)[0];
    875 #else
    876   __m64_union m1, m2, res;
    877 
    878   m1.as_m64 = __m1;
    879   m2.as_m64 = __m2;
    880 
    881   res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
    882   res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;
    883 
    884   return (__m64) res.as_m64;
    885 #endif
    886 }
    887 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_cmpeq_pi32.  */
  return _mm_cmpeq_pi32 (__m1, __m2);
}
    893 
    894 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    895 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
    896 {
    897 #if _ARCH_PWR9
    898   __vector signed int a, b, c;
    899 
    900   a = (__vector signed int)vec_splats (__m1);
    901   b = (__vector signed int)vec_splats (__m2);
    902   c = (__vector signed int)vec_cmpgt (a, b);
    903   return (__m64) ((__vector long long) c)[0];
    904 #else
    905   __m64_union m1, m2, res;
    906 
    907   m1.as_m64 = __m1;
    908   m2.as_m64 = __m2;
    909 
    910   res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
    911   res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;
    912 
    913   return (__m64) res.as_m64;
    914 #endif
    915 }
    916 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_cmpgt_pi32.  */
  return _mm_cmpgt_pi32 (__m1, __m2);
}
    922 
    923 #if _ARCH_PWR8
    924 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
    925    saturated arithmetic.  */
    926 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    927 _mm_adds_pi8 (__m64 __m1, __m64 __m2)
    928 {
    929   __vector signed char a, b, c;
    930 
    931   a = (__vector signed char)vec_splats (__m1);
    932   b = (__vector signed char)vec_splats (__m2);
    933   c = vec_adds (a, b);
    934   return (__m64) ((__vector long long) c)[0];
    935 }
    936 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_adds_pi8.  */
  return _mm_adds_pi8 (__m1, __m2);
}
    942 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
    943    saturated arithmetic.  */
    944 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    945 _mm_adds_pi16 (__m64 __m1, __m64 __m2)
    946 {
    947   __vector signed short a, b, c;
    948 
    949   a = (__vector signed short)vec_splats (__m1);
    950   b = (__vector signed short)vec_splats (__m2);
    951   c = vec_adds (a, b);
    952   return (__m64) ((__vector long long) c)[0];
    953 }
    954 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_adds_pi16.  */
  return _mm_adds_pi16 (__m1, __m2);
}
    960 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
    961    saturated arithmetic.  */
    962 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    963 _mm_adds_pu8 (__m64 __m1, __m64 __m2)
    964 {
    965   __vector unsigned char a, b, c;
    966 
    967   a = (__vector unsigned char)vec_splats (__m1);
    968   b = (__vector unsigned char)vec_splats (__m2);
    969   c = vec_adds (a, b);
    970   return (__m64) ((__vector long long) c)[0];
    971 }
    972 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_adds_pu8.  */
  return _mm_adds_pu8 (__m1, __m2);
}
    978 
    979 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
    980    saturated arithmetic.  */
    981 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    982 _mm_adds_pu16 (__m64 __m1, __m64 __m2)
    983 {
    984   __vector unsigned short a, b, c;
    985 
    986   a = (__vector unsigned short)vec_splats (__m1);
    987   b = (__vector unsigned short)vec_splats (__m2);
    988   c = vec_adds (a, b);
    989   return (__m64) ((__vector long long) c)[0];
    990 }
    991 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_adds_pu16.  */
  return _mm_adds_pu16 (__m1, __m2);
}
    997 
    998 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
    999    saturating arithmetic.  */
   1000 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1001 _mm_subs_pi8 (__m64 __m1, __m64 __m2)
   1002 {
   1003   __vector signed char a, b, c;
   1004 
   1005   a = (__vector signed char)vec_splats (__m1);
   1006   b = (__vector signed char)vec_splats (__m2);
   1007   c = vec_subs (a, b);
   1008   return (__m64) ((__vector long long) c)[0];
   1009 }
   1010 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_subs_pi8.  */
  return _mm_subs_pi8 (__m1, __m2);
}
   1016 
   1017 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   1018    signed saturating arithmetic.  */
   1019 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1020 _mm_subs_pi16 (__m64 __m1, __m64 __m2)
   1021 {
   1022   __vector signed short a, b, c;
   1023 
   1024   a = (__vector signed short)vec_splats (__m1);
   1025   b = (__vector signed short)vec_splats (__m2);
   1026   c = vec_subs (a, b);
   1027   return (__m64) ((__vector long long) c)[0];
   1028 }
   1029 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_subs_pi16.  */
  return _mm_subs_pi16 (__m1, __m2);
}
   1035 
   1036 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   1037    unsigned saturating arithmetic.  */
   1038 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1039 _mm_subs_pu8 (__m64 __m1, __m64 __m2)
   1040 {
   1041   __vector unsigned char a, b, c;
   1042 
   1043   a = (__vector unsigned char)vec_splats (__m1);
   1044   b = (__vector unsigned char)vec_splats (__m2);
   1045   c = vec_subs (a, b);
   1046   return (__m64) ((__vector long long) c)[0];
   1047 }
   1048 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_subs_pu8.  */
  return _mm_subs_pu8 (__m1, __m2);
}
   1054 
   1055 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   1056    unsigned saturating arithmetic.  */
   1057 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1058 _mm_subs_pu16 (__m64 __m1, __m64 __m2)
   1059 {
   1060   __vector unsigned short a, b, c;
   1061 
   1062   a = (__vector unsigned short)vec_splats (__m1);
   1063   b = (__vector unsigned short)vec_splats (__m2);
   1064   c = vec_subs (a, b);
   1065   return (__m64) ((__vector long long) c)[0];
   1066 }
   1067 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_subs_pu16.  */
  return _mm_subs_pu16 (__m1, __m2);
}
   1073 
   1074 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   1075    four 32-bit intermediate results, which are then summed by pairs to
   1076    produce two 32-bit results.  */
   1077 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1078 _mm_madd_pi16 (__m64 __m1, __m64 __m2)
   1079 {
   1080   __vector signed short a, b;
   1081   __vector signed int c;
   1082   __vector signed int zero = {0, 0, 0, 0};
   1083 
   1084   a = (__vector signed short)vec_splats (__m1);
   1085   b = (__vector signed short)vec_splats (__m2);
   1086   c = vec_vmsumshm (a, b, zero);
   1087   return (__m64) ((__vector long long) c)[0];
   1088 }
   1089 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_madd_pi16.  */
  return _mm_madd_pi16 (__m1, __m2);
}
   1095 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   1096    M2 and produce the high 16 bits of the 32-bit results.  */
   1097 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1098 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
   1099 {
   1100   __vector signed short a, b;
   1101   __vector signed short c;
   1102   __vector signed int w0, w1;
   1103   __vector unsigned char xform1 = {
   1104 #ifdef __LITTLE_ENDIAN__
   1105       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
   1106       0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
   1107 #else
   1108       0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
   1109       0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
   1110 #endif
   1111     };
   1112 
   1113   a = (__vector signed short)vec_splats (__m1);
   1114   b = (__vector signed short)vec_splats (__m2);
   1115 
   1116   w0 = vec_vmulesh (a, b);
   1117   w1 = vec_vmulosh (a, b);
   1118   c = (__vector signed short)vec_perm (w0, w1, xform1);
   1119 
   1120   return (__m64) ((__vector long long) c)[0];
   1121 }
   1122 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_mulhi_pi16.  */
  return _mm_mulhi_pi16 (__m1, __m2);
}
   1128 
   1129 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   1130    the low 16 bits of the results.  */
   1131 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1132 _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
   1133 {
   1134   __vector signed short a, b, c;
   1135 
   1136   a = (__vector signed short)vec_splats (__m1);
   1137   b = (__vector signed short)vec_splats (__m2);
   1138   c = a * b;
   1139   return (__m64) ((__vector long long) c)[0];
   1140 }
   1141 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  /* MMX mnemonic-style alias for _mm_mullo_pi16.  */
  return _mm_mullo_pi16 (__m1, __m2);
}
   1147 
   1148 /* Shift four 16-bit values in M left by COUNT.  */
   1149 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1150 _mm_sll_pi16 (__m64 __m, __m64 __count)
   1151 {
   1152   __vector signed short m, r;
   1153   __vector unsigned short c;
   1154 
   1155   if (__count <= 15)
   1156     {
   1157       m = (__vector signed short)vec_splats (__m);
   1158       c = (__vector unsigned short)vec_splats ((unsigned short)__count);
   1159       r = vec_sl (m, (__vector unsigned short)c);
   1160       return (__m64) ((__vector long long) r)[0];
   1161     }
   1162   else
   1163   return (0);
   1164 }
   1165 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  /* MMX mnemonic-style alias for _mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}
   1171 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 and invoke _mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}
   1178 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  /* MMX mnemonic-style alias for _mm_slli_pi16.  */
  return _mm_slli_pi16 (__m, __count);
}
   1184 
   1185 /* Shift two 32-bit values in M left by COUNT.  */
   1186 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1187 _mm_sll_pi32 (__m64 __m, __m64 __count)
   1188 {
   1189   __m64_union m, res;
   1190 
   1191   m.as_m64 = __m;
   1192 
   1193   res.as_int[0] = m.as_int[0] << __count;
   1194   res.as_int[1] = m.as_int[1] << __count;
   1195   return (res.as_m64);
   1196 }
   1197 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  /* MMX mnemonic-style alias for _mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}
   1203 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 and invoke _mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}
   1210 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  /* MMX mnemonic-style alias for _mm_slli_pi32.  */
  return _mm_slli_pi32 (__m, __count);
}
   1216 
   1217 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
   1218 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1219 _mm_sra_pi16 (__m64 __m, __m64 __count)
   1220 {
   1221   __vector signed short m, r;
   1222   __vector unsigned short c;
   1223 
   1224   if (__count <= 15)
   1225     {
   1226 	m = (__vector signed short)vec_splats (__m);
   1227 	c = (__vector unsigned short)vec_splats ((unsigned short)__count);
   1228 	r = vec_sra (m, (__vector unsigned short)c);
   1229         return (__m64) ((__vector long long) r)[0];
   1230     }
   1231   else
   1232   return (0);
   1233 }
   1234 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  /* MMX mnemonic-style alias for _mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}
   1240 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 and invoke _mm_sra_pi16.
     (The old comment said mm_sra_pi32 — a copy/paste slip.)  */
  return _mm_sra_pi16 (__m, __count);
}
   1247 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  /* MMX mnemonic-style alias for _mm_srai_pi16.  */
  return _mm_srai_pi16 (__m, __count);
}
   1253 
   1254 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
   1255 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1256 _mm_sra_pi32 (__m64 __m, __m64 __count)
   1257 {
   1258   __m64_union m, res;
   1259 
   1260   m.as_m64 = __m;
   1261 
   1262   res.as_int[0] = m.as_int[0] >> __count;
   1263   res.as_int[1] = m.as_int[1] >> __count;
   1264   return (res.as_m64);
   1265 }
   1266 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  /* MMX mnemonic-style alias for _mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}
   1272 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 and invoke _mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}
   1279 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  /* MMX mnemonic-style alias for _mm_srai_pi32.  */
  return _mm_srai_pi32 (__m, __count);
}
   1285 
   1286 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
   1287 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1288 _mm_srl_pi16 (__m64 __m, __m64 __count)
   1289 {
   1290   __vector unsigned short m, r;
   1291   __vector unsigned short c;
   1292 
   1293   if (__count <= 15)
   1294     {
   1295 	m = (__vector unsigned short)vec_splats (__m);
   1296 	c = (__vector unsigned short)vec_splats ((unsigned short)__count);
   1297 	r = vec_sr (m, (__vector unsigned short)c);
   1298         return (__m64) ((__vector long long) r)[0];
   1299     }
   1300   else
   1301     return (0);
   1302 }
   1303 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  /* MMX mnemonic-style alias for _mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}
   1309 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 and invoke _mm_srl_pi16.
     (The old comment said mm_sra_pi32 — a copy/paste slip.)  */
  return _mm_srl_pi16 (__m, __count);
}
   1316 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  /* MMX mnemonic-style alias for _mm_srli_pi16.  */
  return _mm_srli_pi16 (__m, __count);
}
   1322 
   1323 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
   1324 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1325 _mm_srl_pi32 (__m64 __m, __m64 __count)
   1326 {
   1327   __m64_union m, res;
   1328 
   1329   m.as_m64 = __m;
   1330 
   1331   res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
   1332   res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
   1333   return (res.as_m64);
   1334 }
   1335 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  /* MMX mnemonic-style alias for _mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}
   1341 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote the int count to __m64 and invoke _mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}
   1348 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  /* MMX mnemonic-style alias for _mm_srli_pi32.  */
  return _mm_srli_pi32 (__m, __count);
}
   1354 #endif /* _ARCH_PWR8 */
   1355 
   1356 /* Creates a vector of two 32-bit values; I0 is least significant.  */
   1357 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1358 _mm_set_pi32 (int __i1, int __i0)
   1359 {
   1360   __m64_union res;
   1361 
   1362   res.as_int[0] = __i0;
   1363   res.as_int[1] = __i1;
   1364   return (res.as_m64);
   1365 }
   1366 
   1367 /* Creates a vector of four 16-bit values; W0 is least significant.  */
   1368 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1369 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
   1370 {
   1371   __m64_union res;
   1372 
   1373   res.as_short[0] = __w0;
   1374   res.as_short[1] = __w1;
   1375   res.as_short[2] = __w2;
   1376   res.as_short[3] = __w3;
   1377   return (res.as_m64);
   1378 }
   1379 
   1380 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
   1381 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1382 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
   1383 	     char __b3, char __b2, char __b1, char __b0)
   1384 {
   1385   __m64_union res;
   1386 
   1387   res.as_char[0] = __b0;
   1388   res.as_char[1] = __b1;
   1389   res.as_char[2] = __b2;
   1390   res.as_char[3] = __b3;
   1391   res.as_char[4] = __b4;
   1392   res.as_char[5] = __b5;
   1393   res.as_char[6] = __b6;
   1394   res.as_char[7] = __b7;
   1395   return (res.as_m64);
   1396 }
   1397 
   1398 /* Similar, but with the arguments in reverse order.  */
   1399 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1400 _mm_setr_pi32 (int __i0, int __i1)
   1401 {
   1402   __m64_union res;
   1403 
   1404   res.as_int[0] = __i0;
   1405   res.as_int[1] = __i1;
   1406   return (res.as_m64);
   1407 }
   1408 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  /* Reverse-order variant of _mm_set_pi16.  */
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}
   1414 
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  /* Reverse-order variant of _mm_set_pi8.  */
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
   1421 
   1422 /* Creates a vector of two 32-bit values, both elements containing I.  */
   1423 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1424 _mm_set1_pi32 (int __i)
   1425 {
   1426   __m64_union res;
   1427 
   1428   res.as_int[0] = __i;
   1429   res.as_int[1] = __i;
   1430   return (res.as_m64);
   1431 }
   1432 
   1433 /* Creates a vector of four 16-bit values, all elements containing W.  */
   1434 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1435 _mm_set1_pi16 (short __w)
   1436 {
   1437 #if _ARCH_PWR9
   1438   __vector signed short w;
   1439 
   1440   w = (__vector signed short)vec_splats (__w);
   1441   return (__m64) ((__vector long long) w)[0];
   1442 #else
   1443   __m64_union res;
   1444 
   1445   res.as_short[0] = __w;
   1446   res.as_short[1] = __w;
   1447   res.as_short[2] = __w;
   1448   res.as_short[3] = __w;
   1449   return (res.as_m64);
   1450 #endif
   1451 }
   1452 
   1453 /* Creates a vector of eight 8-bit values, all elements containing B.  */
   1454 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
   1455 _mm_set1_pi8 (signed char __b)
   1456 {
   1457 #if _ARCH_PWR8
   1458   __vector signed char b;
   1459 
   1460   b = (__vector signed char)vec_splats (__b);
   1461   return (__m64) ((__vector long long) b)[0];
   1462 #else
   1463   __m64_union res;
   1464 
   1465   res.as_char[0] = __b;
   1466   res.as_char[1] = __b;
   1467   res.as_char[2] = __b;
   1468   res.as_char[3] = __b;
   1469   res.as_char[4] = __b;
   1470   res.as_char[5] = __b;
   1471   res.as_char[6] = __b;
   1472   res.as_char[7] = __b;
   1473   return (res.as_m64);
   1474 #endif
   1475 }
   1476 #endif /* _MMINTRIN_H_INCLUDED */
   1477