/* Copyright (C) 2002-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8))) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char[8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
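
/* Illustrative sketch (not part of this header's API): __m64_union views
   the same 64 bits at different element widths.  The lane values below
   assume the little-endian (powerpc64le) layout this header targets:

     __m64_union u;
     u.as_m64 = 0x0004000300020001ULL;
     // Then u.as_short[0] == 1 ... u.as_short[3] == 4.  */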

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}
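
/* Illustrative round trip through the 32-bit conversions (example only,
   not part of this header): the value is zero-extended on the way in and
   truncated on the way out:

     __m64 v = _mm_cvtsi32_si64 (-1);  // v == 0x00000000ffffffffULL
     int   i = _mm_cvtsi64_si32 (v);   // i == -1 again  */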

#ifdef __powerpc64__
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}
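
/* All of the 64-bit conversions above are plain bit copies, so the Intel
   and Microsoft spellings are interchangeable; an illustrative sketch:

     __m64     v = _mm_cvtsi64_m64 (0x123456789abcdef0LL);
     long long x = _mm_cvtm64_si64 (v);  // x == 0x123456789abcdef0LL  */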

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector signed char vresult;

  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshss (vm1, vm1);
  return (__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int vm1;
  __vector signed short vresult;

  vm1 = (__vector signed int)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkswss (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector signed short vm1;
  __vector unsigned char vresult;

  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
  vresult = vec_vpkshus (vm1, vm1);
  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */
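
/* Sketch of the saturating pack (illustrative only).  With element 0
   least significant:

     __m64 m = _mm_set_pi16 (300, -200, 127, -128);
     __m64 r = _mm_packs_pi16 (m, m);
     // Bytes 0..3 of r: -128, 127, -128, 127.  In-range values pass
     // through; 300 clamps to 127 and -200 clamps to -128.  */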

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[4];
  res.as_char[1] = m2.as_char[4];
  res.as_char[2] = m1.as_char[5];
  res.as_char[3] = m2.as_char[5];
  res.as_char[4] = m1.as_char[6];
  res.as_char[5] = m2.as_char[6];
  res.as_char[6] = m1.as_char[7];
  res.as_char[7] = m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[2];
  res.as_short[1] = m2.as_short[2];
  res.as_short[2] = m1.as_short[3];
  res.as_short[3] = m2.as_short[3];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[1];
  res.as_int[1] = m2.as_int[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_mergel (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 1));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0];
  res.as_char[1] = m2.as_char[0];
  res.as_char[2] = m1.as_char[1];
  res.as_char[3] = m2.as_char[1];
  res.as_char[4] = m1.as_char[2];
  res.as_char[5] = m2.as_char[2];
  res.as_char[6] = m1.as_char[3];
  res.as_char[7] = m2.as_char[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0];
  res.as_short[1] = m2.as_short[0];
  res.as_short[2] = m1.as_short[1];
  res.as_short[3] = m2.as_short[1];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0];
  res.as_int[1] = m2.as_int[0];

  return (__m64) res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
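
/* Interleave sketch (illustrative only; byte 0 is least significant):

     __m64 m1 = _mm_set_pi8 (17, 16, 15, 14, 13, 12, 11, 10);
     __m64 m2 = _mm_set_pi8 (27, 26, 25, 24, 23, 22, 21, 20);
     __m64 lo = _mm_unpacklo_pi8 (m1, m2);
     // bytes of lo: 10, 20, 11, 21, 12, 22, 13, 23
     __m64 hi = _mm_unpackhi_pi8 (m1, m2);
     // bytes of hi: 14, 24, 15, 25, 16, 26, 17, 27  */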

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] + m2.as_char[0];
  res.as_char[1] = m1.as_char[1] + m2.as_char[1];
  res.as_char[2] = m1.as_char[2] + m2.as_char[2];
  res.as_char[3] = m1.as_char[3] + m2.as_char[3];
  res.as_char[4] = m1.as_char[4] + m2.as_char[4];
  res.as_char[5] = m1.as_char[5] + m2.as_char[5];
  res.as_char[6] = m1.as_char[6] + m2.as_char[6];
  res.as_char[7] = m1.as_char[7] + m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] + m2.as_short[0];
  res.as_short[1] = m1.as_short[1] + m2.as_short[1];
  res.as_short[2] = m1.as_short[2] + m2.as_short[2];
  res.as_short[3] = m1.as_short[3] + m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_add (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] + m2.as_int[0];
  res.as_int[1] = m1.as_int[1] + m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}
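
/* The element-wise adds above wrap on overflow (modular arithmetic); the
   saturating variants appear later in this header.  A quick sketch:

     __m64 a = _mm_set1_pi16 (0x7fff);
     __m64 r = _mm_add_pi16 (a, a);  // every element wraps to -2  */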

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = m1.as_char[0] - m2.as_char[0];
  res.as_char[1] = m1.as_char[1] - m2.as_char[1];
  res.as_char[2] = m1.as_char[2] - m2.as_char[2];
  res.as_char[3] = m1.as_char[3] - m2.as_char[3];
  res.as_char[4] = m1.as_char[4] - m2.as_char[4];
  res.as_char[5] = m1.as_char[5] - m2.as_char[5];
  res.as_char[6] = m1.as_char[6] - m2.as_char[6];
  res.as_char[7] = m1.as_char[7] - m2.as_char[7];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = m1.as_short[0] - m2.as_short[0];
  res.as_short[1] = m1.as_short[1] - m2.as_short[1];
  res.as_short[2] = m1.as_short[2] - m2.as_short[2];
  res.as_short[3] = m1.as_short[3] - m2.as_short[3];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = vec_sub (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = m1.as_int[0] - m2.as_int[0];
  res.as_int[1] = m1.as_int[1] - m2.as_int[1];

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}
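
/* Because __m64 is just unsigned long long here, the _si64 add and
   subtract compile to single scalar instructions; for example:

     __m64 s = _mm_add_si64 (_mm_cvtsi64_m64 (1), _mm_cvtsi64_m64 (2));
     // s == 3  */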

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}
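
/* Porting caveat (informational): these 64-bit shifts are plain C shifts,
   so a COUNT of 64 or more is undefined behavior in C rather than the
   zero result MMX hardware produces; keep counts in 0..63.  Sketch:

     __m64 v = _mm_slli_si64 (_mm_cvtsi64_m64 (1), 63);  // top bit set
     __m64 w = _mm_srli_si64 (v, 63);                    // w == 1  */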

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}
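
/* Note for porters (illustrative): _mm_andnot_si64 complements its FIRST
   operand, e.g.:

     __m64 r = _mm_andnot_si64 (_mm_cvtsi64_m64 (0x00ff),
                                _mm_cvtsi64_m64 (0xffff));
     // r == 0xff00: the bits selected by the first operand are cleared.  */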

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#ifdef _ARCH_PWR6
  __m64 res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (res)
      : "r" (__m1),
        "r" (__m2)
      : );
  return (res);
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] == m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] == m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] == m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] == m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] == m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] == m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] == m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] == m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = (__vector signed char)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_char[0] = (m1.as_char[0] > m2.as_char[0])? -1: 0;
  res.as_char[1] = (m1.as_char[1] > m2.as_char[1])? -1: 0;
  res.as_char[2] = (m1.as_char[2] > m2.as_char[2])? -1: 0;
  res.as_char[3] = (m1.as_char[3] > m2.as_char[3])? -1: 0;
  res.as_char[4] = (m1.as_char[4] > m2.as_char[4])? -1: 0;
  res.as_char[5] = (m1.as_char[5] > m2.as_char[5])? -1: 0;
  res.as_char[6] = (m1.as_char[6] > m2.as_char[6])? -1: 0;
  res.as_char[7] = (m1.as_char[7] > m2.as_char[7])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] == m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] == m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] == m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] == m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = (__vector signed short)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? -1: 0;
  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? -1: 0;
  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? -1: 0;
  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpeq (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] == m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] == m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int a, b, c;

  a = (__vector signed int)vec_splats (__m1);
  b = (__vector signed int)vec_splats (__m2);
  c = (__vector signed int)vec_cmpgt (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
#else
  __m64_union m1, m2, res;

  m1.as_m64 = __m1;
  m2.as_m64 = __m2;

  res.as_int[0] = (m1.as_int[0] > m2.as_int[0])? -1: 0;
  res.as_int[1] = (m1.as_int[1] > m2.as_int[1])? -1: 0;

  return (__m64) res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}
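
/* The comparisons yield all-ones or all-zero element masks, which feed
   the usual AND/ANDNOT select idiom; an illustrative per-element max:

     __m64 a  = _mm_set_pi16 (1, 9, 3, 7);
     __m64 b  = _mm_set_pi16 (2, 8, 4, 6);
     __m64 gt = _mm_cmpgt_pi16 (a, b);
     __m64 mx = _mm_or_si64 (_mm_and_si64 (gt, a),
                             _mm_andnot_si64 (gt, b));
     // mx holds 7, 4, 9, 2 from least to most significant element.  */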

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_adds (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char a, b, c;

  a = (__vector signed char)vec_splats (__m1);
  b = (__vector signed char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char a, b, c;

  a = (__vector unsigned char)vec_splats (__m1);
  b = (__vector unsigned char)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short a, b, c;

  a = (__vector unsigned short)vec_splats (__m1);
  b = (__vector unsigned short)vec_splats (__m2);
  c = vec_subs (a, b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}
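
/* Saturating arithmetic clamps instead of wrapping; a quick sketch:

     __m64 a = _mm_adds_pi8 (_mm_set1_pi8 (100), _mm_set1_pi8 (100));
     // every element clamps to 127 (plain _mm_add_pi8 would wrap to -56)
     __m64 b = _mm_subs_pu8 (_mm_set1_pi8 (10), _mm_set1_pi8 (100));
     // every element clamps to 0  */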

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed int c;
  __vector signed int zero = {0, 0, 0, 0};

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = vec_vmsumshm (a, b, zero);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b;
  __vector signed short c;
  __vector signed int w0, w1;
  __vector unsigned char xform1 = {
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
    };

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);

  w0 = vec_vmulesh (a, b);
  w1 = vec_vmulosh (a, b);
  c = (__vector signed short)vec_perm (w0, w1, xform1);

  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short a, b, c;

  a = (__vector signed short)vec_splats (__m1);
  b = (__vector signed short)vec_splats (__m2);
  c = a * b;
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)c, 0));
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}
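
/* The hi/lo multiply pair recovers the full 32-bit product of each
   16-bit lane; illustrative values:

     __m64 a  = _mm_set1_pi16 (1000);
     __m64 hi = _mm_mulhi_pi16 (a, a);  // lanes: 1000000 >> 16    == 15
     __m64 lo = _mm_mullo_pi16 (a, a);  // lanes: 1000000 & 0xffff == 16960  */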

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sl (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] << __count;
  res.as_int[1] = m.as_int[1] << __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector signed short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sra (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = m.as_int[0] >> __count;
  res.as_int[1] = m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short m, r;
  __vector unsigned short c;

  if (__count <= 15)
    {
      m = (__vector unsigned short)vec_splats (__m);
      c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      r = vec_sr (m, (__vector unsigned short)c);
      return (__builtin_unpack_vector_int128 ((__vector __int128_t)r, 0));
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union m, res;

  m.as_m64 = __m;

  res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
  res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */
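
/* Unlike the scalar _si64 shifts above, the 16-bit element shifts follow
   the MMX convention of returning zero once COUNT exceeds the element
   width, as the guards in their implementations show:

     __m64 r = _mm_slli_pi16 (_mm_set1_pi16 (1), 16);  // r == 0  */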

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union res;

  res.as_short[0] = __w0;
  res.as_short[1] = __w1;
  res.as_short[2] = __w2;
  res.as_short[3] = __w3;
  return (res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union res;

  res.as_char[0] = __b0;
  res.as_char[1] = __b1;
  res.as_char[2] = __b2;
  res.as_char[3] = __b3;
  res.as_char[4] = __b4;
  res.as_char[5] = __b5;
  res.as_char[6] = __b6;
  res.as_char[7] = __b7;
  return (res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union res;

  res.as_int[0] = __i0;
  res.as_int[1] = __i1;
  return (res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
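
/* _mm_set_* takes the most significant element first and _mm_setr_* the
   least significant first, so these two calls build the same vector:

     __m64 x = _mm_set_pi32  (2, 1);  // as_int[0] == 1, as_int[1] == 2
     __m64 y = _mm_setr_pi32 (1, 2);  // same value as x  */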

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union res;

  res.as_int[0] = __i;
  res.as_int[1] = __i;
  return (res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats (__w);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)w, 0));
#else
  __m64_union res;

  res.as_short[0] = __w;
  res.as_short[1] = __w;
  res.as_short[2] = __w;
  res.as_short[3] = __w;
  return (res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char b;

  b = (__vector signed char)vec_splats (__b);
  return (__builtin_unpack_vector_int128 ((__vector __int128_t)b, 0));
#else
  __m64_union res;

  res.as_char[0] = __b;
  res.as_char[1] = __b;
  res.as_char[2] = __b;
  res.as_char[3] = __b;
  res.as_char[4] = __b;
  res.as_char[5] = __b;
  res.as_char[6] = __b;
  res.as_char[7] = __b;
  return (res.as_m64);
#endif
}
#endif /* __powerpc64__ */
#endif /* _MMINTRIN_H_INCLUDED */