/* Copyright (C) 2002-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 MMX (__m64) intrinsics, the PowerPC
   target does not support a native __vector_size__ (8) type.  Instead
   we typedef __m64 to a 64-bit unsigned long long, which is natively
   supported in 64-bit mode.  This works well for the _si64 and some
   _pi32 operations, but starts to generate long sequences for _pi16
   and _pi8 operations.  For those cases it is better (faster and
   smaller code) to transfer __m64 data to the PowerPC vector 128-bit
   unit, perform the operation, and then transfer the result back to
   the __m64 type.  This implies that the direct register move
   instructions, introduced with power8, are available for efficient
   implementation of these transfers.

   Most MMX intrinsic operations can be performed efficiently as
   C language 64-bit scalar operations or optimized to use the newer
   128-bit SSE/Altivec operations.  We recommend this for new
   applications.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
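
/* Illustrative sketch only (not part of this header): the rewrite
   recommended above.  An MMX-style byte-wise add expressed in GNU C
   vector extensions needs no intrinsics at all, and GCC can map it to
   a single vector add instruction where VMX/VSX is available.  The
   type and function names here are example names:

     typedef signed char __v16qi_ex __attribute__ ((__vector_size__ (16)));

     static inline __v16qi_ex
     __add_bytes_ex (__v16qi_ex __a, __v16qi_ex __b)
     {
       return __a + __b;
     }
*/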

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__ ((__aligned__ (8),
			__may_alias__)) unsigned long long __m64;

typedef __attribute__ ((__aligned__ (8)))
union
  {
    __m64 as_m64;
    char as_char[8];
    signed char as_signed_char [8];
    short as_short[4];
    int as_int[2];
    long long as_long_long;
    float as_float[2];
    double as_double;
  } __m64_union;
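
/* Worked example (illustration only, little-endian): __m64_union gives
   lane-level access to an __m64, with low-order lanes in low-numbered
   array elements:

     __m64_union __u;
     __u.as_m64 = 0x0706050403020100ULL;

   after which __u.as_char[0] is 0x00, __u.as_short[3] is 0x0706, and
   __u.as_int[1] is 0x07060504.  */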

/* Empty the multimedia state.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  /* nothing to do on PowerPC.  */
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) (unsigned int) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return ((int) __i);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long) __i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long) __i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long) __i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int) (__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  __vresult = vec_packs (__vm1, __vm1);
  return (__m64) ((__vector long long) __vresult)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short) (__vector long long)
#ifdef __LITTLE_ENDIAN__
        { __m1, __m2 };
#else
        { __m2, __m1 };
#endif
  const __vector signed short __zero = { 0 };
  __vector __bool short __select = vec_cmplt (__vm1, __zero);
  __r = vec_packs ((__vector unsigned short) __vm1, (__vector unsigned short) __vm1);
  __vector __bool char __packsel = vec_pack (__select, __select);
  __r = vec_sel (__r, (const __vector unsigned char) __zero, __packsel);
  return (__m64) ((__vector long long) __r)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}
#endif /* end ARCH_PWR8 */
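
/* Worked example (illustration only): packing with signed saturation.
   With __m1 = _mm_set_pi16 (300, -300, 42, -1) and
   __m2 = _mm_set_pi16 (1, 2, 3, 4),

     _mm_packs_pi16 (__m1, __m2)

   yields the bytes -1, 42, -128, 127, 4, 3, 2, 1 from least to most
   significant; the out-of-range shorts -300 and 300 saturate to -128
   and 127.  */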

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_mergel (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64) __res.as_m64;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}
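
/* Worked example (illustration only): interleaving low halves.  With
   __m1 = _mm_set_pi8 (17, 16, 15, 14, 13, 12, 11, 10) and
   __m2 = _mm_set_pi8 (27, 26, 25, 24, 23, 22, 21, 20),

     _mm_unpacklo_pi8 (__m1, __m2)

   yields the bytes 10, 20, 11, 21, 12, 22, 13, 23 from least to most
   significant, i.e. the low four bytes of the operands interleaved.  */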

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_add (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = vec_sub (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 + __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, const int __count)
{
  return (__m << __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, const int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, const int __count)
{
  return (__m >> __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, const int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return (~__m1 & __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 | __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return (__m1 ^ __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64) 0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__(
      "cmpb %0,%1,%2;\n"
      : "=r" (__res)
      : "r" (__m1),
	"r" (__m2)
      : );
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;

  return (__m64) __res.as_m64;
#endif
}
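
/* Worked example (illustration only): the POWER6 cmpb instruction used
   above compares corresponding bytes, producing 0xFF where they are
   equal and 0x00 where they differ:

     __m64 __a = _mm_set_pi8 (1, 2, 3, 4, 5, 6, 7, 8);
     __m64 __b = _mm_set_pi8 (1, 0, 3, 0, 5, 0, 7, 0);
     __m64 __r = _mm_cmpeq_pi8 (__a, __b);

   leaves __r equal to 0xFF00FF00FF00FF00ULL, the same mask that x86
   PCMPEQB would produce.  */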

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = (__vector signed char)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = (__vector signed short)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpeq (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats (__m1);
  __b = (__vector signed int)vec_splats (__m2);
  __c = (__vector signed int)vec_cmpgt (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;

  return (__m64) __res.as_m64;
#endif
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_adds (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats (__m1);
  __b = (__vector signed char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats (__m1);
  __b = (__vector unsigned char)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats (__m1);
  __b = (__vector unsigned short)vec_splats (__m2);
  __c = vec_subs (__a, __b);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = vec_vmsumshm (__a, __b, __zero);
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}
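
/* Worked example (illustration only): with
   __m1 = _mm_set_pi16 (4, 3, 2, 1) and __m2 = _mm_set_pi16 (8, 7, 6, 5),
   _mm_madd_pi16 forms the products 1*5, 2*6, 3*7 and 4*8, then sums them
   in pairs, leaving the 32-bit values 1*5 + 2*6 = 17 in the low word and
   3*7 + 4*8 = 53 in the high word.  */
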
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
#endif
    };

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);

  __w0 = vec_vmulesh (__a, __b);
  __w1 = vec_vmulosh (__a, __b);
  __c = (__vector signed short)vec_perm (__w0, __w1, __xform1);

  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats (__m1);
  __b = (__vector signed short)vec_splats (__m2);
  __c = __a * __b;
  return (__m64) ((__vector long long) __c)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sl (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}
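
/* Note (illustration only): as with the x86 MMX shifts, a count greater
   than 15 clears every lane, so
   _mm_sll_pi16 (_mm_set1_pi16 (1), _mm_cvtsi32_si64 (16)) is 0, while a
   count of 15 leaves 0x8000 in each 16-bit lane.  */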

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector signed short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sra (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15)
    {
      __r = (__vector unsigned short)vec_splats (__m);
      __c = (__vector unsigned short)vec_splats ((unsigned short)__count);
      __r = vec_sr (__r, (__vector unsigned short)__c);
      return (__m64) ((__vector long long) __r)[0];
    }
  else
    return (0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32 (__m, __count);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
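
/* Worked example (illustration only): _mm_set_* takes arguments from
   most to least significant element, _mm_setr_* from least to most, so
   _mm_set_pi32 (2, 1) and _mm_setr_pi32 (1, 2) both place 1 in the low
   32 bits and 2 in the high 32 bits.  */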

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
#if _ARCH_PWR9
  /* Use a reserved name for the temporary; a bare "w" could collide
     with user macros in a system header.  */
  __vector signed short __w1;

  __w1 = (__vector signed short)vec_splats (__w);
  return (__m64) ((__vector long long) __w1)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (signed char __b)
{
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats (__b);
  return (__m64) ((__vector long long) __res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}
#endif /* _MMINTRIN_H_INCLUDED */