/* mmintrin.h — GCC iWMMXt (ARM) MMX-compatibility intrinsics, revision 1.1.1.7.  */
      1  1.1.1.7  mrg /* Copyright (C) 2002-2019 Free Software Foundation, Inc.
      2      1.1  mrg 
      3      1.1  mrg    This file is part of GCC.
      4      1.1  mrg 
      5      1.1  mrg    GCC is free software; you can redistribute it and/or modify it
      6      1.1  mrg    under the terms of the GNU General Public License as published
      7      1.1  mrg    by the Free Software Foundation; either version 3, or (at your
      8      1.1  mrg    option) any later version.
      9      1.1  mrg 
     10      1.1  mrg    GCC is distributed in the hope that it will be useful, but WITHOUT
     11      1.1  mrg    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
     12      1.1  mrg    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
     13      1.1  mrg    License for more details.
     14      1.1  mrg 
     15      1.1  mrg    Under Section 7 of GPL version 3, you are granted additional
     16      1.1  mrg    permissions described in the GCC Runtime Library Exception, version
     17      1.1  mrg    3.1, as published by the Free Software Foundation.
     18      1.1  mrg 
     19      1.1  mrg    You should have received a copy of the GNU General Public License and
     20      1.1  mrg    a copy of the GCC Runtime Library Exception along with this program;
     21      1.1  mrg    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     22      1.1  mrg    <http://www.gnu.org/licenses/>.  */
     23      1.1  mrg 
     24      1.1  mrg #ifndef _MMINTRIN_H_INCLUDED
     25      1.1  mrg #define _MMINTRIN_H_INCLUDED
     26      1.1  mrg 
     27  1.1.1.2  mrg #ifndef __IWMMXT__
     28  1.1.1.2  mrg #error mmintrin.h included without enabling WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2)
     29  1.1.1.2  mrg #endif
     30  1.1.1.2  mrg 
     31  1.1.1.2  mrg 
     32  1.1.1.2  mrg #if defined __cplusplus
     33  1.1.1.2  mrg extern "C" {
     34  1.1.1.2  mrg /* Intrinsics use C name-mangling.  */
     35  1.1.1.2  mrg #endif /* __cplusplus */
     36  1.1.1.2  mrg 
/* The data type intended for user use.  NOTE: unlike the x86 MMX header,
   both names alias a 64-bit *unsigned* integer type here.  */
typedef unsigned long long __m64, __int64;

/* Internal data types for implementing the intrinsics.  All three are
   64-bit GCC generic vectors used only to give the builtins the element
   widths they expect.  */
typedef int __v2si __attribute__ ((vector_size (8)));		/* 2 x 32-bit */
typedef short __v4hi __attribute__ ((vector_size (8)));		/* 4 x 16-bit */
typedef signed char __v8qi __attribute__ ((vector_size (8)));	/* 8 x 8-bit */
     44  1.1.1.2  mrg 
/* Provided for source compatibility with MMX.  On x86 this would emit
   EMMS; iWMMXt has no comparable FP/MMX state to reset, so this is
   deliberately a no-op.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
}
     50      1.1  mrg 
/* "Convert" __m64 and __int64 into each other.  Both are typedefs for
   the same 64-bit unsigned type (see above), so these are pure identity
   functions kept only for API compatibility.  */
static __inline __m64
_mm_cvtsi64_m64 (__int64 __i)
{
  return __i;
}

static __inline __int64
_mm_cvtm64_si64 (__m64 __i)
{
  return __i;
}
     63      1.1  mrg 
     64      1.1  mrg static __inline int
     65      1.1  mrg _mm_cvtsi64_si32 (__int64 __i)
     66      1.1  mrg {
     67      1.1  mrg   return __i;
     68      1.1  mrg }
     69      1.1  mrg 
     70      1.1  mrg static __inline __int64
     71      1.1  mrg _mm_cvtsi32_si64 (int __i)
     72      1.1  mrg {
     73  1.1.1.2  mrg   return (__i & 0xffffffff);
     74      1.1  mrg }
     75      1.1  mrg 
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  (WPACKHSS)  */
static __inline __m64
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackhss ((__v4hi)__m1, (__v4hi)__m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  (WPACKWSS)  */
static __inline __m64
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackwss ((__v2si)__m1, (__v2si)__m2);
}

/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
   the 64-bit value from M2 into the upper 32-bits of the result, all with
   signed saturation for values that do not fit exactly into 32-bits.
   (WPACKDSS)  */
static __inline __m64
_mm_packs_pi64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackdss ((long long)__m1, (long long)__m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  (WPACKHUS)  */
static __inline __m64
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackhus ((__v4hi)__m1, (__v4hi)__m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with unsigned saturation.  (WPACKWUS)  */
static __inline __m64
_mm_packs_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackwus ((__v2si)__m1, (__v2si)__m2);
}

/* Copy the 64-bit value from M1 into the lower 32-bits of the result, and
   the 64-bit value from M2 into the upper 32-bits of the result, all with
   unsigned saturation for values that do not fit exactly into 32-bits.
   (WPACKDUS)  */
static __inline __m64
_mm_packs_pu64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wpackdus ((long long)__m1, (long long)__m2);
}
    129      1.1  mrg 
/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  M1's elements land in the
   even positions of the result, M2's in the odd.  */
static __inline __m64
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckihb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
static __inline __m64
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckihh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
static __inline __m64
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckihw ((__v2si)__m1, (__v2si)__m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
static __inline __m64
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckilb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
static __inline __m64
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckilh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
static __inline __m64
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wunpckilw ((__v2si)__m1, (__v2si)__m2);
}
    177      1.1  mrg 
/* The _mm_unpacke{l,h}_p{i,u}N family below widens one half of a single
   operand: "el"/"eh" select the low/high half, "pi"/"pu" select sign- or
   zero-extension.  These have no x86 MMX equivalent; they wrap the
   iWMMXt WUNPCKE{L,H}{S,U}{B,H,W} instructions directly.  */

/* Take the four 8-bit values from the low half of M1, sign extend them,
   and return the result as a vector of four 16-bit quantities.  */
static __inline __m64
_mm_unpackel_pi8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelsb ((__v8qi)__m1);
}

/* Take the two 16-bit values from the low half of M1, sign extend them,
   and return the result as a vector of two 32-bit quantities.  */
static __inline __m64
_mm_unpackel_pi16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelsh ((__v4hi)__m1);
}

/* Take the 32-bit value from the low half of M1, and return it sign extended
  to 64 bits.  */
static __inline __m64
_mm_unpackel_pi32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelsw ((__v2si)__m1);
}

/* Take the four 8-bit values from the high half of M1, sign extend them,
   and return the result as a vector of four 16-bit quantities.  */
static __inline __m64
_mm_unpackeh_pi8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehsb ((__v8qi)__m1);
}

/* Take the two 16-bit values from the high half of M1, sign extend them,
   and return the result as a vector of two 32-bit quantities.  */
static __inline __m64
_mm_unpackeh_pi16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehsh ((__v4hi)__m1);
}

/* Take the 32-bit value from the high half of M1, and return it sign extended
  to 64 bits.  */
static __inline __m64
_mm_unpackeh_pi32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehsw ((__v2si)__m1);
}

/* Take the four 8-bit values from the low half of M1, zero extend them,
   and return the result as a vector of four 16-bit quantities.  */
static __inline __m64
_mm_unpackel_pu8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckelub ((__v8qi)__m1);
}

/* Take the two 16-bit values from the low half of M1, zero extend them,
   and return the result as a vector of two 32-bit quantities.  */
static __inline __m64
_mm_unpackel_pu16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckeluh ((__v4hi)__m1);
}

/* Take the 32-bit value from the low half of M1, and return it zero extended
  to 64 bits.  */
static __inline __m64
_mm_unpackel_pu32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckeluw ((__v2si)__m1);
}

/* Take the four 8-bit values from the high half of M1, zero extend them,
   and return the result as a vector of four 16-bit quantities.  */
static __inline __m64
_mm_unpackeh_pu8 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehub ((__v8qi)__m1);
}

/* Take the two 16-bit values from the high half of M1, zero extend them,
   and return the result as a vector of two 32-bit quantities.  */
static __inline __m64
_mm_unpackeh_pu16 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehuh ((__v4hi)__m1);
}

/* Take the 32-bit value from the high half of M1, and return it zero extended
  to 64 bits.  */
static __inline __m64
_mm_unpackeh_pu32 (__m64 __m1)
{
  return (__m64) __builtin_arm_wunpckehuw ((__v2si)__m1);
}
    273      1.1  mrg 
/* Add the 8-bit values in M1 to the 8-bit values in M2.  Addition wraps
   modulo 2^8; use the _mm_adds_* variants below for saturation.  */
static __inline __m64
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
static __inline __m64
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
static __inline __m64
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddw ((__v2si)__m1, (__v2si)__m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddbss ((__v8qi)__m1, (__v8qi)__m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddhss ((__v4hi)__m1, (__v4hi)__m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2 using signed
   saturated arithmetic.  (No x86 MMX equivalent for 32-bit saturation.)  */
static __inline __m64
_mm_adds_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddwss ((__v2si)__m1, (__v2si)__m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddbus ((__v8qi)__m1, (__v8qi)__m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddhus ((__v4hi)__m1, (__v4hi)__m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2 using unsigned
   saturated arithmetic.  */
static __inline __m64
_mm_adds_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_waddwus ((__v2si)__m1, (__v2si)__m2);
}
    342      1.1  mrg 
/* Subtract the 8-bit values in M2 from the 8-bit values in M1.
   Subtraction wraps; use the _mm_subs_* variants below for saturation.  */
static __inline __m64
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
static __inline __m64
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubh ((__v4hi)__m1, (__v4hi)__m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
static __inline __m64
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubw ((__v2si)__m1, (__v2si)__m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
static __inline __m64
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubbss ((__v8qi)__m1, (__v8qi)__m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
static __inline __m64
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubhss ((__v4hi)__m1, (__v4hi)__m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
   signed saturating arithmetic.  */
static __inline __m64
_mm_subs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubwss ((__v2si)__m1, (__v2si)__m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubbus ((__v8qi)__m1, (__v8qi)__m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubhus ((__v4hi)__m1, (__v4hi)__m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1 using
   unsigned saturating arithmetic.  */
static __inline __m64
_mm_subs_pu32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wsubwus ((__v2si)__m1, (__v2si)__m2);
}
    411      1.1  mrg 
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  Operands are treated as signed (WMADDS).  */
static __inline __m64
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmadds ((__v4hi)__m1, (__v4hi)__m2);
}

/* As above, but treating the 16-bit operands as unsigned (WMADDU).  */
static __inline __m64
_mm_madd_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmaddu ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
static __inline __m64
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmulsm ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four unsigned 16-bit values in M1 by four unsigned 16-bit
   values in M2 and produce the high 16 bits of the 32-bit results.
   (This wraps the unsigned WMULUM builtin; the old comment incorrectly
   said "signed".)  */
static __inline __m64
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmulum ((__v4hi)__m1, (__v4hi)__m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results (signedness is irrelevant here).  */
static __inline __m64
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_arm_wmulul ((__v4hi)__m1, (__v4hi)__m2);
}
    453      1.1  mrg 
/* Shift four 16-bit values in M left by COUNT (count in a 64-bit reg).  */
static __inline __m64
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsllh ((__v4hi)__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsllhi ((__v4hi)__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
static __inline __m64
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsllw ((__v2si)__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsllwi ((__v2si)__m, __count);
}

/* Shift the 64-bit value in M left by COUNT.  */
static __inline __m64
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wslld (__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wslldi (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
static __inline __m64
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrah ((__v4hi)__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrahi ((__v4hi)__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
static __inline __m64
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsraw ((__v2si)__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrawi ((__v2si)__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in the sign bit.  */
static __inline __m64
_mm_sra_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrad (__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_srai_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsradi (__m, __count);
}
    531      1.1  mrg 
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
static __inline __m64
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrlh ((__v4hi)__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrlhi ((__v4hi)__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
static __inline __m64
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrlw ((__v2si)__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrlwi ((__v2si)__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.
   (The old comment said "left"; WSRLD is a logical right shift.)  */
static __inline __m64
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wsrld (__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wsrldi (__m, __count);
}
    570      1.1  mrg 
/* Rotate four 16-bit values in M right by COUNT.  (No x86 MMX
   equivalent; these wrap the iWMMXt WROR instructions.)  */
static __inline __m64
_mm_ror_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wrorh ((__v4hi)__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_rori_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wrorhi ((__v4hi)__m, __count);
}

/* Rotate two 32-bit values in M right by COUNT.  */
static __inline __m64
_mm_ror_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wrorw ((__v2si)__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_rori_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wrorwi ((__v2si)__m, __count);
}

/* Rotate the single 64-bit value in M right by COUNT.  */
static __inline __m64
_mm_ror_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_arm_wrord (__m, __count);
}

/* Likewise, with the count given as a plain integer.  */
static __inline __m64
_mm_rori_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_arm_wrordi (__m, __count);
}
    609      1.1  mrg 
/* Bit-wise AND the 64-bit values in M1 and M2.  */
static __inline __m64
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wand (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2, i.e. (~M1) & M2.  NOTE(review): the arguments are
   deliberately swapped relative to the intrinsic's parameter order
   (changed in rev 1.1.1.2) — presumably __builtin_arm_wandn complements
   its *second* operand; verify against the GCC builtin definition.  */
static __inline __m64
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wandn (__m2, __m1);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
static __inline __m64
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wor (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
static __inline __m64
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_arm_wxor (__m1, __m2);
}
    638      1.1  mrg 
    639      1.1  mrg /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
    640      1.1  mrg    test is true and zero if false.  */
    641      1.1  mrg static __inline __m64
    642      1.1  mrg _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
    643      1.1  mrg {
    644      1.1  mrg   return (__m64) __builtin_arm_wcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
    645      1.1  mrg }
    646      1.1  mrg 
    647      1.1  mrg static __inline __m64
    648      1.1  mrg _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
    649      1.1  mrg {
    650      1.1  mrg   return (__m64) __builtin_arm_wcmpgtsb ((__v8qi)__m1, (__v8qi)__m2);
    651      1.1  mrg }
    652      1.1  mrg 
    653      1.1  mrg static __inline __m64
    654      1.1  mrg _mm_cmpgt_pu8 (__m64 __m1, __m64 __m2)
    655      1.1  mrg {
    656      1.1  mrg   return (__m64) __builtin_arm_wcmpgtub ((__v8qi)__m1, (__v8qi)__m2);
    657      1.1  mrg }
    658      1.1  mrg 
    659      1.1  mrg /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
    660      1.1  mrg    the test is true and zero if false.  */
    661      1.1  mrg static __inline __m64
    662      1.1  mrg _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
    663      1.1  mrg {
    664      1.1  mrg   return (__m64) __builtin_arm_wcmpeqh ((__v4hi)__m1, (__v4hi)__m2);
    665      1.1  mrg }
    666      1.1  mrg 
    667      1.1  mrg static __inline __m64
    668      1.1  mrg _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
    669      1.1  mrg {
    670      1.1  mrg   return (__m64) __builtin_arm_wcmpgtsh ((__v4hi)__m1, (__v4hi)__m2);
    671      1.1  mrg }
    672      1.1  mrg 
    673      1.1  mrg static __inline __m64
    674      1.1  mrg _mm_cmpgt_pu16 (__m64 __m1, __m64 __m2)
    675      1.1  mrg {
    676      1.1  mrg   return (__m64) __builtin_arm_wcmpgtuh ((__v4hi)__m1, (__v4hi)__m2);
    677      1.1  mrg }
    678      1.1  mrg 
    679      1.1  mrg /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
    680      1.1  mrg    the test is true and zero if false.  */
    681      1.1  mrg static __inline __m64
    682      1.1  mrg _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
    683      1.1  mrg {
    684      1.1  mrg   return (__m64) __builtin_arm_wcmpeqw ((__v2si)__m1, (__v2si)__m2);
    685      1.1  mrg }
    686      1.1  mrg 
    687      1.1  mrg static __inline __m64
    688      1.1  mrg _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
    689      1.1  mrg {
    690      1.1  mrg   return (__m64) __builtin_arm_wcmpgtsw ((__v2si)__m1, (__v2si)__m2);
    691      1.1  mrg }
    692      1.1  mrg 
    693      1.1  mrg static __inline __m64
    694      1.1  mrg _mm_cmpgt_pu32 (__m64 __m1, __m64 __m2)
    695      1.1  mrg {
    696      1.1  mrg   return (__m64) __builtin_arm_wcmpgtuw ((__v2si)__m1, (__v2si)__m2);
    697      1.1  mrg }
    698      1.1  mrg 
    699      1.1  mrg /* Element-wise multiplication of unsigned 16-bit values __B and __C, followed
    700      1.1  mrg    by accumulate across all elements and __A.  */
    701      1.1  mrg static __inline __m64
    702      1.1  mrg _mm_mac_pu16 (__m64 __A, __m64 __B, __m64 __C)
    703      1.1  mrg {
    704      1.1  mrg   return __builtin_arm_wmacu (__A, (__v4hi)__B, (__v4hi)__C);
    705      1.1  mrg }
    706      1.1  mrg 
    707      1.1  mrg /* Element-wise multiplication of signed 16-bit values __B and __C, followed
    708      1.1  mrg    by accumulate across all elements and __A.  */
    709      1.1  mrg static __inline __m64
    710      1.1  mrg _mm_mac_pi16 (__m64 __A, __m64 __B, __m64 __C)
    711      1.1  mrg {
    712      1.1  mrg   return __builtin_arm_wmacs (__A, (__v4hi)__B, (__v4hi)__C);
    713      1.1  mrg }
    714      1.1  mrg 
    715      1.1  mrg /* Element-wise multiplication of unsigned 16-bit values __B and __C, followed
    716      1.1  mrg    by accumulate across all elements.  */
    717      1.1  mrg static __inline __m64
    718      1.1  mrg _mm_macz_pu16 (__m64 __A, __m64 __B)
    719      1.1  mrg {
    720      1.1  mrg   return __builtin_arm_wmacuz ((__v4hi)__A, (__v4hi)__B);
    721      1.1  mrg }
    722      1.1  mrg 
    723      1.1  mrg /* Element-wise multiplication of signed 16-bit values __B and __C, followed
    724      1.1  mrg    by accumulate across all elements.  */
    725      1.1  mrg static __inline __m64
    726      1.1  mrg _mm_macz_pi16 (__m64 __A, __m64 __B)
    727      1.1  mrg {
    728      1.1  mrg   return __builtin_arm_wmacsz ((__v4hi)__A, (__v4hi)__B);
    729      1.1  mrg }
    730      1.1  mrg 
    731      1.1  mrg /* Accumulate across all unsigned 8-bit values in __A.  */
    732      1.1  mrg static __inline __m64
    733      1.1  mrg _mm_acc_pu8 (__m64 __A)
    734      1.1  mrg {
    735      1.1  mrg   return __builtin_arm_waccb ((__v8qi)__A);
    736      1.1  mrg }
    737      1.1  mrg 
    738      1.1  mrg /* Accumulate across all unsigned 16-bit values in __A.  */
    739      1.1  mrg static __inline __m64
    740      1.1  mrg _mm_acc_pu16 (__m64 __A)
    741      1.1  mrg {
    742      1.1  mrg   return __builtin_arm_wacch ((__v4hi)__A);
    743      1.1  mrg }
    744      1.1  mrg 
    745      1.1  mrg /* Accumulate across all unsigned 32-bit values in __A.  */
    746      1.1  mrg static __inline __m64
    747      1.1  mrg _mm_acc_pu32 (__m64 __A)
    748      1.1  mrg {
    749      1.1  mrg   return __builtin_arm_waccw ((__v2si)__A);
    750      1.1  mrg }
    751      1.1  mrg 
    752      1.1  mrg static __inline __m64
    753      1.1  mrg _mm_mia_si64 (__m64 __A, int __B, int __C)
    754      1.1  mrg {
    755      1.1  mrg   return __builtin_arm_tmia (__A, __B, __C);
    756      1.1  mrg }
    757      1.1  mrg 
    758      1.1  mrg static __inline __m64
    759      1.1  mrg _mm_miaph_si64 (__m64 __A, int __B, int __C)
    760      1.1  mrg {
    761      1.1  mrg   return __builtin_arm_tmiaph (__A, __B, __C);
    762      1.1  mrg }
    763      1.1  mrg 
    764      1.1  mrg static __inline __m64
    765      1.1  mrg _mm_miabb_si64 (__m64 __A, int __B, int __C)
    766      1.1  mrg {
    767      1.1  mrg   return __builtin_arm_tmiabb (__A, __B, __C);
    768      1.1  mrg }
    769      1.1  mrg 
    770      1.1  mrg static __inline __m64
    771      1.1  mrg _mm_miabt_si64 (__m64 __A, int __B, int __C)
    772      1.1  mrg {
    773      1.1  mrg   return __builtin_arm_tmiabt (__A, __B, __C);
    774      1.1  mrg }
    775      1.1  mrg 
    776      1.1  mrg static __inline __m64
    777      1.1  mrg _mm_miatb_si64 (__m64 __A, int __B, int __C)
    778      1.1  mrg {
    779      1.1  mrg   return __builtin_arm_tmiatb (__A, __B, __C);
    780      1.1  mrg }
    781      1.1  mrg 
    782      1.1  mrg static __inline __m64
    783      1.1  mrg _mm_miatt_si64 (__m64 __A, int __B, int __C)
    784      1.1  mrg {
    785      1.1  mrg   return __builtin_arm_tmiatt (__A, __B, __C);
    786      1.1  mrg }
    787      1.1  mrg 
    788      1.1  mrg /* Extract one of the elements of A and sign extend.  The selector N must
    789      1.1  mrg    be immediate.  */
    790      1.1  mrg #define _mm_extract_pi8(A, N) __builtin_arm_textrmsb ((__v8qi)(A), (N))
    791      1.1  mrg #define _mm_extract_pi16(A, N) __builtin_arm_textrmsh ((__v4hi)(A), (N))
    792      1.1  mrg #define _mm_extract_pi32(A, N) __builtin_arm_textrmsw ((__v2si)(A), (N))
    793      1.1  mrg 
    794      1.1  mrg /* Extract one of the elements of A and zero extend.  The selector N must
    795      1.1  mrg    be immediate.  */
    796      1.1  mrg #define _mm_extract_pu8(A, N) __builtin_arm_textrmub ((__v8qi)(A), (N))
    797      1.1  mrg #define _mm_extract_pu16(A, N) __builtin_arm_textrmuh ((__v4hi)(A), (N))
    798      1.1  mrg #define _mm_extract_pu32(A, N) __builtin_arm_textrmuw ((__v2si)(A), (N))
    799      1.1  mrg 
    800      1.1  mrg /* Inserts word D into one of the elements of A.  The selector N must be
    801      1.1  mrg    immediate.  */
    802      1.1  mrg #define _mm_insert_pi8(A, D, N) \
    803      1.1  mrg   ((__m64) __builtin_arm_tinsrb ((__v8qi)(A), (D), (N)))
    804      1.1  mrg #define _mm_insert_pi16(A, D, N) \
    805      1.1  mrg   ((__m64) __builtin_arm_tinsrh ((__v4hi)(A), (D), (N)))
    806      1.1  mrg #define _mm_insert_pi32(A, D, N) \
    807      1.1  mrg   ((__m64) __builtin_arm_tinsrw ((__v2si)(A), (D), (N)))
    808      1.1  mrg 
    809      1.1  mrg /* Compute the element-wise maximum of signed 8-bit values.  */
    810      1.1  mrg static __inline __m64
    811      1.1  mrg _mm_max_pi8 (__m64 __A, __m64 __B)
    812      1.1  mrg {
    813      1.1  mrg   return (__m64) __builtin_arm_wmaxsb ((__v8qi)__A, (__v8qi)__B);
    814      1.1  mrg }
    815      1.1  mrg 
    816      1.1  mrg /* Compute the element-wise maximum of signed 16-bit values.  */
    817      1.1  mrg static __inline __m64
    818      1.1  mrg _mm_max_pi16 (__m64 __A, __m64 __B)
    819      1.1  mrg {
    820      1.1  mrg   return (__m64) __builtin_arm_wmaxsh ((__v4hi)__A, (__v4hi)__B);
    821      1.1  mrg }
    822      1.1  mrg 
    823      1.1  mrg /* Compute the element-wise maximum of signed 32-bit values.  */
    824      1.1  mrg static __inline __m64
    825      1.1  mrg _mm_max_pi32 (__m64 __A, __m64 __B)
    826      1.1  mrg {
    827      1.1  mrg   return (__m64) __builtin_arm_wmaxsw ((__v2si)__A, (__v2si)__B);
    828      1.1  mrg }
    829      1.1  mrg 
    830      1.1  mrg /* Compute the element-wise maximum of unsigned 8-bit values.  */
    831      1.1  mrg static __inline __m64
    832      1.1  mrg _mm_max_pu8 (__m64 __A, __m64 __B)
    833      1.1  mrg {
    834      1.1  mrg   return (__m64) __builtin_arm_wmaxub ((__v8qi)__A, (__v8qi)__B);
    835      1.1  mrg }
    836      1.1  mrg 
    837      1.1  mrg /* Compute the element-wise maximum of unsigned 16-bit values.  */
    838      1.1  mrg static __inline __m64
    839      1.1  mrg _mm_max_pu16 (__m64 __A, __m64 __B)
    840      1.1  mrg {
    841      1.1  mrg   return (__m64) __builtin_arm_wmaxuh ((__v4hi)__A, (__v4hi)__B);
    842      1.1  mrg }
    843      1.1  mrg 
    844      1.1  mrg /* Compute the element-wise maximum of unsigned 32-bit values.  */
    845      1.1  mrg static __inline __m64
    846      1.1  mrg _mm_max_pu32 (__m64 __A, __m64 __B)
    847      1.1  mrg {
    848      1.1  mrg   return (__m64) __builtin_arm_wmaxuw ((__v2si)__A, (__v2si)__B);
    849      1.1  mrg }
    850      1.1  mrg 
    851      1.1  mrg /* Compute the element-wise minimum of signed 16-bit values.  */
    852      1.1  mrg static __inline __m64
    853      1.1  mrg _mm_min_pi8 (__m64 __A, __m64 __B)
    854      1.1  mrg {
    855      1.1  mrg   return (__m64) __builtin_arm_wminsb ((__v8qi)__A, (__v8qi)__B);
    856      1.1  mrg }
    857      1.1  mrg 
    858      1.1  mrg /* Compute the element-wise minimum of signed 16-bit values.  */
    859      1.1  mrg static __inline __m64
    860      1.1  mrg _mm_min_pi16 (__m64 __A, __m64 __B)
    861      1.1  mrg {
    862      1.1  mrg   return (__m64) __builtin_arm_wminsh ((__v4hi)__A, (__v4hi)__B);
    863      1.1  mrg }
    864      1.1  mrg 
    865      1.1  mrg /* Compute the element-wise minimum of signed 32-bit values.  */
    866      1.1  mrg static __inline __m64
    867      1.1  mrg _mm_min_pi32 (__m64 __A, __m64 __B)
    868      1.1  mrg {
    869      1.1  mrg   return (__m64) __builtin_arm_wminsw ((__v2si)__A, (__v2si)__B);
    870      1.1  mrg }
    871      1.1  mrg 
    872      1.1  mrg /* Compute the element-wise minimum of unsigned 16-bit values.  */
    873      1.1  mrg static __inline __m64
    874      1.1  mrg _mm_min_pu8 (__m64 __A, __m64 __B)
    875      1.1  mrg {
    876      1.1  mrg   return (__m64) __builtin_arm_wminub ((__v8qi)__A, (__v8qi)__B);
    877      1.1  mrg }
    878      1.1  mrg 
    879      1.1  mrg /* Compute the element-wise minimum of unsigned 16-bit values.  */
    880      1.1  mrg static __inline __m64
    881      1.1  mrg _mm_min_pu16 (__m64 __A, __m64 __B)
    882      1.1  mrg {
    883      1.1  mrg   return (__m64) __builtin_arm_wminuh ((__v4hi)__A, (__v4hi)__B);
    884      1.1  mrg }
    885      1.1  mrg 
    886      1.1  mrg /* Compute the element-wise minimum of unsigned 32-bit values.  */
    887      1.1  mrg static __inline __m64
    888      1.1  mrg _mm_min_pu32 (__m64 __A, __m64 __B)
    889      1.1  mrg {
    890      1.1  mrg   return (__m64) __builtin_arm_wminuw ((__v2si)__A, (__v2si)__B);
    891      1.1  mrg }
    892      1.1  mrg 
    893      1.1  mrg /* Create an 8-bit mask of the signs of 8-bit values.  */
    894      1.1  mrg static __inline int
    895      1.1  mrg _mm_movemask_pi8 (__m64 __A)
    896      1.1  mrg {
    897      1.1  mrg   return __builtin_arm_tmovmskb ((__v8qi)__A);
    898      1.1  mrg }
    899      1.1  mrg 
    900      1.1  mrg /* Create an 8-bit mask of the signs of 16-bit values.  */
    901      1.1  mrg static __inline int
    902      1.1  mrg _mm_movemask_pi16 (__m64 __A)
    903      1.1  mrg {
    904      1.1  mrg   return __builtin_arm_tmovmskh ((__v4hi)__A);
    905      1.1  mrg }
    906      1.1  mrg 
    907      1.1  mrg /* Create an 8-bit mask of the signs of 32-bit values.  */
    908      1.1  mrg static __inline int
    909      1.1  mrg _mm_movemask_pi32 (__m64 __A)
    910      1.1  mrg {
    911      1.1  mrg   return __builtin_arm_tmovmskw ((__v2si)__A);
    912      1.1  mrg }
    913      1.1  mrg 
    914      1.1  mrg /* Return a combination of the four 16-bit values in A.  The selector
    915      1.1  mrg    must be an immediate.  */
    916      1.1  mrg #define _mm_shuffle_pi16(A, N) \
    917      1.1  mrg   ((__m64) __builtin_arm_wshufh ((__v4hi)(A), (N)))
    918      1.1  mrg 
    919      1.1  mrg 
    920      1.1  mrg /* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
    921      1.1  mrg static __inline __m64
    922      1.1  mrg _mm_avg_pu8 (__m64 __A, __m64 __B)
    923      1.1  mrg {
    924      1.1  mrg   return (__m64) __builtin_arm_wavg2br ((__v8qi)__A, (__v8qi)__B);
    925      1.1  mrg }
    926      1.1  mrg 
    927      1.1  mrg /* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
    928      1.1  mrg static __inline __m64
    929      1.1  mrg _mm_avg_pu16 (__m64 __A, __m64 __B)
    930      1.1  mrg {
    931      1.1  mrg   return (__m64) __builtin_arm_wavg2hr ((__v4hi)__A, (__v4hi)__B);
    932      1.1  mrg }
    933      1.1  mrg 
    934      1.1  mrg /* Compute the averages of the unsigned 8-bit values in A and B.  */
    935      1.1  mrg static __inline __m64
    936      1.1  mrg _mm_avg2_pu8 (__m64 __A, __m64 __B)
    937      1.1  mrg {
    938      1.1  mrg   return (__m64) __builtin_arm_wavg2b ((__v8qi)__A, (__v8qi)__B);
    939      1.1  mrg }
    940      1.1  mrg 
    941      1.1  mrg /* Compute the averages of the unsigned 16-bit values in A and B.  */
    942      1.1  mrg static __inline __m64
    943      1.1  mrg _mm_avg2_pu16 (__m64 __A, __m64 __B)
    944      1.1  mrg {
    945      1.1  mrg   return (__m64) __builtin_arm_wavg2h ((__v4hi)__A, (__v4hi)__B);
    946      1.1  mrg }
    947      1.1  mrg 
    948      1.1  mrg /* Compute the sum of the absolute differences of the unsigned 8-bit
    949      1.1  mrg    values in A and B.  Return the value in the lower 16-bit word; the
    950      1.1  mrg    upper words are cleared.  */
    951      1.1  mrg static __inline __m64
    952      1.1  mrg _mm_sad_pu8 (__m64 __A, __m64 __B)
    953      1.1  mrg {
    954  1.1.1.2  mrg   return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
    955  1.1.1.2  mrg }
    956  1.1.1.2  mrg 
    957  1.1.1.2  mrg static __inline __m64
    958  1.1.1.2  mrg _mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C)
    959  1.1.1.2  mrg {
    960  1.1.1.2  mrg   return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C);
    961      1.1  mrg }
    962      1.1  mrg 
    963      1.1  mrg /* Compute the sum of the absolute differences of the unsigned 16-bit
    964      1.1  mrg    values in A and B.  Return the value in the lower 32-bit word; the
    965      1.1  mrg    upper words are cleared.  */
    966      1.1  mrg static __inline __m64
    967      1.1  mrg _mm_sad_pu16 (__m64 __A, __m64 __B)
    968      1.1  mrg {
    969  1.1.1.2  mrg   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
    970      1.1  mrg }
    971      1.1  mrg 
    972  1.1.1.2  mrg static __inline __m64
    973  1.1.1.2  mrg _mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C)
    974  1.1.1.2  mrg {
    975  1.1.1.2  mrg   return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C);
    976  1.1.1.2  mrg }
    977  1.1.1.2  mrg 
    978  1.1.1.2  mrg 
    979      1.1  mrg /* Compute the sum of the absolute differences of the unsigned 8-bit
    980      1.1  mrg    values in A and B.  Return the value in the lower 16-bit word; the
    981      1.1  mrg    upper words are cleared.  */
    982      1.1  mrg static __inline __m64
    983      1.1  mrg _mm_sadz_pu8 (__m64 __A, __m64 __B)
    984      1.1  mrg {
    985      1.1  mrg   return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B);
    986      1.1  mrg }
    987      1.1  mrg 
    988      1.1  mrg /* Compute the sum of the absolute differences of the unsigned 16-bit
    989      1.1  mrg    values in A and B.  Return the value in the lower 32-bit word; the
    990      1.1  mrg    upper words are cleared.  */
    991      1.1  mrg static __inline __m64
    992      1.1  mrg _mm_sadz_pu16 (__m64 __A, __m64 __B)
    993      1.1  mrg {
    994      1.1  mrg   return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B);
    995      1.1  mrg }
    996      1.1  mrg 
    997  1.1.1.2  mrg #define _mm_align_si64(__A,__B, N) \
    998  1.1.1.2  mrg   (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N))
    999      1.1  mrg 
   1000      1.1  mrg /* Creates a 64-bit zero.  */
   1001      1.1  mrg static __inline __m64
   1002      1.1  mrg _mm_setzero_si64 (void)
   1003      1.1  mrg {
   1004      1.1  mrg   return __builtin_arm_wzero ();
   1005      1.1  mrg }
   1006      1.1  mrg 
   1007      1.1  mrg /* Set and Get arbitrary iWMMXt Control registers.
   1008      1.1  mrg    Note only registers 0-3 and 8-11 are currently defined,
   1009      1.1  mrg    the rest are reserved.  */
   1010      1.1  mrg 
   1011      1.1  mrg static __inline void
   1012      1.1  mrg _mm_setwcx (const int __value, const int __regno)
   1013      1.1  mrg {
   1014      1.1  mrg   switch (__regno)
   1015      1.1  mrg     {
   1016  1.1.1.2  mrg     case 0:
   1017  1.1.1.2  mrg       __asm __volatile ("tmcr wcid, %0" :: "r"(__value));
   1018  1.1.1.2  mrg       break;
   1019  1.1.1.2  mrg     case 1:
   1020  1.1.1.2  mrg       __asm __volatile ("tmcr wcon, %0" :: "r"(__value));
   1021  1.1.1.2  mrg       break;
   1022  1.1.1.2  mrg     case 2:
   1023  1.1.1.2  mrg       __asm __volatile ("tmcr wcssf, %0" :: "r"(__value));
   1024  1.1.1.2  mrg       break;
   1025  1.1.1.2  mrg     case 3:
   1026  1.1.1.2  mrg       __asm __volatile ("tmcr wcasf, %0" :: "r"(__value));
   1027  1.1.1.2  mrg       break;
   1028  1.1.1.2  mrg     case 8:
   1029  1.1.1.2  mrg       __builtin_arm_setwcgr0 (__value);
   1030  1.1.1.2  mrg       break;
   1031  1.1.1.2  mrg     case 9:
   1032  1.1.1.2  mrg       __builtin_arm_setwcgr1 (__value);
   1033  1.1.1.2  mrg       break;
   1034  1.1.1.2  mrg     case 10:
   1035  1.1.1.2  mrg       __builtin_arm_setwcgr2 (__value);
   1036  1.1.1.2  mrg       break;
   1037  1.1.1.2  mrg     case 11:
   1038  1.1.1.2  mrg       __builtin_arm_setwcgr3 (__value);
   1039  1.1.1.2  mrg       break;
   1040  1.1.1.2  mrg     default:
   1041  1.1.1.2  mrg       break;
   1042      1.1  mrg     }
   1043      1.1  mrg }
   1044      1.1  mrg 
   1045      1.1  mrg static __inline int
   1046      1.1  mrg _mm_getwcx (const int __regno)
   1047      1.1  mrg {
   1048  1.1.1.2  mrg   int __value;
   1049      1.1  mrg   switch (__regno)
   1050      1.1  mrg     {
   1051  1.1.1.2  mrg     case 0:
   1052  1.1.1.2  mrg       __asm __volatile ("tmrc %0, wcid" : "=r"(__value));
   1053  1.1.1.2  mrg       break;
   1054  1.1.1.2  mrg     case 1:
   1055  1.1.1.2  mrg       __asm __volatile ("tmrc %0, wcon" : "=r"(__value));
   1056  1.1.1.2  mrg       break;
   1057  1.1.1.2  mrg     case 2:
   1058  1.1.1.2  mrg       __asm __volatile ("tmrc %0, wcssf" : "=r"(__value));
   1059  1.1.1.2  mrg       break;
   1060  1.1.1.2  mrg     case 3:
   1061  1.1.1.2  mrg       __asm __volatile ("tmrc %0, wcasf" : "=r"(__value));
   1062  1.1.1.2  mrg       break;
   1063  1.1.1.2  mrg     case 8:
   1064  1.1.1.2  mrg       return __builtin_arm_getwcgr0 ();
   1065  1.1.1.2  mrg     case 9:
   1066  1.1.1.2  mrg       return __builtin_arm_getwcgr1 ();
   1067  1.1.1.2  mrg     case 10:
   1068  1.1.1.2  mrg       return __builtin_arm_getwcgr2 ();
   1069  1.1.1.2  mrg     case 11:
   1070  1.1.1.2  mrg       return __builtin_arm_getwcgr3 ();
   1071  1.1.1.2  mrg     default:
   1072  1.1.1.2  mrg       break;
   1073      1.1  mrg     }
   1074  1.1.1.2  mrg   return __value;
   1075      1.1  mrg }
   1076      1.1  mrg 
   1077      1.1  mrg /* Creates a vector of two 32-bit values; I0 is least significant.  */
   1078      1.1  mrg static __inline __m64
   1079      1.1  mrg _mm_set_pi32 (int __i1, int __i0)
   1080      1.1  mrg {
   1081  1.1.1.2  mrg   union
   1082  1.1.1.2  mrg   {
   1083      1.1  mrg     __m64 __q;
   1084  1.1.1.2  mrg     struct
   1085  1.1.1.2  mrg     {
   1086      1.1  mrg       unsigned int __i0;
   1087      1.1  mrg       unsigned int __i1;
   1088      1.1  mrg     } __s;
   1089      1.1  mrg   } __u;
   1090      1.1  mrg 
   1091      1.1  mrg   __u.__s.__i0 = __i0;
   1092      1.1  mrg   __u.__s.__i1 = __i1;
   1093      1.1  mrg 
   1094      1.1  mrg   return __u.__q;
   1095      1.1  mrg }
   1096      1.1  mrg 
   1097      1.1  mrg /* Creates a vector of four 16-bit values; W0 is least significant.  */
   1098      1.1  mrg static __inline __m64
   1099      1.1  mrg _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
   1100      1.1  mrg {
   1101  1.1.1.2  mrg   unsigned int __i1 = (unsigned short) __w3 << 16 | (unsigned short) __w2;
   1102  1.1.1.2  mrg   unsigned int __i0 = (unsigned short) __w1 << 16 | (unsigned short) __w0;
   1103  1.1.1.2  mrg 
   1104      1.1  mrg   return _mm_set_pi32 (__i1, __i0);
   1105      1.1  mrg }
   1106      1.1  mrg 
   1107      1.1  mrg /* Creates a vector of eight 8-bit values; B0 is least significant.  */
   1108      1.1  mrg static __inline __m64
   1109      1.1  mrg _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
   1110      1.1  mrg 	     char __b3, char __b2, char __b1, char __b0)
   1111      1.1  mrg {
   1112      1.1  mrg   unsigned int __i1, __i0;
   1113      1.1  mrg 
   1114      1.1  mrg   __i1 = (unsigned char)__b7;
   1115      1.1  mrg   __i1 = __i1 << 8 | (unsigned char)__b6;
   1116      1.1  mrg   __i1 = __i1 << 8 | (unsigned char)__b5;
   1117      1.1  mrg   __i1 = __i1 << 8 | (unsigned char)__b4;
   1118      1.1  mrg 
   1119      1.1  mrg   __i0 = (unsigned char)__b3;
   1120      1.1  mrg   __i0 = __i0 << 8 | (unsigned char)__b2;
   1121      1.1  mrg   __i0 = __i0 << 8 | (unsigned char)__b1;
   1122      1.1  mrg   __i0 = __i0 << 8 | (unsigned char)__b0;
   1123      1.1  mrg 
   1124      1.1  mrg   return _mm_set_pi32 (__i1, __i0);
   1125      1.1  mrg }
   1126      1.1  mrg 
   1127      1.1  mrg /* Similar, but with the arguments in reverse order.  */
   1128      1.1  mrg static __inline __m64
   1129      1.1  mrg _mm_setr_pi32 (int __i0, int __i1)
   1130      1.1  mrg {
   1131      1.1  mrg   return _mm_set_pi32 (__i1, __i0);
   1132      1.1  mrg }
   1133      1.1  mrg 
   1134      1.1  mrg static __inline __m64
   1135      1.1  mrg _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
   1136      1.1  mrg {
   1137      1.1  mrg   return _mm_set_pi16 (__w3, __w2, __w1, __w0);
   1138      1.1  mrg }
   1139      1.1  mrg 
   1140      1.1  mrg static __inline __m64
   1141      1.1  mrg _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
   1142      1.1  mrg 	      char __b4, char __b5, char __b6, char __b7)
   1143      1.1  mrg {
   1144      1.1  mrg   return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
   1145      1.1  mrg }
   1146      1.1  mrg 
   1147      1.1  mrg /* Creates a vector of two 32-bit values, both elements containing I.  */
   1148      1.1  mrg static __inline __m64
   1149      1.1  mrg _mm_set1_pi32 (int __i)
   1150      1.1  mrg {
   1151      1.1  mrg   return _mm_set_pi32 (__i, __i);
   1152      1.1  mrg }
   1153      1.1  mrg 
   1154      1.1  mrg /* Creates a vector of four 16-bit values, all elements containing W.  */
   1155      1.1  mrg static __inline __m64
   1156      1.1  mrg _mm_set1_pi16 (short __w)
   1157      1.1  mrg {
   1158      1.1  mrg   unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w;
   1159      1.1  mrg   return _mm_set1_pi32 (__i);
   1160      1.1  mrg }
   1161      1.1  mrg 
   1162      1.1  mrg /* Creates a vector of four 16-bit values, all elements containing B.  */
   1163      1.1  mrg static __inline __m64
   1164      1.1  mrg _mm_set1_pi8 (char __b)
   1165      1.1  mrg {
   1166      1.1  mrg   unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b;
   1167      1.1  mrg   unsigned int __i = __w << 16 | __w;
   1168      1.1  mrg   return _mm_set1_pi32 (__i);
   1169      1.1  mrg }
   1170      1.1  mrg 
   1171  1.1.1.2  mrg #ifdef __IWMMXT2__
   1172  1.1.1.2  mrg static __inline __m64
   1173  1.1.1.2  mrg _mm_abs_pi8 (__m64 m1)
   1174  1.1.1.2  mrg {
   1175  1.1.1.2  mrg   return (__m64) __builtin_arm_wabsb ((__v8qi)m1);
   1176  1.1.1.2  mrg }
   1177  1.1.1.2  mrg 
   1178  1.1.1.2  mrg static __inline __m64
   1179  1.1.1.2  mrg _mm_abs_pi16 (__m64 m1)
   1180  1.1.1.2  mrg {
   1181  1.1.1.2  mrg   return (__m64) __builtin_arm_wabsh ((__v4hi)m1);
   1182  1.1.1.2  mrg 
   1183  1.1.1.2  mrg }
   1184  1.1.1.2  mrg 
   1185  1.1.1.2  mrg static __inline __m64
   1186  1.1.1.2  mrg _mm_abs_pi32 (__m64 m1)
   1187  1.1.1.2  mrg {
   1188  1.1.1.2  mrg   return (__m64) __builtin_arm_wabsw ((__v2si)m1);
   1189  1.1.1.2  mrg 
   1190  1.1.1.2  mrg }
   1191  1.1.1.2  mrg 
   1192  1.1.1.2  mrg static __inline __m64
   1193  1.1.1.2  mrg _mm_addsubhx_pi16 (__m64 a, __m64 b)
   1194  1.1.1.2  mrg {
   1195  1.1.1.2  mrg   return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b);
   1196  1.1.1.2  mrg }
   1197  1.1.1.2  mrg 
   1198  1.1.1.2  mrg static __inline __m64
   1199  1.1.1.2  mrg _mm_absdiff_pu8 (__m64 a, __m64 b)
   1200  1.1.1.2  mrg {
   1201  1.1.1.2  mrg   return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b);
   1202  1.1.1.2  mrg }
   1203  1.1.1.2  mrg 
   1204  1.1.1.2  mrg static __inline __m64
   1205  1.1.1.2  mrg _mm_absdiff_pu16 (__m64 a, __m64 b)
   1206  1.1.1.2  mrg {
   1207  1.1.1.2  mrg   return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b);
   1208  1.1.1.2  mrg }
   1209  1.1.1.2  mrg 
   1210  1.1.1.2  mrg static __inline __m64
   1211  1.1.1.2  mrg _mm_absdiff_pu32 (__m64 a, __m64 b)
   1212  1.1.1.2  mrg {
   1213  1.1.1.2  mrg   return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b);
   1214  1.1.1.2  mrg }
   1215  1.1.1.2  mrg 
   1216  1.1.1.2  mrg static __inline __m64
   1217  1.1.1.2  mrg _mm_addc_pu16 (__m64 a, __m64 b)
   1218  1.1.1.2  mrg {
   1219  1.1.1.2  mrg   __m64 result;
   1220  1.1.1.2  mrg   __asm__ __volatile__ ("waddhc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
   1221  1.1.1.2  mrg   return result;
   1222  1.1.1.2  mrg }
   1223  1.1.1.2  mrg 
   1224      1.1  mrg static __inline __m64
   1225  1.1.1.2  mrg _mm_addc_pu32 (__m64 a, __m64 b)
   1226      1.1  mrg {
   1227  1.1.1.2  mrg   __m64 result;
   1228  1.1.1.2  mrg   __asm__ __volatile__ ("waddwc	%0, %1, %2" : "=y" (result) : "y" (a),  "y" (b));
   1229  1.1.1.2  mrg   return result;
   1230      1.1  mrg }
   1231      1.1  mrg 
   1232  1.1.1.2  mrg static __inline __m64
   1233  1.1.1.2  mrg _mm_avg4_pu8 (__m64 a, __m64 b)
   1234  1.1.1.2  mrg {
   1235  1.1.1.2  mrg   return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b);
   1236  1.1.1.2  mrg }
   1237  1.1.1.2  mrg 
   1238  1.1.1.2  mrg static __inline __m64
   1239  1.1.1.2  mrg _mm_avg4r_pu8 (__m64 a, __m64 b)
   1240  1.1.1.2  mrg {
   1241  1.1.1.2  mrg   return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b);
   1242  1.1.1.2  mrg }
   1243  1.1.1.2  mrg 
   1244  1.1.1.2  mrg static __inline __m64
   1245  1.1.1.2  mrg _mm_maddx_pi16 (__m64 a, __m64 b)
   1246  1.1.1.2  mrg {
   1247  1.1.1.2  mrg   return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b);
   1248  1.1.1.2  mrg }
   1249  1.1.1.2  mrg 
   1250  1.1.1.2  mrg static __inline __m64
   1251  1.1.1.2  mrg _mm_maddx_pu16 (__m64 a, __m64 b)
   1252  1.1.1.2  mrg {
   1253  1.1.1.2  mrg   return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b);
   1254  1.1.1.2  mrg }
   1255  1.1.1.2  mrg 
   1256  1.1.1.2  mrg static __inline __m64
   1257  1.1.1.2  mrg _mm_msub_pi16 (__m64 a, __m64 b)
   1258  1.1.1.2  mrg {
   1259  1.1.1.2  mrg   return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b);
   1260  1.1.1.2  mrg }
   1261  1.1.1.2  mrg 
   1262  1.1.1.2  mrg static __inline __m64
   1263  1.1.1.2  mrg _mm_msub_pu16 (__m64 a, __m64 b)
   1264  1.1.1.2  mrg {
   1265  1.1.1.2  mrg   return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b);
   1266  1.1.1.2  mrg }
   1267  1.1.1.2  mrg 
   1268  1.1.1.2  mrg static __inline __m64
   1269  1.1.1.2  mrg _mm_mulhi_pi32 (__m64 a, __m64 b)
   1270  1.1.1.2  mrg {
   1271  1.1.1.2  mrg   return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b);
   1272  1.1.1.2  mrg }
   1273  1.1.1.2  mrg 
   1274  1.1.1.2  mrg static __inline __m64
   1275  1.1.1.2  mrg _mm_mulhi_pu32 (__m64 a, __m64 b)
   1276  1.1.1.2  mrg {
   1277  1.1.1.2  mrg   return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b);
   1278  1.1.1.2  mrg }
   1279  1.1.1.2  mrg 
   1280  1.1.1.2  mrg static __inline __m64
   1281  1.1.1.2  mrg _mm_mulhir_pi16 (__m64 a, __m64 b)
   1282  1.1.1.2  mrg {
   1283  1.1.1.2  mrg   return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b);
   1284  1.1.1.2  mrg }
   1285  1.1.1.2  mrg 
   1286  1.1.1.2  mrg static __inline __m64
   1287  1.1.1.2  mrg _mm_mulhir_pi32 (__m64 a, __m64 b)
   1288  1.1.1.2  mrg {
   1289  1.1.1.2  mrg   return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b);
   1290  1.1.1.2  mrg }
   1291  1.1.1.2  mrg 
   1292  1.1.1.2  mrg static __inline __m64
   1293  1.1.1.2  mrg _mm_mulhir_pu16 (__m64 a, __m64 b)
   1294  1.1.1.2  mrg {
   1295  1.1.1.2  mrg   return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b);
   1296  1.1.1.2  mrg }
   1297  1.1.1.2  mrg 
   1298  1.1.1.2  mrg static __inline __m64
   1299  1.1.1.2  mrg _mm_mulhir_pu32 (__m64 a, __m64 b)
   1300  1.1.1.2  mrg {
   1301  1.1.1.2  mrg   return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b);
   1302  1.1.1.2  mrg }
   1303  1.1.1.2  mrg 
   1304  1.1.1.2  mrg static __inline __m64
   1305  1.1.1.2  mrg _mm_mullo_pi32 (__m64 a, __m64 b)
   1306  1.1.1.2  mrg {
   1307  1.1.1.2  mrg   return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b);
   1308  1.1.1.2  mrg }
   1309  1.1.1.2  mrg 
   1310  1.1.1.2  mrg static __inline __m64
   1311  1.1.1.2  mrg _mm_qmulm_pi16 (__m64 a, __m64 b)
   1312  1.1.1.2  mrg {
   1313  1.1.1.2  mrg   return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b);
   1314  1.1.1.2  mrg }
   1315  1.1.1.2  mrg 
   1316  1.1.1.2  mrg static __inline __m64
   1317  1.1.1.2  mrg _mm_qmulm_pi32 (__m64 a, __m64 b)
   1318  1.1.1.2  mrg {
   1319  1.1.1.2  mrg   return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b);
   1320  1.1.1.2  mrg }
   1321  1.1.1.2  mrg 
   1322  1.1.1.2  mrg static __inline __m64
   1323  1.1.1.2  mrg _mm_qmulmr_pi16 (__m64 a, __m64 b)
   1324  1.1.1.2  mrg {
   1325  1.1.1.2  mrg   return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b);
   1326  1.1.1.2  mrg }
   1327  1.1.1.2  mrg 
   1328  1.1.1.2  mrg static __inline __m64
   1329  1.1.1.2  mrg _mm_qmulmr_pi32 (__m64 a, __m64 b)
   1330  1.1.1.2  mrg {
   1331  1.1.1.2  mrg   return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b);
   1332  1.1.1.2  mrg }
   1333  1.1.1.2  mrg 
   1334  1.1.1.2  mrg static __inline __m64
   1335  1.1.1.2  mrg _mm_subaddhx_pi16 (__m64 a, __m64 b)
   1336  1.1.1.2  mrg {
   1337  1.1.1.2  mrg   return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b);
   1338  1.1.1.2  mrg }
   1339  1.1.1.2  mrg 
   1340  1.1.1.2  mrg static __inline __m64
   1341  1.1.1.2  mrg _mm_addbhusl_pu8 (__m64 a, __m64 b)
   1342  1.1.1.2  mrg {
   1343  1.1.1.2  mrg   return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b);
   1344  1.1.1.2  mrg }
   1345  1.1.1.2  mrg 
   1346  1.1.1.2  mrg static __inline __m64
   1347  1.1.1.2  mrg _mm_addbhusm_pu8 (__m64 a, __m64 b)
   1348  1.1.1.2  mrg {
   1349  1.1.1.2  mrg   return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b);
   1350  1.1.1.2  mrg }
   1351  1.1.1.2  mrg 
/* Saturating multiply-accumulate macros over the "wqmia*" builtins.
   ACC is both the source and the result accumulator.  Macro arguments
   are parenthesized on expansion so that any assignment-expression
   argument expands safely; the `_mm_qmiabtn_pi32' body is also brought
   in line with the spacing used by its siblings.  */

#define _mm_qmiabb_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiabbn_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiabt_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiabtn_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiatb_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiatbn_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiatt_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_qmiattn_pi32(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })
   1423  1.1.1.2  mrg 
/* 64-bit multiply-accumulate macros over the "wmia*" builtins, with
   ACC as both source and result.  Macro arguments are parenthesized on
   expansion for macro hygiene.  */

#define _mm_wmiabb_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiabbn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiabt_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiabtn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiatb_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiatbn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiatt_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })

#define _mm_wmiattn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\
   _acc;\
   })
   1495  1.1.1.2  mrg 
/* Word-operand variants ("wmiaw*") of the multiply-accumulate macros:
   M1/M2 are read as two 32-bit fields.  Macro arguments are
   parenthesized on expansion for macro hygiene.  */

#define _mm_wmiawbb_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawbbn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawbt_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawbtn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawtb_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawtbn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawtt_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })

#define _mm_wmiawttn_si64(acc, m1, m2) \
  ({\
   __m64 _acc = (acc);\
   __m64 _m1 = (m1);\
   __m64 _m2 = (m2);\
   _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\
   _acc;\
   })
   1567  1.1.1.2  mrg 
/* The third argument must be an immediate.  */
#define _mm_merge_si64(a, b, n) \
  ({\
   __m64 result;\
   result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\
   result;\
   })
   1575  1.1.1.2  mrg #endif  /* __IWMMXT2__ */
   1576  1.1.1.2  mrg 
   1577  1.1.1.2  mrg static __inline __m64
   1578  1.1.1.2  mrg _mm_alignr0_si64 (__m64 a, __m64 b)
   1579  1.1.1.2  mrg {
   1580  1.1.1.2  mrg   return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b);
   1581  1.1.1.2  mrg }
   1582  1.1.1.2  mrg 
   1583  1.1.1.2  mrg static __inline __m64
   1584  1.1.1.2  mrg _mm_alignr1_si64 (__m64 a, __m64 b)
   1585  1.1.1.2  mrg {
   1586  1.1.1.2  mrg   return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b);
   1587  1.1.1.2  mrg }
   1588  1.1.1.2  mrg 
   1589  1.1.1.2  mrg static __inline __m64
   1590  1.1.1.2  mrg _mm_alignr2_si64 (__m64 a, __m64 b)
   1591  1.1.1.2  mrg {
   1592  1.1.1.2  mrg   return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b);
   1593  1.1.1.2  mrg }
   1594  1.1.1.2  mrg 
   1595  1.1.1.2  mrg static __inline __m64
   1596  1.1.1.2  mrg _mm_alignr3_si64 (__m64 a, __m64 b)
   1597  1.1.1.2  mrg {
   1598  1.1.1.2  mrg   return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b);
   1599  1.1.1.2  mrg }
   1600  1.1.1.2  mrg 
/* Issue the iWMMXt "tandcb"/"tandch"/"tandcw" instructions with r15 as
   the destination operand.  Declared with (void) prototypes: the old
   empty `()' parameter lists declared functions with unspecified
   parameters in C (removed in C23) rather than proper prototypes.  */

static __inline void
_mm_tandcb (void)
{
  __asm __volatile ("tandcb r15");
}

static __inline void
_mm_tandch (void)
{
  __asm __volatile ("tandch r15");
}

static __inline void
_mm_tandcw (void)
{
  __asm __volatile ("tandcw r15");
}
   1618  1.1.1.2  mrg 
/* Issue the iWMMXt "textrc{b,h,w}" instructions; N must be a
   constant suitable for the "i" asm constraint.  */

#define _mm_textrcb(n) \
  ({\
   __asm__ __volatile__ ("textrcb r15, %0" : : "i" (n));\
   })

#define _mm_textrch(n) \
  ({\
   __asm__ __volatile__ ("textrch r15, %0" : : "i" (n));\
   })

#define _mm_textrcw(n) \
  ({\
   __asm__ __volatile__ ("textrcw r15, %0" : : "i" (n));\
   })
   1636  1.1.1.2  mrg 
/* Issue the iWMMXt "torcb"/"torch"/"torcw" instructions with r15 as
   the destination operand.  Declared with (void) prototypes instead of
   the old unprototyped `()' parameter lists.  */

static __inline void
_mm_torcb (void)
{
  __asm __volatile ("torcb r15");
}

static __inline void
_mm_torch (void)
{
  __asm __volatile ("torch r15");
}

static __inline void
_mm_torcw (void)
{
  __asm __volatile ("torcw r15");
}
   1654  1.1.1.2  mrg 
   1655  1.1.1.2  mrg #ifdef __IWMMXT2__
/* WMMX2-only: issue the "torvsc{b,h,w}" instructions with r15 as the
   destination operand.  Declared with (void) prototypes instead of the
   old unprototyped `()' parameter lists.  */

static __inline void
_mm_torvscb (void)
{
  __asm __volatile ("torvscb r15");
}

static __inline void
_mm_torvsch (void)
{
  __asm __volatile ("torvsch r15");
}

static __inline void
_mm_torvscw (void)
{
  __asm __volatile ("torvscw r15");
}
   1673  1.1.1.2  mrg #endif /* __IWMMXT2__ */
   1674  1.1.1.2  mrg 
   1675  1.1.1.2  mrg static __inline __m64
   1676  1.1.1.2  mrg _mm_tbcst_pi8 (int value)
   1677  1.1.1.2  mrg {
   1678  1.1.1.2  mrg   return (__m64) __builtin_arm_tbcstb ((signed char) value);
   1679  1.1.1.2  mrg }
   1680  1.1.1.2  mrg 
   1681  1.1.1.2  mrg static __inline __m64
   1682  1.1.1.2  mrg _mm_tbcst_pi16 (int value)
   1683  1.1.1.2  mrg {
   1684  1.1.1.2  mrg   return (__m64) __builtin_arm_tbcsth ((short) value);
   1685  1.1.1.2  mrg }
   1686  1.1.1.2  mrg 
   1687  1.1.1.2  mrg static __inline __m64
   1688  1.1.1.2  mrg _mm_tbcst_pi32 (int value)
   1689  1.1.1.2  mrg {
   1690  1.1.1.2  mrg   return (__m64) __builtin_arm_tbcstw (value);
   1691  1.1.1.2  mrg }
   1692  1.1.1.2  mrg 
/* Historical "_m_" spellings for the intrinsics above, matching the
   naming of the original MMX intrinsic set.  */
#define _m_empty _mm_empty
#define _m_packsswb _mm_packs_pi16
#define _m_packssdw _mm_packs_pi32
#define _m_packuswb _mm_packs_pu16
#define _m_packusdw _mm_packs_pu32
#define _m_packssqd _mm_packs_pi64
#define _m_packusqd _mm_packs_pu64
#define _mm_packs_si64 _mm_packs_pi64
#define _mm_packs_su64 _mm_packs_pu64
#define _m_punpckhbw _mm_unpackhi_pi8
#define _m_punpckhwd _mm_unpackhi_pi16
#define _m_punpckhdq _mm_unpackhi_pi32
#define _m_punpcklbw _mm_unpacklo_pi8
#define _m_punpcklwd _mm_unpacklo_pi16
#define _m_punpckldq _mm_unpacklo_pi32
#define _m_punpckehsbw _mm_unpackeh_pi8
#define _m_punpckehswd _mm_unpackeh_pi16
#define _m_punpckehsdq _mm_unpackeh_pi32
#define _m_punpckehubw _mm_unpackeh_pu8
#define _m_punpckehuwd _mm_unpackeh_pu16
#define _m_punpckehudq _mm_unpackeh_pu32
#define _m_punpckelsbw _mm_unpackel_pi8
#define _m_punpckelswd _mm_unpackel_pi16
#define _m_punpckelsdq _mm_unpackel_pi32
#define _m_punpckelubw _mm_unpackel_pu8
#define _m_punpckeluwd _mm_unpackel_pu16
#define _m_punpckeludq _mm_unpackel_pu32
#define _m_paddb _mm_add_pi8
#define _m_paddw _mm_add_pi16
#define _m_paddd _mm_add_pi32
#define _m_paddsb _mm_adds_pi8
#define _m_paddsw _mm_adds_pi16
#define _m_paddsd _mm_adds_pi32
#define _m_paddusb _mm_adds_pu8
#define _m_paddusw _mm_adds_pu16
#define _m_paddusd _mm_adds_pu32
#define _m_psubb _mm_sub_pi8
#define _m_psubw _mm_sub_pi16
#define _m_psubd _mm_sub_pi32
#define _m_psubsb _mm_subs_pi8
#define _m_psubsw _mm_subs_pi16
/* NOTE(review): the _m_psubuw spelling is a historical misnomer -- it
   maps to the *signed* saturating subtract, unlike the _m_psubus*
   aliases below.  It is kept for backward compatibility; _m_psubsd is
   the consistently-named alias.  */
#define _m_psubuw _mm_subs_pi32
#define _m_psubsd _mm_subs_pi32
#define _m_psubusb _mm_subs_pu8
#define _m_psubusw _mm_subs_pu16
#define _m_psubusd _mm_subs_pu32
#define _m_pmaddwd _mm_madd_pi16
#define _m_pmadduwd _mm_madd_pu16
#define _m_pmulhw _mm_mulhi_pi16
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pmullw _mm_mullo_pi16
#define _m_pmacsw _mm_mac_pi16
#define _m_pmacuw _mm_mac_pu16
#define _m_pmacszw _mm_macz_pi16
#define _m_pmacuzw _mm_macz_pu16
#define _m_paccb _mm_acc_pu8
#define _m_paccw _mm_acc_pu16
#define _m_paccd _mm_acc_pu32
#define _m_pmia _mm_mia_si64
#define _m_pmiaph _mm_miaph_si64
#define _m_pmiabb _mm_miabb_si64
#define _m_pmiabt _mm_miabt_si64
#define _m_pmiatb _mm_miatb_si64
#define _m_pmiatt _mm_miatt_si64
#define _m_psllw _mm_sll_pi16
#define _m_psllwi _mm_slli_pi16
#define _m_pslld _mm_sll_pi32
#define _m_pslldi _mm_slli_pi32
#define _m_psllq _mm_sll_si64
#define _m_psllqi _mm_slli_si64
#define _m_psraw _mm_sra_pi16
#define _m_psrawi _mm_srai_pi16
#define _m_psrad _mm_sra_pi32
#define _m_psradi _mm_srai_pi32
#define _m_psraq _mm_sra_si64
#define _m_psraqi _mm_srai_si64
#define _m_psrlw _mm_srl_pi16
#define _m_psrlwi _mm_srli_pi16
#define _m_psrld _mm_srl_pi32
#define _m_psrldi _mm_srli_pi32
#define _m_psrlq _mm_srl_si64
#define _m_psrlqi _mm_srli_si64
#define _m_prorw _mm_ror_pi16
#define _m_prorwi _mm_rori_pi16
#define _m_prord _mm_ror_pi32
#define _m_prordi _mm_rori_pi32
#define _m_prorq _mm_ror_si64
#define _m_prorqi _mm_rori_si64
#define _m_pand _mm_and_si64
#define _m_pandn _mm_andnot_si64
#define _m_por _mm_or_si64
#define _m_pxor _mm_xor_si64
#define _m_pcmpeqb _mm_cmpeq_pi8
#define _m_pcmpeqw _mm_cmpeq_pi16
#define _m_pcmpeqd _mm_cmpeq_pi32
#define _m_pcmpgtb _mm_cmpgt_pi8
#define _m_pcmpgtub _mm_cmpgt_pu8
#define _m_pcmpgtw _mm_cmpgt_pi16
#define _m_pcmpgtuw _mm_cmpgt_pu16
#define _m_pcmpgtd _mm_cmpgt_pi32
#define _m_pcmpgtud _mm_cmpgt_pu32
#define _m_pextrb _mm_extract_pi8
#define _m_pextrw _mm_extract_pi16
#define _m_pextrd _mm_extract_pi32
#define _m_pextrub _mm_extract_pu8
#define _m_pextruw _mm_extract_pu16
#define _m_pextrud _mm_extract_pu32
#define _m_pinsrb _mm_insert_pi8
#define _m_pinsrw _mm_insert_pi16
#define _m_pinsrd _mm_insert_pi32
#define _m_pmaxsb _mm_max_pi8
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxsd _mm_max_pi32
#define _m_pmaxub _mm_max_pu8
#define _m_pmaxuw _mm_max_pu16
#define _m_pmaxud _mm_max_pu32
#define _m_pminsb _mm_min_pi8
#define _m_pminsw _mm_min_pi16
#define _m_pminsd _mm_min_pi32
#define _m_pminub _mm_min_pu8
#define _m_pminuw _mm_min_pu16
#define _m_pminud _mm_min_pu32
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmovmskw _mm_movemask_pi16
#define _m_pmovmskd _mm_movemask_pi32
#define _m_pshufw _mm_shuffle_pi16
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_pavg2b _mm_avg2_pu8
#define _m_pavg2w _mm_avg2_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_psadwd _mm_sad_pu16
#define _m_psadzbw _mm_sadz_pu8
#define _m_psadzwd _mm_sadz_pu16
#define _m_paligniq _mm_align_si64
#define _m_cvt_si2pi _mm_cvtsi64_m64
#define _m_cvt_pi2si _mm_cvtm64_si64
#define _m_from_int _mm_cvtsi32_si64
#define _m_to_int _mm_cvtsi64_si32
   1831  1.1.1.2  mrg 
   1832  1.1.1.2  mrg #if defined __cplusplus
   1833  1.1.1.2  mrg }; /* End "C" */
   1834  1.1.1.2  mrg #endif /* __cplusplus */
   1835      1.1  mrg 
   1836      1.1  mrg #endif /* _MMINTRIN_H_INCLUDED */
   1837