Home | History | Annotate | Line # | Download | only in rs6000
bmi2intrin.h revision 1.1
      1  1.1  mrg /* Copyright (C) 2011-2018 Free Software Foundation, Inc.
      2  1.1  mrg 
      3  1.1  mrg    This file is part of GCC.
      4  1.1  mrg 
      5  1.1  mrg    GCC is free software; you can redistribute it and/or modify
      6  1.1  mrg    it under the terms of the GNU General Public License as published by
      7  1.1  mrg    the Free Software Foundation; either version 3, or (at your option)
      8  1.1  mrg    any later version.
      9  1.1  mrg 
     10  1.1  mrg    GCC is distributed in the hope that it will be useful,
     11  1.1  mrg    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12  1.1  mrg    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13  1.1  mrg    GNU General Public License for more details.
     14  1.1  mrg 
     15  1.1  mrg    Under Section 7 of GPL version 3, you are granted additional
     16  1.1  mrg    permissions described in the GCC Runtime Library Exception, version
     17  1.1  mrg    3.1, as published by the Free Software Foundation.
     18  1.1  mrg 
     19  1.1  mrg    You should have received a copy of the GNU General Public License and
     20  1.1  mrg    a copy of the GCC Runtime Library Exception along with this program;
     21  1.1  mrg    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     22  1.1  mrg    <http://www.gnu.org/licenses/>.  */
     23  1.1  mrg 
     24  1.1  mrg /* This header is distributed to simplify porting x86_64 code that
     25  1.1  mrg    makes explicit use of Intel intrinsics to powerpc64le.
     26  1.1  mrg    It is the user's responsibility to determine if the results are
     27  1.1  mrg    acceptable and make additional changes as necessary.
     28  1.1  mrg    Note that much code that uses Intel intrinsics can be rewritten in
     29  1.1  mrg    standard C or GNU C extensions, which are more portable and better
     30  1.1  mrg    optimized across multiple targets.  */
     31  1.1  mrg 
     32  1.1  mrg #if !defined _X86INTRIN_H_INCLUDED
     33  1.1  mrg # error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
     34  1.1  mrg #endif
     35  1.1  mrg 
     36  1.1  mrg #ifndef _BMI2INTRIN_H_INCLUDED
     37  1.1  mrg #define _BMI2INTRIN_H_INCLUDED
     38  1.1  mrg 
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  /* Zero the high bits of __X starting at bit index __Y.
     Per the x86 BZHI definition the index is the low 8 bits of __Y,
     and an index >= 32 leaves __X unchanged.  The previous
     double-shift form invoked undefined behavior (shift by 32) when
     __Y was 0 and gave no defined result for __Y >= 32.  */
  unsigned int __idx = __Y & 0xffu;
  if (__idx >= 32)
    return __X;
  /* Keep only the low __idx bits; __idx == 0 yields 0.  */
  return __X & ((1u << __idx) - 1);
}
     45  1.1  mrg 
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  /* Unsigned 32x32 -> 64 widening multiply: the high 32 bits of the
     product are stored through __P, the low 32 bits are returned.  */
  const unsigned long long __prod =
    (unsigned long long) __X * (unsigned long long) __Y;

  *__P = (unsigned int) (__prod >> 32);
  return (unsigned int) (__prod & 0xffffffffULL);
}
     54  1.1  mrg 
     55  1.1  mrg #ifdef  __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  /* Zero the high bits of __X starting at bit index __Y.
     Per the x86 BZHI definition the index is the low 8 bits of __Y,
     and an index >= 64 leaves __X unchanged.  The previous
     double-shift form invoked undefined behavior (shift by 64) when
     __Y was 0 and gave no defined result for __Y >= 64.  */
  unsigned long long __idx = __Y & 0xffULL;
  if (__idx >= 64)
    return __X;
  /* Keep only the low __idx bits; __idx == 0 yields 0.  */
  return __X & ((1ULL << __idx) - 1);
}
     62  1.1  mrg 
     63  1.1  mrg /* __int128 requires base 64-bit.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
	   unsigned long long *__P)
{
  /* Unsigned 64x64 -> 128 widening multiply: the high 64 bits of the
     product are stored through __P, the low 64 bits are returned.
     Requires the 128-bit integer extension (64-bit base).  */
  unsigned __int128 __wide = __X;

  __wide *= __Y;
  *__P = (unsigned long long) (__wide >> 64);
  return (unsigned long long) __wide;
}
     73  1.1  mrg 
     74  1.1  mrg #ifdef  _ARCH_PWR7
     75  1.1  mrg /* popcount and bpermd require power7 minimum.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  /* Software emulation of the x86 PDEP instruction: successive
     low-order bits of __X are deposited at the positions of the set
     bits of __M, working from the least-significant mask bit upward.
     A zero mask yields zero.  */
  unsigned long long __result = 0;
  unsigned long long __remaining = __M;
  unsigned long long __src = __X;

  while (__remaining != 0)
    {
      /* Isolate the lowest set bit still present in the mask.  */
      unsigned long long __bit = __remaining & -__remaining;

      /* Deposit the next source bit at that mask position.  */
      if (__src & 1)
	__result |= __bit;
      __src >>= 1;

      /* Clear the mask bit just consumed.  */
      __remaining ^= __bit;
    }
  return __result;
}
    103  1.1  mrg 
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  /* Software emulation of the x86 PEXT instruction: the bits of __X
     selected by the set bits of __M are extracted and packed into the
     low-order bits of the result.  */
  unsigned long p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long mask = 0x8000000000000000UL;
  unsigned long m = __M;
  unsigned long c;
  unsigned long result;

  /* If the mask is constant and selects 8 bits or less we can use
   the Power8 Bit permute (bpermd) instruction, which extracts up to
   eight bits selected by a permute-control doubleword.  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* Also if the pext mask is constant, then the popcount is
       constant, we can evaluate the following loop at compile
       time and use a constant bit permute vector.  */
      for (long i = 0; i < __builtin_popcountl (__M); i++)
	{
	  /* c is the big-endian bit number (count from the MSB) of the
	     next set mask bit; append it as a selector byte.  */
	  c = __builtin_clzl (m);
	  p = (p << 8) | c;
	  /* Clear the mask bit just consumed.  */
	  m ^= (mask >> c);
	}
      result = __builtin_bpermd (p, __X);
    }
  else
    {
      /* p tracks the distance each selected bit must be shifted right
	 so the extracted bits pack into the low-order positions.  */
      p = 64 - __builtin_popcountl (__M);
      result = 0;
      /* We could a use a for loop here, but that combined with
       -funroll-loops can expand to a lot of code.  The while
       loop avoids unrolling and the compiler commons the xor
       from clearing the mask bit with the (m != 0) test.  The
       result is a more compact loop setup and body.  */
      while (m != 0)
	{
	  unsigned long t;
	  /* Highest remaining mask bit; move the matching source bit
	     into its packed destination slot.  */
	  c = __builtin_clzl (m);
	  t = (__X & (mask >> c)) >> (p - c);
	  m ^= (mask >> c);
	  result |= (t);
	  p++;
	}
    }
  return (result);
}
    150  1.1  mrg 
    151  1.1  mrg /* these 32-bit implementations depend on 64-bit pdep/pext
    152  1.1  mrg    which depend on _ARCH_PWR7.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  /* 32-bit PDEP: delegate to the 64-bit emulation.  The arguments are
     zero-extended and the deposited result truncated back to 32 bits,
     which matches the 32-bit instruction since the mask's set bits all
     lie in the low word.  */
  return (unsigned int) _pdep_u64 (__X, __Y);
}
    159  1.1  mrg 
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  /* 32-bit PEXT: delegate to the 64-bit emulation.  The arguments are
     zero-extended and the packed result truncated back to 32 bits,
     which matches the 32-bit instruction since the mask's set bits all
     lie in the low word.  */
  return (unsigned int) _pext_u64 (__X, __Y);
}
    166  1.1  mrg #endif /* _ARCH_PWR7  */
    167  1.1  mrg #endif /* __PPC64__  */
    168  1.1  mrg 
    169  1.1  mrg #endif /* _BMI2INTRIN_H_INCLUDED */
    170