Home | History | Annotate | Line # | Download | only in rs6000
bmi2intrin.h revision 1.1.1.1
      1 /* Copyright (C) 2011-2018 Free Software Foundation, Inc.
      2 
      3    This file is part of GCC.
      4 
      5    GCC is free software; you can redistribute it and/or modify
      6    it under the terms of the GNU General Public License as published by
      7    the Free Software Foundation; either version 3, or (at your option)
      8    any later version.
      9 
     10    GCC is distributed in the hope that it will be useful,
     11    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13    GNU General Public License for more details.
     14 
     15    Under Section 7 of GPL version 3, you are granted additional
     16    permissions described in the GCC Runtime Library Exception, version
     17    3.1, as published by the Free Software Foundation.
     18 
     19    You should have received a copy of the GNU General Public License and
     20    a copy of the GCC Runtime Library Exception along with this program;
     21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
     22    <http://www.gnu.org/licenses/>.  */
     23 
     24 /* This header is distributed to simplify porting x86_64 code that
     25    makes explicit use of Intel intrinsics to powerpc64le.
     26    It is the user's responsibility to determine if the results are
     27    acceptable and make additional changes as necessary.
     28    Note that much code that uses Intel intrinsics can be rewritten in
     29    standard C or GNU C extensions, which are more portable and better
     30    optimized across multiple targets.  */
     31 
/* Guard against direct inclusion: compilation stops with an error unless
   _X86INTRIN_H_INCLUDED is already defined (presumably by <x86intrin.h>,
   as the diagnostic text suggests).  */
#if !defined _X86INTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
#endif
     35 
     36 #ifndef _BMI2INTRIN_H_INCLUDED
     37 #define _BMI2INTRIN_H_INCLUDED
     38 
/* Emulate x86 BZHI (32-bit): return __X with every bit at position
   >= index cleared, where index is the low 8 bits of __Y.

   An index of 0 yields 0, and an index of 32 or more returns __X
   unchanged.  The result is computed with a guarded mask rather than a
   pair of shifts by (32 - __Y), because a shift count equal to or larger
   than the operand width is undefined behavior in C (that happened for
   __Y == 0 and for __Y > 32).  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  /* Only the low byte of the second operand selects the bit index.  */
  const unsigned int __index = __Y & 0xFFU;

  /* Nothing to clear: every bit of __X lies below the index.  */
  if (__index >= 32)
    return __X;

  /* __index is 0..31 here, so the shift is always well defined.  */
  return __X & ((1U << __index) - 1U);
}
     45 
/* Emulate x86 MULX (32-bit): unsigned 32x32 -> 64-bit multiply.
   The upper 32 bits of the product are stored through __P and the
   lower 32 bits are returned.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  /* Widen before multiplying so the full 64-bit product is kept.  */
  const unsigned long long __wide
    = (unsigned long long) __X * (unsigned long long) __Y;
  const unsigned int __hi = (unsigned int) (__wide >> 32);
  const unsigned int __lo = (unsigned int) __wide;

  *__P = __hi;
  return __lo;
}
     54 
     55 #ifdef  __PPC64__
/* Emulate x86 BZHI (64-bit): return __X with every bit at position
   >= index cleared, where index is the low 8 bits of __Y.

   An index of 0 yields 0, and an index of 64 or more returns __X
   unchanged.  The result is computed with a guarded mask rather than a
   pair of shifts by (64 - __Y), because a shift count equal to or larger
   than the operand width is undefined behavior in C (that happened for
   __Y == 0 and for __Y > 64).  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  /* Only the low byte of the second operand selects the bit index.  */
  const unsigned long long __index = __Y & 0xFFULL;

  /* Nothing to clear: every bit of __X lies below the index.  */
  if (__index >= 64)
    return __X;

  /* __index is 0..63 here, so the shift is always well defined.  */
  return __X & ((1ULL << __index) - 1ULL);
}
     62 
     63 /* __int128 requires base 64-bit.  */
/* Emulate x86 MULX (64-bit): unsigned 64x64 -> 128-bit multiply.
   The upper 64 bits of the product are stored through __P and the
   lower 64 bits are returned.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
	   unsigned long long *__P)
{
  /* Widen before multiplying so the full 128-bit product is kept.  */
  const unsigned __int128 __wide
    = (unsigned __int128) __X * (unsigned __int128) __Y;
  const unsigned long long __hi = (unsigned long long) (__wide >> 64);
  const unsigned long long __lo = (unsigned long long) __wide;

  *__P = __hi;
  return __lo;
}
     73 
     74 #ifdef  _ARCH_PWR7
     75 /* popcount and bpermd require power7 minimum.  */
/* Emulate x86 PDEP (64-bit): scatter the low-order bits of __X into the
   bit positions that are set in __M.  The lowest source bit goes to the
   lowest set mask bit, and so on; result bits outside the mask are 0.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  const unsigned long __top_bit = 0x8000000000000000UL;
  unsigned long __pending = __M;
  unsigned long __deposited = 0x0UL;

  /* 64 minus the pop-count of the mask.  Combined with the leading-zero
     count of the mask bit being handled, this gives the left shift that
     lines the matching source bit up with that mask bit.  */
  unsigned long __shift_base = 64 - __builtin_popcountl (__M);

  /* One iteration per '1' bit in the mask, highest bit first; each mask
     bit is cleared once it has been processed.  */
  while (__pending != 0)
    {
      const unsigned long __lead = __builtin_clzl (__pending);
      const unsigned long __slot = __top_bit >> __lead;
      const unsigned long __aligned = __X << (__shift_base - __lead);

      __pending ^= __slot;
      __deposited |= (__aligned & __slot);
      __shift_base++;
    }
  return __deposited;
}
    103 
/* Emulate x86 PEXT (64-bit): gather the bits of __X selected by the set
   bits of __M and pack them contiguously into the low-order bits of the
   result, preserving their relative order.  The remaining high-order
   result bits are 0.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  /* Initial bit permute control: every byte is 0x40 (index 64).  Per the
     Power ISA description of bpermd, an index above 63 selects a 0 bit,
     so unused control bytes contribute nothing to the result.  */
  unsigned long p = 0x4040404040404040UL;
  const unsigned long mask = 0x8000000000000000UL;
  unsigned long m = __M;
  unsigned long c;
  unsigned long result;

  /* If the mask is constant and selects 8 bits or fewer, we can use
     the bit permute doubleword (bpermd) instruction.  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* Also, if the pext mask is constant then its popcount is
	 constant, so the following loop can be evaluated at compile
	 time and a constant bit permute vector used.  Each iteration
	 appends the leading-zero count (the big-endian bit index) of
	 the highest remaining mask bit as the next control byte.  */
      for (long i = 0; i < __builtin_popcountl (__M); i++)
	{
	  c = __builtin_clzl (m);
	  p = (p << 8) | c;
	  m ^= (mask >> c);
	}
      result = __builtin_bpermd (p, __X);
    }
  else
    {
      /* 64 minus the pop-count of the mask; (p - c) below is the right
	 shift that moves the selected source bit to its packed
	 position in the result.  */
      p = 64 - __builtin_popcountl (__M);
      result = 0;
      /* We could use a for loop here, but that combined with
	 -funroll-loops can expand to a lot of code.  The while
	 loop avoids unrolling and the compiler commons the xor
	 from clearing the mask bit with the (m != 0) test.  The
	 result is a more compact loop setup and body.  */
      while (m != 0)
	{
	  unsigned long t;
	  c = __builtin_clzl (m);
	  /* Isolate the source bit under the highest remaining mask bit
	     and shift it down into place.  */
	  t = (__X & (mask >> c)) >> (p - c);
	  /* Clear the mask bit just processed.  */
	  m ^= (mask >> c);
	  result |= (t);
	  p++;
	}
    }
  return (result);
}
    150 
/* These 32-bit implementations depend on the 64-bit pdep/pext,
   which in turn depend on _ARCH_PWR7.  */
/* Emulate x86 PDEP (32-bit) by zero-extending both operands, delegating
   to _pdep_u64, and returning the low 32 bits of its result.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  const unsigned long long __wide
    = _pdep_u64 ((unsigned long long) __X, (unsigned long long) __Y);

  return (unsigned int) __wide;
}
    159 
/* Emulate x86 PEXT (32-bit) by zero-extending both operands, delegating
   to _pext_u64, and returning the low 32 bits of its result.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  return (unsigned int) _pext_u64 ((unsigned long long) __X,
				   (unsigned long long) __Y);
}
    166 #endif /* _ARCH_PWR7  */
    167 #endif /* __PPC64__  */
    168 
    169 #endif /* _BMI2INTRIN_H_INCLUDED */
    170