/* Copyright (C) 2011-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
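/* For example (an illustrative sketch, not part of this header): a call
   such as _bzhi_u32 (__x, __n) with 0 < __n < 32 can be rewritten in
   standard C as

     __x & ((1U << __n) - 1)

   which any compiler can optimize directly for its own target.  */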

#if !defined _X86GPRINTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  /* Zero the bits of __X at position __Y and above.  A shift count of
     32 or more is undefined in C, so handle __Y == 0 (result 0) and
     __Y >= 32 (source returned unchanged, matching bzhi) explicitly.  */
  if (__Y == 0)
    return 0;
  if (__Y >= 32)
    return __X;
  return ((__X << (32 - __Y)) >> (32 - __Y));
}
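/* Usage sketch, worked by hand from the semantics above:
   _bzhi_u32 (0xFF, 4) keeps the low 4 bits, returning 0x0F, and
   _bzhi_u32 (0xFF, 0) returns 0.  */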

/* 32 x 32 -> 64-bit unsigned multiply; return the low 32 bits of the
   product and store the high 32 bits in *__P.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  unsigned long long __res = (unsigned long long) __X * __Y;
  *__P = (unsigned int) (__res >> 32);
  return (unsigned int) __res;
}

#ifdef  __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  /* As with _bzhi_u32, avoid the undefined shift by 64 when __Y is 0,
     and return the source unchanged for out-of-range indexes.  */
  if (__Y == 0)
    return 0;
  if (__Y >= 64)
    return __X;
  return ((__X << (64 - __Y)) >> (64 - __Y));
}

/* __int128 requires the base 64-bit ISA.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
           unsigned long long *__P)
{
  unsigned __int128 __res = (unsigned __int128) __X * __Y;
  *__P = (unsigned long long) (__res >> 64);
  return (unsigned long long) __res;
}
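/* Worked example, computed by hand: multiplying the all-ones value by 2
   produces a 65-bit result that splits across the two halves:

     unsigned long long __hi;
     _mulx_u64 (0xFFFFFFFFFFFFFFFFULL, 2, &__hi);

   returns 0xFFFFFFFFFFFFFFFEULL and stores 1 in __hi.  */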

#ifdef  _ARCH_PWR7
/* popcount and bpermd require power7 minimum.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of bits from the
     source to process.  This is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl (__M);

  /* The loop runs once for each '1' bit in the mask, clearing each
     mask bit as it is processed.  */
  while (__m != 0)
    {
      __c = __builtin_clzl (__m);
      __t = __X << (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t & (__mask >> __c));
      __p++;
    }
  return __result;
}
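/* Worked example, computed by hand: the low source bits are deposited
   into the set positions of the mask, lowest mask bit first, so

     _pdep_u64 (0x5, 0x1A) == 0x12

   (source bits 1, 0, 1 land in mask bit positions 1, 3 and 4,
   giving 0b10010).  */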

extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __p = 0x4040404040404040UL; /* Initial bit-permute control.  */
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or fewer, we can use
     the bpermd (bit-permute doubleword) instruction, available from
     Power7 (see the _ARCH_PWR7 guard above).  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* Also, if the pext mask is constant, then the popcount is
         constant; we can evaluate the following loop at compile
         time and use a constant bit-permute vector.  */
      long __i;
      for (__i = 0; __i < __builtin_popcountl (__M); __i++)
        {
          __c = __builtin_clzl (__m);
          __p = (__p << 8) | __c;
          __m ^= (__mask >> __c);
        }
      __result = __builtin_bpermd (__p, __X);
    }
  else
    {
      __p = 64 - __builtin_popcountl (__M);
      __result = 0;
      /* We could use a for loop here, but that combined with
         -funroll-loops can expand to a lot of code.  The while
         loop avoids unrolling, and the compiler commons the xor
         that clears the mask bit with the (__m != 0) test.  The
         result is a more compact loop setup and body.  */
      while (__m != 0)
        {
          unsigned long __t;
          __c = __builtin_clzl (__m);
          __t = (__X & (__mask >> __c)) >> (__p - __c);
          __m ^= (__mask >> __c);
          __result |= (__t);
          __p++;
        }
    }
  return __result;
}
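/* Worked example, computed by hand (the inverse of the _pdep_u64
   example above): the mask-selected source bits pack into the low
   bits of the result, so

     _pext_u64 (0x12, 0x1A) == 0x5

   (source bits at mask positions 1, 3 and 4 are 1, 0 and 1,
   packing to 0b101).  */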

/* These 32-bit implementations depend on the 64-bit pdep/pext above,
   which in turn depend on _ARCH_PWR7.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  return _pdep_u64 (__X, __Y);
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  return _pext_u64 (__X, __Y);
}
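/* Both wrappers zero-extend their arguments to 64 bits, so the 64-bit
   result has no bits set above bit 31 and the implicit truncation back
   to unsigned int is lossless.  */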
#endif /* _ARCH_PWR7  */
#endif /* __PPC64__  */

#endif /* _BMI2INTRIN_H_INCLUDED */