/* Copyright (C) 2011-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */

#if !defined _X86GPRINTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED

/* Zero the high bits of __X starting at bit index __Y (x86 BZHI).
   Per the instruction definition only the low 8 bits of __Y form the
   index, and an index >= the operand width returns the source
   unchanged.  The previous double-shift form
   ((__X << (32 - __Y)) >> (32 - __Y)) invoked undefined behavior for
   __Y == 0 (shift count equal to the type width).  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  unsigned int __idx = __Y & 0xffu;	/* BZHI uses SRC2[7:0] only.  */

  if (__idx >= 32)
    return __X;
  return __X & ((1u << __idx) - 1);
}
/* Unsigned 32x32 -> 64-bit multiply (x86 MULX): the high half of the
   product is stored through __P, the low half is returned.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  /* Widen one operand so the multiply keeps all 64 product bits.  */
  unsigned long long __prod = (unsigned long long) __X * __Y;

  *__P = (unsigned int) (__prod >> 32);
  return (unsigned int) __prod;
}

#ifdef __PPC64__
/* Zero the high bits of __X starting at bit index __Y (x86 BZHI,
   64-bit form).  Only the low 8 bits of __Y form the index, and an
   index >= 64 returns the source unchanged.  The previous
   double-shift form ((__X << (64 - __Y)) >> (64 - __Y)) invoked
   undefined behavior for __Y == 0 (shift count equal to the type
   width).  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  unsigned long long __idx = __Y & 0xffULL;	/* BZHI uses SRC2[7:0].  */

  if (__idx >= 64)
    return __X;
  return __X & ((1ULL << __idx) - 1);
}

/* __int128 requires a 64-bit base architecture.  */
/* Unsigned 64x64 -> 128-bit multiply (x86 MULX): the high half of the
   product is stored through __P, the low half is returned.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
	   unsigned long long *__P)
{
  /* Widen one operand to 128 bits so the full double-width product is
     formed, then split it into its two halves.  */
  unsigned __int128 __prod = (unsigned __int128) __X * __Y;

  *__P = (unsigned long long) (__prod >> 64);
  return (unsigned long long) __prod;
}

#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum.  */
/* Software parallel-bit-deposit (x86 PDEP): scatter the low
   popcount(__M) bits of __X into the '1' positions of __M, keeping
   their relative order.  Works from the most significant mask bit
   downward.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  const unsigned long __msb = 0x8000000000000000UL;
  unsigned long __remaining = __M;
  unsigned long __acc = 0x0UL;
  unsigned long __shift;

  /* The pop-count of the mask gives the number of source bits to
     place; it also fixes the left shift that lines the highest such
     source bit up with the highest mask bit.  */
  __shift = 64 - __builtin_popcountl (__M);

  /* One iteration per '1' in the mask, clearing each mask bit once it
     has been processed.  */
  while (__remaining != 0)
    {
      unsigned long __lead = __builtin_clzl (__remaining);
      unsigned long __bit = __msb >> __lead;
      unsigned long __shifted = __X << (__shift - __lead);

      __acc |= (__shifted & __bit);
      __remaining ^= __bit;
      __shift++;
    }
  return __acc;
}
/* Software parallel-bit-extract (x86 PEXT): gather the bits of __X
   selected by the '1' positions of __M and pack them, in order, into
   the low-order bits of the result.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or less we can use
     the Power8 Bit permute instruction.  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* Also if the pext mask is constant, then the popcount is
	 constant, we can evaluate the following loop at compile
	 time and use a constant bit permute vector.  */
      long __i;
      for (__i = 0; __i < __builtin_popcountl (__M); __i++)
	{
	  /* Append the MSB-relative index of the next selected bit to
	     the permute-control vector, one byte per selected bit.  */
	  __c = __builtin_clzl (__m);
	  __p = (__p << 8) | __c;
	  __m ^= (__mask >> __c);
	}
      __result = __builtin_bpermd (__p, __X);
    }
  else
    {
      /* General case: __p starts at 64 - popcount(__M) so each
	 extracted bit lands in its packed position, advancing by one
	 per mask bit processed.  */
      __p = 64 - __builtin_popcountl (__M);
      __result = 0;
      /* We could use a for loop here, but that combined with
	 -funroll-loops can expand to a lot of code.  The while
	 loop avoids unrolling and the compiler commons the xor
	 from clearing the mask bit with the (m != 0) test.  The
	 result is a more compact loop setup and body.  */
      while (__m != 0)
	{
	  unsigned long __t;
	  __c = __builtin_clzl (__m);
	  /* Isolate the selected source bit and shift it down into
	     its packed result position.  */
	  __t = (__X & (__mask >> __c)) >> (__p - __c);
	  __m ^= (__mask >> __c);
	  __result |= (__t);
	  __p++;
	}
    }
  return __result;
}

/* These 32-bit implementations depend on the 64-bit pdep/pext above,
   which in turn depend on _ARCH_PWR7.  */
/* 32-bit parallel-bit-deposit: zero-extend both operands and reuse the
   64-bit routine; the upper mask bits are zero, so the result fits in
   32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  return (unsigned int) _pdep_u64 (__X, __Y);
}
/* 32-bit parallel-bit-extract: zero-extend both operands and reuse the
   64-bit routine; the upper mask bits are zero, so the result fits in
   32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  return (unsigned int) _pext_u64 (__X, __Y);
}
#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */

#endif /* _BMI2INTRIN_H_INCLUDED */