/* Copyright (C) 2011-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */

#if !defined _X86GPRINTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
34 1.1 mrg #endif 35 1.1 mrg 36 1.1 mrg #ifndef _BMI2INTRIN_H_INCLUDED 37 1.1 mrg #define _BMI2INTRIN_H_INCLUDED 38 1.1 mrg 39 1.1 mrg extern __inline unsigned int 40 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 41 1.1 mrg _bzhi_u32 (unsigned int __X, unsigned int __Y) 42 1.1 mrg { 43 1.1 mrg return ((__X << (32 - __Y)) >> (32 - __Y)); 44 1.1 mrg } 45 1.1 mrg 46 1.1 mrg extern __inline unsigned int 47 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 48 1.1 mrg _mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P) 49 1.1 mrg { 50 1.1 mrg unsigned long long __res = (unsigned long long) __X * __Y; 51 1.1 mrg *__P = (unsigned int) (__res >> 32); 52 1.1 mrg return (unsigned int) __res; 53 1.1 mrg } 54 1.1 mrg 55 1.1 mrg #ifdef __PPC64__ 56 1.1 mrg extern __inline unsigned long long 57 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 58 1.1 mrg _bzhi_u64 (unsigned long long __X, unsigned long long __Y) 59 1.1 mrg { 60 1.1 mrg return ((__X << (64 - __Y)) >> (64 - __Y)); 61 1.1 mrg } 62 1.1 mrg 63 1.1 mrg /* __int128 requires base 64-bit. */ 64 1.1 mrg extern __inline unsigned long long 65 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 66 1.1 mrg _mulx_u64 (unsigned long long __X, unsigned long long __Y, 67 1.1 mrg unsigned long long *__P) 68 1.1 mrg { 69 1.1 mrg unsigned __int128 __res = (unsigned __int128) __X * __Y; 70 1.1 mrg *__P = (unsigned long long) (__res >> 64); 71 1.1 mrg return (unsigned long long) __res; 72 1.1 mrg } 73 1.1 mrg 74 1.1 mrg #ifdef _ARCH_PWR7 75 1.1 mrg /* popcount and bpermd require power7 minimum. 
/* Deposit the low-order bits of __X into the positions selected by
   the one bits of __M (x86 PDEP), working from the most significant
   mask bit downward.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __result = 0x0UL;
  unsigned long __m = __M;
  unsigned long __p;

  /* The pop-count of the mask gives the number of the bits from
     source to process.  This is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl (__M);

  /* One iteration per '1' bit in the mask, clearing each mask bit
     as it is processed.  */
  while (__m != 0)
    {
      unsigned long __c = __builtin_clzl (__m);
      unsigned long __bit = __mask >> __c;
      __result |= (__X << (__p - __c)) & __bit;
      __m ^= __bit;
      __p++;
    }
  return __result;
}

/* Extract the bits of __X selected by the one bits of __M and pack
   them into the low-order bits of the result (x86 PEXT).  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  unsigned long __m = __M;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or less we can use
     the Power8 Bit permute instruction.  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* With a constant pext mask the popcount is constant too, so
	 this loop folds at compile time into a constant bit permute
	 control vector for bpermd.  */
      long __i;
      for (__i = 0; __i < __builtin_popcountl (__M); __i++)
	{
	  unsigned long __c = __builtin_clzl (__m);
	  __p = (__p << 8) | __c;
	  __m ^= (__mask >> __c);
	}
      __result = __builtin_bpermd (__p, __X);
    }
  else
    {
      __p = 64 - __builtin_popcountl (__M);
      __result = 0;
      /* We could a use a for loop here, but that combined with
	 -funroll-loops can expand to a lot of code.  The while
	 loop avoids unrolling and the compiler commons the xor
	 from clearing the mask bit with the (m != 0) test.  The
	 result is a more compact loop setup and body.  */
      while (__m != 0)
	{
	  unsigned long __c = __builtin_clzl (__m);
	  unsigned long __bit = __mask >> __c;
	  __result |= (__X & __bit) >> (__p - __c);
	  __m ^= __bit;
	  __p++;
	}
    }
  return __result;
}

/* These 32-bit implementations depend on 64-bit pdep/pext
   which depend on _ARCH_PWR7.  */
*/ 154 1.1 mrg extern __inline unsigned int 155 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 156 1.1 mrg _pdep_u32 (unsigned int __X, unsigned int __Y) 157 1.1 mrg { 158 1.1 mrg return _pdep_u64 (__X, __Y); 159 1.1 mrg } 160 1.1 mrg 161 1.1 mrg extern __inline unsigned int 162 1.1 mrg __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 163 1.1 mrg _pext_u32 (unsigned int __X, unsigned int __Y) 164 1.1 mrg { 165 1.1 mrg return _pext_u64 (__X, __Y); 166 1.1 mrg } 167 1.1 mrg #endif /* _ARCH_PWR7 */ 168 1.1 mrg #endif /* __PPC64__ */ 169 1.1 mrg 170 1.1 mrg #endif /* _BMI2INTRIN_H_INCLUDED */ 171