/* Copyright (C) 2011-2022 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */

#if !defined _X86GPRINTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED

/* Zero the high bits of __X starting at bit index __Y (x86 BZHI).
   Per the instruction definition only the low 8 bits of __Y form the
   index, and an index >= the operand width returns the source
   unchanged.  The previous double-shift form
   ((__X << (32 - __Y)) >> (32 - __Y)) invoked undefined behavior for
   __Y == 0 (shift count equal to the type width).  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  unsigned int __idx = __Y & 0xffu;	/* BZHI uses SRC2[7:0] only.  */

  if (__idx >= 32)
    return __X;
  return __X & ((1u << __idx) - 1);
}
/* Unsigned 32x32 -> 64-bit multiply (x86 MULX): the high half of the
   product is stored through __P, the low half is returned.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  /* Widen one operand so the multiply keeps all 64 product bits.  */
  unsigned long long __prod = (unsigned long long) __X * __Y;

  *__P = (unsigned int) (__prod >> 32);
  return (unsigned int) __prod;
}

#ifdef __PPC64__
/* Zero the high bits of __X starting at bit index __Y (x86 BZHI,
   64-bit form).  Only the low 8 bits of __Y form the index, and an
   index >= 64 returns the source unchanged.  The previous
   double-shift form ((__X << (64 - __Y)) >> (64 - __Y)) invoked
   undefined behavior for __Y == 0 (shift count equal to the type
   width).  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  unsigned long long __idx = __Y & 0xffULL;	/* BZHI uses SRC2[7:0].  */

  if (__idx >= 64)
    return __X;
  return __X & ((1ULL << __idx) - 1);
}

/* __int128 requires a 64-bit base architecture.  */
/* Unsigned 64x64 -> 128-bit multiply (x86 MULX): the high half of the
   product is stored through __P, the low half is returned.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
	   unsigned long long *__P)
{
  /* Widen one operand to 128 bits so the full double-width product is
     formed, then split it into its two halves.  */
  unsigned __int128 __prod = (unsigned __int128) __X * __Y;

  *__P = (unsigned long long) (__prod >> 64);
  return (unsigned long long) __prod;
}

#ifdef _ARCH_PWR7
/* popcount and bpermd require power7 minimum.  */
/* Software parallel-bit-deposit (x86 PDEP): scatter the low
   popcount(__M) bits of __X into the '1' positions of __M, keeping
   their relative order.  Works from the most significant mask bit
   downward.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  const unsigned long __msb = 0x8000000000000000UL;
  unsigned long __remaining = __M;
  unsigned long __acc = 0x0UL;
  unsigned long __shift;

  /* The pop-count of the mask gives the number of source bits to
     place; it also fixes the left shift that lines the highest such
     source bit up with the highest mask bit.  */
  __shift = 64 - __builtin_popcountl (__M);

  /* One iteration per '1' in the mask, clearing each mask bit once it
     has been processed.  */
  while (__remaining != 0)
    {
      unsigned long __lead = __builtin_clzl (__remaining);
      unsigned long __bit = __msb >> __lead;
      unsigned long __shifted = __X << (__shift - __lead);

      __acc |= (__shifted & __bit);
      __remaining ^= __bit;
      __shift++;
    }
  return __acc;
}
/* Software parallel-bit-extract (x86 PEXT): gather the bits of __X
   selected by the '1' positions of __M and pack them, in order, into
   the low-order bits of the result.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or less we can use
     the Power8 Bit permute instruction.  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* Also if the pext mask is constant, then the popcount is
	 constant, we can evaluate the following loop at compile
	 time and use a constant bit permute vector.  */
      long __i;
      for (__i = 0; __i < __builtin_popcountl (__M); __i++)
	{
	  /* Append the MSB-relative index of the next selected bit to
	     the permute-control vector, one byte per selected bit.  */
	  __c = __builtin_clzl (__m);
	  __p = (__p << 8) | __c;
	  __m ^= (__mask >> __c);
	}
      __result = __builtin_bpermd (__p, __X);
    }
  else
    {
      /* General case: __p starts at 64 - popcount(__M) so each
	 extracted bit lands in its packed position, advancing by one
	 per mask bit processed.  */
      __p = 64 - __builtin_popcountl (__M);
      __result = 0;
      /* We could use a for loop here, but that combined with
	 -funroll-loops can expand to a lot of code.  The while
	 loop avoids unrolling and the compiler commons the xor
	 from clearing the mask bit with the (m != 0) test.  The
	 result is a more compact loop setup and body.  */
      while (__m != 0)
	{
	  unsigned long __t;
	  __c = __builtin_clzl (__m);
	  /* Isolate the selected source bit and shift it down into
	     its packed result position.  */
	  __t = (__X & (__mask >> __c)) >> (__p - __c);
	  __m ^= (__mask >> __c);
	  __result |= (__t);
	  __p++;
	}
    }
  return __result;
}

/* These 32-bit implementations depend on the 64-bit pdep/pext above,
   which in turn depend on _ARCH_PWR7.  */
/* 32-bit parallel-bit-deposit: zero-extend both operands and reuse the
   64-bit routine; the upper mask bits are zero, so the result fits in
   32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  return (unsigned int) _pdep_u64 (__X, __Y);
}
/* 32-bit parallel-bit-extract: zero-extend both operands and reuse the
   64-bit routine; the upper mask bits are zero, so the result fits in
   32 bits.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  return (unsigned int) _pext_u64 (__X, __Y);
}
#endif /* _ARCH_PWR7 */
#endif /* __PPC64__ */

#endif /* _BMI2INTRIN_H_INCLUDED */