1b8e80941Smrg/**************************************************************************
2b8e80941Smrg *
3b8e80941Smrg * Copyright 2008 Dennis Smit
4b8e80941Smrg * All Rights Reserved.
5b8e80941Smrg *
6b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
7b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
8b8e80941Smrg * to deal in the Software without restriction, including without limitation
9b8e80941Smrg * on the rights to use, copy, modify, merge, publish, distribute, sub
10b8e80941Smrg * license, and/or sell copies of the Software, and to permit persons to whom
11b8e80941Smrg * the Software is furnished to do so, subject to the following conditions:
12b8e80941Smrg *
13b8e80941Smrg * The above copyright notice and this permission notice (including the next
14b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the
15b8e80941Smrg * Software.
16b8e80941Smrg *
17b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
20b8e80941Smrg * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
21b8e80941Smrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
22b8e80941Smrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
23b8e80941Smrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
24b8e80941Smrg *
25b8e80941Smrg **************************************************************************/
26b8e80941Smrg
27b8e80941Smrg/**
28b8e80941Smrg * @file
29b8e80941Smrg * CPU feature detection.
30b8e80941Smrg *
31b8e80941Smrg * @author Dennis Smit
32b8e80941Smrg * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
33b8e80941Smrg */
34b8e80941Smrg
35b8e80941Smrg#include "pipe/p_config.h"
36b8e80941Smrg
37b8e80941Smrg#include "util/u_debug.h"
38b8e80941Smrg#include "u_cpu_detect.h"
39b8e80941Smrg#include "c11/threads.h"
40b8e80941Smrg
41b8e80941Smrg#if defined(PIPE_ARCH_PPC)
42b8e80941Smrg#if defined(PIPE_OS_APPLE)
43b8e80941Smrg#include <sys/sysctl.h>
44b8e80941Smrg#else
45b8e80941Smrg#include <signal.h>
46b8e80941Smrg#include <setjmp.h>
47b8e80941Smrg#endif
48b8e80941Smrg#endif
49b8e80941Smrg
50b8e80941Smrg#if defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
51b8e80941Smrg#include <sys/param.h>
52b8e80941Smrg#include <sys/sysctl.h>
53b8e80941Smrg#include <machine/cpu.h>
54b8e80941Smrg#endif
55b8e80941Smrg
56b8e80941Smrg#if defined(PIPE_OS_FREEBSD) || defined(PIPE_OS_DRAGONFLY)
57b8e80941Smrg#include <sys/types.h>
58b8e80941Smrg#include <sys/sysctl.h>
59b8e80941Smrg#endif
60b8e80941Smrg
61b8e80941Smrg#if defined(PIPE_OS_LINUX)
62b8e80941Smrg#include <signal.h>
63b8e80941Smrg#include <fcntl.h>
64b8e80941Smrg#include <elf.h>
65b8e80941Smrg#endif
66b8e80941Smrg
67b8e80941Smrg#ifdef PIPE_OS_UNIX
68b8e80941Smrg#include <unistd.h>
69b8e80941Smrg#endif
70b8e80941Smrg
71b8e80941Smrg#if defined(HAS_ANDROID_CPUFEATURES)
72b8e80941Smrg#include <cpu-features.h>
73b8e80941Smrg#endif
74b8e80941Smrg
75b8e80941Smrg#if defined(PIPE_OS_WINDOWS)
76b8e80941Smrg#include <windows.h>
77b8e80941Smrg#if defined(PIPE_CC_MSVC)
78b8e80941Smrg#include <intrin.h>
79b8e80941Smrg#endif
80b8e80941Smrg#endif
81b8e80941Smrg
82b8e80941Smrg
83b8e80941Smrg#ifdef DEBUG
84b8e80941SmrgDEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE)
85b8e80941Smrg#endif
86b8e80941Smrg
87b8e80941Smrg
/**
 * CPU capability record defined by this file.  util_cpu_detect_once() zeroes
 * it and then fills in the fields for the machine the code is running on.
 */
88b8e80941Smrgstruct util_cpu_caps util_cpu_caps;
89b8e80941Smrg
90b8e80941Smrg#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
91b8e80941Smrgstatic int has_cpuid(void);
92b8e80941Smrg#endif
93b8e80941Smrg
94b8e80941Smrg
95b8e80941Smrg#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE)
/* Jump target armed by check_os_altivec_support() around its probe code. */
static jmp_buf  __lv_powerpc_jmpbuf;
/* Non-zero only while a probe is in flight and the jump target is valid. */
static volatile sig_atomic_t __lv_powerpc_canjump = 0;

/**
 * SIGILL handler used while probing for optional PowerPC instructions.
 *
 * If a probe is armed (__lv_powerpc_canjump != 0) the flag is cleared and
 * control is transferred back to the setjmp() point in the prober.
 *
 * Otherwise the signal did not come from a probe: the default disposition is
 * restored and the signal is raised again.  The handler then returns instead
 * of falling through to longjmp(): when the signal is blocked for the
 * duration of the handler, raise() merely leaves it pending, and jumping
 * through a jmp_buf that is stale or was never initialised is undefined
 * behaviour.
 *
 * @param sig  signal number being handled.
 */
static void
sigill_handler(int sig)
{
   if (!__lv_powerpc_canjump) {
      signal(sig, SIG_DFL);
      raise(sig);
      return;
   }

   __lv_powerpc_canjump = 0;
   longjmp(__lv_powerpc_jmpbuf, 1);
}
110b8e80941Smrg#endif
111b8e80941Smrg
112b8e80941Smrg#if defined(PIPE_ARCH_PPC)
113b8e80941Smrgstatic void
114b8e80941Smrgcheck_os_altivec_support(void)
115b8e80941Smrg{
116b8e80941Smrg#if defined(PIPE_OS_APPLE)
117b8e80941Smrg   int sels[2] = {CTL_HW, HW_VECTORUNIT};
118b8e80941Smrg   int has_vu = 0;
119b8e80941Smrg   int len = sizeof (has_vu);
120b8e80941Smrg   int err;
121b8e80941Smrg
122b8e80941Smrg   err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
123b8e80941Smrg
124b8e80941Smrg   if (err == 0) {
125b8e80941Smrg      if (has_vu != 0) {
126b8e80941Smrg         util_cpu_caps.has_altivec = 1;
127b8e80941Smrg      }
128b8e80941Smrg   }
129b8e80941Smrg#else /* !PIPE_OS_APPLE */
130b8e80941Smrg   /* not on Apple/Darwin, do it the brute-force way */
131b8e80941Smrg   /* this is borrowed from the libmpeg2 library */
132b8e80941Smrg   signal(SIGILL, sigill_handler);
133b8e80941Smrg   if (setjmp(__lv_powerpc_jmpbuf)) {
134b8e80941Smrg      signal(SIGILL, SIG_DFL);
135b8e80941Smrg   } else {
136b8e80941Smrg      boolean enable_altivec = TRUE;    /* Default: enable  if available, and if not overridden */
137b8e80941Smrg      boolean enable_vsx = TRUE;
138b8e80941Smrg#ifdef DEBUG
139b8e80941Smrg      /* Disabling Altivec code generation is not the same as disabling VSX code generation,
140b8e80941Smrg       * which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf.
141b8e80941Smrg       * lp_build_create_jit_compiler_for_module().
142b8e80941Smrg       * If you want to disable Altivec code generation, the best place to do it is here.
143b8e80941Smrg       */
144b8e80941Smrg      char *env_control = getenv("GALLIVM_ALTIVEC");    /* 1=enable (default); 0=disable */
145b8e80941Smrg      if (env_control && env_control[0] == '0') {
146b8e80941Smrg         enable_altivec = FALSE;
147b8e80941Smrg      }
148b8e80941Smrg#endif
149b8e80941Smrg      /* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */
150b8e80941Smrg      char *env_vsx = getenv("GALLIVM_VSX");
151b8e80941Smrg      if (env_vsx && env_vsx[0] == '0') {
152b8e80941Smrg         enable_vsx = FALSE;
153b8e80941Smrg      }
154b8e80941Smrg      if (enable_altivec) {
155b8e80941Smrg         __lv_powerpc_canjump = 1;
156b8e80941Smrg
157b8e80941Smrg         __asm __volatile
158b8e80941Smrg            ("mtspr 256, %0\n\t"
159b8e80941Smrg             "vand %%v0, %%v0, %%v0"
160b8e80941Smrg             :
161b8e80941Smrg             : "r" (-1));
162b8e80941Smrg
163b8e80941Smrg         util_cpu_caps.has_altivec = 1;
164b8e80941Smrg
165b8e80941Smrg         if (enable_vsx) {
166b8e80941Smrg            __asm __volatile("xxland %vs0, %vs0, %vs0");
167b8e80941Smrg            util_cpu_caps.has_vsx = 1;
168b8e80941Smrg         }
169b8e80941Smrg         signal(SIGILL, SIG_DFL);
170b8e80941Smrg      } else {
171b8e80941Smrg         util_cpu_caps.has_altivec = 0;
172b8e80941Smrg      }
173b8e80941Smrg   }
174b8e80941Smrg#endif /* !PIPE_OS_APPLE */
175b8e80941Smrg}
176b8e80941Smrg#endif /* PIPE_ARCH_PPC */
177b8e80941Smrg
178b8e80941Smrg
179b8e80941Smrg#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
/**
 * Report whether the CPUID instruction can be used.
 *
 * On 32-bit x86 built with GCC-style inline assembly this toggles the ID bit
 * (0x200000) in EFLAGS and checks whether the change sticks.  The guard uses
 * PIPE_CC_GCC, the compiler macro used everywhere else in this file; the
 * previous PIPE_OS_GCC spelling left the probe compiled out.  Other 32-bit
 * compilers and all x86-64 builds simply assume CPUID is present.
 *
 * @return non-zero if CPUID is available, 0 otherwise (including on
 *         non-x86 architectures).
 */
static int has_cpuid(void)
{
#if defined(PIPE_ARCH_X86)
#if defined(PIPE_CC_GCC)
   int a, c;

   __asm __volatile
      ("pushf\n"
       "popl %0\n"
       "movl %0, %1\n"
       "xorl $0x200000, %0\n"
       "push %0\n"
       "popf\n"
       "pushf\n"
       "popl %0\n"
       : "=a" (a), "=c" (c)
       :
       : "cc");

   /* a holds EFLAGS after the toggle attempt, c the value before it */
   return a != c;
#else
   /* FIXME */
   return 1;
#endif
#elif defined(PIPE_ARCH_X86_64)
   return 1;
#else
   return 0;
#endif
}
210b8e80941Smrg
211b8e80941Smrg
/**
 * Execute CPUID for leaf @p ax and store EAX, EBX, ECX and EDX in
 * p[0]..p[3], in that order.
 *
 * The 32-bit GCC variant swaps EBX with a scratch register around the
 * instruction rather than naming EBX as an output.  Builds that match none
 * of the compiler branches report all four registers as zero.
 *
 * @sa cpuid.h included in gcc-4.3 onwards.
 * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
 */
static inline void
cpuid(uint32_t ax, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   uint32_t out_a, out_b, out_c, out_d;

   __asm __volatile (
     "xchgl %%ebx, %1\n\t"
     "cpuid\n\t"
     "xchgl %%ebx, %1"
     : "=a" (out_a),
       "=S" (out_b),
       "=c" (out_c),
       "=d" (out_d)
     : "0" (ax)
   );

   p[0] = out_a;
   p[1] = out_b;
   p[2] = out_c;
   p[3] = out_d;
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
   uint32_t out_a, out_b, out_c, out_d;

   __asm __volatile (
     "cpuid\n\t"
     : "=a" (out_a),
       "=b" (out_b),
       "=c" (out_c),
       "=d" (out_d)
     : "0" (ax)
   );

   p[0] = out_a;
   p[1] = out_b;
   p[2] = out_c;
   p[3] = out_d;
#elif defined(PIPE_CC_MSVC)
   __cpuid(p, ax);
#else
   unsigned idx;

   /* No way to issue CPUID here: report an empty result. */
   for (idx = 0; idx < 4; idx++)
      p[idx] = 0;
#endif
}
248b8e80941Smrg
/**
 * Execute CPUID for leaf @p ax with sub-leaf @p cx (loaded into ECX) and
 * store EAX, EBX, ECX and EDX in p[0]..p[3], in that order.
 *
 * The 32-bit GCC variant swaps EBX with a scratch register around the
 * instruction rather than naming EBX as an output.  Builds that match none
 * of the compiler branches report all four registers as zero.
 *
 * @sa cpuid.h included in gcc-4.4 onwards.
 * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
 */
static inline void
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   uint32_t out_a, out_b, out_c, out_d;

   __asm __volatile (
     "xchgl %%ebx, %1\n\t"
     "cpuid\n\t"
     "xchgl %%ebx, %1"
     : "=a" (out_a),
       "=S" (out_b),
       "=c" (out_c),
       "=d" (out_d)
     : "0" (ax), "2" (cx)
   );

   p[0] = out_a;
   p[1] = out_b;
   p[2] = out_c;
   p[3] = out_d;
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
   uint32_t out_a, out_b, out_c, out_d;

   __asm __volatile (
     "cpuid\n\t"
     : "=a" (out_a),
       "=b" (out_b),
       "=c" (out_c),
       "=d" (out_d)
     : "0" (ax), "2" (cx)
   );

   p[0] = out_a;
   p[1] = out_b;
   p[2] = out_c;
   p[3] = out_d;
#elif defined(PIPE_CC_MSVC)
   __cpuidex(p, ax, cx);
#else
   unsigned idx;

   /* No way to issue CPUID here: report an empty result. */
   for (idx = 0; idx < 4; idx++)
      p[idx] = 0;
#endif
}
285b8e80941Smrg
286b8e80941Smrg
/**
 * Read extended control register 0 (XCR0) with XGETBV.
 *
 * The GCC variant emits the raw opcode bytes with ECX = 0 and combines
 * EDX:EAX into one 64-bit value.  Builds without a usable implementation
 * return 0.
 */
static inline uint64_t xgetbv(void)
{
#if defined(PIPE_CC_GCC)
   uint32_t lo, hi;
   uint64_t value;

   __asm __volatile (
     ".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
     : "=a"(lo),
       "=d"(hi)
     : "c"(0)
   );

   value = (uint64_t)hi << 32;
   value |= lo;
   return value;
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
   return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
   return 0;
#endif
}
306b8e80941Smrg
307b8e80941Smrg
308b8e80941Smrg#if defined(PIPE_ARCH_X86)
309b8e80941SmrgPIPE_ALIGN_STACK static inline boolean sse2_has_daz(void)
310b8e80941Smrg{
311b8e80941Smrg   struct {
312b8e80941Smrg      uint32_t pad1[7];
313b8e80941Smrg      uint32_t mxcsr_mask;
314b8e80941Smrg      uint32_t pad2[128-8];
315b8e80941Smrg   } PIPE_ALIGN_VAR(16) fxarea;
316b8e80941Smrg
317b8e80941Smrg   fxarea.mxcsr_mask = 0;
318b8e80941Smrg#if defined(PIPE_CC_GCC)
319b8e80941Smrg   __asm __volatile ("fxsave %0" : "+m" (fxarea));
320b8e80941Smrg#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL)
321b8e80941Smrg   _fxsave(&fxarea);
322b8e80941Smrg#else
323b8e80941Smrg   fxarea.mxcsr_mask = 0;
324b8e80941Smrg#endif
325b8e80941Smrg   return !!(fxarea.mxcsr_mask & (1 << 6));
326b8e80941Smrg}
327b8e80941Smrg#endif
328b8e80941Smrg
329b8e80941Smrg#endif /* X86 or X86_64 */
330b8e80941Smrg
331b8e80941Smrg#if defined(PIPE_ARCH_ARM)
/**
 * Detect NEON on 32-bit ARM and record it in util_cpu_caps.has_neon.
 *
 * With the Android cpufeatures library the answer comes from
 * android_getCpuFamily() / android_getCpuFeatures().  On other Linux builds
 * /proc/self/auxv is scanned for the AT_HWCAP entry and bit 12 of its value
 * is used.  In every other configuration the flag is left untouched.
 */
static void
check_os_arm_support(void)
{
   /*
    * The cpufeatures library is the preferred source on Android, but it is
    * only available when Mesa is built by the Android.mk-based build system.
    * PIPE_OS_ANDROID therefore cannot be used here; a dedicated macro set
    * from the respective Android.mk selects this path instead.
    */
#if defined(HAS_ANDROID_CPUFEATURES)
   AndroidCpuFamily family = android_getCpuFamily();
   uint64_t features = android_getCpuFeatures();

   if (family == ANDROID_CPU_FAMILY_ARM &&
       (features & ANDROID_CPU_ARM_FEATURE_NEON)) {
      util_cpu_caps.has_neon = 1;
   }
#elif defined(PIPE_OS_LINUX)
   Elf32_auxv_t entry;
   int auxv_fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);

   if (auxv_fd < 0)
      return;

   /* Walk the vector one record at a time until AT_HWCAP or a short read. */
   for (;;) {
      uint32_t hwcap;

      if (read(auxv_fd, &entry, sizeof(Elf32_auxv_t)) != sizeof(Elf32_auxv_t))
         break;
      if (entry.a_type != AT_HWCAP)
         continue;

      hwcap = entry.a_un.a_val;
      util_cpu_caps.has_neon = (hwcap >> 12) & 1;
      break;
   }
   close(auxv_fd);
#endif /* PIPE_OS_LINUX */
}
368b8e80941Smrg
369b8e80941Smrg#elif defined(PIPE_ARCH_AARCH64)
370b8e80941Smrgstatic void
371b8e80941Smrgcheck_os_arm_support(void)
372b8e80941Smrg{
373b8e80941Smrg    util_cpu_caps.has_neon = true;
374b8e80941Smrg}
375b8e80941Smrg#endif /* PIPE_ARCH_ARM || PIPE_ARCH_AARCH64 */
376b8e80941Smrg
377b8e80941Smrgstatic void
378b8e80941Smrgget_cpu_topology(void)
379b8e80941Smrg{
380b8e80941Smrg   /* Default. This is correct if L3 is not present or there is only one. */
381b8e80941Smrg   util_cpu_caps.cores_per_L3 = util_cpu_caps.nr_cpus;
382b8e80941Smrg
383b8e80941Smrg#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
384b8e80941Smrg   /* AMD Zen */
385b8e80941Smrg   if (util_cpu_caps.x86_cpu_type == 0x17) {
386b8e80941Smrg      uint32_t regs[4];
387b8e80941Smrg
388b8e80941Smrg      /* Query the L3 cache topology information. */
389b8e80941Smrg      cpuid_count(0x8000001D, 3, regs);
390b8e80941Smrg      unsigned cache_level = (regs[0] >> 5) & 0x7;
391b8e80941Smrg      unsigned cores_per_cache = ((regs[0] >> 14) & 0xfff) + 1;
392b8e80941Smrg
393b8e80941Smrg      if (cache_level == 3)
394b8e80941Smrg         util_cpu_caps.cores_per_L3 = cores_per_cache;
395b8e80941Smrg   }
396b8e80941Smrg#endif
397b8e80941Smrg}
398b8e80941Smrg
399b8e80941Smrgstatic void
400b8e80941Smrgutil_cpu_detect_once(void)
401b8e80941Smrg{
402b8e80941Smrg   memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
403b8e80941Smrg
404b8e80941Smrg   /* Count the number of CPUs in system */
405b8e80941Smrg#if defined(PIPE_OS_WINDOWS)
406b8e80941Smrg   {
407b8e80941Smrg      SYSTEM_INFO system_info;
408b8e80941Smrg      GetSystemInfo(&system_info);
409b8e80941Smrg      util_cpu_caps.nr_cpus = system_info.dwNumberOfProcessors;
410b8e80941Smrg   }
411b8e80941Smrg#elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
412b8e80941Smrg   util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
413b8e80941Smrg   if (util_cpu_caps.nr_cpus == ~0)
414b8e80941Smrg      util_cpu_caps.nr_cpus = 1;
415b8e80941Smrg#elif defined(PIPE_OS_BSD)
416b8e80941Smrg   {
417b8e80941Smrg      int mib[2], ncpu;
418b8e80941Smrg      int len;
419b8e80941Smrg
420b8e80941Smrg      mib[0] = CTL_HW;
421b8e80941Smrg      mib[1] = HW_NCPU;
422b8e80941Smrg
423b8e80941Smrg      len = sizeof (ncpu);
424b8e80941Smrg      sysctl(mib, 2, &ncpu, &len, NULL, 0);
425b8e80941Smrg      util_cpu_caps.nr_cpus = ncpu;
426b8e80941Smrg   }
427b8e80941Smrg#else
428b8e80941Smrg   util_cpu_caps.nr_cpus = 1;
429b8e80941Smrg#endif
430b8e80941Smrg
431b8e80941Smrg   /* Make the fallback cacheline size nonzero so that it can be
432b8e80941Smrg    * safely passed to align().
433b8e80941Smrg    */
434b8e80941Smrg   util_cpu_caps.cacheline = sizeof(void *);
435b8e80941Smrg
436b8e80941Smrg#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
437b8e80941Smrg   if (has_cpuid()) {
438b8e80941Smrg      uint32_t regs[4];
439b8e80941Smrg      uint32_t regs2[4];
440b8e80941Smrg
441b8e80941Smrg      util_cpu_caps.cacheline = 32;
442b8e80941Smrg
443b8e80941Smrg      /* Get max cpuid level */
444b8e80941Smrg      cpuid(0x00000000, regs);
445b8e80941Smrg
446b8e80941Smrg      if (regs[0] >= 0x00000001) {
447b8e80941Smrg         unsigned int cacheline;
448b8e80941Smrg
449b8e80941Smrg         cpuid (0x00000001, regs2);
450b8e80941Smrg
451b8e80941Smrg         util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
452b8e80941Smrg         /* Add "extended family". */
453b8e80941Smrg         if (util_cpu_caps.x86_cpu_type == 0xf)
454b8e80941Smrg             util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff);
455b8e80941Smrg
456b8e80941Smrg         /* general feature flags */
457b8e80941Smrg         util_cpu_caps.has_tsc    = (regs2[3] >>  4) & 1; /* 0x0000010 */
458b8e80941Smrg         util_cpu_caps.has_mmx    = (regs2[3] >> 23) & 1; /* 0x0800000 */
459b8e80941Smrg         util_cpu_caps.has_sse    = (regs2[3] >> 25) & 1; /* 0x2000000 */
460b8e80941Smrg         util_cpu_caps.has_sse2   = (regs2[3] >> 26) & 1; /* 0x4000000 */
461b8e80941Smrg         util_cpu_caps.has_sse3   = (regs2[2] >>  0) & 1; /* 0x0000001 */
462b8e80941Smrg         util_cpu_caps.has_ssse3  = (regs2[2] >>  9) & 1; /* 0x0000020 */
463b8e80941Smrg         util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
464b8e80941Smrg         util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
465b8e80941Smrg         util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1;
466b8e80941Smrg         util_cpu_caps.has_avx    = ((regs2[2] >> 28) & 1) && // AVX
467b8e80941Smrg                                    ((regs2[2] >> 27) & 1) && // OSXSAVE
468b8e80941Smrg                                    ((xgetbv() & 6) == 6);    // XMM & YMM
469b8e80941Smrg         util_cpu_caps.has_f16c   = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx;
470b8e80941Smrg         util_cpu_caps.has_fma    = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx;
471b8e80941Smrg         util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
472b8e80941Smrg#if defined(PIPE_ARCH_X86_64)
473b8e80941Smrg         util_cpu_caps.has_daz = 1;
474b8e80941Smrg#else
475b8e80941Smrg         util_cpu_caps.has_daz = util_cpu_caps.has_sse3 ||
476b8e80941Smrg            (util_cpu_caps.has_sse2 && sse2_has_daz());
477b8e80941Smrg#endif
478b8e80941Smrg
479b8e80941Smrg         cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
480b8e80941Smrg         if (cacheline > 0)
481b8e80941Smrg            util_cpu_caps.cacheline = cacheline;
482b8e80941Smrg      }
483b8e80941Smrg      if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
484b8e80941Smrg         uint32_t regs7[4];
485b8e80941Smrg         cpuid_count(0x00000007, 0x00000000, regs7);
486b8e80941Smrg         util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
487b8e80941Smrg      }
488b8e80941Smrg
489b8e80941Smrg      // check for avx512
490b8e80941Smrg      if (((regs2[2] >> 27) & 1) && // OSXSAVE
491b8e80941Smrg          (xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS
492b8e80941Smrg          ((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS
493b8e80941Smrg         uint32_t regs3[4];
494b8e80941Smrg         cpuid_count(0x00000007, 0x00000000, regs3);
495b8e80941Smrg         util_cpu_caps.has_avx512f    = (regs3[1] >> 16) & 1;
496b8e80941Smrg         util_cpu_caps.has_avx512dq   = (regs3[1] >> 17) & 1;
497b8e80941Smrg         util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1;
498b8e80941Smrg         util_cpu_caps.has_avx512pf   = (regs3[1] >> 26) & 1;
499b8e80941Smrg         util_cpu_caps.has_avx512er   = (regs3[1] >> 27) & 1;
500b8e80941Smrg         util_cpu_caps.has_avx512cd   = (regs3[1] >> 28) & 1;
501b8e80941Smrg         util_cpu_caps.has_avx512bw   = (regs3[1] >> 30) & 1;
502b8e80941Smrg         util_cpu_caps.has_avx512vl   = (regs3[1] >> 31) & 1;
503b8e80941Smrg         util_cpu_caps.has_avx512vbmi = (regs3[2] >>  1) & 1;
504b8e80941Smrg      }
505b8e80941Smrg
506b8e80941Smrg      if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
507b8e80941Smrg         /* GenuineIntel */
508b8e80941Smrg         util_cpu_caps.has_intel = 1;
509b8e80941Smrg      }
510b8e80941Smrg
511b8e80941Smrg      cpuid(0x80000000, regs);
512b8e80941Smrg
513b8e80941Smrg      if (regs[0] >= 0x80000001) {
514b8e80941Smrg
515b8e80941Smrg         cpuid(0x80000001, regs2);
516b8e80941Smrg
517b8e80941Smrg         util_cpu_caps.has_mmx  |= (regs2[3] >> 23) & 1;
518b8e80941Smrg         util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
519b8e80941Smrg         util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
520b8e80941Smrg         util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
521b8e80941Smrg
522b8e80941Smrg         util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
523b8e80941Smrg                                 ((regs2[2] >> 11) & 1);
524b8e80941Smrg      }
525b8e80941Smrg
526b8e80941Smrg      if (regs[0] >= 0x80000006) {
527b8e80941Smrg         /* should we really do this if the clflush size above worked? */
528b8e80941Smrg         unsigned int cacheline;
529b8e80941Smrg         cpuid(0x80000006, regs2);
530b8e80941Smrg         cacheline = regs2[2] & 0xFF;
531b8e80941Smrg         if (cacheline > 0)
532b8e80941Smrg            util_cpu_caps.cacheline = cacheline;
533b8e80941Smrg      }
534b8e80941Smrg
535b8e80941Smrg      if (!util_cpu_caps.has_sse) {
536b8e80941Smrg         util_cpu_caps.has_sse2 = 0;
537b8e80941Smrg         util_cpu_caps.has_sse3 = 0;
538b8e80941Smrg         util_cpu_caps.has_ssse3 = 0;
539b8e80941Smrg         util_cpu_caps.has_sse4_1 = 0;
540b8e80941Smrg      }
541b8e80941Smrg   }
542b8e80941Smrg#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
543b8e80941Smrg
544b8e80941Smrg#if defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64)
545b8e80941Smrg   check_os_arm_support();
546b8e80941Smrg#endif
547b8e80941Smrg
548b8e80941Smrg#if defined(PIPE_ARCH_PPC)
549b8e80941Smrg   check_os_altivec_support();
550b8e80941Smrg#endif /* PIPE_ARCH_PPC */
551b8e80941Smrg
552b8e80941Smrg   get_cpu_topology();
553b8e80941Smrg
554b8e80941Smrg#ifdef DEBUG
555b8e80941Smrg   if (debug_get_option_dump_cpu()) {
556b8e80941Smrg      debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
557b8e80941Smrg
558b8e80941Smrg      debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
559b8e80941Smrg      debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
560b8e80941Smrg
561b8e80941Smrg      debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
562b8e80941Smrg      debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
563b8e80941Smrg      debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
564b8e80941Smrg      debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
565b8e80941Smrg      debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
566b8e80941Smrg      debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
567b8e80941Smrg      debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
568b8e80941Smrg      debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
569b8e80941Smrg      debug_printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
570b8e80941Smrg      debug_printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
571b8e80941Smrg      debug_printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
572b8e80941Smrg      debug_printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
573b8e80941Smrg      debug_printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
574b8e80941Smrg      debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
575b8e80941Smrg      debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
576b8e80941Smrg      debug_printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
577b8e80941Smrg      debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
578b8e80941Smrg      debug_printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx);
579b8e80941Smrg      debug_printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon);
580b8e80941Smrg      debug_printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
581b8e80941Smrg      debug_printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f);
582b8e80941Smrg      debug_printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq);
583b8e80941Smrg      debug_printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma);
584b8e80941Smrg      debug_printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf);
585b8e80941Smrg      debug_printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er);
586b8e80941Smrg      debug_printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd);
587b8e80941Smrg      debug_printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw);
588b8e80941Smrg      debug_printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl);
589b8e80941Smrg      debug_printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi);
590b8e80941Smrg   }
591b8e80941Smrg#endif
592b8e80941Smrg}
593b8e80941Smrg
594b8e80941Smrgstatic once_flag cpu_once_flag = ONCE_FLAG_INIT;
595b8e80941Smrg
596b8e80941Smrgvoid
597b8e80941Smrgutil_cpu_detect(void)
598b8e80941Smrg{
599b8e80941Smrg   call_once(&cpu_once_flag, util_cpu_detect_once);
600b8e80941Smrg}
601