101e04c3fSmrg/**************************************************************************
201e04c3fSmrg *
301e04c3fSmrg * Copyright 2008 Dennis Smit
401e04c3fSmrg * All Rights Reserved.
501e04c3fSmrg *
601e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
701e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
801e04c3fSmrg * to deal in the Software without restriction, including without limitation
901e04c3fSmrg * on the rights to use, copy, modify, merge, publish, distribute, sub
1001e04c3fSmrg * license, and/or sell copies of the Software, and to permit persons to whom
1101e04c3fSmrg * the Software is furnished to do so, subject to the following conditions:
1201e04c3fSmrg *
1301e04c3fSmrg * The above copyright notice and this permission notice (including the next
1401e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
1501e04c3fSmrg * Software.
1601e04c3fSmrg *
1701e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1801e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1901e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
2001e04c3fSmrg * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
2101e04c3fSmrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
2201e04c3fSmrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
2301e04c3fSmrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
2401e04c3fSmrg *
2501e04c3fSmrg **************************************************************************/
2601e04c3fSmrg
2701e04c3fSmrg/**
2801e04c3fSmrg * @file
2901e04c3fSmrg * CPU feature detection.
3001e04c3fSmrg *
3101e04c3fSmrg * @author Dennis Smit
3201e04c3fSmrg * @author Based on the work of Eric Anholt <anholt@FreeBSD.org>
3301e04c3fSmrg */
3401e04c3fSmrg
3501e04c3fSmrg#include "pipe/p_config.h"
367ec681f3Smrg#include "pipe/p_compiler.h"
3701e04c3fSmrg
3801e04c3fSmrg#include "util/u_debug.h"
3901e04c3fSmrg#include "u_cpu_detect.h"
407ec681f3Smrg#include "u_math.h"
4101e04c3fSmrg#include "c11/threads.h"
4201e04c3fSmrg
437ec681f3Smrg#include <stdio.h>
447ec681f3Smrg#include <inttypes.h>
457ec681f3Smrg
4601e04c3fSmrg#if defined(PIPE_ARCH_PPC)
4701e04c3fSmrg#if defined(PIPE_OS_APPLE)
4801e04c3fSmrg#include <sys/sysctl.h>
4901e04c3fSmrg#else
5001e04c3fSmrg#include <signal.h>
5101e04c3fSmrg#include <setjmp.h>
5201e04c3fSmrg#endif
5301e04c3fSmrg#endif
5401e04c3fSmrg
557ec681f3Smrg#if defined(PIPE_OS_BSD)
5601e04c3fSmrg#include <sys/param.h>
5701e04c3fSmrg#include <sys/sysctl.h>
5801e04c3fSmrg#include <machine/cpu.h>
5901e04c3fSmrg#endif
6001e04c3fSmrg
617ec681f3Smrg#if defined(PIPE_OS_FREEBSD)
627ec681f3Smrg#if __has_include(<sys/auxv.h>)
637ec681f3Smrg#include <sys/auxv.h>
647ec681f3Smrg#define HAVE_ELF_AUX_INFO
657ec681f3Smrg#endif
6601e04c3fSmrg#endif
6701e04c3fSmrg
6801e04c3fSmrg#if defined(PIPE_OS_LINUX)
6901e04c3fSmrg#include <signal.h>
7001e04c3fSmrg#include <fcntl.h>
7101e04c3fSmrg#include <elf.h>
7201e04c3fSmrg#endif
7301e04c3fSmrg
7401e04c3fSmrg#ifdef PIPE_OS_UNIX
7501e04c3fSmrg#include <unistd.h>
7601e04c3fSmrg#endif
7701e04c3fSmrg
7801e04c3fSmrg#if defined(HAS_ANDROID_CPUFEATURES)
7901e04c3fSmrg#include <cpu-features.h>
8001e04c3fSmrg#endif
8101e04c3fSmrg
8201e04c3fSmrg#if defined(PIPE_OS_WINDOWS)
8301e04c3fSmrg#include <windows.h>
8401e04c3fSmrg#if defined(PIPE_CC_MSVC)
8501e04c3fSmrg#include <intrin.h>
8601e04c3fSmrg#endif
8701e04c3fSmrg#endif
8801e04c3fSmrg
897ec681f3Smrg#if defined(HAS_SCHED_H)
907ec681f3Smrg#include <sched.h>
9101e04c3fSmrg#endif
9201e04c3fSmrg
/* Defines the "dump_cpu" debug option (name string "GALLIUM_DUMP_CPU",
 * default false); it is queried through debug_get_option_dump_cpu().
 */
DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", false)


/* Global CPU capability record.  util_cpu_detect_once() zeroes it and then
 * fills it in with the detected CPU count, cache and feature information.
 */
struct util_cpu_caps_t util_cpu_caps;
9701e04c3fSmrg
9801e04c3fSmrg#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
/* Forward declaration; the definition follows later in this file. */
static int has_cpuid(void);
10001e04c3fSmrg#endif
10101e04c3fSmrg
10201e04c3fSmrg
1037ec681f3Smrg#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) && !defined(PIPE_OS_BSD) && !defined(PIPE_OS_LINUX)
/* Jump target used to recover from a SIGILL raised while probing for vector
 * instructions, and a flag telling the handler whether that jump target is
 * currently armed.
 */
static jmp_buf  __lv_powerpc_jmpbuf;
static volatile sig_atomic_t __lv_powerpc_canjump = 0;

/**
 * SIGILL handler installed while probing for AltiVec/VSX instructions.
 *
 * If the probe has armed the jump buffer (__lv_powerpc_canjump != 0), the
 * flag is cleared and control jumps back to the setjmp() point with value 1.
 *
 * Otherwise the signal did not come from the probe: the default disposition
 * is reinstated and the signal is re-raised.
 *
 * @param sig  the signal number being handled.
 */
static void
sigill_handler(int sig)
{
   if (!__lv_powerpc_canjump) {
      signal (sig, SIG_DFL);
      raise (sig);
      /* raise() may return here when the signal is blocked for the duration
       * of this handler.  Never fall through to longjmp() with a jump buffer
       * that is not armed; returning lets the pending signal be delivered
       * with the default disposition instead.
       */
      return;
   }

   __lv_powerpc_canjump = 0;
   longjmp(__lv_powerpc_jmpbuf, 1);
}
11801e04c3fSmrg#endif
11901e04c3fSmrg
12001e04c3fSmrg#if defined(PIPE_ARCH_PPC)
12101e04c3fSmrgstatic void
12201e04c3fSmrgcheck_os_altivec_support(void)
12301e04c3fSmrg{
1247ec681f3Smrg#if defined(__ALTIVEC__)
1257ec681f3Smrg   util_cpu_caps.has_altivec = 1;
1267ec681f3Smrg#endif
1277ec681f3Smrg#if defined(__VSX__)
1287ec681f3Smrg   util_cpu_caps.has_vsx = 1;
1297ec681f3Smrg#endif
1307ec681f3Smrg#if defined(__ALTIVEC__) && defined(__VSX__)
1317ec681f3Smrg/* Do nothing */
1327ec681f3Smrg#elif defined(PIPE_OS_APPLE) || defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
1337ec681f3Smrg#ifdef HW_VECTORUNIT
13401e04c3fSmrg   int sels[2] = {CTL_HW, HW_VECTORUNIT};
1357ec681f3Smrg#else
1367ec681f3Smrg   int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC};
1377ec681f3Smrg#endif
13801e04c3fSmrg   int has_vu = 0;
139c87a3a8aSchristos   size_t len = sizeof (has_vu);
14001e04c3fSmrg   int err;
14101e04c3fSmrg
14201e04c3fSmrg   err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
14301e04c3fSmrg
14401e04c3fSmrg   if (err == 0) {
14501e04c3fSmrg      if (has_vu != 0) {
14601e04c3fSmrg         util_cpu_caps.has_altivec = 1;
14701e04c3fSmrg      }
14801e04c3fSmrg   }
1497ec681f3Smrg#elif defined(PIPE_OS_FREEBSD) /* !PIPE_OS_APPLE && !PIPE_OS_NETBSD && !PIPE_OS_OPENBSD */
1507ec681f3Smrg   unsigned long hwcap = 0;
1517ec681f3Smrg#ifdef HAVE_ELF_AUX_INFO
1527ec681f3Smrg   elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
1537ec681f3Smrg#else
1547ec681f3Smrg   size_t len = sizeof(hwcap);
1557ec681f3Smrg   sysctlbyname("hw.cpu_features", &hwcap, &len, NULL, 0);
1567ec681f3Smrg#endif
1577ec681f3Smrg   if (hwcap & PPC_FEATURE_HAS_ALTIVEC)
1587ec681f3Smrg      util_cpu_caps.has_altivec = 1;
1597ec681f3Smrg   if (hwcap & PPC_FEATURE_HAS_VSX)
1607ec681f3Smrg      util_cpu_caps.has_vsx = 1;
1617ec681f3Smrg#elif defined(PIPE_OS_LINUX) /* !PIPE_OS_FREEBSD */
1627ec681f3Smrg#if defined(PIPE_ARCH_PPC_64)
1637ec681f3Smrg    Elf64_auxv_t aux;
1647ec681f3Smrg#else
1657ec681f3Smrg    Elf32_auxv_t aux;
1667ec681f3Smrg#endif
1677ec681f3Smrg    int fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
1687ec681f3Smrg    if (fd >= 0) {
1697ec681f3Smrg       while (read(fd, &aux, sizeof(aux)) == sizeof(aux)) {
1707ec681f3Smrg          if (aux.a_type == AT_HWCAP) {
1717ec681f3Smrg             char *env_vsx = getenv("GALLIVM_VSX");
1727ec681f3Smrg             uint64_t hwcap = aux.a_un.a_val;
1737ec681f3Smrg             util_cpu_caps.has_altivec = (hwcap >> 28) & 1;
1747ec681f3Smrg             if (!env_vsx || env_vsx[0] != '0') {
1757ec681f3Smrg                util_cpu_caps.has_vsx  = (hwcap >>  7) & 1;
1767ec681f3Smrg             }
1777ec681f3Smrg             break;
1787ec681f3Smrg          }
1797ec681f3Smrg       }
1807ec681f3Smrg       close(fd);
1817ec681f3Smrg    }
1827ec681f3Smrg#else /* !PIPE_OS_APPLE && !PIPE_OS_BSD && !PIPE_OS_LINUX */
1837ec681f3Smrg   /* not on Apple/Darwin or Linux, do it the brute-force way */
18401e04c3fSmrg   /* this is borrowed from the libmpeg2 library */
18501e04c3fSmrg   signal(SIGILL, sigill_handler);
18601e04c3fSmrg   if (setjmp(__lv_powerpc_jmpbuf)) {
18701e04c3fSmrg      signal(SIGILL, SIG_DFL);
18801e04c3fSmrg   } else {
18901e04c3fSmrg      boolean enable_altivec = TRUE;    /* Default: enable  if available, and if not overridden */
19001e04c3fSmrg      boolean enable_vsx = TRUE;
19101e04c3fSmrg#ifdef DEBUG
19201e04c3fSmrg      /* Disabling Altivec code generation is not the same as disabling VSX code generation,
19301e04c3fSmrg       * which can be done simply by passing -mattr=-vsx to the LLVM compiler; cf.
19401e04c3fSmrg       * lp_build_create_jit_compiler_for_module().
19501e04c3fSmrg       * If you want to disable Altivec code generation, the best place to do it is here.
19601e04c3fSmrg       */
19701e04c3fSmrg      char *env_control = getenv("GALLIVM_ALTIVEC");    /* 1=enable (default); 0=disable */
19801e04c3fSmrg      if (env_control && env_control[0] == '0') {
19901e04c3fSmrg         enable_altivec = FALSE;
20001e04c3fSmrg      }
20101e04c3fSmrg#endif
20201e04c3fSmrg      /* VSX instructions can be explicitly enabled/disabled via GALLIVM_VSX=1 or 0 */
20301e04c3fSmrg      char *env_vsx = getenv("GALLIVM_VSX");
20401e04c3fSmrg      if (env_vsx && env_vsx[0] == '0') {
20501e04c3fSmrg         enable_vsx = FALSE;
20601e04c3fSmrg      }
20701e04c3fSmrg      if (enable_altivec) {
20801e04c3fSmrg         __lv_powerpc_canjump = 1;
20901e04c3fSmrg
21001e04c3fSmrg         __asm __volatile
21101e04c3fSmrg            ("mtspr 256, %0\n\t"
21201e04c3fSmrg             "vand %%v0, %%v0, %%v0"
21301e04c3fSmrg             :
21401e04c3fSmrg             : "r" (-1));
21501e04c3fSmrg
21601e04c3fSmrg         util_cpu_caps.has_altivec = 1;
21701e04c3fSmrg
21801e04c3fSmrg         if (enable_vsx) {
21901e04c3fSmrg            __asm __volatile("xxland %vs0, %vs0, %vs0");
22001e04c3fSmrg            util_cpu_caps.has_vsx = 1;
22101e04c3fSmrg         }
22201e04c3fSmrg         signal(SIGILL, SIG_DFL);
22301e04c3fSmrg      } else {
22401e04c3fSmrg         util_cpu_caps.has_altivec = 0;
22501e04c3fSmrg      }
22601e04c3fSmrg   }
2277ec681f3Smrg#endif /* !PIPE_OS_APPLE && !PIPE_OS_LINUX */
22801e04c3fSmrg}
22901e04c3fSmrg#endif /* PIPE_ARCH_PPC */
23001e04c3fSmrg
23101e04c3fSmrg
23201e04c3fSmrg#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
/**
 * Report whether the CPUID instruction can be used.
 *
 * On 32-bit x86 with a GCC-compatible compiler this toggles the ID flag
 * (bit 21, 0x200000) in EFLAGS and checks whether the change sticks; the
 * flag is only writable on processors that implement CPUID.  Other 32-bit
 * x86 compilers assume CPUID is present.  x86-64 always has CPUID.
 *
 * @return non-zero if CPUID is available, 0 otherwise.
 */
static int has_cpuid(void)
{
#if defined(PIPE_ARCH_X86)
   /* Compiler (not OS) check: PIPE_CC_GCC is the macro the rest of this file
    * uses to select GCC-style inline assembly.
    */
#if defined(PIPE_CC_GCC)
   int a, c;

   __asm __volatile
      ("pushf\n"
       "popl %0\n"
       "movl %0, %1\n"
       "xorl $0x200000, %0\n"
       "push %0\n"
       "popf\n"
       "pushf\n"
       "popl %0\n"
       : "=a" (a), "=c" (c)
       :
       : "cc");

   /* a holds EFLAGS after the attempted toggle, c the original value. */
   return a != c;
#else
   /* FIXME */
   return 1;
#endif
#elif defined(PIPE_ARCH_X86_64)
   return 1;
#else
   return 0;
#endif
}
26301e04c3fSmrg
26401e04c3fSmrg
/**
 * Execute CPUID for leaf @p ax and store EAX, EBX, ECX, EDX in p[0..3].
 *
 * On builds with neither GCC-style inline assembly nor MSVC intrinsics, all
 * four outputs are set to zero.
 *
 * @sa cpuid.h included in gcc-4.3 onwards.
 * @sa http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
 */
static inline void
cpuid(uint32_t ax, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   /* EBX is swapped through ESI around the instruction instead of being
    * listed as an output.
    */
   __asm __volatile (
     "xchgl %%ebx, %1\n\t"
     "cpuid\n\t"
     "xchgl %%ebx, %1"
     : "=a" (p[0]), "=S" (p[1]), "=c" (p[2]), "=d" (p[3])
     : "0" (ax)
   );
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
   __asm __volatile (
     "cpuid\n\t"
     : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
     : "0" (ax)
   );
#elif defined(PIPE_CC_MSVC)
   __cpuid(p, ax);
#else
   /* No way to issue CPUID: report an all-zero result. */
   for (unsigned reg = 0; reg < 4; reg++)
      p[reg] = 0;
#endif
}
30101e04c3fSmrg
/**
 * Execute CPUID for leaf @p ax and sub-leaf @p cx and store EAX, EBX, ECX,
 * EDX in p[0..3].
 *
 * On builds with neither GCC-style inline assembly nor MSVC intrinsics, all
 * four outputs are set to zero.
 *
 * @sa cpuid.h included in gcc-4.4 onwards.
 * @sa http://msdn.microsoft.com/en-us/library/hskdteyh%28v=vs.90%29.aspx
 */
static inline void
cpuid_count(uint32_t ax, uint32_t cx, uint32_t *p)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   /* EBX is swapped through ESI around the instruction instead of being
    * listed as an output.
    */
   __asm __volatile (
     "xchgl %%ebx, %1\n\t"
     "cpuid\n\t"
     "xchgl %%ebx, %1"
     : "=a" (p[0]), "=S" (p[1]), "=c" (p[2]), "=d" (p[3])
     : "0" (ax), "2" (cx)
   );
#elif defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)
   __asm __volatile (
     "cpuid\n\t"
     : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
     : "0" (ax), "2" (cx)
   );
#elif defined(PIPE_CC_MSVC)
   __cpuidex(p, ax, cx);
#else
   /* No way to issue CPUID: report an all-zero result. */
   for (unsigned reg = 0; reg < 4; reg++)
      p[reg] = 0;
#endif
}
33801e04c3fSmrg
33901e04c3fSmrg
/**
 * Read extended control register 0 with the XGETBV instruction.
 *
 * @return the 64-bit register value (EDX:EAX), or 0 when the build has no
 *         way to issue the instruction.
 */
static inline uint64_t xgetbv(void)
{
#if defined(PIPE_CC_GCC)
   uint32_t lo, hi;

   /* ECX = 0 selects the register to read. */
   __asm __volatile (
     ".byte 0x0f, 0x01, 0xd0" // xgetbv isn't supported on gcc < 4.4
     : "=a"(lo), "=d"(hi)
     : "c"(0)
   );

   const uint64_t upper = (uint64_t)hi << 32;
   return upper | lo;
#elif defined(PIPE_CC_MSVC) && defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
   return _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
#else
   return 0;
#endif
}
35901e04c3fSmrg
36001e04c3fSmrg
36101e04c3fSmrg#if defined(PIPE_ARCH_X86)
36201e04c3fSmrgPIPE_ALIGN_STACK static inline boolean sse2_has_daz(void)
36301e04c3fSmrg{
36401e04c3fSmrg   struct {
36501e04c3fSmrg      uint32_t pad1[7];
36601e04c3fSmrg      uint32_t mxcsr_mask;
36701e04c3fSmrg      uint32_t pad2[128-8];
36801e04c3fSmrg   } PIPE_ALIGN_VAR(16) fxarea;
36901e04c3fSmrg
37001e04c3fSmrg   fxarea.mxcsr_mask = 0;
37101e04c3fSmrg#if defined(PIPE_CC_GCC)
37201e04c3fSmrg   __asm __volatile ("fxsave %0" : "+m" (fxarea));
37301e04c3fSmrg#elif defined(PIPE_CC_MSVC) || defined(PIPE_CC_ICL)
37401e04c3fSmrg   _fxsave(&fxarea);
37501e04c3fSmrg#else
37601e04c3fSmrg   fxarea.mxcsr_mask = 0;
37701e04c3fSmrg#endif
37801e04c3fSmrg   return !!(fxarea.mxcsr_mask & (1 << 6));
37901e04c3fSmrg}
38001e04c3fSmrg#endif
38101e04c3fSmrg
38201e04c3fSmrg#endif /* X86 or X86_64 */
38301e04c3fSmrg
38401e04c3fSmrg#if defined(PIPE_ARCH_ARM)
/**
 * Detect NEON support on 32-bit ARM and record it in util_cpu_caps.has_neon.
 *
 * Sources, in order of preference: compile-time __ARM_NEON / __ARM_NEON__,
 * FreeBSD elf_aux_info(), the Android cpufeatures library, and finally the
 * AT_HWCAP entry of /proc/self/auxv on Linux.
 */
static void
check_os_arm_support(void)
{
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
   util_cpu_caps.has_neon = 1;
#elif defined(PIPE_OS_FREEBSD) && defined(HAVE_ELF_AUX_INFO)
   /* Stays 0 (no features) if the query fails. */
   unsigned long aux_hwcap = 0;

   elf_aux_info(AT_HWCAP, &aux_hwcap, sizeof(aux_hwcap));
   if (aux_hwcap & HWCAP_NEON)
      util_cpu_caps.has_neon = 1;
#elif defined(HAS_ANDROID_CPUFEATURES)
   /*
    * On Android, the cpufeatures library is the preferred way of checking
    * CPU capabilities. However, it is not available for standalone Mesa
    * builds, i.e. when the Android build system (Android.mk-based) is not
    * used. Because of this PIPE_OS_ANDROID cannot be used here; a separate
    * macro is enabled only from the respective Android.mk.
    */
   AndroidCpuFamily family = android_getCpuFamily();
   uint64_t features = android_getCpuFeatures();

   if (family == ANDROID_CPU_FAMILY_ARM &&
       (features & ANDROID_CPU_ARM_FEATURE_NEON))
      util_cpu_caps.has_neon = 1;
#elif defined(PIPE_OS_LINUX)
    const int auxv_fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);

    if (auxv_fd >= 0) {
       Elf32_auxv_t entry;

       /* Scan the auxiliary vector until the AT_HWCAP entry is found or the
        * file runs out of complete entries.
        */
       for (;;) {
          if (read(auxv_fd, &entry, sizeof(entry)) != sizeof(entry))
             break;
          if (entry.a_type != AT_HWCAP)
             continue;

          /* Bit 12 of the hardware capability word reports NEON. */
          const uint32_t hwcap = entry.a_un.a_val;
          util_cpu_caps.has_neon = (hwcap >> 12) & 1;
          break;
       }
       close(auxv_fd);
    }
#endif /* PIPE_OS_LINUX */
}
42801e04c3fSmrg
4298a1362adSmaya#elif defined(PIPE_ARCH_AARCH64)
43001e04c3fSmrgstatic void
4318a1362adSmayacheck_os_arm_support(void)
43201e04c3fSmrg{
4338a1362adSmaya    util_cpu_caps.has_neon = true;
4348a1362adSmaya}
4358a1362adSmaya#endif /* PIPE_ARCH_ARM || PIPE_ARCH_AARCH64 */
43601e04c3fSmrg
4377ec681f3Smrg#if defined(PIPE_ARCH_MIPS64)
43854c3abb5Smartin#ifdef __NetBSD__
43954c3abb5Smartinstatic void
440cbafdbbfSmartincheck_os_mips64_support(void)
44154c3abb5Smartin{
44254c3abb5Smartin    util_cpu_caps.has_msa = false;	/* XXX seems there is no way to detect MSA support from userland */
44354c3abb5Smartin}
44454c3abb5Smartin#else
4457ec681f3Smrgstatic void
4467ec681f3Smrgcheck_os_mips64_support(void)
4477ec681f3Smrg{
4487ec681f3Smrg    Elf64_auxv_t aux;
4497ec681f3Smrg    int fd;
4507ec681f3Smrg
4517ec681f3Smrg    fd = open("/proc/self/auxv", O_RDONLY | O_CLOEXEC);
4527ec681f3Smrg    if (fd >= 0) {
4537ec681f3Smrg       while (read(fd, &aux, sizeof(Elf64_auxv_t)) == sizeof(Elf64_auxv_t)) {
4547ec681f3Smrg          if (aux.a_type == AT_HWCAP) {
4557ec681f3Smrg             uint64_t hwcap = aux.a_un.a_val;
4567ec681f3Smrg
4577ec681f3Smrg             util_cpu_caps.has_msa = (hwcap >> 1) & 1;
4587ec681f3Smrg             break;
4597ec681f3Smrg          }
4607ec681f3Smrg       }
4617ec681f3Smrg       close (fd);
4627ec681f3Smrg    }
4637ec681f3Smrg}
46454c3abb5Smartin#endif
4657ec681f3Smrg#endif /* PIPE_ARCH_MIPS64 */
4667ec681f3Smrg
4677ec681f3Smrg
4688a1362adSmayastatic void
4698a1362adSmayaget_cpu_topology(void)
4708a1362adSmaya{
4717ec681f3Smrg   /* Default. This is OK if L3 is not present or there is only one. */
4727ec681f3Smrg   util_cpu_caps.num_L3_caches = 1;
4737ec681f3Smrg
4747ec681f3Smrg   memset(util_cpu_caps.cpu_to_L3, 0xff, sizeof(util_cpu_caps.cpu_to_L3));
47501e04c3fSmrg
47601e04c3fSmrg#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
47701e04c3fSmrg   /* AMD Zen */
4787ec681f3Smrg   if (util_cpu_caps.family >= CPU_AMD_ZEN1_ZEN2 &&
4797ec681f3Smrg       util_cpu_caps.family < CPU_AMD_LAST) {
4808a1362adSmaya      uint32_t regs[4];
4818a1362adSmaya
4827ec681f3Smrg      uint32_t saved_mask[UTIL_MAX_CPUS / 32] = {0};
4837ec681f3Smrg      uint32_t mask[UTIL_MAX_CPUS / 32] = {0};
4847ec681f3Smrg      bool saved = false;
4857ec681f3Smrg
4867ec681f3Smrg      uint32_t L3_found[UTIL_MAX_CPUS] = {0};
4877ec681f3Smrg      uint32_t num_L3_caches = 0;
4887ec681f3Smrg      util_affinity_mask *L3_affinity_masks = NULL;
4897ec681f3Smrg
4907ec681f3Smrg      /* Query APIC IDs from each CPU core.
4917ec681f3Smrg       *
4927ec681f3Smrg       * An APIC ID is a logical ID of the CPU with respect to the cache
4937ec681f3Smrg       * hierarchy, meaning that consecutive APIC IDs are neighbours in
4947ec681f3Smrg       * the hierarchy, e.g. sharing the same cache.
4957ec681f3Smrg       *
4967ec681f3Smrg       * For example, CPU 0 can have APIC ID 0 and CPU 12 can have APIC ID 1,
4977ec681f3Smrg       * which means that both CPU 0 and 12 are next to each other.
4987ec681f3Smrg       * (e.g. they are 2 threads belonging to 1 SMT2 core)
4997ec681f3Smrg       *
5007ec681f3Smrg       * We need to find out which CPUs share the same L3 cache and they can
5017ec681f3Smrg       * be all over the place.
5027ec681f3Smrg       *
5037ec681f3Smrg       * Querying the APIC ID can only be done by pinning the current thread
5047ec681f3Smrg       * to each core. The original affinity mask is saved.
5057ec681f3Smrg       *
5067ec681f3Smrg       * Loop over all possible CPUs even though some may be offline.
5077ec681f3Smrg       */
5087ec681f3Smrg      for (int16_t i = 0; i < util_cpu_caps.max_cpus && i < UTIL_MAX_CPUS; i++) {
5097ec681f3Smrg         uint32_t cpu_bit = 1u << (i % 32);
5107ec681f3Smrg
5117ec681f3Smrg         mask[i / 32] = cpu_bit;
5127ec681f3Smrg
5137ec681f3Smrg         /* The assumption is that trying to bind the thread to a CPU that is
5147ec681f3Smrg          * offline will fail.
5157ec681f3Smrg          */
5167ec681f3Smrg         if (util_set_current_thread_affinity(mask,
5177ec681f3Smrg                                              !saved ? saved_mask : NULL,
5187ec681f3Smrg                                              util_cpu_caps.num_cpu_mask_bits)) {
5197ec681f3Smrg            saved = true;
5207ec681f3Smrg
5217ec681f3Smrg            /* Query the APIC ID of the current core. */
5227ec681f3Smrg            cpuid(0x00000001, regs);
5237ec681f3Smrg            unsigned apic_id = regs[1] >> 24;
5247ec681f3Smrg
5257ec681f3Smrg            /* Query the total core count for the CPU */
5267ec681f3Smrg            uint32_t core_count = 1;
5277ec681f3Smrg            if (regs[3] & (1 << 28))
5287ec681f3Smrg               core_count = (regs[1] >> 16) & 0xff;
5297ec681f3Smrg
5307ec681f3Smrg            core_count = util_next_power_of_two(core_count);
5317ec681f3Smrg
5327ec681f3Smrg            /* Query the L3 cache count. */
5337ec681f3Smrg            cpuid_count(0x8000001D, 3, regs);
5347ec681f3Smrg            unsigned cache_level = (regs[0] >> 5) & 0x7;
5357ec681f3Smrg            unsigned cores_per_L3 = ((regs[0] >> 14) & 0xfff) + 1;
5367ec681f3Smrg
5377ec681f3Smrg            if (cache_level != 3)
5387ec681f3Smrg               continue;
5397ec681f3Smrg
5407ec681f3Smrg            unsigned local_core_id = apic_id & (core_count - 1);
5417ec681f3Smrg            unsigned phys_id = (apic_id & ~(core_count - 1)) >> util_logbase2(core_count);
5427ec681f3Smrg            unsigned local_l3_cache_index = local_core_id / util_next_power_of_two(cores_per_L3);
5437ec681f3Smrg#define L3_ID(p, i) (p << 16 | i << 1 | 1);
5447ec681f3Smrg
5457ec681f3Smrg            unsigned l3_id = L3_ID(phys_id, local_l3_cache_index);
5467ec681f3Smrg            int idx = -1;
5477ec681f3Smrg            for (unsigned c = 0; c < num_L3_caches; c++) {
5487ec681f3Smrg               if (L3_found[c] == l3_id) {
5497ec681f3Smrg                  idx = c;
5507ec681f3Smrg                  break;
5517ec681f3Smrg               }
5527ec681f3Smrg            }
5537ec681f3Smrg            if (idx == -1) {
5547ec681f3Smrg               idx = num_L3_caches;
5557ec681f3Smrg               L3_found[num_L3_caches++] = l3_id;
5567ec681f3Smrg               L3_affinity_masks = realloc(L3_affinity_masks, sizeof(util_affinity_mask) * num_L3_caches);
5577ec681f3Smrg               if (!L3_affinity_masks)
5587ec681f3Smrg                  return;
5597ec681f3Smrg               memset(&L3_affinity_masks[num_L3_caches - 1], 0, sizeof(util_affinity_mask));
5607ec681f3Smrg            }
5617ec681f3Smrg            util_cpu_caps.cpu_to_L3[i] = idx;
5627ec681f3Smrg            L3_affinity_masks[idx][i / 32] |= cpu_bit;
5637ec681f3Smrg
5647ec681f3Smrg         }
5657ec681f3Smrg         mask[i / 32] = 0;
5667ec681f3Smrg      }
5677ec681f3Smrg
5687ec681f3Smrg      util_cpu_caps.num_L3_caches = num_L3_caches;
5697ec681f3Smrg      util_cpu_caps.L3_affinity_mask = L3_affinity_masks;
5707ec681f3Smrg
5717ec681f3Smrg      if (saved) {
5727ec681f3Smrg         if (debug_get_option_dump_cpu()) {
5737ec681f3Smrg            fprintf(stderr, "CPU <-> L3 cache mapping:\n");
5747ec681f3Smrg            for (unsigned i = 0; i < util_cpu_caps.num_L3_caches; i++) {
5757ec681f3Smrg               fprintf(stderr, "  - L3 %u mask = ", i);
5767ec681f3Smrg               for (int j = util_cpu_caps.max_cpus - 1; j >= 0; j -= 32)
5777ec681f3Smrg                  fprintf(stderr, "%08x ", util_cpu_caps.L3_affinity_mask[i][j / 32]);
5787ec681f3Smrg               fprintf(stderr, "\n");
5797ec681f3Smrg            }
5807ec681f3Smrg         }
58101e04c3fSmrg
5827ec681f3Smrg         /* Restore the original affinity mask. */
5837ec681f3Smrg         util_set_current_thread_affinity(saved_mask, NULL,
5847ec681f3Smrg                                          util_cpu_caps.num_cpu_mask_bits);
5857ec681f3Smrg      } else {
5867ec681f3Smrg         if (debug_get_option_dump_cpu())
5877ec681f3Smrg            fprintf(stderr, "Cannot set thread affinity for any thread.\n");
5887ec681f3Smrg      }
58901e04c3fSmrg   }
59001e04c3fSmrg#endif
59101e04c3fSmrg}
59201e04c3fSmrg
59301e04c3fSmrgstatic void
59401e04c3fSmrgutil_cpu_detect_once(void)
59501e04c3fSmrg{
5967ec681f3Smrg   int available_cpus = 0;
5977ec681f3Smrg   int total_cpus = 0;
5987ec681f3Smrg
59901e04c3fSmrg   memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
60001e04c3fSmrg
60101e04c3fSmrg   /* Count the number of CPUs in system */
60201e04c3fSmrg#if defined(PIPE_OS_WINDOWS)
60301e04c3fSmrg   {
60401e04c3fSmrg      SYSTEM_INFO system_info;
60501e04c3fSmrg      GetSystemInfo(&system_info);
6067ec681f3Smrg      available_cpus = MAX2(1, system_info.dwNumberOfProcessors);
60701e04c3fSmrg   }
6087ec681f3Smrg#elif defined(PIPE_OS_UNIX)
6097ec681f3Smrg#  if defined(HAS_SCHED_GETAFFINITY)
61001e04c3fSmrg   {
6117ec681f3Smrg      /* sched_setaffinity() can be used to further restrict the number of
6127ec681f3Smrg       * CPUs on which the process can run.  Use sched_getaffinity() to
6137ec681f3Smrg       * determine the true number of available CPUs.
6147ec681f3Smrg       *
6157ec681f3Smrg       * FIXME: The Linux manual page for sched_getaffinity describes how this
6167ec681f3Smrg       * simple implementation will fail with > 1024 CPUs, and we'll fall back
6177ec681f3Smrg       * to the _SC_NPROCESSORS_ONLN path.  Support for > 1024 CPUs can be
6187ec681f3Smrg       * added to this path once someone has such a system for testing.
6197ec681f3Smrg       */
6207ec681f3Smrg      cpu_set_t affin;
6217ec681f3Smrg      if (sched_getaffinity(getpid(), sizeof(affin), &affin) == 0)
6227ec681f3Smrg         available_cpus = CPU_COUNT(&affin);
6237ec681f3Smrg   }
6247ec681f3Smrg#  endif
62501e04c3fSmrg
6267ec681f3Smrg   /* Linux, FreeBSD, DragonFly, and Mac OS X should have
    * _SC_NPROCESSORS_ONLN.  NetBSD and OpenBSD should have HW_NCPUONLINE.
6287ec681f3Smrg    * This is what FFmpeg uses on those platforms.
6297ec681f3Smrg    */
6307ec681f3Smrg#  if defined(PIPE_OS_BSD) && defined(HW_NCPUONLINE)
6317ec681f3Smrg   if (available_cpus == 0) {
6327ec681f3Smrg      const int mib[] = { CTL_HW, HW_NCPUONLINE };
6337ec681f3Smrg      int ncpu;
63422fc2aa6Srjs      size_t len = sizeof(ncpu);
63501e04c3fSmrg
63601e04c3fSmrg      sysctl(mib, 2, &ncpu, &len, NULL, 0);
6377ec681f3Smrg      available_cpus = ncpu;
63801e04c3fSmrg   }
6397ec681f3Smrg#  elif defined(_SC_NPROCESSORS_ONLN)
6407ec681f3Smrg   if (available_cpus == 0) {
6417ec681f3Smrg      available_cpus = sysconf(_SC_NPROCESSORS_ONLN);
6427ec681f3Smrg      if (available_cpus == ~0)
6437ec681f3Smrg         available_cpus = 1;
6447ec681f3Smrg   }
6457ec681f3Smrg#  elif defined(PIPE_OS_BSD)
6467ec681f3Smrg   if (available_cpus == 0) {
6477ec681f3Smrg      const int mib[] = { CTL_HW, HW_NCPU };
6487ec681f3Smrg      int ncpu;
6497ec681f3Smrg      int len = sizeof(ncpu);
6507ec681f3Smrg
6517ec681f3Smrg      sysctl(mib, 2, &ncpu, &len, NULL, 0);
6527ec681f3Smrg      available_cpus = ncpu;
6537ec681f3Smrg   }
6547ec681f3Smrg#  endif /* defined(PIPE_OS_BSD) */
6557ec681f3Smrg
6567ec681f3Smrg   /* Determine the maximum number of CPUs configured in the system.  This is
6577ec681f3Smrg    * used to properly set num_cpu_mask_bits below.  On BSDs that don't have
6587ec681f3Smrg    * HW_NCPUONLINE, it was not clear whether HW_NCPU is the number of
6597ec681f3Smrg    * configured or the number of online CPUs.  For that reason, prefer the
6607ec681f3Smrg    * _SC_NPROCESSORS_CONF path on all BSDs.
6617ec681f3Smrg    */
6627ec681f3Smrg#  if defined(_SC_NPROCESSORS_CONF)
6637ec681f3Smrg   total_cpus = sysconf(_SC_NPROCESSORS_CONF);
6647ec681f3Smrg   if (total_cpus == ~0)
6657ec681f3Smrg      total_cpus = 1;
6667ec681f3Smrg#  elif defined(PIPE_OS_BSD)
6677ec681f3Smrg   {
6687ec681f3Smrg      const int mib[] = { CTL_HW, HW_NCPU };
6697ec681f3Smrg      int ncpu;
6707ec681f3Smrg      int len = sizeof(ncpu);
6717ec681f3Smrg
6727ec681f3Smrg      sysctl(mib, 2, &ncpu, &len, NULL, 0);
6737ec681f3Smrg      total_cpus = ncpu;
6747ec681f3Smrg   }
6757ec681f3Smrg#  endif /* defined(PIPE_OS_BSD) */
6767ec681f3Smrg#endif /* defined(PIPE_OS_UNIX) */
6777ec681f3Smrg
6787ec681f3Smrg   util_cpu_caps.nr_cpus = MAX2(1, available_cpus);
6797ec681f3Smrg   total_cpus = MAX2(total_cpus, util_cpu_caps.nr_cpus);
6807ec681f3Smrg
6817ec681f3Smrg   util_cpu_caps.max_cpus = total_cpus;
6827ec681f3Smrg   util_cpu_caps.num_cpu_mask_bits = align(total_cpus, 32);
68301e04c3fSmrg
68401e04c3fSmrg   /* Make the fallback cacheline size nonzero so that it can be
68501e04c3fSmrg    * safely passed to align().
68601e04c3fSmrg    */
68701e04c3fSmrg   util_cpu_caps.cacheline = sizeof(void *);
68801e04c3fSmrg
68901e04c3fSmrg#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
69001e04c3fSmrg   if (has_cpuid()) {
69101e04c3fSmrg      uint32_t regs[4];
69201e04c3fSmrg      uint32_t regs2[4];
69301e04c3fSmrg
69401e04c3fSmrg      util_cpu_caps.cacheline = 32;
69501e04c3fSmrg
69601e04c3fSmrg      /* Get max cpuid level */
69701e04c3fSmrg      cpuid(0x00000000, regs);
69801e04c3fSmrg
69901e04c3fSmrg      if (regs[0] >= 0x00000001) {
70001e04c3fSmrg         unsigned int cacheline;
70101e04c3fSmrg
70201e04c3fSmrg         cpuid (0x00000001, regs2);
70301e04c3fSmrg
70401e04c3fSmrg         util_cpu_caps.x86_cpu_type = (regs2[0] >> 8) & 0xf;
70501e04c3fSmrg         /* Add "extended family". */
70601e04c3fSmrg         if (util_cpu_caps.x86_cpu_type == 0xf)
70701e04c3fSmrg             util_cpu_caps.x86_cpu_type += ((regs2[0] >> 20) & 0xff);
70801e04c3fSmrg
7097ec681f3Smrg         switch (util_cpu_caps.x86_cpu_type) {
7107ec681f3Smrg         case 0x17:
7117ec681f3Smrg            util_cpu_caps.family = CPU_AMD_ZEN1_ZEN2;
7127ec681f3Smrg            break;
7137ec681f3Smrg         case 0x18:
7147ec681f3Smrg            util_cpu_caps.family = CPU_AMD_ZEN_HYGON;
7157ec681f3Smrg            break;
7167ec681f3Smrg         case 0x19:
7177ec681f3Smrg            util_cpu_caps.family = CPU_AMD_ZEN3;
7187ec681f3Smrg            break;
7197ec681f3Smrg         default:
7207ec681f3Smrg            if (util_cpu_caps.x86_cpu_type > 0x19)
7217ec681f3Smrg               util_cpu_caps.family = CPU_AMD_ZEN_NEXT;
7227ec681f3Smrg         }
7237ec681f3Smrg
72401e04c3fSmrg         /* general feature flags */
72501e04c3fSmrg         util_cpu_caps.has_tsc    = (regs2[3] >>  4) & 1; /* 0x0000010 */
72601e04c3fSmrg         util_cpu_caps.has_mmx    = (regs2[3] >> 23) & 1; /* 0x0800000 */
72701e04c3fSmrg         util_cpu_caps.has_sse    = (regs2[3] >> 25) & 1; /* 0x2000000 */
72801e04c3fSmrg         util_cpu_caps.has_sse2   = (regs2[3] >> 26) & 1; /* 0x4000000 */
72901e04c3fSmrg         util_cpu_caps.has_sse3   = (regs2[2] >>  0) & 1; /* 0x0000001 */
73001e04c3fSmrg         util_cpu_caps.has_ssse3  = (regs2[2] >>  9) & 1; /* 0x0000020 */
73101e04c3fSmrg         util_cpu_caps.has_sse4_1 = (regs2[2] >> 19) & 1;
73201e04c3fSmrg         util_cpu_caps.has_sse4_2 = (regs2[2] >> 20) & 1;
73301e04c3fSmrg         util_cpu_caps.has_popcnt = (regs2[2] >> 23) & 1;
73401e04c3fSmrg         util_cpu_caps.has_avx    = ((regs2[2] >> 28) & 1) && // AVX
73501e04c3fSmrg                                    ((regs2[2] >> 27) & 1) && // OSXSAVE
73601e04c3fSmrg                                    ((xgetbv() & 6) == 6);    // XMM & YMM
73701e04c3fSmrg         util_cpu_caps.has_f16c   = ((regs2[2] >> 29) & 1) && util_cpu_caps.has_avx;
73801e04c3fSmrg         util_cpu_caps.has_fma    = ((regs2[2] >> 12) & 1) && util_cpu_caps.has_avx;
73901e04c3fSmrg         util_cpu_caps.has_mmx2   = util_cpu_caps.has_sse; /* SSE cpus supports mmxext too */
74001e04c3fSmrg#if defined(PIPE_ARCH_X86_64)
74101e04c3fSmrg         util_cpu_caps.has_daz = 1;
74201e04c3fSmrg#else
74301e04c3fSmrg         util_cpu_caps.has_daz = util_cpu_caps.has_sse3 ||
74401e04c3fSmrg            (util_cpu_caps.has_sse2 && sse2_has_daz());
74501e04c3fSmrg#endif
74601e04c3fSmrg
74701e04c3fSmrg         cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
74801e04c3fSmrg         if (cacheline > 0)
74901e04c3fSmrg            util_cpu_caps.cacheline = cacheline;
75001e04c3fSmrg      }
75101e04c3fSmrg      if (util_cpu_caps.has_avx && regs[0] >= 0x00000007) {
75201e04c3fSmrg         uint32_t regs7[4];
75301e04c3fSmrg         cpuid_count(0x00000007, 0x00000000, regs7);
75401e04c3fSmrg         util_cpu_caps.has_avx2 = (regs7[1] >> 5) & 1;
75501e04c3fSmrg      }
75601e04c3fSmrg
75701e04c3fSmrg      // check for avx512
75801e04c3fSmrg      if (((regs2[2] >> 27) & 1) && // OSXSAVE
75901e04c3fSmrg          (xgetbv() & (0x7 << 5)) && // OPMASK: upper-256 enabled by OS
76001e04c3fSmrg          ((xgetbv() & 6) == 6)) { // XMM/YMM enabled by OS
76101e04c3fSmrg         uint32_t regs3[4];
76201e04c3fSmrg         cpuid_count(0x00000007, 0x00000000, regs3);
76301e04c3fSmrg         util_cpu_caps.has_avx512f    = (regs3[1] >> 16) & 1;
76401e04c3fSmrg         util_cpu_caps.has_avx512dq   = (regs3[1] >> 17) & 1;
76501e04c3fSmrg         util_cpu_caps.has_avx512ifma = (regs3[1] >> 21) & 1;
76601e04c3fSmrg         util_cpu_caps.has_avx512pf   = (regs3[1] >> 26) & 1;
76701e04c3fSmrg         util_cpu_caps.has_avx512er   = (regs3[1] >> 27) & 1;
76801e04c3fSmrg         util_cpu_caps.has_avx512cd   = (regs3[1] >> 28) & 1;
76901e04c3fSmrg         util_cpu_caps.has_avx512bw   = (regs3[1] >> 30) & 1;
77001e04c3fSmrg         util_cpu_caps.has_avx512vl   = (regs3[1] >> 31) & 1;
77101e04c3fSmrg         util_cpu_caps.has_avx512vbmi = (regs3[2] >>  1) & 1;
77201e04c3fSmrg      }
77301e04c3fSmrg
77401e04c3fSmrg      if (regs[1] == 0x756e6547 && regs[2] == 0x6c65746e && regs[3] == 0x49656e69) {
77501e04c3fSmrg         /* GenuineIntel */
77601e04c3fSmrg         util_cpu_caps.has_intel = 1;
77701e04c3fSmrg      }
77801e04c3fSmrg
77901e04c3fSmrg      cpuid(0x80000000, regs);
78001e04c3fSmrg
78101e04c3fSmrg      if (regs[0] >= 0x80000001) {
78201e04c3fSmrg
78301e04c3fSmrg         cpuid(0x80000001, regs2);
78401e04c3fSmrg
78501e04c3fSmrg         util_cpu_caps.has_mmx  |= (regs2[3] >> 23) & 1;
78601e04c3fSmrg         util_cpu_caps.has_mmx2 |= (regs2[3] >> 22) & 1;
78701e04c3fSmrg         util_cpu_caps.has_3dnow = (regs2[3] >> 31) & 1;
78801e04c3fSmrg         util_cpu_caps.has_3dnow_ext = (regs2[3] >> 30) & 1;
78901e04c3fSmrg
79001e04c3fSmrg         util_cpu_caps.has_xop = util_cpu_caps.has_avx &&
79101e04c3fSmrg                                 ((regs2[2] >> 11) & 1);
79201e04c3fSmrg      }
79301e04c3fSmrg
79401e04c3fSmrg      if (regs[0] >= 0x80000006) {
79501e04c3fSmrg         /* should we really do this if the clflush size above worked? */
79601e04c3fSmrg         unsigned int cacheline;
79701e04c3fSmrg         cpuid(0x80000006, regs2);
79801e04c3fSmrg         cacheline = regs2[2] & 0xFF;
79901e04c3fSmrg         if (cacheline > 0)
80001e04c3fSmrg            util_cpu_caps.cacheline = cacheline;
80101e04c3fSmrg      }
80201e04c3fSmrg
80301e04c3fSmrg      if (!util_cpu_caps.has_sse) {
80401e04c3fSmrg         util_cpu_caps.has_sse2 = 0;
80501e04c3fSmrg         util_cpu_caps.has_sse3 = 0;
80601e04c3fSmrg         util_cpu_caps.has_ssse3 = 0;
80701e04c3fSmrg         util_cpu_caps.has_sse4_1 = 0;
80801e04c3fSmrg      }
80901e04c3fSmrg   }
81001e04c3fSmrg#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
81101e04c3fSmrg
8128a1362adSmaya#if defined(PIPE_ARCH_ARM) || defined(PIPE_ARCH_AARCH64)
81301e04c3fSmrg   check_os_arm_support();
81401e04c3fSmrg#endif
81501e04c3fSmrg
81601e04c3fSmrg#if defined(PIPE_ARCH_PPC)
81701e04c3fSmrg   check_os_altivec_support();
81801e04c3fSmrg#endif /* PIPE_ARCH_PPC */
81901e04c3fSmrg
8207ec681f3Smrg#if defined(PIPE_ARCH_MIPS64)
8217ec681f3Smrg   check_os_mips64_support();
8227ec681f3Smrg#endif /* PIPE_ARCH_MIPS64 */
8237ec681f3Smrg
82401e04c3fSmrg   get_cpu_topology();
82501e04c3fSmrg
82601e04c3fSmrg   if (debug_get_option_dump_cpu()) {
8277ec681f3Smrg      printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
8287ec681f3Smrg
8297ec681f3Smrg      printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
8307ec681f3Smrg      printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
8317ec681f3Smrg
8327ec681f3Smrg      printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
8337ec681f3Smrg      printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
8347ec681f3Smrg      printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
8357ec681f3Smrg      printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
8367ec681f3Smrg      printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
8377ec681f3Smrg      printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
8387ec681f3Smrg      printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
8397ec681f3Smrg      printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
8407ec681f3Smrg      printf("util_cpu_caps.has_sse4_2 = %u\n", util_cpu_caps.has_sse4_2);
8417ec681f3Smrg      printf("util_cpu_caps.has_avx = %u\n", util_cpu_caps.has_avx);
8427ec681f3Smrg      printf("util_cpu_caps.has_avx2 = %u\n", util_cpu_caps.has_avx2);
8437ec681f3Smrg      printf("util_cpu_caps.has_f16c = %u\n", util_cpu_caps.has_f16c);
8447ec681f3Smrg      printf("util_cpu_caps.has_popcnt = %u\n", util_cpu_caps.has_popcnt);
8457ec681f3Smrg      printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
8467ec681f3Smrg      printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
8477ec681f3Smrg      printf("util_cpu_caps.has_xop = %u\n", util_cpu_caps.has_xop);
8487ec681f3Smrg      printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
8497ec681f3Smrg      printf("util_cpu_caps.has_vsx = %u\n", util_cpu_caps.has_vsx);
8507ec681f3Smrg      printf("util_cpu_caps.has_neon = %u\n", util_cpu_caps.has_neon);
8517ec681f3Smrg      printf("util_cpu_caps.has_msa = %u\n", util_cpu_caps.has_msa);
8527ec681f3Smrg      printf("util_cpu_caps.has_daz = %u\n", util_cpu_caps.has_daz);
8537ec681f3Smrg      printf("util_cpu_caps.has_avx512f = %u\n", util_cpu_caps.has_avx512f);
8547ec681f3Smrg      printf("util_cpu_caps.has_avx512dq = %u\n", util_cpu_caps.has_avx512dq);
8557ec681f3Smrg      printf("util_cpu_caps.has_avx512ifma = %u\n", util_cpu_caps.has_avx512ifma);
8567ec681f3Smrg      printf("util_cpu_caps.has_avx512pf = %u\n", util_cpu_caps.has_avx512pf);
8577ec681f3Smrg      printf("util_cpu_caps.has_avx512er = %u\n", util_cpu_caps.has_avx512er);
8587ec681f3Smrg      printf("util_cpu_caps.has_avx512cd = %u\n", util_cpu_caps.has_avx512cd);
8597ec681f3Smrg      printf("util_cpu_caps.has_avx512bw = %u\n", util_cpu_caps.has_avx512bw);
8607ec681f3Smrg      printf("util_cpu_caps.has_avx512vl = %u\n", util_cpu_caps.has_avx512vl);
8617ec681f3Smrg      printf("util_cpu_caps.has_avx512vbmi = %u\n", util_cpu_caps.has_avx512vbmi);
8627ec681f3Smrg      printf("util_cpu_caps.num_L3_caches = %u\n", util_cpu_caps.num_L3_caches);
8637ec681f3Smrg      printf("util_cpu_caps.num_cpu_mask_bits = %u\n", util_cpu_caps.num_cpu_mask_bits);
86401e04c3fSmrg   }
86501e04c3fSmrg}
86601e04c3fSmrg
86701e04c3fSmrgstatic once_flag cpu_once_flag = ONCE_FLAG_INIT;
86801e04c3fSmrg
86901e04c3fSmrgvoid
87001e04c3fSmrgutil_cpu_detect(void)
87101e04c3fSmrg{
87201e04c3fSmrg   call_once(&cpu_once_flag, util_cpu_detect_once);
87301e04c3fSmrg}
874