17ec681f3Smrg /* 27ec681f3Smrg * Copyright © 2013 Intel Corporation 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 217ec681f3Smrg * IN THE SOFTWARE. 227ec681f3Smrg * 237ec681f3Smrg */ 247ec681f3Smrg 257ec681f3Smrg#ifndef INTEL_DEVICE_INFO_H 267ec681f3Smrg#define INTEL_DEVICE_INFO_H 277ec681f3Smrg 287ec681f3Smrg#include <stdbool.h> 297ec681f3Smrg#include <stdint.h> 307ec681f3Smrg 317ec681f3Smrg#include "util/macros.h" 327ec681f3Smrg#include "compiler/shader_enums.h" 337ec681f3Smrg 347ec681f3Smrg#ifdef __cplusplus 357ec681f3Smrgextern "C" { 367ec681f3Smrg#endif 377ec681f3Smrg 387ec681f3Smrgstruct drm_i915_query_topology_info; 397ec681f3Smrg 407ec681f3Smrg#define INTEL_DEVICE_MAX_NAME_SIZE 64 417ec681f3Smrg#define INTEL_DEVICE_MAX_SLICES (6) /* Maximum on gfx10 */ 427ec681f3Smrg#define INTEL_DEVICE_MAX_SUBSLICES (8) /* Maximum on gfx11 */ 437ec681f3Smrg#define INTEL_DEVICE_MAX_EUS_PER_SUBSLICE (16) /* Maximum on gfx12 */ 447ec681f3Smrg#define INTEL_DEVICE_MAX_PIXEL_PIPES (3) /* Maximum on gfx12 */ 457ec681f3Smrg 467ec681f3Smrg/** 477ec681f3Smrg * Intel hardware information and quirks 487ec681f3Smrg */ 497ec681f3Smrgstruct intel_device_info 507ec681f3Smrg{ 517ec681f3Smrg /* Driver internal numbers used to differentiate platforms. */ 527ec681f3Smrg int ver; 537ec681f3Smrg int verx10; 547ec681f3Smrg int display_ver; 557ec681f3Smrg int revision; 567ec681f3Smrg int gt; 577ec681f3Smrg 587ec681f3Smrg bool is_g4x; 597ec681f3Smrg bool is_ivybridge; 607ec681f3Smrg bool is_baytrail; 617ec681f3Smrg bool is_haswell; 627ec681f3Smrg bool is_broadwell; 637ec681f3Smrg bool is_cherryview; 647ec681f3Smrg bool is_skylake; 657ec681f3Smrg bool is_broxton; 667ec681f3Smrg bool is_kabylake; 677ec681f3Smrg bool is_geminilake; 687ec681f3Smrg bool is_coffeelake; 697ec681f3Smrg bool is_elkhartlake; 707ec681f3Smrg bool is_tigerlake; 717ec681f3Smrg bool is_rocketlake; 727ec681f3Smrg bool is_dg1; 737ec681f3Smrg bool is_alderlake; 747ec681f3Smrg bool is_dg2; 757ec681f3Smrg 767ec681f3Smrg bool has_hiz_and_separate_stencil; 777ec681f3Smrg bool must_use_separate_stencil; 787ec681f3Smrg bool has_sample_with_hiz; 797ec681f3Smrg bool has_llc; 807ec681f3Smrg 817ec681f3Smrg bool has_pln; 827ec681f3Smrg bool has_64bit_float; 837ec681f3Smrg bool has_64bit_int; 847ec681f3Smrg bool has_integer_dword_mul; 857ec681f3Smrg bool has_compr4; 867ec681f3Smrg bool has_surface_tile_offset; 877ec681f3Smrg bool supports_simd16_3src; 887ec681f3Smrg bool disable_ccs_repack; 897ec681f3Smrg bool has_aux_map; 907ec681f3Smrg bool has_tiling_uapi; 917ec681f3Smrg bool has_ray_tracing; 927ec681f3Smrg bool has_local_mem; 937ec681f3Smrg bool has_lsc; 947ec681f3Smrg 957ec681f3Smrg /** 967ec681f3Smrg * \name Intel hardware quirks 977ec681f3Smrg * @{ 987ec681f3Smrg */ 997ec681f3Smrg bool has_negative_rhw_bug; 1007ec681f3Smrg 1017ec681f3Smrg /** 1027ec681f3Smrg * Some versions of Gen hardware don't do centroid interpolation correctly 1037ec681f3Smrg * on unlit pixels, causing incorrect values for derivatives near triangle 1047ec681f3Smrg * edges. Enabling this flag causes the fragment shader to use 1057ec681f3Smrg * non-centroid interpolation for unlit pixels, at the expense of two extra 1067ec681f3Smrg * fragment shader instructions. 1077ec681f3Smrg */ 1087ec681f3Smrg bool needs_unlit_centroid_workaround; 1097ec681f3Smrg /** @} */ 1107ec681f3Smrg 1117ec681f3Smrg /** 1127ec681f3Smrg * \name GPU hardware limits 1137ec681f3Smrg * 1147ec681f3Smrg * In general, you can find shader thread maximums by looking at the "Maximum 1157ec681f3Smrg * Number of Threads" field in the Intel PRM description of the 3DSTATE_VS, 1167ec681f3Smrg * 3DSTATE_GS, 3DSTATE_HS, 3DSTATE_DS, and 3DSTATE_PS commands. URB entry 1177ec681f3Smrg * limits come from the "Number of URB Entries" field in the 1187ec681f3Smrg * 3DSTATE_URB_VS command and friends. 1197ec681f3Smrg * 1207ec681f3Smrg * These fields are used to calculate the scratch space to allocate. The 1217ec681f3Smrg * amount of scratch space can be larger without being harmful on modern 1227ec681f3Smrg * GPUs, however, prior to Haswell, programming the maximum number of threads 1237ec681f3Smrg * to greater than the hardware maximum would cause GPU performance to tank. 1247ec681f3Smrg * 1257ec681f3Smrg * @{ 1267ec681f3Smrg */ 1277ec681f3Smrg /** 1287ec681f3Smrg * Total number of slices present on the device whether or not they've been 1297ec681f3Smrg * fused off. 1307ec681f3Smrg * 1317ec681f3Smrg * XXX: CS thread counts are limited by the inability to do cross subslice 1327ec681f3Smrg * communication. It is the effectively the number of logical threads which 1337ec681f3Smrg * can be executed in a subslice. Fuse configurations may cause this number 1347ec681f3Smrg * to change, so we program @max_cs_threads as the lower maximum. 1357ec681f3Smrg */ 1367ec681f3Smrg unsigned num_slices; 1377ec681f3Smrg 1387ec681f3Smrg /** 1397ec681f3Smrg * Maximum number of slices present on this device (can be more than 1407ec681f3Smrg * num_slices if some slices are fused). 1417ec681f3Smrg */ 1427ec681f3Smrg unsigned max_slices; 1437ec681f3Smrg 1447ec681f3Smrg /** 1457ec681f3Smrg * Number of subslices for each slice (used to be uniform until CNL). 1467ec681f3Smrg */ 1477ec681f3Smrg unsigned num_subslices[INTEL_DEVICE_MAX_SUBSLICES]; 1487ec681f3Smrg 1497ec681f3Smrg /** 1507ec681f3Smrg * Maximum number of subslices per slice present on this device (can be 1517ec681f3Smrg * more than the maximum value in the num_subslices[] array if some 1527ec681f3Smrg * subslices are fused). 1537ec681f3Smrg */ 1547ec681f3Smrg unsigned max_subslices_per_slice; 1557ec681f3Smrg 1567ec681f3Smrg /** 1577ec681f3Smrg * Number of subslices on each pixel pipe (ICL). 1587ec681f3Smrg */ 1597ec681f3Smrg unsigned ppipe_subslices[INTEL_DEVICE_MAX_PIXEL_PIPES]; 1607ec681f3Smrg 1617ec681f3Smrg /** 1627ec681f3Smrg * Upper bound of number of EU per subslice (some SKUs might have just 1 EU 1637ec681f3Smrg * fused across all subslices, like 47 EUs, in which case this number won't 1647ec681f3Smrg * be acurate for one subslice). 1657ec681f3Smrg */ 1667ec681f3Smrg unsigned num_eu_per_subslice; 1677ec681f3Smrg 1687ec681f3Smrg /** 1697ec681f3Smrg * Maximum number of EUs per subslice (can be more than num_eu_per_subslice 1707ec681f3Smrg * if some EUs are fused off). 1717ec681f3Smrg */ 1727ec681f3Smrg unsigned max_eu_per_subslice; 1737ec681f3Smrg 1747ec681f3Smrg /** 1757ec681f3Smrg * Number of threads per eu, varies between 4 and 8 between generations. 1767ec681f3Smrg */ 1777ec681f3Smrg unsigned num_thread_per_eu; 1787ec681f3Smrg 1797ec681f3Smrg /** 1807ec681f3Smrg * A bit mask of the slices available. 1817ec681f3Smrg */ 1827ec681f3Smrg uint8_t slice_masks; 1837ec681f3Smrg 1847ec681f3Smrg /** 1857ec681f3Smrg * An array of bit mask of the subslices available, use subslice_slice_stride 1867ec681f3Smrg * to access this array. 1877ec681f3Smrg */ 1887ec681f3Smrg uint8_t subslice_masks[INTEL_DEVICE_MAX_SLICES * 1897ec681f3Smrg DIV_ROUND_UP(INTEL_DEVICE_MAX_SUBSLICES, 8)]; 1907ec681f3Smrg 1917ec681f3Smrg /** 1927ec681f3Smrg * The number of enabled subslices (considering fusing). For exactly which 1937ec681f3Smrg * subslices are enabled, see subslice_masks[]. 1947ec681f3Smrg */ 1957ec681f3Smrg unsigned subslice_total; 1967ec681f3Smrg 1977ec681f3Smrg /** 1987ec681f3Smrg * An array of bit mask of EUs available, use eu_slice_stride & 1997ec681f3Smrg * eu_subslice_stride to access this array. 2007ec681f3Smrg */ 2017ec681f3Smrg uint8_t eu_masks[INTEL_DEVICE_MAX_SLICES * 2027ec681f3Smrg INTEL_DEVICE_MAX_SUBSLICES * 2037ec681f3Smrg DIV_ROUND_UP(INTEL_DEVICE_MAX_EUS_PER_SUBSLICE, 8)]; 2047ec681f3Smrg 2057ec681f3Smrg /** 2067ec681f3Smrg * Stride to access subslice_masks[]. 2077ec681f3Smrg */ 2087ec681f3Smrg uint16_t subslice_slice_stride; 2097ec681f3Smrg 2107ec681f3Smrg /** 2117ec681f3Smrg * Strides to access eu_masks[]. 2127ec681f3Smrg */ 2137ec681f3Smrg uint16_t eu_slice_stride; 2147ec681f3Smrg uint16_t eu_subslice_stride; 2157ec681f3Smrg 2167ec681f3Smrg unsigned l3_banks; 2177ec681f3Smrg unsigned max_vs_threads; /**< Maximum Vertex Shader threads */ 2187ec681f3Smrg unsigned max_tcs_threads; /**< Maximum Hull Shader threads */ 2197ec681f3Smrg unsigned max_tes_threads; /**< Maximum Domain Shader threads */ 2207ec681f3Smrg unsigned max_gs_threads; /**< Maximum Geometry Shader threads. */ 2217ec681f3Smrg /** 2227ec681f3Smrg * Theoretical maximum number of Pixel Shader threads. 2237ec681f3Smrg * 2247ec681f3Smrg * PSD means Pixel Shader Dispatcher. On modern Intel GPUs, hardware will 2257ec681f3Smrg * automatically scale pixel shader thread count, based on a single value 2267ec681f3Smrg * programmed into 3DSTATE_PS. 2277ec681f3Smrg * 2287ec681f3Smrg * To calculate the maximum number of threads for Gfx8 beyond (which have 2297ec681f3Smrg * multiple Pixel Shader Dispatchers): 2307ec681f3Smrg * 2317ec681f3Smrg * - Look up 3DSTATE_PS and find "Maximum Number of Threads Per PSD" 2327ec681f3Smrg * - Usually there's only one PSD per subslice, so use the number of 2337ec681f3Smrg * subslices for number of PSDs. 2347ec681f3Smrg * - For max_wm_threads, the total should be PSD threads * #PSDs. 2357ec681f3Smrg */ 2367ec681f3Smrg unsigned max_wm_threads; 2377ec681f3Smrg 2387ec681f3Smrg /** 2397ec681f3Smrg * Maximum Compute Shader threads. 2407ec681f3Smrg * 2417ec681f3Smrg * Thread count * number of EUs per subslice 2427ec681f3Smrg */ 2437ec681f3Smrg unsigned max_cs_threads; 2447ec681f3Smrg 2457ec681f3Smrg /** 2467ec681f3Smrg * Maximum number of threads per workgroup supported by the GPGPU_WALKER or 2477ec681f3Smrg * COMPUTE_WALKER command. 2487ec681f3Smrg * 2497ec681f3Smrg * This may be smaller than max_cs_threads as it takes into account added 2507ec681f3Smrg * restrictions on the GPGPU/COMPUTE_WALKER commands. While max_cs_threads 2517ec681f3Smrg * expresses the total parallelism of the GPU, this expresses the maximum 2527ec681f3Smrg * number of threads we can dispatch in a single workgroup. 2537ec681f3Smrg */ 2547ec681f3Smrg unsigned max_cs_workgroup_threads; 2557ec681f3Smrg 2567ec681f3Smrg /** 2577ec681f3Smrg * The maximum number of potential scratch ids. Due to hardware 2587ec681f3Smrg * implementation details, the range of scratch ids may be larger than the 2597ec681f3Smrg * number of subslices. 2607ec681f3Smrg */ 2617ec681f3Smrg unsigned max_scratch_ids[MESA_SHADER_STAGES]; 2627ec681f3Smrg 2637ec681f3Smrg struct { 2647ec681f3Smrg /** 2657ec681f3Smrg * Fixed size of the URB. 2667ec681f3Smrg * 2677ec681f3Smrg * On Gfx6 and DG1, this is measured in KB. Gfx4-5 instead measure 2687ec681f3Smrg * this in 512b blocks, as that's more convenient there. 2697ec681f3Smrg * 2707ec681f3Smrg * On most Gfx7+ platforms, the URB is a section of the L3 cache, 2717ec681f3Smrg * and can be resized based on the L3 programming. For those platforms, 2727ec681f3Smrg * simply leave this field blank (zero) - it isn't used. 2737ec681f3Smrg */ 2747ec681f3Smrg unsigned size; 2757ec681f3Smrg 2767ec681f3Smrg /** 2777ec681f3Smrg * The minimum number of URB entries. See the 3DSTATE_URB_<XS> docs. 2787ec681f3Smrg */ 2797ec681f3Smrg unsigned min_entries[4]; 2807ec681f3Smrg 2817ec681f3Smrg /** 2827ec681f3Smrg * The maximum number of URB entries. See the 3DSTATE_URB_<XS> docs. 2837ec681f3Smrg */ 2847ec681f3Smrg unsigned max_entries[4]; 2857ec681f3Smrg } urb; 2867ec681f3Smrg 2877ec681f3Smrg /* Maximum size in Kb that can be allocated to constants in the URB, this 2887ec681f3Smrg * is usually divided among the stages for implementing push constants. 2897ec681f3Smrg * See 3DSTATE_PUSH_CONSTANT_ALLOC_*. 2907ec681f3Smrg */ 2917ec681f3Smrg unsigned max_constant_urb_size_kb; 2927ec681f3Smrg 2937ec681f3Smrg /** 2947ec681f3Smrg * Size of the command streamer prefetch. This is important to know for 2957ec681f3Smrg * self modifying batches. 2967ec681f3Smrg */ 2977ec681f3Smrg unsigned cs_prefetch_size; 2987ec681f3Smrg 2997ec681f3Smrg /** 3007ec681f3Smrg * For the longest time the timestamp frequency for Gen's timestamp counter 3017ec681f3Smrg * could be assumed to be 12.5MHz, where the least significant bit neatly 3027ec681f3Smrg * corresponded to 80 nanoseconds. 3037ec681f3Smrg * 3047ec681f3Smrg * Since Gfx9 the numbers aren't so round, with a a frequency of 12MHz for 3057ec681f3Smrg * SKL (or scale factor of 83.33333333) and a frequency of 19200000Hz for 3067ec681f3Smrg * BXT. 3077ec681f3Smrg * 3087ec681f3Smrg * For simplicty to fit with the current code scaling by a single constant 3097ec681f3Smrg * to map from raw timestamps to nanoseconds we now do the conversion in 3107ec681f3Smrg * floating point instead of integer arithmetic. 3117ec681f3Smrg * 3127ec681f3Smrg * In general it's probably worth noting that the documented constants we 3137ec681f3Smrg * have for the per-platform timestamp frequencies aren't perfect and 3147ec681f3Smrg * shouldn't be trusted for scaling and comparing timestamps with a large 3157ec681f3Smrg * delta. 3167ec681f3Smrg * 3177ec681f3Smrg * E.g. with crude testing on my system using the 'correct' scale factor I'm 3187ec681f3Smrg * seeing a drift of ~2 milliseconds per second. 3197ec681f3Smrg */ 3207ec681f3Smrg uint64_t timestamp_frequency; 3217ec681f3Smrg 3227ec681f3Smrg uint64_t aperture_bytes; 3237ec681f3Smrg 3247ec681f3Smrg /** 3257ec681f3Smrg * ID to put into the .aub files. 3267ec681f3Smrg */ 3277ec681f3Smrg int simulator_id; 3287ec681f3Smrg 3297ec681f3Smrg /** 3307ec681f3Smrg * holds the pci device id 3317ec681f3Smrg */ 3327ec681f3Smrg uint32_t chipset_id; 3337ec681f3Smrg 3347ec681f3Smrg /** 3357ec681f3Smrg * holds the name of the device 3367ec681f3Smrg */ 3377ec681f3Smrg char name[INTEL_DEVICE_MAX_NAME_SIZE]; 3387ec681f3Smrg 3397ec681f3Smrg /** 3407ec681f3Smrg * no_hw is true when the chipset_id pci device id has been overridden 3417ec681f3Smrg */ 3427ec681f3Smrg bool no_hw; 3437ec681f3Smrg /** @} */ 3447ec681f3Smrg}; 3457ec681f3Smrg 3467ec681f3Smrg#ifdef GFX_VER 3477ec681f3Smrg 3487ec681f3Smrg#define intel_device_info_is_9lp(devinfo) \ 3497ec681f3Smrg (GFX_VER == 9 && ((devinfo)->is_broxton || (devinfo)->is_geminilake)) 3507ec681f3Smrg 3517ec681f3Smrg#else 3527ec681f3Smrg 3537ec681f3Smrg#define intel_device_info_is_9lp(devinfo) \ 3547ec681f3Smrg ((devinfo)->is_broxton || (devinfo)->is_geminilake) 3557ec681f3Smrg 3567ec681f3Smrg#endif 3577ec681f3Smrg 3587ec681f3Smrgstatic inline bool 3597ec681f3Smrgintel_device_info_subslice_available(const struct intel_device_info *devinfo, 3607ec681f3Smrg int slice, int subslice) 3617ec681f3Smrg{ 3627ec681f3Smrg return (devinfo->subslice_masks[slice * devinfo->subslice_slice_stride + 3637ec681f3Smrg subslice / 8] & (1U << (subslice % 8))) != 0; 3647ec681f3Smrg} 3657ec681f3Smrg 3667ec681f3Smrgstatic inline bool 3677ec681f3Smrgintel_device_info_eu_available(const struct intel_device_info *devinfo, 3687ec681f3Smrg int slice, int subslice, int eu) 3697ec681f3Smrg{ 3707ec681f3Smrg unsigned subslice_offset = slice * devinfo->eu_slice_stride + 3717ec681f3Smrg subslice * devinfo->eu_subslice_stride; 3727ec681f3Smrg 3737ec681f3Smrg return (devinfo->eu_masks[subslice_offset + eu / 8] & (1U << eu % 8)) != 0; 3747ec681f3Smrg} 3757ec681f3Smrg 3767ec681f3Smrgstatic inline uint32_t 3777ec681f3Smrgintel_device_info_subslice_total(const struct intel_device_info *devinfo) 3787ec681f3Smrg{ 3797ec681f3Smrg uint32_t total = 0; 3807ec681f3Smrg 3817ec681f3Smrg for (size_t i = 0; i < ARRAY_SIZE(devinfo->subslice_masks); i++) { 3827ec681f3Smrg total += __builtin_popcount(devinfo->subslice_masks[i]); 3837ec681f3Smrg } 3847ec681f3Smrg 3857ec681f3Smrg return total; 3867ec681f3Smrg} 3877ec681f3Smrg 3887ec681f3Smrgstatic inline uint32_t 3897ec681f3Smrgintel_device_info_eu_total(const struct intel_device_info *devinfo) 3907ec681f3Smrg{ 3917ec681f3Smrg uint32_t total = 0; 3927ec681f3Smrg 3937ec681f3Smrg for (uint32_t i = 0; i < ARRAY_SIZE(devinfo->eu_masks); i++) 3947ec681f3Smrg total += __builtin_popcount(devinfo->eu_masks[i]); 3957ec681f3Smrg 3967ec681f3Smrg return total; 3977ec681f3Smrg} 3987ec681f3Smrg 3997ec681f3Smrgstatic inline unsigned 4007ec681f3Smrgintel_device_info_num_dual_subslices(UNUSED 4017ec681f3Smrg const struct intel_device_info *devinfo) 4027ec681f3Smrg{ 4037ec681f3Smrg unreachable("TODO"); 4047ec681f3Smrg} 4057ec681f3Smrg 4067ec681f3Smrgint intel_device_name_to_pci_device_id(const char *name); 4077ec681f3Smrg 4087ec681f3Smrgstatic inline uint64_t 4097ec681f3Smrgintel_device_info_timebase_scale(const struct intel_device_info *devinfo, 4107ec681f3Smrg uint64_t gpu_timestamp) 4117ec681f3Smrg{ 4127ec681f3Smrg return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency; 4137ec681f3Smrg} 4147ec681f3Smrg 4157ec681f3Smrgbool intel_get_device_info_from_fd(int fh, struct intel_device_info *devinfo); 4167ec681f3Smrgbool intel_get_device_info_from_pci_id(int pci_id, 4177ec681f3Smrg struct intel_device_info *devinfo); 4187ec681f3Smrgint intel_get_aperture_size(int fd, uint64_t *size); 4197ec681f3Smrg 4207ec681f3Smrg#ifdef __cplusplus 4217ec681f3Smrg} 4227ec681f3Smrg#endif 4237ec681f3Smrg 4247ec681f3Smrg#endif /* INTEL_DEVICE_INFO_H */ 425