1 /* 2 * Copyright © 2013 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25#ifndef INTEL_DEVICE_INFO_H 26#define INTEL_DEVICE_INFO_H 27 28#include <stdbool.h> 29#include <stdint.h> 30 31#include "util/macros.h" 32#include "compiler/shader_enums.h" 33 34#ifdef __cplusplus 35extern "C" { 36#endif 37 38struct drm_i915_query_topology_info; 39 40#define INTEL_DEVICE_MAX_NAME_SIZE 64 41#define INTEL_DEVICE_MAX_SLICES (6) /* Maximum on gfx10 */ 42#define INTEL_DEVICE_MAX_SUBSLICES (8) /* Maximum on gfx11 */ 43#define INTEL_DEVICE_MAX_EUS_PER_SUBSLICE (16) /* Maximum on gfx12 */ 44#define INTEL_DEVICE_MAX_PIXEL_PIPES (3) /* Maximum on gfx12 */ 45 46/** 47 * Intel hardware information and quirks 48 */ 49struct intel_device_info 50{ 51 /* Driver internal numbers used to differentiate platforms. */ 52 int ver; 53 int verx10; 54 int display_ver; 55 int revision; 56 int gt; 57 58 bool is_g4x; 59 bool is_ivybridge; 60 bool is_baytrail; 61 bool is_haswell; 62 bool is_broadwell; 63 bool is_cherryview; 64 bool is_skylake; 65 bool is_broxton; 66 bool is_kabylake; 67 bool is_geminilake; 68 bool is_coffeelake; 69 bool is_elkhartlake; 70 bool is_tigerlake; 71 bool is_rocketlake; 72 bool is_dg1; 73 bool is_alderlake; 74 bool is_dg2; 75 76 bool has_hiz_and_separate_stencil; 77 bool must_use_separate_stencil; 78 bool has_sample_with_hiz; 79 bool has_llc; 80 81 bool has_pln; 82 bool has_64bit_float; 83 bool has_64bit_int; 84 bool has_integer_dword_mul; 85 bool has_compr4; 86 bool has_surface_tile_offset; 87 bool supports_simd16_3src; 88 bool disable_ccs_repack; 89 bool has_aux_map; 90 bool has_tiling_uapi; 91 bool has_ray_tracing; 92 bool has_local_mem; 93 bool has_lsc; 94 95 /** 96 * \name Intel hardware quirks 97 * @{ 98 */ 99 bool has_negative_rhw_bug; 100 101 /** 102 * Some versions of Gen hardware don't do centroid interpolation correctly 103 * on unlit pixels, causing incorrect values for derivatives near triangle 104 * edges. Enabling this flag causes the fragment shader to use 105 * non-centroid interpolation for unlit pixels, at the expense of two extra 106 * fragment shader instructions. 107 */ 108 bool needs_unlit_centroid_workaround; 109 /** @} */ 110 111 /** 112 * \name GPU hardware limits 113 * 114 * In general, you can find shader thread maximums by looking at the "Maximum 115 * Number of Threads" field in the Intel PRM description of the 3DSTATE_VS, 116 * 3DSTATE_GS, 3DSTATE_HS, 3DSTATE_DS, and 3DSTATE_PS commands. URB entry 117 * limits come from the "Number of URB Entries" field in the 118 * 3DSTATE_URB_VS command and friends. 119 * 120 * These fields are used to calculate the scratch space to allocate. The 121 * amount of scratch space can be larger without being harmful on modern 122 * GPUs, however, prior to Haswell, programming the maximum number of threads 123 * to greater than the hardware maximum would cause GPU performance to tank. 124 * 125 * @{ 126 */ 127 /** 128 * Total number of slices present on the device whether or not they've been 129 * fused off. 130 * 131 * XXX: CS thread counts are limited by the inability to do cross subslice 132 * communication. It is the effectively the number of logical threads which 133 * can be executed in a subslice. Fuse configurations may cause this number 134 * to change, so we program @max_cs_threads as the lower maximum. 135 */ 136 unsigned num_slices; 137 138 /** 139 * Maximum number of slices present on this device (can be more than 140 * num_slices if some slices are fused). 141 */ 142 unsigned max_slices; 143 144 /** 145 * Number of subslices for each slice (used to be uniform until CNL). 146 */ 147 unsigned num_subslices[INTEL_DEVICE_MAX_SUBSLICES]; 148 149 /** 150 * Maximum number of subslices per slice present on this device (can be 151 * more than the maximum value in the num_subslices[] array if some 152 * subslices are fused). 153 */ 154 unsigned max_subslices_per_slice; 155 156 /** 157 * Number of subslices on each pixel pipe (ICL). 158 */ 159 unsigned ppipe_subslices[INTEL_DEVICE_MAX_PIXEL_PIPES]; 160 161 /** 162 * Upper bound of number of EU per subslice (some SKUs might have just 1 EU 163 * fused across all subslices, like 47 EUs, in which case this number won't 164 * be acurate for one subslice). 165 */ 166 unsigned num_eu_per_subslice; 167 168 /** 169 * Maximum number of EUs per subslice (can be more than num_eu_per_subslice 170 * if some EUs are fused off). 171 */ 172 unsigned max_eu_per_subslice; 173 174 /** 175 * Number of threads per eu, varies between 4 and 8 between generations. 176 */ 177 unsigned num_thread_per_eu; 178 179 /** 180 * A bit mask of the slices available. 181 */ 182 uint8_t slice_masks; 183 184 /** 185 * An array of bit mask of the subslices available, use subslice_slice_stride 186 * to access this array. 187 */ 188 uint8_t subslice_masks[INTEL_DEVICE_MAX_SLICES * 189 DIV_ROUND_UP(INTEL_DEVICE_MAX_SUBSLICES, 8)]; 190 191 /** 192 * The number of enabled subslices (considering fusing). For exactly which 193 * subslices are enabled, see subslice_masks[]. 194 */ 195 unsigned subslice_total; 196 197 /** 198 * An array of bit mask of EUs available, use eu_slice_stride & 199 * eu_subslice_stride to access this array. 200 */ 201 uint8_t eu_masks[INTEL_DEVICE_MAX_SLICES * 202 INTEL_DEVICE_MAX_SUBSLICES * 203 DIV_ROUND_UP(INTEL_DEVICE_MAX_EUS_PER_SUBSLICE, 8)]; 204 205 /** 206 * Stride to access subslice_masks[]. 207 */ 208 uint16_t subslice_slice_stride; 209 210 /** 211 * Strides to access eu_masks[]. 212 */ 213 uint16_t eu_slice_stride; 214 uint16_t eu_subslice_stride; 215 216 unsigned l3_banks; 217 unsigned max_vs_threads; /**< Maximum Vertex Shader threads */ 218 unsigned max_tcs_threads; /**< Maximum Hull Shader threads */ 219 unsigned max_tes_threads; /**< Maximum Domain Shader threads */ 220 unsigned max_gs_threads; /**< Maximum Geometry Shader threads. */ 221 /** 222 * Theoretical maximum number of Pixel Shader threads. 223 * 224 * PSD means Pixel Shader Dispatcher. On modern Intel GPUs, hardware will 225 * automatically scale pixel shader thread count, based on a single value 226 * programmed into 3DSTATE_PS. 227 * 228 * To calculate the maximum number of threads for Gfx8 beyond (which have 229 * multiple Pixel Shader Dispatchers): 230 * 231 * - Look up 3DSTATE_PS and find "Maximum Number of Threads Per PSD" 232 * - Usually there's only one PSD per subslice, so use the number of 233 * subslices for number of PSDs. 234 * - For max_wm_threads, the total should be PSD threads * #PSDs. 235 */ 236 unsigned max_wm_threads; 237 238 /** 239 * Maximum Compute Shader threads. 240 * 241 * Thread count * number of EUs per subslice 242 */ 243 unsigned max_cs_threads; 244 245 /** 246 * Maximum number of threads per workgroup supported by the GPGPU_WALKER or 247 * COMPUTE_WALKER command. 248 * 249 * This may be smaller than max_cs_threads as it takes into account added 250 * restrictions on the GPGPU/COMPUTE_WALKER commands. While max_cs_threads 251 * expresses the total parallelism of the GPU, this expresses the maximum 252 * number of threads we can dispatch in a single workgroup. 253 */ 254 unsigned max_cs_workgroup_threads; 255 256 /** 257 * The maximum number of potential scratch ids. Due to hardware 258 * implementation details, the range of scratch ids may be larger than the 259 * number of subslices. 260 */ 261 unsigned max_scratch_ids[MESA_SHADER_STAGES]; 262 263 struct { 264 /** 265 * Fixed size of the URB. 266 * 267 * On Gfx6 and DG1, this is measured in KB. Gfx4-5 instead measure 268 * this in 512b blocks, as that's more convenient there. 269 * 270 * On most Gfx7+ platforms, the URB is a section of the L3 cache, 271 * and can be resized based on the L3 programming. For those platforms, 272 * simply leave this field blank (zero) - it isn't used. 273 */ 274 unsigned size; 275 276 /** 277 * The minimum number of URB entries. See the 3DSTATE_URB_<XS> docs. 278 */ 279 unsigned min_entries[4]; 280 281 /** 282 * The maximum number of URB entries. See the 3DSTATE_URB_<XS> docs. 283 */ 284 unsigned max_entries[4]; 285 } urb; 286 287 /* Maximum size in Kb that can be allocated to constants in the URB, this 288 * is usually divided among the stages for implementing push constants. 289 * See 3DSTATE_PUSH_CONSTANT_ALLOC_*. 290 */ 291 unsigned max_constant_urb_size_kb; 292 293 /** 294 * Size of the command streamer prefetch. This is important to know for 295 * self modifying batches. 296 */ 297 unsigned cs_prefetch_size; 298 299 /** 300 * For the longest time the timestamp frequency for Gen's timestamp counter 301 * could be assumed to be 12.5MHz, where the least significant bit neatly 302 * corresponded to 80 nanoseconds. 303 * 304 * Since Gfx9 the numbers aren't so round, with a a frequency of 12MHz for 305 * SKL (or scale factor of 83.33333333) and a frequency of 19200000Hz for 306 * BXT. 307 * 308 * For simplicty to fit with the current code scaling by a single constant 309 * to map from raw timestamps to nanoseconds we now do the conversion in 310 * floating point instead of integer arithmetic. 311 * 312 * In general it's probably worth noting that the documented constants we 313 * have for the per-platform timestamp frequencies aren't perfect and 314 * shouldn't be trusted for scaling and comparing timestamps with a large 315 * delta. 316 * 317 * E.g. with crude testing on my system using the 'correct' scale factor I'm 318 * seeing a drift of ~2 milliseconds per second. 319 */ 320 uint64_t timestamp_frequency; 321 322 uint64_t aperture_bytes; 323 324 /** 325 * ID to put into the .aub files. 326 */ 327 int simulator_id; 328 329 /** 330 * holds the pci device id 331 */ 332 uint32_t chipset_id; 333 334 /** 335 * holds the name of the device 336 */ 337 char name[INTEL_DEVICE_MAX_NAME_SIZE]; 338 339 /** 340 * no_hw is true when the chipset_id pci device id has been overridden 341 */ 342 bool no_hw; 343 /** @} */ 344}; 345 346#ifdef GFX_VER 347 348#define intel_device_info_is_9lp(devinfo) \ 349 (GFX_VER == 9 && ((devinfo)->is_broxton || (devinfo)->is_geminilake)) 350 351#else 352 353#define intel_device_info_is_9lp(devinfo) \ 354 ((devinfo)->is_broxton || (devinfo)->is_geminilake) 355 356#endif 357 358static inline bool 359intel_device_info_subslice_available(const struct intel_device_info *devinfo, 360 int slice, int subslice) 361{ 362 return (devinfo->subslice_masks[slice * devinfo->subslice_slice_stride + 363 subslice / 8] & (1U << (subslice % 8))) != 0; 364} 365 366static inline bool 367intel_device_info_eu_available(const struct intel_device_info *devinfo, 368 int slice, int subslice, int eu) 369{ 370 unsigned subslice_offset = slice * devinfo->eu_slice_stride + 371 subslice * devinfo->eu_subslice_stride; 372 373 return (devinfo->eu_masks[subslice_offset + eu / 8] & (1U << eu % 8)) != 0; 374} 375 376static inline uint32_t 377intel_device_info_subslice_total(const struct intel_device_info *devinfo) 378{ 379 uint32_t total = 0; 380 381 for (size_t i = 0; i < ARRAY_SIZE(devinfo->subslice_masks); i++) { 382 total += __builtin_popcount(devinfo->subslice_masks[i]); 383 } 384 385 return total; 386} 387 388static inline uint32_t 389intel_device_info_eu_total(const struct intel_device_info *devinfo) 390{ 391 uint32_t total = 0; 392 393 for (uint32_t i = 0; i < ARRAY_SIZE(devinfo->eu_masks); i++) 394 total += __builtin_popcount(devinfo->eu_masks[i]); 395 396 return total; 397} 398 399static inline unsigned 400intel_device_info_num_dual_subslices(UNUSED 401 const struct intel_device_info *devinfo) 402{ 403 unreachable("TODO"); 404} 405 406int intel_device_name_to_pci_device_id(const char *name); 407 408static inline uint64_t 409intel_device_info_timebase_scale(const struct intel_device_info *devinfo, 410 uint64_t gpu_timestamp) 411{ 412 return (1000000000ull * gpu_timestamp) / devinfo->timestamp_frequency; 413} 414 415bool intel_get_device_info_from_fd(int fh, struct intel_device_info *devinfo); 416bool intel_get_device_info_from_pci_id(int pci_id, 417 struct intel_device_info *devinfo); 418int intel_get_aperture_size(int fd, uint64_t *size); 419 420#ifdef __cplusplus 421} 422#endif 423 424#endif /* INTEL_DEVICE_INFO_H */ 425