intel_device_info.c revision 7ec681f3
1/* 2 * Copyright © 2013 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
22 */ 23 24#include <assert.h> 25#include <stdbool.h> 26#include <stdio.h> 27#include <stdlib.h> 28#include <string.h> 29#include <unistd.h> 30#include "intel_device_info.h" 31#include "intel/common/intel_gem.h" 32#include "util/bitscan.h" 33#include "util/debug.h" 34#include "util/log.h" 35#include "util/macros.h" 36 37#include "drm-uapi/i915_drm.h" 38 39static const struct { 40 const char *name; 41 int pci_id; 42} name_map[] = { 43 { "lpt", 0x27a2 }, 44 { "brw", 0x2a02 }, 45 { "g4x", 0x2a42 }, 46 { "ilk", 0x0042 }, 47 { "snb", 0x0126 }, 48 { "ivb", 0x016a }, 49 { "hsw", 0x0d2e }, 50 { "byt", 0x0f33 }, 51 { "bdw", 0x162e }, 52 { "chv", 0x22B3 }, 53 { "skl", 0x1912 }, 54 { "bxt", 0x5A85 }, 55 { "kbl", 0x5912 }, 56 { "aml", 0x591C }, 57 { "glk", 0x3185 }, 58 { "cfl", 0x3E9B }, 59 { "whl", 0x3EA1 }, 60 { "cml", 0x9b41 }, 61 { "icl", 0x8a52 }, 62 { "ehl", 0x4500 }, 63 { "jsl", 0x4E71 }, 64 { "tgl", 0x9a49 }, 65 { "rkl", 0x4c8a }, 66 { "dg1", 0x4905 }, 67 { "adl", 0x4680 }, 68 { "sg1", 0x4907 }, 69}; 70 71/** 72 * Get the PCI ID for the device name. 73 * 74 * Returns -1 if the device is not known. 
75 */ 76int 77intel_device_name_to_pci_device_id(const char *name) 78{ 79 for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) { 80 if (!strcmp(name_map[i].name, name)) 81 return name_map[i].pci_id; 82 } 83 84 return -1; 85} 86 87static const struct intel_device_info intel_device_info_gfx3 = { 88 .ver = 3, 89 .simulator_id = -1, 90 .cs_prefetch_size = 512, 91}; 92 93static const struct intel_device_info intel_device_info_i965 = { 94 .ver = 4, 95 .has_negative_rhw_bug = true, 96 .num_slices = 1, 97 .num_subslices = { 1, }, 98 .num_eu_per_subslice = 8, 99 .num_thread_per_eu = 4, 100 .max_vs_threads = 16, 101 .max_gs_threads = 2, 102 .max_wm_threads = 8 * 4, 103 .urb = { 104 .size = 256, 105 }, 106 .timestamp_frequency = 12500000, 107 .simulator_id = -1, 108 .cs_prefetch_size = 512, 109}; 110 111static const struct intel_device_info intel_device_info_g4x = { 112 .ver = 4, 113 .verx10 = 45, 114 .has_pln = true, 115 .has_compr4 = true, 116 .has_surface_tile_offset = true, 117 .is_g4x = true, 118 .num_slices = 1, 119 .num_subslices = { 1, }, 120 .num_eu_per_subslice = 10, 121 .num_thread_per_eu = 5, 122 .max_vs_threads = 32, 123 .max_gs_threads = 2, 124 .max_wm_threads = 10 * 5, 125 .urb = { 126 .size = 384, 127 }, 128 .timestamp_frequency = 12500000, 129 .simulator_id = -1, 130 .cs_prefetch_size = 512, 131}; 132 133static const struct intel_device_info intel_device_info_ilk = { 134 .ver = 5, 135 .has_pln = true, 136 .has_compr4 = true, 137 .has_surface_tile_offset = true, 138 .num_slices = 1, 139 .num_subslices = { 1, }, 140 .num_eu_per_subslice = 12, 141 .num_thread_per_eu = 6, 142 .max_vs_threads = 72, 143 .max_gs_threads = 32, 144 .max_wm_threads = 12 * 6, 145 .urb = { 146 .size = 1024, 147 }, 148 .timestamp_frequency = 12500000, 149 .simulator_id = -1, 150 .cs_prefetch_size = 512, 151}; 152 153static const struct intel_device_info intel_device_info_snb_gt1 = { 154 .ver = 6, 155 .gt = 1, 156 .has_hiz_and_separate_stencil = true, 157 .has_llc = true, 158 .has_pln = 
true,
   .has_surface_tile_offset = true,
   .needs_unlit_centroid_workaround = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 6,
   .num_thread_per_eu = 6, /* Not confirmed */
   .max_vs_threads = 24,
   .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
   .max_wm_threads = 40,
   .urb = {
      .size = 32,
      .min_entries = {
         [MESA_SHADER_VERTEX] = 24,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 256,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};

/* Static description of the ver 6, gt 2 part. */
static const struct intel_device_info intel_device_info_snb_gt2 = {
   .ver = 6,
   .gt = 2,
   .has_hiz_and_separate_stencil = true,
   .has_llc = true,
   .has_pln = true,
   .has_surface_tile_offset = true,
   .needs_unlit_centroid_workaround = true,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 12,
   .num_thread_per_eu = 6, /* Not confirmed */
   .max_vs_threads = 60,
   .max_gs_threads = 60,
   .max_wm_threads = 80,
   .urb = {
      .size = 64,
      .min_entries = {
         [MESA_SHADER_VERTEX] = 24,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 256,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .timestamp_frequency = 12500000,
   .simulator_id = -1,
   .cs_prefetch_size = 512,
};

/* Initializer fragment shared by every ver 7 description below.
 *
 * The macro is expanded inside a designated-initializer list, so a field that
 * a description names again after the macro replaces the value given here
 * (intel_device_info_byt does this for .has_llc).  Keep per-device overrides
 * after the macro for that reason.
 */
#define GFX7_FEATURES \
   .ver = 7, \
   .has_hiz_and_separate_stencil = true, \
   .must_use_separate_stencil = true, \
   .has_llc = true, \
   .has_pln = true, \
   .has_64bit_float = true, \
   .has_surface_tile_offset = true, \
   .timestamp_frequency = 12500000, \
   .max_constant_urb_size_kb = 16, \
   .cs_prefetch_size = 512

static const struct intel_device_info intel_device_info_ivb_gt1 = {
   GFX7_FEATURES, .is_ivybridge = true, .gt = 1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 6,
   .num_thread_per_eu = 6,
   .l3_banks = 2,
   .max_vs_threads = 36,
   .max_tcs_threads = 36,
   .max_tes_threads = 36,
   .max_gs_threads = 36,
   .max_wm_threads = 48,
   .max_cs_threads = 36,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 512,
         [MESA_SHADER_TESS_CTRL] = 32,
         [MESA_SHADER_TESS_EVAL] = 288,
         [MESA_SHADER_GEOMETRY] = 192,
      },
   },
   .simulator_id = 7,
};

static const struct intel_device_info intel_device_info_ivb_gt2 = {
   GFX7_FEATURES, .is_ivybridge = true, .gt = 2,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 12,
   .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
                            * @max_wm_threads ... */
   .l3_banks = 4,
   .max_vs_threads = 128,
   .max_tcs_threads = 128,
   .max_tes_threads = 128,
   .max_gs_threads = 128,
   .max_wm_threads = 172,
   .max_cs_threads = 64,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 704,
         [MESA_SHADER_TESS_CTRL] = 64,
         [MESA_SHADER_TESS_EVAL] = 448,
         [MESA_SHADER_GEOMETRY] = 320,
      },
   },
   .simulator_id = 7,
};

/* Same base as the ivb descriptions, but .has_llc is turned back off after
 * GFX7_FEATURES enabled it.
 */
static const struct intel_device_info intel_device_info_byt = {
   GFX7_FEATURES, .is_baytrail = true, .gt = 1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 4,
   .num_thread_per_eu = 8,
   .l3_banks = 1,
   .has_llc = false,
   .max_vs_threads = 36,
   .max_tcs_threads = 36,
   .max_tes_threads = 36,
   .max_gs_threads = 36,
   .max_wm_threads = 48,
   .max_cs_threads = 32,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 512,
         [MESA_SHADER_TESS_CTRL] = 32,
         [MESA_SHADER_TESS_EVAL] = 288,
         [MESA_SHADER_GEOMETRY] = 192,
      },
   },
   .simulator_id = 10,
};

/* GFX7_FEATURES plus the fields common to the three hsw descriptions; this is
 * where .verx10 becomes 75.
 */
#define HSW_FEATURES \
   GFX7_FEATURES, \
   .is_haswell = true, \
   .verx10 = 75, \
   .supports_simd16_3src = true

static const struct intel_device_info intel_device_info_hsw_gt1 = {
   HSW_FEATURES, .gt = 1,
   .num_slices = 1,
   .num_subslices = { 1, },
   .num_eu_per_subslice = 10,
   .num_thread_per_eu = 7,
   .l3_banks = 2,
   .max_vs_threads = 70,
   .max_tcs_threads = 70,
   .max_tes_threads = 70,
   .max_gs_threads = 70,
   .max_wm_threads = 102,
   .max_cs_threads = 70,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 32,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 640,
         [MESA_SHADER_TESS_CTRL] = 64,
         [MESA_SHADER_TESS_EVAL] = 384,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .simulator_id = 9,
};

static const struct intel_device_info intel_device_info_hsw_gt2 = {
   HSW_FEATURES, .gt = 2,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 10,
   .num_thread_per_eu = 7,
   .l3_banks = 4,
   .max_vs_threads = 280,
   .max_tcs_threads = 256,
   .max_tes_threads = 280,
   .max_gs_threads = 256,
   .max_wm_threads = 204,
   .max_cs_threads = 70,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 1664,
         [MESA_SHADER_TESS_CTRL] = 128,
         [MESA_SHADER_TESS_EVAL] = 960,
         [MESA_SHADER_GEOMETRY] = 640,
      },
   },
   .simulator_id = 9,
};

/* Unlike gt1/gt2, this description also raises .max_constant_urb_size_kb from
 * the GFX7_FEATURES value of 16 to 32.
 */
static const struct intel_device_info intel_device_info_hsw_gt3 = {
   HSW_FEATURES, .gt = 3,
   .num_slices = 2,
   .num_subslices = { 2, 2, },
   .num_eu_per_subslice = 10,
   .num_thread_per_eu = 7,
   .l3_banks = 8,
   .max_vs_threads = 280,
   .max_tcs_threads = 256,
   .max_tes_threads = 280,
   .max_gs_threads = 256,
   .max_wm_threads = 408,
   .max_cs_threads = 70,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 10,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 1664,
         [MESA_SHADER_TESS_CTRL] = 128,
         [MESA_SHADER_TESS_EVAL] = 960,
         [MESA_SHADER_GEOMETRY] = 640,
      },
   },
   .max_constant_urb_size_kb = 32,
   .simulator_id = 9,
};

/* It's unclear how well supported sampling from the hiz buffer is on GFX8,
 * so keep things conservative for now and set has_sample_with_hiz = false.
 *
 * This fragment is also the first element of the ver 9, 11 and 12 feature
 * macros later in the file, which then override individual fields (.ver
 * included) with initializers listed after it.
 */
#define GFX8_FEATURES \
   .ver = 8, \
   .has_hiz_and_separate_stencil = true, \
   .must_use_separate_stencil = true, \
   .has_llc = true, \
   .has_sample_with_hiz = false, \
   .has_pln = true, \
   .has_integer_dword_mul = true, \
   .has_64bit_float = true, \
   .has_64bit_int = true, \
   .supports_simd16_3src = true, \
   .has_surface_tile_offset = true, \
   .num_thread_per_eu = 7, \
   .max_vs_threads = 504, \
   .max_tcs_threads = 504, \
   .max_tes_threads = 504, \
   .max_gs_threads = 504, \
   .max_wm_threads = 384, \
   .timestamp_frequency = 12500000, \
   .max_constant_urb_size_kb = 32, \
   .cs_prefetch_size = 512

static const struct intel_device_info intel_device_info_bdw_gt1 = {
   GFX8_FEATURES, .gt = 1,
   .is_broadwell = true,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 6,
   .l3_banks = 2,
   .max_cs_threads = 42,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 2560,
         [MESA_SHADER_TESS_CTRL] = 504,
         [MESA_SHADER_TESS_EVAL] = 1536,
         /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1.
*/
         [MESA_SHADER_GEOMETRY] = 690,
      },
   },
   .simulator_id = 11,
};

static const struct intel_device_info intel_device_info_bdw_gt2 = {
   GFX8_FEATURES, .gt = 2,
   .is_broadwell = true,
   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 4,
   .max_cs_threads = 56,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 2560,
         [MESA_SHADER_TESS_CTRL] = 504,
         [MESA_SHADER_TESS_EVAL] = 1536,
         [MESA_SHADER_GEOMETRY] = 960,
      },
   },
   .simulator_id = 11,
};

static const struct intel_device_info intel_device_info_bdw_gt3 = {
   GFX8_FEATURES, .gt = 3,
   .is_broadwell = true,
   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 8,
   .max_cs_threads = 56,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 64,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 2560,
         [MESA_SHADER_TESS_CTRL] = 504,
         [MESA_SHADER_TESS_EVAL] = 1536,
         [MESA_SHADER_GEOMETRY] = 960,
      },
   },
   .simulator_id = 11,
};

/* Starts from GFX8_FEATURES and then overrides .has_llc,
 * .has_integer_dword_mul and the per-stage thread limits.
 */
static const struct intel_device_info intel_device_info_chv = {
   GFX8_FEATURES, .is_cherryview = 1, .gt = 1,
   .has_llc = false,
   .has_integer_dword_mul = false,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 8,
   .l3_banks = 2,
   .max_vs_threads = 80,
   .max_tcs_threads = 80,
   .max_tes_threads = 80,
   .max_gs_threads = 80,
   .max_wm_threads = 128,
   .max_cs_threads = 6 * 7,
   .urb = {
      .min_entries = {
         [MESA_SHADER_VERTEX] = 34,
         [MESA_SHADER_TESS_EVAL] = 34,
      },
      .max_entries = {
         [MESA_SHADER_VERTEX] = 640,
         [MESA_SHADER_TESS_CTRL] = 80,
         [MESA_SHADER_TESS_EVAL] = 384,
         [MESA_SHADER_GEOMETRY] = 256,
      },
   },
   .simulator_id = 13,
};

/* ver 9 thread limits, timestamp frequency and URB entry ranges.  Both users
 * in this file (GFX9_FEATURES and GFX9_LP_FEATURES) list it right after
 * GFX8_FEATURES, so the values here replace the ver 8 ones.
 */
#define GFX9_HW_INFO \
   .ver = 9, \
   .max_vs_threads = 336, \
   .max_gs_threads = 336, \
   .max_tcs_threads = 336, \
   .max_tes_threads = 336, \
   .max_cs_threads = 56, \
   .timestamp_frequency = 12000000, \
   .cs_prefetch_size = 512, \
   .urb = { \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 64, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 1856, \
         [MESA_SHADER_TESS_CTRL] = 672, \
         [MESA_SHADER_TESS_EVAL] = 1120, \
         [MESA_SHADER_GEOMETRY] = 640, \
      }, \
   }

/* Base for the bxt/glk descriptions: GFX8_FEATURES and GFX9_HW_INFO followed
 * by overrides.  The trailing .urb initializer replaces the whole .urb
 * initializer coming from GFX9_HW_INFO.
 */
#define GFX9_LP_FEATURES \
   GFX8_FEATURES, \
   GFX9_HW_INFO, \
   .has_integer_dword_mul = false, \
   .gt = 1, \
   .has_llc = false, \
   .has_sample_with_hiz = true, \
   .num_slices = 1, \
   .num_thread_per_eu = 6, \
   .max_vs_threads = 112, \
   .max_tcs_threads = 112, \
   .max_tes_threads = 112, \
   .max_gs_threads = 112, \
   .max_cs_threads = 6 * 6, \
   .timestamp_frequency = 19200000, \
   .urb = { \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 34, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 704, \
         [MESA_SHADER_TESS_CTRL] = 256, \
         [MESA_SHADER_TESS_EVAL] = 416, \
         [MESA_SHADER_GEOMETRY] = 256, \
      }, \
   }

/* GFX9_LP_FEATURES with 3 subslices of 6 EUs. */
#define GFX9_LP_FEATURES_3X6 \
   GFX9_LP_FEATURES, \
   .num_subslices = { 3, }, \
   .num_eu_per_subslice = 6

/* GFX9_LP_FEATURES with 2 subslices of 6 EUs; the VS/TCS/TES/GS thread limits
 * and the URB max_entries are half of the GFX9_LP_FEATURES values.
 */
#define GFX9_LP_FEATURES_2X6 \
   GFX9_LP_FEATURES, \
   .num_subslices = { 2, }, \
   .num_eu_per_subslice = 6, \
   .max_vs_threads = 56, \
   .max_tcs_threads = 56, \
   .max_tes_threads = 56, \
   .max_gs_threads = 56, \
   .max_cs_threads = 6 * 6, \
   .urb = { \
      .min_entries = { \
         [MESA_SHADER_VERTEX] = 34, \
         [MESA_SHADER_TESS_EVAL] = 34, \
      }, \
      .max_entries = { \
         [MESA_SHADER_VERTEX] = 352, \
         [MESA_SHADER_TESS_CTRL] = 128, \
         [MESA_SHADER_TESS_EVAL] = 208, \
         [MESA_SHADER_GEOMETRY] = 128, \
      }, \
   }

/* Base for the skl/kbl/cfl descriptions.  Re-enables .has_sample_with_hiz,
 * which GFX8_FEATURES sets to false.
 */
#define GFX9_FEATURES \
   GFX8_FEATURES, \
   GFX9_HW_INFO, \
   .has_sample_with_hiz = true

static const struct intel_device_info intel_device_info_skl_gt1 = {
   GFX9_FEATURES, .gt = 1,
   .is_skylake = true,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 6,
   .l3_banks = 2,
   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
    * leading to some vertices to go missing if we use too much URB.
    */
   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
   .simulator_id = 12,
};

static const struct intel_device_info intel_device_info_skl_gt2 = {
   GFX9_FEATURES, .gt = 2,
   .is_skylake = true,
   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 4,
   .simulator_id = 12,
};

static const struct intel_device_info intel_device_info_skl_gt3 = {
   GFX9_FEATURES, .gt = 3,
   .is_skylake = true,
   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 8,
   .simulator_id = 12,
};

static const struct intel_device_info intel_device_info_skl_gt4 = {
   GFX9_FEATURES, .gt = 4,
   .is_skylake = true,
   .num_slices = 3,
   .num_subslices = { 3, 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 12,
   /* From the "L3 Allocation and Programming" documentation:
    *
    * "URB is limited to 1008KB due to programming restrictions.  This is not a
    * restriction of the L3 implementation, but of the FF and other clients.
    * Therefore, in a GT4 implementation it is possible for the programmed
    * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
    * only 1008KB of this will be used."
*/
   .simulator_id = 12,
};

static const struct intel_device_info intel_device_info_bxt = {
   GFX9_LP_FEATURES_3X6,
   .is_broxton = true,
   .l3_banks = 2,
   .simulator_id = 14,
};

static const struct intel_device_info intel_device_info_bxt_2x6 = {
   GFX9_LP_FEATURES_2X6,
   .is_broxton = true,
   .l3_banks = 1,
   .simulator_id = 14,
};
/*
 * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
 * There's no KBL entry. Using the default SKL (GFX9) GS entries value.
 */

static const struct intel_device_info intel_device_info_kbl_gt1 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 1,

   .max_cs_threads = 7 * 6,
   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 6,
   .l3_banks = 2,
   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
    * leading to some vertices to go missing if we use too much URB.
    */
   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
   .simulator_id = 16,
};

/* NOTE(review): .gt is 1 for this "gt1_5" description as well; what differs
 * from kbl_gt1 is the subslice count, .l3_banks and the absence of the URB
 * max_entries overrides.  Confirm .gt = 1 is intended.
 */
static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 1,

   .max_cs_threads = 7 * 6,
   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 6,
   .l3_banks = 4,
   .simulator_id = 16,
};

static const struct intel_device_info intel_device_info_kbl_gt2 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 2,

   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 4,
   .simulator_id = 16,
};

static const struct intel_device_info intel_device_info_kbl_gt3 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 3,

   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 8,
   .simulator_id = 16,
};

static const struct intel_device_info intel_device_info_kbl_gt4 = {
   GFX9_FEATURES,
   .is_kabylake = true,
   .gt = 4,

   /*
    * From the "L3 Allocation and Programming" documentation:
    *
    * "URB is limited to 1008KB due to programming restrictions.  This
    * is not a restriction of the L3 implementation, but of the FF and
    * other clients.  Therefore, in a GT4 implementation it is
    * possible for the programmed allocation of the L3 data array to
    * provide 3*384KB=1152KB for URB, but only 1008KB of this
    * will be used."
    */
   .num_slices = 3,
   .num_subslices = { 3, 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 12,
   .simulator_id = 16,
};

static const struct intel_device_info intel_device_info_glk = {
   GFX9_LP_FEATURES_3X6,
   .is_geminilake = true,
   .l3_banks = 2,
   .simulator_id = 17,
};

/* NOTE(review): intel_device_info_bxt_2x6 uses .l3_banks = 1 for the same
 * 2x6 layout while this description keeps 2 -- confirm that is intended.
 */
static const struct intel_device_info intel_device_info_glk_2x6 = {
   GFX9_LP_FEATURES_2X6,
   .is_geminilake = true,
   .l3_banks = 2,
   .simulator_id = 17,
};

static const struct intel_device_info intel_device_info_cfl_gt1 = {
   GFX9_FEATURES,
   .is_coffeelake = true,
   .gt = 1,

   .num_slices = 1,
   .num_subslices = { 2, },
   .num_eu_per_subslice = 6,
   .l3_banks = 2,
   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
    * leading to some vertices to go missing if we use too much URB.
*/
   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
   .simulator_id = 24,
};
static const struct intel_device_info intel_device_info_cfl_gt2 = {
   GFX9_FEATURES,
   .is_coffeelake = true,
   .gt = 2,

   .num_slices = 1,
   .num_subslices = { 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 4,
   .simulator_id = 24,
};

static const struct intel_device_info intel_device_info_cfl_gt3 = {
   GFX9_FEATURES,
   .is_coffeelake = true,
   .gt = 3,

   .num_slices = 2,
   .num_subslices = { 3, 3, },
   .num_eu_per_subslice = 8,
   .l3_banks = 8,
   .simulator_id = 24,
};

/* Wraps its arguments in braces so that a per-slice subslice list can be
 * handed to GFX11_FEATURES() as a single macro argument (a bare "{ a, b }"
 * would be split at the comma).  "args..." is the GNU named-variadic form.
 */
#define subslices(args...) { args, }

/* ver 11 values that replace the GFX8_FEATURES ones when listed after it. */
#define GFX11_HW_INFO \
   .ver = 11, \
   .has_pln = false, \
   .max_vs_threads = 364, \
   .max_gs_threads = 224, \
   .max_tcs_threads = 224, \
   .max_tes_threads = 364, \
   .max_cs_threads = 56, \
   .cs_prefetch_size = 512

/* Base initializer for the ver 11 descriptions.
 *
 * _gt        value stored in .gt
 * _slices    value stored in .num_slices
 * _subslices brace-enclosed initializer for .num_subslices, built with
 *            subslices()
 * _l3        value stored in .l3_banks
 *
 * .num_eu_per_subslice defaults to 8; several ehl descriptions override it.
 */
#define GFX11_FEATURES(_gt, _slices, _subslices, _l3) \
   GFX8_FEATURES, \
   GFX11_HW_INFO, \
   .has_64bit_float = false, \
   .has_64bit_int = false, \
   .has_integer_dword_mul = false, \
   .has_sample_with_hiz = false, \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
   .num_subslices = _subslices, \
   .num_eu_per_subslice = 8

/* Members of a .urb initializer; must be expanded inside ".urb = { ... }". */
#define GFX11_URB_MIN_MAX_ENTRIES \
   .min_entries = { \
      [MESA_SHADER_VERTEX] = 64, \
      [MESA_SHADER_TESS_EVAL] = 34, \
   }, \
   .max_entries = { \
      [MESA_SHADER_VERTEX] = 2384, \
      [MESA_SHADER_TESS_CTRL] = 1032, \
      [MESA_SHADER_TESS_EVAL] = 2384, \
      [MESA_SHADER_GEOMETRY] = 1032, \
   }

static const struct intel_device_info intel_device_info_icl_gt2 = {
   GFX11_FEATURES(2, 1, subslices(8), 8),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};

static const struct intel_device_info intel_device_info_icl_gt1_5 = {
   GFX11_FEATURES(1, 1, subslices(6), 6),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};

static const struct intel_device_info intel_device_info_icl_gt1 = {
   GFX11_FEATURES(1, 1, subslices(4), 6),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};

static const struct intel_device_info intel_device_info_icl_gt0_5 = {
   GFX11_FEATURES(1, 1, subslices(1), 6),
   .urb = {
      GFX11_URB_MIN_MAX_ENTRIES,
   },
   .simulator_id = 19,
};

/* Fields shared by all ehl descriptions; listed after GFX11_FEATURES(). */
#define GFX11_LP_FEATURES \
   .is_elkhartlake = true, \
   .urb = { \
      GFX11_URB_MIN_MAX_ENTRIES, \
   }, \
   .disable_ccs_repack = true, \
   .simulator_id = 28

static const struct intel_device_info intel_device_info_ehl_4x8 = {
   GFX11_FEATURES(1, 1, subslices(4), 4),
   GFX11_LP_FEATURES,
};

static const struct intel_device_info intel_device_info_ehl_4x6 = {
   GFX11_FEATURES(1, 1, subslices(4), 4),
   GFX11_LP_FEATURES,
   .num_eu_per_subslice = 6,
};

static const struct intel_device_info intel_device_info_ehl_4x5 = {
   GFX11_FEATURES(1, 1, subslices(4), 4),
   GFX11_LP_FEATURES,
   .num_eu_per_subslice = 5,
};

static const struct intel_device_info intel_device_info_ehl_4x4 = {
   GFX11_FEATURES(1, 1, subslices(4), 4),
   GFX11_LP_FEATURES,
   .num_eu_per_subslice = 4,
};

static const struct intel_device_info intel_device_info_ehl_2x8 = {
   GFX11_FEATURES(1, 1, subslices(2), 4),
   GFX11_LP_FEATURES,
};

static const struct intel_device_info intel_device_info_ehl_2x4 = {
   GFX11_FEATURES(1, 1, subslices(2), 4),
   GFX11_LP_FEATURES,
   .num_eu_per_subslice =4,
};

/* Members of a .urb initializer; must be expanded inside ".urb = { ... }". */
#define GFX12_URB_MIN_MAX_ENTRIES \
   .min_entries = { \
      [MESA_SHADER_VERTEX] = 64, \
      [MESA_SHADER_TESS_EVAL] = 34, \
   }, \
   .max_entries = { \
      [MESA_SHADER_VERTEX] = 3576, \
      [MESA_SHADER_TESS_CTRL] = 1548, \
      [MESA_SHADER_TESS_EVAL] = 3576, \
      /* Wa_14013840143 */ \
      [MESA_SHADER_GEOMETRY] = 1536, \
   }

/* ver 12 values that replace the GFX8_FEATURES ones when listed after it. */
#define GFX12_HW_INFO \
   .ver = 12, \
   .has_pln = false, \
   .has_sample_with_hiz = false, \
   .has_aux_map = true, \
   .max_vs_threads = 546, \
   .max_gs_threads = 336, \
   .max_tcs_threads = 336, \
   .max_tes_threads = 546, \
   .max_cs_threads = 112, /* threads per DSS */ \
   .urb = { \
      GFX12_URB_MIN_MAX_ENTRIES, \
   }

/* Base initializer for the ver 12 descriptions.
 *
 * _gt     value stored in .gt
 * _slices value stored in .num_slices
 * _l3     value stored in .l3_banks
 *
 * The trailing .cs_prefetch_size = 512 repeats the value GFX8_FEATURES
 * already provides.
 */
#define GFX12_FEATURES(_gt, _slices, _l3) \
   GFX8_FEATURES, \
   GFX12_HW_INFO, \
   .has_64bit_float = false, \
   .has_64bit_int = false, \
   .has_integer_dword_mul = false, \
   .gt = _gt, .num_slices = _slices, .l3_banks = _l3, \
   .simulator_id = 22, \
   .num_eu_per_subslice = 16, \
   .cs_prefetch_size = 512

/* Same brace-wrapping helper as subslices(), used for .num_subslices in the
 * ver 12 macros below.
 */
#define dual_subslices(args...) { args, }

/* gt 1, one slice, 4 L3 banks and a single entry of 1 in .num_subslices. */
#define GFX12_GT05_FEATURES \
   GFX12_FEATURES(1, 1, 4), \
   .num_subslices = dual_subslices(1)

/* One slice; _gt == 1 selects 4 L3 banks and 2 in .num_subslices[0], any
 * other value selects 8 L3 banks and 6.
 */
#define GFX12_GT_FEATURES(_gt) \
   GFX12_FEATURES(_gt, 1, _gt == 1 ? 4 : 8), \
   .num_subslices = dual_subslices(_gt == 1 ? 2 : 6)

static const struct intel_device_info intel_device_info_tgl_gt1 = {
   GFX12_GT_FEATURES(1),
   .is_tigerlake = true,
};

static const struct intel_device_info intel_device_info_tgl_gt2 = {
   GFX12_GT_FEATURES(2),
   .is_tigerlake = true,
};

static const struct intel_device_info intel_device_info_rkl_gt05 = {
   GFX12_GT05_FEATURES,
   .is_rocketlake = true,
};

static const struct intel_device_info intel_device_info_rkl_gt1 = {
   GFX12_GT_FEATURES(1),
   .is_rocketlake = true,
};

static const struct intel_device_info intel_device_info_adl_gt05 = {
   GFX12_GT05_FEATURES,
   .is_alderlake = true,
};

static const struct intel_device_info intel_device_info_adl_gt1 = {
   GFX12_GT_FEATURES(1),
   .is_alderlake = true,
};

/* The only adl description that sets .display_ver. */
static const struct intel_device_info intel_device_info_adl_gt2 = {
   GFX12_GT_FEATURES(2),
   .is_alderlake = true,
   .display_ver = 13,
};

#define GFX12_DG1_SG1_FEATURES \
GFX12_GT_FEATURES(2), \ 1004 .is_dg1 = true, \ 1005 .has_llc = false, \ 1006 .has_local_mem = true, \ 1007 .urb.size = 768, \ 1008 .simulator_id = 30 1009 1010static const struct intel_device_info intel_device_info_dg1 = { 1011 GFX12_DG1_SG1_FEATURES, 1012}; 1013 1014static const struct intel_device_info intel_device_info_sg1 = { 1015 GFX12_DG1_SG1_FEATURES, 1016}; 1017 1018static void 1019reset_masks(struct intel_device_info *devinfo) 1020{ 1021 devinfo->subslice_slice_stride = 0; 1022 devinfo->eu_subslice_stride = 0; 1023 devinfo->eu_slice_stride = 0; 1024 1025 devinfo->num_slices = 0; 1026 devinfo->num_eu_per_subslice = 0; 1027 memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices)); 1028 1029 memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks)); 1030 memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks)); 1031 memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks)); 1032 memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices)); 1033} 1034 1035static void 1036update_from_topology(struct intel_device_info *devinfo, 1037 const struct drm_i915_query_topology_info *topology) 1038{ 1039 reset_masks(devinfo); 1040 1041 assert(topology->max_slices > 0); 1042 assert(topology->max_subslices > 0); 1043 assert(topology->max_eus_per_subslice > 0); 1044 1045 devinfo->subslice_slice_stride = topology->subslice_stride; 1046 1047 devinfo->eu_subslice_stride = DIV_ROUND_UP(topology->max_eus_per_subslice, 8); 1048 devinfo->eu_slice_stride = topology->max_subslices * devinfo->eu_subslice_stride; 1049 1050 assert(sizeof(devinfo->slice_masks) >= DIV_ROUND_UP(topology->max_slices, 8)); 1051 memcpy(&devinfo->slice_masks, topology->data, DIV_ROUND_UP(topology->max_slices, 8)); 1052 devinfo->num_slices = __builtin_popcount(devinfo->slice_masks); 1053 devinfo->max_slices = topology->max_slices; 1054 devinfo->max_subslices_per_slice = topology->max_subslices; 1055 devinfo->max_eu_per_subslice = topology->max_eus_per_subslice; 1056 1057 
uint32_t subslice_mask_len = 1058 topology->max_slices * topology->subslice_stride; 1059 assert(sizeof(devinfo->subslice_masks) >= subslice_mask_len); 1060 memcpy(devinfo->subslice_masks, &topology->data[topology->subslice_offset], 1061 subslice_mask_len); 1062 1063 uint32_t n_subslices = 0; 1064 for (int s = 0; s < topology->max_slices; s++) { 1065 if ((devinfo->slice_masks & (1 << s)) == 0) 1066 continue; 1067 1068 for (int b = 0; b < devinfo->subslice_slice_stride; b++) { 1069 devinfo->num_subslices[s] += 1070 __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]); 1071 } 1072 n_subslices += devinfo->num_subslices[s]; 1073 } 1074 assert(n_subslices > 0); 1075 1076 if (devinfo->ver >= 11) { 1077 /* On current ICL+ hardware we only have one slice. */ 1078 assert(devinfo->slice_masks == 1); 1079 1080 /* Count the number of subslices on each pixel pipe. Assume that every 1081 * contiguous group of 4 subslices in the mask belong to the same pixel 1082 * pipe. However note that on TGL the kernel returns a mask of enabled 1083 * *dual* subslices instead of actual subslices somewhat confusingly, so 1084 * each pixel pipe only takes 2 bits in the mask even though it's still 1085 * 4 subslices. 1086 */ 1087 const unsigned ppipe_bits = devinfo->ver >= 12 ? 
2 : 4; 1088 for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) { 1089 const unsigned ppipe_mask = BITFIELD_RANGE(p * ppipe_bits, ppipe_bits); 1090 devinfo->ppipe_subslices[p] = 1091 __builtin_popcount(devinfo->subslice_masks[0] & ppipe_mask); 1092 } 1093 } 1094 1095 if (devinfo->ver == 12 && devinfo->num_slices == 1) { 1096 if (n_subslices >= 6) { 1097 assert(n_subslices == 6); 1098 devinfo->l3_banks = 8; 1099 } else if (n_subslices > 2) { 1100 devinfo->l3_banks = 6; 1101 } else { 1102 devinfo->l3_banks = 4; 1103 } 1104 } 1105 1106 uint32_t eu_mask_len = 1107 topology->eu_stride * topology->max_subslices * topology->max_slices; 1108 assert(sizeof(devinfo->eu_masks) >= eu_mask_len); 1109 memcpy(devinfo->eu_masks, &topology->data[topology->eu_offset], eu_mask_len); 1110 1111 uint32_t n_eus = 0; 1112 for (int b = 0; b < eu_mask_len; b++) 1113 n_eus += __builtin_popcount(devinfo->eu_masks[b]); 1114 1115 devinfo->num_eu_per_subslice = DIV_ROUND_UP(n_eus, n_subslices); 1116} 1117 1118/* Generate detailed mask from the I915_PARAM_SLICE_MASK, 1119 * I915_PARAM_SUBSLICE_MASK & I915_PARAM_EU_TOTAL getparam. 
1120 */ 1121static bool 1122update_from_masks(struct intel_device_info *devinfo, uint32_t slice_mask, 1123 uint32_t subslice_mask, uint32_t n_eus) 1124{ 1125 struct drm_i915_query_topology_info *topology; 1126 1127 assert((slice_mask & 0xff) == slice_mask); 1128 1129 size_t data_length = 100; 1130 1131 topology = calloc(1, sizeof(*topology) + data_length); 1132 if (!topology) 1133 return false; 1134 1135 topology->max_slices = util_last_bit(slice_mask); 1136 topology->max_subslices = util_last_bit(subslice_mask); 1137 1138 topology->subslice_offset = DIV_ROUND_UP(topology->max_slices, 8); 1139 topology->subslice_stride = DIV_ROUND_UP(topology->max_subslices, 8); 1140 1141 uint32_t n_subslices = __builtin_popcount(slice_mask) * 1142 __builtin_popcount(subslice_mask); 1143 uint32_t num_eu_per_subslice = DIV_ROUND_UP(n_eus, n_subslices); 1144 uint32_t eu_mask = (1U << num_eu_per_subslice) - 1; 1145 1146 topology->max_eus_per_subslice = num_eu_per_subslice; 1147 topology->eu_offset = topology->subslice_offset + 1148 topology->max_slices * DIV_ROUND_UP(topology->max_subslices, 8); 1149 topology->eu_stride = DIV_ROUND_UP(num_eu_per_subslice, 8); 1150 1151 /* Set slice mask in topology */ 1152 for (int b = 0; b < topology->subslice_offset; b++) 1153 topology->data[b] = (slice_mask >> (b * 8)) & 0xff; 1154 1155 for (int s = 0; s < topology->max_slices; s++) { 1156 1157 /* Set subslice mask in topology */ 1158 for (int b = 0; b < topology->subslice_stride; b++) { 1159 int subslice_offset = topology->subslice_offset + 1160 s * topology->subslice_stride + b; 1161 1162 topology->data[subslice_offset] = (subslice_mask >> (b * 8)) & 0xff; 1163 } 1164 1165 /* Set eu mask in topology */ 1166 for (int ss = 0; ss < topology->max_subslices; ss++) { 1167 for (int b = 0; b < topology->eu_stride; b++) { 1168 int eu_offset = topology->eu_offset + 1169 (s * topology->max_subslices + ss) * topology->eu_stride + b; 1170 1171 topology->data[eu_offset] = (eu_mask >> (b * 8)) & 0xff; 1172 } 
1173 } 1174 } 1175 1176 update_from_topology(devinfo, topology); 1177 free(topology); 1178 1179 return true; 1180} 1181 1182/* Generate mask from the device data. */ 1183static void 1184fill_masks(struct intel_device_info *devinfo) 1185{ 1186 /* All of our internal device descriptions assign the same number of 1187 * subslices for each slice. Just verify that this is true. 1188 */ 1189 for (int s = 1; s < devinfo->num_slices; s++) 1190 assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]); 1191 1192 update_from_masks(devinfo, 1193 (1U << devinfo->num_slices) - 1, 1194 (1U << devinfo->num_subslices[0]) - 1, 1195 devinfo->num_slices * devinfo->num_subslices[0] * 1196 devinfo->num_eu_per_subslice); 1197} 1198 1199static bool 1200getparam(int fd, uint32_t param, int *value) 1201{ 1202 int tmp; 1203 1204 struct drm_i915_getparam gp = { 1205 .param = param, 1206 .value = &tmp, 1207 }; 1208 1209 int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp); 1210 if (ret != 0) 1211 return false; 1212 1213 *value = tmp; 1214 return true; 1215} 1216 1217static void 1218update_cs_workgroup_threads(struct intel_device_info *devinfo) 1219{ 1220 /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we 1221 * can program is 64 without going up to a rectangular group. This only 1222 * impacts Haswell and TGL which have higher thread counts. 1223 * 1224 * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+ 1225 * is 10 bits so we have no such restrictions. 1226 */ 1227 devinfo->max_cs_workgroup_threads = 1228 devinfo->verx10 >= 125 ? 
devinfo->max_cs_threads : 1229 MIN2(devinfo->max_cs_threads, 64); 1230} 1231 1232bool 1233intel_get_device_info_from_pci_id(int pci_id, 1234 struct intel_device_info *devinfo) 1235{ 1236 switch (pci_id) { 1237#undef CHIPSET 1238#define CHIPSET(id, family, fam_str, name) \ 1239 case id: *devinfo = intel_device_info_##family; break; 1240#include "pci_ids/i965_pci_ids.h" 1241#include "pci_ids/iris_pci_ids.h" 1242 1243#undef CHIPSET 1244#define CHIPSET(id, fam_str, name) \ 1245 case id: *devinfo = intel_device_info_gfx3; break; 1246#include "pci_ids/i915_pci_ids.h" 1247 1248 default: 1249 mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id); 1250 return false; 1251 } 1252 1253 switch (pci_id) { 1254#undef CHIPSET 1255#define CHIPSET(_id, _family, _fam_str, _name) \ 1256 case _id: \ 1257 /* sizeof(str_literal) includes the null */ \ 1258 STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \ 1259 sizeof(devinfo->name)); \ 1260 strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \ 1261 break; 1262#include "pci_ids/i965_pci_ids.h" 1263#include "pci_ids/iris_pci_ids.h" 1264 default: 1265 strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name)); 1266 } 1267 1268 fill_masks(devinfo); 1269 1270 /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer: 1271 * 1272 * "Scratch Space per slice is computed based on 4 sub-slices. SW must 1273 * allocate scratch space enough so that each slice has 4 slices allowed." 1274 * 1275 * The equivalent internal documentation says that this programming note 1276 * applies to all Gfx9+ platforms. 1277 * 1278 * The hardware typically calculates the scratch space pointer by taking 1279 * the base address, and adding per-thread-scratch-space * thread ID. 1280 * Extra padding can be necessary depending how the thread IDs are 1281 * calculated for a particular shader stage. 
1282 */ 1283 1284 switch(devinfo->ver) { 1285 case 9: 1286 devinfo->max_wm_threads = 64 /* threads-per-PSD */ 1287 * devinfo->num_slices 1288 * 4; /* effective subslices per slice */ 1289 break; 1290 case 11: 1291 case 12: 1292 devinfo->max_wm_threads = 128 /* threads-per-PSD */ 1293 * devinfo->num_slices 1294 * 8; /* subslices per slice */ 1295 break; 1296 default: 1297 assert(devinfo->ver < 9); 1298 break; 1299 } 1300 1301 assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices)); 1302 1303 if (devinfo->verx10 == 0) 1304 devinfo->verx10 = devinfo->ver * 10; 1305 1306 if (devinfo->display_ver == 0) 1307 devinfo->display_ver = devinfo->ver; 1308 1309 update_cs_workgroup_threads(devinfo); 1310 1311 devinfo->chipset_id = pci_id; 1312 return true; 1313} 1314 1315/** 1316 * for gfx8/gfx9, SLICE_MASK/SUBSLICE_MASK can be used to compute the topology 1317 * (kernel 4.13+) 1318 */ 1319static bool 1320getparam_topology(struct intel_device_info *devinfo, int fd) 1321{ 1322 int slice_mask = 0; 1323 if (!getparam(fd, I915_PARAM_SLICE_MASK, &slice_mask)) 1324 goto maybe_warn; 1325 1326 int n_eus; 1327 if (!getparam(fd, I915_PARAM_EU_TOTAL, &n_eus)) 1328 goto maybe_warn; 1329 1330 int subslice_mask = 0; 1331 if (!getparam(fd, I915_PARAM_SUBSLICE_MASK, &subslice_mask)) 1332 goto maybe_warn; 1333 1334 return update_from_masks(devinfo, slice_mask, subslice_mask, n_eus); 1335 1336 maybe_warn: 1337 /* Only with Gfx8+ are we starting to see devices with fusing that can only 1338 * be detected at runtime. 
1339 */ 1340 if (devinfo->ver >= 8) 1341 mesa_logw("Kernel 4.1 required to properly query GPU properties."); 1342 1343 return false; 1344} 1345 1346/** 1347 * preferred API for updating the topology in devinfo (kernel 4.17+) 1348 */ 1349static bool 1350query_topology(struct intel_device_info *devinfo, int fd) 1351{ 1352 struct drm_i915_query_topology_info *topo_info = 1353 intel_i915_query_alloc(fd, DRM_I915_QUERY_TOPOLOGY_INFO); 1354 if (topo_info == NULL) 1355 return false; 1356 1357 update_from_topology(devinfo, topo_info); 1358 1359 free(topo_info); 1360 1361 return true; 1362 1363} 1364 1365int 1366intel_get_aperture_size(int fd, uint64_t *size) 1367{ 1368 struct drm_i915_gem_get_aperture aperture = { 0 }; 1369 1370 int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture); 1371 if (ret == 0 && size) 1372 *size = aperture.aper_size; 1373 1374 return ret; 1375} 1376 1377static bool 1378has_get_tiling(int fd) 1379{ 1380 int ret; 1381 1382 struct drm_i915_gem_create gem_create = { 1383 .size = 4096, 1384 }; 1385 1386 if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) { 1387 unreachable("Failed to create GEM BO"); 1388 return false; 1389 } 1390 1391 struct drm_i915_gem_get_tiling get_tiling = { 1392 .handle = gem_create.handle, 1393 }; 1394 ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &get_tiling); 1395 1396 struct drm_gem_close close = { 1397 .handle = gem_create.handle, 1398 }; 1399 intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close); 1400 1401 return ret == 0; 1402} 1403 1404static void 1405fixup_chv_device_info(struct intel_device_info *devinfo) 1406{ 1407 assert(devinfo->is_cherryview); 1408 1409 /* Cherryview is annoying. The number of EUs is depending on fusing and 1410 * isn't determinable from the PCI ID alone. We default to the minimum 1411 * available for that PCI ID and then compute the real value from the 1412 * subslice information we get from the kernel. 
1413 */ 1414 const uint32_t subslice_total = intel_device_info_subslice_total(devinfo); 1415 const uint32_t eu_total = intel_device_info_eu_total(devinfo); 1416 1417 /* Logical CS threads = EUs per subslice * num threads per EU */ 1418 uint32_t max_cs_threads = 1419 eu_total / subslice_total * devinfo->num_thread_per_eu; 1420 1421 /* Fuse configurations may give more threads than expected, never less. */ 1422 if (max_cs_threads > devinfo->max_cs_threads) 1423 devinfo->max_cs_threads = max_cs_threads; 1424 1425 update_cs_workgroup_threads(devinfo); 1426 1427 /* Braswell is even more annoying. Its marketing name isn't determinable 1428 * from the PCI ID and is also dependent on fusing. 1429 */ 1430 if (devinfo->chipset_id != 0x22B1) 1431 return; 1432 1433 char *bsw_model; 1434 switch (eu_total) { 1435 case 16: bsw_model = "405"; break; 1436 case 12: bsw_model = "400"; break; 1437 default: bsw_model = " "; break; 1438 } 1439 1440 char *needle = strstr(devinfo->name, "XXX"); 1441 assert(needle); 1442 if (needle) 1443 memcpy(needle, bsw_model, 3); 1444} 1445 1446static void 1447init_max_scratch_ids(struct intel_device_info *devinfo) 1448{ 1449 /* Determine the max number of subslices that potentially might be used in 1450 * scratch space ids. 1451 * 1452 * For, Gfx11+, scratch space allocation is based on the number of threads 1453 * in the base configuration. 1454 * 1455 * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and 1456 * we wish to view that there are 4 subslices per slice instead of the 1457 * actual number of subslices per slice. The documentation for 3DSTATE_PS 1458 * "Scratch Space Base Pointer" says: 1459 * 1460 * "Scratch Space per slice is computed based on 4 sub-slices. SW 1461 * must allocate scratch space enough so that each slice has 4 1462 * slices allowed." 1463 * 1464 * According to the other driver team, this applies to compute shaders 1465 * as well. This is not currently documented at all. 
1466 * 1467 * For Gfx8 and older we user devinfo->subslice_total. 1468 */ 1469 unsigned subslices; 1470 if (devinfo->verx10 == 125) 1471 subslices = 32; 1472 else if (devinfo->ver == 12) 1473 subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2); 1474 else if (devinfo->ver == 11) 1475 subslices = 8; 1476 else if (devinfo->ver >= 9 && devinfo->ver < 11) 1477 subslices = 4 * devinfo->num_slices; 1478 else 1479 subslices = devinfo->subslice_total; 1480 assert(subslices >= devinfo->subslice_total); 1481 1482 unsigned scratch_ids_per_subslice; 1483 if (devinfo->ver >= 12) { 1484 /* Same as ICL below, but with 16 EUs. */ 1485 scratch_ids_per_subslice = 16 * 8; 1486 } else if (devinfo->ver >= 11) { 1487 /* The MEDIA_VFE_STATE docs say: 1488 * 1489 * "Starting with this configuration, the Maximum Number of 1490 * Threads must be set to (#EU * 8) for GPGPU dispatches. 1491 * 1492 * Although there are only 7 threads per EU in the configuration, 1493 * the FFTID is calculated as if there are 8 threads per EU, 1494 * which in turn requires a larger amount of Scratch Space to be 1495 * allocated by the driver." 1496 */ 1497 scratch_ids_per_subslice = 8 * 8; 1498 } else if (devinfo->is_haswell) { 1499 /* WaCSScratchSize:hsw 1500 * 1501 * Haswell's scratch space address calculation appears to be sparse 1502 * rather than tightly packed. The Thread ID has bits indicating 1503 * which subslice, EU within a subslice, and thread within an EU it 1504 * is. There's a maximum of two slices and two subslices, so these 1505 * can be stored with a single bit. Even though there are only 10 EUs 1506 * per subslice, this is stored in 4 bits, so there's an effective 1507 * maximum value of 16 EUs. Similarly, although there are only 7 1508 * threads per EU, this is stored in a 3 bit number, giving an 1509 * effective maximum value of 8 threads per EU. 1510 * 1511 * This means that we need to use 16 * 8 instead of 10 * 7 for the 1512 * number of threads per subslice. 
1513 */ 1514 scratch_ids_per_subslice = 16 * 8; 1515 } else if (devinfo->is_cherryview) { 1516 /* Cherryview devices have either 6 or 8 EUs per subslice, and each 1517 * EU has 7 threads. The 6 EU devices appear to calculate thread IDs 1518 * as if it had 8 EUs. 1519 */ 1520 scratch_ids_per_subslice = 8 * 7; 1521 } else { 1522 scratch_ids_per_subslice = devinfo->max_cs_threads; 1523 } 1524 1525 unsigned max_thread_ids = scratch_ids_per_subslice * subslices; 1526 1527 if (devinfo->verx10 >= 125) { 1528 /* On GFX version 12.5, scratch access changed to a surface-based model. 1529 * Instead of each shader type having its own layout based on IDs passed 1530 * from the relevant fixed-function unit, all scratch access is based on 1531 * thread IDs like it always has been for compute. 1532 */ 1533 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++) 1534 devinfo->max_scratch_ids[i] = max_thread_ids; 1535 } else { 1536 unsigned max_scratch_ids[] = { 1537 [MESA_SHADER_VERTEX] = devinfo->max_vs_threads, 1538 [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads, 1539 [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads, 1540 [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads, 1541 [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads, 1542 [MESA_SHADER_COMPUTE] = max_thread_ids, 1543 }; 1544 STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids)); 1545 memcpy(devinfo->max_scratch_ids, max_scratch_ids, 1546 sizeof(devinfo->max_scratch_ids)); 1547 } 1548} 1549 1550bool 1551intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo) 1552{ 1553 int devid = 0; 1554 1555 const char *devid_override = getenv("INTEL_DEVID_OVERRIDE"); 1556 if (devid_override && strlen(devid_override) > 0) { 1557 if (geteuid() == getuid()) { 1558 devid = intel_device_name_to_pci_device_id(devid_override); 1559 /* Fallback to PCI ID. 
*/ 1560 if (devid <= 0) 1561 devid = strtol(devid_override, NULL, 0); 1562 if (devid <= 0) { 1563 mesa_loge("Invalid INTEL_DEVID_OVERRIDE=\"%s\". " 1564 "Use a valid numeric PCI ID or one of the supported " 1565 "platform names:", devid_override); 1566 for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) 1567 mesa_loge(" %s", name_map[i].name); 1568 return false; 1569 } 1570 } else { 1571 mesa_logi("Ignoring INTEL_DEVID_OVERRIDE=\"%s\" because " 1572 "real and effective user ID don't match.", devid_override); 1573 } 1574 } 1575 1576 if (devid > 0) { 1577 if (!intel_get_device_info_from_pci_id(devid, devinfo)) 1578 return false; 1579 devinfo->no_hw = true; 1580 } else { 1581 /* query the device id */ 1582 if (!getparam(fd, I915_PARAM_CHIPSET_ID, &devid)) 1583 return false; 1584 if (!intel_get_device_info_from_pci_id(devid, devinfo)) 1585 return false; 1586 devinfo->no_hw = env_var_as_boolean("INTEL_NO_HW", false); 1587 } 1588 1589 if (devinfo->ver == 10) { 1590 mesa_loge("Gfx10 support is redacted."); 1591 return false; 1592 } 1593 1594 /* remaining initializion queries the kernel for device info */ 1595 if (devinfo->no_hw) 1596 return true; 1597 1598 int timestamp_frequency; 1599 if (getparam(fd, I915_PARAM_CS_TIMESTAMP_FREQUENCY, 1600 ×tamp_frequency)) 1601 devinfo->timestamp_frequency = timestamp_frequency; 1602 else if (devinfo->ver >= 10) { 1603 mesa_loge("Kernel 4.15 required to read the CS timestamp frequency."); 1604 return false; 1605 } 1606 1607 if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision)) 1608 devinfo->revision = 0; 1609 1610 if (!query_topology(devinfo, fd)) { 1611 if (devinfo->ver >= 10) { 1612 /* topology uAPI required for CNL+ (kernel 4.17+) */ 1613 return false; 1614 } 1615 1616 /* else use the kernel 4.13+ api for gfx8+. For older kernels, topology 1617 * will be wrong, affecting GPU metrics. In this case, fail silently. 
1618 */ 1619 getparam_topology(devinfo, fd); 1620 } 1621 1622 if (devinfo->is_cherryview) 1623 fixup_chv_device_info(devinfo); 1624 1625 intel_get_aperture_size(fd, &devinfo->aperture_bytes); 1626 devinfo->has_tiling_uapi = has_get_tiling(fd); 1627 1628 devinfo->subslice_total = 0; 1629 for (uint32_t i = 0; i < devinfo->max_slices; i++) 1630 devinfo->subslice_total += __builtin_popcount(devinfo->subslice_masks[i]); 1631 1632 /* Gfx7 and older do not support EU/Subslice info */ 1633 assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7); 1634 devinfo->subslice_total = MAX2(devinfo->subslice_total, 1); 1635 1636 init_max_scratch_ids(devinfo); 1637 1638 return true; 1639} 1640