intel_device_info.c revision 7ec681f3
1/*
2 * Copyright © 2013 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include <assert.h>
25#include <stdbool.h>
26#include <stdio.h>
27#include <stdlib.h>
28#include <string.h>
29#include <unistd.h>
30#include "intel_device_info.h"
31#include "intel/common/intel_gem.h"
32#include "util/bitscan.h"
33#include "util/debug.h"
34#include "util/log.h"
35#include "util/macros.h"
36
37#include "drm-uapi/i915_drm.h"
38
/* Lookup table used by intel_device_name_to_pci_device_id(): each entry
 * pairs a short platform name with the PCI device ID returned for it.
 */
static const struct {
   const char *name;
   int pci_id;
} name_map[] = {
   { .name = "lpt", .pci_id = 0x27a2 },
   { .name = "brw", .pci_id = 0x2a02 },
   { .name = "g4x", .pci_id = 0x2a42 },
   { .name = "ilk", .pci_id = 0x0042 },
   { .name = "snb", .pci_id = 0x0126 },
   { .name = "ivb", .pci_id = 0x016a },
   { .name = "hsw", .pci_id = 0x0d2e },
   { .name = "byt", .pci_id = 0x0f33 },
   { .name = "bdw", .pci_id = 0x162e },
   { .name = "chv", .pci_id = 0x22b3 },
   { .name = "skl", .pci_id = 0x1912 },
   { .name = "bxt", .pci_id = 0x5a85 },
   { .name = "kbl", .pci_id = 0x5912 },
   { .name = "aml", .pci_id = 0x591c },
   { .name = "glk", .pci_id = 0x3185 },
   { .name = "cfl", .pci_id = 0x3e9b },
   { .name = "whl", .pci_id = 0x3ea1 },
   { .name = "cml", .pci_id = 0x9b41 },
   { .name = "icl", .pci_id = 0x8a52 },
   { .name = "ehl", .pci_id = 0x4500 },
   { .name = "jsl", .pci_id = 0x4e71 },
   { .name = "tgl", .pci_id = 0x9a49 },
   { .name = "rkl", .pci_id = 0x4c8a },
   { .name = "dg1", .pci_id = 0x4905 },
   { .name = "adl", .pci_id = 0x4680 },
   { .name = "sg1", .pci_id = 0x4907 },
};
70
71/**
72 * Get the PCI ID for the device name.
73 *
74 * Returns -1 if the device is not known.
75 */
76int
77intel_device_name_to_pci_device_id(const char *name)
78{
79   for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++) {
80      if (!strcmp(name_map[i].name, name))
81         return name_map[i].pci_id;
82   }
83
84   return -1;
85}
86
87static const struct intel_device_info intel_device_info_gfx3 = {
88   .ver = 3,
89   .simulator_id = -1,
90   .cs_prefetch_size = 512,
91};
92
93static const struct intel_device_info intel_device_info_i965 = {
94   .ver = 4,
95   .has_negative_rhw_bug = true,
96   .num_slices = 1,
97   .num_subslices = { 1, },
98   .num_eu_per_subslice = 8,
99   .num_thread_per_eu = 4,
100   .max_vs_threads = 16,
101   .max_gs_threads = 2,
102   .max_wm_threads = 8 * 4,
103   .urb = {
104      .size = 256,
105   },
106   .timestamp_frequency = 12500000,
107   .simulator_id = -1,
108   .cs_prefetch_size = 512,
109};
110
111static const struct intel_device_info intel_device_info_g4x = {
112   .ver = 4,
113   .verx10 = 45,
114   .has_pln = true,
115   .has_compr4 = true,
116   .has_surface_tile_offset = true,
117   .is_g4x = true,
118   .num_slices = 1,
119   .num_subslices = { 1, },
120   .num_eu_per_subslice = 10,
121   .num_thread_per_eu = 5,
122   .max_vs_threads = 32,
123   .max_gs_threads = 2,
124   .max_wm_threads = 10 * 5,
125   .urb = {
126      .size = 384,
127   },
128   .timestamp_frequency = 12500000,
129   .simulator_id = -1,
130   .cs_prefetch_size = 512,
131};
132
133static const struct intel_device_info intel_device_info_ilk = {
134   .ver = 5,
135   .has_pln = true,
136   .has_compr4 = true,
137   .has_surface_tile_offset = true,
138   .num_slices = 1,
139   .num_subslices = { 1, },
140   .num_eu_per_subslice = 12,
141   .num_thread_per_eu = 6,
142   .max_vs_threads = 72,
143   .max_gs_threads = 32,
144   .max_wm_threads = 12 * 6,
145   .urb = {
146      .size = 1024,
147   },
148   .timestamp_frequency = 12500000,
149   .simulator_id = -1,
150   .cs_prefetch_size = 512,
151};
152
153static const struct intel_device_info intel_device_info_snb_gt1 = {
154   .ver = 6,
155   .gt = 1,
156   .has_hiz_and_separate_stencil = true,
157   .has_llc = true,
158   .has_pln = true,
159   .has_surface_tile_offset = true,
160   .needs_unlit_centroid_workaround = true,
161   .num_slices = 1,
162   .num_subslices = { 1, },
163   .num_eu_per_subslice = 6,
164   .num_thread_per_eu = 6, /* Not confirmed */
165   .max_vs_threads = 24,
166   .max_gs_threads = 21, /* conservative; 24 if rendering disabled. */
167   .max_wm_threads = 40,
168   .urb = {
169      .size = 32,
170      .min_entries = {
171         [MESA_SHADER_VERTEX]   = 24,
172      },
173      .max_entries = {
174         [MESA_SHADER_VERTEX]   = 256,
175         [MESA_SHADER_GEOMETRY] = 256,
176      },
177   },
178   .timestamp_frequency = 12500000,
179   .simulator_id = -1,
180   .cs_prefetch_size = 512,
181};
182
183static const struct intel_device_info intel_device_info_snb_gt2 = {
184   .ver = 6,
185   .gt = 2,
186   .has_hiz_and_separate_stencil = true,
187   .has_llc = true,
188   .has_pln = true,
189   .has_surface_tile_offset = true,
190   .needs_unlit_centroid_workaround = true,
191   .num_slices = 1,
192   .num_subslices = { 1, },
193   .num_eu_per_subslice = 12,
194   .num_thread_per_eu = 6, /* Not confirmed */
195   .max_vs_threads = 60,
196   .max_gs_threads = 60,
197   .max_wm_threads = 80,
198   .urb = {
199      .size = 64,
200      .min_entries = {
201         [MESA_SHADER_VERTEX]   = 24,
202      },
203      .max_entries = {
204         [MESA_SHADER_VERTEX]   = 256,
205         [MESA_SHADER_GEOMETRY] = 256,
206      },
207   },
208   .timestamp_frequency = 12500000,
209   .simulator_id = -1,
210   .cs_prefetch_size = 512,
211};
212
/* Designated-initializer fragment shared by the ver = 7 descriptions in this
 * file.  It expands without a trailing comma, so users write
 * "GFX7_FEATURES, .field = ..." and may override any of these values with a
 * later designator.
 */
#define GFX7_FEATURES                      \
   .ver = 7,                               \
   .cs_prefetch_size = 512,                \
   .timestamp_frequency = 12500000,        \
   .max_constant_urb_size_kb = 16,         \
   .has_llc = true,                        \
   .has_pln = true,                        \
   .has_64bit_float = true,                \
   .has_surface_tile_offset = true,        \
   .has_hiz_and_separate_stencil = true,   \
   .must_use_separate_stencil = true
224
225static const struct intel_device_info intel_device_info_ivb_gt1 = {
226   GFX7_FEATURES, .is_ivybridge = true, .gt = 1,
227   .num_slices = 1,
228   .num_subslices = { 1, },
229   .num_eu_per_subslice = 6,
230   .num_thread_per_eu = 6,
231   .l3_banks = 2,
232   .max_vs_threads = 36,
233   .max_tcs_threads = 36,
234   .max_tes_threads = 36,
235   .max_gs_threads = 36,
236   .max_wm_threads = 48,
237   .max_cs_threads = 36,
238   .urb = {
239      .min_entries = {
240         [MESA_SHADER_VERTEX]    = 32,
241         [MESA_SHADER_TESS_EVAL] = 10,
242      },
243      .max_entries = {
244         [MESA_SHADER_VERTEX]    = 512,
245         [MESA_SHADER_TESS_CTRL] = 32,
246         [MESA_SHADER_TESS_EVAL] = 288,
247         [MESA_SHADER_GEOMETRY]  = 192,
248      },
249   },
250   .simulator_id = 7,
251};
252
253static const struct intel_device_info intel_device_info_ivb_gt2 = {
254   GFX7_FEATURES, .is_ivybridge = true, .gt = 2,
255   .num_slices = 1,
256   .num_subslices = { 1, },
257   .num_eu_per_subslice = 12,
258   .num_thread_per_eu = 8, /* Not sure why this isn't a multiple of
259                            * @max_wm_threads ... */
260   .l3_banks = 4,
261   .max_vs_threads = 128,
262   .max_tcs_threads = 128,
263   .max_tes_threads = 128,
264   .max_gs_threads = 128,
265   .max_wm_threads = 172,
266   .max_cs_threads = 64,
267   .urb = {
268      .min_entries = {
269         [MESA_SHADER_VERTEX]    = 32,
270         [MESA_SHADER_TESS_EVAL] = 10,
271      },
272      .max_entries = {
273         [MESA_SHADER_VERTEX]    = 704,
274         [MESA_SHADER_TESS_CTRL] = 64,
275         [MESA_SHADER_TESS_EVAL] = 448,
276         [MESA_SHADER_GEOMETRY]  = 320,
277      },
278   },
279   .simulator_id = 7,
280};
281
282static const struct intel_device_info intel_device_info_byt = {
283   GFX7_FEATURES, .is_baytrail = true, .gt = 1,
284   .num_slices = 1,
285   .num_subslices = { 1, },
286   .num_eu_per_subslice = 4,
287   .num_thread_per_eu = 8,
288   .l3_banks = 1,
289   .has_llc = false,
290   .max_vs_threads = 36,
291   .max_tcs_threads = 36,
292   .max_tes_threads = 36,
293   .max_gs_threads = 36,
294   .max_wm_threads = 48,
295   .max_cs_threads = 32,
296   .urb = {
297      .min_entries = {
298         [MESA_SHADER_VERTEX]    = 32,
299         [MESA_SHADER_TESS_EVAL] = 10,
300      },
301      .max_entries = {
302         [MESA_SHADER_VERTEX]    = 512,
303         [MESA_SHADER_TESS_CTRL] = 32,
304         [MESA_SHADER_TESS_EVAL] = 288,
305         [MESA_SHADER_GEOMETRY]  = 192,
306      },
307   },
308   .simulator_id = 10,
309};
310
/* GFX7_FEATURES plus the fields common to the is_haswell descriptions
 * (verx10 = 75).  Expands without a trailing comma.
 */
#define HSW_FEATURES                \
   GFX7_FEATURES,                   \
   .verx10 = 75,                    \
   .is_haswell = true,              \
   .supports_simd16_3src = true
316
317static const struct intel_device_info intel_device_info_hsw_gt1 = {
318   HSW_FEATURES, .gt = 1,
319   .num_slices = 1,
320   .num_subslices = { 1, },
321   .num_eu_per_subslice = 10,
322   .num_thread_per_eu = 7,
323   .l3_banks = 2,
324   .max_vs_threads = 70,
325   .max_tcs_threads = 70,
326   .max_tes_threads = 70,
327   .max_gs_threads = 70,
328   .max_wm_threads = 102,
329   .max_cs_threads = 70,
330   .urb = {
331      .min_entries = {
332         [MESA_SHADER_VERTEX]    = 32,
333         [MESA_SHADER_TESS_EVAL] = 10,
334      },
335      .max_entries = {
336         [MESA_SHADER_VERTEX]    = 640,
337         [MESA_SHADER_TESS_CTRL] = 64,
338         [MESA_SHADER_TESS_EVAL] = 384,
339         [MESA_SHADER_GEOMETRY]  = 256,
340      },
341   },
342   .simulator_id = 9,
343};
344
345static const struct intel_device_info intel_device_info_hsw_gt2 = {
346   HSW_FEATURES, .gt = 2,
347   .num_slices = 1,
348   .num_subslices = { 2, },
349   .num_eu_per_subslice = 10,
350   .num_thread_per_eu = 7,
351   .l3_banks = 4,
352   .max_vs_threads = 280,
353   .max_tcs_threads = 256,
354   .max_tes_threads = 280,
355   .max_gs_threads = 256,
356   .max_wm_threads = 204,
357   .max_cs_threads = 70,
358   .urb = {
359      .min_entries = {
360         [MESA_SHADER_VERTEX]    = 64,
361         [MESA_SHADER_TESS_EVAL] = 10,
362      },
363      .max_entries = {
364         [MESA_SHADER_VERTEX]    = 1664,
365         [MESA_SHADER_TESS_CTRL] = 128,
366         [MESA_SHADER_TESS_EVAL] = 960,
367         [MESA_SHADER_GEOMETRY]  = 640,
368      },
369   },
370   .simulator_id = 9,
371};
372
373static const struct intel_device_info intel_device_info_hsw_gt3 = {
374   HSW_FEATURES, .gt = 3,
375   .num_slices = 2,
376   .num_subslices = { 2, 2, },
377   .num_eu_per_subslice = 10,
378   .num_thread_per_eu = 7,
379   .l3_banks = 8,
380   .max_vs_threads = 280,
381   .max_tcs_threads = 256,
382   .max_tes_threads = 280,
383   .max_gs_threads = 256,
384   .max_wm_threads = 408,
385   .max_cs_threads = 70,
386   .urb = {
387      .min_entries = {
388         [MESA_SHADER_VERTEX]    = 64,
389         [MESA_SHADER_TESS_EVAL] = 10,
390      },
391      .max_entries = {
392         [MESA_SHADER_VERTEX]    = 1664,
393         [MESA_SHADER_TESS_CTRL] = 128,
394         [MESA_SHADER_TESS_EVAL] = 960,
395         [MESA_SHADER_GEOMETRY]  = 640,
396      },
397   },
398   .max_constant_urb_size_kb = 32,
399   .simulator_id = 9,
400};
401
/* Designated-initializer fragment used as the base of the ver >= 8
 * descriptions in this file; later designators may override any value.
 * Expands without a trailing comma.
 *
 * It's unclear how well supported sampling from the hiz buffer is on GFX8,
 * so keep things conservative for now and set has_sample_with_hiz = false.
 */
#define GFX8_FEATURES                      \
   .ver = 8,                               \
   .cs_prefetch_size = 512,                \
   .timestamp_frequency = 12500000,        \
   .max_constant_urb_size_kb = 32,         \
   .num_thread_per_eu = 7,                 \
   .max_vs_threads = 504,                  \
   .max_tcs_threads = 504,                 \
   .max_tes_threads = 504,                 \
   .max_gs_threads = 504,                  \
   .max_wm_threads = 384,                  \
   .has_llc = true,                        \
   .has_pln = true,                        \
   .has_64bit_float = true,                \
   .has_64bit_int = true,                  \
   .has_integer_dword_mul = true,          \
   .has_sample_with_hiz = false,           \
   .has_surface_tile_offset = true,        \
   .has_hiz_and_separate_stencil = true,   \
   .must_use_separate_stencil = true,      \
   .supports_simd16_3src = true
426
427static const struct intel_device_info intel_device_info_bdw_gt1 = {
428   GFX8_FEATURES, .gt = 1,
429   .is_broadwell = true,
430   .num_slices = 1,
431   .num_subslices = { 2, },
432   .num_eu_per_subslice = 6,
433   .l3_banks = 2,
434   .max_cs_threads = 42,
435   .urb = {
436      .min_entries = {
437         [MESA_SHADER_VERTEX]    = 64,
438         [MESA_SHADER_TESS_EVAL] = 34,
439      },
440      .max_entries = {
441         [MESA_SHADER_VERTEX]    = 2560,
442         [MESA_SHADER_TESS_CTRL] = 504,
443         [MESA_SHADER_TESS_EVAL] = 1536,
444         /* Reduced from 960, seems to be similar to the bug on Gfx9 GT1. */
445         [MESA_SHADER_GEOMETRY]  = 690,
446      },
447   },
448   .simulator_id = 11,
449};
450
451static const struct intel_device_info intel_device_info_bdw_gt2 = {
452   GFX8_FEATURES, .gt = 2,
453   .is_broadwell = true,
454   .num_slices = 1,
455   .num_subslices = { 3, },
456   .num_eu_per_subslice = 8,
457   .l3_banks = 4,
458   .max_cs_threads = 56,
459   .urb = {
460      .min_entries = {
461         [MESA_SHADER_VERTEX]    = 64,
462         [MESA_SHADER_TESS_EVAL] = 34,
463      },
464      .max_entries = {
465         [MESA_SHADER_VERTEX]    = 2560,
466         [MESA_SHADER_TESS_CTRL] = 504,
467         [MESA_SHADER_TESS_EVAL] = 1536,
468         [MESA_SHADER_GEOMETRY]  = 960,
469      },
470   },
471   .simulator_id = 11,
472};
473
474static const struct intel_device_info intel_device_info_bdw_gt3 = {
475   GFX8_FEATURES, .gt = 3,
476   .is_broadwell = true,
477   .num_slices = 2,
478   .num_subslices = { 3, 3, },
479   .num_eu_per_subslice = 8,
480   .l3_banks = 8,
481   .max_cs_threads = 56,
482   .urb = {
483      .min_entries = {
484         [MESA_SHADER_VERTEX]    = 64,
485         [MESA_SHADER_TESS_EVAL] = 34,
486      },
487      .max_entries = {
488         [MESA_SHADER_VERTEX]    = 2560,
489         [MESA_SHADER_TESS_CTRL] = 504,
490         [MESA_SHADER_TESS_EVAL] = 1536,
491         [MESA_SHADER_GEOMETRY]  = 960,
492      },
493   },
494   .simulator_id = 11,
495};
496
497static const struct intel_device_info intel_device_info_chv = {
498   GFX8_FEATURES, .is_cherryview = 1, .gt = 1,
499   .has_llc = false,
500   .has_integer_dword_mul = false,
501   .num_slices = 1,
502   .num_subslices = { 2, },
503   .num_eu_per_subslice = 8,
504   .l3_banks = 2,
505   .max_vs_threads = 80,
506   .max_tcs_threads = 80,
507   .max_tes_threads = 80,
508   .max_gs_threads = 80,
509   .max_wm_threads = 128,
510   .max_cs_threads = 6 * 7,
511   .urb = {
512      .min_entries = {
513         [MESA_SHADER_VERTEX]    = 34,
514         [MESA_SHADER_TESS_EVAL] = 34,
515      },
516      .max_entries = {
517         [MESA_SHADER_VERTEX]    = 640,
518         [MESA_SHADER_TESS_CTRL] = 80,
519         [MESA_SHADER_TESS_EVAL] = 384,
520         [MESA_SHADER_GEOMETRY]  = 256,
521      },
522   },
523   .simulator_id = 13,
524};
525
/* Designated-initializer fragment with the ver = 9 thread limits, timestamp
 * frequency and URB entry bounds.  Expands without a trailing comma.
 */
#define GFX9_HW_INFO                          \
   .ver = 9,                                  \
   .cs_prefetch_size = 512,                   \
   .timestamp_frequency = 12000000,           \
   .max_vs_threads = 336,                     \
   .max_tcs_threads = 336,                    \
   .max_tes_threads = 336,                    \
   .max_gs_threads = 336,                     \
   .max_cs_threads = 56,                      \
   .urb = {                                   \
      .min_entries = {                        \
         [MESA_SHADER_VERTEX]    = 64,        \
         [MESA_SHADER_TESS_EVAL] = 34,        \
      },                                      \
      .max_entries = {                        \
         [MESA_SHADER_VERTEX]    = 1856,      \
         [MESA_SHADER_TESS_CTRL] = 672,       \
         [MESA_SHADER_TESS_EVAL] = 1120,      \
         [MESA_SHADER_GEOMETRY]  = 640,       \
      },                                      \
   }
547
/* GFX8_FEATURES + GFX9_HW_INFO followed by the overrides shared by the
 * GFX9_LP_FEATURES_* variants.  The two base macros must stay first so that
 * the designators below take precedence.  Expands without a trailing comma.
 */
#define GFX9_LP_FEATURES                      \
   GFX8_FEATURES,                             \
   GFX9_HW_INFO,                              \
   .gt = 1,                                   \
   .num_slices = 1,                           \
   .num_thread_per_eu = 6,                    \
   .timestamp_frequency = 19200000,           \
   .has_llc = false,                          \
   .has_integer_dword_mul = false,            \
   .has_sample_with_hiz = true,               \
   .max_vs_threads = 112,                     \
   .max_tcs_threads = 112,                    \
   .max_tes_threads = 112,                    \
   .max_gs_threads = 112,                     \
   .max_cs_threads = 6 * 6,                   \
   .urb = {                                   \
      .min_entries = {                        \
         [MESA_SHADER_VERTEX]    = 34,        \
         [MESA_SHADER_TESS_EVAL] = 34,        \
      },                                      \
      .max_entries = {                        \
         [MESA_SHADER_VERTEX]    = 704,       \
         [MESA_SHADER_TESS_CTRL] = 256,       \
         [MESA_SHADER_TESS_EVAL] = 416,       \
         [MESA_SHADER_GEOMETRY]  = 256,       \
      },                                      \
   }
575
/* GFX9_LP_FEATURES with 3 subslices of 6 EUs each. */
#define GFX9_LP_FEATURES_3X6                  \
   GFX9_LP_FEATURES,                          \
   .num_eu_per_subslice = 6,                  \
   .num_subslices = { 3 }
580
/* GFX9_LP_FEATURES with 2 subslices of 6 EUs each, plus replacement thread
 * limits and URB entry bounds.  GFX9_LP_FEATURES must stay first so the
 * designators below override it.
 */
#define GFX9_LP_FEATURES_2X6                  \
   GFX9_LP_FEATURES,                          \
   .num_eu_per_subslice = 6,                  \
   .num_subslices = { 2 },                    \
   .max_vs_threads = 56,                      \
   .max_tcs_threads = 56,                     \
   .max_tes_threads = 56,                     \
   .max_gs_threads = 56,                      \
   .max_cs_threads = 6 * 6,                   \
   .urb = {                                   \
      .min_entries = {                        \
         [MESA_SHADER_VERTEX]    = 34,        \
         [MESA_SHADER_TESS_EVAL] = 34,        \
      },                                      \
      .max_entries = {                        \
         [MESA_SHADER_VERTEX]    = 352,       \
         [MESA_SHADER_TESS_CTRL] = 128,       \
         [MESA_SHADER_TESS_EVAL] = 208,       \
         [MESA_SHADER_GEOMETRY]  = 128,       \
      },                                      \
   }
602
/* GFX8_FEATURES overridden by GFX9_HW_INFO, with has_sample_with_hiz turned
 * on (GFX8_FEATURES sets it to false).  Expands without a trailing comma.
 */
#define GFX9_FEATURES              \
   GFX8_FEATURES,                  \
   GFX9_HW_INFO,                   \
   .has_sample_with_hiz = true
607
608static const struct intel_device_info intel_device_info_skl_gt1 = {
609   GFX9_FEATURES, .gt = 1,
610   .is_skylake = true,
611   .num_slices = 1,
612   .num_subslices = { 2, },
613   .num_eu_per_subslice = 6,
614   .l3_banks = 2,
615   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
616    * leading to some vertices to go missing if we use too much URB.
617    */
618   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
619   .simulator_id = 12,
620};
621
622static const struct intel_device_info intel_device_info_skl_gt2 = {
623   GFX9_FEATURES, .gt = 2,
624   .is_skylake = true,
625   .num_slices = 1,
626   .num_subslices = { 3, },
627   .num_eu_per_subslice = 8,
628   .l3_banks = 4,
629   .simulator_id = 12,
630};
631
632static const struct intel_device_info intel_device_info_skl_gt3 = {
633   GFX9_FEATURES, .gt = 3,
634   .is_skylake = true,
635   .num_slices = 2,
636   .num_subslices = { 3, 3, },
637   .num_eu_per_subslice = 8,
638   .l3_banks = 8,
639   .simulator_id = 12,
640};
641
642static const struct intel_device_info intel_device_info_skl_gt4 = {
643   GFX9_FEATURES, .gt = 4,
644   .is_skylake = true,
645   .num_slices = 3,
646   .num_subslices = { 3, 3, 3, },
647   .num_eu_per_subslice = 8,
648   .l3_banks = 12,
649   /* From the "L3 Allocation and Programming" documentation:
650    *
651    * "URB is limited to 1008KB due to programming restrictions.  This is not a
652    * restriction of the L3 implementation, but of the FF and other clients.
653    * Therefore, in a GT4 implementation it is possible for the programmed
654    * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
655    * only 1008KB of this will be used."
656    */
657   .simulator_id = 12,
658};
659
660static const struct intel_device_info intel_device_info_bxt = {
661   GFX9_LP_FEATURES_3X6,
662   .is_broxton = true,
663   .l3_banks = 2,
664   .simulator_id = 14,
665};
666
667static const struct intel_device_info intel_device_info_bxt_2x6 = {
668   GFX9_LP_FEATURES_2X6,
669   .is_broxton = true,
670   .l3_banks = 1,
671   .simulator_id = 14,
672};
673/*
674 * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
675 * There's no KBL entry. Using the default SKL (GFX9) GS entries value.
676 */
677
678static const struct intel_device_info intel_device_info_kbl_gt1 = {
679   GFX9_FEATURES,
680   .is_kabylake = true,
681   .gt = 1,
682
683   .max_cs_threads = 7 * 6,
684   .num_slices = 1,
685   .num_subslices = { 2, },
686   .num_eu_per_subslice = 6,
687   .l3_banks = 2,
688   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
689    * leading to some vertices to go missing if we use too much URB.
690    */
691   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
692   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
693   .simulator_id = 16,
694};
695
696static const struct intel_device_info intel_device_info_kbl_gt1_5 = {
697   GFX9_FEATURES,
698   .is_kabylake = true,
699   .gt = 1,
700
701   .max_cs_threads = 7 * 6,
702   .num_slices = 1,
703   .num_subslices = { 3, },
704   .num_eu_per_subslice = 6,
705   .l3_banks = 4,
706   .simulator_id = 16,
707};
708
709static const struct intel_device_info intel_device_info_kbl_gt2 = {
710   GFX9_FEATURES,
711   .is_kabylake = true,
712   .gt = 2,
713
714   .num_slices = 1,
715   .num_subslices = { 3, },
716   .num_eu_per_subslice = 8,
717   .l3_banks = 4,
718   .simulator_id = 16,
719};
720
721static const struct intel_device_info intel_device_info_kbl_gt3 = {
722   GFX9_FEATURES,
723   .is_kabylake = true,
724   .gt = 3,
725
726   .num_slices = 2,
727   .num_subslices = { 3, 3, },
728   .num_eu_per_subslice = 8,
729   .l3_banks = 8,
730   .simulator_id = 16,
731};
732
733static const struct intel_device_info intel_device_info_kbl_gt4 = {
734   GFX9_FEATURES,
735   .is_kabylake = true,
736   .gt = 4,
737
738   /*
739    * From the "L3 Allocation and Programming" documentation:
740    *
741    * "URB is limited to 1008KB due to programming restrictions.  This
742    *  is not a restriction of the L3 implementation, but of the FF and
743    *  other clients.  Therefore, in a GT4 implementation it is
744    *  possible for the programmed allocation of the L3 data array to
745    *  provide 3*384KB=1152KB for URB, but only 1008KB of this
746    *  will be used."
747    */
748   .num_slices = 3,
749   .num_subslices = { 3, 3, 3, },
750   .num_eu_per_subslice = 8,
751   .l3_banks = 12,
752   .simulator_id = 16,
753};
754
755static const struct intel_device_info intel_device_info_glk = {
756   GFX9_LP_FEATURES_3X6,
757   .is_geminilake = true,
758   .l3_banks = 2,
759   .simulator_id = 17,
760};
761
762static const struct intel_device_info intel_device_info_glk_2x6 = {
763   GFX9_LP_FEATURES_2X6,
764   .is_geminilake = true,
765   .l3_banks = 2,
766   .simulator_id = 17,
767};
768
769static const struct intel_device_info intel_device_info_cfl_gt1 = {
770   GFX9_FEATURES,
771   .is_coffeelake = true,
772   .gt = 1,
773
774   .num_slices = 1,
775   .num_subslices = { 2, },
776   .num_eu_per_subslice = 6,
777   .l3_banks = 2,
778   /* GT1 seems to have a bug in the top of the pipe (VF/VS?) fixed functions
779    * leading to some vertices to go missing if we use too much URB.
780    */
781   .urb.max_entries[MESA_SHADER_VERTEX] = 928,
782   .urb.max_entries[MESA_SHADER_GEOMETRY] = 256,
783   .simulator_id = 24,
784};
785static const struct intel_device_info intel_device_info_cfl_gt2 = {
786   GFX9_FEATURES,
787   .is_coffeelake = true,
788   .gt = 2,
789
790   .num_slices = 1,
791   .num_subslices = { 3, },
792   .num_eu_per_subslice = 8,
793   .l3_banks = 4,
794   .simulator_id = 24,
795};
796
797static const struct intel_device_info intel_device_info_cfl_gt3 = {
798   GFX9_FEATURES,
799   .is_coffeelake = true,
800   .gt = 3,
801
802   .num_slices = 2,
803   .num_subslices = { 3, 3, },
804   .num_eu_per_subslice = 8,
805   .l3_banks = 8,
806   .simulator_id = 24,
807};
808
/* Wrap a comma-separated list of per-slice subslice counts in braces so it
 * can be passed as a single macro argument and used as an array initializer,
 * e.g. subslices(3, 3) expands to { 3, 3, }.
 *
 * Uses the ISO C99 "..." / __VA_ARGS__ form rather than the GNU-only named
 * variadic parameter, so it does not depend on a compiler extension.
 */
#define subslices(...) { __VA_ARGS__, }
810
/* Designated-initializer fragment with the ver = 11 thread limits; it also
 * clears has_pln.  Expands without a trailing comma.
 */
#define GFX11_HW_INFO                      \
   .ver = 11,                              \
   .cs_prefetch_size = 512,                \
   .has_pln = false,                       \
   .max_vs_threads = 364,                  \
   .max_tcs_threads = 224,                 \
   .max_tes_threads = 364,                 \
   .max_gs_threads = 224,                  \
   .max_cs_threads = 56
820
/* GFX8_FEATURES overridden by GFX11_HW_INFO, with four feature flags cleared
 * and 8 EUs per subslice.
 *
 *   _gt         value for .gt
 *   _slices     value for .num_slices
 *   _subslices  braced initializer for .num_subslices (see subslices())
 *   _l3         value for .l3_banks
 *
 * The two base macros must stay first so the designators below override them.
 */
#define GFX11_FEATURES(_gt, _slices, _subslices, _l3) \
   GFX8_FEATURES,                                     \
   GFX11_HW_INFO,                                     \
   .gt = _gt,                                         \
   .num_slices = _slices,                             \
   .num_subslices = _subslices,                       \
   .num_eu_per_subslice = 8,                          \
   .l3_banks = _l3,                                   \
   .has_sample_with_hiz = false,                      \
   .has_integer_dword_mul = false,                    \
   .has_64bit_float = false,                          \
   .has_64bit_int = false
831
/* Body of a ".urb = { ... }" initializer: per-stage minimum and maximum URB
 * entry counts used by the GFX11_FEATURES-based descriptions in this file.
 * Expands without a trailing comma.
 */
#define GFX11_URB_MIN_MAX_ENTRIES             \
   .min_entries = {                           \
      [MESA_SHADER_VERTEX]    = 64,           \
      [MESA_SHADER_TESS_EVAL] = 34,           \
   },                                         \
   .max_entries = {                           \
      [MESA_SHADER_VERTEX]    = 2384,         \
      [MESA_SHADER_TESS_EVAL] = 2384,         \
      [MESA_SHADER_TESS_CTRL] = 1032,         \
      [MESA_SHADER_GEOMETRY]  = 1032,         \
   }
843
844static const struct intel_device_info intel_device_info_icl_gt2 = {
845   GFX11_FEATURES(2, 1, subslices(8), 8),
846   .urb = {
847      GFX11_URB_MIN_MAX_ENTRIES,
848   },
849   .simulator_id = 19,
850};
851
852static const struct intel_device_info intel_device_info_icl_gt1_5 = {
853   GFX11_FEATURES(1, 1, subslices(6), 6),
854   .urb = {
855      GFX11_URB_MIN_MAX_ENTRIES,
856   },
857   .simulator_id = 19,
858};
859
860static const struct intel_device_info intel_device_info_icl_gt1 = {
861   GFX11_FEATURES(1, 1, subslices(4), 6),
862   .urb = {
863      GFX11_URB_MIN_MAX_ENTRIES,
864   },
865   .simulator_id = 19,
866};
867
868static const struct intel_device_info intel_device_info_icl_gt0_5 = {
869   GFX11_FEATURES(1, 1, subslices(1), 6),
870   .urb = {
871      GFX11_URB_MIN_MAX_ENTRIES,
872   },
873   .simulator_id = 19,
874};
875
/* Fields shared by the intel_device_info_ehl_* descriptions; meant to follow
 * a GFX11_FEATURES(...) invocation.  Expands without a trailing comma.
 */
#define GFX11_LP_FEATURES                     \
   .is_elkhartlake = true,                    \
   .disable_ccs_repack = true,                \
   .simulator_id = 28,                        \
   .urb = { GFX11_URB_MIN_MAX_ENTRIES }
883
884static const struct intel_device_info intel_device_info_ehl_4x8 = {
885   GFX11_FEATURES(1, 1, subslices(4), 4),
886   GFX11_LP_FEATURES,
887};
888
889static const struct intel_device_info intel_device_info_ehl_4x6 = {
890   GFX11_FEATURES(1, 1, subslices(4), 4),
891   GFX11_LP_FEATURES,
892   .num_eu_per_subslice = 6,
893};
894
895static const struct intel_device_info intel_device_info_ehl_4x5 = {
896   GFX11_FEATURES(1, 1, subslices(4), 4),
897   GFX11_LP_FEATURES,
898   .num_eu_per_subslice = 5,
899};
900
901static const struct intel_device_info intel_device_info_ehl_4x4 = {
902   GFX11_FEATURES(1, 1, subslices(4), 4),
903   GFX11_LP_FEATURES,
904   .num_eu_per_subslice = 4,
905};
906
907static const struct intel_device_info intel_device_info_ehl_2x8 = {
908   GFX11_FEATURES(1, 1, subslices(2), 4),
909   GFX11_LP_FEATURES,
910};
911
912static const struct intel_device_info intel_device_info_ehl_2x4 = {
913   GFX11_FEATURES(1, 1, subslices(2), 4),
914   GFX11_LP_FEATURES,
915   .num_eu_per_subslice =4,
916};
917
/* Body of a ".urb = { ... }" initializer: per-stage minimum and maximum URB
 * entry counts used by GFX12_HW_INFO.  Expands without a trailing comma.
 */
#define GFX12_URB_MIN_MAX_ENTRIES                                   \
   .min_entries = {                                                 \
      [MESA_SHADER_VERTEX]    = 64,                                 \
      [MESA_SHADER_TESS_EVAL] = 34,                                 \
   },                                                               \
   .max_entries = {                                                 \
      [MESA_SHADER_VERTEX]    = 3576,                               \
      [MESA_SHADER_TESS_EVAL] = 3576,                               \
      [MESA_SHADER_TESS_CTRL] = 1548,                               \
      [MESA_SHADER_GEOMETRY]  = 1536, /* Wa_14013840143 */          \
   }
930
/* Designated-initializer fragment with the ver = 12 feature flags, thread
 * limits and URB entry bounds.  Expands without a trailing comma.
 */
#define GFX12_HW_INFO                                  \
   .ver = 12,                                          \
   .has_aux_map = true,                                \
   .has_pln = false,                                   \
   .has_sample_with_hiz = false,                       \
   .max_vs_threads = 546,                              \
   .max_tcs_threads = 336,                             \
   .max_tes_threads = 546,                             \
   .max_gs_threads = 336,                              \
   .max_cs_threads = 112, /* threads per DSS */        \
   .urb = { GFX12_URB_MIN_MAX_ENTRIES }
944
/* GFX8_FEATURES overridden by GFX12_HW_INFO, with three feature flags
 * cleared, simulator_id 22 and 16 EUs per subslice.
 *
 *   _gt      value for .gt
 *   _slices  value for .num_slices
 *   _l3      value for .l3_banks
 *
 * The two base macros must stay first so the designators below override them.
 */
#define GFX12_FEATURES(_gt, _slices, _l3)              \
   GFX8_FEATURES,                                      \
   GFX12_HW_INFO,                                      \
   .gt = _gt,                                          \
   .num_slices = _slices,                              \
   .l3_banks = _l3,                                    \
   .num_eu_per_subslice = 16,                          \
   .simulator_id = 22,                                 \
   .cs_prefetch_size = 512,                            \
   .has_integer_dword_mul = false,                     \
   .has_64bit_float = false,                           \
   .has_64bit_int = false
955
/* Wraps its arguments in a brace-enclosed initializer list; used below to
 * initialize .num_subslices.  The "args..." form is a GNU named-variadic
 * macro parameter.
 */
#define dual_subslices(args...) { args, }
957
/* Gfx12 "GT0.5" configuration: GFX12_FEATURES with gt = 1, one slice and
 * 4 L3 banks, plus a single entry of 1 in .num_subslices.
 */
#define GFX12_GT05_FEATURES                                     \
   GFX12_FEATURES(1, 1, 4),                                     \
   .num_subslices = dual_subslices(1)
961
/* Gfx12 GT1/GT2 configuration with a single slice.
 *
 *   _gt == 1: 4 L3 banks, .num_subslices = { 2 }
 *   otherwise: 8 L3 banks, .num_subslices = { 6 }
 *
 * The argument is parenthesized wherever it takes part in an expression so
 * that a non-trivial argument cannot change the meaning of the comparison.
 */
#define GFX12_GT_FEATURES(_gt)                                  \
   GFX12_FEATURES(_gt, 1, (_gt) == 1 ? 4 : 8),                  \
   .num_subslices = dual_subslices((_gt) == 1 ? 2 : 6)
965
/* Tiger Lake GT1: GFX12_GT_FEATURES(1) with the is_tigerlake flag set. */
static const struct intel_device_info intel_device_info_tgl_gt1 = {
   GFX12_GT_FEATURES(1),
   .is_tigerlake = true,
};
970
/* Tiger Lake GT2: GFX12_GT_FEATURES(2) with the is_tigerlake flag set. */
static const struct intel_device_info intel_device_info_tgl_gt2 = {
   GFX12_GT_FEATURES(2),
   .is_tigerlake = true,
};
975
/* Rocket Lake GT0.5: GFX12_GT05_FEATURES with the is_rocketlake flag set. */
static const struct intel_device_info intel_device_info_rkl_gt05 = {
   GFX12_GT05_FEATURES,
   .is_rocketlake = true,
};
980
/* Rocket Lake GT1: GFX12_GT_FEATURES(1) with the is_rocketlake flag set. */
static const struct intel_device_info intel_device_info_rkl_gt1 = {
   GFX12_GT_FEATURES(1),
   .is_rocketlake = true,
};
985
/* Alder Lake GT0.5: GFX12_GT05_FEATURES with the is_alderlake flag set. */
static const struct intel_device_info intel_device_info_adl_gt05 = {
   GFX12_GT05_FEATURES,
   .is_alderlake = true,
};
990
/* Alder Lake GT1: GFX12_GT_FEATURES(1) with the is_alderlake flag set. */
static const struct intel_device_info intel_device_info_adl_gt1 = {
   GFX12_GT_FEATURES(1),
   .is_alderlake = true,
};
995
/* Alder Lake GT2: GFX12_GT_FEATURES(2) with the is_alderlake flag set.
 *
 * Unlike the other Alder Lake entries in this group, this one sets
 * display_ver explicitly; intel_get_device_info_from_pci_id() only fills in
 * display_ver from ver when it is still zero, so the 13 here is kept.
 */
static const struct intel_device_info intel_device_info_adl_gt2 = {
   GFX12_GT_FEATURES(2),
   .is_alderlake = true,
   .display_ver = 13,
};
1001
/* Feature set shared by the DG1 and SG1 device descriptions: the GT2 Gfx12
 * configuration with is_dg1 set, no LLC, local memory, an explicit URB size
 * and a simulator_id of 30.  simulator_id is listed after
 * GFX12_GT_FEATURES(2) so that it replaces the value set by GFX12_FEATURES.
 */
#define GFX12_DG1_SG1_FEATURES                  \
   GFX12_GT_FEATURES(2),                        \
   .is_dg1 = true,                              \
   .has_llc = false,                            \
   .has_local_mem = true,                       \
   .urb.size = 768,                             \
   .simulator_id = 30
1009
/* DG1 device description; consists solely of GFX12_DG1_SG1_FEATURES. */
static const struct intel_device_info intel_device_info_dg1 = {
   GFX12_DG1_SG1_FEATURES,
};
1013
/* SG1 device description; identical in content to the DG1 entry, as both
 * consist solely of GFX12_DG1_SG1_FEATURES.
 */
static const struct intel_device_info intel_device_info_sg1 = {
   GFX12_DG1_SG1_FEATURES,
};
1017
1018static void
1019reset_masks(struct intel_device_info *devinfo)
1020{
1021   devinfo->subslice_slice_stride = 0;
1022   devinfo->eu_subslice_stride = 0;
1023   devinfo->eu_slice_stride = 0;
1024
1025   devinfo->num_slices = 0;
1026   devinfo->num_eu_per_subslice = 0;
1027   memset(devinfo->num_subslices, 0, sizeof(devinfo->num_subslices));
1028
1029   memset(&devinfo->slice_masks, 0, sizeof(devinfo->slice_masks));
1030   memset(devinfo->subslice_masks, 0, sizeof(devinfo->subslice_masks));
1031   memset(devinfo->eu_masks, 0, sizeof(devinfo->eu_masks));
1032   memset(devinfo->ppipe_subslices, 0, sizeof(devinfo->ppipe_subslices));
1033}
1034
1035static void
1036update_from_topology(struct intel_device_info *devinfo,
1037                     const struct drm_i915_query_topology_info *topology)
1038{
1039   reset_masks(devinfo);
1040
1041   assert(topology->max_slices > 0);
1042   assert(topology->max_subslices > 0);
1043   assert(topology->max_eus_per_subslice > 0);
1044
1045   devinfo->subslice_slice_stride = topology->subslice_stride;
1046
1047   devinfo->eu_subslice_stride = DIV_ROUND_UP(topology->max_eus_per_subslice, 8);
1048   devinfo->eu_slice_stride = topology->max_subslices * devinfo->eu_subslice_stride;
1049
1050   assert(sizeof(devinfo->slice_masks) >= DIV_ROUND_UP(topology->max_slices, 8));
1051   memcpy(&devinfo->slice_masks, topology->data, DIV_ROUND_UP(topology->max_slices, 8));
1052   devinfo->num_slices = __builtin_popcount(devinfo->slice_masks);
1053   devinfo->max_slices = topology->max_slices;
1054   devinfo->max_subslices_per_slice = topology->max_subslices;
1055   devinfo->max_eu_per_subslice = topology->max_eus_per_subslice;
1056
1057   uint32_t subslice_mask_len =
1058      topology->max_slices * topology->subslice_stride;
1059   assert(sizeof(devinfo->subslice_masks) >= subslice_mask_len);
1060   memcpy(devinfo->subslice_masks, &topology->data[topology->subslice_offset],
1061          subslice_mask_len);
1062
1063   uint32_t n_subslices = 0;
1064   for (int s = 0; s < topology->max_slices; s++) {
1065      if ((devinfo->slice_masks & (1 << s)) == 0)
1066         continue;
1067
1068      for (int b = 0; b < devinfo->subslice_slice_stride; b++) {
1069         devinfo->num_subslices[s] +=
1070            __builtin_popcount(devinfo->subslice_masks[s * devinfo->subslice_slice_stride + b]);
1071      }
1072      n_subslices += devinfo->num_subslices[s];
1073   }
1074   assert(n_subslices > 0);
1075
1076   if (devinfo->ver >= 11) {
1077      /* On current ICL+ hardware we only have one slice. */
1078      assert(devinfo->slice_masks == 1);
1079
1080      /* Count the number of subslices on each pixel pipe. Assume that every
1081       * contiguous group of 4 subslices in the mask belong to the same pixel
1082       * pipe.  However note that on TGL the kernel returns a mask of enabled
1083       * *dual* subslices instead of actual subslices somewhat confusingly, so
1084       * each pixel pipe only takes 2 bits in the mask even though it's still
1085       * 4 subslices.
1086       */
1087      const unsigned ppipe_bits = devinfo->ver >= 12 ? 2 : 4;
1088      for (unsigned p = 0; p < INTEL_DEVICE_MAX_PIXEL_PIPES; p++) {
1089         const unsigned ppipe_mask = BITFIELD_RANGE(p * ppipe_bits, ppipe_bits);
1090         devinfo->ppipe_subslices[p] =
1091            __builtin_popcount(devinfo->subslice_masks[0] & ppipe_mask);
1092      }
1093   }
1094
1095   if (devinfo->ver == 12 && devinfo->num_slices == 1) {
1096      if (n_subslices >= 6) {
1097         assert(n_subslices == 6);
1098         devinfo->l3_banks = 8;
1099      } else if (n_subslices > 2) {
1100         devinfo->l3_banks = 6;
1101      } else {
1102         devinfo->l3_banks = 4;
1103      }
1104   }
1105
1106   uint32_t eu_mask_len =
1107      topology->eu_stride * topology->max_subslices * topology->max_slices;
1108   assert(sizeof(devinfo->eu_masks) >= eu_mask_len);
1109   memcpy(devinfo->eu_masks, &topology->data[topology->eu_offset], eu_mask_len);
1110
1111   uint32_t n_eus = 0;
1112   for (int b = 0; b < eu_mask_len; b++)
1113      n_eus += __builtin_popcount(devinfo->eu_masks[b]);
1114
1115   devinfo->num_eu_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
1116}
1117
1118/* Generate detailed mask from the I915_PARAM_SLICE_MASK,
1119 * I915_PARAM_SUBSLICE_MASK & I915_PARAM_EU_TOTAL getparam.
1120 */
1121static bool
1122update_from_masks(struct intel_device_info *devinfo, uint32_t slice_mask,
1123                  uint32_t subslice_mask, uint32_t n_eus)
1124{
1125   struct drm_i915_query_topology_info *topology;
1126
1127   assert((slice_mask & 0xff) == slice_mask);
1128
1129   size_t data_length = 100;
1130
1131   topology = calloc(1, sizeof(*topology) + data_length);
1132   if (!topology)
1133      return false;
1134
1135   topology->max_slices = util_last_bit(slice_mask);
1136   topology->max_subslices = util_last_bit(subslice_mask);
1137
1138   topology->subslice_offset = DIV_ROUND_UP(topology->max_slices, 8);
1139   topology->subslice_stride = DIV_ROUND_UP(topology->max_subslices, 8);
1140
1141   uint32_t n_subslices = __builtin_popcount(slice_mask) *
1142      __builtin_popcount(subslice_mask);
1143   uint32_t num_eu_per_subslice = DIV_ROUND_UP(n_eus, n_subslices);
1144   uint32_t eu_mask = (1U << num_eu_per_subslice) - 1;
1145
1146   topology->max_eus_per_subslice = num_eu_per_subslice;
1147   topology->eu_offset = topology->subslice_offset +
1148      topology->max_slices * DIV_ROUND_UP(topology->max_subslices, 8);
1149   topology->eu_stride = DIV_ROUND_UP(num_eu_per_subslice, 8);
1150
1151   /* Set slice mask in topology */
1152   for (int b = 0; b < topology->subslice_offset; b++)
1153      topology->data[b] = (slice_mask >> (b * 8)) & 0xff;
1154
1155   for (int s = 0; s < topology->max_slices; s++) {
1156
1157      /* Set subslice mask in topology */
1158      for (int b = 0; b < topology->subslice_stride; b++) {
1159         int subslice_offset = topology->subslice_offset +
1160            s * topology->subslice_stride + b;
1161
1162         topology->data[subslice_offset] = (subslice_mask >> (b * 8)) & 0xff;
1163      }
1164
1165      /* Set eu mask in topology */
1166      for (int ss = 0; ss < topology->max_subslices; ss++) {
1167         for (int b = 0; b < topology->eu_stride; b++) {
1168            int eu_offset = topology->eu_offset +
1169               (s * topology->max_subslices + ss) * topology->eu_stride + b;
1170
1171            topology->data[eu_offset] = (eu_mask >> (b * 8)) & 0xff;
1172         }
1173      }
1174   }
1175
1176   update_from_topology(devinfo, topology);
1177   free(topology);
1178
1179   return true;
1180}
1181
1182/* Generate mask from the device data. */
1183static void
1184fill_masks(struct intel_device_info *devinfo)
1185{
1186   /* All of our internal device descriptions assign the same number of
1187    * subslices for each slice. Just verify that this is true.
1188    */
1189   for (int s = 1; s < devinfo->num_slices; s++)
1190      assert(devinfo->num_subslices[0] == devinfo->num_subslices[s]);
1191
1192   update_from_masks(devinfo,
1193                     (1U << devinfo->num_slices) - 1,
1194                     (1U << devinfo->num_subslices[0]) - 1,
1195                     devinfo->num_slices * devinfo->num_subslices[0] *
1196                     devinfo->num_eu_per_subslice);
1197}
1198
1199static bool
1200getparam(int fd, uint32_t param, int *value)
1201{
1202   int tmp;
1203
1204   struct drm_i915_getparam gp = {
1205      .param = param,
1206      .value = &tmp,
1207   };
1208
1209   int ret = intel_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp);
1210   if (ret != 0)
1211      return false;
1212
1213   *value = tmp;
1214   return true;
1215}
1216
1217static void
1218update_cs_workgroup_threads(struct intel_device_info *devinfo)
1219{
1220   /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
1221    * can program is 64 without going up to a rectangular group. This only
1222    * impacts Haswell and TGL which have higher thread counts.
1223    *
1224    * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
1225    * is 10 bits so we have no such restrictions.
1226    */
1227   devinfo->max_cs_workgroup_threads =
1228      devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
1229                               MIN2(devinfo->max_cs_threads, 64);
1230}
1231
/**
 * Fill *devinfo with the static description of the device identified by
 * pci_id.
 *
 * The matching intel_device_info_* table is copied into *devinfo, the
 * human-readable name is set, the topology masks are generated from the
 * table's slice/subslice/EU counts, and a few derived fields
 * (max_wm_threads, verx10, display_ver, max_cs_workgroup_threads,
 * chipset_id) are filled in.
 *
 * Returns false, after logging a warning, when pci_id is not listed in any
 * of the included PCI ID tables; returns true otherwise.
 */
bool
intel_get_device_info_from_pci_id(int pci_id,
                                  struct intel_device_info *devinfo)
{
   /* Select the device table.  The case labels are generated by expanding
    * CHIPSET() over the PCI ID headers; every ID from i915_pci_ids.h maps to
    * intel_device_info_gfx3.
    */
   switch (pci_id) {
#undef CHIPSET
#define CHIPSET(id, family, fam_str, name) \
      case id: *devinfo = intel_device_info_##family; break;
#include "pci_ids/i965_pci_ids.h"
#include "pci_ids/iris_pci_ids.h"

#undef CHIPSET
#define CHIPSET(id, fam_str, name) \
      case id: *devinfo = intel_device_info_gfx3; break;
#include "pci_ids/i915_pci_ids.h"

   default:
      mesa_logw("Driver does not support the 0x%x PCI ID.", pci_id);
      return false;
   }

   /* Set the device name to "<name> (<family string>)".  Only the i965 and
    * iris tables are expanded here, so IDs that matched through
    * i915_pci_ids.h above end up with the default name.
    */
   switch (pci_id) {
#undef CHIPSET
#define CHIPSET(_id, _family, _fam_str, _name) \
   case _id: \
      /* sizeof(str_literal) includes the null */ \
      STATIC_ASSERT(sizeof(_name) + sizeof(_fam_str) + 2 <= \
                    sizeof(devinfo->name)); \
      strncpy(devinfo->name, _name " (" _fam_str ")", sizeof(devinfo->name)); \
      break;
#include "pci_ids/i965_pci_ids.h"
#include "pci_ids/iris_pci_ids.h"
   default:
      strncpy(devinfo->name, "Intel Unknown", sizeof(devinfo->name));
   }

   /* Build the slice/subslice/EU masks from the counts in the table. */
   fill_masks(devinfo);

   /* From the Skylake PRM, 3DSTATE_PS::Scratch Space Base Pointer:
    *
    * "Scratch Space per slice is computed based on 4 sub-slices.  SW must
    *  allocate scratch space enough so that each slice has 4 slices allowed."
    *
    * The equivalent internal documentation says that this programming note
    * applies to all Gfx9+ platforms.
    *
    * The hardware typically calculates the scratch space pointer by taking
    * the base address, and adding per-thread-scratch-space * thread ID.
    * Extra padding can be necessary depending how the thread IDs are
    * calculated for a particular shader stage.
    */

   /* max_wm_threads is only computed here for ver 9, 11 and 12; any other
    * version reaching this point is expected to be older than 9.
    */
   switch(devinfo->ver) {
   case 9:
      devinfo->max_wm_threads = 64 /* threads-per-PSD */
                              * devinfo->num_slices
                              * 4; /* effective subslices per slice */
      break;
   case 11:
   case 12:
      devinfo->max_wm_threads = 128 /* threads-per-PSD */
                              * devinfo->num_slices
                              * 8; /* subslices per slice */
      break;
   default:
      assert(devinfo->ver < 9);
      break;
   }

   assert(devinfo->num_slices <= ARRAY_SIZE(devinfo->num_subslices));

   /* Tables that do not set verx10 explicitly get ver * 10. */
   if (devinfo->verx10 == 0)
      devinfo->verx10 = devinfo->ver * 10;

   /* Tables that do not set display_ver explicitly get ver. */
   if (devinfo->display_ver == 0)
      devinfo->display_ver = devinfo->ver;

   update_cs_workgroup_threads(devinfo);

   devinfo->chipset_id = pci_id;
   return true;
}
1314
1315/**
1316 * for gfx8/gfx9, SLICE_MASK/SUBSLICE_MASK can be used to compute the topology
1317 * (kernel 4.13+)
1318 */
1319static bool
1320getparam_topology(struct intel_device_info *devinfo, int fd)
1321{
1322   int slice_mask = 0;
1323   if (!getparam(fd, I915_PARAM_SLICE_MASK, &slice_mask))
1324      goto maybe_warn;
1325
1326   int n_eus;
1327   if (!getparam(fd, I915_PARAM_EU_TOTAL, &n_eus))
1328      goto maybe_warn;
1329
1330   int subslice_mask = 0;
1331   if (!getparam(fd, I915_PARAM_SUBSLICE_MASK, &subslice_mask))
1332      goto maybe_warn;
1333
1334   return update_from_masks(devinfo, slice_mask, subslice_mask, n_eus);
1335
1336 maybe_warn:
1337   /* Only with Gfx8+ are we starting to see devices with fusing that can only
1338    * be detected at runtime.
1339    */
1340   if (devinfo->ver >= 8)
1341      mesa_logw("Kernel 4.1 required to properly query GPU properties.");
1342
1343   return false;
1344}
1345
1346/**
1347 * preferred API for updating the topology in devinfo (kernel 4.17+)
1348 */
1349static bool
1350query_topology(struct intel_device_info *devinfo, int fd)
1351{
1352   struct drm_i915_query_topology_info *topo_info =
1353      intel_i915_query_alloc(fd, DRM_I915_QUERY_TOPOLOGY_INFO);
1354   if (topo_info == NULL)
1355      return false;
1356
1357   update_from_topology(devinfo, topo_info);
1358
1359   free(topo_info);
1360
1361   return true;
1362
1363}
1364
1365int
1366intel_get_aperture_size(int fd, uint64_t *size)
1367{
1368   struct drm_i915_gem_get_aperture aperture = { 0 };
1369
1370   int ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture);
1371   if (ret == 0 && size)
1372      *size = aperture.aper_size;
1373
1374   return ret;
1375}
1376
1377static bool
1378has_get_tiling(int fd)
1379{
1380   int ret;
1381
1382   struct drm_i915_gem_create gem_create = {
1383      .size = 4096,
1384   };
1385
1386   if (intel_ioctl(fd, DRM_IOCTL_I915_GEM_CREATE, &gem_create)) {
1387      unreachable("Failed to create GEM BO");
1388      return false;
1389   }
1390
1391   struct drm_i915_gem_get_tiling get_tiling = {
1392      .handle = gem_create.handle,
1393   };
1394   ret = intel_ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &get_tiling);
1395
1396   struct drm_gem_close close = {
1397      .handle = gem_create.handle,
1398   };
1399   intel_ioctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
1400
1401   return ret == 0;
1402}
1403
1404static void
1405fixup_chv_device_info(struct intel_device_info *devinfo)
1406{
1407   assert(devinfo->is_cherryview);
1408
1409   /* Cherryview is annoying.  The number of EUs is depending on fusing and
1410    * isn't determinable from the PCI ID alone.  We default to the minimum
1411    * available for that PCI ID and then compute the real value from the
1412    * subslice information we get from the kernel.
1413    */
1414   const uint32_t subslice_total = intel_device_info_subslice_total(devinfo);
1415   const uint32_t eu_total = intel_device_info_eu_total(devinfo);
1416
1417   /* Logical CS threads = EUs per subslice * num threads per EU */
1418   uint32_t max_cs_threads =
1419      eu_total / subslice_total * devinfo->num_thread_per_eu;
1420
1421   /* Fuse configurations may give more threads than expected, never less. */
1422   if (max_cs_threads > devinfo->max_cs_threads)
1423      devinfo->max_cs_threads = max_cs_threads;
1424
1425   update_cs_workgroup_threads(devinfo);
1426
1427   /* Braswell is even more annoying.  Its marketing name isn't determinable
1428    * from the PCI ID and is also dependent on fusing.
1429    */
1430   if (devinfo->chipset_id != 0x22B1)
1431      return;
1432
1433   char *bsw_model;
1434   switch (eu_total) {
1435   case 16: bsw_model = "405"; break;
1436   case 12: bsw_model = "400"; break;
1437   default: bsw_model = "   "; break;
1438   }
1439
1440   char *needle = strstr(devinfo->name, "XXX");
1441   assert(needle);
1442   if (needle)
1443      memcpy(needle, bsw_model, 3);
1444}
1445
1446static void
1447init_max_scratch_ids(struct intel_device_info *devinfo)
1448{
1449   /* Determine the max number of subslices that potentially might be used in
1450    * scratch space ids.
1451    *
1452    * For, Gfx11+, scratch space allocation is based on the number of threads
1453    * in the base configuration.
1454    *
1455    * For Gfx9, devinfo->subslice_total is the TOTAL number of subslices and
1456    * we wish to view that there are 4 subslices per slice instead of the
1457    * actual number of subslices per slice. The documentation for 3DSTATE_PS
1458    * "Scratch Space Base Pointer" says:
1459    *
1460    *    "Scratch Space per slice is computed based on 4 sub-slices.  SW
1461    *     must allocate scratch space enough so that each slice has 4
1462    *     slices allowed."
1463    *
1464    * According to the other driver team, this applies to compute shaders
1465    * as well.  This is not currently documented at all.
1466    *
1467    * For Gfx8 and older we user devinfo->subslice_total.
1468    */
1469   unsigned subslices;
1470   if (devinfo->verx10 == 125)
1471      subslices = 32;
1472   else if (devinfo->ver == 12)
1473      subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
1474   else if (devinfo->ver == 11)
1475      subslices = 8;
1476   else if (devinfo->ver >= 9 && devinfo->ver < 11)
1477      subslices = 4 * devinfo->num_slices;
1478   else
1479      subslices = devinfo->subslice_total;
1480   assert(subslices >= devinfo->subslice_total);
1481
1482   unsigned scratch_ids_per_subslice;
1483   if (devinfo->ver >= 12) {
1484      /* Same as ICL below, but with 16 EUs. */
1485      scratch_ids_per_subslice = 16 * 8;
1486   } else if (devinfo->ver >= 11) {
1487      /* The MEDIA_VFE_STATE docs say:
1488       *
1489       *    "Starting with this configuration, the Maximum Number of
1490       *     Threads must be set to (#EU * 8) for GPGPU dispatches.
1491       *
1492       *     Although there are only 7 threads per EU in the configuration,
1493       *     the FFTID is calculated as if there are 8 threads per EU,
1494       *     which in turn requires a larger amount of Scratch Space to be
1495       *     allocated by the driver."
1496       */
1497      scratch_ids_per_subslice = 8 * 8;
1498   } else if (devinfo->is_haswell) {
1499      /* WaCSScratchSize:hsw
1500       *
1501       * Haswell's scratch space address calculation appears to be sparse
1502       * rather than tightly packed. The Thread ID has bits indicating
1503       * which subslice, EU within a subslice, and thread within an EU it
1504       * is. There's a maximum of two slices and two subslices, so these
1505       * can be stored with a single bit. Even though there are only 10 EUs
1506       * per subslice, this is stored in 4 bits, so there's an effective
1507       * maximum value of 16 EUs. Similarly, although there are only 7
1508       * threads per EU, this is stored in a 3 bit number, giving an
1509       * effective maximum value of 8 threads per EU.
1510       *
1511       * This means that we need to use 16 * 8 instead of 10 * 7 for the
1512       * number of threads per subslice.
1513       */
1514      scratch_ids_per_subslice = 16 * 8;
1515   } else if (devinfo->is_cherryview) {
1516      /* Cherryview devices have either 6 or 8 EUs per subslice, and each
1517       * EU has 7 threads. The 6 EU devices appear to calculate thread IDs
1518       * as if it had 8 EUs.
1519       */
1520      scratch_ids_per_subslice = 8 * 7;
1521   } else {
1522      scratch_ids_per_subslice = devinfo->max_cs_threads;
1523   }
1524
1525   unsigned max_thread_ids = scratch_ids_per_subslice * subslices;
1526
1527   if (devinfo->verx10 >= 125) {
1528      /* On GFX version 12.5, scratch access changed to a surface-based model.
1529       * Instead of each shader type having its own layout based on IDs passed
1530       * from the relevant fixed-function unit, all scratch access is based on
1531       * thread IDs like it always has been for compute.
1532       */
1533      for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_STAGES; i++)
1534         devinfo->max_scratch_ids[i] = max_thread_ids;
1535   } else {
1536      unsigned max_scratch_ids[] = {
1537         [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
1538         [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
1539         [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
1540         [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
1541         [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
1542         [MESA_SHADER_COMPUTE]   = max_thread_ids,
1543      };
1544      STATIC_ASSERT(sizeof(devinfo->max_scratch_ids) == sizeof(max_scratch_ids));
1545      memcpy(devinfo->max_scratch_ids, max_scratch_ids,
1546             sizeof(devinfo->max_scratch_ids));
1547   }
1548}
1549
1550bool
1551intel_get_device_info_from_fd(int fd, struct intel_device_info *devinfo)
1552{
1553   int devid = 0;
1554
1555   const char *devid_override = getenv("INTEL_DEVID_OVERRIDE");
1556   if (devid_override && strlen(devid_override) > 0) {
1557      if (geteuid() == getuid()) {
1558         devid = intel_device_name_to_pci_device_id(devid_override);
1559         /* Fallback to PCI ID. */
1560         if (devid <= 0)
1561            devid = strtol(devid_override, NULL, 0);
1562         if (devid <= 0) {
1563            mesa_loge("Invalid INTEL_DEVID_OVERRIDE=\"%s\". "
1564                    "Use a valid numeric PCI ID or one of the supported "
1565                    "platform names:", devid_override);
1566            for (unsigned i = 0; i < ARRAY_SIZE(name_map); i++)
1567               mesa_loge("   %s", name_map[i].name);
1568            return false;
1569         }
1570      } else {
1571         mesa_logi("Ignoring INTEL_DEVID_OVERRIDE=\"%s\" because "
1572                   "real and effective user ID don't match.", devid_override);
1573      }
1574   }
1575
1576   if (devid > 0) {
1577      if (!intel_get_device_info_from_pci_id(devid, devinfo))
1578         return false;
1579      devinfo->no_hw = true;
1580   } else {
1581      /* query the device id */
1582      if (!getparam(fd, I915_PARAM_CHIPSET_ID, &devid))
1583         return false;
1584      if (!intel_get_device_info_from_pci_id(devid, devinfo))
1585         return false;
1586      devinfo->no_hw = env_var_as_boolean("INTEL_NO_HW", false);
1587   }
1588
1589   if (devinfo->ver == 10) {
1590      mesa_loge("Gfx10 support is redacted.");
1591      return false;
1592   }
1593
1594   /* remaining initializion queries the kernel for device info */
1595   if (devinfo->no_hw)
1596      return true;
1597
1598   int timestamp_frequency;
1599   if (getparam(fd, I915_PARAM_CS_TIMESTAMP_FREQUENCY,
1600                &timestamp_frequency))
1601      devinfo->timestamp_frequency = timestamp_frequency;
1602   else if (devinfo->ver >= 10) {
1603      mesa_loge("Kernel 4.15 required to read the CS timestamp frequency.");
1604      return false;
1605   }
1606
1607   if (!getparam(fd, I915_PARAM_REVISION, &devinfo->revision))
1608      devinfo->revision = 0;
1609
1610   if (!query_topology(devinfo, fd)) {
1611      if (devinfo->ver >= 10) {
1612         /* topology uAPI required for CNL+ (kernel 4.17+) */
1613         return false;
1614      }
1615
1616      /* else use the kernel 4.13+ api for gfx8+.  For older kernels, topology
1617       * will be wrong, affecting GPU metrics. In this case, fail silently.
1618       */
1619      getparam_topology(devinfo, fd);
1620   }
1621
1622   if (devinfo->is_cherryview)
1623      fixup_chv_device_info(devinfo);
1624
1625   intel_get_aperture_size(fd, &devinfo->aperture_bytes);
1626   devinfo->has_tiling_uapi = has_get_tiling(fd);
1627
1628   devinfo->subslice_total = 0;
1629   for (uint32_t i = 0; i < devinfo->max_slices; i++)
1630      devinfo->subslice_total += __builtin_popcount(devinfo->subslice_masks[i]);
1631
1632   /* Gfx7 and older do not support EU/Subslice info */
1633   assert(devinfo->subslice_total >= 1 || devinfo->ver <= 7);
1634   devinfo->subslice_total = MAX2(devinfo->subslice_total, 1);
1635
1636   init_max_scratch_ids(devinfo);
1637
1638   return true;
1639}
1640