/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
22 */ 23 24#include <dirent.h> 25 26#include <sys/types.h> 27#include <sys/stat.h> 28#include <fcntl.h> 29#include <unistd.h> 30#include <errno.h> 31 32#ifndef HAVE_DIRENT_D_TYPE 33#include <limits.h> // PATH_MAX 34#endif 35 36#include <drm-uapi/i915_drm.h> 37 38#include "common/intel_gem.h" 39 40#include "dev/intel_debug.h" 41#include "dev/intel_device_info.h" 42 43#include "perf/intel_perf.h" 44#include "perf/intel_perf_regs.h" 45#include "perf/intel_perf_mdapi.h" 46#include "perf/intel_perf_metrics.h" 47#include "perf/intel_perf_private.h" 48 49#include "util/bitscan.h" 50#include "util/macros.h" 51#include "util/mesa-sha1.h" 52#include "util/u_math.h" 53 54#define FILE_DEBUG_FLAG DEBUG_PERFMON 55 56static bool 57is_dir_or_link(const struct dirent *entry, const char *parent_dir) 58{ 59#ifdef HAVE_DIRENT_D_TYPE 60 return entry->d_type == DT_DIR || entry->d_type == DT_LNK; 61#else 62 struct stat st; 63 char path[PATH_MAX + 1]; 64 snprintf(path, sizeof(path), "%s/%s", parent_dir, entry->d_name); 65 lstat(path, &st); 66 return S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode); 67#endif 68} 69 70static bool 71get_sysfs_dev_dir(struct intel_perf_config *perf, int fd) 72{ 73 struct stat sb; 74 int min, maj; 75 DIR *drmdir; 76 struct dirent *drm_entry; 77 int len; 78 79 perf->sysfs_dev_dir[0] = '\0'; 80 81 if (INTEL_DEBUG(DEBUG_NO_OACONFIG)) 82 return true; 83 84 if (fstat(fd, &sb)) { 85 DBG("Failed to stat DRM fd\n"); 86 return false; 87 } 88 89 maj = major(sb.st_rdev); 90 min = minor(sb.st_rdev); 91 92 if (!S_ISCHR(sb.st_mode)) { 93 DBG("DRM fd is not a character device as expected\n"); 94 return false; 95 } 96 97 len = snprintf(perf->sysfs_dev_dir, 98 sizeof(perf->sysfs_dev_dir), 99 "/sys/dev/char/%d:%d/device/drm", maj, min); 100 if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) { 101 DBG("Failed to concatenate sysfs path to drm device\n"); 102 return false; 103 } 104 105 drmdir = opendir(perf->sysfs_dev_dir); 106 if (!drmdir) { 107 DBG("Failed to open %s: %m\n", 
perf->sysfs_dev_dir); 108 return false; 109 } 110 111 while ((drm_entry = readdir(drmdir))) { 112 if (is_dir_or_link(drm_entry, perf->sysfs_dev_dir) && 113 strncmp(drm_entry->d_name, "card", 4) == 0) 114 { 115 len = snprintf(perf->sysfs_dev_dir, 116 sizeof(perf->sysfs_dev_dir), 117 "/sys/dev/char/%d:%d/device/drm/%s", 118 maj, min, drm_entry->d_name); 119 closedir(drmdir); 120 if (len < 0 || len >= sizeof(perf->sysfs_dev_dir)) 121 return false; 122 else 123 return true; 124 } 125 } 126 127 closedir(drmdir); 128 129 DBG("Failed to find cardX directory under /sys/dev/char/%d:%d/device/drm\n", 130 maj, min); 131 132 return false; 133} 134 135static bool 136read_file_uint64(const char *file, uint64_t *val) 137{ 138 char buf[32]; 139 int fd, n; 140 141 fd = open(file, 0); 142 if (fd < 0) 143 return false; 144 while ((n = read(fd, buf, sizeof (buf) - 1)) < 0 && 145 errno == EINTR); 146 close(fd); 147 if (n < 0) 148 return false; 149 150 buf[n] = '\0'; 151 *val = strtoull(buf, NULL, 0); 152 153 return true; 154} 155 156static bool 157read_sysfs_drm_device_file_uint64(struct intel_perf_config *perf, 158 const char *file, 159 uint64_t *value) 160{ 161 char buf[512]; 162 int len; 163 164 len = snprintf(buf, sizeof(buf), "%s/%s", perf->sysfs_dev_dir, file); 165 if (len < 0 || len >= sizeof(buf)) { 166 DBG("Failed to concatenate sys filename to read u64 from\n"); 167 return false; 168 } 169 170 return read_file_uint64(buf, value); 171} 172 173static void 174register_oa_config(struct intel_perf_config *perf, 175 const struct intel_device_info *devinfo, 176 const struct intel_perf_query_info *query, 177 uint64_t config_id) 178{ 179 struct intel_perf_query_info *registered_query = 180 intel_perf_append_query_info(perf, 0); 181 182 *registered_query = *query; 183 registered_query->oa_format = devinfo->ver >= 8 ? 
184 I915_OA_FORMAT_A32u40_A4u32_B8_C8 : I915_OA_FORMAT_A45_B8_C8; 185 registered_query->oa_metrics_set_id = config_id; 186 DBG("metric set registered: id = %" PRIu64", guid = %s\n", 187 registered_query->oa_metrics_set_id, query->guid); 188} 189 190static void 191enumerate_sysfs_metrics(struct intel_perf_config *perf, 192 const struct intel_device_info *devinfo) 193{ 194 DIR *metricsdir = NULL; 195 struct dirent *metric_entry; 196 char buf[256]; 197 int len; 198 199 len = snprintf(buf, sizeof(buf), "%s/metrics", perf->sysfs_dev_dir); 200 if (len < 0 || len >= sizeof(buf)) { 201 DBG("Failed to concatenate path to sysfs metrics/ directory\n"); 202 return; 203 } 204 205 metricsdir = opendir(buf); 206 if (!metricsdir) { 207 DBG("Failed to open %s: %m\n", buf); 208 return; 209 } 210 211 while ((metric_entry = readdir(metricsdir))) { 212 struct hash_entry *entry; 213 if (!is_dir_or_link(metric_entry, buf) || 214 metric_entry->d_name[0] == '.') 215 continue; 216 217 DBG("metric set: %s\n", metric_entry->d_name); 218 entry = _mesa_hash_table_search(perf->oa_metrics_table, 219 metric_entry->d_name); 220 if (entry) { 221 uint64_t id; 222 if (!intel_perf_load_metric_id(perf, metric_entry->d_name, &id)) { 223 DBG("Failed to read metric set id from %s: %m", buf); 224 continue; 225 } 226 227 register_oa_config(perf, devinfo, 228 (const struct intel_perf_query_info *)entry->data, id); 229 } else 230 DBG("metric set not known by mesa (skipping)\n"); 231 } 232 233 closedir(metricsdir); 234} 235 236static void 237add_all_metrics(struct intel_perf_config *perf, 238 const struct intel_device_info *devinfo) 239{ 240 hash_table_foreach(perf->oa_metrics_table, entry) { 241 const struct intel_perf_query_info *query = entry->data; 242 register_oa_config(perf, devinfo, query, 0); 243 } 244} 245 246static bool 247kernel_has_dynamic_config_support(struct intel_perf_config *perf, int fd) 248{ 249 uint64_t invalid_config_id = UINT64_MAX; 250 251 return intel_ioctl(fd, 
DRM_IOCTL_I915_PERF_REMOVE_CONFIG, 252 &invalid_config_id) < 0 && errno == ENOENT; 253} 254 255static bool 256i915_query_perf_config_supported(struct intel_perf_config *perf, int fd) 257{ 258 int32_t length = 0; 259 return !intel_i915_query_flags(fd, DRM_I915_QUERY_PERF_CONFIG, 260 DRM_I915_QUERY_PERF_CONFIG_LIST, 261 NULL, &length); 262} 263 264static bool 265i915_query_perf_config_data(struct intel_perf_config *perf, 266 int fd, const char *guid, 267 struct drm_i915_perf_oa_config *config) 268{ 269 char data[sizeof(struct drm_i915_query_perf_config) + 270 sizeof(struct drm_i915_perf_oa_config)] = {}; 271 struct drm_i915_query_perf_config *query = (void *)data; 272 273 memcpy(query->uuid, guid, sizeof(query->uuid)); 274 memcpy(query->data, config, sizeof(*config)); 275 276 int32_t item_length = sizeof(data); 277 if (intel_i915_query_flags(fd, DRM_I915_QUERY_PERF_CONFIG, 278 DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID, 279 query, &item_length)) 280 return false; 281 282 memcpy(config, query->data, sizeof(*config)); 283 284 return true; 285} 286 287bool 288intel_perf_load_metric_id(struct intel_perf_config *perf_cfg, 289 const char *guid, 290 uint64_t *metric_id) 291{ 292 char config_path[280]; 293 294 snprintf(config_path, sizeof(config_path), "%s/metrics/%s/id", 295 perf_cfg->sysfs_dev_dir, guid); 296 297 /* Don't recreate already loaded configs. 
*/ 298 return read_file_uint64(config_path, metric_id); 299} 300 301static uint64_t 302i915_add_config(struct intel_perf_config *perf, int fd, 303 const struct intel_perf_registers *config, 304 const char *guid) 305{ 306 struct drm_i915_perf_oa_config i915_config = { 0, }; 307 308 memcpy(i915_config.uuid, guid, sizeof(i915_config.uuid)); 309 310 i915_config.n_mux_regs = config->n_mux_regs; 311 i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs); 312 313 i915_config.n_boolean_regs = config->n_b_counter_regs; 314 i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs); 315 316 i915_config.n_flex_regs = config->n_flex_regs; 317 i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs); 318 319 int ret = intel_ioctl(fd, DRM_IOCTL_I915_PERF_ADD_CONFIG, &i915_config); 320 return ret > 0 ? ret : 0; 321} 322 323static void 324init_oa_configs(struct intel_perf_config *perf, int fd, 325 const struct intel_device_info *devinfo) 326{ 327 hash_table_foreach(perf->oa_metrics_table, entry) { 328 const struct intel_perf_query_info *query = entry->data; 329 uint64_t config_id; 330 331 if (intel_perf_load_metric_id(perf, query->guid, &config_id)) { 332 DBG("metric set: %s (already loaded)\n", query->guid); 333 register_oa_config(perf, devinfo, query, config_id); 334 continue; 335 } 336 337 int ret = i915_add_config(perf, fd, &query->config, query->guid); 338 if (ret < 0) { 339 DBG("Failed to load \"%s\" (%s) metrics set in kernel: %s\n", 340 query->name, query->guid, strerror(errno)); 341 continue; 342 } 343 344 register_oa_config(perf, devinfo, query, ret); 345 DBG("metric set: %s (added)\n", query->guid); 346 } 347} 348 349static void 350compute_topology_builtins(struct intel_perf_config *perf, 351 const struct intel_device_info *devinfo) 352{ 353 perf->sys_vars.slice_mask = devinfo->slice_masks; 354 perf->sys_vars.n_eu_slices = devinfo->num_slices; 355 356 for (int i = 0; i < sizeof(devinfo->subslice_masks[i]); i++) { 357 
perf->sys_vars.n_eu_sub_slices += 358 util_bitcount(devinfo->subslice_masks[i]); 359 } 360 361 for (int i = 0; i < sizeof(devinfo->eu_masks); i++) 362 perf->sys_vars.n_eus += util_bitcount(devinfo->eu_masks[i]); 363 364 perf->sys_vars.eu_threads_count = devinfo->num_thread_per_eu; 365 366 /* The subslice mask builtin contains bits for all slices. Prior to Gfx11 367 * it had groups of 3bits for each slice, on Gfx11 and above it's 8bits for 368 * each slice. 369 * 370 * Ideally equations would be updated to have a slice/subslice query 371 * function/operator. 372 */ 373 perf->sys_vars.subslice_mask = 0; 374 375 int bits_per_subslice = devinfo->ver >= 11 ? 8 : 3; 376 377 for (int s = 0; s < util_last_bit(devinfo->slice_masks); s++) { 378 for (int ss = 0; ss < (devinfo->subslice_slice_stride * 8); ss++) { 379 if (intel_device_info_subslice_available(devinfo, s, ss)) 380 perf->sys_vars.subslice_mask |= 1ULL << (s * bits_per_subslice + ss); 381 } 382 } 383} 384 385static bool 386init_oa_sys_vars(struct intel_perf_config *perf, 387 const struct intel_device_info *devinfo, 388 bool use_register_snapshots) 389{ 390 uint64_t min_freq_mhz = 0, max_freq_mhz = 0; 391 392 if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { 393 if (!read_sysfs_drm_device_file_uint64(perf, "gt_min_freq_mhz", &min_freq_mhz)) 394 return false; 395 396 if (!read_sysfs_drm_device_file_uint64(perf, "gt_max_freq_mhz", &max_freq_mhz)) 397 return false; 398 } else { 399 min_freq_mhz = 300; 400 max_freq_mhz = 1000; 401 } 402 403 memset(&perf->sys_vars, 0, sizeof(perf->sys_vars)); 404 perf->sys_vars.gt_min_freq = min_freq_mhz * 1000000; 405 perf->sys_vars.gt_max_freq = max_freq_mhz * 1000000; 406 perf->sys_vars.timestamp_frequency = devinfo->timestamp_frequency; 407 perf->sys_vars.revision = devinfo->revision; 408 perf->sys_vars.query_mode = use_register_snapshots; 409 compute_topology_builtins(perf, devinfo); 410 411 return true; 412} 413 414typedef void (*perf_register_oa_queries_t)(struct intel_perf_config *); 415 
416static perf_register_oa_queries_t 417get_register_queries_function(const struct intel_device_info *devinfo) 418{ 419 if (devinfo->is_haswell) 420 return intel_oa_register_queries_hsw; 421 if (devinfo->is_cherryview) 422 return intel_oa_register_queries_chv; 423 if (devinfo->is_broadwell) 424 return intel_oa_register_queries_bdw; 425 if (devinfo->is_broxton) 426 return intel_oa_register_queries_bxt; 427 if (devinfo->is_skylake) { 428 if (devinfo->gt == 2) 429 return intel_oa_register_queries_sklgt2; 430 if (devinfo->gt == 3) 431 return intel_oa_register_queries_sklgt3; 432 if (devinfo->gt == 4) 433 return intel_oa_register_queries_sklgt4; 434 } 435 if (devinfo->is_kabylake) { 436 if (devinfo->gt == 2) 437 return intel_oa_register_queries_kblgt2; 438 if (devinfo->gt == 3) 439 return intel_oa_register_queries_kblgt3; 440 } 441 if (devinfo->is_geminilake) 442 return intel_oa_register_queries_glk; 443 if (devinfo->is_coffeelake) { 444 if (devinfo->gt == 2) 445 return intel_oa_register_queries_cflgt2; 446 if (devinfo->gt == 3) 447 return intel_oa_register_queries_cflgt3; 448 } 449 if (devinfo->ver == 11) { 450 if (devinfo->is_elkhartlake) 451 return intel_oa_register_queries_ehl; 452 return intel_oa_register_queries_icl; 453 } 454 if (devinfo->is_tigerlake) { 455 if (devinfo->gt == 1) 456 return intel_oa_register_queries_tglgt1; 457 if (devinfo->gt == 2) 458 return intel_oa_register_queries_tglgt2; 459 } 460 if (devinfo->is_rocketlake) 461 return intel_oa_register_queries_rkl; 462 if (devinfo->is_dg1) 463 return intel_oa_register_queries_dg1; 464 if (devinfo->is_alderlake) 465 return intel_oa_register_queries_adl; 466 467 return NULL; 468} 469 470static int 471intel_perf_compare_counter_names(const void *v1, const void *v2) 472{ 473 const struct intel_perf_query_counter *c1 = v1; 474 const struct intel_perf_query_counter *c2 = v2; 475 476 return strcmp(c1->name, c2->name); 477} 478 479static void 480sort_query(struct intel_perf_query_info *q) 481{ 482 
qsort(q->counters, q->n_counters, sizeof(q->counters[0]), 483 intel_perf_compare_counter_names); 484} 485 486static void 487load_pipeline_statistic_metrics(struct intel_perf_config *perf_cfg, 488 const struct intel_device_info *devinfo) 489{ 490 struct intel_perf_query_info *query = 491 intel_perf_append_query_info(perf_cfg, MAX_STAT_COUNTERS); 492 493 query->kind = INTEL_PERF_QUERY_TYPE_PIPELINE; 494 query->name = "Pipeline Statistics Registers"; 495 496 intel_perf_query_add_basic_stat_reg(query, IA_VERTICES_COUNT, 497 "N vertices submitted"); 498 intel_perf_query_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT, 499 "N primitives submitted"); 500 intel_perf_query_add_basic_stat_reg(query, VS_INVOCATION_COUNT, 501 "N vertex shader invocations"); 502 503 if (devinfo->ver == 6) { 504 intel_perf_query_add_stat_reg(query, GFX6_SO_PRIM_STORAGE_NEEDED, 1, 1, 505 "SO_PRIM_STORAGE_NEEDED", 506 "N geometry shader stream-out primitives (total)"); 507 intel_perf_query_add_stat_reg(query, GFX6_SO_NUM_PRIMS_WRITTEN, 1, 1, 508 "SO_NUM_PRIMS_WRITTEN", 509 "N geometry shader stream-out primitives (written)"); 510 } else { 511 intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(0), 1, 1, 512 "SO_PRIM_STORAGE_NEEDED (Stream 0)", 513 "N stream-out (stream 0) primitives (total)"); 514 intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(1), 1, 1, 515 "SO_PRIM_STORAGE_NEEDED (Stream 1)", 516 "N stream-out (stream 1) primitives (total)"); 517 intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(2), 1, 1, 518 "SO_PRIM_STORAGE_NEEDED (Stream 2)", 519 "N stream-out (stream 2) primitives (total)"); 520 intel_perf_query_add_stat_reg(query, GFX7_SO_PRIM_STORAGE_NEEDED(3), 1, 1, 521 "SO_PRIM_STORAGE_NEEDED (Stream 3)", 522 "N stream-out (stream 3) primitives (total)"); 523 intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(0), 1, 1, 524 "SO_NUM_PRIMS_WRITTEN (Stream 0)", 525 "N stream-out (stream 0) primitives (written)"); 526 
intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(1), 1, 1, 527 "SO_NUM_PRIMS_WRITTEN (Stream 1)", 528 "N stream-out (stream 1) primitives (written)"); 529 intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(2), 1, 1, 530 "SO_NUM_PRIMS_WRITTEN (Stream 2)", 531 "N stream-out (stream 2) primitives (written)"); 532 intel_perf_query_add_stat_reg(query, GFX7_SO_NUM_PRIMS_WRITTEN(3), 1, 1, 533 "SO_NUM_PRIMS_WRITTEN (Stream 3)", 534 "N stream-out (stream 3) primitives (written)"); 535 } 536 537 intel_perf_query_add_basic_stat_reg(query, HS_INVOCATION_COUNT, 538 "N TCS shader invocations"); 539 intel_perf_query_add_basic_stat_reg(query, DS_INVOCATION_COUNT, 540 "N TES shader invocations"); 541 542 intel_perf_query_add_basic_stat_reg(query, GS_INVOCATION_COUNT, 543 "N geometry shader invocations"); 544 intel_perf_query_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT, 545 "N geometry shader primitives emitted"); 546 547 intel_perf_query_add_basic_stat_reg(query, CL_INVOCATION_COUNT, 548 "N primitives entering clipping"); 549 intel_perf_query_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT, 550 "N primitives leaving clipping"); 551 552 if (devinfo->is_haswell || devinfo->ver == 8) { 553 intel_perf_query_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, 554 "N fragment shader invocations", 555 "N fragment shader invocations"); 556 } else { 557 intel_perf_query_add_basic_stat_reg(query, PS_INVOCATION_COUNT, 558 "N fragment shader invocations"); 559 } 560 561 intel_perf_query_add_basic_stat_reg(query, PS_DEPTH_COUNT, 562 "N z-pass fragments"); 563 564 if (devinfo->ver >= 7) { 565 intel_perf_query_add_basic_stat_reg(query, CS_INVOCATION_COUNT, 566 "N compute shader invocations"); 567 } 568 569 query->data_size = sizeof(uint64_t) * query->n_counters; 570 571 sort_query(query); 572} 573 574static int 575i915_perf_version(int drm_fd) 576{ 577 int tmp; 578 drm_i915_getparam_t gp = { 579 .param = I915_PARAM_PERF_REVISION, 580 .value = &tmp, 581 }; 582 583 int ret = 
intel_ioctl(drm_fd, DRM_IOCTL_I915_GETPARAM, &gp); 584 585 /* Return 0 if this getparam is not supported, the first version supported 586 * is 1. 587 */ 588 return ret < 0 ? 0 : tmp; 589} 590 591static void 592i915_get_sseu(int drm_fd, struct drm_i915_gem_context_param_sseu *sseu) 593{ 594 struct drm_i915_gem_context_param arg = { 595 .param = I915_CONTEXT_PARAM_SSEU, 596 .size = sizeof(*sseu), 597 .value = to_user_pointer(sseu) 598 }; 599 600 intel_ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &arg); 601} 602 603static inline int 604compare_str_or_null(const char *s1, const char *s2) 605{ 606 if (s1 == NULL && s2 == NULL) 607 return 0; 608 if (s1 == NULL) 609 return -1; 610 if (s2 == NULL) 611 return 1; 612 613 return strcmp(s1, s2); 614} 615 616static int 617compare_counter_categories_and_names(const void *_c1, const void *_c2) 618{ 619 const struct intel_perf_query_counter_info *c1 = (const struct intel_perf_query_counter_info *)_c1; 620 const struct intel_perf_query_counter_info *c2 = (const struct intel_perf_query_counter_info *)_c2; 621 622 /* pipeline counters don't have an assigned category */ 623 int r = compare_str_or_null(c1->counter->category, c2->counter->category); 624 if (r) 625 return r; 626 627 return strcmp(c1->counter->name, c2->counter->name); 628} 629 630static void 631build_unique_counter_list(struct intel_perf_config *perf) 632{ 633 assert(perf->n_queries < 64); 634 635 size_t max_counters = 0; 636 637 for (int q = 0; q < perf->n_queries; q++) 638 max_counters += perf->queries[q].n_counters; 639 640 /* 641 * Allocate big enough array to hold maximum possible number of counters. 642 * We can't alloc it small and realloc when needed because the hash table 643 * below contains pointers to this array. 
644 */ 645 struct intel_perf_query_counter_info *counter_infos = 646 ralloc_array_size(perf, sizeof(counter_infos[0]), max_counters); 647 648 perf->n_counters = 0; 649 650 struct hash_table *counters_table = 651 _mesa_hash_table_create(perf, 652 _mesa_hash_string, 653 _mesa_key_string_equal); 654 struct hash_entry *entry; 655 for (int q = 0; q < perf->n_queries ; q++) { 656 struct intel_perf_query_info *query = &perf->queries[q]; 657 658 for (int c = 0; c < query->n_counters; c++) { 659 struct intel_perf_query_counter *counter; 660 struct intel_perf_query_counter_info *counter_info; 661 662 counter = &query->counters[c]; 663 entry = _mesa_hash_table_search(counters_table, counter->symbol_name); 664 665 if (entry) { 666 counter_info = entry->data; 667 counter_info->query_mask |= BITFIELD64_BIT(q); 668 continue; 669 } 670 assert(perf->n_counters < max_counters); 671 672 counter_info = &counter_infos[perf->n_counters++]; 673 counter_info->counter = counter; 674 counter_info->query_mask = BITFIELD64_BIT(q); 675 676 counter_info->location.group_idx = q; 677 counter_info->location.counter_idx = c; 678 679 _mesa_hash_table_insert(counters_table, counter->symbol_name, counter_info); 680 } 681 } 682 683 _mesa_hash_table_destroy(counters_table, NULL); 684 685 /* Now we can realloc counter_infos array because hash table doesn't exist. 
*/ 686 perf->counter_infos = reralloc_array_size(perf, counter_infos, 687 sizeof(counter_infos[0]), perf->n_counters); 688 689 qsort(perf->counter_infos, perf->n_counters, sizeof(perf->counter_infos[0]), 690 compare_counter_categories_and_names); 691} 692 693static bool 694oa_metrics_available(struct intel_perf_config *perf, int fd, 695 const struct intel_device_info *devinfo, 696 bool use_register_snapshots) 697{ 698 perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo); 699 bool i915_perf_oa_available = false; 700 struct stat sb; 701 702 perf->i915_query_supported = i915_query_perf_config_supported(perf, fd); 703 perf->i915_perf_version = i915_perf_version(fd); 704 705 /* Record the default SSEU configuration. */ 706 i915_get_sseu(fd, &perf->sseu); 707 708 /* The existence of this sysctl parameter implies the kernel supports 709 * the i915 perf interface. 710 */ 711 if (stat("/proc/sys/dev/i915/perf_stream_paranoid", &sb) == 0) { 712 713 /* If _paranoid == 1 then on Gfx8+ we won't be able to access OA 714 * metrics unless running as root. 
715 */ 716 if (devinfo->is_haswell) 717 i915_perf_oa_available = true; 718 else { 719 uint64_t paranoid = 1; 720 721 read_file_uint64("/proc/sys/dev/i915/perf_stream_paranoid", ¶noid); 722 723 if (paranoid == 0 || geteuid() == 0) 724 i915_perf_oa_available = true; 725 } 726 727 perf->platform_supported = oa_register != NULL; 728 } 729 730 return i915_perf_oa_available && 731 oa_register && 732 get_sysfs_dev_dir(perf, fd) && 733 init_oa_sys_vars(perf, devinfo, use_register_snapshots); 734} 735 736static void 737load_oa_metrics(struct intel_perf_config *perf, int fd, 738 const struct intel_device_info *devinfo) 739{ 740 int existing_queries = perf->n_queries; 741 742 perf_register_oa_queries_t oa_register = get_register_queries_function(devinfo); 743 744 perf->oa_metrics_table = 745 _mesa_hash_table_create(perf, _mesa_hash_string, 746 _mesa_key_string_equal); 747 748 /* Index all the metric sets mesa knows about before looking to see what 749 * the kernel is advertising. 750 */ 751 oa_register(perf); 752 753 if (!INTEL_DEBUG(DEBUG_NO_OACONFIG)) { 754 if (kernel_has_dynamic_config_support(perf, fd)) 755 init_oa_configs(perf, fd, devinfo); 756 else 757 enumerate_sysfs_metrics(perf, devinfo); 758 } else { 759 add_all_metrics(perf, devinfo); 760 } 761 762 /* sort counters in each individual group created by this function by name */ 763 for (int i = existing_queries; i < perf->n_queries; ++i) 764 sort_query(&perf->queries[i]); 765 766 /* Select a fallback OA metric. Look for the TestOa metric or use the last 767 * one if no present (on HSW). 
768 */ 769 for (int i = existing_queries; i < perf->n_queries; i++) { 770 if (perf->queries[i].symbol_name && 771 strcmp(perf->queries[i].symbol_name, "TestOa") == 0) { 772 perf->fallback_raw_oa_metric = perf->queries[i].oa_metrics_set_id; 773 break; 774 } 775 } 776 if (perf->fallback_raw_oa_metric == 0 && perf->n_queries > 0) 777 perf->fallback_raw_oa_metric = perf->queries[perf->n_queries - 1].oa_metrics_set_id; 778} 779 780struct intel_perf_registers * 781intel_perf_load_configuration(struct intel_perf_config *perf_cfg, int fd, const char *guid) 782{ 783 if (!perf_cfg->i915_query_supported) 784 return NULL; 785 786 struct drm_i915_perf_oa_config i915_config = { 0, }; 787 if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) 788 return NULL; 789 790 struct intel_perf_registers *config = rzalloc(NULL, struct intel_perf_registers); 791 config->n_flex_regs = i915_config.n_flex_regs; 792 config->flex_regs = rzalloc_array(config, struct intel_perf_query_register_prog, config->n_flex_regs); 793 config->n_mux_regs = i915_config.n_mux_regs; 794 config->mux_regs = rzalloc_array(config, struct intel_perf_query_register_prog, config->n_mux_regs); 795 config->n_b_counter_regs = i915_config.n_boolean_regs; 796 config->b_counter_regs = rzalloc_array(config, struct intel_perf_query_register_prog, config->n_b_counter_regs); 797 798 /* 799 * struct intel_perf_query_register_prog maps exactly to the tuple of 800 * (register offset, register value) returned by the i915. 
801 */ 802 i915_config.flex_regs_ptr = to_const_user_pointer(config->flex_regs); 803 i915_config.mux_regs_ptr = to_const_user_pointer(config->mux_regs); 804 i915_config.boolean_regs_ptr = to_const_user_pointer(config->b_counter_regs); 805 if (!i915_query_perf_config_data(perf_cfg, fd, guid, &i915_config)) { 806 ralloc_free(config); 807 return NULL; 808 } 809 810 return config; 811} 812 813uint64_t 814intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int fd, 815 const struct intel_perf_registers *config, 816 const char *guid) 817{ 818 if (guid) 819 return i915_add_config(perf_cfg, fd, config, guid); 820 821 struct mesa_sha1 sha1_ctx; 822 _mesa_sha1_init(&sha1_ctx); 823 824 if (config->flex_regs) { 825 _mesa_sha1_update(&sha1_ctx, config->flex_regs, 826 sizeof(config->flex_regs[0]) * 827 config->n_flex_regs); 828 } 829 if (config->mux_regs) { 830 _mesa_sha1_update(&sha1_ctx, config->mux_regs, 831 sizeof(config->mux_regs[0]) * 832 config->n_mux_regs); 833 } 834 if (config->b_counter_regs) { 835 _mesa_sha1_update(&sha1_ctx, config->b_counter_regs, 836 sizeof(config->b_counter_regs[0]) * 837 config->n_b_counter_regs); 838 } 839 840 uint8_t hash[20]; 841 _mesa_sha1_final(&sha1_ctx, hash); 842 843 char formatted_hash[41]; 844 _mesa_sha1_format(formatted_hash, hash); 845 846 char generated_guid[37]; 847 snprintf(generated_guid, sizeof(generated_guid), 848 "%.8s-%.4s-%.4s-%.4s-%.12s", 849 &formatted_hash[0], &formatted_hash[8], 850 &formatted_hash[8 + 4], &formatted_hash[8 + 4 + 4], 851 &formatted_hash[8 + 4 + 4 + 4]); 852 853 /* Check if already present. 
*/ 854 uint64_t id; 855 if (intel_perf_load_metric_id(perf_cfg, generated_guid, &id)) 856 return id; 857 858 return i915_add_config(perf_cfg, fd, config, generated_guid); 859} 860 861static uint64_t 862get_passes_mask(struct intel_perf_config *perf, 863 const uint32_t *counter_indices, 864 uint32_t counter_indices_count) 865{ 866 uint64_t queries_mask = 0; 867 868 assert(perf->n_queries < 64); 869 870 /* Compute the number of passes by going through all counters N times (with 871 * N the number of queries) to make sure we select the most constraining 872 * counters first and look at the more flexible ones (that could be 873 * obtained from multiple queries) later. That way we minimize the number 874 * of passes required. 875 */ 876 for (uint32_t q = 0; q < perf->n_queries; q++) { 877 for (uint32_t i = 0; i < counter_indices_count; i++) { 878 assert(counter_indices[i] < perf->n_counters); 879 880 uint32_t idx = counter_indices[i]; 881 if (util_bitcount64(perf->counter_infos[idx].query_mask) != (q + 1)) 882 continue; 883 884 if (queries_mask & perf->counter_infos[idx].query_mask) 885 continue; 886 887 queries_mask |= BITFIELD64_BIT(ffsll(perf->counter_infos[idx].query_mask) - 1); 888 } 889 } 890 891 return queries_mask; 892} 893 894uint32_t 895intel_perf_get_n_passes(struct intel_perf_config *perf, 896 const uint32_t *counter_indices, 897 uint32_t counter_indices_count, 898 struct intel_perf_query_info **pass_queries) 899{ 900 uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count); 901 902 if (pass_queries) { 903 uint32_t pass = 0; 904 for (uint32_t q = 0; q < perf->n_queries; q++) { 905 if ((1ULL << q) & queries_mask) 906 pass_queries[pass++] = &perf->queries[q]; 907 } 908 } 909 910 return util_bitcount64(queries_mask); 911} 912 913void 914intel_perf_get_counters_passes(struct intel_perf_config *perf, 915 const uint32_t *counter_indices, 916 uint32_t counter_indices_count, 917 struct intel_perf_counter_pass *counter_pass) 918{ 919 
uint64_t queries_mask = get_passes_mask(perf, counter_indices, counter_indices_count); 920 ASSERTED uint32_t n_passes = util_bitcount64(queries_mask); 921 922 for (uint32_t i = 0; i < counter_indices_count; i++) { 923 assert(counter_indices[i] < perf->n_counters); 924 925 uint32_t idx = counter_indices[i]; 926 counter_pass[i].counter = perf->counter_infos[idx].counter; 927 928 uint32_t query_idx = ffsll(perf->counter_infos[idx].query_mask & queries_mask) - 1; 929 counter_pass[i].query = &perf->queries[query_idx]; 930 931 uint32_t clear_bits = 63 - query_idx; 932 counter_pass[i].pass = util_bitcount64((queries_mask << clear_bits) >> clear_bits) - 1; 933 assert(counter_pass[i].pass < n_passes); 934 } 935} 936 937/* Accumulate 32bits OA counters */ 938static inline void 939accumulate_uint32(const uint32_t *report0, 940 const uint32_t *report1, 941 uint64_t *accumulator) 942{ 943 *accumulator += (uint32_t)(*report1 - *report0); 944} 945 946/* Accumulate 40bits OA counters */ 947static inline void 948accumulate_uint40(int a_index, 949 const uint32_t *report0, 950 const uint32_t *report1, 951 uint64_t *accumulator) 952{ 953 const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40); 954 const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40); 955 uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32; 956 uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32; 957 uint64_t value0 = report0[a_index + 4] | high0; 958 uint64_t value1 = report1[a_index + 4] | high1; 959 uint64_t delta; 960 961 if (value0 > value1) 962 delta = (1ULL << 40) + value1 - value0; 963 else 964 delta = value1 - value0; 965 966 *accumulator += delta; 967} 968 969static void 970gfx8_read_report_clock_ratios(const uint32_t *report, 971 uint64_t *slice_freq_hz, 972 uint64_t *unslice_freq_hz) 973{ 974 /* The lower 16bits of the RPT_ID field of the OA reports contains a 975 * snapshot of the bits coming from the RP_FREQ_NORMAL register and is 976 * divided this way : 977 * 978 * RPT_ID[31:25]: 
RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency) 979 * RPT_ID[10:9]: RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency) 980 * RPT_ID[8:0]: RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency) 981 * 982 * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request 983 * Multiple of 33.33MHz 2xclk (16 MHz 1xclk) 984 * 985 * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request 986 * Multiple of 33.33MHz 2xclk (16 MHz 1xclk) 987 */ 988 989 uint32_t unslice_freq = report[0] & 0x1ff; 990 uint32_t slice_freq_low = (report[0] >> 25) & 0x7f; 991 uint32_t slice_freq_high = (report[0] >> 9) & 0x3; 992 uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7); 993 994 *slice_freq_hz = slice_freq * 16666667ULL; 995 *unslice_freq_hz = unslice_freq * 16666667ULL; 996} 997 998void 999intel_perf_query_result_read_frequencies(struct intel_perf_query_result *result, 1000 const struct intel_device_info *devinfo, 1001 const uint32_t *start, 1002 const uint32_t *end) 1003{ 1004 /* Slice/Unslice frequency is only available in the OA reports when the 1005 * "Disable OA reports due to clock ratio change" field in 1006 * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this 1007 * global register (see drivers/gpu/drm/i915/i915_perf.c) 1008 * 1009 * Documentation says this should be available on Gfx9+ but experimentation 1010 * shows that Gfx8 reports similar values, so we enable it there too. 
1011 */ 1012 if (devinfo->ver < 8) 1013 return; 1014 1015 gfx8_read_report_clock_ratios(start, 1016 &result->slice_frequency[0], 1017 &result->unslice_frequency[0]); 1018 gfx8_read_report_clock_ratios(end, 1019 &result->slice_frequency[1], 1020 &result->unslice_frequency[1]); 1021} 1022 1023static inline bool 1024can_use_mi_rpc_bc_counters(const struct intel_device_info *devinfo) 1025{ 1026 return devinfo->ver <= 11; 1027} 1028 1029void 1030intel_perf_query_result_accumulate(struct intel_perf_query_result *result, 1031 const struct intel_perf_query_info *query, 1032 const struct intel_device_info *devinfo, 1033 const uint32_t *start, 1034 const uint32_t *end) 1035{ 1036 int i; 1037 1038 if (result->hw_id == INTEL_PERF_INVALID_CTX_ID && 1039 start[2] != INTEL_PERF_INVALID_CTX_ID) 1040 result->hw_id = start[2]; 1041 if (result->reports_accumulated == 0) 1042 result->begin_timestamp = start[1]; 1043 result->reports_accumulated++; 1044 1045 switch (query->oa_format) { 1046 case I915_OA_FORMAT_A32u40_A4u32_B8_C8: 1047 accumulate_uint32(start + 1, end + 1, 1048 result->accumulator + query->gpu_time_offset); /* timestamp */ 1049 accumulate_uint32(start + 3, end + 3, 1050 result->accumulator + query->gpu_clock_offset); /* clock */ 1051 1052 /* 32x 40bit A counters... */ 1053 for (i = 0; i < 32; i++) { 1054 accumulate_uint40(i, start, end, 1055 result->accumulator + query->a_offset + i); 1056 } 1057 1058 /* 4x 32bit A counters... */ 1059 for (i = 0; i < 4; i++) { 1060 accumulate_uint32(start + 36 + i, end + 36 + i, 1061 result->accumulator + query->a_offset + 32 + i); 1062 } 1063 1064 if (can_use_mi_rpc_bc_counters(devinfo)) { 1065 /* 8x 32bit B counters */ 1066 for (i = 0; i < 8; i++) { 1067 accumulate_uint32(start + 48 + i, end + 48 + i, 1068 result->accumulator + query->b_offset + i); 1069 } 1070 1071 /* 8x 32bit C counters... 
*/ 1072 for (i = 0; i < 8; i++) { 1073 accumulate_uint32(start + 56 + i, end + 56 + i, 1074 result->accumulator + query->c_offset + i); 1075 } 1076 } 1077 break; 1078 1079 case I915_OA_FORMAT_A45_B8_C8: 1080 accumulate_uint32(start + 1, end + 1, result->accumulator); /* timestamp */ 1081 1082 for (i = 0; i < 61; i++) { 1083 accumulate_uint32(start + 3 + i, end + 3 + i, 1084 result->accumulator + query->a_offset + i); 1085 } 1086 break; 1087 1088 default: 1089 unreachable("Can't accumulate OA counters in unknown format"); 1090 } 1091 1092} 1093 1094#define GET_FIELD(word, field) (((word) & field ## _MASK) >> field ## _SHIFT) 1095 1096void 1097intel_perf_query_result_read_gt_frequency(struct intel_perf_query_result *result, 1098 const struct intel_device_info *devinfo, 1099 const uint32_t start, 1100 const uint32_t end) 1101{ 1102 switch (devinfo->ver) { 1103 case 7: 1104 case 8: 1105 result->gt_frequency[0] = GET_FIELD(start, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL; 1106 result->gt_frequency[1] = GET_FIELD(end, GFX7_RPSTAT1_CURR_GT_FREQ) * 50ULL; 1107 break; 1108 case 9: 1109 case 11: 1110 case 12: 1111 result->gt_frequency[0] = GET_FIELD(start, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; 1112 result->gt_frequency[1] = GET_FIELD(end, GFX9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL; 1113 break; 1114 default: 1115 unreachable("unexpected gen"); 1116 } 1117 1118 /* Put the numbers into Hz. */ 1119 result->gt_frequency[0] *= 1000000ULL; 1120 result->gt_frequency[1] *= 1000000ULL; 1121} 1122 1123void 1124intel_perf_query_result_read_perfcnts(struct intel_perf_query_result *result, 1125 const struct intel_perf_query_info *query, 1126 const uint64_t *start, 1127 const uint64_t *end) 1128{ 1129 for (uint32_t i = 0; i < 2; i++) { 1130 uint64_t v0 = start[i] & PERF_CNT_VALUE_MASK; 1131 uint64_t v1 = end[i] & PERF_CNT_VALUE_MASK; 1132 1133 result->accumulator[query->perfcnt_offset + i] = v0 > v1 ? 
1134 (PERF_CNT_VALUE_MASK + 1 + v1 - v0) : 1135 (v1 - v0); 1136 } 1137} 1138 1139static uint32_t 1140query_accumulator_offset(const struct intel_perf_query_info *query, 1141 enum intel_perf_query_field_type type, 1142 uint8_t index) 1143{ 1144 switch (type) { 1145 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: 1146 return query->perfcnt_offset + index; 1147 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: 1148 return query->b_offset + index; 1149 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: 1150 return query->c_offset + index; 1151 default: 1152 unreachable("Invalid register type"); 1153 return 0; 1154 } 1155} 1156 1157void 1158intel_perf_query_result_accumulate_fields(struct intel_perf_query_result *result, 1159 const struct intel_perf_query_info *query, 1160 const struct intel_device_info *devinfo, 1161 const void *start, 1162 const void *end, 1163 bool no_oa_accumulate) 1164{ 1165 struct intel_perf_query_field_layout *layout = &query->perf->query_layout; 1166 1167 for (uint32_t r = 0; r < layout->n_fields; r++) { 1168 struct intel_perf_query_field *field = &layout->fields[r]; 1169 1170 if (field->type == INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC) { 1171 intel_perf_query_result_read_frequencies(result, devinfo, 1172 start + field->location, 1173 end + field->location); 1174 /* no_oa_accumulate=true is used when doing GL perf queries, we 1175 * manually parse the OA reports from the OA buffer and substract 1176 * unrelated deltas, so don't accumulate the begin/end reports here. 
1177 */ 1178 if (!no_oa_accumulate) { 1179 intel_perf_query_result_accumulate(result, query, devinfo, 1180 start + field->location, 1181 end + field->location); 1182 } 1183 } else { 1184 uint64_t v0, v1; 1185 1186 if (field->size == 4) { 1187 v0 = *(const uint32_t *)(start + field->location); 1188 v1 = *(const uint32_t *)(end + field->location); 1189 } else { 1190 assert(field->size == 8); 1191 v0 = *(const uint64_t *)(start + field->location); 1192 v1 = *(const uint64_t *)(end + field->location); 1193 } 1194 1195 if (field->mask) { 1196 v0 = field->mask & v0; 1197 v1 = field->mask & v1; 1198 } 1199 1200 /* RPSTAT is a bit of a special case because its begin/end values 1201 * represent frequencies. We store it in a separate location. 1202 */ 1203 if (field->type == INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT) 1204 intel_perf_query_result_read_gt_frequency(result, devinfo, v0, v1); 1205 else 1206 result->accumulator[query_accumulator_offset(query, field->type, field->index)] = v1 - v0; 1207 } 1208 } 1209} 1210 1211void 1212intel_perf_query_result_clear(struct intel_perf_query_result *result) 1213{ 1214 memset(result, 0, sizeof(*result)); 1215 result->hw_id = INTEL_PERF_INVALID_CTX_ID; 1216} 1217 1218void 1219intel_perf_query_result_print_fields(const struct intel_perf_query_info *query, 1220 const struct intel_device_info *devinfo, 1221 const void *data) 1222{ 1223 const struct intel_perf_query_field_layout *layout = &query->perf->query_layout; 1224 1225 for (uint32_t r = 0; r < layout->n_fields; r++) { 1226 const struct intel_perf_query_field *field = &layout->fields[r]; 1227 const uint32_t *value32 = data + field->location; 1228 1229 switch (field->type) { 1230 case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: 1231 fprintf(stderr, "MI_RPC:\n"); 1232 fprintf(stderr, " TS: 0x%08x\n", *(value32 + 1)); 1233 fprintf(stderr, " CLK: 0x%08x\n", *(value32 + 3)); 1234 break; 1235 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: 1236 fprintf(stderr, "B%u: 0x%08x\n", field->index, *value32); 
1237 break; 1238 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: 1239 fprintf(stderr, "C%u: 0x%08x\n", field->index, *value32); 1240 break; 1241 default: 1242 break; 1243 } 1244 } 1245} 1246 1247static int 1248intel_perf_compare_query_names(const void *v1, const void *v2) 1249{ 1250 const struct intel_perf_query_info *q1 = v1; 1251 const struct intel_perf_query_info *q2 = v2; 1252 1253 return strcmp(q1->name, q2->name); 1254} 1255 1256static inline struct intel_perf_query_field * 1257add_query_register(struct intel_perf_query_field_layout *layout, 1258 enum intel_perf_query_field_type type, 1259 uint16_t offset, 1260 uint16_t size, 1261 uint8_t index) 1262{ 1263 /* Align MI_RPC to 64bytes (HW requirement) & 64bit registers to 8bytes 1264 * (shows up nicely in the debugger). 1265 */ 1266 if (type == INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC) 1267 layout->size = align(layout->size, 64); 1268 else if (size % 8 == 0) 1269 layout->size = align(layout->size, 8); 1270 1271 layout->fields[layout->n_fields++] = (struct intel_perf_query_field) { 1272 .mmio_offset = offset, 1273 .location = layout->size, 1274 .type = type, 1275 .index = index, 1276 .size = size, 1277 }; 1278 layout->size += size; 1279 1280 return &layout->fields[layout->n_fields - 1]; 1281} 1282 1283static void 1284intel_perf_init_query_fields(struct intel_perf_config *perf_cfg, 1285 const struct intel_device_info *devinfo, 1286 bool use_register_snapshots) 1287{ 1288 struct intel_perf_query_field_layout *layout = &perf_cfg->query_layout; 1289 1290 layout->n_fields = 0; 1291 1292 /* MI_RPC requires a 64byte alignment. 
*/ 1293 layout->alignment = 64; 1294 1295 layout->fields = rzalloc_array(perf_cfg, struct intel_perf_query_field, 5 + 16); 1296 1297 add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC, 1298 0, 256, 0); 1299 1300 if (use_register_snapshots) { 1301 if (devinfo->ver <= 11) { 1302 struct intel_perf_query_field *field = 1303 add_query_register(layout, 1304 INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT, 1305 PERF_CNT_1_DW0, 8, 0); 1306 field->mask = PERF_CNT_VALUE_MASK; 1307 1308 field = add_query_register(layout, 1309 INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT, 1310 PERF_CNT_2_DW0, 8, 1); 1311 field->mask = PERF_CNT_VALUE_MASK; 1312 } 1313 1314 if (devinfo->ver == 8 && !devinfo->is_cherryview) { 1315 add_query_register(layout, 1316 INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT, 1317 GFX7_RPSTAT1, 4, 0); 1318 } 1319 1320 if (devinfo->ver >= 9) { 1321 add_query_register(layout, 1322 INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT, 1323 GFX9_RPSTAT0, 4, 0); 1324 } 1325 1326 if (!can_use_mi_rpc_bc_counters(devinfo)) { 1327 if (devinfo->ver >= 8 && devinfo->ver <= 11) { 1328 for (uint32_t i = 0; i < GFX8_N_OA_PERF_B32; i++) { 1329 add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B, 1330 GFX8_OA_PERF_B32(i), 4, i); 1331 } 1332 for (uint32_t i = 0; i < GFX8_N_OA_PERF_C32; i++) { 1333 add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C, 1334 GFX8_OA_PERF_C32(i), 4, i); 1335 } 1336 } else if (devinfo->ver == 12) { 1337 for (uint32_t i = 0; i < GFX12_N_OAG_PERF_B32; i++) { 1338 add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B, 1339 GFX12_OAG_PERF_B32(i), 4, i); 1340 } 1341 for (uint32_t i = 0; i < GFX12_N_OAG_PERF_C32; i++) { 1342 add_query_register(layout, INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C, 1343 GFX12_OAG_PERF_C32(i), 4, i); 1344 } 1345 } 1346 } 1347 } 1348 1349 /* Align the whole package to 64bytes so that 2 snapshots can be put 1350 * together without extract alignment for the user. 
1351 */ 1352 layout->size = align(layout->size, 64); 1353} 1354 1355void 1356intel_perf_init_metrics(struct intel_perf_config *perf_cfg, 1357 const struct intel_device_info *devinfo, 1358 int drm_fd, 1359 bool include_pipeline_statistics, 1360 bool use_register_snapshots) 1361{ 1362 intel_perf_init_query_fields(perf_cfg, devinfo, use_register_snapshots); 1363 1364 if (include_pipeline_statistics) { 1365 load_pipeline_statistic_metrics(perf_cfg, devinfo); 1366 intel_perf_register_mdapi_statistic_query(perf_cfg, devinfo); 1367 } 1368 1369 bool oa_metrics = oa_metrics_available(perf_cfg, drm_fd, devinfo, 1370 use_register_snapshots); 1371 if (oa_metrics) 1372 load_oa_metrics(perf_cfg, drm_fd, devinfo); 1373 1374 /* sort query groups by name */ 1375 qsort(perf_cfg->queries, perf_cfg->n_queries, 1376 sizeof(perf_cfg->queries[0]), intel_perf_compare_query_names); 1377 1378 build_unique_counter_list(perf_cfg); 1379 1380 if (oa_metrics) 1381 intel_perf_register_mdapi_oa_query(perf_cfg, devinfo); 1382} 1383