1/* 2 * Copyright © 2018 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 */

#ifndef INTEL_PERF_H
#define INTEL_PERF_H

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#if defined(MAJOR_IN_SYSMACROS)
#include <sys/sysmacros.h>
#elif defined(MAJOR_IN_MKDEV)
#include <sys/mkdev.h>
#endif

#include "util/hash_table.h"
#include "compiler/glsl/list.h"
#include "util/ralloc.h"

#include "drm-uapi/i915_drm.h"

#ifdef __cplusplus
extern "C" {
#endif

struct intel_device_info;

struct intel_perf_config;
struct intel_perf_query_info;

/* Context hardware ID value meaning "no valid context". */
#define INTEL_PERF_INVALID_CTX_ID (0xffffffff)

/* High-level classification of a performance counter. */
enum intel_perf_counter_type {
   INTEL_PERF_COUNTER_TYPE_EVENT,
   INTEL_PERF_COUNTER_TYPE_DURATION_NORM,
   INTEL_PERF_COUNTER_TYPE_DURATION_RAW,
   INTEL_PERF_COUNTER_TYPE_THROUGHPUT,
   INTEL_PERF_COUNTER_TYPE_RAW,
   INTEL_PERF_COUNTER_TYPE_TIMESTAMP,
};

/* Storage type in which a counter value is returned (see
 * intel_perf_query_counter_get_size()).
 */
enum intel_perf_counter_data_type {
   INTEL_PERF_COUNTER_DATA_TYPE_BOOL32,
   INTEL_PERF_COUNTER_DATA_TYPE_UINT32,
   INTEL_PERF_COUNTER_DATA_TYPE_UINT64,
   INTEL_PERF_COUNTER_DATA_TYPE_FLOAT,
   INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE,
};

/* Unit in which a counter value is expressed. */
enum intel_perf_counter_units {
   /* size */
   INTEL_PERF_COUNTER_UNITS_BYTES,

   /* frequency */
   INTEL_PERF_COUNTER_UNITS_HZ,

   /* time */
   INTEL_PERF_COUNTER_UNITS_NS,
   INTEL_PERF_COUNTER_UNITS_US,

   /* quantities */
   INTEL_PERF_COUNTER_UNITS_PIXELS,
   INTEL_PERF_COUNTER_UNITS_TEXELS,
   INTEL_PERF_COUNTER_UNITS_THREADS,
   INTEL_PERF_COUNTER_UNITS_PERCENT,

   /* events */
   INTEL_PERF_COUNTER_UNITS_MESSAGES,
   INTEL_PERF_COUNTER_UNITS_NUMBER,
   INTEL_PERF_COUNTER_UNITS_CYCLES,
   INTEL_PERF_COUNTER_UNITS_EVENTS,
   INTEL_PERF_COUNTER_UNITS_UTILIZATION,

   /* EU <-> L3 cache traffic */
   INTEL_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES,
   INTEL_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES,
   INTEL_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES,
   INTEL_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE,

   INTEL_PERF_COUNTER_UNITS_MAX
};

/* Description of a pipeline-statistics register backing a counter: the
 * raw register value is scaled by numerator/denominator.
 */
struct intel_pipeline_stat {
   uint32_t reg;
   uint32_t numerator;
   uint32_t denominator;
};

/*
 * The largest OA formats we can use include:
 * For Haswell:
 *   1 timestamp, 45 A counters, 8 B counters and 8 C counters.
 * For Gfx8+
 *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
 *
 * Plus 2 PERF_CNT registers and 1 RPSTAT register.
 */
#define MAX_OA_REPORT_COUNTERS (62 + 2 + 1)

/*
 * We currently allocate only one page for pipeline statistics queries. Here
 * we derive the maximum number of counters for that amount.
 */
#define STATS_BO_SIZE               4096
#define STATS_BO_END_OFFSET_BYTES   (STATS_BO_SIZE / 2)
#define MAX_STAT_COUNTERS           (STATS_BO_END_OFFSET_BYTES / 8)

#define I915_PERF_OA_SAMPLE_SIZE (8 +   /* drm_i915_perf_record_header */ \
                                  256)  /* OA counter report */

struct intel_perf_query_result {
   /**
    * Storage for the final accumulated OA counters.
    */
   uint64_t accumulator[MAX_OA_REPORT_COUNTERS];

   /**
    * Hw ID used by the context on which the query was running.
    */
   uint32_t hw_id;

   /**
    * Number of reports accumulated to produce the results.
    */
   uint32_t reports_accumulated;

   /**
    * Frequency in the slices of the GT at the begin and end of the
    * query.
    */
   uint64_t slice_frequency[2];

   /**
    * Frequency in the unslice of the GT at the begin and end of the
    * query.
    */
   uint64_t unslice_frequency[2];

   /**
    * Frequency of the whole GT at the begin and end of the query.
    */
   uint64_t gt_frequency[2];

   /**
    * Timestamp of the query.
    */
   uint64_t begin_timestamp;

   /**
    * Whether the query was interrupted by another workload (aka preemption).
    */
   bool query_disjoint;
};

/* Description of a single counter exposed by a query, including how to
 * read its value out of an intel_perf_query_result.
 */
struct intel_perf_query_counter {
   const char *name;
   const char *desc;
   const char *symbol_name;
   const char *category;
   enum intel_perf_counter_type type;
   enum intel_perf_counter_data_type data_type;
   enum intel_perf_counter_units units;
   /* Maximum value this counter can reach (0 if unknown/unbounded). */
   uint64_t raw_max;
   /* Offset of this counter's value in the query's data buffer. */
   size_t offset;

   union {
      /* Accessor used when data_type is a uint64 OA counter. */
      uint64_t (*oa_counter_read_uint64)(struct intel_perf_config *perf,
                                         const struct intel_perf_query_info *query,
                                         const struct intel_perf_query_result *results);
      /* Accessor used when data_type is a float OA counter. */
      float (*oa_counter_read_float)(struct intel_perf_config *perf,
                                     const struct intel_perf_query_info *query,
                                     const struct intel_perf_query_result *results);
      /* Register description when this is a pipeline-statistics counter. */
      struct intel_pipeline_stat pipeline_stat;
   };
};

/* A single register write (offset/value pair) in an OA configuration. */
struct intel_perf_query_register_prog {
   uint32_t reg;
   uint32_t val;
};

/* Register programming for a given query */
struct intel_perf_registers {
   const struct intel_perf_query_register_prog *flex_regs;
   uint32_t n_flex_regs;

   const struct intel_perf_query_register_prog *mux_regs;
   uint32_t n_mux_regs;

   const struct intel_perf_query_register_prog *b_counter_regs;
   uint32_t n_b_counter_regs;
};

/* Description of one query (a named set of counters and the HW
 * configuration required to collect them).
 */
struct intel_perf_query_info {
   struct intel_perf_config *perf;

   enum intel_perf_query_type {
      INTEL_PERF_QUERY_TYPE_OA,
      INTEL_PERF_QUERY_TYPE_RAW,
      INTEL_PERF_QUERY_TYPE_PIPELINE,
   } kind;
   const char *name;
   const char *symbol_name;
   /* GUID identifying the OA metric set (used to look up the kernel
    * metric id).
    */
   const char *guid;
   struct intel_perf_query_counter *counters;
   int n_counters;
   int max_counters;
   /* Size of the data buffer needed to hold all counter values. */
   size_t data_size;

   /* OA specific */
   uint64_t oa_metrics_set_id;
   int oa_format;

   /* For indexing into the accumulator[] ... */
   int gpu_time_offset;
   int gpu_clock_offset;
   int a_offset;
   int b_offset;
   int c_offset;
   int perfcnt_offset;
   int rpstat_offset;

   struct intel_perf_registers config;
};

/* When not using the MI_RPC command, this structure describes the list of
 * register offsets as well as their storage location so that they can be
 * stored through a series of MI_SRM commands and accumulated with
 * intel_perf_query_result_accumulate_fields().
 */
struct intel_perf_query_field_layout {
   /* Alignment for the layout */
   uint32_t alignment;

   /* Size of the whole layout */
   uint32_t size;

   uint32_t n_fields;

   struct intel_perf_query_field {
      /* MMIO location of this register */
      uint16_t mmio_offset;

      /* Location of this register in the storage */
      uint16_t location;

      /* Type of register, for accumulation (see intel_perf_query_info:*_offset
       * fields)
       */
      enum intel_perf_query_field_type {
         INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B,
         INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C,
      } type;

      /* Index of register in the given type (for instance A31 or B2,
       * etc...)
       */
      uint8_t index;

      /* 4, 8 or 256 */
      uint16_t size;

      /* If not 0, mask to apply to the register value. */
      uint64_t mask;
   } *fields;
};

struct intel_perf_query_counter_info {
   struct intel_perf_query_counter *counter;

   /* Bitmask of the queries (by index) this counter appears in. */
   uint64_t query_mask;

   /**
    * Each counter can be a part of many groups, each time at different index.
    * This struct stores one of those locations.
    */
   struct {
      int group_idx;   /* query/group number */
      int counter_idx; /* index inside of query/group */
   } location;
};

/* Top-level performance-metrics state for a device. */
struct intel_perf_config {
   /* Whether i915 has DRM_I915_QUERY_PERF_CONFIG support. */
   bool i915_query_supported;

   /* Version of the i915-perf subsystem, refer to i915_drm.h. */
   int i915_perf_version;

   /* Powergating configuration for the running the query. */
   struct drm_i915_gem_context_param_sseu sseu;

   struct intel_perf_query_info *queries;
   int n_queries;

   struct intel_perf_query_counter_info *counter_infos;
   int n_counters;

   struct intel_perf_query_field_layout query_layout;

   /* Variables referenced in the XML meta data for OA performance
    * counters, e.g in the normalization equations.
    *
    * All uint64_t for consistent operand types in generated code
    */
   struct {
      uint64_t timestamp_frequency; /** $GpuTimestampFrequency */
      uint64_t n_eus;               /** $EuCoresTotalCount */
      uint64_t n_eu_slices;         /** $EuSlicesTotalCount */
      uint64_t n_eu_sub_slices;     /** $EuSubslicesTotalCount */
      uint64_t eu_threads_count;    /** $EuThreadsCount */
      uint64_t slice_mask;          /** $SliceMask */
      uint64_t subslice_mask;       /** $SubsliceMask */
      uint64_t gt_min_freq;         /** $GpuMinFrequency */
      uint64_t gt_max_freq;         /** $GpuMaxFrequency */
      uint64_t revision;            /** $SkuRevisionId */
      bool query_mode;              /** $QueryMode */
   } sys_vars;

   /* OA metric sets, indexed by GUID, as known by Mesa at build time, to
    * cross-reference with the GUIDs of configs advertised by the kernel at
    * runtime
    */
   struct hash_table *oa_metrics_table;

   /* When MDAPI hasn't configured the metric we need to use by the time the
    * query begins, this OA metric is used as a fallback.
    */
   uint64_t fallback_raw_oa_metric;

   /* Whether we have support for this platform. If true && n_queries == 0,
    * this means we will not be able to use i915-perf because it is in
    * paranoid mode.
    */
   bool platform_supported;

   /* Location of the device's sysfs entry. */
   char sysfs_dev_dir[256];

   /* Driver-provided callbacks so this code stays independent of the
    * GL/Vulkan buffer-object and batch APIs.
    */
   struct {
      void *(*bo_alloc)(void *bufmgr, const char *name, uint64_t size);
      void (*bo_unreference)(void *bo);
      void *(*bo_map)(void *ctx, void *bo, unsigned flags);
      void (*bo_unmap)(void *bo);
      bool (*batch_references)(void *batch, void *bo);
      void (*bo_wait_rendering)(void *bo);
      int (*bo_busy)(void *bo);
      void (*emit_stall_at_pixel_scoreboard)(void *ctx);
      void (*emit_mi_report_perf_count)(void *ctx,
                                        void *bo,
                                        uint32_t offset_in_bytes,
                                        uint32_t report_id);
      void (*batchbuffer_flush)(void *ctx,
                                const char *file, int line);
      void (*store_register_mem)(void *ctx, void *bo, uint32_t reg, uint32_t reg_size, uint32_t offset);

   } vtbl;
};

/* One (query, counter) pair scheduled in a given collection pass. */
struct intel_perf_counter_pass {
   struct intel_perf_query_info *query;
   struct intel_perf_query_counter *counter;
   uint32_t pass;
};

/** Initialize the intel_perf_config object for a given device.
 *
 *    include_pipeline_statistics : Whether to add a pipeline statistic query
 *                                  intel_perf_query_info object
 *
 *    use_register_snapshots : Whether the queries should include counters
 *                             that rely on register snapshots using command
 *                             streamer instructions (not possible when using
 *                             only the OA buffer data).
 */
void intel_perf_init_metrics(struct intel_perf_config *perf_cfg,
                             const struct intel_device_info *devinfo,
                             int drm_fd,
                             bool include_pipeline_statistics,
                             bool use_register_snapshots);

/** Query i915 for a metric id using guid.
 */
bool intel_perf_load_metric_id(struct intel_perf_config *perf_cfg,
                               const char *guid,
                               uint64_t *metric_id);

/** Load a configuration's content from i915 using a guid.
 */
struct intel_perf_registers *intel_perf_load_configuration(struct intel_perf_config *perf_cfg,
                                                           int fd, const char *guid);

/** Store a configuration into i915 using guid and return a new metric id.
 *
 * If guid is NULL, then a generated one will be provided by hashing the
 * content of the configuration.
 */
uint64_t intel_perf_store_configuration(struct intel_perf_config *perf_cfg, int fd,
                                        const struct intel_perf_registers *config,
                                        const char *guid);

/** Read the slice/unslice frequency from 2 OA reports and store them into
 * result.
 */
void intel_perf_query_result_read_frequencies(struct intel_perf_query_result *result,
                                              const struct intel_device_info *devinfo,
                                              const uint32_t *start,
                                              const uint32_t *end);

/** Store the GT frequency as reported by the RPSTAT register.
 */
void intel_perf_query_result_read_gt_frequency(struct intel_perf_query_result *result,
                                               const struct intel_device_info *devinfo,
                                               const uint32_t start,
                                               const uint32_t end);

/** Store PERFCNT registers values.
 */
void intel_perf_query_result_read_perfcnts(struct intel_perf_query_result *result,
                                           const struct intel_perf_query_info *query,
                                           const uint64_t *start,
                                           const uint64_t *end);

/** Accumulate the delta between 2 OA reports into result for a given query.
 */
void intel_perf_query_result_accumulate(struct intel_perf_query_result *result,
                                        const struct intel_perf_query_info *query,
                                        const struct intel_device_info *devinfo,
                                        const uint32_t *start,
                                        const uint32_t *end);

/** Accumulate the delta between 2 snapshots of OA perf registers (layout
 * should match description specified through intel_perf_query_register_layout).
 */
void intel_perf_query_result_accumulate_fields(struct intel_perf_query_result *result,
                                               const struct intel_perf_query_info *query,
                                               const struct intel_device_info *devinfo,
                                               const void *start,
                                               const void *end,
                                               bool no_oa_accumulate);

/** Reset a result to its initial (empty) state. */
void intel_perf_query_result_clear(struct intel_perf_query_result *result);

/** Debug helper printing out query data.
 */
void intel_perf_query_result_print_fields(const struct intel_perf_query_info *query,
                                          const struct intel_device_info *devinfo,
                                          const void *data);

/** Return the size in bytes of a counter's value, based on its data type. */
static inline size_t
intel_perf_query_counter_get_size(const struct intel_perf_query_counter *counter)
{
   switch (counter->data_type) {
   case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
      return sizeof(uint32_t);
   case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
      return sizeof(uint32_t);
   case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
      return sizeof(uint64_t);
   case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
      return sizeof(float);
   case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
      return sizeof(double);
   default:
      unreachable("invalid counter data type");
   }
}

/** Allocate a zero-initialized intel_perf_config as a ralloc child of ctx. */
static inline struct intel_perf_config *
intel_perf_new(void *ctx)
{
   struct intel_perf_config *perf = rzalloc(ctx, struct intel_perf_config);
   return perf;
}

/** Whether we have the ability to hold off preemption on a batch so we don't
 * have to look at the OA buffer to subtract unrelated workloads off the
 * values captured through MI_* commands.
 */
static inline bool
intel_perf_has_hold_preemption(const struct intel_perf_config *perf)
{
   return perf->i915_perf_version >= 3;
}

/** Whether we have the ability to lock EU array power configuration for the
 * duration of the performance recording. This is useful on Gfx11 where the HW
 * architecture requires half the EU for particular workloads.
 */
static inline bool
intel_perf_has_global_sseu(const struct intel_perf_config *perf)
{
   return perf->i915_perf_version >= 4;
}

/** Compute the number of collection passes needed to cover the given
 * counters; optionally (if pass_queries != NULL) also return the query
 * used by each pass.
 */
uint32_t intel_perf_get_n_passes(struct intel_perf_config *perf,
                                 const uint32_t *counter_indices,
                                 uint32_t counter_indices_count,
                                 struct intel_perf_query_info **pass_queries);

/** Fill counter_pass with the (query, counter, pass) assignment for each of
 * the given counter indices.
 */
void intel_perf_get_counters_passes(struct intel_perf_config *perf,
                                    const uint32_t *counter_indices,
                                    uint32_t counter_indices_count,
                                    struct intel_perf_counter_pass *counter_pass);

#ifdef __cplusplus
} // extern "C"
#endif

#endif /* INTEL_PERF_H */