/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "radv_cs.h"
#include "radv_private.h"
#include "sid.h"

#define SQTT_BUFFER_ALIGN_SHIFT 12

bool
radv_is_instruction_timing_enabled(void)
{
   return getenv("RADV_THREAD_TRACE_PIPELINE");
}

static bool
radv_se_is_disabled(struct radv_device *device, unsigned se)
{
   /* No active CU on the SE means it is disabled. */
   return device->physical_device->rad_info.cu_mask[se][0] == 0;
}

static uint32_t
gfx10_get_thread_trace_ctrl(struct radv_device *device, bool enable)
{
   uint32_t thread_trace_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) |
                                S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) | /* 4096 clk */
                                S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) |
                                S_008D1C_SPI_STALL_EN(1) | S_008D1C_SQ_STALL_EN(1) |
                                S_008D1C_REG_DROP_ON_STALL(0);

   if (device->physical_device->rad_info.chip_class == GFX10_3)
      thread_trace_ctrl |= S_008D1C_LOWATER_OFFSET(4);

   return thread_trace_ctrl;
}
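
/*
 * Note: both trace directions share this helper. The start path writes it
 * with enable=true as the very last register of the GFX10+ start sequence,
 * and the stop path writes it again with enable=false, so every field other
 * than MODE is programmed identically when enabling and disabling.
 */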

static void
radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *cs,
                             uint32_t queue_family_index)
{
   uint32_t shifted_size = device->thread_trace.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   struct radeon_info *rad_info = &device->physical_device->rad_info;
   unsigned max_se = rad_info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
      uint64_t data_va = ac_thread_trace_get_data_va(rad_info, &device->thread_trace, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
      int first_active_cu = ffs(device->physical_device->rad_info.cu_mask[se][0]);

      if (radv_se_is_disabled(device, se))
         continue;

      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(
         cs, R_030800_GRBM_GFX_INDEX,
         S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.chip_class >= GFX10) {
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(
            cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
            S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(cs, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         radeon_set_privileged_config_reg(
            cs, R_008D14_SQ_THREAD_TRACE_MASK,
            S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
               S_008D14_SA_SEL(0) | S_008D14_WGP_SEL(first_active_cu / 2) | S_008D14_SIMD_SEL(0));

         uint32_t thread_trace_token_mask = S_008D18_REG_INCLUDE(
            V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC | V_008D18_REG_INCLUDE_GFXUDEC |
            V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;

         if (!radv_is_instruction_timing_enabled()) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC |
                             V_008D18_TOKEN_EXCLUDE_ALUEXEC |
                             V_008D18_TOKEN_EXCLUDE_VALUINST |
                             V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                             V_008D18_TOKEN_EXCLUDE_INST;
         }
         thread_trace_token_mask |= S_008D18_TOKEN_EXCLUDE(token_exclude);

         radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
                                          thread_trace_token_mask);

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          gfx10_get_thread_trace_ctrl(device, true));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(cs, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(cs, R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));

         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) | S_030CC8_SH_SEL(0) |
                                      S_030CC8_SIMD_EN(0xf) | S_030CC8_VM_ID_MASK(0) |
                                      S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
                                      S_030CC8_SQ_STALL_EN(1);

         if (device->physical_device->rad_info.chip_class < GFX9) {
            thread_trace_mask |= S_030CC8_RANDOM_SEED(0xffff);
         }

         radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, thread_trace_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(
            cs, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
            S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(cs, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(cs, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(cs, R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4));

         if (device->physical_device->rad_info.chip_class == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(cs, R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t thread_trace_mode =
            S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
            S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
            S_030CD8_MODE(1);

         if (device->physical_device->rad_info.chip_class == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, thread_trace_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (queue_family_index == RADV_QUEUE_COMPUTE &&
       device->physical_device->rad_info.chip_class >= GFX7) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
}
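
/*
 * Worked example for the buffer programming above (hypothetical VA): with a
 * per-SE data VA of 0x1234_5678_0000 and a 32 MiB buffer,
 *
 *    shifted_va   = 0x123456780000 >> SQTT_BUFFER_ALIGN_SHIFT = 0x123456780
 *    shifted_size = (32 * 1024 * 1024) >> SQTT_BUFFER_ALIGN_SHIFT = 0x2000
 *
 * so BUF0_BASE (or SQ_THREAD_TRACE_BASE before GFX10) receives the low
 * 32 bits 0x23456780, BASE_HI/BASE2 receives 0x1, and the size is expressed
 * in 4 KiB units. This encoding only works because
 * radv_thread_trace_init_bo() aligns both the buffer size and the
 * allocation to 1 << SQTT_BUFFER_ALIGN_SHIFT.
 */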

static const uint32_t gfx8_thread_trace_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_008E40_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx9_thread_trace_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_thread_trace_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static void
radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbuf *cs,
                                 unsigned se_index)
{
   const uint32_t *thread_trace_info_regs = NULL;

   if (device->physical_device->rad_info.chip_class >= GFX10) {
      thread_trace_info_regs = gfx10_thread_trace_info_regs;
   } else if (device->physical_device->rad_info.chip_class == GFX9) {
      thread_trace_info_regs = gfx9_thread_trace_info_regs;
   } else {
      assert(device->physical_device->rad_info.chip_class == GFX8);
      thread_trace_info_regs = gfx8_thread_trace_info_regs;
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, thread_trace_info_regs[i] >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, (info_va + i * 4));
      radeon_emit(cs, (info_va + i * 4) >> 32);
   }
}
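
/*
 * Note (hedged): the three dwords copied above fill the per-SE
 * struct ac_thread_trace_info slot at the front of the trace BO, which is
 * what ac_is_thread_trace_complete() inspects in radv_get_thread_trace().
 * COPY_DATA_PERF selects the perfcounter/privileged register port as the
 * source, presumably because these SQTT registers are not readable through
 * the regular register path on GFX10+.
 */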

static void
radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs,
                            uint32_t queue_family_index)
{
   unsigned max_se = device->physical_device->rad_info.max_se;

   /* Stop the thread trace with a different event based on the queue. */
   if (queue_family_index == RADV_QUEUE_COMPUTE &&
       device->physical_device->rad_info.chip_class >= GFX7) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));

   for (unsigned se = 0; se < max_se; se++) {
      if (radv_se_is_disabled(device, se))
         continue;

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(
         cs, R_030800_GRBM_GFX_INDEX,
         S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.chip_class >= GFX10) {
         /* Make sure to wait for the trace buffer. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the
                                                   * reference value */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0); /* reference value */
         radeon_emit(cs, ~C_008D20_FINISH_DONE); /* mask */
         radeon_emit(cs, 4); /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          gfx10_get_thread_trace_ctrl(device, false));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the
                                               * reference value */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0); /* reference value */
         radeon_emit(cs, ~C_008D20_BUSY); /* mask */
         radeon_emit(cs, 4); /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the
                                               * reference value */
         radeon_emit(cs, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0); /* reference value */
         radeon_emit(cs, ~C_030CE8_BUSY); /* mask */
         radeon_emit(cs, 4); /* poll interval */
      }

      radv_copy_thread_trace_info_regs(device, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));
}
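
/*
 * WAIT_REG_MEM semantics as used above (sketch): the CP polls until
 * (register & mask) <op> reference holds. For the first GFX10 wait,
 *
 *    op   = WAIT_REG_MEM_NOT_EQUAL
 *    reg  = SQ_THREAD_TRACE_STATUS
 *    ref  = 0
 *    mask = ~C_008D20_FINISH_DONE   (only the FINISH_DONE field)
 *
 * i.e. spin until FINISH_DONE becomes non-zero, while the BUSY waits use
 * WAIT_REG_MEM_EQUAL against a reference of 0 to spin until the SQTT unit
 * reports idle.
 */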

void
radv_emit_thread_trace_userdata(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                const void *data, uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_check_space(device->ws, cs, 2 + count);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      if (device->physical_device->rad_info.chip_class >= GFX10)
         radeon_set_uconfig_reg_seq_perfctr(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      else
         radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(cs, dwords, count);

      dwords += count;
      num_dwords -= count;
   }
}
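
/*
 * Usage sketch (hypothetical caller; the real marker emission lives with
 * the RGP marker code): instrumentation data is streamed through the
 * USERDATA_2/USERDATA_3 pair two dwords at a time, e.g.:
 *
 *    uint32_t marker[2] = { ... };
 *    radv_emit_thread_trace_userdata(device, cs, marker, 2);
 */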

static void
radv_emit_spi_config_cntl(struct radv_device *device, struct radeon_cmdbuf *cs, bool enable)
{
   if (device->physical_device->rad_info.chip_class >= GFX9) {
      uint32_t spi_config_cntl =
         S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) |
         S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (device->physical_device->rad_info.chip_class >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(
         cs, R_009100_SPI_CONFIG_CNTL,
         S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
}

static void
radv_emit_inhibit_clockgating(struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (device->physical_device->rad_info.chip_class >= GFX10) {
      radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (device->physical_device->rad_info.chip_class >= GFX8) {
      radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
}

static void
radv_emit_wait_for_idle(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   enum rgp_flush_bits sqtt_flush_bits = 0;
   si_cs_emit_cache_flush(
      cs, device->physical_device->rad_info.chip_class, NULL, 0,
      family == RADV_QUEUE_COMPUTE && device->physical_device->rad_info.chip_class >= GFX7,
      (family == RADV_QUEUE_COMPUTE
          ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH
          : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
         RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE |
         RADV_CMD_FLAG_INV_L2,
      &sqtt_flush_bits, 0);
}

static bool
radv_thread_trace_init_bo(struct radv_device *device)
{
   unsigned max_se = device->physical_device->rad_info.max_se;
   struct radeon_winsys *ws = device->ws;
   VkResult result;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   device->thread_trace.buffer_size =
      align64(device->thread_trace.buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_thread_trace_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += device->thread_trace.buffer_size * (uint64_t)max_se;

   struct radeon_winsys_bo *bo = NULL;
   result = ws->buffer_create(
      ws, size, 4096, RADEON_DOMAIN_VRAM,
      RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
      RADV_BO_PRIORITY_SCRATCH, 0, &bo);
   device->thread_trace.bo = bo;
   if (result != VK_SUCCESS)
      return false;

   result = ws->buffer_make_resident(ws, device->thread_trace.bo, true);
   if (result != VK_SUCCESS)
      return false;

   device->thread_trace.ptr = ws->buffer_map(device->thread_trace.bo);
   if (!device->thread_trace.ptr)
      return false;

   return true;
}
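
/*
 * Size example for the BO created above (hedged numbers): with max_se = 4
 * and the default 32 MiB per-SE buffer, and assuming the info array fits in
 * a single 4 KiB page,
 *
 *    size = align64(sizeof(struct ac_thread_trace_info) * 4, 4096)  // 4 KiB
 *         + 4 * 32 MiB                                              // data
 *
 * i.e. one page of per-SE info structs followed by the per-SE data slices
 * that ac_thread_trace_get_info_va() and ac_thread_trace_get_data_va()
 * index into.
 */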

static void
radv_thread_trace_finish_bo(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   if (unlikely(device->thread_trace.bo)) {
      ws->buffer_make_resident(ws, device->thread_trace.bo, false);
      ws->buffer_destroy(ws, device->thread_trace.bo);
   }
}

bool
radv_thread_trace_init(struct radv_device *device)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;

   /* Default buffer size set to 32MB per SE. */
   device->thread_trace.buffer_size =
      radv_get_int_debug_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);
   device->thread_trace.start_frame = radv_get_int_debug_option("RADV_THREAD_TRACE", -1);

   const char *trigger_file = getenv("RADV_THREAD_TRACE_TRIGGER");
   if (trigger_file)
      device->thread_trace.trigger_file = strdup(trigger_file);

   if (!radv_thread_trace_init_bo(device))
      return false;

   list_inithead(&thread_trace_data->rgp_pso_correlation.record);
   simple_mtx_init(&thread_trace_data->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&thread_trace_data->rgp_loader_events.record);
   simple_mtx_init(&thread_trace_data->rgp_loader_events.lock, mtx_plain);

   list_inithead(&thread_trace_data->rgp_code_object.record);
   simple_mtx_init(&thread_trace_data->rgp_code_object.lock, mtx_plain);

   return true;
}

void
radv_thread_trace_finish(struct radv_device *device)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
   struct radeon_winsys *ws = device->ws;

   radv_thread_trace_finish_bo(device);

   for (unsigned i = 0; i < 2; i++) {
      if (device->thread_trace.start_cs[i])
         ws->cs_destroy(device->thread_trace.start_cs[i]);
      if (device->thread_trace.stop_cs[i])
         ws->cs_destroy(device->thread_trace.stop_cs[i]);
   }

   assert(thread_trace_data->rgp_pso_correlation.record_count == 0);
   simple_mtx_destroy(&thread_trace_data->rgp_pso_correlation.lock);

   assert(thread_trace_data->rgp_loader_events.record_count == 0);
   simple_mtx_destroy(&thread_trace_data->rgp_loader_events.lock);

   assert(thread_trace_data->rgp_code_object.record_count == 0);
   simple_mtx_destroy(&thread_trace_data->rgp_code_object.lock);
}

static bool
radv_thread_trace_resize_bo(struct radv_device *device)
{
   /* Destroy the previous thread trace BO. */
   radv_thread_trace_finish_bo(device);

   /* Double the size of the thread trace buffer per SE. */
   device->thread_trace.buffer_size *= 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %d KB\n",
           device->thread_trace.buffer_size / 1024);

   /* Re-create the thread trace BO. */
   return radv_thread_trace_init_bo(device);
}

bool
radv_begin_thread_trace(struct radv_queue *queue)
{
   struct radv_device *device = queue->device;
   int family = queue->vk.queue_family_index;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous start CS and create a new one. */
   if (device->thread_trace.start_cs[family]) {
      ws->cs_destroy(device->thread_trace.start_cs[family]);
      device->thread_trace.start_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, family);
   if (!cs)
      return false;

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   }

   radv_cs_add_buffer(ws, cs, device->thread_trace.bo);

   /* Make sure to wait-for-idle before starting SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   /* Disable clock gating before starting SQTT. */
   radv_emit_inhibit_clockgating(device, cs, true);

   /* Enable SQG events that collect thread trace data. */
   radv_emit_spi_config_cntl(device, cs, true);

   /* Start SQTT. */
   radv_emit_thread_trace_start(device, cs, family);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->thread_trace.start_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}
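
/*
 * Capture flow sketch (hedged; the actual trigger logic lives in the
 * submission path, outside this file): radv_begin_thread_trace() and
 * radv_end_thread_trace() bracket the frame(s) of interest, then
 * radv_get_thread_trace() validates and packages the per-SE results:
 *
 *    struct ac_thread_trace thread_trace;
 *
 *    if (radv_begin_thread_trace(queue)) {
 *       ... submit the frame's command buffers ...
 *       if (radv_end_thread_trace(queue) &&
 *           radv_get_thread_trace(queue, &thread_trace)) {
 *          ... hand thread_trace to the RGP dumper ...
 *       }
 *    }
 */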

bool
radv_end_thread_trace(struct radv_queue *queue)
{
   struct radv_device *device = queue->device;
   int family = queue->vk.queue_family_index;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous stop CS and create a new one. */
   if (device->thread_trace.stop_cs[family]) {
      ws->cs_destroy(device->thread_trace.stop_cs[family]);
      device->thread_trace.stop_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, family);
   if (!cs)
      return false;

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   }

   radv_cs_add_buffer(ws, cs, device->thread_trace.bo);

   /* Make sure to wait-for-idle before stopping SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   /* Stop SQTT. */
   radv_emit_thread_trace_stop(device, cs, family);

   /* Restore previous state by disabling SQG events. */
   radv_emit_spi_config_cntl(device, cs, false);

   /* Restore previous state by re-enabling clock gating. */
   radv_emit_inhibit_clockgating(device, cs, false);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->thread_trace.stop_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

bool
radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_trace)
{
   struct radv_device *device = queue->device;
   struct radeon_info *rad_info = &device->physical_device->rad_info;
   unsigned max_se = rad_info->max_se;
   void *thread_trace_ptr = device->thread_trace.ptr;

   memset(thread_trace, 0, sizeof(*thread_trace));

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
      uint64_t data_offset = ac_thread_trace_get_data_offset(rad_info, &device->thread_trace, se);
      void *info_ptr = (uint8_t *)thread_trace_ptr + info_offset;
      void *data_ptr = (uint8_t *)thread_trace_ptr + data_offset;
      struct ac_thread_trace_info *info = (struct ac_thread_trace_info *)info_ptr;
      struct ac_thread_trace_se thread_trace_se = {0};
      int first_active_cu = ffs(device->physical_device->rad_info.cu_mask[se][0]);

      if (radv_se_is_disabled(device, se))
         continue;

      if (!ac_is_thread_trace_complete(&device->physical_device->rad_info, &device->thread_trace,
                                       info)) {
         if (!radv_thread_trace_resize_bo(device)) {
            fprintf(stderr, "Failed to resize the thread trace buffer.\n");
            abort();
         }
         return false;
      }

      thread_trace_se.data_ptr = data_ptr;
      thread_trace_se.info = *info;
      thread_trace_se.shader_engine = se;

      /* RGP seems to expect units of WGP on GFX10+. */
      thread_trace_se.compute_unit = device->physical_device->rad_info.chip_class >= GFX10
                                        ? (first_active_cu / 2)
                                        : first_active_cu;

      thread_trace->traces[thread_trace->num_traces] = thread_trace_se;
      thread_trace->num_traces++;
   }

   thread_trace->data = &device->thread_trace;
   return true;
}
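
/*
 * Note (hedged): when ac_is_thread_trace_complete() reports a truncated
 * trace, radv_get_thread_trace() has already doubled the BO via
 * radv_thread_trace_resize_bo() before returning false, so the caller is
 * expected to retry the whole begin/end capture with the larger buffer
 * rather than consume the partial data.
 */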