si_sqtt.c revision 7ec681f3
/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#include "si_pipe.h"
#include "si_build_pm4.h"
#include "si_compute.h"

#include "ac_rgp.h"
#include "ac_sqtt.h"
#include "util/u_memory.h"
#include "tgsi/tgsi_from_mesa.h"

static void
si_emit_spi_config_cntl(struct si_context* sctx,
                        struct radeon_cmdbuf *cs, bool enable);

static bool
si_thread_trace_init_bo(struct si_context *sctx)
{
   unsigned max_se = sctx->screen->info.max_se;
   struct radeon_winsys *ws = sctx->ws;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   sctx->thread_trace->buffer_size = align64(sctx->thread_trace->buffer_size,
                                             1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_thread_trace_info) * max_se,
                  1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += sctx->thread_trace->buffer_size * (uint64_t)max_se;

   sctx->thread_trace->bo =
      ws->buffer_create(ws, size, 4096,
                        RADEON_DOMAIN_VRAM,
                        RADEON_FLAG_NO_INTERPROCESS_SHARING |
                        RADEON_FLAG_GTT_WC |
                        RADEON_FLAG_NO_SUBALLOC);
   if (!sctx->thread_trace->bo)
      return false;

   return true;
}

static bool
si_se_is_disabled(struct si_context* sctx, unsigned se)
{
   /* No active CU on the SE means it is disabled. */
   return sctx->screen->info.cu_mask[se][0] == 0;
}


static void
si_emit_thread_trace_start(struct si_context* sctx,
                           struct radeon_cmdbuf *cs,
                           uint32_t queue_family_index)
{
   struct si_screen *sscreen = sctx->screen;
   uint32_t shifted_size = sctx->thread_trace->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   unsigned max_se = sscreen->info.max_se;

   radeon_begin(cs);

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
      uint64_t data_va = ac_thread_trace_get_data_va(&sctx->screen->info, sctx->thread_trace, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;

      if (si_se_is_disabled(sctx, se))
         continue;

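      /* Buffer layout sketch (derived from si_thread_trace_init_bo, spelled
       * out here for readability): the BO starts with max_se
       * ac_thread_trace_info structs padded to the SQTT alignment, followed
       * by one buffer_size slice of trace data per SE. E.g. with 4 SEs and
       * the default 1 MiB buffer, SE2's data starts at
       * va + align(4 * sizeof(info), 4096) + 2 MiB. */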
      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      /* Select the first active CU. */
      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      if (sctx->chip_class >= GFX10) {
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                                          S_008D04_SIZE(shifted_size) |
                                          S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         int wgp = first_active_cu / 2;
         radeon_set_privileged_config_reg(R_008D14_SQ_THREAD_TRACE_MASK,
                                          S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
                                          S_008D14_SA_SEL(0) |
                                          S_008D14_WGP_SEL(wgp) |
                                          S_008D14_SIMD_SEL(0));

         radeon_set_privileged_config_reg(R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
                                          S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC |
                                                               V_008D18_REG_INCLUDE_SHDEC |
                                                               V_008D18_REG_INCLUDE_GFXUDEC |
                                                               V_008D18_REG_INCLUDE_CONTEXT |
                                                               V_008D18_REG_INCLUDE_COMP |
                                                               V_008D18_REG_INCLUDE_CONFIG) |
                                          S_008D18_TOKEN_EXCLUDE(V_008D18_TOKEN_EXCLUDE_PERF));

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(1) |
                                          S_008D1C_HIWATER(5) |
                                          S_008D1C_UTIL_TIMER(1) |
                                          S_008D1C_RT_FREQ(2) | /* 4096 clk */
                                          S_008D1C_DRAW_EVENT_EN(1) |
                                          S_008D1C_REG_STALL_EN(1) |
                                          S_008D1C_SPI_STALL_EN(1) |
                                          S_008D1C_SQ_STALL_EN(1) |
                                          S_008D1C_REG_DROP_ON_STALL(0) |
                                          S_008D1C_LOWATER_OFFSET(
                                             sctx->chip_class >= GFX10_3 ? 4 : 0));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(R_030CC4_SQ_THREAD_TRACE_SIZE,
                                S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(R_030CD4_SQ_THREAD_TRACE_CTRL,
                                S_030CD4_RESET_BUFFER(1));

         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) |
                                      S_030CC8_SH_SEL(0) |
                                      S_030CC8_SIMD_EN(0xf) |
                                      S_030CC8_VM_ID_MASK(0) |
                                      S_030CC8_REG_STALL_EN(1) |
                                      S_030CC8_SPI_STALL_EN(1) |
                                      S_030CC8_SQ_STALL_EN(1);

         radeon_set_uconfig_reg(R_030CC8_SQ_THREAD_TRACE_MASK,
                                thread_trace_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                                S_030CCC_TOKEN_MASK(0xbfff) |
                                S_030CCC_REG_MASK(0xff) |
                                S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) |
                                S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(R_030CEC_SQ_THREAD_TRACE_HIWATER,
                                S_030CEC_HIWATER(4));

         if (sctx->chip_class == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(R_030CE8_SQ_THREAD_TRACE_STATUS,
                                   S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t thread_trace_mode =
            S_030CD8_MASK_PS(1) |
            S_030CD8_MASK_VS(1) |
            S_030CD8_MASK_GS(1) |
            S_030CD8_MASK_ES(1) |
            S_030CD8_MASK_HS(1) |
            S_030CD8_MASK_LS(1) |
            S_030CD8_MASK_CS(1) |
            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
            S_030CD8_MODE(1);

         if (sctx->chip_class == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                thread_trace_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
   radeon_end();
}

static const uint32_t gfx9_thread_trace_info_regs[] =
{
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_thread_trace_info_regs[] =
{
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static void
si_copy_thread_trace_info_regs(struct si_context* sctx,
                               struct radeon_cmdbuf *cs,
                               unsigned se_index)
{
   const uint32_t *thread_trace_info_regs = NULL;

   switch (sctx->chip_class) {
   case GFX10_3:
   case GFX10:
      thread_trace_info_regs = gfx10_thread_trace_info_regs;
      break;
   case GFX9:
      thread_trace_info_regs = gfx9_thread_trace_info_regs;
      break;
   default:
      unreachable("Unsupported chip_class");
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = sctx->ws->buffer_get_virtual_address(sctx->thread_trace->bo);
   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);

   radeon_begin(cs);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(COPY_DATA_SRC_SEL(COPY_DATA_PERF) |
                  COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                  COPY_DATA_WR_CONFIRM);
      radeon_emit(thread_trace_info_regs[i] >> 2);
      radeon_emit(0); /* unused */
      radeon_emit((info_va + i * 4));
      radeon_emit((info_va + i * 4) >> 32);
   }
   radeon_end();
}

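/* Stopping is more involved than starting: the function below emits the
 * THREAD_TRACE_STOP/FINISH events, then for each SE waits for the trace to
 * finish (FINISH_DONE on GFX10+), disables the trace, waits for BUSY to
 * clear, and finally copies the info registers back with
 * si_copy_thread_trace_info_regs so the CPU can read the trace size and
 * status. */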
static void
si_emit_thread_trace_stop(struct si_context *sctx,
                          struct radeon_cmdbuf *cs,
                          uint32_t queue_family_index)
{
   unsigned max_se = sctx->screen->info.max_se;

   radeon_begin(cs);

   /* Stop the thread trace with a different event based on the queue. */
   if (queue_family_index == RING_COMPUTE) {
      radeon_set_sh_reg(R_00B878_COMPUTE_THREAD_TRACE_ENABLE,
                        S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
   radeon_end();

   for (unsigned se = 0; se < max_se; se++) {
      if (si_se_is_disabled(sctx, se))
         continue;

      radeon_begin(cs);

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                             S_030800_SE_INDEX(se) |
                             S_030800_SH_INDEX(0) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (sctx->chip_class >= GFX10) {
         /* Make sure to wait for the trace buffer. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_NOT_EQUAL); /* wait until the register differs from the reference value */
         radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(S_008D20_FINISH_DONE(1)); /* mask */
         radeon_emit(4); /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          S_008D1C_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(S_008D20_BUSY(1)); /* mask */
         radeon_emit(4); /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(R_030CD8_SQ_THREAD_TRACE_MODE,
                                S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(0);
         radeon_emit(0); /* reference value */
         radeon_emit(S_030CE8_BUSY(1)); /* mask */
         radeon_emit(4); /* poll interval */
      }
      radeon_end();

      si_copy_thread_trace_info_regs(sctx, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_begin_again(cs);
   radeon_set_uconfig_reg(R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) |
                          S_030800_SH_BROADCAST_WRITES(1) |
                          S_030800_INSTANCE_BROADCAST_WRITES(1));
   radeon_end();
}

static void
si_thread_trace_start(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM,
                     0);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before starting SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_inhibit_clockgating(sctx, cs, true);

   /* Enable SQG events that collect thread trace data. */
   si_emit_spi_config_cntl(sctx, cs, true);

   si_emit_thread_trace_start(sctx, cs, family);
}

static void
si_thread_trace_stop(struct si_context *sctx, int family, struct radeon_cmdbuf *cs)
{
   struct radeon_winsys *ws = sctx->ws;

   radeon_begin(cs);

   switch (family) {
   case RING_GFX:
      radeon_emit(PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RING_COMPUTE:
      radeon_emit(PKT3(PKT3_NOP, 0, 0));
      radeon_emit(0);
      break;
   }
   radeon_end();

   ws->cs_add_buffer(cs,
                     sctx->thread_trace->bo,
                     RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM,
                     0);

   si_cp_dma_wait_for_idle(sctx, cs);

   /* Make sure to wait-for-idle before stopping SQTT. */
   sctx->flags |=
      SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
      SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
      SI_CONTEXT_INV_L2 | SI_CONTEXT_PFP_SYNC_ME;
   sctx->emit_cache_flush(sctx, cs);

   si_emit_thread_trace_stop(sctx, cs, family);

   /* Restore previous state by disabling SQG events. */
   si_emit_spi_config_cntl(sctx, cs, false);

   si_inhibit_clockgating(sctx, cs, false);
}

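/* The start/stop command streams are recorded once at init time and then
 * re-used for every capture: si_begin_thread_trace and si_end_thread_trace
 * further below simply flush the prebuilt CS. */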
static void
si_thread_trace_init_cs(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   /* Thread trace start CS (only handles RING_GFX). */
   sctx->thread_trace->start_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->start_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_start(sctx, RING_GFX, sctx->thread_trace->start_cs[RING_GFX]);

   /* Thread trace stop CS. */
   sctx->thread_trace->stop_cs[RING_GFX] = CALLOC_STRUCT(radeon_cmdbuf);
   if (!ws->cs_create(sctx->thread_trace->stop_cs[RING_GFX],
                      sctx->ctx, RING_GFX, NULL, NULL, 0)) {
      free(sctx->thread_trace->start_cs[RING_GFX]);
      sctx->thread_trace->start_cs[RING_GFX] = NULL;
      free(sctx->thread_trace->stop_cs[RING_GFX]);
      sctx->thread_trace->stop_cs[RING_GFX] = NULL;
      return;
   }

   si_thread_trace_stop(sctx, RING_GFX, sctx->thread_trace->stop_cs[RING_GFX]);
}

static void
si_begin_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->start_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, NULL);
}

static void
si_end_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   struct radeon_cmdbuf *cs = sctx->thread_trace->stop_cs[RING_GFX];
   sctx->ws->cs_flush(cs, 0, &sctx->last_sqtt_fence);
}

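/* Unit note for the size check below (a reading of the arithmetic, not
 * documented here): info->cur_offset counts 32-byte chunks, so
 * cur_offset * 32 / 1024 converts the amount of data the HW wrote into KB,
 * matching the KB-based error message. */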
static bool
si_get_thread_trace(struct si_context *sctx,
                    struct ac_thread_trace *thread_trace)
{
   unsigned max_se = sctx->screen->info.max_se;

   memset(thread_trace, 0, sizeof(*thread_trace));
   thread_trace->num_traces = max_se;

   sctx->thread_trace->ptr = sctx->ws->buffer_map(sctx->ws, sctx->thread_trace->bo,
                                                  NULL,
                                                  PIPE_MAP_READ);

   if (!sctx->thread_trace->ptr)
      return false;

   void *thread_trace_ptr = sctx->thread_trace->ptr;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
      uint64_t data_offset = ac_thread_trace_get_data_offset(&sctx->screen->info, sctx->thread_trace, se);
      void *info_ptr = thread_trace_ptr + info_offset;
      void *data_ptr = thread_trace_ptr + data_offset;
      struct ac_thread_trace_info *info =
         (struct ac_thread_trace_info *)info_ptr;

      struct ac_thread_trace_se thread_trace_se = {0};

      if (!ac_is_thread_trace_complete(&sctx->screen->info, sctx->thread_trace, info)) {
         uint32_t expected_size =
            ac_get_expected_buffer_size(&sctx->screen->info, info);
         uint32_t available_size = (info->cur_offset * 32) / 1024;

         fprintf(stderr, "Failed to get the thread trace "
                         "because the buffer is too small. The "
                         "hardware needs %d KB but the "
                         "buffer size is %d KB.\n",
                 expected_size, available_size);
         fprintf(stderr, "Please update the buffer size with "
                         "AMD_THREAD_TRACE_BUFFER_SIZE=<size_in_kbytes>\n");
         return false;
      }

      thread_trace_se.data_ptr = data_ptr;
      thread_trace_se.info = *info;
      thread_trace_se.shader_engine = se;

      int first_active_cu = ffs(sctx->screen->info.cu_mask[se][0]);

      /* For GFX10+ compute_unit really means WGP. */
      thread_trace_se.compute_unit =
         sctx->screen->info.chip_class >= GFX10 ? (first_active_cu / 2) : first_active_cu;

      thread_trace->traces[se] = thread_trace_se;
   }

   thread_trace->data = sctx->thread_trace;
   return true;
}


bool
si_init_thread_trace(struct si_context *sctx)
{
   static bool warn_once = true;
   if (warn_once) {
      fprintf(stderr, "*************************************************\n");
      fprintf(stderr, "* WARNING: Thread trace support is experimental *\n");
      fprintf(stderr, "*************************************************\n");
      warn_once = false;
   }

   sctx->thread_trace = CALLOC_STRUCT(ac_thread_trace_data);

   if (sctx->chip_class < GFX8) {
      fprintf(stderr, "GPU hardware not supported: refer to "
                      "the RGP documentation for the list of "
                      "supported GPUs!\n");
      return false;
   }

   if (sctx->chip_class > GFX10_3) {
      fprintf(stderr, "radeonsi: Thread trace is not supported "
                      "for that GPU!\n");
      return false;
   }

   /* Default buffer size set to 1MB per SE. */
   sctx->thread_trace->buffer_size = debug_get_num_option("AMD_THREAD_TRACE_BUFFER_SIZE", 1024) * 1024;
   sctx->thread_trace->start_frame = 10;

   const char *trigger = getenv("AMD_THREAD_TRACE_TRIGGER");
   if (trigger) {
      sctx->thread_trace->start_frame = atoi(trigger);
      if (sctx->thread_trace->start_frame <= 0) {
         /* This isn't a frame number, must be a file */
         sctx->thread_trace->trigger_file = strdup(trigger);
         sctx->thread_trace->start_frame = -1;
      }
   }

   if (!si_thread_trace_init_bo(sctx))
      return false;

   list_inithead(&sctx->thread_trace->rgp_pso_correlation.record);
   simple_mtx_init(&sctx->thread_trace->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_loader_events.record);
   simple_mtx_init(&sctx->thread_trace->rgp_loader_events.lock, mtx_plain);

   list_inithead(&sctx->thread_trace->rgp_code_object.record);
   simple_mtx_init(&sctx->thread_trace->rgp_code_object.lock, mtx_plain);

   si_thread_trace_init_cs(sctx);

   sctx->sqtt_next_event = EventInvalid;

   return true;
}

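/* Environment knobs handled by si_init_thread_trace above (example values,
 * for illustration only):
 *   AMD_THREAD_TRACE_BUFFER_SIZE=4096  use a 4 MiB per-SE buffer (size in KB)
 *   AMD_THREAD_TRACE_TRIGGER=20        capture at frame 20
 *   AMD_THREAD_TRACE_TRIGGER=/tmp/tt   capture when that file exists
 *                                      (the file is removed afterwards)
 */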
void
si_destroy_thread_trace(struct si_context *sctx)
{
   struct si_screen *sscreen = sctx->screen;
   struct pb_buffer *bo = sctx->thread_trace->bo;
   radeon_bo_reference(sctx->screen->ws, &bo, NULL);

   if (sctx->thread_trace->trigger_file)
      free(sctx->thread_trace->trigger_file);

   sscreen->ws->cs_destroy(sctx->thread_trace->start_cs[RING_GFX]);
   sscreen->ws->cs_destroy(sctx->thread_trace->stop_cs[RING_GFX]);

   struct rgp_pso_correlation *pso_correlation = &sctx->thread_trace->rgp_pso_correlation;
   struct rgp_loader_events *loader_events = &sctx->thread_trace->rgp_loader_events;
   struct rgp_code_object *code_object = &sctx->thread_trace->rgp_code_object;
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &pso_correlation->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_pso_correlation.lock);

   list_for_each_entry_safe(struct rgp_loader_events_record, record,
                            &loader_events->record, list) {
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_loader_events.lock);

   list_for_each_entry_safe(struct rgp_code_object_record, record,
                            &code_object->record, list) {
      uint32_t mask = record->shader_stages_mask;
      int i;

      /* Free the disassembly. */
      while (mask) {
         i = u_bit_scan(&mask);
         free(record->shader_data[i].code);
      }
      list_del(&record->list);
      free(record);
   }
   simple_mtx_destroy(&sctx->thread_trace->rgp_code_object.lock);

   free(sctx->thread_trace);
   sctx->thread_trace = NULL;
}

static uint64_t num_frames = 0;

void
si_handle_thread_trace(struct si_context *sctx, struct radeon_cmdbuf *rcs)
{
   /* Should we enable SQTT yet? */
   if (!sctx->thread_trace_enabled) {
      bool frame_trigger = num_frames == sctx->thread_trace->start_frame;
      bool file_trigger = false;
      if (sctx->thread_trace->trigger_file &&
          access(sctx->thread_trace->trigger_file, W_OK) == 0) {
         if (unlink(sctx->thread_trace->trigger_file) == 0) {
            file_trigger = true;
         } else {
            /* Do not enable tracing if we cannot remove the file,
             * because by then we'll trace every frame.
             */
            fprintf(stderr, "radeonsi: could not remove thread trace trigger file, ignoring\n");
         }
      }

      if (frame_trigger || file_trigger) {
         /* Wait for last submission */
         sctx->ws->fence_wait(sctx->ws, sctx->last_gfx_fence, PIPE_TIMEOUT_INFINITE);

         /* Start SQTT */
         si_begin_thread_trace(sctx, rcs);

         sctx->thread_trace_enabled = true;
         sctx->thread_trace->start_frame = -1;

         /* Force shader update to make sure si_sqtt_describe_pipeline_bind is called
          * for the current "pipeline".
          */
         sctx->do_update_shaders = true;
      }
   } else {
      struct ac_thread_trace thread_trace = {0};

      /* Stop SQTT */
      si_end_thread_trace(sctx, rcs);
      sctx->thread_trace_enabled = false;
      sctx->thread_trace->start_frame = -1;
      assert (sctx->last_sqtt_fence);

      /* Wait for SQTT to finish and read back the bo */
      if (sctx->ws->fence_wait(sctx->ws, sctx->last_sqtt_fence, PIPE_TIMEOUT_INFINITE) &&
          si_get_thread_trace(sctx, &thread_trace)) {
         ac_dump_rgp_capture(&sctx->screen->info, &thread_trace);
      } else {
         fprintf(stderr, "Failed to read the trace\n");
      }
   }

   num_frames++;
}


static void
si_emit_thread_trace_userdata(struct si_context* sctx,
                              struct radeon_cmdbuf *cs,
                              const void *data, uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   radeon_begin(cs);

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      radeon_set_uconfig_reg_seq(R_030D08_SQ_THREAD_TRACE_USERDATA_2, count, sctx->chip_class >= GFX10);

      radeon_emit_array(dwords, count);

      dwords += count;
      num_dwords -= count;
   }
   radeon_end();
}

static void
si_emit_spi_config_cntl(struct si_context* sctx,
                        struct radeon_cmdbuf *cs, bool enable)
{
   radeon_begin(cs);

   if (sctx->chip_class >= GFX9) {
      uint32_t spi_config_cntl = S_031100_GPR_WRITE_PRIORITY(0x2c688) |
                                 S_031100_EXP_PRIORITY_ORDER(3) |
                                 S_031100_ENABLE_SQG_TOP_EVENTS(enable) |
                                 S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (sctx->chip_class >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(R_009100_SPI_CONFIG_CNTL,
                                       S_009100_ENABLE_SQG_TOP_EVENTS(enable) |
                                       S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
   radeon_end();
}

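/* Everything below implements SQTT markers: small fixed-layout structs
 * pushed through si_emit_thread_trace_userdata, two dwords at a time, into
 * SQ_THREAD_TRACE_USERDATA_2/3 so that they get interleaved with the trace
 * stream. RGP uses them to correlate draws, dispatches, barriers and
 * pipeline binds with API-level information. */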
static uint32_t num_events = 0;
void
si_sqtt_write_event_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                           enum rgp_sqtt_marker_event_type api_type,
                           uint32_t vertex_offset_user_data,
                           uint32_t instance_offset_user_data,
                           uint32_t draw_index_user_data)
{
   struct rgp_sqtt_marker_event marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.api_type = api_type == EventInvalid ? EventCmdDraw : api_type;
   marker.cmd_id = num_events++;
   marker.cb_id = 0;

   if (vertex_offset_user_data == UINT_MAX ||
       instance_offset_user_data == UINT_MAX) {
      vertex_offset_user_data = 0;
      instance_offset_user_data = 0;
   }

   if (draw_index_user_data == UINT_MAX)
      draw_index_user_data = vertex_offset_user_data;

   marker.vertex_offset_reg_idx = vertex_offset_user_data;
   marker.instance_offset_reg_idx = instance_offset_user_data;
   marker.draw_index_reg_idx = draw_index_user_data;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);

   sctx->sqtt_next_event = EventInvalid;
}

void
si_write_event_with_dims_marker(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                                enum rgp_sqtt_marker_event_type api_type,
                                uint32_t x, uint32_t y, uint32_t z)
{
   struct rgp_sqtt_marker_event_with_dims marker = {0};

   marker.event.identifier = RGP_SQTT_MARKER_IDENTIFIER_EVENT;
   marker.event.api_type = api_type;
   marker.event.cmd_id = num_events++;
   marker.event.cb_id = 0;
   marker.event.has_thread_dims = 1;

   marker.thread_x = x;
   marker.thread_y = y;
   marker.thread_z = z;

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   sctx->sqtt_next_event = EventInvalid;
}

void
si_sqtt_describe_barrier_start(struct si_context* sctx, struct radeon_cmdbuf *rcs)
{
   struct rgp_sqtt_marker_barrier_start marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_START;
   marker.cb_id = 0;
   marker.dword02 = 0xC0000000 + 10; /* RGP_BARRIER_INTERNAL_BASE */

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

void
si_sqtt_describe_barrier_end(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                             unsigned flags)
{
   struct rgp_sqtt_marker_barrier_end marker = {0};

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BARRIER_END;
   marker.cb_id = 0;

   if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH)
      marker.vs_partial_flush = true;
   if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH)
      marker.ps_partial_flush = true;
   if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH)
      marker.cs_partial_flush = true;

   if (flags & SI_CONTEXT_PFP_SYNC_ME)
      marker.pfp_sync_me = true;

   if (flags & SI_CONTEXT_INV_VCACHE)
      marker.inval_tcp = true;
   if (flags & SI_CONTEXT_INV_ICACHE)
      marker.inval_sqI = true;
   if (flags & SI_CONTEXT_INV_SCACHE)
      marker.inval_sqK = true;
   if (flags & SI_CONTEXT_INV_L2)
      marker.inval_tcc = true;

   if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
      marker.inval_cb = true;
      marker.flush_cb = true;
   }
   if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
      marker.inval_db = true;
      marker.flush_db = true;
   }

   si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
}

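/* si_write_user_event below handles two marker shapes: a bare
 * rgp_sqtt_marker_user_event for pops (no payload), and a variable-length
 * marker for pushes/triggers whose string payload is clamped to 1024 bytes
 * and padded to a dword multiple. */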
void
si_write_user_event(struct si_context* sctx, struct radeon_cmdbuf *rcs,
                    enum rgp_sqtt_marker_user_event_type type,
                    const char *str, int len)
{
   if (type == UserEventPop) {
      assert (str == NULL);
      struct rgp_sqtt_marker_user_event marker = { 0 };
      marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.data_type = type;

      si_emit_thread_trace_userdata(sctx, rcs, &marker, sizeof(marker) / 4);
   } else {
      assert (str != NULL);
      struct rgp_sqtt_marker_user_event_with_length marker = { 0 };
      marker.user_event.identifier = RGP_SQTT_MARKER_IDENTIFIER_USER_EVENT;
      marker.user_event.data_type = type;
      len = MIN2(1024, len);
      marker.length = align(len, 4);

      uint8_t *buffer = alloca(sizeof(marker) + marker.length);
      memcpy(buffer, &marker, sizeof(marker));
      memcpy(buffer + sizeof(marker), str, len);
      buffer[sizeof(marker) + len - 1] = '\0';

      si_emit_thread_trace_userdata(sctx, rcs, buffer, sizeof(marker) / 4 + marker.length / 4);
   }
}


bool
si_sqtt_pipeline_is_registered(struct ac_thread_trace_data *thread_trace_data,
                               uint64_t pipeline_hash)
{
   simple_mtx_lock(&thread_trace_data->rgp_pso_correlation.lock);
   list_for_each_entry_safe(struct rgp_pso_correlation_record, record,
                            &thread_trace_data->rgp_pso_correlation.record, list) {
      if (record->pipeline_hash[0] == pipeline_hash) {
         simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);
         return true;
      }
   }
   simple_mtx_unlock(&thread_trace_data->rgp_pso_correlation.lock);

   return false;
}



static enum rgp_hardware_stages
si_sqtt_pipe_to_rgp_shader_stage(struct si_shader_key* key, enum pipe_shader_type stage)
{
   switch (stage) {
   case PIPE_SHADER_VERTEX:
      if (key->as_ls)
         return RGP_HW_STAGE_LS;
      else if (key->as_es)
         return RGP_HW_STAGE_ES;
      else if (key->as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_TESS_CTRL:
      return RGP_HW_STAGE_HS;
   case PIPE_SHADER_TESS_EVAL:
      if (key->as_es)
         return RGP_HW_STAGE_ES;
      else if (key->as_ngg)
         return RGP_HW_STAGE_GS;
      else
         return RGP_HW_STAGE_VS;
   case PIPE_SHADER_GEOMETRY:
      return RGP_HW_STAGE_GS;
   case PIPE_SHADER_FRAGMENT:
      return RGP_HW_STAGE_PS;
   case PIPE_SHADER_COMPUTE:
      return RGP_HW_STAGE_CS;
   default:
      unreachable("invalid mesa shader stage");
   }
}

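/* Note on the mapping above: with NGG (as_ngg), vertex and tess-eval
 * shaders execute on the hardware GS stage, which is why both map to
 * RGP_HW_STAGE_GS; as_ls/as_es mark shaders compiled to feed the
 * tessellation (LS->HS) and geometry (ES->GS) hardware stages. */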
static bool
si_sqtt_add_code_object(struct si_context* sctx,
                        uint64_t pipeline_hash,
                        bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;
   struct rgp_code_object *code_object = &thread_trace_data->rgp_code_object;
   struct rgp_code_object_record *record;

   record = malloc(sizeof(struct rgp_code_object_record));
   if (!record)
      return false;

   record->shader_stages_mask = 0;
   record->num_shaders_combined = 0;
   record->pipeline_hash[0] = pipeline_hash;
   record->pipeline_hash[1] = pipeline_hash;

   for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) {
      struct si_shader *shader;
      enum rgp_hardware_stages hw_stage;

      if (is_compute) {
         if (i != PIPE_SHADER_COMPUTE)
            continue;
         shader = &sctx->cs_shader_state.program->shader;
         hw_stage = RGP_HW_STAGE_CS;
      } else if (i != PIPE_SHADER_COMPUTE) {
         if (!sctx->shaders[i].cso || !sctx->shaders[i].current)
            continue;
         shader = sctx->shaders[i].current;
         hw_stage = si_sqtt_pipe_to_rgp_shader_stage(&shader->key, i);
      } else {
         continue;
      }

      uint8_t *code = malloc(shader->binary.uploaded_code_size);
      if (!code) {
         free(record);
         return false;
      }
      memcpy(code, shader->binary.uploaded_code, shader->binary.uploaded_code_size);

      uint64_t va = shader->bo->gpu_address;
      unsigned gl_shader_stage = tgsi_processor_to_shader_stage(i);
      record->shader_data[gl_shader_stage].hash[0] = _mesa_hash_data(code, shader->binary.uploaded_code_size);
      record->shader_data[gl_shader_stage].hash[1] = record->shader_data[gl_shader_stage].hash[0];
      record->shader_data[gl_shader_stage].code_size = shader->binary.uploaded_code_size;
      record->shader_data[gl_shader_stage].code = code;
      record->shader_data[gl_shader_stage].vgpr_count = shader->config.num_vgprs;
      record->shader_data[gl_shader_stage].sgpr_count = shader->config.num_sgprs;
      record->shader_data[gl_shader_stage].base_address = va & 0xffffffffffff;
      record->shader_data[gl_shader_stage].elf_symbol_offset = 0;
      record->shader_data[gl_shader_stage].hw_stage = hw_stage;
      record->shader_data[gl_shader_stage].is_combined = false;
      record->shader_data[gl_shader_stage].scratch_memory_size = shader->config.scratch_bytes_per_wave;
      record->shader_data[gl_shader_stage].wavefront_size = si_get_shader_wave_size(shader);

      record->shader_stages_mask |= 1 << gl_shader_stage;
      record->num_shaders_combined++;
   }

   simple_mtx_lock(&code_object->lock);
   list_addtail(&record->list, &code_object->record);
   code_object->record_count++;
   simple_mtx_unlock(&code_object->lock);

   return true;
}

bool
si_sqtt_register_pipeline(struct si_context* sctx, uint64_t pipeline_hash, uint64_t base_address, bool is_compute)
{
   struct ac_thread_trace_data *thread_trace_data = sctx->thread_trace;

   assert (!si_sqtt_pipeline_is_registered(thread_trace_data, pipeline_hash));

   bool result = ac_sqtt_add_pso_correlation(thread_trace_data, pipeline_hash);
   if (!result)
      return false;

   result = ac_sqtt_add_code_object_loader_event(thread_trace_data, pipeline_hash, base_address);
   if (!result)
      return false;

   return si_sqtt_add_code_object(sctx, pipeline_hash, is_compute);
}

void
si_sqtt_describe_pipeline_bind(struct si_context* sctx,
                               uint64_t pipeline_hash,
                               int bind_point)
{
   struct rgp_sqtt_marker_pipeline_bind marker = {0};
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;

   if (likely(!sctx->thread_trace_enabled)) {
      return;
   }

   marker.identifier = RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE;
   marker.cb_id = 0;
   marker.bind_point = bind_point;
   marker.api_pso_hash[0] = pipeline_hash;
   marker.api_pso_hash[1] = pipeline_hash >> 32;

   si_emit_thread_trace_userdata(sctx, cs, &marker, sizeof(marker) / 4);
}