/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "radv_cs.h"
#include "radv_private.h"
#include "sid.h"

/* The SQTT buffer base/size registers take values shifted by 12 bits, i.e.
 * both must be aligned to a 4 KiB boundary. */
#define SQTT_BUFFER_ALIGN_SHIFT 12

/* Instruction timing tokens are opt-in via an environment variable because
 * they significantly increase SQTT traffic (see the token_exclude logic in
 * radv_emit_thread_trace_start()). */
bool
radv_is_instruction_timing_enabled(void)
{
   return getenv("RADV_THREAD_TRACE_PIPELINE");
}

static bool
radv_se_is_disabled(struct radv_device *device, unsigned se)
{
   /* No active CU on the SE means it is disabled. */
   return device->physical_device->rad_info.cu_mask[se][0] == 0;
}

/* Build the SQ_THREAD_TRACE_CTRL value used on GFX10+ to enable or disable
 * the thread trace (MODE bit), with the stall/event settings RGP expects. */
static uint32_t
gfx10_get_thread_trace_ctrl(struct radv_device *device, bool enable)
{
   uint32_t thread_trace_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) |
                                S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) | /* 4096 clk */
                                S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) |
                                S_008D1C_SPI_STALL_EN(1) | S_008D1C_SQ_STALL_EN(1) |
                                S_008D1C_REG_DROP_ON_STALL(0);

   if (device->physical_device->rad_info.chip_class == GFX10_3)
      thread_trace_ctrl |= S_008D1C_LOWATER_OFFSET(4);

   return thread_trace_ctrl;
}

/* Program the per-SE SQTT registers (buffer location, token/register masks)
 * and start the trace. Register layouts differ between GFX10+ (privileged
 * 0x8Dxx registers) and GFX8/GFX9 (uconfig 0x30Cxx registers), hence the two
 * branches. Callers are expected to wait for idle first (see
 * radv_begin_thread_trace()). */
static void
radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *cs,
                             uint32_t queue_family_index)
{
   uint32_t shifted_size = device->thread_trace.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   struct radeon_info *rad_info = &device->physical_device->rad_info;
   unsigned max_se = rad_info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
      uint64_t data_va = ac_thread_trace_get_data_va(rad_info, &device->thread_trace, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
      int first_active_cu = ffs(device->physical_device->rad_info.cu_mask[se][0]);

      if (radv_se_is_disabled(device, se))
         continue;

      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(
         cs, R_030800_GRBM_GFX_INDEX,
         S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.chip_class >= GFX10) {
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(
            cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
            S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(cs, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         radeon_set_privileged_config_reg(
            cs, R_008D14_SQ_THREAD_TRACE_MASK,
            S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
               S_008D14_SA_SEL(0) | S_008D14_WGP_SEL(first_active_cu / 2) | S_008D14_SIMD_SEL(0));

         uint32_t thread_trace_token_mask = S_008D18_REG_INCLUDE(
            V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC | V_008D18_REG_INCLUDE_GFXUDEC |
            V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;

         if (!radv_is_instruction_timing_enabled()) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC |
                             V_008D18_TOKEN_EXCLUDE_ALUEXEC |
                             V_008D18_TOKEN_EXCLUDE_VALUINST |
                             V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                             V_008D18_TOKEN_EXCLUDE_INST;
         }
         thread_trace_token_mask |= S_008D18_TOKEN_EXCLUDE(token_exclude);

         radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
                                          thread_trace_token_mask);

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          gfx10_get_thread_trace_ctrl(device, true));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(cs, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(cs, R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));

         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) | S_030CC8_SH_SEL(0) |
                                      S_030CC8_SIMD_EN(0xf) | S_030CC8_VM_ID_MASK(0) |
                                      S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
                                      S_030CC8_SQ_STALL_EN(1);

         if (device->physical_device->rad_info.chip_class < GFX9) {
            thread_trace_mask |= S_030CC8_RANDOM_SEED(0xffff);
         }

         radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, thread_trace_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(
            cs, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
            S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(cs, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(cs, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(cs, R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4));

         if (device->physical_device->rad_info.chip_class == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(cs, R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t thread_trace_mode =
            S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
            S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
            S_030CD8_MODE(1);

         if (device->physical_device->rad_info.chip_class == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, thread_trace_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (queue_family_index == RADV_QUEUE_COMPUTE &&
       device->physical_device->rad_info.chip_class >= GFX7) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
}

/* Per-generation lists of the three status registers (write pointer, status,
 * dropped/packet counter) copied back into the info struct after a trace is
 * stopped. Each list must stay in the same order: the copy loop below and the
 * consumer of struct ac_thread_trace_info rely on the index positions. */
static const uint32_t gfx8_thread_trace_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_008E40_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx9_thread_trace_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_thread_trace_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

/* Emit COPY_DATA packets that read the SQTT info registers for one SE and
 * store them into the BO, where radv_get_thread_trace() reads them on the CPU.
 * GRBM must already be targeting the SE (the caller sets GRBM_GFX_INDEX). */
static void
radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbuf *cs,
                                 unsigned se_index)
{
   const uint32_t *thread_trace_info_regs = NULL;

   if (device->physical_device->rad_info.chip_class >= GFX10) {
      thread_trace_info_regs = gfx10_thread_trace_info_regs;
   } else if (device->physical_device->rad_info.chip_class == GFX9) {
      thread_trace_info_regs = gfx9_thread_trace_info_regs;
   } else {
      assert(device->physical_device->rad_info.chip_class == GFX8);
      thread_trace_info_regs = gfx8_thread_trace_info_regs;
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, thread_trace_info_regs[i] >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, (info_va + i * 4));
      radeon_emit(cs, (info_va + i * 4) >> 32);
   }
}

/* Emit the stop event, then per SE: wait for the trace to finish, disable the
 * trace mode, wait for the SQTT block to go non-busy, and copy the info
 * registers back to memory. Mirrors radv_emit_thread_trace_start(). */
static void
radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs,
                            uint32_t queue_family_index)
{
   unsigned max_se = device->physical_device->rad_info.max_se;

   /* Stop the thread trace with a different event based on the queue. */
   if (queue_family_index == RADV_QUEUE_COMPUTE &&
       device->physical_device->rad_info.chip_class >= GFX7) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));

   for (unsigned se = 0; se < max_se; se++) {
      if (radv_se_is_disabled(device, se))
         continue;

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(
         cs, R_030800_GRBM_GFX_INDEX,
         S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.chip_class >= GFX10) {
         /* Make sure to wait for the trace buffer. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_NOT_EQUAL); /* wait until FINISH_DONE != 0 */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);                      /* reference value */
         radeon_emit(cs, ~C_008D20_FINISH_DONE);  /* mask */
         radeon_emit(cs, 4);                      /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          gfx10_get_thread_trace_ctrl(device, false));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until BUSY == 0 */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value */
         radeon_emit(cs, ~C_008D20_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until BUSY == 0 */
         radeon_emit(cs, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value */
         radeon_emit(cs, ~C_030CE8_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      }

      radv_copy_thread_trace_info_regs(device, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));
}

/* Insert user markers into the SQTT stream by writing the data to the
 * SQ_THREAD_TRACE_USERDATA registers, at most 2 dwords per packet
 * (USERDATA_2 and USERDATA_3 are written as a consecutive pair). */
void
radv_emit_thread_trace_userdata(const struct radv_device *device, struct radeon_cmdbuf *cs,
                                const void *data, uint32_t num_dwords)
{
   const uint32_t *dwords = (uint32_t *)data;

   while (num_dwords > 0) {
      uint32_t count = MIN2(num_dwords, 2);

      radeon_check_space(device->ws, cs, 2 + count);

      /* Without the perfctr bit the CP might not always pass the
       * write on correctly. */
      if (device->physical_device->rad_info.chip_class >= GFX10)
         radeon_set_uconfig_reg_seq_perfctr(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      else
         radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
      radeon_emit_array(cs, dwords, count);

      dwords += count;
      num_dwords -= count;
   }
}

/* Toggle the SQG top/bottom-of-pipe events that feed SQTT. The register moved
 * and became unprivileged on GFX9+, hence the two paths. */
static void
radv_emit_spi_config_cntl(struct radv_device *device, struct radeon_cmdbuf *cs, bool enable)
{
   if (device->physical_device->rad_info.chip_class >= GFX9) {
      uint32_t spi_config_cntl =
         S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) |
         S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable);

      if (device->physical_device->rad_info.chip_class >= GFX10)
         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);

      radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
   } else {
      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
      radeon_set_privileged_config_reg(
         cs, R_009100_SPI_CONFIG_CNTL,
         S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable));
   }
}

/* Inhibit (or restore) perfmon clock gating while tracing. No-op on GFX7 and
 * older, which have no RLC_PERFMON_CLK_CNTL. */
static void
radv_emit_inhibit_clockgating(struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit)
{
   if (device->physical_device->rad_info.chip_class >= GFX10) {
      radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL,
                             S_037390_PERFMON_CLOCK_STATE(inhibit));
   } else if (device->physical_device->rad_info.chip_class >= GFX8) {
      radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL,
                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
   }
}

/* Emit a full wait-for-idle + cache flush so no GPU work overlaps the SQTT
 * start/stop packets. NOTE(review): the idle check uses RING_COMPUTE while
 * the flush-flag selection uses RADV_QUEUE_COMPUTE for the same `family`
 * value — presumably the two enums have matching values here; verify. */
static void
radv_emit_wait_for_idle(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
{
   enum rgp_flush_bits sqtt_flush_bits = 0;
   si_cs_emit_cache_flush(
      cs, device->physical_device->rad_info.chip_class, NULL, 0,
      family == RING_COMPUTE && device->physical_device->rad_info.chip_class >= GFX7,
      (family == RADV_QUEUE_COMPUTE
          ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH
          : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
         RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE |
         RADV_CMD_FLAG_INV_L2,
      &sqtt_flush_bits, 0);
}

/* Allocate, make resident and CPU-map the single BO that holds the per-SE
 * info structs followed by the per-SE data buffers. Returns false on any
 * allocation/mapping failure (partial state is cleaned up by
 * radv_thread_trace_finish_bo()). */
static bool
radv_thread_trace_init_bo(struct radv_device *device)
{
   unsigned max_se = device->physical_device->rad_info.max_se;
   struct radeon_winsys *ws = device->ws;
   VkResult result;
   uint64_t size;

   /* The buffer size and address need to be aligned in HW regs. Align the
    * size as early as possible so that we do all the allocation & addressing
    * correctly. */
   device->thread_trace.buffer_size =
      align64(device->thread_trace.buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);

   /* Compute total size of the thread trace BO for all SEs. */
   size = align64(sizeof(struct ac_thread_trace_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
   size += device->thread_trace.buffer_size * (uint64_t)max_se;

   struct radeon_winsys_bo *bo = NULL;
   result = ws->buffer_create(
      ws, size, 4096, RADEON_DOMAIN_VRAM,
      RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
      RADV_BO_PRIORITY_SCRATCH, 0, &bo);
   device->thread_trace.bo = bo;
   if (result != VK_SUCCESS)
      return false;

   result = ws->buffer_make_resident(ws, device->thread_trace.bo, true);
   if (result != VK_SUCCESS)
      return false;

   device->thread_trace.ptr = ws->buffer_map(device->thread_trace.bo);
   if (!device->thread_trace.ptr)
      return false;

   return true;
}

/* Release the thread trace BO if it was created. Safe to call after a partial
 * radv_thread_trace_init_bo() failure. */
static void
radv_thread_trace_finish_bo(struct radv_device *device)
{
   struct radeon_winsys *ws = device->ws;

   if (unlikely(device->thread_trace.bo)) {
      ws->buffer_make_resident(ws, device->thread_trace.bo, false);
      ws->buffer_destroy(ws, device->thread_trace.bo);
   }
}

/* One-time SQTT setup: read the env-var configuration, allocate the trace BO
 * and initialize the RGP record lists/locks. Returns false on BO failure. */
bool
radv_thread_trace_init(struct radv_device *device)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;

   /* Default buffer size set to 32MB per SE. */
   device->thread_trace.buffer_size =
      radv_get_int_debug_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);
   device->thread_trace.start_frame = radv_get_int_debug_option("RADV_THREAD_TRACE", -1);

   const char *trigger_file = getenv("RADV_THREAD_TRACE_TRIGGER");
   if (trigger_file)
      device->thread_trace.trigger_file = strdup(trigger_file);

   if (!radv_thread_trace_init_bo(device))
      return false;

   list_inithead(&thread_trace_data->rgp_pso_correlation.record);
   simple_mtx_init(&thread_trace_data->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&thread_trace_data->rgp_loader_events.record);
   simple_mtx_init(&thread_trace_data->rgp_loader_events.lock, mtx_plain);

   list_inithead(&thread_trace_data->rgp_code_object.record);
   simple_mtx_init(&thread_trace_data->rgp_code_object.lock, mtx_plain);

   return true;
}

/* Tear down everything radv_thread_trace_init() and the begin/end paths
 * created: the BO, the cached start/stop command buffers and the RGP record
 * locks. The asserts require the record lists to be drained beforehand. */
void
radv_thread_trace_finish(struct radv_device *device)
{
   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
   struct radeon_winsys *ws = device->ws;

   radv_thread_trace_finish_bo(device);

   for (unsigned i = 0; i < 2; i++) {
      if (device->thread_trace.start_cs[i])
         ws->cs_destroy(device->thread_trace.start_cs[i]);
      if (device->thread_trace.stop_cs[i])
         ws->cs_destroy(device->thread_trace.stop_cs[i]);
   }

   assert(thread_trace_data->rgp_pso_correlation.record_count == 0);
   simple_mtx_destroy(&thread_trace_data->rgp_pso_correlation.lock);

   assert(thread_trace_data->rgp_loader_events.record_count == 0);
   simple_mtx_destroy(&thread_trace_data->rgp_loader_events.lock);

   assert(thread_trace_data->rgp_code_object.record_count == 0);
   simple_mtx_destroy(&thread_trace_data->rgp_code_object.lock);
}

/* Grow the trace buffer after an overflow: destroy the BO, double the per-SE
 * size and re-allocate. Called from radv_get_thread_trace() when a trace came
 * back incomplete. */
static bool
radv_thread_trace_resize_bo(struct radv_device *device)
{
   /* Destroy the previous thread trace BO. */
   radv_thread_trace_finish_bo(device);

   /* Double the size of the thread trace buffer per SE. */
   device->thread_trace.buffer_size *= 2;

   fprintf(stderr,
           "Failed to get the thread trace because the buffer "
           "was too small, resizing to %d KB\n",
           device->thread_trace.buffer_size / 1024);

   /* Re-create the thread trace BO. */
   return radv_thread_trace_init_bo(device);
}

/* Build (or rebuild) and submit the command buffer that starts SQTT on the
 * given queue: preamble, wait-for-idle, clock-gating/SQG setup, then the
 * start packets. The CS is cached in thread_trace.start_cs[family]. */
bool
radv_begin_thread_trace(struct radv_queue *queue)
{
   struct radv_device *device = queue->device;
   int family = queue->vk.queue_family_index;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous start CS and create a new one. */
   if (device->thread_trace.start_cs[family]) {
      ws->cs_destroy(device->thread_trace.start_cs[family]);
      device->thread_trace.start_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, family);
   if (!cs)
      return false;

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   }

   radv_cs_add_buffer(ws, cs, device->thread_trace.bo);

   /* Make sure to wait-for-idle before starting SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   /* Disable clock gating before starting SQTT. */
   radv_emit_inhibit_clockgating(device, cs, true);

   /* Enable SQG events that collects thread trace data. */
   radv_emit_spi_config_cntl(device, cs, true);

   /* Start SQTT. */
   radv_emit_thread_trace_start(device, cs, family);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->thread_trace.start_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

/* Counterpart of radv_begin_thread_trace(): build and submit the command
 * buffer that stops SQTT and restores the clock-gating/SQG state. The CS is
 * cached in thread_trace.stop_cs[family]. */
bool
radv_end_thread_trace(struct radv_queue *queue)
{
   struct radv_device *device = queue->device;
   int family = queue->vk.queue_family_index;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous stop CS and create a new one. */
   if (queue->device->thread_trace.stop_cs[family]) {
      ws->cs_destroy(device->thread_trace.stop_cs[family]);
      device->thread_trace.stop_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, family);
   if (!cs)
      return false;

   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   }

   radv_cs_add_buffer(ws, cs, device->thread_trace.bo);

   /* Make sure to wait-for-idle before stopping SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   /* Stop SQTT. */
   radv_emit_thread_trace_stop(device, cs, family);

   /* Restore previous state by disabling SQG events. */
   radv_emit_spi_config_cntl(device, cs, false);

   /* Restore previous state by re-enabling clock gating. */
   radv_emit_inhibit_clockgating(device, cs, false);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->thread_trace.stop_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}

/* Read the completed trace out of the mapped BO into *thread_trace, one entry
 * per active SE. If any SE's trace is incomplete (buffer overflowed), the BO
 * is re-created at double size and false is returned so the caller can
 * capture again; aborts only if that re-allocation itself fails. */
bool
radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_trace)
{
   struct radv_device *device = queue->device;
   struct radeon_info *rad_info = &device->physical_device->rad_info;
   unsigned max_se = rad_info->max_se;
   void *thread_trace_ptr = device->thread_trace.ptr;

   memset(thread_trace, 0, sizeof(*thread_trace));

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
      uint64_t data_offset = ac_thread_trace_get_data_offset(rad_info, &device->thread_trace, se);
      void *info_ptr = (uint8_t *)thread_trace_ptr + info_offset;
      void *data_ptr = (uint8_t *)thread_trace_ptr + data_offset;
      struct ac_thread_trace_info *info = (struct ac_thread_trace_info *)info_ptr;
      struct ac_thread_trace_se thread_trace_se = {0};
      int first_active_cu = ffs(device->physical_device->rad_info.cu_mask[se][0]);

      if (radv_se_is_disabled(device, se))
         continue;

      if (!ac_is_thread_trace_complete(&device->physical_device->rad_info, &device->thread_trace,
                                       info)) {
         if (!radv_thread_trace_resize_bo(device)) {
            fprintf(stderr, "Failed to resize the thread "
                            "trace buffer.\n");
            abort();
         }
         return false;
      }

      thread_trace_se.data_ptr = data_ptr;
      thread_trace_se.info = *info;
      thread_trace_se.shader_engine = se;

      /* RGP seems to expect units of WGP on GFX10+. */
      thread_trace_se.compute_unit = device->physical_device->rad_info.chip_class >= GFX10
                                        ? (first_active_cu / 2)
                                        : first_active_cu;

      thread_trace->traces[thread_trace->num_traces] = thread_trace_se;
      thread_trace->num_traces++;
   }

   thread_trace->data = &device->thread_trace;
   return true;
}