17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2020 Valve Corporation
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
217ec681f3Smrg * IN THE SOFTWARE.
227ec681f3Smrg */
237ec681f3Smrg
247ec681f3Smrg#include <inttypes.h>
257ec681f3Smrg
267ec681f3Smrg#include "radv_cs.h"
277ec681f3Smrg#include "radv_private.h"
287ec681f3Smrg#include "sid.h"
297ec681f3Smrg
307ec681f3Smrg#define SQTT_BUFFER_ALIGN_SHIFT 12
317ec681f3Smrg
bool
radv_is_instruction_timing_enabled(void)
{
   /* Instruction timing is opt-in through an environment variable; only its
    * presence matters, not its value.
    */
   return getenv("RADV_THREAD_TRACE_PIPELINE") != NULL;
}
377ec681f3Smrg
387ec681f3Smrgstatic bool
397ec681f3Smrgradv_se_is_disabled(struct radv_device *device, unsigned se)
407ec681f3Smrg{
417ec681f3Smrg   /* No active CU on the SE means it is disabled. */
427ec681f3Smrg   return device->physical_device->rad_info.cu_mask[se][0] == 0;
437ec681f3Smrg}
447ec681f3Smrg
457ec681f3Smrgstatic uint32_t
467ec681f3Smrggfx10_get_thread_trace_ctrl(struct radv_device *device, bool enable)
477ec681f3Smrg{
487ec681f3Smrg   uint32_t thread_trace_ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) |
497ec681f3Smrg                                S_008D1C_UTIL_TIMER(1) | S_008D1C_RT_FREQ(2) | /* 4096 clk */
507ec681f3Smrg                                S_008D1C_DRAW_EVENT_EN(1) | S_008D1C_REG_STALL_EN(1) |
517ec681f3Smrg                                S_008D1C_SPI_STALL_EN(1) | S_008D1C_SQ_STALL_EN(1) |
527ec681f3Smrg                                S_008D1C_REG_DROP_ON_STALL(0);
537ec681f3Smrg
547ec681f3Smrg   if (device->physical_device->rad_info.chip_class == GFX10_3)
557ec681f3Smrg      thread_trace_ctrl |= S_008D1C_LOWATER_OFFSET(4);
567ec681f3Smrg
577ec681f3Smrg   return thread_trace_ctrl;
587ec681f3Smrg}
597ec681f3Smrg
/* Programs and starts SQ thread trace (SQTT) capture.
 *
 * For each active SE, GRBM is pointed at that SE/SH0 and the per-SE buffer
 * address, size and filter masks are written (privileged registers on GFX10+,
 * uconfig registers before that). Global broadcasting is restored afterwards
 * and the trace is kicked off either via the compute enable register (compute
 * queue on GFX7+) or a THREAD_TRACE_START event.
 */
static void
radv_emit_thread_trace_start(struct radv_device *device, struct radeon_cmdbuf *cs,
                             uint32_t queue_family_index)
{
   uint32_t shifted_size = device->thread_trace.buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   struct radeon_info *rad_info = &device->physical_device->rad_info;
   unsigned max_se = rad_info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
      uint64_t data_va = ac_thread_trace_get_data_va(rad_info, &device->thread_trace, va, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
      /* ffs() returns a 1-based bit index; for a disabled SE (mask == 0) this
       * is 0, but such SEs are skipped just below before the value is used. */
      int first_active_cu = ffs(device->physical_device->rad_info.cu_mask[se][0]);

      if (radv_se_is_disabled(device, se))
         continue;

      /* Target SEx and SH0. */
      radeon_set_uconfig_reg(
         cs, R_030800_GRBM_GFX_INDEX,
         S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.chip_class >= GFX10) {
         /* Order seems important for the following 2 registers. */
         radeon_set_privileged_config_reg(
            cs, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
            S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

         radeon_set_privileged_config_reg(cs, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         /* Select the first active WGP on SA0; a WGP is a pair of CUs, hence
          * the /2 (see the matching conversion in radv_get_thread_trace()). */
         radeon_set_privileged_config_reg(
            cs, R_008D14_SQ_THREAD_TRACE_MASK,
            S_008D14_WTYPE_INCLUDE(0x7f) | /* all shader stages */
               S_008D14_SA_SEL(0) | S_008D14_WGP_SEL(first_active_cu / 2) | S_008D14_SIMD_SEL(0));

         uint32_t thread_trace_token_mask = S_008D18_REG_INCLUDE(
            V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC | V_008D18_REG_INCLUDE_GFXUDEC |
            V_008D18_REG_INCLUDE_COMP | V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;

         if (!radv_is_instruction_timing_enabled()) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC |
                             V_008D18_TOKEN_EXCLUDE_ALUEXEC |
                             V_008D18_TOKEN_EXCLUDE_VALUINST |
                             V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                             V_008D18_TOKEN_EXCLUDE_INST;
         }
         thread_trace_token_mask |= S_008D18_TOKEN_EXCLUDE(token_exclude);

         radeon_set_privileged_config_reg(cs, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK,
                                          thread_trace_token_mask);

         /* Should be emitted last (it enables thread traces). */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          gfx10_get_thread_trace_ctrl(device, true));
      } else {
         /* Order seems important for the following 4 registers. */
         radeon_set_uconfig_reg(cs, R_030CDC_SQ_THREAD_TRACE_BASE2,
                                S_030CDC_ADDR_HI(shifted_va >> 32));

         radeon_set_uconfig_reg(cs, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         radeon_set_uconfig_reg(cs, R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size));

         radeon_set_uconfig_reg(cs, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));

         uint32_t thread_trace_mask = S_030CC8_CU_SEL(first_active_cu) | S_030CC8_SH_SEL(0) |
                                      S_030CC8_SIMD_EN(0xf) | S_030CC8_VM_ID_MASK(0) |
                                      S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
                                      S_030CC8_SQ_STALL_EN(1);

         if (device->physical_device->rad_info.chip_class < GFX9) {
            thread_trace_mask |= S_030CC8_RANDOM_SEED(0xffff);
         }

         radeon_set_uconfig_reg(cs, R_030CC8_SQ_THREAD_TRACE_MASK, thread_trace_mask);

         /* Trace all tokens and registers. */
         radeon_set_uconfig_reg(
            cs, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
            S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         radeon_set_uconfig_reg(cs, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                                S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff));

         radeon_set_uconfig_reg(cs, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         radeon_set_uconfig_reg(cs, R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4));

         if (device->physical_device->rad_info.chip_class == GFX9) {
            /* Reset thread trace status errors. */
            radeon_set_uconfig_reg(cs, R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t thread_trace_mode =
            S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
            S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
            S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
            S_030CD8_MODE(1);

         if (device->physical_device->rad_info.chip_class == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            thread_trace_mode |= S_030CD8_TC_PERF_EN(1);
         }

         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, thread_trace_mode);
      }
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (queue_family_index == RADV_QUEUE_COMPUTE &&
       device->physical_device->rad_info.chip_class >= GFX7) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
}
1887ec681f3Smrg
/* Per-generation lists of the SQTT registers saved to memory after a trace
 * stops: write pointer, status, and a counter register. Each list has
 * exactly 3 entries, matching the 3 dwords copied back by
 * radv_copy_thread_trace_info_regs().
 */
static const uint32_t gfx8_thread_trace_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_008E40_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx9_thread_trace_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_thread_trace_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};
2067ec681f3Smrg
/* Copies the SQTT info registers of one SE into the info struct stored at
 * the start of the thread trace BO.
 *
 * The COPY_DATA packets read from the perf/privileged register space
 * (COPY_DATA_PERF) and write through TC L2 with write confirmation, so the
 * values land in memory on the GPU timeline after the trace has stopped.
 *
 * NOTE(review): assumes GRBM still targets 'se_index' (set up by the caller,
 * radv_emit_thread_trace_stop) since these registers are per-SE.
 */
static void
radv_copy_thread_trace_info_regs(struct radv_device *device, struct radeon_cmdbuf *cs,
                                 unsigned se_index)
{
   const uint32_t *thread_trace_info_regs = NULL;

   /* Pick the register list for this GPU generation. */
   if (device->physical_device->rad_info.chip_class >= GFX10) {
      thread_trace_info_regs = gfx10_thread_trace_info_regs;
   } else if (device->physical_device->rad_info.chip_class == GFX9) {
      thread_trace_info_regs = gfx9_thread_trace_info_regs;
   } else {
      assert(device->physical_device->rad_info.chip_class == GFX8);
      thread_trace_info_regs = gfx8_thread_trace_info_regs;
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t va = radv_buffer_get_va(device->thread_trace.bo);
   uint64_t info_va = ac_thread_trace_get_info_va(va, se_index);

   /* Copy back the info struct one DWORD at a time (all three register
    * lists above have exactly 3 entries). */
   for (unsigned i = 0; i < 3; i++) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, thread_trace_info_regs[i] >> 2);
      radeon_emit(cs, 0); /* unused */
      radeon_emit(cs, (info_va + i * 4));
      radeon_emit(cs, (info_va + i * 4) >> 32);
   }
}
2377ec681f3Smrg
/* Stops SQTT capture and saves the per-SE info registers to memory.
 *
 * After emitting the stop/finish events, each active SE is targeted in turn:
 * wait for the trace writes to complete, disable the trace mode, wait for
 * the SQTT unit to go idle, then copy WPTR/STATUS/CNTR into the BO.
 */
static void
radv_emit_thread_trace_stop(struct radv_device *device, struct radeon_cmdbuf *cs,
                            uint32_t queue_family_index)
{
   unsigned max_se = device->physical_device->rad_info.max_se;

   /* Stop the thread trace with a different event based on the queue. */
   if (queue_family_index == RADV_QUEUE_COMPUTE &&
       device->physical_device->rad_info.chip_class >= GFX7) {
      radeon_set_sh_reg(cs, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(cs, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));

   for (unsigned se = 0; se < max_se; se++) {
      if (radv_se_is_disabled(device, se))
         continue;

      /* Target SEi and SH0. */
      radeon_set_uconfig_reg(
         cs, R_030800_GRBM_GFX_INDEX,
         S_030800_SE_INDEX(se) | S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (device->physical_device->rad_info.chip_class >= GFX10) {
         /* Make sure to wait for the trace buffer. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(
            cs,
            WAIT_REG_MEM_NOT_EQUAL); /* wait while the masked register equals 0,
                                      * i.e. until FINISH_DONE is raised */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);                     /* reference value */
         radeon_emit(cs, ~C_008D20_FINISH_DONE); /* mask: the FINISH_DONE field */
         radeon_emit(cs, 4);                     /* poll interval */

         /* Disable the thread trace mode. */
         radeon_set_privileged_config_reg(cs, R_008D1C_SQ_THREAD_TRACE_CTRL,
                                          gfx10_get_thread_trace_ctrl(device, false));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(
            cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value: BUSY == 0 */
         radeon_emit(cs, ~C_008D20_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         radeon_set_uconfig_reg(cs, R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         radeon_emit(
            cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         radeon_emit(cs, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         radeon_emit(cs, 0);
         radeon_emit(cs, 0);              /* reference value: BUSY == 0 */
         radeon_emit(cs, ~C_030CE8_BUSY); /* mask */
         radeon_emit(cs, 4);              /* poll interval */
      }

      /* GRBM is still targeting this SE, which the copy relies on. */
      radv_copy_thread_trace_info_regs(device, cs, se);
   }

   /* Restore global broadcasting. */
   radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
                          S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
                             S_030800_INSTANCE_BROADCAST_WRITES(1));
}
3137ec681f3Smrg
3147ec681f3Smrgvoid
3157ec681f3Smrgradv_emit_thread_trace_userdata(const struct radv_device *device, struct radeon_cmdbuf *cs,
3167ec681f3Smrg                                const void *data, uint32_t num_dwords)
3177ec681f3Smrg{
3187ec681f3Smrg   const uint32_t *dwords = (uint32_t *)data;
3197ec681f3Smrg
3207ec681f3Smrg   while (num_dwords > 0) {
3217ec681f3Smrg      uint32_t count = MIN2(num_dwords, 2);
3227ec681f3Smrg
3237ec681f3Smrg      radeon_check_space(device->ws, cs, 2 + count);
3247ec681f3Smrg
3257ec681f3Smrg      /* Without the perfctr bit the CP might not always pass the
3267ec681f3Smrg       * write on correctly. */
3277ec681f3Smrg      if (device->physical_device->rad_info.chip_class >= GFX10)
3287ec681f3Smrg         radeon_set_uconfig_reg_seq_perfctr(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
3297ec681f3Smrg      else
3307ec681f3Smrg         radeon_set_uconfig_reg_seq(cs, R_030D08_SQ_THREAD_TRACE_USERDATA_2, count);
3317ec681f3Smrg      radeon_emit_array(cs, dwords, count);
3327ec681f3Smrg
3337ec681f3Smrg      dwords += count;
3347ec681f3Smrg      num_dwords -= count;
3357ec681f3Smrg   }
3367ec681f3Smrg}
3377ec681f3Smrg
3387ec681f3Smrgstatic void
3397ec681f3Smrgradv_emit_spi_config_cntl(struct radv_device *device, struct radeon_cmdbuf *cs, bool enable)
3407ec681f3Smrg{
3417ec681f3Smrg   if (device->physical_device->rad_info.chip_class >= GFX9) {
3427ec681f3Smrg      uint32_t spi_config_cntl =
3437ec681f3Smrg         S_031100_GPR_WRITE_PRIORITY(0x2c688) | S_031100_EXP_PRIORITY_ORDER(3) |
3447ec681f3Smrg         S_031100_ENABLE_SQG_TOP_EVENTS(enable) | S_031100_ENABLE_SQG_BOP_EVENTS(enable);
3457ec681f3Smrg
3467ec681f3Smrg      if (device->physical_device->rad_info.chip_class >= GFX10)
3477ec681f3Smrg         spi_config_cntl |= S_031100_PS_PKR_PRIORITY_CNTL(3);
3487ec681f3Smrg
3497ec681f3Smrg      radeon_set_uconfig_reg(cs, R_031100_SPI_CONFIG_CNTL, spi_config_cntl);
3507ec681f3Smrg   } else {
3517ec681f3Smrg      /* SPI_CONFIG_CNTL is a protected register on GFX6-GFX8. */
3527ec681f3Smrg      radeon_set_privileged_config_reg(
3537ec681f3Smrg         cs, R_009100_SPI_CONFIG_CNTL,
3547ec681f3Smrg         S_009100_ENABLE_SQG_TOP_EVENTS(enable) | S_009100_ENABLE_SQG_BOP_EVENTS(enable));
3557ec681f3Smrg   }
3567ec681f3Smrg}
3577ec681f3Smrg
3587ec681f3Smrgstatic void
3597ec681f3Smrgradv_emit_inhibit_clockgating(struct radv_device *device, struct radeon_cmdbuf *cs, bool inhibit)
3607ec681f3Smrg{
3617ec681f3Smrg   if (device->physical_device->rad_info.chip_class >= GFX10) {
3627ec681f3Smrg      radeon_set_uconfig_reg(cs, R_037390_RLC_PERFMON_CLK_CNTL,
3637ec681f3Smrg                             S_037390_PERFMON_CLOCK_STATE(inhibit));
3647ec681f3Smrg   } else if (device->physical_device->rad_info.chip_class >= GFX8) {
3657ec681f3Smrg      radeon_set_uconfig_reg(cs, R_0372FC_RLC_PERFMON_CLK_CNTL,
3667ec681f3Smrg                             S_0372FC_PERFMON_CLOCK_STATE(inhibit));
3677ec681f3Smrg   }
3687ec681f3Smrg}
3697ec681f3Smrg
3707ec681f3Smrgstatic void
3717ec681f3Smrgradv_emit_wait_for_idle(struct radv_device *device, struct radeon_cmdbuf *cs, int family)
3727ec681f3Smrg{
3737ec681f3Smrg   enum rgp_flush_bits sqtt_flush_bits = 0;
3747ec681f3Smrg   si_cs_emit_cache_flush(
3757ec681f3Smrg      cs, device->physical_device->rad_info.chip_class, NULL, 0,
3767ec681f3Smrg      family == RING_COMPUTE && device->physical_device->rad_info.chip_class >= GFX7,
3777ec681f3Smrg      (family == RADV_QUEUE_COMPUTE
3787ec681f3Smrg          ? RADV_CMD_FLAG_CS_PARTIAL_FLUSH
3797ec681f3Smrg          : (RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH)) |
3807ec681f3Smrg         RADV_CMD_FLAG_INV_ICACHE | RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE |
3817ec681f3Smrg         RADV_CMD_FLAG_INV_L2,
3827ec681f3Smrg      &sqtt_flush_bits, 0);
3837ec681f3Smrg}
3847ec681f3Smrg
3857ec681f3Smrgstatic bool
3867ec681f3Smrgradv_thread_trace_init_bo(struct radv_device *device)
3877ec681f3Smrg{
3887ec681f3Smrg   unsigned max_se = device->physical_device->rad_info.max_se;
3897ec681f3Smrg   struct radeon_winsys *ws = device->ws;
3907ec681f3Smrg   VkResult result;
3917ec681f3Smrg   uint64_t size;
3927ec681f3Smrg
3937ec681f3Smrg   /* The buffer size and address need to be aligned in HW regs. Align the
3947ec681f3Smrg    * size as early as possible so that we do all the allocation & addressing
3957ec681f3Smrg    * correctly. */
3967ec681f3Smrg   device->thread_trace.buffer_size =
3977ec681f3Smrg      align64(device->thread_trace.buffer_size, 1u << SQTT_BUFFER_ALIGN_SHIFT);
3987ec681f3Smrg
3997ec681f3Smrg   /* Compute total size of the thread trace BO for all SEs. */
4007ec681f3Smrg   size = align64(sizeof(struct ac_thread_trace_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
4017ec681f3Smrg   size += device->thread_trace.buffer_size * (uint64_t)max_se;
4027ec681f3Smrg
4037ec681f3Smrg   struct radeon_winsys_bo *bo = NULL;
4047ec681f3Smrg   result = ws->buffer_create(
4057ec681f3Smrg      ws, size, 4096, RADEON_DOMAIN_VRAM,
4067ec681f3Smrg      RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_ZERO_VRAM,
4077ec681f3Smrg      RADV_BO_PRIORITY_SCRATCH, 0, &bo);
4087ec681f3Smrg   device->thread_trace.bo = bo;
4097ec681f3Smrg   if (result != VK_SUCCESS)
4107ec681f3Smrg      return false;
4117ec681f3Smrg
4127ec681f3Smrg   result = ws->buffer_make_resident(ws, device->thread_trace.bo, true);
4137ec681f3Smrg   if (result != VK_SUCCESS)
4147ec681f3Smrg      return false;
4157ec681f3Smrg
4167ec681f3Smrg   device->thread_trace.ptr = ws->buffer_map(device->thread_trace.bo);
4177ec681f3Smrg   if (!device->thread_trace.ptr)
4187ec681f3Smrg      return false;
4197ec681f3Smrg
4207ec681f3Smrg   return true;
4217ec681f3Smrg}
4227ec681f3Smrg
4237ec681f3Smrgstatic void
4247ec681f3Smrgradv_thread_trace_finish_bo(struct radv_device *device)
4257ec681f3Smrg{
4267ec681f3Smrg   struct radeon_winsys *ws = device->ws;
4277ec681f3Smrg
4287ec681f3Smrg   if (unlikely(device->thread_trace.bo)) {
4297ec681f3Smrg      ws->buffer_make_resident(ws, device->thread_trace.bo, false);
4307ec681f3Smrg      ws->buffer_destroy(ws, device->thread_trace.bo);
4317ec681f3Smrg   }
4327ec681f3Smrg}
4337ec681f3Smrg
4347ec681f3Smrgbool
4357ec681f3Smrgradv_thread_trace_init(struct radv_device *device)
4367ec681f3Smrg{
4377ec681f3Smrg   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
4387ec681f3Smrg
4397ec681f3Smrg   /* Default buffer size set to 32MB per SE. */
4407ec681f3Smrg   device->thread_trace.buffer_size =
4417ec681f3Smrg      radv_get_int_debug_option("RADV_THREAD_TRACE_BUFFER_SIZE", 32 * 1024 * 1024);
4427ec681f3Smrg   device->thread_trace.start_frame = radv_get_int_debug_option("RADV_THREAD_TRACE", -1);
4437ec681f3Smrg
4447ec681f3Smrg   const char *trigger_file = getenv("RADV_THREAD_TRACE_TRIGGER");
4457ec681f3Smrg   if (trigger_file)
4467ec681f3Smrg      device->thread_trace.trigger_file = strdup(trigger_file);
4477ec681f3Smrg
4487ec681f3Smrg   if (!radv_thread_trace_init_bo(device))
4497ec681f3Smrg      return false;
4507ec681f3Smrg
4517ec681f3Smrg   list_inithead(&thread_trace_data->rgp_pso_correlation.record);
4527ec681f3Smrg   simple_mtx_init(&thread_trace_data->rgp_pso_correlation.lock, mtx_plain);
4537ec681f3Smrg
4547ec681f3Smrg   list_inithead(&thread_trace_data->rgp_loader_events.record);
4557ec681f3Smrg   simple_mtx_init(&thread_trace_data->rgp_loader_events.lock, mtx_plain);
4567ec681f3Smrg
4577ec681f3Smrg   list_inithead(&thread_trace_data->rgp_code_object.record);
4587ec681f3Smrg   simple_mtx_init(&thread_trace_data->rgp_code_object.lock, mtx_plain);
4597ec681f3Smrg
4607ec681f3Smrg   return true;
4617ec681f3Smrg}
4627ec681f3Smrg
4637ec681f3Smrgvoid
4647ec681f3Smrgradv_thread_trace_finish(struct radv_device *device)
4657ec681f3Smrg{
4667ec681f3Smrg   struct ac_thread_trace_data *thread_trace_data = &device->thread_trace;
4677ec681f3Smrg   struct radeon_winsys *ws = device->ws;
4687ec681f3Smrg
4697ec681f3Smrg   radv_thread_trace_finish_bo(device);
4707ec681f3Smrg
4717ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
4727ec681f3Smrg      if (device->thread_trace.start_cs[i])
4737ec681f3Smrg         ws->cs_destroy(device->thread_trace.start_cs[i]);
4747ec681f3Smrg      if (device->thread_trace.stop_cs[i])
4757ec681f3Smrg         ws->cs_destroy(device->thread_trace.stop_cs[i]);
4767ec681f3Smrg   }
4777ec681f3Smrg
4787ec681f3Smrg   assert(thread_trace_data->rgp_pso_correlation.record_count == 0);
4797ec681f3Smrg   simple_mtx_destroy(&thread_trace_data->rgp_pso_correlation.lock);
4807ec681f3Smrg
4817ec681f3Smrg   assert(thread_trace_data->rgp_loader_events.record_count == 0);
4827ec681f3Smrg   simple_mtx_destroy(&thread_trace_data->rgp_loader_events.lock);
4837ec681f3Smrg
4847ec681f3Smrg   assert(thread_trace_data->rgp_code_object.record_count == 0);
4857ec681f3Smrg   simple_mtx_destroy(&thread_trace_data->rgp_code_object.lock);
4867ec681f3Smrg}
4877ec681f3Smrg
4887ec681f3Smrgstatic bool
4897ec681f3Smrgradv_thread_trace_resize_bo(struct radv_device *device)
4907ec681f3Smrg{
4917ec681f3Smrg   /* Destroy the previous thread trace BO. */
4927ec681f3Smrg   radv_thread_trace_finish_bo(device);
4937ec681f3Smrg
4947ec681f3Smrg   /* Double the size of the thread trace buffer per SE. */
4957ec681f3Smrg   device->thread_trace.buffer_size *= 2;
4967ec681f3Smrg
4977ec681f3Smrg   fprintf(stderr,
4987ec681f3Smrg           "Failed to get the thread trace because the buffer "
4997ec681f3Smrg           "was too small, resizing to %d KB\n",
5007ec681f3Smrg           device->thread_trace.buffer_size / 1024);
5017ec681f3Smrg
5027ec681f3Smrg   /* Re-create the thread trace BO. */
5037ec681f3Smrg   return radv_thread_trace_init_bo(device);
5047ec681f3Smrg}
5057ec681f3Smrg
/* Builds and submits the command stream that starts SQTT capture on the
 * given queue: wait for idle, inhibit clock gating, enable SQG events, then
 * program/start the trace. The finalized CS is cached in start_cs[family]
 * so radv_thread_trace_finish() can destroy it later.
 *
 * NOTE(review): start_cs/stop_cs are indexed directly by the queue family;
 * the teardown loop in radv_thread_trace_finish() iterates 2 entries, so
 * this assumes family is RADV_QUEUE_GENERAL or RADV_QUEUE_COMPUTE — confirm
 * against the array declaration in radv_private.h.
 */
bool
radv_begin_thread_trace(struct radv_queue *queue)
{
   struct radv_device *device = queue->device;
   int family = queue->vk.queue_family_index;
   struct radeon_winsys *ws = device->ws;
   struct radeon_cmdbuf *cs;
   VkResult result;

   /* Destroy the previous start CS and create a new one. */
   if (device->thread_trace.start_cs[family]) {
      ws->cs_destroy(device->thread_trace.start_cs[family]);
      device->thread_trace.start_cs[family] = NULL;
   }

   cs = ws->cs_create(ws, family);
   if (!cs)
      return false;

   /* Emit the per-queue CS preamble. */
   switch (family) {
   case RADV_QUEUE_GENERAL:
      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
      break;
   case RADV_QUEUE_COMPUTE:
      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
      radeon_emit(cs, 0);
      break;
   }

   radv_cs_add_buffer(ws, cs, device->thread_trace.bo);

   /* Make sure to wait-for-idle before starting SQTT. */
   radv_emit_wait_for_idle(device, cs, family);

   /* Disable clock gating before starting SQTT. */
   radv_emit_inhibit_clockgating(device, cs, true);

   /* Enable SQG events that collects thread trace data. */
   radv_emit_spi_config_cntl(device, cs, true);

   /* Start SQTT. */
   radv_emit_thread_trace_start(device, cs, family);

   result = ws->cs_finalize(cs);
   if (result != VK_SUCCESS) {
      ws->cs_destroy(cs);
      return false;
   }

   device->thread_trace.start_cs[family] = cs;

   return radv_queue_internal_submit(queue, cs);
}
5617ec681f3Smrg
5627ec681f3Smrgbool
5637ec681f3Smrgradv_end_thread_trace(struct radv_queue *queue)
5647ec681f3Smrg{
5657ec681f3Smrg   struct radv_device *device = queue->device;
5667ec681f3Smrg   int family = queue->vk.queue_family_index;
5677ec681f3Smrg   struct radeon_winsys *ws = device->ws;
5687ec681f3Smrg   struct radeon_cmdbuf *cs;
5697ec681f3Smrg   VkResult result;
5707ec681f3Smrg
5717ec681f3Smrg   /* Destroy the previous stop CS and create a new one. */
5727ec681f3Smrg   if (queue->device->thread_trace.stop_cs[family]) {
5737ec681f3Smrg      ws->cs_destroy(device->thread_trace.stop_cs[family]);
5747ec681f3Smrg      device->thread_trace.stop_cs[family] = NULL;
5757ec681f3Smrg   }
5767ec681f3Smrg
5777ec681f3Smrg   cs = ws->cs_create(ws, family);
5787ec681f3Smrg   if (!cs)
5797ec681f3Smrg      return false;
5807ec681f3Smrg
5817ec681f3Smrg   switch (family) {
5827ec681f3Smrg   case RADV_QUEUE_GENERAL:
5837ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
5847ec681f3Smrg      radeon_emit(cs, CC0_UPDATE_LOAD_ENABLES(1));
5857ec681f3Smrg      radeon_emit(cs, CC1_UPDATE_SHADOW_ENABLES(1));
5867ec681f3Smrg      break;
5877ec681f3Smrg   case RADV_QUEUE_COMPUTE:
5887ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
5897ec681f3Smrg      radeon_emit(cs, 0);
5907ec681f3Smrg      break;
5917ec681f3Smrg   }
5927ec681f3Smrg
5937ec681f3Smrg   radv_cs_add_buffer(ws, cs, device->thread_trace.bo);
5947ec681f3Smrg
5957ec681f3Smrg   /* Make sure to wait-for-idle before stopping SQTT. */
5967ec681f3Smrg   radv_emit_wait_for_idle(device, cs, family);
5977ec681f3Smrg
5987ec681f3Smrg   /* Stop SQTT. */
5997ec681f3Smrg   radv_emit_thread_trace_stop(device, cs, family);
6007ec681f3Smrg
6017ec681f3Smrg   /* Restore previous state by disabling SQG events. */
6027ec681f3Smrg   radv_emit_spi_config_cntl(device, cs, false);
6037ec681f3Smrg
6047ec681f3Smrg   /* Restore previous state by re-enabling clock gating. */
6057ec681f3Smrg   radv_emit_inhibit_clockgating(device, cs, false);
6067ec681f3Smrg
6077ec681f3Smrg   result = ws->cs_finalize(cs);
6087ec681f3Smrg   if (result != VK_SUCCESS) {
6097ec681f3Smrg      ws->cs_destroy(cs);
6107ec681f3Smrg      return false;
6117ec681f3Smrg   }
6127ec681f3Smrg
6137ec681f3Smrg   device->thread_trace.stop_cs[family] = cs;
6147ec681f3Smrg
6157ec681f3Smrg   return radv_queue_internal_submit(queue, cs);
6167ec681f3Smrg}
6177ec681f3Smrg
/* Reads back a completed capture from the mapped thread trace BO into
 * 'thread_trace', one ac_thread_trace_se entry per active SE.
 *
 * If any SE's trace did not fit in the buffer, the BO is resized (doubled)
 * and false is returned so the caller can retry the capture; a failed
 * resize aborts the process.
 */
bool
radv_get_thread_trace(struct radv_queue *queue, struct ac_thread_trace *thread_trace)
{
   struct radv_device *device = queue->device;
   struct radeon_info *rad_info = &device->physical_device->rad_info;
   unsigned max_se = rad_info->max_se;
   void *thread_trace_ptr = device->thread_trace.ptr;

   memset(thread_trace, 0, sizeof(*thread_trace));

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_thread_trace_get_info_offset(se);
      uint64_t data_offset = ac_thread_trace_get_data_offset(rad_info, &device->thread_trace, se);
      void *info_ptr = (uint8_t *)thread_trace_ptr + info_offset;
      void *data_ptr = (uint8_t *)thread_trace_ptr + data_offset;
      struct ac_thread_trace_info *info = (struct ac_thread_trace_info *)info_ptr;
      struct ac_thread_trace_se thread_trace_se = {0};
      /* ffs() is 1-based; 0 for a disabled SE, which is skipped below. */
      int first_active_cu = ffs(device->physical_device->rad_info.cu_mask[se][0]);

      if (radv_se_is_disabled(device, se))
         continue;

      if (!ac_is_thread_trace_complete(&device->physical_device->rad_info, &device->thread_trace,
                                       info)) {
         /* The trace overflowed the buffer: grow it and let the caller
          * re-capture with the bigger buffer. */
         if (!radv_thread_trace_resize_bo(device)) {
            fprintf(stderr, "Failed to resize the thread "
                            "trace buffer.\n");
            abort();
         }
         return false;
      }

      thread_trace_se.data_ptr = data_ptr;
      thread_trace_se.info = *info;
      thread_trace_se.shader_engine = se;

      /* RGP seems to expect units of WGP on GFX10+. */
      thread_trace_se.compute_unit = device->physical_device->rad_info.chip_class >= GFX10
                                        ? (first_active_cu / 2)
                                        : first_active_cu;

      thread_trace->traces[thread_trace->num_traces] = thread_trace_se;
      thread_trace->num_traces++;
   }

   thread_trace->data = &device->thread_trace;
   return true;
}
666