1361fc4cbSmaya/*
2361fc4cbSmaya * Copyright © 2016 Red Hat.
3361fc4cbSmaya * Copyright © 2016 Bas Nieuwenhuizen
4361fc4cbSmaya *
5361fc4cbSmaya * based in part on anv driver which is:
6361fc4cbSmaya * Copyright © 2015 Intel Corporation
7361fc4cbSmaya *
8361fc4cbSmaya * Permission is hereby granted, free of charge, to any person obtaining a
9361fc4cbSmaya * copy of this software and associated documentation files (the "Software"),
10361fc4cbSmaya * to deal in the Software without restriction, including without limitation
11361fc4cbSmaya * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12361fc4cbSmaya * and/or sell copies of the Software, and to permit persons to whom the
13361fc4cbSmaya * Software is furnished to do so, subject to the following conditions:
14361fc4cbSmaya *
15361fc4cbSmaya * The above copyright notice and this permission notice (including the next
16361fc4cbSmaya * paragraph) shall be included in all copies or substantial portions of the
17361fc4cbSmaya * Software.
18361fc4cbSmaya *
19361fc4cbSmaya * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20361fc4cbSmaya * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21361fc4cbSmaya * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22361fc4cbSmaya * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23361fc4cbSmaya * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24361fc4cbSmaya * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25361fc4cbSmaya * DEALINGS IN THE SOFTWARE.
26361fc4cbSmaya */
27361fc4cbSmaya
28361fc4cbSmaya#include "tu_private.h"
29361fc4cbSmaya
307ec681f3Smrg#include "adreno_pm4.xml.h"
317ec681f3Smrg#include "adreno_common.xml.h"
32361fc4cbSmaya
33361fc4cbSmaya#include "vk_format.h"
347ec681f3Smrg#include "vk_util.h"
35361fc4cbSmaya
36361fc4cbSmaya#include "tu_cs.h"
37361fc4cbSmaya
387ec681f3Smrg#include "tu_tracepoints.h"
39361fc4cbSmaya
40361fc4cbSmayavoid
417ec681f3Smrgtu6_emit_event_write(struct tu_cmd_buffer *cmd,
427ec681f3Smrg                     struct tu_cs *cs,
437ec681f3Smrg                     enum vgt_event_type event)
44361fc4cbSmaya{
457ec681f3Smrg   bool need_seqno = false;
467ec681f3Smrg   switch (event) {
477ec681f3Smrg   case CACHE_FLUSH_TS:
487ec681f3Smrg   case WT_DONE_TS:
497ec681f3Smrg   case RB_DONE_TS:
507ec681f3Smrg   case PC_CCU_FLUSH_DEPTH_TS:
517ec681f3Smrg   case PC_CCU_FLUSH_COLOR_TS:
527ec681f3Smrg   case PC_CCU_RESOLVE_TS:
537ec681f3Smrg      need_seqno = true;
547ec681f3Smrg      break;
557ec681f3Smrg   default:
567ec681f3Smrg      break;
57361fc4cbSmaya   }
58361fc4cbSmaya
597ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
607ec681f3Smrg   tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
617ec681f3Smrg   if (need_seqno) {
627ec681f3Smrg      tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy));
637ec681f3Smrg      tu_cs_emit(cs, 0);
64361fc4cbSmaya   }
65361fc4cbSmaya}
66361fc4cbSmaya
/* Emit the CP packets implementing an accumulated set of cache-maintenance
 * bits. The emission order below is deliberate: flushes before invalidates
 * (see the comment on the CCU below), cache events before the WAIT_* fences
 * that order them against later commands. Do not reorder.
 */
static void
tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
                 struct tu_cs *cs,
                 enum tu_cmd_flush_bits flushes)
{
   /* TU_DEBUG=flushall: force every flush and invalidate on each emission. */
   if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_FLUSHALL))
      flushes |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE;

   /* TU_DEBUG=syncdraw: fully serialize by waiting for writes, idle, and ME. */
   if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_SYNCDRAW))
      flushes |= TU_CMD_FLAG_WAIT_MEM_WRITES |
                 TU_CMD_FLAG_WAIT_FOR_IDLE |
                 TU_CMD_FLAG_WAIT_FOR_ME;

   /* Experiments show that invalidating CCU while it still has data in it
    * doesn't work, so make sure to always flush before invalidating in case
    * any data remains that hasn't yet been made available through a barrier.
    * However it does seem to work for UCHE.
    */
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
                  TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
                  TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
   if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
   if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
   if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   /* On parts with the CCU-flush bug, a CCU flush must also be followed by a
    * wait-for-idle even when the caller didn't ask for one.
    */
   if ((flushes & TU_CMD_FLAG_WAIT_FOR_IDLE) ||
       (cmd_buffer->device->physical_device->info->a6xx.has_ccu_flush_bug &&
        (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CCU_FLUSH_DEPTH))))
      tu_cs_emit_wfi(cs);
   if (flushes & TU_CMD_FLAG_WAIT_FOR_ME)
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}
108361fc4cbSmaya
1097ec681f3Smrg/* "Normal" cache flushes, that don't require any special handling */
1107ec681f3Smrg
1117ec681f3Smrgstatic void
1127ec681f3Smrgtu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
1137ec681f3Smrg                    struct tu_cs *cs)
114361fc4cbSmaya{
1157ec681f3Smrg   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
1167ec681f3Smrg   cmd_buffer->state.cache.flush_bits = 0;
1177ec681f3Smrg}
118361fc4cbSmaya
1197ec681f3Smrg/* Renderpass cache flushes */
1207ec681f3Smrg
1217ec681f3Smrgvoid
1227ec681f3Smrgtu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
1237ec681f3Smrg                               struct tu_cs *cs)
1247ec681f3Smrg{
1257ec681f3Smrg   if (!cmd_buffer->state.renderpass_cache.flush_bits &&
1267ec681f3Smrg       likely(!cmd_buffer->device->physical_device->instance->debug_flags))
1277ec681f3Smrg      return;
1287ec681f3Smrg   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
1297ec681f3Smrg   cmd_buffer->state.renderpass_cache.flush_bits = 0;
130361fc4cbSmaya}
131361fc4cbSmaya
1327ec681f3Smrg/* Cache flushes for things that use the color/depth read/write path (i.e.
1337ec681f3Smrg * blits and draws). This deals with changing CCU state as well as the usual
1347ec681f3Smrg * cache flushing.
1357ec681f3Smrg */
1367ec681f3Smrg
/* Emit pending cache flushes plus whatever extra maintenance is required to
 * transition the CCU (color cache unit) into @ccu_state, then program
 * RB_CCU_CNTL for the new state.
 */
void
tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                        struct tu_cs *cs,
                        enum tu_cmd_ccu_state ccu_state)
{
   enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;

   assert(ccu_state != TU_CMD_CCU_UNKNOWN);

   /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
    * the CCU may also contain data that we haven't flushed out yet, so we
    * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
    * emit a WFI as it isn't pipelined.
    */
   if (ccu_state != cmd_buffer->state.ccu_state) {
      if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
         flushes |=
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH;
         /* These flushes happen now, so they no longer need to be deferred. */
         cmd_buffer->state.cache.pending_flush_bits &= ~(
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH);
      }
      flushes |=
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE;
      cmd_buffer->state.cache.pending_flush_bits &= ~(
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE);
   }

   tu6_emit_flushes(cmd_buffer, cs, flushes);
   cmd_buffer->state.cache.flush_bits = 0;

   if (ccu_state != cmd_buffer->state.ccu_state) {
      /* GMEM rendering and bypass (sysmem) use different CCU offsets. */
      struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
      tu_cs_emit_regs(cs,
                      A6XX_RB_CCU_CNTL(.color_offset =
                                          ccu_state == TU_CMD_CCU_GMEM ?
                                          phys_dev->ccu_offset_gmem :
                                          phys_dev->ccu_offset_bypass,
                                       .gmem = ccu_state == TU_CMD_CCU_GMEM));
      cmd_buffer->state.ccu_state = ccu_state;
   }
}
184361fc4cbSmaya
/* Emit the depth/stencil buffer state for @subpass: the RB and GRAS depth
 * buffer registers, the LRZ buffer registers, and RB_STENCIL_INFO (plus the
 * separate-stencil buffer registers when the format calls for one).
 */
static void
tu6_emit_zs(struct tu_cmd_buffer *cmd,
            const struct tu_subpass *subpass,
            struct tu_cs *cs)
{
   const uint32_t a = subpass->depth_stencil_attachment.attachment;
   if (a == VK_ATTACHMENT_UNUSED) {
      /* No depth/stencil attachment: program everything to "none"/zero. */
      tu_cs_emit_regs(cs,
                      A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
                      A6XX_RB_DEPTH_BUFFER_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_BASE(0),
                      A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));

      return;
   }

   const struct tu_image_view *iview = cmd->state.attachments[a];
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];
   enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);

   /* RB_DEPTH_BUFFER_INFO..BASE_GMEM: info, then image address/pitch, then
    * the attachment's gmem offset.
    */
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
   tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
   tu_cs_image_ref(cs, iview, 0);
   tu_cs_emit(cs, attachment->gmem_offset);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));

   /* UBWC flag buffer for the depth attachment. */
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
   tu_cs_image_flag_ref(cs, iview, 0);

   /* LRZ (low-resolution Z) buffer lives inside the image's BO. */
   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_BUFFER_BASE(.bo = iview->image->bo,
                                                 .bo_offset = iview->image->bo_offset + iview->image->lrz_offset),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = iview->image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
       attachment->format == VK_FORMAT_S8_UINT) {

      /* Stencil is stored in a separate plane (D32_S8) or is the only plane
       * (S8), so program the separate-stencil path.
       */
      tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
      tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
      if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         tu_cs_image_stencil_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset_stencil);
      } else {
         tu_cs_image_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset);
      }
   } else {
      tu_cs_emit_regs(cs,
                     A6XX_RB_STENCIL_INFO(0));
   }
}
250361fc4cbSmaya
/* Emit per-MRT color attachment state for @subpass: RB_MRT buffer info,
 * SP_FS_MRT registers, UBWC flag buffers, sRGB control, the max layer index,
 * and the feedback-loop workaround for GRAS_SC_CNTL.
 */
static void
tu6_emit_mrt(struct tu_cmd_buffer *cmd,
             const struct tu_subpass *subpass,
             struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   for (uint32_t i = 0; i < subpass->color_count; ++i) {
      uint32_t a = subpass->color_attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      const struct tu_image_view *iview = cmd->state.attachments[a];

      /* RB_MRT_BUF_INFO(i)..: buffer info, image address/pitch, gmem offset. */
      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
      tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
      tu_cs_image_ref(cs, iview, 0);
      tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);

      tu_cs_emit_regs(cs,
                      A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG));

      /* UBWC flag buffer for this render target. */
      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR(i), 3);
      tu_cs_image_flag_ref(cs, iview, 0);
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
   tu_cs_emit_regs(cs,
                   A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));

   /* With multiview, the layer count comes from the view mask rather than the
    * framebuffer's layer count.
    */
   unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1);
   tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1));

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));

   /* If there is a feedback loop, then the shader can read the previous value
    * of a pixel being written out. It can also write some components and then
    * read different components without a barrier in between. This is a
    * problem in sysmem mode with UBWC, because the main buffer and flags
    * buffer can get out-of-sync if only one is flushed. We fix this by
    * setting the SINGLE_PRIM_MODE field to the same value that the blob does
    * for advanced_blend in sysmem mode if a feedback loop is detected.
    */
   if (subpass->feedback) {
      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
      tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                           A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
                           A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(
                              FLUSH_PER_OVERLAP_AND_OVERWRITE));
      tu_cond_exec_end(cs);
   }
}
305361fc4cbSmaya
306361fc4cbSmayavoid
3077ec681f3Smrgtu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples,
3087ec681f3Smrg              enum a5xx_line_mode line_mode)
309361fc4cbSmaya{
3107ec681f3Smrg   const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
3117ec681f3Smrg   bool msaa_disable = (samples == MSAA_ONE) || (line_mode == BRESENHAM);
3127ec681f3Smrg
3137ec681f3Smrg   tu_cs_emit_regs(cs,
3147ec681f3Smrg                   A6XX_SP_TP_RAS_MSAA_CNTL(samples),
3157ec681f3Smrg                   A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
3167ec681f3Smrg                                             .msaa_disable = msaa_disable));
3177ec681f3Smrg
3187ec681f3Smrg   tu_cs_emit_regs(cs,
3197ec681f3Smrg                   A6XX_GRAS_RAS_MSAA_CNTL(samples),
3207ec681f3Smrg                   A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
3217ec681f3Smrg                                            .msaa_disable = msaa_disable));
3227ec681f3Smrg
3237ec681f3Smrg   tu_cs_emit_regs(cs,
3247ec681f3Smrg                   A6XX_RB_RAS_MSAA_CNTL(samples),
3257ec681f3Smrg                   A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
3267ec681f3Smrg                                          .msaa_disable = msaa_disable));
3277ec681f3Smrg
3287ec681f3Smrg   tu_cs_emit_regs(cs,
3297ec681f3Smrg                   A6XX_RB_MSAA_CNTL(samples));
330361fc4cbSmaya}
331361fc4cbSmaya
/* Program the binning tile size (and pass-dependent flags) into the GRAS and
 * RB bin-control registers.
 */
static void
tu6_emit_bin_size(struct tu_cs *cs,
                  uint32_t bin_w, uint32_t bin_h, uint32_t flags)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
                                         .binh = bin_h,
                                         .dword = flags));

   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL(.binw = bin_w,
                                       .binh = bin_h,
                                       .dword = flags));

   /* no flag for RB_BIN_CONTROL2... */
   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL2(.binw = bin_w,
                                        .binh = bin_h));
}
351361fc4cbSmaya
/* Emit RB_RENDER_CNTL for either the binning pass or a rendering subpass.
 * On parts with CP_REG_WRITE support the value is written through the
 * TRACK_RENDER_CNTL tracker; otherwise it is written directly (and skipped
 * entirely for binning, where the tracker would normally handle it).
 */
static void
tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
                     const struct tu_subpass *subpass,
                     struct tu_cs *cs,
                     bool binning)
{
   /* doesn't RB_RENDER_CNTL set differently for binning pass: */
   bool no_track = !cmd->device->physical_device->info->a6xx.has_cp_reg_write;
   uint32_t cntl = 0;
   cntl |= A6XX_RB_RENDER_CNTL_CCUSINGLECACHELINESIZE(2);
   if (binning) {
      if (no_track)
         return;
      cntl |= A6XX_RB_RENDER_CNTL_BINNING;
   } else {
      /* Collect a bitmask of color attachments with UBWC enabled. */
      uint32_t mrts_ubwc_enable = 0;
      for (uint32_t i = 0; i < subpass->color_count; ++i) {
         uint32_t a = subpass->color_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         const struct tu_image_view *iview = cmd->state.attachments[a];
         if (iview->ubwc_enabled)
            mrts_ubwc_enable |= 1 << i;
      }

      cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);

      const uint32_t a = subpass->depth_stencil_attachment.attachment;
      if (a != VK_ATTACHMENT_UNUSED) {
         const struct tu_image_view *iview = cmd->state.attachments[a];
         if (iview->ubwc_enabled)
            cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
      }

      /* Without CP_REG_WRITE, fall back to a plain register write. */
      if (no_track) {
         tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CNTL, 1);
         tu_cs_emit(cs, cntl);
         return;
      }

      /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
       * in order to set it correctly for the different subpasses. However,
       * that means the packets we're emitting also happen during binning. So
       * we need to guard the write on !BINNING at CP execution time.
       */
      tu_cs_reserve(cs, 3 + 4);
      tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
      tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                     CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
      tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
   }

   /* The 4 dwords below must match CP_COND_REG_EXEC_1_DWORDS(4) above. */
   tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
   tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
   tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
   tu_cs_emit(cs, cntl);
}
410361fc4cbSmaya
/* Program RB_BLIT_SCISSOR to the current render area, optionally aligned out
 * to the device's gmem alignment granule (needed for gmem load/store blits).
 */
static void
tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const VkRect2D *render_area = &cmd->state.render_area;

   /* Avoid assertion fails with an empty render area at (0, 0) where the
    * subtraction below wraps around. Empty render areas should be forced to
    * the sysmem path by use_sysmem_rendering(). It's not even clear whether
    * an empty scissor here works, and the blob seems to force sysmem too as
    * it sets something wrong (non-empty) for the scissor.
    */
   if (render_area->extent.width == 0 ||
       render_area->extent.height == 0)
      return;

   /* Scissor registers take inclusive bottom-right coordinates. */
   uint32_t x1 = render_area->offset.x;
   uint32_t y1 = render_area->offset.y;
   uint32_t x2 = x1 + render_area->extent.width - 1;
   uint32_t y2 = y1 + render_area->extent.height - 1;

   if (align) {
      /* gmem_align_{w,h} are powers of two, so masking rounds down and
       * ALIGN_POT rounds up to the alignment granule.
       */
      x1 = x1 & ~(phys_dev->info->gmem_align_w - 1);
      y1 = y1 & ~(phys_dev->info->gmem_align_h - 1);
      x2 = ALIGN_POT(x2 + 1, phys_dev->info->gmem_align_w) - 1;
      y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
                   A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
}
443361fc4cbSmaya
/* Program the window scissor (and the matching 2D-resolve scissor) to the
 * inclusive rectangle (x1, y1)-(x2, y2).
 */
void
tu6_emit_window_scissor(struct tu_cs *cs,
                        uint32_t x1,
                        uint32_t y1,
                        uint32_t x2,
                        uint32_t y2)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
                   A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1),
                   A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2));
}
459361fc4cbSmaya
/* Program the window offset (x1, y1) into every block that needs it (RB, SP,
 * SP_TP); the offsets must all agree.
 */
void
tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
}
475361fc4cbSmaya
4767ec681f3Smrgvoid
4777ec681f3Smrgtu6_apply_depth_bounds_workaround(struct tu_device *device,
4787ec681f3Smrg                                  uint32_t *rb_depth_cntl)
479361fc4cbSmaya{
4807ec681f3Smrg   if (!device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk)
4817ec681f3Smrg      return;
482361fc4cbSmaya
4837ec681f3Smrg   /* On some GPUs it is necessary to enable z test for depth bounds test when
4847ec681f3Smrg    * UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is required to
4857ec681f3Smrg    * pass z test. Relevant tests:
4867ec681f3Smrg    *  dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
4877ec681f3Smrg    *  dEQP-VK.dynamic_state.ds_state.depth_bounds_1
4887ec681f3Smrg    */
4897ec681f3Smrg   *rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
4907ec681f3Smrg                     A6XX_RB_DEPTH_CNTL_ZFUNC(FUNC_ALWAYS);
491361fc4cbSmaya}
492361fc4cbSmaya
493361fc4cbSmayastatic void
4947ec681f3Smrgtu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
495361fc4cbSmaya{
4967ec681f3Smrg   uint32_t enable_mask;
4977ec681f3Smrg   switch (id) {
4987ec681f3Smrg   case TU_DRAW_STATE_PROGRAM:
4997ec681f3Smrg   case TU_DRAW_STATE_VI:
5007ec681f3Smrg   case TU_DRAW_STATE_FS_CONST:
5017ec681f3Smrg   /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
5027ec681f3Smrg    * when resources would actually be used in the binning shader.
5037ec681f3Smrg    * Presumably the overhead of prefetching the resources isn't
5047ec681f3Smrg    * worth it.
5057ec681f3Smrg    */
5067ec681f3Smrg   case TU_DRAW_STATE_DESC_SETS_LOAD:
5077ec681f3Smrg      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
5087ec681f3Smrg                    CP_SET_DRAW_STATE__0_SYSMEM;
5097ec681f3Smrg      break;
5107ec681f3Smrg   case TU_DRAW_STATE_PROGRAM_BINNING:
5117ec681f3Smrg   case TU_DRAW_STATE_VI_BINNING:
5127ec681f3Smrg      enable_mask = CP_SET_DRAW_STATE__0_BINNING;
5137ec681f3Smrg      break;
5147ec681f3Smrg   case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
5157ec681f3Smrg      enable_mask = CP_SET_DRAW_STATE__0_GMEM;
5167ec681f3Smrg      break;
5177ec681f3Smrg   case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
5187ec681f3Smrg      enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
5197ec681f3Smrg      break;
5207ec681f3Smrg   default:
5217ec681f3Smrg      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
5227ec681f3Smrg                    CP_SET_DRAW_STATE__0_SYSMEM |
5237ec681f3Smrg                    CP_SET_DRAW_STATE__0_BINNING;
5247ec681f3Smrg      break;
5257ec681f3Smrg   }
526361fc4cbSmaya
5277ec681f3Smrg   STATIC_ASSERT(TU_DRAW_STATE_COUNT <= 32);
5287ec681f3Smrg
5297ec681f3Smrg   /* We need to reload the descriptors every time the descriptor sets
5307ec681f3Smrg    * change. However, the commands we send only depend on the pipeline
5317ec681f3Smrg    * because the whole point is to cache descriptors which are used by the
5327ec681f3Smrg    * pipeline. There's a problem here, in that the firmware has an
5337ec681f3Smrg    * "optimization" which skips executing groups that are set to the same
5347ec681f3Smrg    * value as the last draw. This means that if the descriptor sets change
5357ec681f3Smrg    * but not the pipeline, we'd try to re-execute the same buffer which
5367ec681f3Smrg    * the firmware would ignore and we wouldn't pre-load the new
5377ec681f3Smrg    * descriptors. Set the DIRTY bit to avoid this optimization
5387ec681f3Smrg    */
5397ec681f3Smrg   if (id == TU_DRAW_STATE_DESC_SETS_LOAD)
5407ec681f3Smrg      enable_mask |= CP_SET_DRAW_STATE__0_DIRTY;
5417ec681f3Smrg
5427ec681f3Smrg   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
5437ec681f3Smrg                  enable_mask |
5447ec681f3Smrg                  CP_SET_DRAW_STATE__0_GROUP_ID(id) |
5457ec681f3Smrg                  COND(!state.size, CP_SET_DRAW_STATE__0_DISABLE));
5467ec681f3Smrg   tu_cs_emit_qw(cs, state.iova);
547361fc4cbSmaya}
548361fc4cbSmaya
5497ec681f3Smrgstatic bool
5507ec681f3Smrguse_hw_binning(struct tu_cmd_buffer *cmd)
5517ec681f3Smrg{
5527ec681f3Smrg   const struct tu_framebuffer *fb = cmd->state.framebuffer;
553361fc4cbSmaya
5547ec681f3Smrg   /* XFB commands are emitted for BINNING || SYSMEM, which makes it incompatible
5557ec681f3Smrg    * with non-hw binning GMEM rendering. this is required because some of the
5567ec681f3Smrg    * XFB commands need to only be executed once
5577ec681f3Smrg    */
5587ec681f3Smrg   if (cmd->state.xfb_used)
5597ec681f3Smrg      return true;
560361fc4cbSmaya
5617ec681f3Smrg   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
5627ec681f3Smrg      return false;
563361fc4cbSmaya
5647ec681f3Smrg   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
5657ec681f3Smrg      return true;
566361fc4cbSmaya
5677ec681f3Smrg   return (fb->tile_count.width * fb->tile_count.height) > 2;
568361fc4cbSmaya}
569361fc4cbSmaya
5707ec681f3Smrgstatic bool
5717ec681f3Smrguse_sysmem_rendering(struct tu_cmd_buffer *cmd)
572361fc4cbSmaya{
5737ec681f3Smrg   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
5747ec681f3Smrg      return true;
575361fc4cbSmaya
5767ec681f3Smrg   /* can't fit attachments into gmem */
5777ec681f3Smrg   if (!cmd->state.pass->gmem_pixels)
5787ec681f3Smrg      return true;
579361fc4cbSmaya
5807ec681f3Smrg   if (cmd->state.framebuffer->layers > 1)
5817ec681f3Smrg      return true;
5827ec681f3Smrg
5837ec681f3Smrg   /* Use sysmem for empty render areas */
5847ec681f3Smrg   if (cmd->state.render_area.extent.width == 0 ||
5857ec681f3Smrg       cmd->state.render_area.extent.height == 0)
5867ec681f3Smrg      return true;
587361fc4cbSmaya
5887ec681f3Smrg   if (cmd->state.has_tess)
5897ec681f3Smrg      return true;
590361fc4cbSmaya
5917ec681f3Smrg   if (cmd->state.disable_gmem)
5927ec681f3Smrg      return true;
593361fc4cbSmaya
5947ec681f3Smrg   return false;
595361fc4cbSmaya}
596361fc4cbSmaya
/* Select the tile at (tx, ty) for gmem rendering: set the window
 * scissor/offset for the tile and configure visibility-stream consumption
 * (from VSC pipe data when hw binning is used, overridden otherwise).
 */
static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));

   /* Clamp the tile rectangle to the maximum viewport size. */
   const uint32_t x1 = fb->tile0.width * tx;
   const uint32_t y1 = fb->tile0.height * ty;
   const uint32_t x2 = MIN2(x1 + fb->tile0.width - 1, MAX_VIEWPORT_SIZE - 1);
   const uint32_t y2 = MIN2(y1 + fb->tile0.height - 1, MAX_VIEWPORT_SIZE - 1);
   tu6_emit_window_scissor(cs, x1, y1, x2, y2);
   tu6_emit_window_offset(cs, x1, y1);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);

      /* Point the CP at this tile's slot in the visibility stream. */
      tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
      tu_cs_emit(cs, fb->pipe_sizes[pipe] |
                     CP_SET_BIN_DATA5_0_VSC_N(slot));
      tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
      tu_cs_emit(cs, pipe * 4);
      tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);

      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   } else {
      /* No binning data: override visibility so everything is drawn. */
      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x1);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   }
}
642361fc4cbSmaya
643361fc4cbSmayastatic void
6447ec681f3Smrgtu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
6457ec681f3Smrg                        struct tu_cs *cs,
6467ec681f3Smrg                        uint32_t layer_mask,
6477ec681f3Smrg                        uint32_t a,
6487ec681f3Smrg                        uint32_t gmem_a)
649361fc4cbSmaya{
650361fc4cbSmaya   const struct tu_framebuffer *fb = cmd->state.framebuffer;
6517ec681f3Smrg   const struct tu_image_view *dst = cmd->state.attachments[a];
6527ec681f3Smrg   const struct tu_image_view *src = cmd->state.attachments[gmem_a];
6537ec681f3Smrg
6547ec681f3Smrg   tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area);
6557ec681f3Smrg}
656361fc4cbSmaya
6577ec681f3Smrgstatic void
6587ec681f3Smrgtu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
6597ec681f3Smrg                         struct tu_cs *cs,
6607ec681f3Smrg                         const struct tu_subpass *subpass)
6617ec681f3Smrg{
6627ec681f3Smrg   if (subpass->resolve_attachments) {
6637ec681f3Smrg      /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
6647ec681f3Smrg       * Commands":
6657ec681f3Smrg       *
6667ec681f3Smrg       *    End-of-subpass multisample resolves are treated as color
6677ec681f3Smrg       *    attachment writes for the purposes of synchronization.
6687ec681f3Smrg       *    This applies to resolve operations for both color and
6697ec681f3Smrg       *    depth/stencil attachments. That is, they are considered to
6707ec681f3Smrg       *    execute in the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
6717ec681f3Smrg       *    pipeline stage and their writes are synchronized with
6727ec681f3Smrg       *    VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
6737ec681f3Smrg       *    rendering within a subpass and any resolve operations at the end
6747ec681f3Smrg       *    of the subpass occurs automatically, without need for explicit
6757ec681f3Smrg       *    dependencies or pipeline barriers. However, if the resolve
6767ec681f3Smrg       *    attachment is also used in a different subpass, an explicit
6777ec681f3Smrg       *    dependency is needed.
6787ec681f3Smrg       *
6797ec681f3Smrg       * We use the CP_BLIT path for sysmem resolves, which is really a
6807ec681f3Smrg       * transfer command, so we have to manually flush similar to the gmem
6817ec681f3Smrg       * resolve case. However, a flush afterwards isn't needed because of the
6827ec681f3Smrg       * last sentence and the fact that we're in sysmem mode.
6837ec681f3Smrg       */
6847ec681f3Smrg      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
6857ec681f3Smrg      if (subpass->resolve_depth_stencil)
6867ec681f3Smrg         tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);
687361fc4cbSmaya
6887ec681f3Smrg      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
689361fc4cbSmaya
6907ec681f3Smrg      /* Wait for the flushes to land before using the 2D engine */
6917ec681f3Smrg      tu_cs_emit_wfi(cs);
692361fc4cbSmaya
6937ec681f3Smrg      for (unsigned i = 0; i < subpass->resolve_count; i++) {
6947ec681f3Smrg         uint32_t a = subpass->resolve_attachments[i].attachment;
6957ec681f3Smrg         if (a == VK_ATTACHMENT_UNUSED)
6967ec681f3Smrg            continue;
6977ec681f3Smrg
6987ec681f3Smrg         uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
699361fc4cbSmaya
7007ec681f3Smrg         tu6_emit_sysmem_resolve(cmd, cs, subpass->multiview_mask, a, gmem_a);
7017ec681f3Smrg      }
7027ec681f3Smrg   }
703361fc4cbSmaya}
704361fc4cbSmaya
705361fc4cbSmayastatic void
706361fc4cbSmayatu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
707361fc4cbSmaya{
7087ec681f3Smrg   const struct tu_render_pass *pass = cmd->state.pass;
7097ec681f3Smrg   const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
710361fc4cbSmaya
711361fc4cbSmaya   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
712361fc4cbSmaya   tu_cs_emit(cs, 0x0);
713361fc4cbSmaya
714361fc4cbSmaya   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
7157ec681f3Smrg   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
716361fc4cbSmaya
7177ec681f3Smrg   tu6_emit_blit_scissor(cmd, cs, true);
718361fc4cbSmaya
7197ec681f3Smrg   for (uint32_t a = 0; a < pass->attachment_count; ++a) {
7207ec681f3Smrg      if (pass->attachments[a].gmem_offset >= 0)
7217ec681f3Smrg         tu_store_gmem_attachment(cmd, cs, a, a);
7227ec681f3Smrg   }
723361fc4cbSmaya
7247ec681f3Smrg   if (subpass->resolve_attachments) {
7257ec681f3Smrg      for (unsigned i = 0; i < subpass->resolve_count; i++) {
7267ec681f3Smrg         uint32_t a = subpass->resolve_attachments[i].attachment;
7277ec681f3Smrg         if (a != VK_ATTACHMENT_UNUSED) {
7287ec681f3Smrg            uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
7297ec681f3Smrg            tu_store_gmem_attachment(cmd, cs, a, gmem_a);
7307ec681f3Smrg         }
7317ec681f3Smrg      }
732361fc4cbSmaya   }
733361fc4cbSmaya}
734361fc4cbSmaya
7357ec681f3Smrgvoid
7367ec681f3Smrgtu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
737361fc4cbSmaya{
7387ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
7397ec681f3Smrg   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
7407ec681f3Smrg                     CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
7417ec681f3Smrg                     CP_SET_DRAW_STATE__0_GROUP_ID(0));
7427ec681f3Smrg   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
7437ec681f3Smrg   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
7447ec681f3Smrg
7457ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
746361fc4cbSmaya}
747361fc4cbSmaya
748361fc4cbSmayastatic void
749361fc4cbSmayatu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
750361fc4cbSmaya{
7517ec681f3Smrg   struct tu_device *dev = cmd->device;
7527ec681f3Smrg   const struct tu_physical_device *phys_dev = dev->physical_device;
7537ec681f3Smrg
7547ec681f3Smrg   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
7557ec681f3Smrg
7567ec681f3Smrg   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
7577ec681f3Smrg         .vs_state = true,
7587ec681f3Smrg         .hs_state = true,
7597ec681f3Smrg         .ds_state = true,
7607ec681f3Smrg         .gs_state = true,
7617ec681f3Smrg         .fs_state = true,
7627ec681f3Smrg         .cs_state = true,
7637ec681f3Smrg         .gfx_ibo = true,
7647ec681f3Smrg         .cs_ibo = true,
7657ec681f3Smrg         .gfx_shared_const = true,
7667ec681f3Smrg         .cs_shared_const = true,
7677ec681f3Smrg         .gfx_bindless = 0x1f,
7687ec681f3Smrg         .cs_bindless = 0x1f));
7697ec681f3Smrg
7707ec681f3Smrg   tu_cs_emit_wfi(cs);
7717ec681f3Smrg
7727ec681f3Smrg   cmd->state.cache.pending_flush_bits &=
7737ec681f3Smrg      ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE);
7747ec681f3Smrg
7757ec681f3Smrg   tu_cs_emit_regs(cs,
7767ec681f3Smrg                   A6XX_RB_CCU_CNTL(.color_offset = phys_dev->ccu_offset_bypass));
7777ec681f3Smrg   cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
778361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
7797ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_SP_FLOAT_CNTL, 0);
780361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
7817ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
7827ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
7837ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_DBG_ECO_CNTL,
7847ec681f3Smrg                        phys_dev->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
785361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
786361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
787361fc4cbSmaya
788361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
7897ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_DBG_ECO_CNTL, 0x880);
790361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
7917ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_SP_CHICKEN_BITS, 0x00000410);
792361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
793361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
7947ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_SHARED_CONSTS, 0);
795361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
796361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
797361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
7987ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
7997ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_SP_MODE_CONTROL,
8007ec681f3Smrg                        A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
8017ec681f3Smrg
8027ec681f3Smrg   /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
8037ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
804361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
805361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
806361fc4cbSmaya
807361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
808361fc4cbSmaya
809361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
810361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
811361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
812361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
813361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
814361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
815361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
816361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
817361fc4cbSmaya
8187ec681f3Smrg   tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false));
819361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
820361fc4cbSmaya
8217ec681f3Smrg   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));
822361fc4cbSmaya
823361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
824361fc4cbSmaya
8257ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
826361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
827361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
828361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
829361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
830361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
8317ec681f3Smrg   tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_MODE_CNTL,
8327ec681f3Smrg                        0x000000a0 |
8337ec681f3Smrg                        A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
834361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
835361fc4cbSmaya
836361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
837361fc4cbSmaya
838361fc4cbSmaya   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
839361fc4cbSmaya
8407ec681f3Smrg   tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); /* always disable alpha test */
8417ec681f3Smrg   tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL()); /* always disable dithering */
842361fc4cbSmaya
8437ec681f3Smrg   tu_disable_draw_states(cmd, cs);
844361fc4cbSmaya
8457ec681f3Smrg   tu_cs_emit_regs(cs,
8467ec681f3Smrg                   A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
8477ec681f3Smrg                                                     .bo_offset = gb_offset(bcolor_builtin)));
8487ec681f3Smrg   tu_cs_emit_regs(cs,
8497ec681f3Smrg                   A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
8507ec681f3Smrg                                                        .bo_offset = gb_offset(bcolor_builtin)));
851361fc4cbSmaya
8527ec681f3Smrg   /* VSC buffers:
8537ec681f3Smrg    * use vsc pitches from the largest values used so far with this device
8547ec681f3Smrg    * if there hasn't been overflow, there will already be a scratch bo
8557ec681f3Smrg    * allocated for these sizes
8567ec681f3Smrg    *
8577ec681f3Smrg    * if overflow is detected, the stream size is increased by 2x
8587ec681f3Smrg    */
8597ec681f3Smrg   mtx_lock(&dev->mutex);
860361fc4cbSmaya
8617ec681f3Smrg   struct tu6_global *global = dev->global_bo.map;
862361fc4cbSmaya
8637ec681f3Smrg   uint32_t vsc_draw_overflow = global->vsc_draw_overflow;
8647ec681f3Smrg   uint32_t vsc_prim_overflow = global->vsc_prim_overflow;
865361fc4cbSmaya
8667ec681f3Smrg   if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch)
8677ec681f3Smrg      dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
868361fc4cbSmaya
8697ec681f3Smrg   if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch)
8707ec681f3Smrg      dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
871361fc4cbSmaya
8727ec681f3Smrg   cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch;
8737ec681f3Smrg   cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch;
874361fc4cbSmaya
8757ec681f3Smrg   mtx_unlock(&dev->mutex);
876361fc4cbSmaya
8777ec681f3Smrg   struct tu_bo *vsc_bo;
8787ec681f3Smrg   uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES +
8797ec681f3Smrg                    cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES;
880361fc4cbSmaya
8817ec681f3Smrg   tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo);
882361fc4cbSmaya
8837ec681f3Smrg   tu_cs_emit_regs(cs,
8847ec681f3Smrg                   A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0));
8857ec681f3Smrg   tu_cs_emit_regs(cs,
8867ec681f3Smrg                   A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo));
8877ec681f3Smrg   tu_cs_emit_regs(cs,
8887ec681f3Smrg                   A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
8897ec681f3Smrg                                              .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES));
890361fc4cbSmaya
891361fc4cbSmaya   tu_cs_sanity_check(cs);
892361fc4cbSmaya}
893361fc4cbSmaya
894361fc4cbSmayastatic void
8957ec681f3Smrgupdate_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
896361fc4cbSmaya{
8977ec681f3Smrg   const struct tu_framebuffer *fb = cmd->state.framebuffer;
8987ec681f3Smrg
8997ec681f3Smrg   tu_cs_emit_regs(cs,
9007ec681f3Smrg                   A6XX_VSC_BIN_SIZE(.width = fb->tile0.width,
9017ec681f3Smrg                                     .height = fb->tile0.height));
9027ec681f3Smrg
9037ec681f3Smrg   tu_cs_emit_regs(cs,
9047ec681f3Smrg                   A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width,
9057ec681f3Smrg                                      .ny = fb->tile_count.height));
9067ec681f3Smrg
9077ec681f3Smrg   tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
9087ec681f3Smrg   tu_cs_emit_array(cs, fb->pipe_config, 32);
9097ec681f3Smrg
9107ec681f3Smrg   tu_cs_emit_regs(cs,
9117ec681f3Smrg                   A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
9127ec681f3Smrg                   A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD));
9137ec681f3Smrg
9147ec681f3Smrg   tu_cs_emit_regs(cs,
9157ec681f3Smrg                   A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
9167ec681f3Smrg                   A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD));
9177ec681f3Smrg}
9187ec681f3Smrg
9197ec681f3Smrgstatic void
9207ec681f3Smrgemit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
9217ec681f3Smrg{
9227ec681f3Smrg   const struct tu_framebuffer *fb = cmd->state.framebuffer;
9237ec681f3Smrg   const uint32_t used_pipe_count =
9247ec681f3Smrg      fb->pipe_count.width * fb->pipe_count.height;
9257ec681f3Smrg
9267ec681f3Smrg   for (int i = 0; i < used_pipe_count; i++) {
9277ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
9287ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
9297ec681f3Smrg            CP_COND_WRITE5_0_WRITE_MEMORY);
9307ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
9317ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
9327ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD));
9337ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
9347ec681f3Smrg      tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
9357ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch));
9367ec681f3Smrg
9377ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
9387ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
9397ec681f3Smrg            CP_COND_WRITE5_0_WRITE_MEMORY);
9407ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
9417ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
9427ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD));
9437ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
9447ec681f3Smrg      tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow));
9457ec681f3Smrg      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch));
946361fc4cbSmaya   }
947361fc4cbSmaya
9487ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
9497ec681f3Smrg}
9507ec681f3Smrg
9517ec681f3Smrgstatic void
9527ec681f3Smrgtu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
9537ec681f3Smrg{
9547ec681f3Smrg   struct tu_physical_device *phys_dev = cmd->device->physical_device;
9557ec681f3Smrg   const struct tu_framebuffer *fb = cmd->state.framebuffer;
956361fc4cbSmaya
9577ec681f3Smrg   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
958361fc4cbSmaya
9597ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
9607ec681f3Smrg   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));
961361fc4cbSmaya
9627ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
9637ec681f3Smrg   tu_cs_emit(cs, 0x1);
964361fc4cbSmaya
9657ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
9667ec681f3Smrg   tu_cs_emit(cs, 0x1);
967361fc4cbSmaya
9687ec681f3Smrg   tu_cs_emit_wfi(cs);
969361fc4cbSmaya
9707ec681f3Smrg   tu_cs_emit_regs(cs,
9717ec681f3Smrg                   A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS));
972361fc4cbSmaya
9737ec681f3Smrg   update_vsc_pipe(cmd, cs);
974361fc4cbSmaya
9757ec681f3Smrg   tu_cs_emit_regs(cs,
9767ec681f3Smrg                   A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
9777ec681f3Smrg
9787ec681f3Smrg   tu_cs_emit_regs(cs,
9797ec681f3Smrg                   A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
9807ec681f3Smrg
9817ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
9827ec681f3Smrg   tu_cs_emit(cs, UNK_2C);
9837ec681f3Smrg
9847ec681f3Smrg   tu_cs_emit_regs(cs,
9857ec681f3Smrg                   A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));
9867ec681f3Smrg
9877ec681f3Smrg   tu_cs_emit_regs(cs,
9887ec681f3Smrg                   A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));
9897ec681f3Smrg
9907ec681f3Smrg   trace_start_binning_ib(&cmd->trace, cs);
9917ec681f3Smrg
9927ec681f3Smrg   /* emit IB to binning drawcmds: */
9937ec681f3Smrg   tu_cs_emit_call(cs, &cmd->draw_cs);
9947ec681f3Smrg
9957ec681f3Smrg   trace_end_binning_ib(&cmd->trace, cs);
9967ec681f3Smrg
9977ec681f3Smrg   /* switching from binning pass to GMEM pass will cause a switch from
9987ec681f3Smrg    * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states)
9997ec681f3Smrg    * so make sure these states are re-emitted
10007ec681f3Smrg    * (eventually these states shouldn't exist at all with shader prologue)
10017ec681f3Smrg    * only VS and GS are invalidated, as FS isn't emitted in binning pass,
10027ec681f3Smrg    * and we don't use HW binning when tesselation is used
10037ec681f3Smrg    */
10047ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
10057ec681f3Smrg   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
10067ec681f3Smrg                  CP_SET_DRAW_STATE__0_DISABLE |
10077ec681f3Smrg                  CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_SHADER_GEOM_CONST));
10087ec681f3Smrg   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
10097ec681f3Smrg   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
10107ec681f3Smrg
10117ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
10127ec681f3Smrg   tu_cs_emit(cs, UNK_2D);
10137ec681f3Smrg
10147ec681f3Smrg   /* This flush is probably required because the VSC, which produces the
10157ec681f3Smrg    * visibility stream, is a client of UCHE, whereas the CP needs to read the
10167ec681f3Smrg    * visibility stream (without caching) to do draw skipping. The
10177ec681f3Smrg    * WFI+WAIT_FOR_ME combination guarantees that the binning commands
10187ec681f3Smrg    * submitted are finished before reading the VSC regs (in
10197ec681f3Smrg    * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
10207ec681f3Smrg    * part of draws).
10217ec681f3Smrg    */
10227ec681f3Smrg   tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);
10237ec681f3Smrg
10247ec681f3Smrg   tu_cs_emit_wfi(cs);
10257ec681f3Smrg
10267ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
10277ec681f3Smrg
10287ec681f3Smrg   emit_vsc_overflow_test(cmd, cs);
10297ec681f3Smrg
10307ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
10317ec681f3Smrg   tu_cs_emit(cs, 0x0);
10327ec681f3Smrg
10337ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
10347ec681f3Smrg   tu_cs_emit(cs, 0x0);
1035361fc4cbSmaya}
1036361fc4cbSmaya
10377ec681f3Smrgstatic struct tu_draw_state
10387ec681f3Smrgtu_emit_input_attachments(struct tu_cmd_buffer *cmd,
10397ec681f3Smrg                          const struct tu_subpass *subpass,
10407ec681f3Smrg                          bool gmem)
1041361fc4cbSmaya{
10427ec681f3Smrg   /* note: we can probably emit input attachments just once for the whole
10437ec681f3Smrg    * renderpass, this would avoid emitting both sysmem/gmem versions
10447ec681f3Smrg    *
10457ec681f3Smrg    * emit two texture descriptors for each input, as a workaround for
10467ec681f3Smrg    * d24s8/d32s8, which can be sampled as both float (depth) and integer (stencil)
10477ec681f3Smrg    * tu_shader lowers uint input attachment loads to use the 2nd descriptor
10487ec681f3Smrg    * in the pair
10497ec681f3Smrg    * TODO: a smarter workaround
10507ec681f3Smrg    */
10517ec681f3Smrg
10527ec681f3Smrg   if (!subpass->input_count)
10537ec681f3Smrg      return (struct tu_draw_state) {};
10547ec681f3Smrg
10557ec681f3Smrg   struct tu_cs_memory texture;
10567ec681f3Smrg   VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
10577ec681f3Smrg                                 A6XX_TEX_CONST_DWORDS, &texture);
1058361fc4cbSmaya   if (result != VK_SUCCESS) {
1059361fc4cbSmaya      cmd->record_result = result;
10607ec681f3Smrg      return (struct tu_draw_state) {};
1061361fc4cbSmaya   }
1062361fc4cbSmaya
10637ec681f3Smrg   for (unsigned i = 0; i < subpass->input_count * 2; i++) {
10647ec681f3Smrg      uint32_t a = subpass->input_attachments[i / 2].attachment;
10657ec681f3Smrg      if (a == VK_ATTACHMENT_UNUSED)
10667ec681f3Smrg         continue;
10677ec681f3Smrg
10687ec681f3Smrg      const struct tu_image_view *iview = cmd->state.attachments[a];
10697ec681f3Smrg      const struct tu_render_pass_attachment *att =
10707ec681f3Smrg         &cmd->state.pass->attachments[a];
10717ec681f3Smrg      uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
10727ec681f3Smrg      uint32_t gmem_offset = att->gmem_offset;
10737ec681f3Smrg      uint32_t cpp = att->cpp;
10747ec681f3Smrg
10757ec681f3Smrg      memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4);
10767ec681f3Smrg
10777ec681f3Smrg      if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
10787ec681f3Smrg         /* note this works because spec says fb and input attachments
10797ec681f3Smrg          * must use identity swizzle
10807ec681f3Smrg          */
10817ec681f3Smrg         dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
10827ec681f3Smrg            A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
10837ec681f3Smrg            A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
10847ec681f3Smrg         if (!cmd->device->physical_device->info->a6xx.has_z24uint_s8uint) {
10857ec681f3Smrg            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UINT) |
10867ec681f3Smrg               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_W) |
10877ec681f3Smrg               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
10887ec681f3Smrg               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
10897ec681f3Smrg               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
10907ec681f3Smrg         } else {
10917ec681f3Smrg            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UINT_S8_UINT) |
10927ec681f3Smrg               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
10937ec681f3Smrg               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
10947ec681f3Smrg               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
10957ec681f3Smrg               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
10967ec681f3Smrg         }
10977ec681f3Smrg      }
1098361fc4cbSmaya
10997ec681f3Smrg      if (i % 2 == 1 && att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
11007ec681f3Smrg         dst[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
11017ec681f3Smrg         dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
11027ec681f3Smrg         dst[2] &= ~(A6XX_TEX_CONST_2_PITCHALIGN__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
11037ec681f3Smrg         dst[2] |= A6XX_TEX_CONST_2_PITCH(iview->stencil_PITCH << 6);
11047ec681f3Smrg         dst[3] = 0;
11057ec681f3Smrg         dst[4] = iview->stencil_base_addr;
11067ec681f3Smrg         dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32;
11077ec681f3Smrg
11087ec681f3Smrg         cpp = att->samples;
11097ec681f3Smrg         gmem_offset = att->gmem_offset_stencil;
11107ec681f3Smrg      }
1111361fc4cbSmaya
11127ec681f3Smrg      if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem)
11137ec681f3Smrg         continue;
1114361fc4cbSmaya
11157ec681f3Smrg      /* patched for gmem */
11167ec681f3Smrg      dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
11177ec681f3Smrg      dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
11187ec681f3Smrg      dst[2] =
11197ec681f3Smrg         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
11207ec681f3Smrg         A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
11217ec681f3Smrg      dst[3] = 0;
11227ec681f3Smrg      dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
11237ec681f3Smrg      dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
11247ec681f3Smrg      for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
11257ec681f3Smrg         dst[i] = 0;
11267ec681f3Smrg   }
11277ec681f3Smrg
11287ec681f3Smrg   struct tu_cs cs;
11297ec681f3Smrg   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 9);
11307ec681f3Smrg
11317ec681f3Smrg   tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3);
11327ec681f3Smrg   tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
11337ec681f3Smrg                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
11347ec681f3Smrg                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
11357ec681f3Smrg                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
11367ec681f3Smrg                  CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
11377ec681f3Smrg   tu_cs_emit_qw(&cs, texture.iova);
11387ec681f3Smrg
11397ec681f3Smrg   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
11407ec681f3Smrg
11417ec681f3Smrg   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));
11427ec681f3Smrg
11437ec681f3Smrg   assert(cs.cur == cs.end); /* validate draw state size */
11447ec681f3Smrg
11457ec681f3Smrg   return ds;
1146361fc4cbSmaya}
1147361fc4cbSmaya
1148361fc4cbSmayastatic void
11497ec681f3Smrgtu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass)
1150361fc4cbSmaya{
11517ec681f3Smrg   struct tu_cs *cs = &cmd->draw_cs;
11527ec681f3Smrg
11537ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6);
11547ec681f3Smrg   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
11557ec681f3Smrg                         tu_emit_input_attachments(cmd, subpass, true));
11567ec681f3Smrg   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
11577ec681f3Smrg                         tu_emit_input_attachments(cmd, subpass, false));
11587ec681f3Smrg}
11597ec681f3Smrg
11607ec681f3Smrgstatic void
11617ec681f3Smrgtu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
11627ec681f3Smrg                         const VkRenderPassBeginInfo *info)
11637ec681f3Smrg{
11647ec681f3Smrg   struct tu_cs *cs = &cmd->draw_cs;
11657ec681f3Smrg
11667ec681f3Smrg   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
11677ec681f3Smrg
11687ec681f3Smrg   tu6_emit_blit_scissor(cmd, cs, true);
11697ec681f3Smrg
11707ec681f3Smrg   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
11717ec681f3Smrg      tu_load_gmem_attachment(cmd, cs, i, false);
11727ec681f3Smrg
11737ec681f3Smrg   tu6_emit_blit_scissor(cmd, cs, false);
11747ec681f3Smrg
11757ec681f3Smrg   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
11767ec681f3Smrg      tu_clear_gmem_attachment(cmd, cs, i, info);
11777ec681f3Smrg
11787ec681f3Smrg   tu_cond_exec_end(cs);
11797ec681f3Smrg
11807ec681f3Smrg   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
11817ec681f3Smrg
11827ec681f3Smrg   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
11837ec681f3Smrg      tu_clear_sysmem_attachment(cmd, cs, i, info);
11847ec681f3Smrg
11857ec681f3Smrg   tu_cond_exec_end(cs);
11867ec681f3Smrg}
11877ec681f3Smrg
11887ec681f3Smrgstatic void
11897ec681f3Smrgtu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
11907ec681f3Smrg{
11917ec681f3Smrg   const struct tu_framebuffer *fb = cmd->state.framebuffer;
1192361fc4cbSmaya
11937ec681f3Smrg   assert(fb->width > 0 && fb->height > 0);
11947ec681f3Smrg   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
11957ec681f3Smrg   tu6_emit_window_offset(cs, 0, 0);
1196361fc4cbSmaya
11977ec681f3Smrg   tu6_emit_bin_size(cs, 0, 0,
11987ec681f3Smrg                     A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM));
1199361fc4cbSmaya
12007ec681f3Smrg   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
12017ec681f3Smrg
12027ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
12037ec681f3Smrg   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));
12047ec681f3Smrg
12057ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
12067ec681f3Smrg   tu_cs_emit(cs, 0x0);
12077ec681f3Smrg
12087ec681f3Smrg   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);
12097ec681f3Smrg
12107ec681f3Smrg   /* enable stream-out, with sysmem there is only one pass: */
12117ec681f3Smrg   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));
12127ec681f3Smrg
12137ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
12147ec681f3Smrg   tu_cs_emit(cs, 0x1);
12157ec681f3Smrg
12167ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
12177ec681f3Smrg   tu_cs_emit(cs, 0x0);
1218361fc4cbSmaya
1219361fc4cbSmaya   tu_cs_sanity_check(cs);
1220361fc4cbSmaya}
1221361fc4cbSmaya
1222361fc4cbSmayastatic void
12237ec681f3Smrgtu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1224361fc4cbSmaya{
12257ec681f3Smrg   /* Do any resolves of the last subpass. These are handled in the
12267ec681f3Smrg    * tile_store_cs in the gmem path.
12277ec681f3Smrg    */
12287ec681f3Smrg   tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);
1229361fc4cbSmaya
12307ec681f3Smrg   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1231361fc4cbSmaya
12327ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
12337ec681f3Smrg   tu_cs_emit(cs, 0x0);
1234361fc4cbSmaya
12357ec681f3Smrg   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
12367ec681f3Smrg
12377ec681f3Smrg   tu_cs_sanity_check(cs);
1238361fc4cbSmaya}
1239361fc4cbSmaya
1240361fc4cbSmayastatic void
12417ec681f3Smrgtu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1242361fc4cbSmaya{
12437ec681f3Smrg   struct tu_physical_device *phys_dev = cmd->device->physical_device;
1244361fc4cbSmaya
12457ec681f3Smrg   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
12467ec681f3Smrg
12477ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
12487ec681f3Smrg   tu_cs_emit(cs, 0x0);
12497ec681f3Smrg
12507ec681f3Smrg   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);
1251361fc4cbSmaya
12527ec681f3Smrg   const struct tu_framebuffer *fb = cmd->state.framebuffer;
12537ec681f3Smrg   if (use_hw_binning(cmd)) {
12547ec681f3Smrg      /* enable stream-out during binning pass: */
12557ec681f3Smrg      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));
1256361fc4cbSmaya
12577ec681f3Smrg      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
12587ec681f3Smrg                        A6XX_RB_BIN_CONTROL_RENDER_MODE(BINNING_PASS) |
12597ec681f3Smrg                        A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
1260361fc4cbSmaya
12617ec681f3Smrg      tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);
12627ec681f3Smrg
12637ec681f3Smrg      tu6_emit_binning_pass(cmd, cs);
12647ec681f3Smrg
12657ec681f3Smrg      /* and disable stream-out for draw pass: */
12667ec681f3Smrg      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));
12677ec681f3Smrg
12687ec681f3Smrg      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
12697ec681f3Smrg                        A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS |
12707ec681f3Smrg                        A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
12717ec681f3Smrg
12727ec681f3Smrg      tu_cs_emit_regs(cs,
12737ec681f3Smrg                      A6XX_VFD_MODE_CNTL(0));
12747ec681f3Smrg
12757ec681f3Smrg      tu_cs_emit_regs(cs,
12767ec681f3Smrg                      A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
12777ec681f3Smrg
12787ec681f3Smrg      tu_cs_emit_regs(cs,
12797ec681f3Smrg                      A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));
12807ec681f3Smrg
12817ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
12827ec681f3Smrg      tu_cs_emit(cs, 0x1);
12837ec681f3Smrg   } else {
12847ec681f3Smrg      /* no binning pass, so enable stream-out for draw pass:: */
12857ec681f3Smrg      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));
12867ec681f3Smrg
12877ec681f3Smrg      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
12887ec681f3Smrg                        A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
1289361fc4cbSmaya   }
12907ec681f3Smrg
12917ec681f3Smrg   tu_cs_sanity_check(cs);
1292361fc4cbSmaya}
1293361fc4cbSmaya
1294361fc4cbSmayastatic void
12957ec681f3Smrgtu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1296361fc4cbSmaya{
12977ec681f3Smrg   tu_cs_emit_call(cs, &cmd->draw_cs);
1298361fc4cbSmaya
12997ec681f3Smrg   if (use_hw_binning(cmd)) {
13007ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
13017ec681f3Smrg      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1302361fc4cbSmaya   }
1303361fc4cbSmaya
13047ec681f3Smrg   tu_cs_emit_call(cs, &cmd->tile_store_cs);
1305361fc4cbSmaya
13067ec681f3Smrg   if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) {
13077ec681f3Smrg      tu_cs_emit_wfi(cs);
13087ec681f3Smrg      tu_cs_emit_pkt7(&cmd->cs, CP_WAIT_FOR_ME, 0);
13097ec681f3Smrg      u_trace_clone_append(cmd->trace_renderpass_start,
13107ec681f3Smrg                           cmd->trace_renderpass_end,
13117ec681f3Smrg                           &cmd->trace,
13127ec681f3Smrg                           cs, tu_copy_timestamp_buffer);
13137ec681f3Smrg   }
13147ec681f3Smrg
13157ec681f3Smrg   tu_cs_sanity_check(cs);
1316361fc4cbSmaya}
1317361fc4cbSmaya
1318361fc4cbSmayastatic void
13197ec681f3Smrgtu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1320361fc4cbSmaya{
13217ec681f3Smrg   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);
1322361fc4cbSmaya
13237ec681f3Smrg   tu_cs_emit_regs(cs,
13247ec681f3Smrg                   A6XX_GRAS_LRZ_CNTL(0));
1325361fc4cbSmaya
13267ec681f3Smrg   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);
1327361fc4cbSmaya
13287ec681f3Smrg   tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);
1329361fc4cbSmaya
13307ec681f3Smrg   tu_cs_sanity_check(cs);
13317ec681f3Smrg}
1332361fc4cbSmaya
/* Execute the render pass in GMEM (tiled) mode: walk every visibility pipe
 * and, within each pipe, every tile slot, rendering the draw IB once per
 * tile.  The pipe/slot iteration order must match the layout programmed
 * during binning.
 */
static void
tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu6_tile_render_begin(cmd, &cmd->cs);

   /* Outer loops: visibility pipes laid out in a pipe_count.width x
    * pipe_count.height grid, each covering pipe0.width x pipe0.height tiles.
    */
   uint32_t pipe = 0;
   for (uint32_t py = 0; py < fb->pipe_count.height; py++) {
      for (uint32_t px = 0; px < fb->pipe_count.width; px++, pipe++) {
         uint32_t tx1 = px * fb->pipe0.width;
         uint32_t ty1 = py * fb->pipe0.height;
         /* Edge pipes may be clipped by the framebuffer's tile count. */
         uint32_t tx2 = MIN2(tx1 + fb->pipe0.width, fb->tile_count.width);
         uint32_t ty2 = MIN2(ty1 + fb->pipe0.height, fb->tile_count.height);
         uint32_t slot = 0;
         for (uint32_t ty = ty1; ty < ty2; ty++) {
            for (uint32_t tx = tx1; tx < tx2; tx++, slot++) {
               tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);

               trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
               tu6_render_tile(cmd, &cmd->cs);
               trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs);
            }
         }
      }
   }

   tu6_tile_render_end(cmd, &cmd->cs);

   trace_end_render_pass(&cmd->trace, &cmd->cs, fb);

   /* Per-tile clones of the renderpass tracepoints were emitted by
    * tu6_render_tile(); disable the originals so they are not reported twice.
    */
   if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end))
      u_trace_disable_event_range(cmd->trace_renderpass_start,
                                  cmd->trace_renderpass_end);
}
1368361fc4cbSmaya
/* Execute the render pass in sysmem (non-tiled) mode: a single replay of the
 * draw IB, bracketed by sysmem begin/end setup.
 */
static void
tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
{
   tu6_sysmem_render_begin(cmd, &cmd->cs);

   trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs);

   /* Unlike the GMEM path, the draw IB runs exactly once. */
   tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);

   trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);

   tu6_sysmem_render_end(cmd, &cmd->cs);

   trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer);
}
1384361fc4cbSmaya
1385361fc4cbSmayastatic VkResult
1386361fc4cbSmayatu_create_cmd_buffer(struct tu_device *device,
1387361fc4cbSmaya                     struct tu_cmd_pool *pool,
1388361fc4cbSmaya                     VkCommandBufferLevel level,
1389361fc4cbSmaya                     VkCommandBuffer *pCommandBuffer)
1390361fc4cbSmaya{
1391361fc4cbSmaya   struct tu_cmd_buffer *cmd_buffer;
13927ec681f3Smrg
13937ec681f3Smrg   cmd_buffer = vk_zalloc2(&device->vk.alloc, NULL, sizeof(*cmd_buffer), 8,
13947ec681f3Smrg                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
13957ec681f3Smrg
1396361fc4cbSmaya   if (cmd_buffer == NULL)
13977ec681f3Smrg      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
13987ec681f3Smrg
13997ec681f3Smrg   VkResult result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
14007ec681f3Smrg   if (result != VK_SUCCESS) {
14017ec681f3Smrg      vk_free2(&device->vk.alloc, NULL, cmd_buffer);
14027ec681f3Smrg      return result;
14037ec681f3Smrg   }
1404361fc4cbSmaya
1405361fc4cbSmaya   cmd_buffer->device = device;
1406361fc4cbSmaya   cmd_buffer->pool = pool;
1407361fc4cbSmaya   cmd_buffer->level = level;
1408361fc4cbSmaya
1409361fc4cbSmaya   if (pool) {
1410361fc4cbSmaya      list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
1411361fc4cbSmaya      cmd_buffer->queue_family_index = pool->queue_family_index;
1412361fc4cbSmaya
1413361fc4cbSmaya   } else {
1414361fc4cbSmaya      /* Init the pool_link so we can safely call list_del when we destroy
1415361fc4cbSmaya       * the command buffer
1416361fc4cbSmaya       */
1417361fc4cbSmaya      list_inithead(&cmd_buffer->pool_link);
1418361fc4cbSmaya      cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
1419361fc4cbSmaya   }
1420361fc4cbSmaya
14217ec681f3Smrg   u_trace_init(&cmd_buffer->trace, &device->trace_context);
1422361fc4cbSmaya
14237ec681f3Smrg   tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
14247ec681f3Smrg   tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
14257ec681f3Smrg   tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048);
14267ec681f3Smrg   tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
14277ec681f3Smrg   tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
1428361fc4cbSmaya
14297ec681f3Smrg   *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);
1430361fc4cbSmaya
1431361fc4cbSmaya   return VK_SUCCESS;
1432361fc4cbSmaya}
1433361fc4cbSmaya
1434361fc4cbSmayastatic void
1435361fc4cbSmayatu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1436361fc4cbSmaya{
1437361fc4cbSmaya   list_del(&cmd_buffer->pool_link);
1438361fc4cbSmaya
14397ec681f3Smrg   tu_cs_finish(&cmd_buffer->cs);
14407ec681f3Smrg   tu_cs_finish(&cmd_buffer->draw_cs);
14417ec681f3Smrg   tu_cs_finish(&cmd_buffer->tile_store_cs);
14427ec681f3Smrg   tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
14437ec681f3Smrg   tu_cs_finish(&cmd_buffer->sub_cs);
1444361fc4cbSmaya
14457ec681f3Smrg   u_trace_fini(&cmd_buffer->trace);
1446361fc4cbSmaya
14477ec681f3Smrg   vk_command_buffer_finish(&cmd_buffer->vk);
14487ec681f3Smrg   vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->pool->alloc,
14497ec681f3Smrg            cmd_buffer);
1450361fc4cbSmaya}
1451361fc4cbSmaya
1452361fc4cbSmayastatic VkResult
1453361fc4cbSmayatu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
1454361fc4cbSmaya{
14557ec681f3Smrg   vk_command_buffer_reset(&cmd_buffer->vk);
1456361fc4cbSmaya
1457361fc4cbSmaya   cmd_buffer->record_result = VK_SUCCESS;
1458361fc4cbSmaya
14597ec681f3Smrg   tu_cs_reset(&cmd_buffer->cs);
14607ec681f3Smrg   tu_cs_reset(&cmd_buffer->draw_cs);
14617ec681f3Smrg   tu_cs_reset(&cmd_buffer->tile_store_cs);
14627ec681f3Smrg   tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
14637ec681f3Smrg   tu_cs_reset(&cmd_buffer->sub_cs);
1464361fc4cbSmaya
14657ec681f3Smrg   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
14667ec681f3Smrg      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
14677ec681f3Smrg      memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
14687ec681f3Smrg      cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET;
1469361fc4cbSmaya   }
1470361fc4cbSmaya
14717ec681f3Smrg   u_trace_fini(&cmd_buffer->trace);
14727ec681f3Smrg   u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->trace_context);
14737ec681f3Smrg
1474361fc4cbSmaya   cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
1475361fc4cbSmaya
1476361fc4cbSmaya   return cmd_buffer->record_result;
1477361fc4cbSmaya}
1478361fc4cbSmaya
/* vkAllocateCommandBuffers: allocate commandBufferCount command buffers from
 * the pool, recycling buffers from the pool's free list when possible.  On
 * any failure, every successfully created buffer is released and the output
 * array is zeroed, as the spec requires.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_AllocateCommandBuffers(VkDevice _device,
                          const VkCommandBufferAllocateInfo *pAllocateInfo,
                          VkCommandBuffer *pCommandBuffers)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {

      if (!list_is_empty(&pool->free_cmd_buffers)) {
         /* Recycle a previously freed command buffer from the pool. */
         struct tu_cmd_buffer *cmd_buffer = list_first_entry(
            &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);

         list_del(&cmd_buffer->pool_link);
         list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

         result = tu_reset_cmd_buffer(cmd_buffer);
         cmd_buffer->level = pAllocateInfo->level;
         /* Re-create the base vk object so the recycled buffer is
          * indistinguishable from a freshly allocated one.  A reset error
          * is preserved: init_result only overwrites result on failure.
          */
         vk_command_buffer_finish(&cmd_buffer->vk);
         VkResult init_result =
            vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
         if (init_result != VK_SUCCESS)
            result = init_result;

         pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
      } else {
         result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
                                       &pCommandBuffers[i]);
      }
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      /* i is the count of buffers created before the failure. */
      tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
                            pCommandBuffers);

      /* From the Vulkan 1.0.66 spec:
       *
       * "vkAllocateCommandBuffers can be used to create multiple
       *  command buffers. If the creation of any of those command
       *  buffers fails, the implementation must destroy all
       *  successfully created command buffer objects from this
       *  command, set all entries of the pCommandBuffers array to
       *  NULL and return the error."
       */
      memset(pCommandBuffers, 0,
             sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
   }

   return result;
}
1535361fc4cbSmaya
15367ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
1537361fc4cbSmayatu_FreeCommandBuffers(VkDevice device,
1538361fc4cbSmaya                      VkCommandPool commandPool,
1539361fc4cbSmaya                      uint32_t commandBufferCount,
1540361fc4cbSmaya                      const VkCommandBuffer *pCommandBuffers)
1541361fc4cbSmaya{
1542361fc4cbSmaya   for (uint32_t i = 0; i < commandBufferCount; i++) {
1543361fc4cbSmaya      TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
1544361fc4cbSmaya
1545361fc4cbSmaya      if (cmd_buffer) {
1546361fc4cbSmaya         if (cmd_buffer->pool) {
1547361fc4cbSmaya            list_del(&cmd_buffer->pool_link);
1548361fc4cbSmaya            list_addtail(&cmd_buffer->pool_link,
1549361fc4cbSmaya                         &cmd_buffer->pool->free_cmd_buffers);
1550361fc4cbSmaya         } else
1551361fc4cbSmaya            tu_cmd_buffer_destroy(cmd_buffer);
1552361fc4cbSmaya      }
1553361fc4cbSmaya   }
1554361fc4cbSmaya}
1555361fc4cbSmaya
15567ec681f3SmrgVKAPI_ATTR VkResult VKAPI_CALL
1557361fc4cbSmayatu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
1558361fc4cbSmaya                      VkCommandBufferResetFlags flags)
1559361fc4cbSmaya{
1560361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1561361fc4cbSmaya   return tu_reset_cmd_buffer(cmd_buffer);
1562361fc4cbSmaya}
1563361fc4cbSmaya
/* Initialize the cache, assuming all necessary flushes have happened but *not*
 * invalidations.
 */
static void
tu_cache_init(struct tu_cache_state *cache)
{
   /* Nothing is dirty yet... */
   cache->flush_bits = 0;
   /* ...but cached contents may be stale, so every cache must be invalidated
    * before its first read.
    */
   cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
}
15737ec681f3Smrg
15747ec681f3SmrgVKAPI_ATTR VkResult VKAPI_CALL
1575361fc4cbSmayatu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
1576361fc4cbSmaya                      const VkCommandBufferBeginInfo *pBeginInfo)
1577361fc4cbSmaya{
1578361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1579361fc4cbSmaya   VkResult result = VK_SUCCESS;
1580361fc4cbSmaya
1581361fc4cbSmaya   if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
1582361fc4cbSmaya      /* If the command buffer has already been resetted with
1583361fc4cbSmaya       * vkResetCommandBuffer, no need to do it again.
1584361fc4cbSmaya       */
1585361fc4cbSmaya      result = tu_reset_cmd_buffer(cmd_buffer);
1586361fc4cbSmaya      if (result != VK_SUCCESS)
1587361fc4cbSmaya         return result;
1588361fc4cbSmaya   }
1589361fc4cbSmaya
1590361fc4cbSmaya   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
15917ec681f3Smrg   cmd_buffer->state.index_size = 0xff; /* dirty restart index */
15927ec681f3Smrg   cmd_buffer->state.line_mode = RECTANGULAR;
15937ec681f3Smrg
15947ec681f3Smrg   tu_cache_init(&cmd_buffer->state.cache);
15957ec681f3Smrg   tu_cache_init(&cmd_buffer->state.renderpass_cache);
1596361fc4cbSmaya   cmd_buffer->usage_flags = pBeginInfo->flags;
1597361fc4cbSmaya
1598361fc4cbSmaya   tu_cs_begin(&cmd_buffer->cs);
15997ec681f3Smrg   tu_cs_begin(&cmd_buffer->draw_cs);
16007ec681f3Smrg   tu_cs_begin(&cmd_buffer->tile_store_cs);
16017ec681f3Smrg   tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
1602361fc4cbSmaya
1603361fc4cbSmaya   /* setup initial configuration into command buffer */
1604361fc4cbSmaya   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1605361fc4cbSmaya      switch (cmd_buffer->queue_family_index) {
1606361fc4cbSmaya      case TU_QUEUE_GENERAL:
1607361fc4cbSmaya         tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
1608361fc4cbSmaya         break;
1609361fc4cbSmaya      default:
1610361fc4cbSmaya         break;
1611361fc4cbSmaya      }
16127ec681f3Smrg   } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
16137ec681f3Smrg      assert(pBeginInfo->pInheritanceInfo);
16147ec681f3Smrg
16157ec681f3Smrg      vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) {
16167ec681f3Smrg         switch (ext->sType) {
16177ec681f3Smrg         case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
16187ec681f3Smrg            const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend = (void *) ext;
16197ec681f3Smrg            cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable;
16207ec681f3Smrg            break;
16217ec681f3Smrg         default:
16227ec681f3Smrg            break;
16237ec681f3Smrg         }
16247ec681f3Smrg         }
16257ec681f3Smrg      }
16267ec681f3Smrg
16277ec681f3Smrg      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
16287ec681f3Smrg         cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
16297ec681f3Smrg         cmd_buffer->state.subpass =
16307ec681f3Smrg            &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
16317ec681f3Smrg      } else {
16327ec681f3Smrg         /* When executing in the middle of another command buffer, the CCU
16337ec681f3Smrg          * state is unknown.
16347ec681f3Smrg          */
16357ec681f3Smrg         cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
16367ec681f3Smrg      }
1637361fc4cbSmaya   }
1638361fc4cbSmaya
1639361fc4cbSmaya   cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
1640361fc4cbSmaya
1641361fc4cbSmaya   return VK_SUCCESS;
1642361fc4cbSmaya}
1643361fc4cbSmaya
/* vkCmdBindVertexBuffers: thin wrapper that forwards to the EXT2 entry point
 * with no per-binding sizes or strides (NULL pSizes/pStrides).
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
                        uint32_t firstBinding,
                        uint32_t bindingCount,
                        const VkBuffer *pBuffers,
                        const VkDeviceSize *pOffsets)
{
   tu_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount,
                               pBuffers, pOffsets, NULL, NULL);
}
1654361fc4cbSmaya
/* vkCmdBindVertexBuffers2EXT: record the bound vertex buffers into CPU-side
 * state and emit a fresh VFD_FETCH draw state (plus, when dynamic strides are
 * supplied, a VB-stride draw state) covering all MAX_VBS slots.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer,
                            uint32_t firstBinding,
                            uint32_t bindingCount,
                            const VkBuffer* pBuffers,
                            const VkDeviceSize* pOffsets,
                            const VkDeviceSize* pSizes,
                            const VkDeviceSize* pStrides)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs;
   /* TODO: track a "max_vb" value for the cmdbuf to save a bit of memory  */
   cmd->state.vertex_buffers.iova = tu_cs_draw_state(&cmd->sub_cs, &cs, 4 * MAX_VBS).iova;

   /* Update the CPU-side shadow state for the touched bindings. */
   for (uint32_t i = 0; i < bindingCount; i++) {
      if (pBuffers[i] == VK_NULL_HANDLE) {
         cmd->state.vb[firstBinding + i].base = 0;
         cmd->state.vb[firstBinding + i].size = 0;
      } else {
         struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
         cmd->state.vb[firstBinding + i].base = tu_buffer_iova(buf) + pOffsets[i];
         cmd->state.vb[firstBinding + i].size = pSizes ? pSizes[i] : (buf->size - pOffsets[i]);
      }

      if (pStrides)
         cmd->state.vb[firstBinding + i].stride = pStrides[i];
   }

   /* Re-emit base/size for every slot, since the draw state was reallocated. */
   for (uint32_t i = 0; i < MAX_VBS; i++) {
      tu_cs_emit_regs(&cs,
                      A6XX_VFD_FETCH_BASE(i, .qword = cmd->state.vb[i].base),
                      A6XX_VFD_FETCH_SIZE(i, cmd->state.vb[i].size));
   }

   cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;

   if (pStrides) {
      /* Dynamic strides get their own draw state. */
      cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].iova =
         tu_cs_draw_state(&cmd->sub_cs, &cs, 2 * MAX_VBS).iova;

      for (uint32_t i = 0; i < MAX_VBS; i++)
         tu_cs_emit_regs(&cs, A6XX_VFD_FETCH_STRIDE(i, cmd->state.vb[i].stride));

      cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
   }
}
1701361fc4cbSmaya
17027ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
1703361fc4cbSmayatu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
1704361fc4cbSmaya                      VkBuffer buffer,
1705361fc4cbSmaya                      VkDeviceSize offset,
1706361fc4cbSmaya                      VkIndexType indexType)
1707361fc4cbSmaya{
1708361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
1709361fc4cbSmaya   TU_FROM_HANDLE(tu_buffer, buf, buffer);
1710361fc4cbSmaya
1711361fc4cbSmaya
1712361fc4cbSmaya
17137ec681f3Smrg   uint32_t index_size, index_shift, restart_index;
17147ec681f3Smrg
17157ec681f3Smrg   switch (indexType) {
17167ec681f3Smrg   case VK_INDEX_TYPE_UINT16:
17177ec681f3Smrg      index_size = INDEX4_SIZE_16_BIT;
17187ec681f3Smrg      index_shift = 1;
17197ec681f3Smrg      restart_index = 0xffff;
17207ec681f3Smrg      break;
17217ec681f3Smrg   case VK_INDEX_TYPE_UINT32:
17227ec681f3Smrg      index_size = INDEX4_SIZE_32_BIT;
17237ec681f3Smrg      index_shift = 2;
17247ec681f3Smrg      restart_index = 0xffffffff;
17257ec681f3Smrg      break;
17267ec681f3Smrg   case VK_INDEX_TYPE_UINT8_EXT:
17277ec681f3Smrg      index_size = INDEX4_SIZE_8_BIT;
17287ec681f3Smrg      index_shift = 0;
17297ec681f3Smrg      restart_index = 0xff;
17307ec681f3Smrg      break;
17317ec681f3Smrg   default:
17327ec681f3Smrg      unreachable("invalid VkIndexType");
1733361fc4cbSmaya   }
1734361fc4cbSmaya
17357ec681f3Smrg   /* initialize/update the restart index */
17367ec681f3Smrg   if (cmd->state.index_size != index_size)
17377ec681f3Smrg      tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index));
17387ec681f3Smrg
17397ec681f3Smrg   assert(buf->size >= offset);
1740361fc4cbSmaya
17417ec681f3Smrg   cmd->state.index_va = buf->bo->iova + buf->bo_offset + offset;
17427ec681f3Smrg   cmd->state.max_index_count = (buf->size - offset) >> index_shift;
17437ec681f3Smrg   cmd->state.index_size = index_size;
1744361fc4cbSmaya}
1745361fc4cbSmaya
/* vkCmdBindDescriptorSets: record the bound sets, patch dynamic UBO/SSBO
 * descriptors with their dynamic offsets into a shadow copy, then (re)emit
 * the bindless base registers for either the graphics or compute bind point.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                         VkPipelineBindPoint pipelineBindPoint,
                         VkPipelineLayout _layout,
                         uint32_t firstSet,
                         uint32_t descriptorSetCount,
                         const VkDescriptorSet *pDescriptorSets,
                         uint32_t dynamicOffsetCount,
                         const uint32_t *pDynamicOffsets)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
   unsigned dyn_idx = 0;

   struct tu_descriptor_state *descriptors_state =
      tu_get_descriptors_state(cmd, pipelineBindPoint);

   for (unsigned i = 0; i < descriptorSetCount; ++i) {
      unsigned idx = i + firstSet;
      TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);

      descriptors_state->sets[idx] = set;

      for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
         /* update the contents of the dynamic descriptor set */
         unsigned src_idx = j;
         unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
         assert(dyn_idx < dynamicOffsetCount);

         uint32_t *dst =
            &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
         uint32_t *src =
            &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
         uint32_t offset = pDynamicOffsets[dyn_idx];

         /* Patch the storage/uniform descriptors right away. */
         if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
            /* Dynamic UBO: address lives in dwords 0/1 of the descriptor.
             * Note: we can assume here that the addition won't roll over and
             * change the SIZE field.
             */
            uint64_t va = src[0] | ((uint64_t)src[1] << 32);
            va += offset;
            dst[0] = va;
            dst[1] = va >> 32;
         } else {
            /* Dynamic SSBO: address lives in dwords 4/5 of the IBO
             * descriptor; copy the whole descriptor, then patch it.
             */
            memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
            /* Note: A6XX_IBO_5_DEPTH is always 0 */
            uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
            va += offset;
            dst[4] = va;
            dst[5] = va >> 32;
         }
      }
   }
   assert(dyn_idx == dynamicOffsetCount);

   uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value;
   uint64_t addr[MAX_SETS + 1] = {};
   struct tu_cs *cs, state_cs;

   /* Gather the GPU addresses of all currently-bound sets; the low bits
    * appear to carry descriptor-size flags (0x3) -- see set->va | 3 below.
    */
   for (uint32_t i = 0; i < MAX_SETS; i++) {
      struct tu_descriptor_set *set = descriptors_state->sets[i];
      if (set)
         addr[i] = set->va | 3;
   }

   if (layout->dynamic_offset_count) {
      /* allocate and fill out dynamic descriptor set */
      struct tu_cs_memory dynamic_desc_set;
      VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_count,
                                    A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
      if (result != VK_SUCCESS) {
         cmd->record_result = result;
         return;
      }

      memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
             layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
      addr[MAX_SETS] = dynamic_desc_set.iova | 3;
   }

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
      hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
      hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f);

      /* Graphics descriptor bases are emitted through a draw state (24
       * dwords: 2 pkt4 headers + 2x10 payload + the invalidate reg write).
       */
      cmd->state.desc_sets = tu_cs_draw_state(&cmd->sub_cs, &state_cs, 24);
      cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS;
      cs = &state_cs;
   } else {
      assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);

      sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
      hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
      hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f);

      /* Compute bases go straight into the main command stream. */
      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
      cs = &cmd->cs;
   }

   /* 5 sets + dynamic set = 6 64-bit addresses; only the first 5 (10 dwords)
    * fit the BINDLESS_BASE register block emitted here.
    */
   tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 10);
   tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
   tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 10);
   tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value));

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      assert(cs->cur == cs->end); /* validate draw state size */
      /* note: this also avoids emitting draw states before renderpass clears,
       * which may use the 3D clear path (for MSAA cases)
       */
      if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
         tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
         tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
      }
   }
}
1863361fc4cbSmaya
18647ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
18657ec681f3Smrgtu_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer,
18667ec681f3Smrg                           VkPipelineBindPoint pipelineBindPoint,
18677ec681f3Smrg                           VkPipelineLayout _layout,
18687ec681f3Smrg                           uint32_t _set,
18697ec681f3Smrg                           uint32_t descriptorWriteCount,
18707ec681f3Smrg                           const VkWriteDescriptorSet *pDescriptorWrites)
18717ec681f3Smrg{
18727ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
18737ec681f3Smrg   TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
18747ec681f3Smrg   struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
18757ec681f3Smrg   struct tu_descriptor_set *set =
18767ec681f3Smrg      &tu_get_descriptors_state(cmd, pipelineBindPoint)->push_set;
18777ec681f3Smrg
18787ec681f3Smrg   struct tu_cs_memory set_mem;
18797ec681f3Smrg   VkResult result = tu_cs_alloc(&cmd->sub_cs,
18807ec681f3Smrg                                 DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
18817ec681f3Smrg                                 A6XX_TEX_CONST_DWORDS, &set_mem);
18827ec681f3Smrg   if (result != VK_SUCCESS) {
18837ec681f3Smrg      cmd->record_result = result;
18847ec681f3Smrg      return;
18857ec681f3Smrg   }
18867ec681f3Smrg
18877ec681f3Smrg   /* preserve previous content if the layout is the same: */
18887ec681f3Smrg   if (set->layout == layout)
18897ec681f3Smrg      memcpy(set_mem.map, set->mapped_ptr, layout->size);
18907ec681f3Smrg
18917ec681f3Smrg   set->layout = layout;
18927ec681f3Smrg   set->mapped_ptr = set_mem.map;
18937ec681f3Smrg   set->va = set_mem.iova;
18947ec681f3Smrg
18957ec681f3Smrg   tu_update_descriptor_sets(cmd->device, tu_descriptor_set_to_handle(set),
18967ec681f3Smrg                             descriptorWriteCount, pDescriptorWrites, 0, NULL);
18977ec681f3Smrg
18987ec681f3Smrg   tu_CmdBindDescriptorSets(commandBuffer, pipelineBindPoint, _layout, _set,
18997ec681f3Smrg                            1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
19007ec681f3Smrg                            0, NULL);
19017ec681f3Smrg}
19027ec681f3Smrg
19037ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
19047ec681f3Smrgtu_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
19057ec681f3Smrg                                       VkDescriptorUpdateTemplate descriptorUpdateTemplate,
19067ec681f3Smrg                                       VkPipelineLayout _layout,
19077ec681f3Smrg                                       uint32_t _set,
19087ec681f3Smrg                                       const void* pData)
19097ec681f3Smrg{
19107ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
19117ec681f3Smrg   TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
19127ec681f3Smrg   TU_FROM_HANDLE(tu_descriptor_update_template, templ, descriptorUpdateTemplate);
19137ec681f3Smrg   struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
19147ec681f3Smrg   struct tu_descriptor_set *set =
19157ec681f3Smrg      &tu_get_descriptors_state(cmd, templ->bind_point)->push_set;
19167ec681f3Smrg
19177ec681f3Smrg   struct tu_cs_memory set_mem;
19187ec681f3Smrg   VkResult result = tu_cs_alloc(&cmd->sub_cs,
19197ec681f3Smrg                                 DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
19207ec681f3Smrg                                 A6XX_TEX_CONST_DWORDS, &set_mem);
19217ec681f3Smrg   if (result != VK_SUCCESS) {
19227ec681f3Smrg      cmd->record_result = result;
19237ec681f3Smrg      return;
19247ec681f3Smrg   }
19257ec681f3Smrg
19267ec681f3Smrg   /* preserve previous content if the layout is the same: */
19277ec681f3Smrg   if (set->layout == layout)
19287ec681f3Smrg      memcpy(set_mem.map, set->mapped_ptr, layout->size);
19297ec681f3Smrg
19307ec681f3Smrg   set->layout = layout;
19317ec681f3Smrg   set->mapped_ptr = set_mem.map;
19327ec681f3Smrg   set->va = set_mem.iova;
19337ec681f3Smrg
19347ec681f3Smrg   tu_update_descriptor_set_with_template(cmd->device, set, descriptorUpdateTemplate, pData);
19357ec681f3Smrg
19367ec681f3Smrg   tu_CmdBindDescriptorSets(commandBuffer, templ->bind_point, _layout, _set,
19377ec681f3Smrg                            1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
19387ec681f3Smrg                            0, NULL);
19397ec681f3Smrg}
19407ec681f3Smrg
19417ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
19427ec681f3Smrgtu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
19437ec681f3Smrg                                      uint32_t firstBinding,
19447ec681f3Smrg                                      uint32_t bindingCount,
19457ec681f3Smrg                                      const VkBuffer *pBuffers,
19467ec681f3Smrg                                      const VkDeviceSize *pOffsets,
19477ec681f3Smrg                                      const VkDeviceSize *pSizes)
19487ec681f3Smrg{
19497ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
19507ec681f3Smrg   struct tu_cs *cs = &cmd->draw_cs;
19517ec681f3Smrg
19527ec681f3Smrg   /* using COND_REG_EXEC for xfb commands matches the blob behavior
19537ec681f3Smrg    * presumably there isn't any benefit using a draw state when the
19547ec681f3Smrg    * condition is (SYSMEM | BINNING)
19557ec681f3Smrg    */
19567ec681f3Smrg   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
19577ec681f3Smrg                          CP_COND_REG_EXEC_0_SYSMEM |
19587ec681f3Smrg                          CP_COND_REG_EXEC_0_BINNING);
19597ec681f3Smrg
19607ec681f3Smrg   for (uint32_t i = 0; i < bindingCount; i++) {
19617ec681f3Smrg      TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
19627ec681f3Smrg      uint64_t iova = buf->bo->iova + pOffsets[i];
19637ec681f3Smrg      uint32_t size = buf->bo->size - pOffsets[i];
19647ec681f3Smrg      uint32_t idx = i + firstBinding;
19657ec681f3Smrg
19667ec681f3Smrg      if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
19677ec681f3Smrg         size = pSizes[i];
19687ec681f3Smrg
19697ec681f3Smrg      /* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */
19707ec681f3Smrg      uint32_t offset = iova & 0x1f;
19717ec681f3Smrg      iova &= ~(uint64_t) 0x1f;
19727ec681f3Smrg
19737ec681f3Smrg      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);
19747ec681f3Smrg      tu_cs_emit_qw(cs, iova);
19757ec681f3Smrg      tu_cs_emit(cs, size + offset);
19767ec681f3Smrg
19777ec681f3Smrg      cmd->state.streamout_offset[idx] = offset;
19787ec681f3Smrg   }
19797ec681f3Smrg
19807ec681f3Smrg   tu_cond_exec_end(cs);
19817ec681f3Smrg}
19827ec681f3Smrg
19837ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
19847ec681f3Smrgtu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
19857ec681f3Smrg                                uint32_t firstCounterBuffer,
19867ec681f3Smrg                                uint32_t counterBufferCount,
19877ec681f3Smrg                                const VkBuffer *pCounterBuffers,
19887ec681f3Smrg                                const VkDeviceSize *pCounterBufferOffsets)
19897ec681f3Smrg{
19907ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
19917ec681f3Smrg   struct tu_cs *cs = &cmd->draw_cs;
19927ec681f3Smrg
19937ec681f3Smrg   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
19947ec681f3Smrg                          CP_COND_REG_EXEC_0_SYSMEM |
19957ec681f3Smrg                          CP_COND_REG_EXEC_0_BINNING);
19967ec681f3Smrg
19977ec681f3Smrg   /* TODO: only update offset for active buffers */
19987ec681f3Smrg   for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)
19997ec681f3Smrg      tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));
20007ec681f3Smrg
20017ec681f3Smrg   for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
20027ec681f3Smrg      uint32_t idx = firstCounterBuffer + i;
20037ec681f3Smrg      uint32_t offset = cmd->state.streamout_offset[idx];
20047ec681f3Smrg      uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;
20057ec681f3Smrg
20067ec681f3Smrg      if (!pCounterBuffers[i])
20077ec681f3Smrg         continue;
20087ec681f3Smrg
20097ec681f3Smrg      TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
20107ec681f3Smrg
20117ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
20127ec681f3Smrg      tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
20137ec681f3Smrg                     CP_MEM_TO_REG_0_UNK31 |
20147ec681f3Smrg                     CP_MEM_TO_REG_0_CNT(1));
20157ec681f3Smrg      tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset);
20167ec681f3Smrg
20177ec681f3Smrg      if (offset) {
20187ec681f3Smrg         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
20197ec681f3Smrg         tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
20207ec681f3Smrg                        CP_REG_RMW_0_SRC1_ADD);
20217ec681f3Smrg         tu_cs_emit(cs, 0xffffffff);
20227ec681f3Smrg         tu_cs_emit(cs, offset);
20237ec681f3Smrg      }
20247ec681f3Smrg   }
20257ec681f3Smrg
20267ec681f3Smrg   tu_cond_exec_end(cs);
20277ec681f3Smrg}
20287ec681f3Smrg
20297ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
20307ec681f3Smrgtu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
20317ec681f3Smrg                              uint32_t firstCounterBuffer,
20327ec681f3Smrg                              uint32_t counterBufferCount,
20337ec681f3Smrg                              const VkBuffer *pCounterBuffers,
20347ec681f3Smrg                              const VkDeviceSize *pCounterBufferOffsets)
20357ec681f3Smrg{
20367ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
20377ec681f3Smrg   struct tu_cs *cs = &cmd->draw_cs;
20387ec681f3Smrg
20397ec681f3Smrg   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
20407ec681f3Smrg                          CP_COND_REG_EXEC_0_SYSMEM |
20417ec681f3Smrg                          CP_COND_REG_EXEC_0_BINNING);
20427ec681f3Smrg
20437ec681f3Smrg   /* TODO: only flush buffers that need to be flushed */
20447ec681f3Smrg   for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
20457ec681f3Smrg      /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
20467ec681f3Smrg      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
20477ec681f3Smrg      tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i]));
20487ec681f3Smrg      tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
20497ec681f3Smrg   }
20507ec681f3Smrg
20517ec681f3Smrg   for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
20527ec681f3Smrg      uint32_t idx = firstCounterBuffer + i;
20537ec681f3Smrg      uint32_t offset = cmd->state.streamout_offset[idx];
20547ec681f3Smrg      uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;
20557ec681f3Smrg
20567ec681f3Smrg      if (!pCounterBuffers[i])
20577ec681f3Smrg         continue;
20587ec681f3Smrg
20597ec681f3Smrg      TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);
20607ec681f3Smrg
20617ec681f3Smrg      /* VPC_SO_FLUSH_BASE has dwords counter, but counter should be in bytes */
20627ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
20637ec681f3Smrg      tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
20647ec681f3Smrg                     CP_MEM_TO_REG_0_SHIFT_BY_2 |
20657ec681f3Smrg                     0x40000 | /* ??? */
20667ec681f3Smrg                     CP_MEM_TO_REG_0_UNK31 |
20677ec681f3Smrg                     CP_MEM_TO_REG_0_CNT(1));
20687ec681f3Smrg      tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx]));
20697ec681f3Smrg
20707ec681f3Smrg      if (offset) {
20717ec681f3Smrg         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
20727ec681f3Smrg         tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
20737ec681f3Smrg                        CP_REG_RMW_0_SRC1_ADD);
20747ec681f3Smrg         tu_cs_emit(cs, 0xffffffff);
20757ec681f3Smrg         tu_cs_emit(cs, -offset);
20767ec681f3Smrg      }
20777ec681f3Smrg
20787ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
20797ec681f3Smrg      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
20807ec681f3Smrg                     CP_REG_TO_MEM_0_CNT(1));
20817ec681f3Smrg      tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset);
20827ec681f3Smrg   }
20837ec681f3Smrg
20847ec681f3Smrg   tu_cond_exec_end(cs);
20857ec681f3Smrg
20867ec681f3Smrg   cmd->state.xfb_used = true;
20877ec681f3Smrg}
20887ec681f3Smrg
20897ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2090361fc4cbSmayatu_CmdPushConstants(VkCommandBuffer commandBuffer,
2091361fc4cbSmaya                    VkPipelineLayout layout,
2092361fc4cbSmaya                    VkShaderStageFlags stageFlags,
2093361fc4cbSmaya                    uint32_t offset,
2094361fc4cbSmaya                    uint32_t size,
2095361fc4cbSmaya                    const void *pValues)
2096361fc4cbSmaya{
20977ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
20987ec681f3Smrg   memcpy((void*) cmd->push_constants + offset, pValues, size);
20997ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
21007ec681f3Smrg}
21017ec681f3Smrg
21027ec681f3Smrg/* Flush everything which has been made available but we haven't actually
21037ec681f3Smrg * flushed yet.
21047ec681f3Smrg */
21057ec681f3Smrgstatic void
21067ec681f3Smrgtu_flush_all_pending(struct tu_cache_state *cache)
21077ec681f3Smrg{
21087ec681f3Smrg   cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
21097ec681f3Smrg   cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
2110361fc4cbSmaya}
2111361fc4cbSmaya
21127ec681f3SmrgVKAPI_ATTR VkResult VKAPI_CALL
2113361fc4cbSmayatu_EndCommandBuffer(VkCommandBuffer commandBuffer)
2114361fc4cbSmaya{
2115361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
2116361fc4cbSmaya
21177ec681f3Smrg   /* We currently flush CCU at the end of the command buffer, like
21187ec681f3Smrg    * what the blob does. There's implicit synchronization around every
21197ec681f3Smrg    * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
21207ec681f3Smrg    * know yet if this command buffer will be the last in the submit so we
21217ec681f3Smrg    * have to defensively flush everything else.
21227ec681f3Smrg    *
21237ec681f3Smrg    * TODO: We could definitely do better than this, since these flushes
21247ec681f3Smrg    * aren't required by Vulkan, but we'd need kernel support to do that.
21257ec681f3Smrg    * Ideally, we'd like the kernel to flush everything afterwards, so that we
21267ec681f3Smrg    * wouldn't have to do any flushes here, and when submitting multiple
21277ec681f3Smrg    * command buffers there wouldn't be any unnecessary flushes in between.
21287ec681f3Smrg    */
21297ec681f3Smrg   if (cmd_buffer->state.pass) {
21307ec681f3Smrg      tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);
21317ec681f3Smrg      tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);
21327ec681f3Smrg   } else {
21337ec681f3Smrg      tu_flush_all_pending(&cmd_buffer->state.cache);
21347ec681f3Smrg      cmd_buffer->state.cache.flush_bits |=
21357ec681f3Smrg         TU_CMD_FLAG_CCU_FLUSH_COLOR |
21367ec681f3Smrg         TU_CMD_FLAG_CCU_FLUSH_DEPTH;
21377ec681f3Smrg      tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
2138361fc4cbSmaya   }
2139361fc4cbSmaya
2140361fc4cbSmaya   tu_cs_end(&cmd_buffer->cs);
21417ec681f3Smrg   tu_cs_end(&cmd_buffer->draw_cs);
21427ec681f3Smrg   tu_cs_end(&cmd_buffer->tile_store_cs);
21437ec681f3Smrg   tu_cs_end(&cmd_buffer->draw_epilogue_cs);
2144361fc4cbSmaya
2145361fc4cbSmaya   cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;
2146361fc4cbSmaya
2147361fc4cbSmaya   return cmd_buffer->record_result;
2148361fc4cbSmaya}
2149361fc4cbSmaya
21507ec681f3Smrgstatic struct tu_cs
21517ec681f3Smrgtu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size)
21527ec681f3Smrg{
21537ec681f3Smrg   struct tu_cs cs;
21547ec681f3Smrg
21557ec681f3Smrg   assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
21567ec681f3Smrg   cmd->state.dynamic_state[id] = tu_cs_draw_state(&cmd->sub_cs, &cs, size);
21577ec681f3Smrg
21587ec681f3Smrg   /* note: this also avoids emitting draw states before renderpass clears,
21597ec681f3Smrg    * which may use the 3D clear path (for MSAA cases)
21607ec681f3Smrg    */
21617ec681f3Smrg   if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)
21627ec681f3Smrg      return cs;
21637ec681f3Smrg
21647ec681f3Smrg   tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
21657ec681f3Smrg   tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);
21667ec681f3Smrg
21677ec681f3Smrg   return cs;
21687ec681f3Smrg}
21697ec681f3Smrg
21707ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2171361fc4cbSmayatu_CmdBindPipeline(VkCommandBuffer commandBuffer,
2172361fc4cbSmaya                   VkPipelineBindPoint pipelineBindPoint,
2173361fc4cbSmaya                   VkPipeline _pipeline)
2174361fc4cbSmaya{
2175361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2176361fc4cbSmaya   TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
2177361fc4cbSmaya
21787ec681f3Smrg   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
21797ec681f3Smrg      cmd->state.compute_pipeline = pipeline;
21807ec681f3Smrg      tu_cs_emit_state_ib(&cmd->cs, pipeline->program.state);
21817ec681f3Smrg      return;
21827ec681f3Smrg   }
21837ec681f3Smrg
21847ec681f3Smrg   assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);
21857ec681f3Smrg
21867ec681f3Smrg   cmd->state.pipeline = pipeline;
21877ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS |
21887ec681f3Smrg                       TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_VS_PARAMS;
21897ec681f3Smrg
21907ec681f3Smrg   /* note: this also avoids emitting draw states before renderpass clears,
21917ec681f3Smrg    * which may use the 3D clear path (for MSAA cases)
21927ec681f3Smrg    */
21937ec681f3Smrg   if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
21947ec681f3Smrg      struct tu_cs *cs = &cmd->draw_cs;
21957ec681f3Smrg      uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);
21967ec681f3Smrg
21977ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask)));
21987ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
21997ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
22007ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
22017ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
22027ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
22037ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
22047ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);
22057ec681f3Smrg
22067ec681f3Smrg      u_foreach_bit(i, mask)
22077ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
22087ec681f3Smrg   }
22097ec681f3Smrg
22107ec681f3Smrg   if (cmd->state.line_mode != pipeline->line_mode) {
22117ec681f3Smrg      cmd->state.line_mode = pipeline->line_mode;
22127ec681f3Smrg
22137ec681f3Smrg      /* We have to disable MSAA when bresenham lines are used, this is
22147ec681f3Smrg       * a hardware limitation and spec allows it:
22157ec681f3Smrg       *
22167ec681f3Smrg       *    When Bresenham lines are being rasterized, sample locations may
22177ec681f3Smrg       *    all be treated as being at the pixel center (this may affect
22187ec681f3Smrg       *    attribute and depth interpolation).
22197ec681f3Smrg       */
22207ec681f3Smrg      if (cmd->state.subpass && cmd->state.subpass->samples) {
22217ec681f3Smrg         tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples, cmd->state.line_mode);
22227ec681f3Smrg      }
22237ec681f3Smrg   }
22247ec681f3Smrg
22257ec681f3Smrg   /* the vertex_buffers draw state always contains all the currently
22267ec681f3Smrg    * bound vertex buffers. update its size to only emit the vbs which
22277ec681f3Smrg    * are actually used by the pipeline
22287ec681f3Smrg    * note there is a HW optimization which makes it so the draw state
22297ec681f3Smrg    * is not re-executed completely when only the size changes
22307ec681f3Smrg    */
22317ec681f3Smrg   if (cmd->state.vertex_buffers.size != pipeline->num_vbs * 4) {
22327ec681f3Smrg      cmd->state.vertex_buffers.size = pipeline->num_vbs * 4;
22337ec681f3Smrg      cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
2234361fc4cbSmaya   }
22357ec681f3Smrg
22367ec681f3Smrg   if ((pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE)) &&
22377ec681f3Smrg       cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size != pipeline->num_vbs * 2) {
22387ec681f3Smrg      cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size = pipeline->num_vbs * 2;
22397ec681f3Smrg      cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
22407ec681f3Smrg   }
22417ec681f3Smrg
22427ec681f3Smrg#define UPDATE_REG(X, Y) {                                           \
22437ec681f3Smrg   /* note: would be better to have pipeline bits already masked */  \
22447ec681f3Smrg   uint32_t pipeline_bits = pipeline->X & pipeline->X##_mask;        \
22457ec681f3Smrg   if ((cmd->state.X & pipeline->X##_mask) != pipeline_bits) {       \
22467ec681f3Smrg      cmd->state.X &= ~pipeline->X##_mask;                           \
22477ec681f3Smrg      cmd->state.X |= pipeline_bits;                                 \
22487ec681f3Smrg      cmd->state.dirty |= TU_CMD_DIRTY_##Y;                          \
22497ec681f3Smrg   }                                                                 \
22507ec681f3Smrg   if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_##Y)))  \
22517ec681f3Smrg      cmd->state.dirty &= ~TU_CMD_DIRTY_##Y;                         \
2252361fc4cbSmaya}
2253361fc4cbSmaya
22547ec681f3Smrg   /* these registers can have bits set from both pipeline and dynamic state
22557ec681f3Smrg    * this updates the bits set by the pipeline
22567ec681f3Smrg    * if the pipeline doesn't use a dynamic state for the register, then
22577ec681f3Smrg    * the relevant dirty bit is cleared to avoid overriding the non-dynamic
22587ec681f3Smrg    * state with a dynamic state the next draw.
22597ec681f3Smrg    */
22607ec681f3Smrg   UPDATE_REG(gras_su_cntl, GRAS_SU_CNTL);
22617ec681f3Smrg   UPDATE_REG(rb_depth_cntl, RB_DEPTH_CNTL);
22627ec681f3Smrg   UPDATE_REG(rb_stencil_cntl, RB_STENCIL_CNTL);
22637ec681f3Smrg   UPDATE_REG(pc_raster_cntl, RASTERIZER_DISCARD);
22647ec681f3Smrg   UPDATE_REG(vpc_unknown_9107, RASTERIZER_DISCARD);
22657ec681f3Smrg#undef UPDATE_REG
22667ec681f3Smrg
22677ec681f3Smrg   if (pipeline->rb_depth_cntl_disable)
22687ec681f3Smrg      cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
22697ec681f3Smrg}
22707ec681f3Smrg
22717ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2272361fc4cbSmayatu_CmdSetViewport(VkCommandBuffer commandBuffer,
2273361fc4cbSmaya                  uint32_t firstViewport,
2274361fc4cbSmaya                  uint32_t viewportCount,
2275361fc4cbSmaya                  const VkViewport *pViewports)
2276361fc4cbSmaya{
2277361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
22787ec681f3Smrg   struct tu_cs cs;
2279361fc4cbSmaya
22807ec681f3Smrg   memcpy(&cmd->state.viewport[firstViewport], pViewports, viewportCount * sizeof(*pViewports));
22817ec681f3Smrg   cmd->state.max_viewport = MAX2(cmd->state.max_viewport, firstViewport + viewportCount);
2282361fc4cbSmaya
22837ec681f3Smrg   cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.max_viewport);
22847ec681f3Smrg   tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.max_viewport);
2285361fc4cbSmaya}
2286361fc4cbSmaya
22877ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2288361fc4cbSmayatu_CmdSetScissor(VkCommandBuffer commandBuffer,
2289361fc4cbSmaya                 uint32_t firstScissor,
2290361fc4cbSmaya                 uint32_t scissorCount,
2291361fc4cbSmaya                 const VkRect2D *pScissors)
2292361fc4cbSmaya{
2293361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
22947ec681f3Smrg   struct tu_cs cs;
2295361fc4cbSmaya
22967ec681f3Smrg   memcpy(&cmd->state.scissor[firstScissor], pScissors, scissorCount * sizeof(*pScissors));
22977ec681f3Smrg   cmd->state.max_scissor = MAX2(cmd->state.max_scissor, firstScissor + scissorCount);
2298361fc4cbSmaya
22997ec681f3Smrg   cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.max_scissor);
23007ec681f3Smrg   tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.max_scissor);
2301361fc4cbSmaya}
2302361fc4cbSmaya
23037ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2304361fc4cbSmayatu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2305361fc4cbSmaya{
2306361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2307361fc4cbSmaya
23087ec681f3Smrg   cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
23097ec681f3Smrg   cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(lineWidth / 2.0f);
2310361fc4cbSmaya
23117ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
2312361fc4cbSmaya}
2313361fc4cbSmaya
23147ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2315361fc4cbSmayatu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2316361fc4cbSmaya                   float depthBiasConstantFactor,
2317361fc4cbSmaya                   float depthBiasClamp,
2318361fc4cbSmaya                   float depthBiasSlopeFactor)
2319361fc4cbSmaya{
2320361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
23217ec681f3Smrg   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BIAS, 4);
2322361fc4cbSmaya
23237ec681f3Smrg   tu6_emit_depth_bias(&cs, depthBiasConstantFactor, depthBiasClamp, depthBiasSlopeFactor);
2324361fc4cbSmaya}
2325361fc4cbSmaya
23267ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2327361fc4cbSmayatu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2328361fc4cbSmaya                        const float blendConstants[4])
2329361fc4cbSmaya{
2330361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
23317ec681f3Smrg   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5);
2332361fc4cbSmaya
23337ec681f3Smrg   tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
23347ec681f3Smrg   tu_cs_emit_array(&cs, (const uint32_t *) blendConstants, 4);
2335361fc4cbSmaya}
2336361fc4cbSmaya
23377ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2338361fc4cbSmayatu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2339361fc4cbSmaya                     float minDepthBounds,
2340361fc4cbSmaya                     float maxDepthBounds)
2341361fc4cbSmaya{
23427ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
23437ec681f3Smrg   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3);
23447ec681f3Smrg
23457ec681f3Smrg   tu_cs_emit_regs(&cs,
23467ec681f3Smrg                   A6XX_RB_Z_BOUNDS_MIN(minDepthBounds),
23477ec681f3Smrg                   A6XX_RB_Z_BOUNDS_MAX(maxDepthBounds));
2348361fc4cbSmaya}
2349361fc4cbSmaya
2350361fc4cbSmayavoid
23517ec681f3Smrgupdate_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask)
23527ec681f3Smrg{
23537ec681f3Smrg   if (face & VK_STENCIL_FACE_FRONT_BIT)
23547ec681f3Smrg      *value = (*value & 0xff00) | (mask & 0xff);
23557ec681f3Smrg   if (face & VK_STENCIL_FACE_BACK_BIT)
23567ec681f3Smrg      *value = (*value & 0xff) | (mask & 0xff) << 8;
23577ec681f3Smrg}
23587ec681f3Smrg
23597ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2360361fc4cbSmayatu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2361361fc4cbSmaya                            VkStencilFaceFlags faceMask,
2362361fc4cbSmaya                            uint32_t compareMask)
2363361fc4cbSmaya{
2364361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
23657ec681f3Smrg   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2);
2366361fc4cbSmaya
23677ec681f3Smrg   update_stencil_mask(&cmd->state.dynamic_stencil_mask, faceMask, compareMask);
2368361fc4cbSmaya
23697ec681f3Smrg   tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.dword = cmd->state.dynamic_stencil_mask));
2370361fc4cbSmaya}
2371361fc4cbSmaya
23727ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2373361fc4cbSmayatu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2374361fc4cbSmaya                          VkStencilFaceFlags faceMask,
2375361fc4cbSmaya                          uint32_t writeMask)
2376361fc4cbSmaya{
2377361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
23787ec681f3Smrg   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2);
2379361fc4cbSmaya
23807ec681f3Smrg   update_stencil_mask(&cmd->state.dynamic_stencil_wrmask, faceMask, writeMask);
2381361fc4cbSmaya
23827ec681f3Smrg   tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = cmd->state.dynamic_stencil_wrmask));
23837ec681f3Smrg
23847ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
2385361fc4cbSmaya}
2386361fc4cbSmaya
23877ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
2388361fc4cbSmayatu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2389361fc4cbSmaya                          VkStencilFaceFlags faceMask,
2390361fc4cbSmaya                          uint32_t reference)
2391361fc4cbSmaya{
2392361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
23937ec681f3Smrg   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2);
2394361fc4cbSmaya
23957ec681f3Smrg   update_stencil_mask(&cmd->state.dynamic_stencil_ref, faceMask, reference);
2396361fc4cbSmaya
23977ec681f3Smrg   tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.dword = cmd->state.dynamic_stencil_ref));
2398361fc4cbSmaya}
2399361fc4cbSmaya
24007ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
24017ec681f3Smrgtu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
24027ec681f3Smrg                            const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
2403361fc4cbSmaya{
24047ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
24057ec681f3Smrg   struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 9);
24067ec681f3Smrg
24077ec681f3Smrg   assert(pSampleLocationsInfo);
24087ec681f3Smrg
24097ec681f3Smrg   tu6_emit_sample_locations(&cs, pSampleLocationsInfo);
2410361fc4cbSmaya}
2411361fc4cbSmaya
24127ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
24137ec681f3Smrgtu_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
2414361fc4cbSmaya{
24157ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2416361fc4cbSmaya
24177ec681f3Smrg   cmd->state.gras_su_cntl &=
24187ec681f3Smrg      ~(A6XX_GRAS_SU_CNTL_CULL_FRONT | A6XX_GRAS_SU_CNTL_CULL_BACK);
2419361fc4cbSmaya
24207ec681f3Smrg   if (cullMode & VK_CULL_MODE_FRONT_BIT)
24217ec681f3Smrg      cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
24227ec681f3Smrg   if (cullMode & VK_CULL_MODE_BACK_BIT)
24237ec681f3Smrg      cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
2424361fc4cbSmaya
24257ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
24267ec681f3Smrg}
2427361fc4cbSmaya
24287ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
24297ec681f3Smrgtu_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
24307ec681f3Smrg{
24317ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2432361fc4cbSmaya
24337ec681f3Smrg   cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
2434361fc4cbSmaya
24357ec681f3Smrg   if (frontFace == VK_FRONT_FACE_CLOCKWISE)
24367ec681f3Smrg      cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
24377ec681f3Smrg
24387ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
2439361fc4cbSmaya}
2440361fc4cbSmaya
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,
                              VkPrimitiveTopology primitiveTopology)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   /* Translate the Vulkan topology to the hardware primitive type and cache
    * it in the command-buffer state.
    */
   cmd->state.primtype = tu6_primtype(primitiveTopology);
}
2449361fc4cbSmaya
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer,
                              uint32_t viewportCount,
                              const VkViewport* pViewports)
{
   /* The with-count variant is just the base entry point starting at
    * viewport index 0.
    */
   tu_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
}
2457361fc4cbSmaya
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer,
                             uint32_t scissorCount,
                             const VkRect2D* pScissors)
{
   /* The with-count variant is just the base entry point starting at
    * scissor index 0.
    */
   tu_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
}
24657ec681f3Smrg
24667ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
24677ec681f3Smrgtu_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer,
24687ec681f3Smrg                            VkBool32 depthTestEnable)
24697ec681f3Smrg{
24707ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
24717ec681f3Smrg
24727ec681f3Smrg   cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
24737ec681f3Smrg
24747ec681f3Smrg   if (depthTestEnable)
24757ec681f3Smrg      cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
24767ec681f3Smrg
24777ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
24787ec681f3Smrg}
24797ec681f3Smrg
24807ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
24817ec681f3Smrgtu_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer,
24827ec681f3Smrg                             VkBool32 depthWriteEnable)
24837ec681f3Smrg{
24847ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
24857ec681f3Smrg
24867ec681f3Smrg   cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
24877ec681f3Smrg
24887ec681f3Smrg   if (depthWriteEnable)
24897ec681f3Smrg      cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
24907ec681f3Smrg
24917ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
24927ec681f3Smrg}
24937ec681f3Smrg
24947ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
24957ec681f3Smrgtu_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer,
24967ec681f3Smrg                           VkCompareOp depthCompareOp)
24977ec681f3Smrg{
24987ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
24997ec681f3Smrg
25007ec681f3Smrg   cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
25017ec681f3Smrg
25027ec681f3Smrg   cmd->state.rb_depth_cntl |=
25037ec681f3Smrg      A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(depthCompareOp));
25047ec681f3Smrg
25057ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
25067ec681f3Smrg}
25077ec681f3Smrg
25087ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
25097ec681f3Smrgtu_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer,
25107ec681f3Smrg                                  VkBool32 depthBoundsTestEnable)
25117ec681f3Smrg{
25127ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
25137ec681f3Smrg
25147ec681f3Smrg   cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
25157ec681f3Smrg
25167ec681f3Smrg   if (depthBoundsTestEnable)
25177ec681f3Smrg      cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
25187ec681f3Smrg
25197ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
25207ec681f3Smrg}
25217ec681f3Smrg
25227ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
25237ec681f3Smrgtu_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer,
25247ec681f3Smrg                              VkBool32 stencilTestEnable)
25257ec681f3Smrg{
25267ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
25277ec681f3Smrg
25287ec681f3Smrg   cmd->state.rb_stencil_cntl &= ~(
25297ec681f3Smrg      A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
25307ec681f3Smrg      A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
25317ec681f3Smrg      A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
25327ec681f3Smrg
25337ec681f3Smrg   if (stencilTestEnable) {
25347ec681f3Smrg      cmd->state.rb_stencil_cntl |=
25357ec681f3Smrg         A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
25367ec681f3Smrg         A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
25377ec681f3Smrg         A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
25387ec681f3Smrg   }
25397ec681f3Smrg
25407ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;
25417ec681f3Smrg}
25427ec681f3Smrg
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer,
                      VkStencilFaceFlags faceMask,
                      VkStencilOp failOp,
                      VkStencilOp passOp,
                      VkStencilOp depthFailOp,
                      VkCompareOp compareOp)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   /* Front faces: clear the existing func/fail/zpass/zfail fields of the
    * cached RB_STENCIL_CONTROL value, then insert the translated ops.
    */
   if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
      cmd->state.rb_stencil_cntl &= ~(
         A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
         A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
         A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
         A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK);

      cmd->state.rb_stencil_cntl |=
         A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(depthFailOp));
   }

   /* Back faces: same update on the _BF variants of the fields. */
   if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
      cmd->state.rb_stencil_cntl &= ~(
         A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
         A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
         A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
         A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);

      cmd->state.rb_stencil_cntl |=
         A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(depthFailOp));
   }

   cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;
}
25837ec681f3Smrg
25847ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
25857ec681f3Smrgtu_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer,
25867ec681f3Smrg                            VkBool32 depthBiasEnable)
25877ec681f3Smrg{
25887ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
25897ec681f3Smrg
25907ec681f3Smrg   cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
25917ec681f3Smrg   if (depthBiasEnable)
25927ec681f3Smrg      cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
25937ec681f3Smrg
25947ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
25957ec681f3Smrg}
25967ec681f3Smrg
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer,
                                   VkBool32 primitiveRestartEnable)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   /* Just cache the flag; it is consumed from command-buffer state later. */
   cmd->state.primitive_restart_enable = primitiveRestartEnable;
}
26057ec681f3Smrg
26067ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
26077ec681f3Smrgtu_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer,
26087ec681f3Smrg                                    VkBool32 rasterizerDiscardEnable)
26097ec681f3Smrg{
26107ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
26117ec681f3Smrg
26127ec681f3Smrg   cmd->state.pc_raster_cntl &= ~A6XX_PC_RASTER_CNTL_DISCARD;
26137ec681f3Smrg   cmd->state.vpc_unknown_9107 &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
26147ec681f3Smrg   if (rasterizerDiscardEnable) {
26157ec681f3Smrg      cmd->state.pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
26167ec681f3Smrg      cmd->state.vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
26177ec681f3Smrg   }
26187ec681f3Smrg
26197ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_RASTERIZER_DISCARD;
26207ec681f3Smrg}
26217ec681f3Smrg
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer,
                    VkLogicOp logicOp)
{
   /* TODO: dynamic logic op is not implemented yet. */
   tu_stub();
}
26287ec681f3Smrg
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer,
                               uint32_t patchControlPoints)
{
   /* TODO: dynamic patch control points are not implemented yet. */
   tu_stub();
}
26357ec681f3Smrg
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,
                        uint32_t lineStippleFactor,
                        uint16_t lineStipplePattern)
{
   /* TODO: line stipple is not implemented yet. */
   tu_stub();
}
26437ec681f3Smrg
/* Given the source and destination access masks of a barrier, accumulate the
 * cache maintenance it requires.  Flushes needed to make source writes
 * *available* are deferred into cache->pending_flush_bits; only the subset a
 * destination access actually needs is promoted into cache->flush_bits, so
 * back-to-back barriers don't emit redundant flushes/invalidates.
 */
static void
tu_flush_for_access(struct tu_cache_state *cache,
                    enum tu_cmd_access_mask src_mask,
                    enum tu_cmd_access_mask dst_mask)
{
   enum tu_cmd_flush_bits flush_bits = 0;

   /* Sysmem writes bypass the GPU caches, so any later cached read must
    * invalidate first.
    */
   if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
      cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
   }

   if (src_mask & TU_ACCESS_CP_WRITE) {
      /* Flush the CP write queue.
       */
      cache->pending_flush_bits |=
         TU_CMD_FLAG_WAIT_MEM_WRITES |
         TU_CMD_FLAG_ALL_INVALIDATE;
   }

/* A write in cache "domain" needs a flush of that cache plus invalidates of
 * every *other* cache (the writing domain itself stays coherent with its own
 * data, so its invalidate bit is masked out).
 */
#define SRC_FLUSH(domain, flush, invalidate) \
   if (src_mask & TU_ACCESS_##domain##_WRITE) {                      \
      cache->pending_flush_bits |= TU_CMD_FLAG_##flush |             \
         (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate);   \
   }

   SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
   SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
   SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)

#undef SRC_FLUSH

/* Incoherent writes (e.g. CCU contents that differ between render targets)
 * must be flushed immediately rather than deferred.
 */
#define SRC_INCOHERENT_FLUSH(domain, flush, invalidate)              \
   if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) {           \
      flush_bits |= TU_CMD_FLAG_##flush;                             \
      cache->pending_flush_bits |=                                   \
         (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate);   \
   }

   SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
   SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)

#undef SRC_INCOHERENT_FLUSH

   /* Treat host & sysmem write accesses the same, since the kernel implicitly
    * drains the queue before signalling completion to the host.
    */
   if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
      flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
   }

/* A read or write in cache "domain" consumes the pending invalidate for that
 * domain plus every pending flush except the domain's own (it doesn't need to
 * flush itself to see its own data).
 */
#define DST_FLUSH(domain, flush, invalidate) \
   if (dst_mask & (TU_ACCESS_##domain##_READ |                 \
                   TU_ACCESS_##domain##_WRITE)) {              \
      flush_bits |= cache->pending_flush_bits &                \
         (TU_CMD_FLAG_##invalidate |                           \
          (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush));     \
   }

   DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
   DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
   DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)

#undef DST_FLUSH

/* Incoherent destination accesses always invalidate the domain's cache,
 * regardless of what was pending.
 */
#define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \
   if (dst_mask & (TU_ACCESS_##domain##_INCOHERENT_READ |      \
                   TU_ACCESS_##domain##_INCOHERENT_WRITE)) {   \
      flush_bits |= TU_CMD_FLAG_##invalidate |                 \
          (cache->pending_flush_bits &                         \
           (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush));    \
   }

   DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
   DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)

#undef DST_INCOHERENT_FLUSH

   /* Everything promoted to flush_bits is now scheduled, so it is no longer
    * pending.
    */
   cache->flush_bits |= flush_bits;
   cache->pending_flush_bits &= ~flush_bits;
}
27247ec681f3Smrg
/* Add any execution-dependency handling a barrier needs on top of the cache
 * maintenance: if the destination stage is earlier in the pipe than the
 * source stage, a WFI is required.
 */
static void
tu_flush_for_stage(struct tu_cache_state *cache,
                   enum tu_stage src_stage, enum tu_stage dst_stage)
{
   /* As far as we know, flushes take place in the last stage so if there are
    * any pending flushes then we have to move down the source stage, because
    * the data only becomes available when the flush finishes. In particular
    * this can matter when the CP writes something and we need to invalidate
    * UCHE to read it.
    */
   if (cache->flush_bits & (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE))
      src_stage = TU_STAGE_PS;

   /* Note: if the destination stage is the CP, then the CP also has to wait
    * for any WFI's to finish. This is already done for draw calls, including
    * before indirect param reads, for the most part, so we just need to WFI.
    *
    * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
    * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
    *
    * Currently we read the draw predicate using CP_MEM_TO_MEM, which
    * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*
    * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to
    * complete since it's written for DX11 where you can only predicate on the
    * result of a query object. So if we implement 64-bit comparisons in the
    * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
    * comparisons, then this will have to be dealt with.
    */
   if (src_stage > dst_stage)
      cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
}
27567ec681f3Smrg
/* Translate a Vulkan access mask into the driver's internal access mask.
 * 'gmem' indicates the access happens while rendering to GMEM, in which case
 * attachment/transfer traffic is treated as sysmem traffic (see the comment
 * in the middle of the function).
 *
 * Note that VK_ACCESS_MEMORY_READ_BIT / VK_ACCESS_MEMORY_WRITE_BIT appear in
 * many clauses: per the Vulkan spec they are equivalent to setting every
 * read/write access flag, and as a destination access MEMORY_WRITE also
 * implies read visibility (read-modify-write), which is presumably why it
 * shows up in some read clauses too.
 */
static enum tu_cmd_access_mask
vk2tu_access(VkAccessFlags flags, bool gmem)
{
   enum tu_cmd_access_mask mask = 0;

   if (flags &
       (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
        VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
        VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP */
        VK_ACCESS_MEMORY_READ_BIT)) {
      mask |= TU_ACCESS_SYSMEM_READ;
   }

   if (flags &
       (VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_CP_WRITE;
   }

   if (flags &
       (VK_ACCESS_HOST_READ_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_SYSMEM_READ;
   }

   if (flags &
       (VK_ACCESS_HOST_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_SYSMEM_WRITE;
   }

   if (flags &
       (VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */
        VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */
        VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */
        /* TODO: Is there a no-cache bit for textures so that we can ignore
         * these?
         */
        VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */
        VK_ACCESS_SHADER_READ_BIT | /* Read perfomed by SP/TP */
        VK_ACCESS_MEMORY_READ_BIT)) {
      mask |= TU_ACCESS_UCHE_READ;
   }

   if (flags &
       (VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */
        VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_UCHE_WRITE;
   }

   /* When using GMEM, the CCU is always flushed automatically to GMEM, and
    * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
    * previous writes in sysmem mode when transitioning to GMEM. Therefore we
    * can ignore CCU and pretend that color attachments and transfers use
    * sysmem directly.
    */

   if (flags &
       (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
        VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
        VK_ACCESS_MEMORY_READ_BIT)) {
      if (gmem)
         mask |= TU_ACCESS_SYSMEM_READ;
      else
         mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
   }

   if (flags &
       (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
        VK_ACCESS_MEMORY_READ_BIT)) {
      if (gmem)
         mask |= TU_ACCESS_SYSMEM_READ;
      else
         mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
   }

   if (flags &
       (VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      if (gmem) {
         mask |= TU_ACCESS_SYSMEM_WRITE;
      } else {
         mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
      }
   }

   if (flags &
       (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      if (gmem) {
         mask |= TU_ACCESS_SYSMEM_WRITE;
      } else {
         mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
      }
   }

   if (flags &
       (VK_ACCESS_TRANSFER_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      if (gmem) {
         mask |= TU_ACCESS_SYSMEM_WRITE;
      } else {
         /* Transfers (blits) write through the color CCU. */
         mask |= TU_ACCESS_CCU_COLOR_WRITE;
      }
   }

   if (flags &
       (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */
        VK_ACCESS_MEMORY_READ_BIT)) {
      mask |= TU_ACCESS_UCHE_READ;
   }

   return mask;
}
28727ec681f3Smrg
/* Map a single Vulkan pipeline stage flag to the internal stage enum.
 * 'dst' selects the conservative direction for stages that span several
 * hardware stages: as a source we pick the latest point where the access
 * could happen, as a destination the earliest.
 */
static enum tu_stage
vk2tu_single_stage(VkPipelineStageFlags vk_stage, bool dst)
{
   switch (vk_stage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
   case VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT:
   case VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT:
      return TU_STAGE_CP;
   case VK_PIPELINE_STAGE_VERTEX_INPUT_BIT:
      return TU_STAGE_FE;
   case VK_PIPELINE_STAGE_VERTEX_SHADER_BIT:
   case VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT:
   case VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT:
   case VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT:
      return TU_STAGE_SP_VS;
   case VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT:
   case VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT:
      return TU_STAGE_SP_PS;
   case VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT: /* Yes, really */
   /* See comment in TU_STAGE_GRAS about early fragment tests */
   case VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT:
   case VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT:
   case VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT:
   case VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT:
      return TU_STAGE_PS;

   case VK_PIPELINE_STAGE_TRANSFER_BIT:
      /* Blits read in SP_PS and write in PS, in both 2d and 3d cases */
      return dst ? TU_STAGE_SP_PS : TU_STAGE_PS;

   case VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT:
   case VK_PIPELINE_STAGE_ALL_COMMANDS_BIT:
      /* Be conservative */
      return dst ? TU_STAGE_CP : TU_STAGE_PS;

   case VK_PIPELINE_STAGE_HOST_BIT:
      return dst ? TU_STAGE_PS : TU_STAGE_CP;
   }

   /* Callers must pass exactly one known stage bit. */
   unreachable("unknown pipeline stage");
}
29147ec681f3Smrg
29157ec681f3Smrgstatic enum tu_stage
29167ec681f3Smrgvk2tu_src_stage(VkPipelineStageFlags vk_stages)
29177ec681f3Smrg{
29187ec681f3Smrg   enum tu_stage stage = TU_STAGE_CP;
29197ec681f3Smrg   u_foreach_bit (bit, vk_stages) {
29207ec681f3Smrg      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false);
29217ec681f3Smrg      stage = MAX2(stage, new_stage);
29227ec681f3Smrg   }
29237ec681f3Smrg
29247ec681f3Smrg   return stage;
29257ec681f3Smrg}
29267ec681f3Smrg
29277ec681f3Smrgstatic enum tu_stage
29287ec681f3Smrgvk2tu_dst_stage(VkPipelineStageFlags vk_stages)
29297ec681f3Smrg{
29307ec681f3Smrg   enum tu_stage stage = TU_STAGE_PS;
29317ec681f3Smrg   u_foreach_bit (bit, vk_stages) {
29327ec681f3Smrg      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
29337ec681f3Smrg      stage = MIN2(stage, new_stage);
29347ec681f3Smrg   }
29357ec681f3Smrg
29367ec681f3Smrg   return stage;
29377ec681f3Smrg}
29387ec681f3Smrg
VKAPI_ATTR void VKAPI_CALL
tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                      uint32_t commandBufferCount,
                      const VkCommandBuffer *pCmdBuffers)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   VkResult result;

   assert(commandBufferCount > 0);

   /* Emit any pending flushes. */
   if (cmd->state.pass) {
      tu_flush_all_pending(&cmd->state.renderpass_cache);
      tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);
   } else {
      tu_flush_all_pending(&cmd->state.cache);
      tu_emit_cache_flush(cmd, &cmd->cs);
   }

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);

      /* A RENDER_PASS_CONTINUE secondary recorded only draw-time streams;
       * splice them into our draw/epilogue streams.  Otherwise it recorded
       * only a top-level stream; splice that instead.
       */
      if (secondary->usage_flags &
          VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         assert(tu_cs_is_empty(&secondary->cs));

         result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
         if (result != VK_SUCCESS) {
            /* Record the failure and stop splicing; the error surfaces at
             * vkEndCommandBuffer time.
             */
            cmd->record_result = result;
            break;
         }

         result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
               &secondary->draw_epilogue_cs);
         if (result != VK_SUCCESS) {
            cmd->record_result = result;
            break;
         }

         /* Propagate state flags that affect how the primary finishes the
          * render pass.
          */
         if (secondary->state.has_tess)
            cmd->state.has_tess = true;
         if (secondary->state.has_subpass_predication)
            cmd->state.has_subpass_predication = true;
         if (secondary->state.disable_gmem)
            cmd->state.disable_gmem = true;
      } else {
         assert(tu_cs_is_empty(&secondary->draw_cs));
         assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));

         tu_cs_add_entries(&cmd->cs, &secondary->cs);
      }

      cmd->state.index_size = secondary->state.index_size; /* for restart index update */
   }
   cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */

   if (cmd->state.pass) {
      /* After a secondary command buffer is executed, LRZ is not valid
       * until it is cleared again.
       */
      cmd->state.lrz.valid = false;
   }

   /* After executing secondary command buffers, there may have been arbitrary
    * flushes executed, so when we encounter a pipeline barrier with a
    * srcMask, we have to assume that we need to invalidate. Therefore we need
    * to re-initialize the cache with all pending invalidate bits set.
    */
   if (cmd->state.pass) {
      tu_cache_init(&cmd->state.renderpass_cache);
   } else {
      tu_cache_init(&cmd->state.cache);
   }
}
30137ec681f3Smrg
30147ec681f3SmrgVKAPI_ATTR VkResult VKAPI_CALL
30157ec681f3Smrgtu_CreateCommandPool(VkDevice _device,
30167ec681f3Smrg                     const VkCommandPoolCreateInfo *pCreateInfo,
30177ec681f3Smrg                     const VkAllocationCallbacks *pAllocator,
30187ec681f3Smrg                     VkCommandPool *pCmdPool)
30197ec681f3Smrg{
30207ec681f3Smrg   TU_FROM_HANDLE(tu_device, device, _device);
30217ec681f3Smrg   struct tu_cmd_pool *pool;
30227ec681f3Smrg
30237ec681f3Smrg   pool = vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
30247ec681f3Smrg                          VK_OBJECT_TYPE_COMMAND_POOL);
30257ec681f3Smrg   if (pool == NULL)
30267ec681f3Smrg      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
30277ec681f3Smrg
30287ec681f3Smrg   if (pAllocator)
30297ec681f3Smrg      pool->alloc = *pAllocator;
30307ec681f3Smrg   else
30317ec681f3Smrg      pool->alloc = device->vk.alloc;
30327ec681f3Smrg
30337ec681f3Smrg   list_inithead(&pool->cmd_buffers);
30347ec681f3Smrg   list_inithead(&pool->free_cmd_buffers);
30357ec681f3Smrg
30367ec681f3Smrg   pool->queue_family_index = pCreateInfo->queueFamilyIndex;
30377ec681f3Smrg
30387ec681f3Smrg   *pCmdPool = tu_cmd_pool_to_handle(pool);
30397ec681f3Smrg
30407ec681f3Smrg   return VK_SUCCESS;
30417ec681f3Smrg}
30427ec681f3Smrg
VKAPI_ATTR void VKAPI_CALL
tu_DestroyCommandPool(VkDevice _device,
                      VkCommandPool commandPool,
                      const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);

   /* Destroying VK_NULL_HANDLE is a no-op, per the Vulkan spec. */
   if (!pool)
      return;

   /* Destroy every command buffer still owned by the pool, both live and
    * recycled (the _safe variant allows removal during iteration).
    */
   list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
                            &pool->cmd_buffers, pool_link)
   {
      tu_cmd_buffer_destroy(cmd_buffer);
   }

   list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
                            &pool->free_cmd_buffers, pool_link)
   {
      tu_cmd_buffer_destroy(cmd_buffer);
   }

   vk_object_free(&device->vk, pAllocator, pool);
}
3068361fc4cbSmaya
VKAPI_ATTR VkResult VKAPI_CALL
tu_ResetCommandPool(VkDevice device,
                    VkCommandPool commandPool,
                    VkCommandPoolResetFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
   VkResult result;

   /* Reset (rather than destroy) every live command buffer; bail out on the
    * first failure.
    */
   list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
                       pool_link)
   {
      result = tu_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   return VK_SUCCESS;
}
3087361fc4cbSmaya
30887ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
3089361fc4cbSmayatu_TrimCommandPool(VkDevice device,
3090361fc4cbSmaya                   VkCommandPool commandPool,
3091361fc4cbSmaya                   VkCommandPoolTrimFlags flags)
3092361fc4cbSmaya{
3093361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
3094361fc4cbSmaya
3095361fc4cbSmaya   if (!pool)
3096361fc4cbSmaya      return;
3097361fc4cbSmaya
3098361fc4cbSmaya   list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
3099361fc4cbSmaya                            &pool->free_cmd_buffers, pool_link)
3100361fc4cbSmaya   {
3101361fc4cbSmaya      tu_cmd_buffer_destroy(cmd_buffer);
3102361fc4cbSmaya   }
3103361fc4cbSmaya}
3104361fc4cbSmaya
31057ec681f3Smrgstatic void
31067ec681f3Smrgtu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
31077ec681f3Smrg                   const struct tu_subpass_barrier *barrier,
31087ec681f3Smrg                   bool external)
3109361fc4cbSmaya{
31107ec681f3Smrg   /* Note: we don't know until the end of the subpass whether we'll use
31117ec681f3Smrg    * sysmem, so assume sysmem here to be safe.
31127ec681f3Smrg    */
31137ec681f3Smrg   struct tu_cache_state *cache =
31147ec681f3Smrg      external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
31157ec681f3Smrg   enum tu_cmd_access_mask src_flags =
31167ec681f3Smrg      vk2tu_access(barrier->src_access_mask, false);
31177ec681f3Smrg   enum tu_cmd_access_mask dst_flags =
31187ec681f3Smrg      vk2tu_access(barrier->dst_access_mask, false);
31197ec681f3Smrg
31207ec681f3Smrg   if (barrier->incoherent_ccu_color)
31217ec681f3Smrg      src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
31227ec681f3Smrg   if (barrier->incoherent_ccu_depth)
31237ec681f3Smrg      src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
31247ec681f3Smrg
31257ec681f3Smrg   tu_flush_for_access(cache, src_flags, dst_flags);
31267ec681f3Smrg
31277ec681f3Smrg   enum tu_stage src_stage = vk2tu_src_stage(barrier->src_stage_mask);
31287ec681f3Smrg   enum tu_stage dst_stage = vk2tu_dst_stage(barrier->dst_stage_mask);
31297ec681f3Smrg   tu_flush_for_stage(cache, src_stage, dst_stage);
31307ec681f3Smrg}
31317ec681f3Smrg
31327ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
31337ec681f3Smrgtu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
31347ec681f3Smrg                       const VkRenderPassBeginInfo *pRenderPassBegin,
31357ec681f3Smrg                       const VkSubpassBeginInfo *pSubpassBeginInfo)
31367ec681f3Smrg{
31377ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3138361fc4cbSmaya   TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
31397ec681f3Smrg   TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);
3140361fc4cbSmaya
31417ec681f3Smrg   const struct VkRenderPassAttachmentBeginInfo *pAttachmentInfo =
31427ec681f3Smrg      vk_find_struct_const(pRenderPassBegin->pNext,
31437ec681f3Smrg                           RENDER_PASS_ATTACHMENT_BEGIN_INFO);
3144361fc4cbSmaya
31457ec681f3Smrg   cmd->state.pass = pass;
31467ec681f3Smrg   cmd->state.subpass = pass->subpasses;
31477ec681f3Smrg   cmd->state.framebuffer = fb;
31487ec681f3Smrg   cmd->state.render_area = pRenderPassBegin->renderArea;
31497ec681f3Smrg
31507ec681f3Smrg   cmd->state.attachments =
31517ec681f3Smrg      vk_alloc(&cmd->pool->alloc, pass->attachment_count *
31527ec681f3Smrg               sizeof(cmd->state.attachments[0]), 8,
31537ec681f3Smrg               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
31547ec681f3Smrg
31557ec681f3Smrg   if (!cmd->state.attachments) {
31567ec681f3Smrg      cmd->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
3157361fc4cbSmaya      return;
31587ec681f3Smrg   }
3159361fc4cbSmaya
31607ec681f3Smrg   for (unsigned i = 0; i < pass->attachment_count; i++) {
31617ec681f3Smrg      cmd->state.attachments[i] = pAttachmentInfo ?
31627ec681f3Smrg         tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
31637ec681f3Smrg         cmd->state.framebuffer->attachments[i].attachment;
31647ec681f3Smrg   }
3165361fc4cbSmaya
31667ec681f3Smrg   trace_start_render_pass(&cmd->trace, &cmd->cs);
31677ec681f3Smrg
31687ec681f3Smrg   /* Note: because this is external, any flushes will happen before draw_cs
31697ec681f3Smrg    * gets called. However deferred flushes could have to happen later as part
31707ec681f3Smrg    * of the subpass.
31717ec681f3Smrg    */
31727ec681f3Smrg   tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
31737ec681f3Smrg   cmd->state.renderpass_cache.pending_flush_bits =
31747ec681f3Smrg      cmd->state.cache.pending_flush_bits;
31757ec681f3Smrg   cmd->state.renderpass_cache.flush_bits = 0;
31767ec681f3Smrg
31777ec681f3Smrg   if (pass->subpasses[0].feedback_invalidate)
31787ec681f3Smrg      cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
31797ec681f3Smrg
31807ec681f3Smrg   /* Track LRZ valid state */
31817ec681f3Smrg   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
31827ec681f3Smrg   if (a != VK_ATTACHMENT_UNUSED) {
31837ec681f3Smrg      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
31847ec681f3Smrg      struct tu_image *image = cmd->state.attachments[a]->image;
31857ec681f3Smrg      /* if image has lrz and it isn't a stencil-only clear: */
31867ec681f3Smrg      if (image->lrz_height &&
31877ec681f3Smrg          (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT))) {
31887ec681f3Smrg         cmd->state.lrz.image = image;
31897ec681f3Smrg         cmd->state.lrz.valid = true;
31907ec681f3Smrg         cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
31917ec681f3Smrg
31927ec681f3Smrg         tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]);
31937ec681f3Smrg
31947ec681f3Smrg         /* Clearing writes via CCU color in the PS stage, and LRZ is read via
31957ec681f3Smrg          * UCHE in the earlier GRAS stage.
31967ec681f3Smrg          */
31977ec681f3Smrg         cmd->state.cache.flush_bits |=
31987ec681f3Smrg            TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
31997ec681f3Smrg            TU_CMD_FLAG_WAIT_FOR_IDLE;
32007ec681f3Smrg      } else {
32017ec681f3Smrg         cmd->state.lrz.valid = false;
32027ec681f3Smrg      }
32037ec681f3Smrg      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
32047ec681f3Smrg   }
32057ec681f3Smrg
32067ec681f3Smrg   cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);
32077ec681f3Smrg
32087ec681f3Smrg   tu_emit_renderpass_begin(cmd, pRenderPassBegin);
32097ec681f3Smrg
32107ec681f3Smrg   tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
32117ec681f3Smrg   tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
32127ec681f3Smrg   if (cmd->state.subpass->samples)
32137ec681f3Smrg      tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples, cmd->state.line_mode);
32147ec681f3Smrg   tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);
32157ec681f3Smrg
32167ec681f3Smrg   tu_set_input_attachments(cmd, cmd->state.subpass);
3217361fc4cbSmaya}
3218361fc4cbSmaya
32197ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
32207ec681f3Smrgtu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
32217ec681f3Smrg                   const VkSubpassBeginInfo *pSubpassBeginInfo,
32227ec681f3Smrg                   const VkSubpassEndInfo *pSubpassEndInfo)
3223361fc4cbSmaya{
32247ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
32257ec681f3Smrg   const struct tu_render_pass *pass = cmd->state.pass;
32267ec681f3Smrg   struct tu_cs *cs = &cmd->draw_cs;
32277ec681f3Smrg
32287ec681f3Smrg   const struct tu_subpass *subpass = cmd->state.subpass++;
32297ec681f3Smrg
32307ec681f3Smrg   /* Track LRZ valid state
32317ec681f3Smrg    *
32327ec681f3Smrg    * TODO: Improve this tracking for keeping the state of the past depth/stencil images,
32337ec681f3Smrg    * so if they become active again, we reuse its old state.
32347ec681f3Smrg    */
32357ec681f3Smrg   cmd->state.lrz.valid = false;
32367ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
32377ec681f3Smrg
32387ec681f3Smrg   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
32397ec681f3Smrg
32407ec681f3Smrg   if (subpass->resolve_attachments) {
32417ec681f3Smrg      tu6_emit_blit_scissor(cmd, cs, true);
32427ec681f3Smrg
32437ec681f3Smrg      for (unsigned i = 0; i < subpass->resolve_count; i++) {
32447ec681f3Smrg         uint32_t a = subpass->resolve_attachments[i].attachment;
32457ec681f3Smrg         if (a == VK_ATTACHMENT_UNUSED)
32467ec681f3Smrg            continue;
32477ec681f3Smrg
32487ec681f3Smrg         uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
32497ec681f3Smrg
32507ec681f3Smrg         tu_store_gmem_attachment(cmd, cs, a, gmem_a);
32517ec681f3Smrg
32527ec681f3Smrg         if (pass->attachments[a].gmem_offset < 0)
32537ec681f3Smrg            continue;
32547ec681f3Smrg
32557ec681f3Smrg         /* TODO:
32567ec681f3Smrg          * check if the resolved attachment is needed by later subpasses,
32577ec681f3Smrg          * if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
32587ec681f3Smrg          */
32597ec681f3Smrg         tu_finishme("missing GMEM->GMEM resolve path\n");
32607ec681f3Smrg         tu_load_gmem_attachment(cmd, cs, a, true);
32617ec681f3Smrg      }
32627ec681f3Smrg   }
32637ec681f3Smrg
32647ec681f3Smrg   tu_cond_exec_end(cs);
32657ec681f3Smrg
32667ec681f3Smrg   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
32677ec681f3Smrg
32687ec681f3Smrg   tu6_emit_sysmem_resolves(cmd, cs, subpass);
32697ec681f3Smrg
32707ec681f3Smrg   tu_cond_exec_end(cs);
32717ec681f3Smrg
32727ec681f3Smrg   /* Handle dependencies for the next subpass */
32737ec681f3Smrg   tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);
32747ec681f3Smrg
32757ec681f3Smrg   if (cmd->state.subpass->feedback_invalidate)
32767ec681f3Smrg      cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;
32777ec681f3Smrg
32787ec681f3Smrg   /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
32797ec681f3Smrg   tu6_emit_zs(cmd, cmd->state.subpass, cs);
32807ec681f3Smrg   tu6_emit_mrt(cmd, cmd->state.subpass, cs);
32817ec681f3Smrg   if (cmd->state.subpass->samples)
32827ec681f3Smrg      tu6_emit_msaa(cs, cmd->state.subpass->samples, cmd->state.line_mode);
32837ec681f3Smrg   tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
32847ec681f3Smrg
32857ec681f3Smrg   tu_set_input_attachments(cmd, cmd->state.subpass);
3286361fc4cbSmaya}
3287361fc4cbSmaya
32887ec681f3Smrgstatic uint32_t
32897ec681f3Smrgtu6_user_consts_size(const struct tu_pipeline *pipeline,
32907ec681f3Smrg                     struct tu_descriptor_state *descriptors_state,
32917ec681f3Smrg                     gl_shader_stage type)
3292361fc4cbSmaya{
32937ec681f3Smrg   const struct tu_program_descriptor_linkage *link =
32947ec681f3Smrg      &pipeline->program.link[type];
32957ec681f3Smrg   const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
32967ec681f3Smrg   uint32_t dwords = 0;
32977ec681f3Smrg
32987ec681f3Smrg   if (link->push_consts.count > 0) {
32997ec681f3Smrg      unsigned num_units = link->push_consts.count;
33007ec681f3Smrg      dwords += 4 + num_units * 4;
33017ec681f3Smrg   }
33027ec681f3Smrg
33037ec681f3Smrg   for (uint32_t i = 0; i < state->num_enabled; i++) {
33047ec681f3Smrg      uint32_t size = state->range[i].end - state->range[i].start;
3305361fc4cbSmaya
33067ec681f3Smrg      size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
33077ec681f3Smrg
33087ec681f3Smrg      if (size == 0)
33097ec681f3Smrg         continue;
3310361fc4cbSmaya
33117ec681f3Smrg      if (!state->range[i].ubo.bindless)
33127ec681f3Smrg         continue;
33137ec681f3Smrg
33147ec681f3Smrg      uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
33157ec681f3Smrg         descriptors_state->dynamic_descriptors :
33167ec681f3Smrg         descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
33177ec681f3Smrg      unsigned block = state->range[i].ubo.block;
33187ec681f3Smrg      uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
33197ec681f3Smrg      uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
33207ec681f3Smrg      desc_size = desc_size > state->range[i].start ?
33217ec681f3Smrg         desc_size - state->range[i].start : 0;
33227ec681f3Smrg
33237ec681f3Smrg      if (desc_size < size) {
33247ec681f3Smrg         uint32_t zero_size = size - desc_size;
33257ec681f3Smrg         dwords += 4 + zero_size / 4;
33267ec681f3Smrg         size = desc_size;
33277ec681f3Smrg      }
3328361fc4cbSmaya
33297ec681f3Smrg      if (size > 0) {
33307ec681f3Smrg         dwords += 4;
33317ec681f3Smrg      }
33327ec681f3Smrg   }
33337ec681f3Smrg
33347ec681f3Smrg   return dwords;
3335361fc4cbSmaya}
3336361fc4cbSmaya
33377ec681f3Smrgstatic void
33387ec681f3Smrgtu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
33397ec681f3Smrg                     struct tu_descriptor_state *descriptors_state,
33407ec681f3Smrg                     gl_shader_stage type,
33417ec681f3Smrg                     uint32_t *push_constants)
3342361fc4cbSmaya{
33437ec681f3Smrg   const struct tu_program_descriptor_linkage *link =
33447ec681f3Smrg      &pipeline->program.link[type];
33457ec681f3Smrg   const struct ir3_const_state *const_state = &link->const_state;
33467ec681f3Smrg   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
33477ec681f3Smrg
33487ec681f3Smrg   if (link->push_consts.count > 0) {
33497ec681f3Smrg      unsigned num_units = link->push_consts.count;
33507ec681f3Smrg      unsigned offset = link->push_consts.lo;
33517ec681f3Smrg      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);
33527ec681f3Smrg      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
33537ec681f3Smrg            CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
33547ec681f3Smrg            CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
33557ec681f3Smrg            CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
33567ec681f3Smrg            CP_LOAD_STATE6_0_NUM_UNIT(num_units));
33577ec681f3Smrg      tu_cs_emit(cs, 0);
33587ec681f3Smrg      tu_cs_emit(cs, 0);
33597ec681f3Smrg      for (unsigned i = 0; i < num_units * 4; i++)
33607ec681f3Smrg         tu_cs_emit(cs, push_constants[i + offset * 4]);
33617ec681f3Smrg   }
33627ec681f3Smrg
33637ec681f3Smrg   for (uint32_t i = 0; i < state->num_enabled; i++) {
33647ec681f3Smrg      uint32_t size = state->range[i].end - state->range[i].start;
33657ec681f3Smrg      uint32_t offset = state->range[i].start;
33667ec681f3Smrg
33677ec681f3Smrg      /* and even if the start of the const buffer is before
33687ec681f3Smrg       * first_immediate, the end may not be:
33697ec681f3Smrg       */
33707ec681f3Smrg      size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
33717ec681f3Smrg
33727ec681f3Smrg      if (size == 0)
33737ec681f3Smrg         continue;
33747ec681f3Smrg
33757ec681f3Smrg      /* things should be aligned to vec4: */
33767ec681f3Smrg      debug_assert((state->range[i].offset % 16) == 0);
33777ec681f3Smrg      debug_assert((size % 16) == 0);
33787ec681f3Smrg      debug_assert((offset % 16) == 0);
33797ec681f3Smrg
33807ec681f3Smrg      /* Dig out the descriptor from the descriptor state and read the VA from
33817ec681f3Smrg       * it.  All our UBOs are bindless with the exception of the NIR
33827ec681f3Smrg       * constant_data, which is uploaded once in the pipeline.
33837ec681f3Smrg       */
33847ec681f3Smrg      if (!state->range[i].ubo.bindless) {
33857ec681f3Smrg         assert(state->range[i].ubo.block == const_state->constant_data_ubo);
33867ec681f3Smrg         continue;
33877ec681f3Smrg      }
33887ec681f3Smrg
33897ec681f3Smrg      uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
33907ec681f3Smrg         descriptors_state->dynamic_descriptors :
33917ec681f3Smrg         descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
33927ec681f3Smrg      unsigned block = state->range[i].ubo.block;
33937ec681f3Smrg      uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
33947ec681f3Smrg      uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
33957ec681f3Smrg      uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
33967ec681f3Smrg      desc_size = desc_size > state->range[i].start ?
33977ec681f3Smrg         desc_size - state->range[i].start : 0;
33987ec681f3Smrg
33997ec681f3Smrg      /* Handle null UBO descriptors and out-of-range UBO reads by filling the
34007ec681f3Smrg       * rest with 0, simulating what reading with ldc would do. This behavior
34017ec681f3Smrg       * is required by VK_EXT_robustness2.
34027ec681f3Smrg       */
34037ec681f3Smrg      if (desc_size < size) {
34047ec681f3Smrg         uint32_t zero_size = size - desc_size;
34057ec681f3Smrg         uint32_t zero_offset = state->range[i].offset + desc_size;
34067ec681f3Smrg         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + zero_size / 4);
34077ec681f3Smrg         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(zero_offset / 16) |
34087ec681f3Smrg               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
34097ec681f3Smrg               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
34107ec681f3Smrg               CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
34117ec681f3Smrg               CP_LOAD_STATE6_0_NUM_UNIT(zero_size / 16));
34127ec681f3Smrg         tu_cs_emit_qw(cs, 0);
34137ec681f3Smrg         for (unsigned i = 0; i < zero_size / 4; i++) {
34147ec681f3Smrg            tu_cs_emit(cs, 0);
34157ec681f3Smrg         }
34167ec681f3Smrg         size = desc_size;
34177ec681f3Smrg      }
34187ec681f3Smrg
34197ec681f3Smrg      if (size > 0) {
34207ec681f3Smrg         assert(va);
34217ec681f3Smrg         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
34227ec681f3Smrg         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
34237ec681f3Smrg               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
34247ec681f3Smrg               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
34257ec681f3Smrg               CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
34267ec681f3Smrg               CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
34277ec681f3Smrg         tu_cs_emit_qw(cs, va + offset);
34287ec681f3Smrg      }
34297ec681f3Smrg   }
3430361fc4cbSmaya}
3431361fc4cbSmaya
34327ec681f3Smrgstatic struct tu_draw_state
34337ec681f3Smrgtu6_emit_consts(struct tu_cmd_buffer *cmd,
34347ec681f3Smrg                const struct tu_pipeline *pipeline,
34357ec681f3Smrg                struct tu_descriptor_state *descriptors_state,
34367ec681f3Smrg                gl_shader_stage type)
3437361fc4cbSmaya{
34387ec681f3Smrg   uint32_t dwords = tu6_user_consts_size(pipeline, descriptors_state, type);
34397ec681f3Smrg   if (dwords == 0)
34407ec681f3Smrg      return (struct tu_draw_state) {};
3441361fc4cbSmaya
34427ec681f3Smrg   struct tu_cs cs;
34437ec681f3Smrg   tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
3444361fc4cbSmaya
34457ec681f3Smrg   tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
34467ec681f3Smrg
34477ec681f3Smrg   return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
34487ec681f3Smrg}
34497ec681f3Smrg
34507ec681f3Smrgstatic struct tu_draw_state
34517ec681f3Smrgtu6_emit_consts_geom(struct tu_cmd_buffer *cmd,
34527ec681f3Smrg                      const struct tu_pipeline *pipeline,
34537ec681f3Smrg                      struct tu_descriptor_state *descriptors_state)
34547ec681f3Smrg{
34557ec681f3Smrg   uint32_t dwords = 0;
34567ec681f3Smrg
34577ec681f3Smrg   for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
34587ec681f3Smrg      dwords += tu6_user_consts_size(pipeline, descriptors_state, type);
34597ec681f3Smrg
34607ec681f3Smrg   if (dwords == 0)
34617ec681f3Smrg      return (struct tu_draw_state) {};
34627ec681f3Smrg
34637ec681f3Smrg   struct tu_cs cs;
34647ec681f3Smrg   tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
34657ec681f3Smrg
34667ec681f3Smrg   for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
34677ec681f3Smrg      tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
34687ec681f3Smrg
34697ec681f3Smrg   return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
34707ec681f3Smrg}
34717ec681f3Smrg
34727ec681f3Smrgstatic uint64_t
34737ec681f3Smrgget_tess_param_bo_size(const struct tu_pipeline *pipeline,
34747ec681f3Smrg                       uint32_t draw_count)
34757ec681f3Smrg{
34767ec681f3Smrg   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
34777ec681f3Smrg    * Still not sure what to do here, so just allocate a reasonably large
34787ec681f3Smrg    * BO and hope for the best for now. */
34797ec681f3Smrg   if (!draw_count)
34807ec681f3Smrg      draw_count = 2048;
34817ec681f3Smrg
34827ec681f3Smrg   /* the tess param BO is pipeline->tess.param_stride bytes per patch,
34837ec681f3Smrg    * which includes both the per-vertex outputs and per-patch outputs
34847ec681f3Smrg    * build_primitive_map in ir3 calculates this stride
3485361fc4cbSmaya    */
34867ec681f3Smrg   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
34877ec681f3Smrg   uint32_t num_patches = draw_count / verts_per_patch;
34887ec681f3Smrg   return num_patches * pipeline->tess.param_stride;
34897ec681f3Smrg}
34907ec681f3Smrg
34917ec681f3Smrgstatic uint64_t
34927ec681f3Smrgget_tess_factor_bo_size(const struct tu_pipeline *pipeline,
34937ec681f3Smrg                        uint32_t draw_count)
34947ec681f3Smrg{
34957ec681f3Smrg   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
34967ec681f3Smrg    * Still not sure what to do here, so just allocate a reasonably large
34977ec681f3Smrg    * BO and hope for the best for now. */
34987ec681f3Smrg   if (!draw_count)
34997ec681f3Smrg      draw_count = 2048;
35007ec681f3Smrg
35017ec681f3Smrg   /* Each distinct patch gets its own tess factor output. */
35027ec681f3Smrg   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
35037ec681f3Smrg   uint32_t num_patches = draw_count / verts_per_patch;
35047ec681f3Smrg   uint32_t factor_stride;
35057ec681f3Smrg   switch (pipeline->tess.patch_type) {
35067ec681f3Smrg   case IR3_TESS_ISOLINES:
35077ec681f3Smrg      factor_stride = 12;
35087ec681f3Smrg      break;
35097ec681f3Smrg   case IR3_TESS_TRIANGLES:
35107ec681f3Smrg      factor_stride = 20;
35117ec681f3Smrg      break;
35127ec681f3Smrg   case IR3_TESS_QUADS:
35137ec681f3Smrg      factor_stride = 28;
35147ec681f3Smrg      break;
35157ec681f3Smrg   default:
35167ec681f3Smrg      unreachable("bad tessmode");
35177ec681f3Smrg   }
35187ec681f3Smrg   return factor_stride * num_patches;
35197ec681f3Smrg}
35207ec681f3Smrg
35217ec681f3Smrgstatic VkResult
35227ec681f3Smrgtu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
35237ec681f3Smrg                     uint32_t draw_count,
35247ec681f3Smrg                     const struct tu_pipeline *pipeline,
35257ec681f3Smrg                     struct tu_draw_state *state,
35267ec681f3Smrg                     uint64_t *factor_iova)
35277ec681f3Smrg{
35287ec681f3Smrg   struct tu_cs cs;
35297ec681f3Smrg   VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 16, &cs);
35307ec681f3Smrg   if (result != VK_SUCCESS)
35317ec681f3Smrg      return result;
35327ec681f3Smrg
35337ec681f3Smrg   const struct tu_program_descriptor_linkage *hs_link =
35347ec681f3Smrg      &pipeline->program.link[MESA_SHADER_TESS_CTRL];
35357ec681f3Smrg   bool hs_uses_bo = pipeline->tess.hs_bo_regid < hs_link->constlen;
35367ec681f3Smrg
35377ec681f3Smrg   const struct tu_program_descriptor_linkage *ds_link =
35387ec681f3Smrg      &pipeline->program.link[MESA_SHADER_TESS_EVAL];
35397ec681f3Smrg   bool ds_uses_bo = pipeline->tess.ds_bo_regid < ds_link->constlen;
35407ec681f3Smrg
35417ec681f3Smrg   uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw_count);
35427ec681f3Smrg   uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw_count);
35437ec681f3Smrg   uint64_t tess_bo_size =  tess_factor_size + tess_param_size;
35447ec681f3Smrg   if ((hs_uses_bo || ds_uses_bo) && tess_bo_size > 0) {
35457ec681f3Smrg      struct tu_bo *tess_bo;
35467ec681f3Smrg      result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);
35477ec681f3Smrg      if (result != VK_SUCCESS)
35487ec681f3Smrg         return result;
35497ec681f3Smrg
35507ec681f3Smrg      uint64_t tess_factor_iova = tess_bo->iova;
35517ec681f3Smrg      uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;
35527ec681f3Smrg
35537ec681f3Smrg      if (hs_uses_bo) {
35547ec681f3Smrg         tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
35557ec681f3Smrg         tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |
35567ec681f3Smrg               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
35577ec681f3Smrg               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
35587ec681f3Smrg               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |
35597ec681f3Smrg               CP_LOAD_STATE6_0_NUM_UNIT(1));
35607ec681f3Smrg         tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
35617ec681f3Smrg         tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
35627ec681f3Smrg         tu_cs_emit_qw(&cs, tess_param_iova);
35637ec681f3Smrg         tu_cs_emit_qw(&cs, tess_factor_iova);
35647ec681f3Smrg      }
35657ec681f3Smrg
35667ec681f3Smrg      if (ds_uses_bo) {
35677ec681f3Smrg         tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
35687ec681f3Smrg         tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |
35697ec681f3Smrg               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
35707ec681f3Smrg               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
35717ec681f3Smrg               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |
35727ec681f3Smrg               CP_LOAD_STATE6_0_NUM_UNIT(1));
35737ec681f3Smrg         tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
35747ec681f3Smrg         tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
35757ec681f3Smrg         tu_cs_emit_qw(&cs, tess_param_iova);
35767ec681f3Smrg         tu_cs_emit_qw(&cs, tess_factor_iova);
35777ec681f3Smrg      }
35787ec681f3Smrg
35797ec681f3Smrg      *factor_iova = tess_factor_iova;
35807ec681f3Smrg   }
35817ec681f3Smrg   *state = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
35827ec681f3Smrg   return VK_SUCCESS;
35837ec681f3Smrg}
35847ec681f3Smrg
35857ec681f3Smrgstatic enum tu_lrz_direction
35867ec681f3Smrgtu6_lrz_depth_mode(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
35877ec681f3Smrg                   VkCompareOp depthCompareOp,
35887ec681f3Smrg                   bool *invalidate_lrz)
35897ec681f3Smrg{
35907ec681f3Smrg   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
35917ec681f3Smrg
35927ec681f3Smrg   /* LRZ does not support some depth modes. */
35937ec681f3Smrg   switch (depthCompareOp) {
35947ec681f3Smrg   case VK_COMPARE_OP_ALWAYS:
35957ec681f3Smrg   case VK_COMPARE_OP_NOT_EQUAL:
35967ec681f3Smrg      *invalidate_lrz = true;
35977ec681f3Smrg      gras_lrz_cntl->lrz_write = false;
35987ec681f3Smrg      break;
35997ec681f3Smrg   case VK_COMPARE_OP_EQUAL:
36007ec681f3Smrg   case VK_COMPARE_OP_NEVER:
36017ec681f3Smrg      gras_lrz_cntl->lrz_write = false;
36027ec681f3Smrg      break;
36037ec681f3Smrg   case VK_COMPARE_OP_GREATER:
36047ec681f3Smrg   case VK_COMPARE_OP_GREATER_OR_EQUAL:
36057ec681f3Smrg      lrz_direction = TU_LRZ_GREATER;
36067ec681f3Smrg      gras_lrz_cntl->greater = true;
36077ec681f3Smrg      break;
36087ec681f3Smrg   case VK_COMPARE_OP_LESS:
36097ec681f3Smrg   case VK_COMPARE_OP_LESS_OR_EQUAL:
36107ec681f3Smrg      lrz_direction = TU_LRZ_LESS;
36117ec681f3Smrg      break;
36127ec681f3Smrg   default:
36137ec681f3Smrg      unreachable("bad VK_COMPARE_OP value or uninitialized");
36147ec681f3Smrg      break;
36157ec681f3Smrg   };
36167ec681f3Smrg
36177ec681f3Smrg   return lrz_direction;
36187ec681f3Smrg}
36197ec681f3Smrg
36207ec681f3Smrg/* update lrz state based on stencil-test func:
36217ec681f3Smrg *
36227ec681f3Smrg * Conceptually the order of the pipeline is:
36237ec681f3Smrg *
36247ec681f3Smrg *
36257ec681f3Smrg *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
36267ec681f3Smrg *                              |                |
36277ec681f3Smrg *                       if wrmask != 0     if wrmask != 0
36287ec681f3Smrg *                              |                |
36297ec681f3Smrg *                              v                v
36307ec681f3Smrg *                        Stencil-Write      Depth-Write
36317ec681f3Smrg *
36327ec681f3Smrg * Because Stencil-Test can have side effects (Stencil-Write) prior
36337ec681f3Smrg * to depth test, in this case we potentially need to disable early
36347ec681f3Smrg * lrz-test. See:
36357ec681f3Smrg *
36367ec681f3Smrg * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
36377ec681f3Smrg */
36387ec681f3Smrgstatic void
36397ec681f3Smrgtu6_lrz_stencil_op(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
36407ec681f3Smrg                   VkCompareOp func,
36417ec681f3Smrg                   bool stencil_write,
36427ec681f3Smrg                   bool *invalidate_lrz)
36437ec681f3Smrg{
36447ec681f3Smrg   switch (func) {
36457ec681f3Smrg   case VK_COMPARE_OP_ALWAYS:
36467ec681f3Smrg      /* nothing to do for LRZ, but for stencil test when stencil-
36477ec681f3Smrg       * write is enabled, we need to disable lrz-test, since
36487ec681f3Smrg       * conceptually stencil test and write happens before depth-test.
36497ec681f3Smrg       */
36507ec681f3Smrg      if (stencil_write) {
36517ec681f3Smrg         gras_lrz_cntl->enable = false;
36527ec681f3Smrg         gras_lrz_cntl->z_test_enable = false;
36537ec681f3Smrg         *invalidate_lrz = true;
36547ec681f3Smrg      }
36557ec681f3Smrg      break;
36567ec681f3Smrg   case VK_COMPARE_OP_NEVER:
36577ec681f3Smrg      /* fragment never passes, disable lrz_write for this draw. */
36587ec681f3Smrg      gras_lrz_cntl->lrz_write = false;
36597ec681f3Smrg      break;
36607ec681f3Smrg   default:
36617ec681f3Smrg      /* whether the fragment passes or not depends on result
36627ec681f3Smrg       * of stencil test, which we cannot know when doing binning
36637ec681f3Smrg       * pass.
36647ec681f3Smrg       */
36657ec681f3Smrg      gras_lrz_cntl->lrz_write = false;
36667ec681f3Smrg      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
36677ec681f3Smrg       * effects from stencil test we need to disable lrz-test.
36687ec681f3Smrg       */
36697ec681f3Smrg      if (stencil_write) {
36707ec681f3Smrg         gras_lrz_cntl->enable = false;
36717ec681f3Smrg         gras_lrz_cntl->z_test_enable = false;
36727ec681f3Smrg         *invalidate_lrz = true;
36737ec681f3Smrg      }
36747ec681f3Smrg      break;
36757ec681f3Smrg   }
36767ec681f3Smrg}
36777ec681f3Smrg
36787ec681f3Smrgstatic struct A6XX_GRAS_LRZ_CNTL
36797ec681f3Smrgtu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
36807ec681f3Smrg                        const uint32_t a)
36817ec681f3Smrg{
36827ec681f3Smrg   struct tu_pipeline *pipeline = cmd->state.pipeline;
36837ec681f3Smrg   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
36847ec681f3Smrg   bool invalidate_lrz = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ;
36857ec681f3Smrg   bool force_disable_write = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE;
36867ec681f3Smrg   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
36877ec681f3Smrg
36887ec681f3Smrg   gras_lrz_cntl.enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
36897ec681f3Smrg   gras_lrz_cntl.lrz_write = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
36907ec681f3Smrg   gras_lrz_cntl.z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
36917ec681f3Smrg   gras_lrz_cntl.z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
36927ec681f3Smrg
36937ec681f3Smrg   VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
36947ec681f3Smrg   lrz_direction = tu6_lrz_depth_mode(&gras_lrz_cntl, depth_compare_op, &invalidate_lrz);
36957ec681f3Smrg
36967ec681f3Smrg   /* LRZ doesn't transition properly between GREATER* and LESS* depth compare ops */
36977ec681f3Smrg   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
36987ec681f3Smrg       lrz_direction != TU_LRZ_UNKNOWN &&
36997ec681f3Smrg       cmd->state.lrz.prev_direction != lrz_direction) {
37007ec681f3Smrg      invalidate_lrz = true;
37017ec681f3Smrg   }
37027ec681f3Smrg
37037ec681f3Smrg   cmd->state.lrz.prev_direction = lrz_direction;
37047ec681f3Smrg
37057ec681f3Smrg   /* Invalidate LRZ and disable write if stencil test is enabled */
37067ec681f3Smrg   bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
37077ec681f3Smrg   if (stencil_test_enable) {
37087ec681f3Smrg      bool stencil_front_writemask =
37097ec681f3Smrg         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
37107ec681f3Smrg         (cmd->state.dynamic_stencil_wrmask & 0xff) :
37117ec681f3Smrg         (pipeline->stencil_wrmask & 0xff);
37127ec681f3Smrg
37137ec681f3Smrg      bool stencil_back_writemask =
37147ec681f3Smrg         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
37157ec681f3Smrg         ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
37167ec681f3Smrg         (pipeline->stencil_wrmask & 0xff00) >> 8;
37177ec681f3Smrg
37187ec681f3Smrg      VkCompareOp stencil_front_compare_op =
37197ec681f3Smrg         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;
37207ec681f3Smrg
37217ec681f3Smrg      VkCompareOp stencil_back_compare_op =
37227ec681f3Smrg         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;
37237ec681f3Smrg
37247ec681f3Smrg      tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_front_compare_op,
37257ec681f3Smrg                         stencil_front_writemask, &invalidate_lrz);
37267ec681f3Smrg
37277ec681f3Smrg      tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_back_compare_op,
37287ec681f3Smrg                         stencil_back_writemask, &invalidate_lrz);
37297ec681f3Smrg   }
37307ec681f3Smrg
37317ec681f3Smrg   if (force_disable_write)
37327ec681f3Smrg      gras_lrz_cntl.lrz_write = false;
37337ec681f3Smrg
37347ec681f3Smrg   if (invalidate_lrz) {
37357ec681f3Smrg      cmd->state.lrz.valid = false;
37367ec681f3Smrg   }
37377ec681f3Smrg
37387ec681f3Smrg   /* In case no depth attachment or invalid, we clear the gras_lrz_cntl register */
37397ec681f3Smrg   if (a == VK_ATTACHMENT_UNUSED || !cmd->state.lrz.valid)
37407ec681f3Smrg      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));
37417ec681f3Smrg
37427ec681f3Smrg   return gras_lrz_cntl;
37437ec681f3Smrg}
37447ec681f3Smrg
37457ec681f3Smrgstatic struct tu_draw_state
37467ec681f3Smrgtu6_build_lrz(struct tu_cmd_buffer *cmd)
37477ec681f3Smrg{
37487ec681f3Smrg   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
37497ec681f3Smrg   struct tu_cs lrz_cs;
37507ec681f3Smrg   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &lrz_cs, 4);
37517ec681f3Smrg
37527ec681f3Smrg   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);
37537ec681f3Smrg
37547ec681f3Smrg   tu_cs_emit_regs(&lrz_cs, A6XX_GRAS_LRZ_CNTL(
37557ec681f3Smrg      .enable = gras_lrz_cntl.enable,
37567ec681f3Smrg      .greater = gras_lrz_cntl.greater,
37577ec681f3Smrg      .lrz_write = gras_lrz_cntl.lrz_write,
37587ec681f3Smrg      .z_test_enable = gras_lrz_cntl.z_test_enable,
37597ec681f3Smrg      .z_bounds_enable = gras_lrz_cntl.z_bounds_enable));
37607ec681f3Smrg   tu_cs_emit_regs(&lrz_cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
37617ec681f3Smrg
37627ec681f3Smrg   return ds;
37637ec681f3Smrg}
37647ec681f3Smrg
37657ec681f3Smrgstatic bool
37667ec681f3Smrgtu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable)
37677ec681f3Smrg{
37687ec681f3Smrg   bool depth_write_enable =
37697ec681f3Smrg      cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
37707ec681f3Smrg
37717ec681f3Smrg   VkCompareOp depth_compare_op =
37727ec681f3Smrg      (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
37737ec681f3Smrg
37747ec681f3Smrg   bool depth_compare_op_writes = depth_compare_op != VK_COMPARE_OP_NEVER;
37757ec681f3Smrg
37767ec681f3Smrg   return depth_test_enable && depth_write_enable && depth_compare_op_writes;
37777ec681f3Smrg}
37787ec681f3Smrg
37797ec681f3Smrgstatic bool
37807ec681f3Smrgtu6_writes_stencil(struct tu_cmd_buffer *cmd)
37817ec681f3Smrg{
37827ec681f3Smrg   bool stencil_test_enable =
37837ec681f3Smrg      cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
37847ec681f3Smrg
37857ec681f3Smrg   bool stencil_front_writemask =
37867ec681f3Smrg      (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
37877ec681f3Smrg      (cmd->state.dynamic_stencil_wrmask & 0xff) :
37887ec681f3Smrg      (cmd->state.pipeline->stencil_wrmask & 0xff);
37897ec681f3Smrg
37907ec681f3Smrg   bool stencil_back_writemask =
37917ec681f3Smrg      (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
37927ec681f3Smrg      ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
37937ec681f3Smrg      (cmd->state.pipeline->stencil_wrmask & 0xff00) >> 8;
37947ec681f3Smrg
37957ec681f3Smrg   VkStencilOp front_fail_op =
37967ec681f3Smrg      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL__SHIFT;
37977ec681f3Smrg   VkStencilOp front_pass_op =
37987ec681f3Smrg      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS__SHIFT;
37997ec681f3Smrg   VkStencilOp front_depth_fail_op =
38007ec681f3Smrg      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL__SHIFT;
38017ec681f3Smrg   VkStencilOp back_fail_op =
38027ec681f3Smrg      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL_BF__SHIFT;
38037ec681f3Smrg   VkStencilOp back_pass_op =
38047ec681f3Smrg      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS_BF__SHIFT;
38057ec681f3Smrg   VkStencilOp back_depth_fail_op =
38067ec681f3Smrg      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__SHIFT;
38077ec681f3Smrg
38087ec681f3Smrg   bool stencil_front_op_writes =
38097ec681f3Smrg      front_pass_op != VK_STENCIL_OP_KEEP &&
38107ec681f3Smrg      front_fail_op != VK_STENCIL_OP_KEEP &&
38117ec681f3Smrg      front_depth_fail_op != VK_STENCIL_OP_KEEP;
38127ec681f3Smrg
38137ec681f3Smrg   bool stencil_back_op_writes =
38147ec681f3Smrg      back_pass_op != VK_STENCIL_OP_KEEP &&
38157ec681f3Smrg      back_fail_op != VK_STENCIL_OP_KEEP &&
38167ec681f3Smrg      back_depth_fail_op != VK_STENCIL_OP_KEEP;
38177ec681f3Smrg
38187ec681f3Smrg   return stencil_test_enable &&
38197ec681f3Smrg      ((stencil_front_writemask && stencil_front_op_writes) ||
38207ec681f3Smrg       (stencil_back_writemask && stencil_back_op_writes));
38217ec681f3Smrg}
38227ec681f3Smrg
38237ec681f3Smrgstatic struct tu_draw_state
38247ec681f3Smrgtu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd)
38257ec681f3Smrg{
38267ec681f3Smrg   struct tu_cs cs;
38277ec681f3Smrg   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 4);
38287ec681f3Smrg
38297ec681f3Smrg   enum a6xx_ztest_mode zmode = A6XX_EARLY_Z;
38307ec681f3Smrg   bool depth_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
38317ec681f3Smrg   bool depth_write = tu6_writes_depth(cmd, depth_test_enable);
38327ec681f3Smrg   bool stencil_write = tu6_writes_stencil(cmd);
38337ec681f3Smrg
38347ec681f3Smrg   if (cmd->state.pipeline->lrz.fs_has_kill &&
38357ec681f3Smrg       (depth_write || stencil_write)) {
38367ec681f3Smrg      zmode = cmd->state.lrz.valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
38377ec681f3Smrg   }
38387ec681f3Smrg
38397ec681f3Smrg   if (cmd->state.pipeline->lrz.force_late_z || !depth_test_enable)
38407ec681f3Smrg      zmode = A6XX_LATE_Z;
38417ec681f3Smrg
38427ec681f3Smrg   /* User defined early tests take precedence above all else */
38437ec681f3Smrg   if (cmd->state.pipeline->lrz.early_fragment_tests)
38447ec681f3Smrg      zmode = A6XX_EARLY_Z;
38457ec681f3Smrg
38467ec681f3Smrg   tu_cs_emit_pkt4(&cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1);
38477ec681f3Smrg   tu_cs_emit(&cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_Z_MODE(zmode));
38487ec681f3Smrg
38497ec681f3Smrg   tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1);
38507ec681f3Smrg   tu_cs_emit(&cs, A6XX_RB_DEPTH_PLANE_CNTL_Z_MODE(zmode));
38517ec681f3Smrg   return ds;
38527ec681f3Smrg}
38537ec681f3Smrg
38547ec681f3Smrgstatic VkResult
38557ec681f3Smrgtu6_draw_common(struct tu_cmd_buffer *cmd,
38567ec681f3Smrg                struct tu_cs *cs,
38577ec681f3Smrg                bool indexed,
38587ec681f3Smrg                /* note: draw_count is 0 for indirect */
38597ec681f3Smrg                uint32_t draw_count)
38607ec681f3Smrg{
38617ec681f3Smrg   const struct tu_pipeline *pipeline = cmd->state.pipeline;
38627ec681f3Smrg   VkResult result;
3863361fc4cbSmaya
38647ec681f3Smrg   tu_emit_cache_flush_renderpass(cmd, cs);
3865361fc4cbSmaya
38667ec681f3Smrg   bool primitive_restart_enabled = pipeline->ia.primitive_restart;
38677ec681f3Smrg   if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE))
38687ec681f3Smrg      primitive_restart_enabled = cmd->state.primitive_restart_enable;
3869361fc4cbSmaya
38707ec681f3Smrg   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0(
38717ec681f3Smrg         .primitive_restart =
38727ec681f3Smrg               primitive_restart_enabled && indexed,
38737ec681f3Smrg         .provoking_vtx_last = pipeline->provoking_vertex_last,
38747ec681f3Smrg         .tess_upper_left_domain_origin =
38757ec681f3Smrg               pipeline->tess.upper_left_domain_origin));
3876361fc4cbSmaya
38777ec681f3Smrg   bool has_tess =
38787ec681f3Smrg         pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
3879361fc4cbSmaya
38807ec681f3Smrg   /* Early exit if there is nothing to emit, saves CPU cycles */
38817ec681f3Smrg   if (!(cmd->state.dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD) &&
38827ec681f3Smrg       !has_tess)
38837ec681f3Smrg      return VK_SUCCESS;
3884361fc4cbSmaya
38857ec681f3Smrg   bool dirty_lrz = cmd->state.dirty & (TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_RB_DEPTH_CNTL | TU_CMD_DIRTY_RB_STENCIL_CNTL);
3886361fc4cbSmaya
38877ec681f3Smrg   struct tu_descriptor_state *descriptors_state =
38887ec681f3Smrg      &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3889361fc4cbSmaya
38907ec681f3Smrg   if (dirty_lrz) {
38917ec681f3Smrg      cmd->state.lrz.state = tu6_build_lrz(cmd);
38927ec681f3Smrg      cmd->state.depth_plane_state = tu6_build_depth_plane_z_mode(cmd);
38937ec681f3Smrg   }
3894361fc4cbSmaya
38957ec681f3Smrg   if (cmd->state.dirty & TU_CMD_DIRTY_RASTERIZER_DISCARD) {
38967ec681f3Smrg      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4);
38977ec681f3Smrg      tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = cmd->state.pc_raster_cntl));
38987ec681f3Smrg      tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = cmd->state.vpc_unknown_9107));
38997ec681f3Smrg   }
3900361fc4cbSmaya
39017ec681f3Smrg   if (cmd->state.dirty & TU_CMD_DIRTY_GRAS_SU_CNTL) {
39027ec681f3Smrg      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2);
39037ec681f3Smrg      tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.gras_su_cntl));
3904361fc4cbSmaya   }
3905361fc4cbSmaya
39067ec681f3Smrg   if (cmd->state.dirty & TU_CMD_DIRTY_RB_DEPTH_CNTL) {
39077ec681f3Smrg      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2);
39087ec681f3Smrg      uint32_t rb_depth_cntl = cmd->state.rb_depth_cntl;
3909361fc4cbSmaya
39107ec681f3Smrg      if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE) ||
39117ec681f3Smrg          (rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE))
39127ec681f3Smrg         rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
3913361fc4cbSmaya
39147ec681f3Smrg      if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE) &&
39157ec681f3Smrg          !(rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE))
39167ec681f3Smrg         tu6_apply_depth_bounds_workaround(cmd->device, &rb_depth_cntl);
3917361fc4cbSmaya
39187ec681f3Smrg      if (pipeline->rb_depth_cntl_disable)
39197ec681f3Smrg         rb_depth_cntl = 0;
3920361fc4cbSmaya
39217ec681f3Smrg      tu_cs_emit_regs(&cs, A6XX_RB_DEPTH_CNTL(.dword = rb_depth_cntl));
3922361fc4cbSmaya   }
3923361fc4cbSmaya
39247ec681f3Smrg   if (cmd->state.dirty & TU_CMD_DIRTY_RB_STENCIL_CNTL) {
39257ec681f3Smrg      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2);
39267ec681f3Smrg      tu_cs_emit_regs(&cs, A6XX_RB_STENCIL_CONTROL(.dword = cmd->state.rb_stencil_cntl));
3927361fc4cbSmaya   }
3928361fc4cbSmaya
39297ec681f3Smrg   if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
39307ec681f3Smrg      cmd->state.shader_const[0] =
39317ec681f3Smrg         tu6_emit_consts_geom(cmd, pipeline, descriptors_state);
39327ec681f3Smrg      cmd->state.shader_const[1] =
39337ec681f3Smrg         tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);
3934361fc4cbSmaya   }
3935361fc4cbSmaya
39367ec681f3Smrg   struct tu_draw_state tess_consts = {};
39377ec681f3Smrg   if (has_tess) {
39387ec681f3Smrg      uint64_t tess_factor_iova = 0;
39397ec681f3Smrg
39407ec681f3Smrg      cmd->state.has_tess = true;
39417ec681f3Smrg      result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts, &tess_factor_iova);
39427ec681f3Smrg      if (result != VK_SUCCESS)
39437ec681f3Smrg         return result;
3944361fc4cbSmaya
39457ec681f3Smrg      /* this sequence matches what the blob does before every tess draw
39467ec681f3Smrg       * PC_TESSFACTOR_ADDR_LO is a non-context register and needs a wfi
39477ec681f3Smrg       * before writing to it
39487ec681f3Smrg       */
39497ec681f3Smrg      tu_cs_emit_wfi(cs);
3950361fc4cbSmaya
39517ec681f3Smrg      tu_cs_emit_regs(cs, A6XX_PC_TESSFACTOR_ADDR(.qword = tess_factor_iova));
39527ec681f3Smrg
39537ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
39547ec681f3Smrg      tu_cs_emit(cs, draw_count);
3955361fc4cbSmaya   }
3956361fc4cbSmaya
39577ec681f3Smrg   /* for the first draw in a renderpass, re-emit all the draw states
39587ec681f3Smrg    *
39597ec681f3Smrg    * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
39607ec681f3Smrg    * used, then draw states must be re-emitted. note however this only happens
39617ec681f3Smrg    * in the sysmem path, so this can be skipped this for the gmem path (TODO)
39627ec681f3Smrg    *
39637ec681f3Smrg    * the two input attachment states are excluded because secondary command
39647ec681f3Smrg    * buffer doesn't have a state ib to restore it, and not re-emitting them
39657ec681f3Smrg    * is OK since CmdClearAttachments won't disable/overwrite them
39667ec681f3Smrg    */
39677ec681f3Smrg   if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) {
39687ec681f3Smrg      tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
39697ec681f3Smrg
39707ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
39717ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
39727ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
39737ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
39747ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
39757ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
39767ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
39777ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);
39787ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
39797ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
39807ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
39817ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
39827ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
39837ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
39847ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
39857ec681f3Smrg      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);
39867ec681f3Smrg
39877ec681f3Smrg      for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
39887ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
39897ec681f3Smrg                               ((pipeline->dynamic_state_mask & BIT(i)) ?
39907ec681f3Smrg                                cmd->state.dynamic_state[i] :
39917ec681f3Smrg                                pipeline->dynamic_state[i]));
39927ec681f3Smrg      }
39937ec681f3Smrg   } else {
39947ec681f3Smrg      /* emit draw states that were just updated
39957ec681f3Smrg       * note we eventually don't want to have to emit anything here
39967ec681f3Smrg       */
39977ec681f3Smrg      bool emit_binding_stride = false;
39987ec681f3Smrg      uint32_t draw_state_count =
39997ec681f3Smrg         has_tess +
40007ec681f3Smrg         ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 2 : 0) +
40017ec681f3Smrg         ((cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
40027ec681f3Smrg         ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
40037ec681f3Smrg         ((cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
40047ec681f3Smrg         (dirty_lrz ? 2 : 0);
40057ec681f3Smrg
40067ec681f3Smrg      if ((cmd->state.dirty & TU_CMD_DIRTY_VB_STRIDE) &&
40077ec681f3Smrg          (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
40087ec681f3Smrg         emit_binding_stride = true;
40097ec681f3Smrg         draw_state_count += 1;
4010361fc4cbSmaya      }
4011361fc4cbSmaya
40127ec681f3Smrg      if (draw_state_count > 0)
40137ec681f3Smrg         tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
40147ec681f3Smrg
40157ec681f3Smrg      /* We may need to re-emit tess consts if the current draw call is
40167ec681f3Smrg         * sufficiently larger than the last draw call. */
40177ec681f3Smrg      if (has_tess)
40187ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
40197ec681f3Smrg      if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
40207ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
40217ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
40227ec681f3Smrg      }
40237ec681f3Smrg      if (cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD)
40247ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
40257ec681f3Smrg      if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
40267ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
40277ec681f3Smrg      if (emit_binding_stride) {
40287ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_VB_STRIDE,
40297ec681f3Smrg                               cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE]);
40307ec681f3Smrg      }
40317ec681f3Smrg      if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS)
40327ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
40337ec681f3Smrg
40347ec681f3Smrg      if (dirty_lrz) {
40357ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
40367ec681f3Smrg         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);
40377ec681f3Smrg      }
4038361fc4cbSmaya   }
4039361fc4cbSmaya
4040361fc4cbSmaya   tu_cs_sanity_check(cs);
4041361fc4cbSmaya
40427ec681f3Smrg   /* There are too many graphics dirty bits to list here, so just list the
40437ec681f3Smrg    * bits to preserve instead. The only things not emitted here are
40447ec681f3Smrg    * compute-related state.
40457ec681f3Smrg    */
40467ec681f3Smrg   cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
40477ec681f3Smrg   return VK_SUCCESS;
40487ec681f3Smrg}
40497ec681f3Smrg
40507ec681f3Smrgstatic uint32_t
40517ec681f3Smrgtu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)
40527ec681f3Smrg{
40537ec681f3Smrg   const struct tu_pipeline *pipeline = cmd->state.pipeline;
40547ec681f3Smrg   enum pc_di_primtype primtype = pipeline->ia.primtype;
40557ec681f3Smrg
40567ec681f3Smrg   if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY)) {
40577ec681f3Smrg      if (primtype < DI_PT_PATCHES0) {
40587ec681f3Smrg         /* If tesselation used, only VK_PRIMITIVE_TOPOLOGY_PATCH_LIST can be
40597ec681f3Smrg          * set via vkCmdSetPrimitiveTopologyEXT, but primtype is already
40607ec681f3Smrg          * calculated at the pipeline creation based on control points
40617ec681f3Smrg          * for each patch.
40627ec681f3Smrg          *
40637ec681f3Smrg          * Just use the primtype as is for the case.
40647ec681f3Smrg          */
40657ec681f3Smrg         primtype = cmd->state.primtype;
4066361fc4cbSmaya      }
4067361fc4cbSmaya   }
40687ec681f3Smrg
40697ec681f3Smrg   uint32_t initiator =
40707ec681f3Smrg      CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
40717ec681f3Smrg      CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) |
40727ec681f3Smrg      CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(cmd->state.index_size) |
40737ec681f3Smrg      CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY);
40747ec681f3Smrg
40757ec681f3Smrg   if (pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT)
40767ec681f3Smrg      initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE;
40777ec681f3Smrg
40787ec681f3Smrg   switch (pipeline->tess.patch_type) {
40797ec681f3Smrg   case IR3_TESS_TRIANGLES:
40807ec681f3Smrg      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) |
40817ec681f3Smrg                   CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
40827ec681f3Smrg      break;
40837ec681f3Smrg   case IR3_TESS_ISOLINES:
40847ec681f3Smrg      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) |
40857ec681f3Smrg                   CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
40867ec681f3Smrg      break;
40877ec681f3Smrg   case IR3_TESS_NONE:
40887ec681f3Smrg      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS);
40897ec681f3Smrg      break;
40907ec681f3Smrg   case IR3_TESS_QUADS:
40917ec681f3Smrg      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) |
40927ec681f3Smrg                   CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
40937ec681f3Smrg      break;
4094361fc4cbSmaya   }
40957ec681f3Smrg   return initiator;
40967ec681f3Smrg}
40977ec681f3Smrg
40987ec681f3Smrg
40997ec681f3Smrgstatic uint32_t
41007ec681f3Smrgvs_params_offset(struct tu_cmd_buffer *cmd)
41017ec681f3Smrg{
41027ec681f3Smrg   const struct tu_program_descriptor_linkage *link =
41037ec681f3Smrg      &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
41047ec681f3Smrg   const struct ir3_const_state *const_state = &link->const_state;
4105361fc4cbSmaya
41067ec681f3Smrg   if (const_state->offsets.driver_param >= link->constlen)
41077ec681f3Smrg      return 0;
41087ec681f3Smrg
41097ec681f3Smrg   /* this layout is required by CP_DRAW_INDIRECT_MULTI */
41107ec681f3Smrg   STATIC_ASSERT(IR3_DP_DRAWID == 0);
41117ec681f3Smrg   STATIC_ASSERT(IR3_DP_VTXID_BASE == 1);
41127ec681f3Smrg   STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
41137ec681f3Smrg
41147ec681f3Smrg   /* 0 means disabled for CP_DRAW_INDIRECT_MULTI */
41157ec681f3Smrg   assert(const_state->offsets.driver_param != 0);
41167ec681f3Smrg
41177ec681f3Smrg   return const_state->offsets.driver_param;
4118361fc4cbSmaya}
4119361fc4cbSmaya
4120361fc4cbSmayastatic void
41217ec681f3Smrgtu6_emit_empty_vs_params(struct tu_cmd_buffer *cmd)
41227ec681f3Smrg{
41237ec681f3Smrg   if (cmd->state.vs_params.iova) {
41247ec681f3Smrg      cmd->state.vs_params = (struct tu_draw_state) {};
41257ec681f3Smrg      cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
4126361fc4cbSmaya   }
4127361fc4cbSmaya}
4128361fc4cbSmaya
4129361fc4cbSmayastatic void
41307ec681f3Smrgtu6_emit_vs_params(struct tu_cmd_buffer *cmd,
41317ec681f3Smrg                   uint32_t vertex_offset,
41327ec681f3Smrg                   uint32_t first_instance)
4133361fc4cbSmaya{
41347ec681f3Smrg   /* Beside re-emitting params when they are changed, we should re-emit
41357ec681f3Smrg    * them after constants are invalidated via HLSQ_INVALIDATE_CMD.
41367ec681f3Smrg    */
41377ec681f3Smrg   if (!(cmd->state.dirty & (TU_CMD_DIRTY_DRAW_STATE | TU_CMD_DIRTY_VS_PARAMS)) &&
41387ec681f3Smrg       vertex_offset == cmd->state.last_vs_params.vertex_offset &&
41397ec681f3Smrg       first_instance == cmd->state.last_vs_params.first_instance) {
41407ec681f3Smrg      return;
41417ec681f3Smrg   }
4142361fc4cbSmaya
41437ec681f3Smrg   uint32_t offset = vs_params_offset(cmd);
4144361fc4cbSmaya
41457ec681f3Smrg   struct tu_cs cs;
41467ec681f3Smrg   VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 8 : 0), &cs);
4147361fc4cbSmaya   if (result != VK_SUCCESS) {
4148361fc4cbSmaya      cmd->record_result = result;
4149361fc4cbSmaya      return;
4150361fc4cbSmaya   }
4151361fc4cbSmaya
41527ec681f3Smrg   tu_cs_emit_regs(&cs,
41537ec681f3Smrg                   A6XX_VFD_INDEX_OFFSET(vertex_offset),
41547ec681f3Smrg                   A6XX_VFD_INSTANCE_START_OFFSET(first_instance));
41557ec681f3Smrg
41567ec681f3Smrg   if (offset) {
41577ec681f3Smrg      tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
41587ec681f3Smrg      tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
41597ec681f3Smrg            CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
41607ec681f3Smrg            CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
41617ec681f3Smrg            CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
41627ec681f3Smrg            CP_LOAD_STATE6_0_NUM_UNIT(1));
41637ec681f3Smrg      tu_cs_emit(&cs, 0);
41647ec681f3Smrg      tu_cs_emit(&cs, 0);
41657ec681f3Smrg
41667ec681f3Smrg      tu_cs_emit(&cs, 0);
41677ec681f3Smrg      tu_cs_emit(&cs, vertex_offset);
41687ec681f3Smrg      tu_cs_emit(&cs, first_instance);
41697ec681f3Smrg      tu_cs_emit(&cs, 0);
4170361fc4cbSmaya   }
4171361fc4cbSmaya
41727ec681f3Smrg   cmd->state.last_vs_params.vertex_offset = vertex_offset;
41737ec681f3Smrg   cmd->state.last_vs_params.first_instance = first_instance;
4174361fc4cbSmaya
41757ec681f3Smrg   struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
41767ec681f3Smrg   cmd->state.vs_params = (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};
4177361fc4cbSmaya
41787ec681f3Smrg   cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
4179361fc4cbSmaya}
4180361fc4cbSmaya
41817ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4182361fc4cbSmayatu_CmdDraw(VkCommandBuffer commandBuffer,
4183361fc4cbSmaya           uint32_t vertexCount,
4184361fc4cbSmaya           uint32_t instanceCount,
4185361fc4cbSmaya           uint32_t firstVertex,
4186361fc4cbSmaya           uint32_t firstInstance)
4187361fc4cbSmaya{
41887ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
41897ec681f3Smrg   struct tu_cs *cs = &cmd->draw_cs;
41907ec681f3Smrg
41917ec681f3Smrg   tu6_emit_vs_params(cmd, firstVertex, firstInstance);
4192361fc4cbSmaya
41937ec681f3Smrg   tu6_draw_common(cmd, cs, false, vertexCount);
4194361fc4cbSmaya
41957ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
41967ec681f3Smrg   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
41977ec681f3Smrg   tu_cs_emit(cs, instanceCount);
41987ec681f3Smrg   tu_cs_emit(cs, vertexCount);
4199361fc4cbSmaya}
4200361fc4cbSmaya
42017ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4202361fc4cbSmayatu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
4203361fc4cbSmaya                  uint32_t indexCount,
4204361fc4cbSmaya                  uint32_t instanceCount,
4205361fc4cbSmaya                  uint32_t firstIndex,
4206361fc4cbSmaya                  int32_t vertexOffset,
4207361fc4cbSmaya                  uint32_t firstInstance)
4208361fc4cbSmaya{
42097ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
42107ec681f3Smrg   struct tu_cs *cs = &cmd->draw_cs;
4211361fc4cbSmaya
42127ec681f3Smrg   tu6_emit_vs_params(cmd, vertexOffset, firstInstance);
4213361fc4cbSmaya
42147ec681f3Smrg   tu6_draw_common(cmd, cs, true, indexCount);
42157ec681f3Smrg
42167ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
42177ec681f3Smrg   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
42187ec681f3Smrg   tu_cs_emit(cs, instanceCount);
42197ec681f3Smrg   tu_cs_emit(cs, indexCount);
42207ec681f3Smrg   tu_cs_emit(cs, firstIndex);
42217ec681f3Smrg   tu_cs_emit_qw(cs, cmd->state.index_va);
42227ec681f3Smrg   tu_cs_emit(cs, cmd->state.max_index_count);
4223361fc4cbSmaya}
4224361fc4cbSmaya
42257ec681f3Smrg/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes
42267ec681f3Smrg * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if
42277ec681f3Smrg * pending for these opcodes. This may result in a few extra WAIT_FOR_ME's
42287ec681f3Smrg * with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's
42297ec681f3Smrg * before draw opcodes that don't need it.
42307ec681f3Smrg */
42317ec681f3Smrgstatic void
42327ec681f3Smrgdraw_wfm(struct tu_cmd_buffer *cmd)
42337ec681f3Smrg{
42347ec681f3Smrg   cmd->state.renderpass_cache.flush_bits |=
42357ec681f3Smrg      cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME;
42367ec681f3Smrg   cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME;
42377ec681f3Smrg}
42387ec681f3Smrg
42397ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4240361fc4cbSmayatu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
4241361fc4cbSmaya                   VkBuffer _buffer,
4242361fc4cbSmaya                   VkDeviceSize offset,
4243361fc4cbSmaya                   uint32_t drawCount,
4244361fc4cbSmaya                   uint32_t stride)
4245361fc4cbSmaya{
42467ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
42477ec681f3Smrg   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
42487ec681f3Smrg   struct tu_cs *cs = &cmd->draw_cs;
4249361fc4cbSmaya
42507ec681f3Smrg   tu6_emit_empty_vs_params(cmd);
42517ec681f3Smrg
42527ec681f3Smrg   if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
42537ec681f3Smrg      draw_wfm(cmd);
4254361fc4cbSmaya
42557ec681f3Smrg   tu6_draw_common(cmd, cs, false, 0);
42567ec681f3Smrg
42577ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
42587ec681f3Smrg   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
42597ec681f3Smrg   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) |
42607ec681f3Smrg                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
42617ec681f3Smrg   tu_cs_emit(cs, drawCount);
42627ec681f3Smrg   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
42637ec681f3Smrg   tu_cs_emit(cs, stride);
4264361fc4cbSmaya}
4265361fc4cbSmaya
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
                          VkBuffer _buffer,
                          VkDeviceSize offset,
                          uint32_t drawCount,
                          uint32_t stride)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* Draw parameters come from GPU memory, so the VS driver params cannot
    * be filled in on the CPU; emit the "empty" variant instead.
    */
   tu6_emit_empty_vs_params(cmd);

   /* See tu_CmdDrawIndirect: force pending WAIT_FOR_ME before the CP reads
    * the indirect buffer on affected GPUs.
    */
   if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
      draw_wfm(cmd);

   tu6_draw_common(cmd, cs, true, 0);

   /* CP_DRAW_INDIRECT_MULTI, INDEXED variant: packet length 9 = initiator +
    * opcode/dst_off + draw count + index-buffer iova (2) + max index count +
    * indirect iova (2) + stride. The index buffer state was captured at
    * vkCmdBindIndexBuffer time (cmd->state.index_va / max_index_count).
    */
   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) |
                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
   tu_cs_emit(cs, drawCount);
   tu_cs_emit_qw(cs, cmd->state.index_va);
   tu_cs_emit(cs, cmd->state.max_index_count);
   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
   tu_cs_emit(cs, stride);
}
42947ec681f3Smrg
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
                        VkBuffer _buffer,
                        VkDeviceSize offset,
                        VkBuffer countBuffer,
                        VkDeviceSize countBufferOffset,
                        uint32_t drawCount,
                        uint32_t stride)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
   TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   tu6_emit_empty_vs_params(cmd);

   /* It turns out that the firmware we have for a650 only partially fixed the
    * problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete
    * before reading indirect parameters. It waits for WFI's before reading
    * the draw parameters, but after reading the indirect count :(.
    *
    * Hence the unconditional draw_wfm() here (unlike the plain indirect
    * draws, which only need it on quirky GPUs).
    */
   draw_wfm(cmd);

   tu6_draw_common(cmd, cs, false, 0);

   /* CP_DRAW_INDIRECT_MULTI, INDIRECT_COUNT variant: packet length 8 =
    * initiator + opcode/dst_off + max draw count + indirect iova (2) +
    * count-buffer iova (2) + stride.
    */
   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) |
                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
   tu_cs_emit(cs, drawCount);
   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
   tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);
   tu_cs_emit(cs, stride);
}
4329361fc4cbSmaya
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
                               VkBuffer _buffer,
                               VkDeviceSize offset,
                               VkBuffer countBuffer,
                               VkDeviceSize countBufferOffset,
                               uint32_t drawCount,
                               uint32_t stride)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
   TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   tu6_emit_empty_vs_params(cmd);

   /* Unconditional: the count buffer is read before the firmware's WFI wait,
    * see the comment in tu_CmdDrawIndirectCount.
    */
   draw_wfm(cmd);

   tu6_draw_common(cmd, cs, true, 0);

   /* CP_DRAW_INDIRECT_MULTI, INDIRECT_COUNT_INDEXED variant: packet length
    * 11 = initiator + opcode/dst_off + max draw count + index-buffer iova
    * (2) + max index count + indirect iova (2) + count-buffer iova (2) +
    * stride.
    */
   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) |
                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
   tu_cs_emit(cs, drawCount);
   tu_cs_emit_qw(cs, cmd->state.index_va);
   tu_cs_emit(cs, cmd->state.max_index_count);
   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
   tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);
   tu_cs_emit(cs, stride);
}
43617ec681f3Smrg
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
                               uint32_t instanceCount,
                               uint32_t firstInstance,
                               VkBuffer _counterBuffer,
                               VkDeviceSize counterBufferOffset,
                               uint32_t counterOffset,
                               uint32_t vertexStride)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO.
    * Plus, for the common case where the counter buffer is written by
    * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to
    * complete which means we need a WAIT_FOR_ME anyway.
    */
   draw_wfm(cmd);

   /* The vertex count comes from the counter buffer, but firstInstance is
    * known on the CPU, so real VS params can be emitted here (vertex offset
    * 0, given instance).
    */
   tu6_emit_vs_params(cmd, 0, firstInstance);

   tu6_draw_common(cmd, cs, false, 0);

   /* CP_DRAW_AUTO: the CP derives the vertex count from the byte counter at
    * the given iova, minus counterOffset, divided by vertexStride.
    * Packet length 6 = initiator + instance count + counter iova (2) +
    * counter offset + stride.
    */
   tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB));
   tu_cs_emit(cs, instanceCount);
   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + counterBufferOffset);
   tu_cs_emit(cs, counterOffset);
   tu_cs_emit(cs, vertexStride);
}
4393361fc4cbSmaya
/* Parameters for a compute dispatch, shared by the direct and indirect
 * vkCmdDispatch* entry points and consumed by tu_dispatch().
 */
struct tu_dispatch_info
{
   /**
    * Determine the layout of the grid (in block units) to be used.
    * Ignored when \p indirect is set (the counts then come from the buffer).
    */
   uint32_t blocks[3];

   /**
    * A starting offset for the grid. If unaligned is set, the offset
    * must still be aligned.
    */
   uint32_t offsets[3];
   /**
    * Whether it's an unaligned compute dispatch.
    */
   bool unaligned;

   /**
    * Indirect compute parameters resource. NULL for direct dispatches.
    */
   struct tu_buffer *indirect;
   uint64_t indirect_offset;
};
4417361fc4cbSmaya
/* Upload the ir3 "driver params" (workgroup counts, base group, subgroup
 * info) into the compute shader's constant space. Three paths:
 *  - direct dispatch: values known on the CPU, pushed inline;
 *  - indirect with 16-byte-aligned offset: loaded by the CP straight from
 *    the application's buffer;
 *  - indirect with only 4-byte alignment: counts are first copied to an
 *    aligned scratch slot in the global BO, then loaded from there.
 */
static void
tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs, struct tu_pipeline *pipeline,
                              const struct tu_dispatch_info *info)
{
   gl_shader_stage type = MESA_SHADER_COMPUTE;
   const struct tu_program_descriptor_linkage *link =
      &pipeline->program.link[type];
   const struct ir3_const_state *const_state = &link->const_state;
   /* Destination const offset (vec4 units) where ir3 expects driver params. */
   uint32_t offset = const_state->offsets.driver_param;
   unsigned subgroup_size = pipeline->compute.subgroup_size;
   unsigned subgroup_shift = util_logbase2(subgroup_size);

   /* The shader's allocated const space ends before the driver params, so it
    * can't read any of them — nothing to emit.
    */
   if (link->constlen <= offset)
      return;

   /* Dwords of driver params the shader can actually consume (constlen is in
    * vec4 units, hence the * 4).
    */
   uint32_t num_consts = MIN2(const_state->num_driver_params,
                              (link->constlen - offset) * 4);

   if (!info->indirect) {
      /* Direct dispatch: everything is known here, push the values inline. */
      uint32_t driver_params[12] = {
         [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
         [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
         [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
         [IR3_DP_BASE_GROUP_X] = info->offsets[0],
         [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
         [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
         [IR3_DP_SUBGROUP_SIZE] = subgroup_size,
         [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
      };

      assert(num_consts <= ARRAY_SIZE(driver_params));

      /* push constants */
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      uint32_t i;
      for (i = 0; i < num_consts; i++)
         tu_cs_emit(cs, driver_params[i]);
   } else if (!(info->indirect_offset & 0xf)) {
      /* Aligned indirect: load one vec4 (the workgroup counts) directly from
       * the application's indirect buffer.
       */
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
      tu_cs_emit_qw(cs, tu_buffer_iova(info->indirect) + info->indirect_offset);
   } else {
      /* Vulkan guarantees only 4 byte alignment for indirect_offset.
       * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
       */

      uint64_t indirect_iova = tu_buffer_iova(info->indirect) + info->indirect_offset;

      /* Copy the three dwords of workgroup counts to an aligned scratch slot
       * in the command buffer's global BO.
       */
      for (uint32_t i = 0; i < 3; i++) {
         tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
         tu_cs_emit(cs, 0);
         tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[i]));
         tu_cs_emit_qw(cs, indirect_iova + i * 4);
      }

      /* Make the copies visible before the CP_LOAD_STATE reads them. */
      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
      tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
   }

   /* Fill out IR3_DP_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for indirect
    * dispatch. (The indirect paths above only loaded the workgroup counts;
    * base group is always 0 for indirect dispatches.)
    */
   if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                 CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
      tu_cs_emit_qw(cs, 0);
      tu_cs_emit(cs, 0); /* BASE_GROUP_X */
      tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
      tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
      tu_cs_emit(cs, subgroup_size);
      if (num_consts > IR3_DP_LOCAL_GROUP_SIZE_X) {
         assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
         tu_cs_emit(cs, subgroup_shift);
      }
   }
}
45217ec681f3Smrg
/* Common implementation for all vkCmdDispatch* entry points: flushes caches,
 * emits compute state (consts, driver params, descriptor load state) and the
 * CP_EXEC_CS / CP_EXEC_CS_INDIRECT packet into the main command stream.
 */
static void
tu_dispatch(struct tu_cmd_buffer *cmd,
            const struct tu_dispatch_info *info)
{
   /* A direct dispatch with any zero dimension launches no workgroups. */
   if (!info->indirect &&
       (info->blocks[0] == 0 || info->blocks[1] == 0 || info->blocks[2] == 0))
      return;

   struct tu_cs *cs = &cmd->cs;
   struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
   struct tu_descriptor_state *descriptors_state =
      &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];

   /* TODO: We could probably flush less if we add a compute_flush_bits
    * bitfield.
    */
   tu_emit_cache_flush(cmd, cs);

   /* note: no reason to have this in a separate IB */
   tu_cs_emit_state_ib(cs,
         tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE));

   tu_emit_compute_driver_params(cmd, cs, pipeline, info);

   /* Re-emit descriptor load state only when the descriptor sets changed. */
   if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD)
      tu_cs_emit_state_ib(cs, pipeline->load_state);

   cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));

   /* Program the NDRange registers from the local size and (for direct
    * dispatches) the workgroup counts. For indirect dispatches the global
    * sizes written here are ignored in favor of CP_EXEC_CS_INDIRECT's
    * buffer-supplied counts.
    */
   const uint32_t *local_size = pipeline->compute.local_size;
   const uint32_t *num_groups = info->blocks;
   tu_cs_emit_regs(cs,
                   A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
                                          .localsizex = local_size[0] - 1,
                                          .localsizey = local_size[1] - 1,
                                          .localsizez = local_size[2] - 1),
                   A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
                   A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
                   A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
                   A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
                   A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
                   A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));

   tu_cs_emit_regs(cs,
                   A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
                   A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
                   A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));

   trace_start_compute(&cmd->trace, cs);

   if (info->indirect) {
      /* The CP reads the three workgroup counts from the indirect buffer. */
      uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;

      tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
      tu_cs_emit(cs, 0x00000000);
      tu_cs_emit_qw(cs, iova);
      tu_cs_emit(cs,
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
   } else {
      tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
      tu_cs_emit(cs, 0x00000000);
      tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
      tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
      tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
   }

   trace_end_compute(&cmd->trace, cs,
                     info->indirect != NULL,
                     local_size[0], local_size[1], local_size[2],
                     info->blocks[0], info->blocks[1], info->blocks[2]);

   tu_cs_emit_wfi(cs);
}
4600361fc4cbSmaya
46017ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4602361fc4cbSmayatu_CmdDispatchBase(VkCommandBuffer commandBuffer,
4603361fc4cbSmaya                   uint32_t base_x,
4604361fc4cbSmaya                   uint32_t base_y,
4605361fc4cbSmaya                   uint32_t base_z,
4606361fc4cbSmaya                   uint32_t x,
4607361fc4cbSmaya                   uint32_t y,
4608361fc4cbSmaya                   uint32_t z)
4609361fc4cbSmaya{
4610361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4611361fc4cbSmaya   struct tu_dispatch_info info = {};
4612361fc4cbSmaya
4613361fc4cbSmaya   info.blocks[0] = x;
4614361fc4cbSmaya   info.blocks[1] = y;
4615361fc4cbSmaya   info.blocks[2] = z;
4616361fc4cbSmaya
4617361fc4cbSmaya   info.offsets[0] = base_x;
4618361fc4cbSmaya   info.offsets[1] = base_y;
4619361fc4cbSmaya   info.offsets[2] = base_z;
4620361fc4cbSmaya   tu_dispatch(cmd_buffer, &info);
4621361fc4cbSmaya}
4622361fc4cbSmaya
VKAPI_ATTR void VKAPI_CALL
tu_CmdDispatch(VkCommandBuffer commandBuffer,
               uint32_t x,
               uint32_t y,
               uint32_t z)
{
   /* vkCmdDispatch is just vkCmdDispatchBase with a zero base offset. */
   tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
}
4631361fc4cbSmaya
46327ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4633361fc4cbSmayatu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4634361fc4cbSmaya                       VkBuffer _buffer,
4635361fc4cbSmaya                       VkDeviceSize offset)
4636361fc4cbSmaya{
4637361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4638361fc4cbSmaya   TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
4639361fc4cbSmaya   struct tu_dispatch_info info = {};
4640361fc4cbSmaya
4641361fc4cbSmaya   info.indirect = buffer;
4642361fc4cbSmaya   info.indirect_offset = offset;
4643361fc4cbSmaya
4644361fc4cbSmaya   tu_dispatch(cmd_buffer, &info);
4645361fc4cbSmaya}
4646361fc4cbSmaya
/* End the current render pass: close the deferred draw command streams,
 * execute them either in sysmem or tiled (gmem) mode, then reset all
 * per-renderpass state on the command buffer.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
                     const VkSubpassEndInfoKHR *pSubpassEndInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);

   tu6_emit_tile_store(cmd_buffer, &cmd_buffer->tile_store_cs);

   /* Close the deferred streams so they can be executed below. */
   tu_cs_end(&cmd_buffer->draw_cs);
   tu_cs_end(&cmd_buffer->tile_store_cs);
   tu_cs_end(&cmd_buffer->draw_epilogue_cs);

   cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);

   /* Decide sysmem vs gmem now that all draws (and any gmem-disabling
    * conditions) have been recorded.
    */
   if (use_sysmem_rendering(cmd_buffer))
      tu_cmd_render_sysmem(cmd_buffer);
   else
      tu_cmd_render_tiles(cmd_buffer);

   /* Outside of renderpasses we assume all draw states are disabled. We do
    * this outside the draw CS for the normal case where 3d gmem stores aren't
    * used.
    */
   tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);

   /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
      rendered */
   tu_cs_discard_entries(&cmd_buffer->draw_cs);
   tu_cs_begin(&cmd_buffer->draw_cs);
   tu_cs_discard_entries(&cmd_buffer->tile_store_cs);
   tu_cs_begin(&cmd_buffer->tile_store_cs);
   tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
   tu_cs_begin(&cmd_buffer->draw_epilogue_cs);

   /* Carry unresolved renderpass cache state into the outer cache state and
    * apply the render pass's end barrier.
    */
   cmd_buffer->state.cache.pending_flush_bits |=
      cmd_buffer->state.renderpass_cache.pending_flush_bits;
   tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);

   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

   /* Clear per-renderpass state. */
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass = NULL;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.attachments = NULL;
   cmd_buffer->state.has_tess = false;
   cmd_buffer->state.has_subpass_predication = false;
   cmd_buffer->state.disable_gmem = false;

   /* LRZ is not valid next time we use it */
   cmd_buffer->state.lrz.valid = false;
   cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
}
4699361fc4cbSmaya
/* Parameters shared by vkCmdPipelineBarrier and vkCmdWaitEvents, consumed
 * by tu_barrier().
 */
struct tu_barrier_info
{
   uint32_t eventCount;       /* 0 for plain pipeline barriers */
   const VkEvent *pEvents;    /* events to wait on (CP_WAIT_REG_MEM each) */
   VkPipelineStageFlags srcStageMask;
   VkPipelineStageFlags dstStageMask;
};
4707361fc4cbSmaya
/* Shared implementation of vkCmdPipelineBarrier and vkCmdWaitEvents:
 * accumulates access masks from all barrier structs, translates them into
 * cache flush/invalidate bits on the appropriate cache state (renderpass or
 * outer), and emits a CP_WAIT_REG_MEM for each event to wait on.
 */
static void
tu_barrier(struct tu_cmd_buffer *cmd,
           uint32_t memoryBarrierCount,
           const VkMemoryBarrier *pMemoryBarriers,
           uint32_t bufferMemoryBarrierCount,
           const VkBufferMemoryBarrier *pBufferMemoryBarriers,
           uint32_t imageMemoryBarrierCount,
           const VkImageMemoryBarrier *pImageMemoryBarriers,
           const struct tu_barrier_info *info)
{
   /* Inside a render pass the barrier goes into the deferred draw stream. */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
   VkAccessFlags srcAccessMask = 0;
   VkAccessFlags dstAccessMask = 0;

   if (cmd->state.pass) {
      const VkPipelineStageFlags framebuffer_space_stages =
         VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
         VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;

      /* We cannot have non-by-region "fb-space to fb-space" barriers.
       *
       * From the Vulkan 1.2.185 spec, section 7.6.1 "Subpass Self-dependency":
       *
       *    If the source and destination stage masks both include
       *    framebuffer-space stages, then dependencyFlags must include
       *    VK_DEPENDENCY_BY_REGION_BIT.
       *    [...]
       *    Each of the synchronization scopes and access scopes of a
       *    vkCmdPipelineBarrier2KHR or vkCmdPipelineBarrier command inside
       *    a render pass instance must be a subset of the scopes of one of
       *    the self-dependencies for the current subpass.
       *
       *    If the self-dependency has VK_DEPENDENCY_BY_REGION_BIT or
       *    VK_DEPENDENCY_VIEW_LOCAL_BIT set, then so must the pipeline barrier.
       *
       * By-region barriers are ok for gmem. All other barriers would involve
       * vtx stages which are NOT ok for gmem rendering.
       * See dep_invalid_for_gmem().
       */
      if ((info->srcStageMask & ~framebuffer_space_stages) ||
          (info->dstStageMask & ~framebuffer_space_stages)) {
         cmd->state.disable_gmem = true;
      }
   }

   /* Union the access masks of every barrier struct — the hardware-level
    * flushes below only care about the combined scopes.
    */
   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
      srcAccessMask |= pMemoryBarriers[i].srcAccessMask;
      dstAccessMask |= pMemoryBarriers[i].dstAccessMask;
   }

   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
      srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask;
      dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask;
   }

   enum tu_cmd_access_mask src_flags = 0;
   enum tu_cmd_access_mask dst_flags = 0;

   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
      VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout;
      if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
         /* The underlying memory for this image may have been used earlier
          * within the same queue submission for a different image, which
          * means that there may be old, stale cache entries which are in the
          * "wrong" location, which could cause problems later after writing
          * to the image. We don't want these entries being flushed later and
          * overwriting the actual image, so we need to flush the CCU.
          */
         src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
      }
      srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask;
      dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask;
   }

   /* Inside a renderpass, we don't know yet whether we'll be using sysmem
    * so we have to use the sysmem flushes.
    */
   bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
      !cmd->state.pass;
   src_flags |= vk2tu_access(srcAccessMask, gmem);
   dst_flags |= vk2tu_access(dstAccessMask, gmem);

   struct tu_cache_state *cache =
      cmd->state.pass  ? &cmd->state.renderpass_cache : &cmd->state.cache;
   tu_flush_for_access(cache, src_flags, dst_flags);

   enum tu_stage src_stage = vk2tu_src_stage(info->srcStageMask);
   enum tu_stage dst_stage = vk2tu_dst_stage(info->dstStageMask);
   tu_flush_for_stage(cache, src_stage, dst_stage);

   /* For vkCmdWaitEvents: poll each event BO until it reads 1 (set by
    * write_event() on the signaling side).
    */
   for (uint32_t i = 0; i < info->eventCount; i++) {
      TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);

      tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
      tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                     CP_WAIT_REG_MEM_0_POLL_MEMORY);
      tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
      tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
      tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
      tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
   }
}
4812361fc4cbSmaya
48137ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4814361fc4cbSmayatu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4815361fc4cbSmaya                      VkPipelineStageFlags srcStageMask,
48167ec681f3Smrg                      VkPipelineStageFlags dstStageMask,
48177ec681f3Smrg                      VkDependencyFlags dependencyFlags,
4818361fc4cbSmaya                      uint32_t memoryBarrierCount,
4819361fc4cbSmaya                      const VkMemoryBarrier *pMemoryBarriers,
4820361fc4cbSmaya                      uint32_t bufferMemoryBarrierCount,
4821361fc4cbSmaya                      const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4822361fc4cbSmaya                      uint32_t imageMemoryBarrierCount,
4823361fc4cbSmaya                      const VkImageMemoryBarrier *pImageMemoryBarriers)
4824361fc4cbSmaya{
4825361fc4cbSmaya   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4826361fc4cbSmaya   struct tu_barrier_info info;
4827361fc4cbSmaya
4828361fc4cbSmaya   info.eventCount = 0;
4829361fc4cbSmaya   info.pEvents = NULL;
4830361fc4cbSmaya   info.srcStageMask = srcStageMask;
48317ec681f3Smrg   info.dstStageMask = dstStageMask;
4832361fc4cbSmaya
4833361fc4cbSmaya   tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4834361fc4cbSmaya              bufferMemoryBarrierCount, pBufferMemoryBarriers,
4835361fc4cbSmaya              imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4836361fc4cbSmaya}
4837361fc4cbSmaya
/* Write 'value' into the event's BO: immediately via CP_MEM_WRITE when only
 * top-of-pipe work must precede it, otherwise from a RB_DONE_TS event so the
 * write lands after all prior rendering completes. Shared by vkCmdSetEvent
 * (value=1) and vkCmdResetEvent (value=0).
 */
static void
write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
            VkPipelineStageFlags stageMask, unsigned value)
{
   struct tu_cs *cs = &cmd->cs;

   /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
   assert(!cmd->state.pass);

   tu_emit_cache_flush(cmd, cs);

   /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
    * read by the CP, so the draw indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags top_of_pipe_flags =
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;

   if (!(stageMask & ~top_of_pipe_flags)) {
      /* Only top-of-pipe stages: the CP can write the value directly. */
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
      tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
      tu_cs_emit(cs, value);
   } else {
      /* Use a RB_DONE_TS event to wait for everything to complete. */
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
      tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
      tu_cs_emit_qw(cs, event->bo.iova);
      tu_cs_emit(cs, value);
   }
}
4868361fc4cbSmaya
48697ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4870361fc4cbSmayatu_CmdSetEvent(VkCommandBuffer commandBuffer,
4871361fc4cbSmaya               VkEvent _event,
4872361fc4cbSmaya               VkPipelineStageFlags stageMask)
4873361fc4cbSmaya{
48747ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4875361fc4cbSmaya   TU_FROM_HANDLE(tu_event, event, _event);
4876361fc4cbSmaya
48777ec681f3Smrg   write_event(cmd, event, stageMask, 1);
4878361fc4cbSmaya}
4879361fc4cbSmaya
48807ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4881361fc4cbSmayatu_CmdResetEvent(VkCommandBuffer commandBuffer,
4882361fc4cbSmaya                 VkEvent _event,
4883361fc4cbSmaya                 VkPipelineStageFlags stageMask)
4884361fc4cbSmaya{
48857ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4886361fc4cbSmaya   TU_FROM_HANDLE(tu_event, event, _event);
4887361fc4cbSmaya
48887ec681f3Smrg   write_event(cmd, event, stageMask, 0);
4889361fc4cbSmaya}
4890361fc4cbSmaya
48917ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4892361fc4cbSmayatu_CmdWaitEvents(VkCommandBuffer commandBuffer,
4893361fc4cbSmaya                 uint32_t eventCount,
4894361fc4cbSmaya                 const VkEvent *pEvents,
4895361fc4cbSmaya                 VkPipelineStageFlags srcStageMask,
4896361fc4cbSmaya                 VkPipelineStageFlags dstStageMask,
4897361fc4cbSmaya                 uint32_t memoryBarrierCount,
4898361fc4cbSmaya                 const VkMemoryBarrier *pMemoryBarriers,
4899361fc4cbSmaya                 uint32_t bufferMemoryBarrierCount,
4900361fc4cbSmaya                 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4901361fc4cbSmaya                 uint32_t imageMemoryBarrierCount,
4902361fc4cbSmaya                 const VkImageMemoryBarrier *pImageMemoryBarriers)
4903361fc4cbSmaya{
49047ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4905361fc4cbSmaya   struct tu_barrier_info info;
4906361fc4cbSmaya
4907361fc4cbSmaya   info.eventCount = eventCount;
4908361fc4cbSmaya   info.pEvents = pEvents;
49097ec681f3Smrg   info.srcStageMask = srcStageMask;
49107ec681f3Smrg   info.dstStageMask = dstStageMask;
4911361fc4cbSmaya
49127ec681f3Smrg   tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers,
4913361fc4cbSmaya              bufferMemoryBarrierCount, pBufferMemoryBarriers,
4914361fc4cbSmaya              imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4915361fc4cbSmaya}
4916361fc4cbSmaya
49177ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
4918361fc4cbSmayatu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
4919361fc4cbSmaya{
4920361fc4cbSmaya   /* No-op */
4921361fc4cbSmaya}
49227ec681f3Smrg
49237ec681f3Smrg
/* vkCmdBeginConditionalRenderingEXT: enable CP draw predication based on a
 * 32-bit value read from the given buffer+offset.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
                                   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   /* Track predication state; also note when it happens inside a render
    * pass (has_subpass_predication).
    */
   cmd->state.predication_active = true;
   if (cmd->state.pass)
      cmd->state.has_subpass_predication = true;

   /* Inside a render pass the packets go into the draw command stream,
    * otherwise into the main command stream.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 1);

   /* Wait for any writes to the predicate to land */
   if (cmd->state.pass)
      tu_emit_cache_flush_renderpass(cmd, cs);
   else
      tu_emit_cache_flush(cmd, cs);

   TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer);
   uint64_t iova = tu_buffer_iova(buf) + pConditionalRenderingBegin->offset;

   /* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan
    * mandates 32-bit comparisons. Our workaround is to copy the reference
    * value to the low 32 bits of a location where the high 32 bits are known
    * to be 0 and then compare that.
    */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   tu_cs_emit(cs, 0);
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
   tu_cs_emit_qw(cs, iova);

   /* Ensure the copy above has landed before the CP evaluates it. */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

   /* VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT inverts the test: draw when
    * the predicate is zero instead of non-zero.
    */
   bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
   tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
                  CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
}
49677ec681f3Smrg
49687ec681f3SmrgVKAPI_ATTR void VKAPI_CALL
49697ec681f3Smrgtu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
49707ec681f3Smrg{
49717ec681f3Smrg   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
49727ec681f3Smrg
49737ec681f3Smrg   cmd->state.predication_active = false;
49747ec681f3Smrg
49757ec681f3Smrg   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
49767ec681f3Smrg
49777ec681f3Smrg   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
49787ec681f3Smrg   tu_cs_emit(cs, 0);
49797ec681f3Smrg}
49807ec681f3Smrg
4981