1/*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a
9 * copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the next
16 * paragraph) shall be included in all copies or substantial portions of the
17 * Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 * DEALINGS IN THE SOFTWARE.
26 */
27
28#include "tu_private.h"
29
30#include "adreno_pm4.xml.h"
31#include "adreno_common.xml.h"
32
33#include "vk_format.h"
34#include "vk_util.h"
35
36#include "tu_cs.h"
37
38#include "tu_tracepoints.h"
39
40void
41tu6_emit_event_write(struct tu_cmd_buffer *cmd,
42                     struct tu_cs *cs,
43                     enum vgt_event_type event)
44{
45   bool need_seqno = false;
46   switch (event) {
47   case CACHE_FLUSH_TS:
48   case WT_DONE_TS:
49   case RB_DONE_TS:
50   case PC_CCU_FLUSH_DEPTH_TS:
51   case PC_CCU_FLUSH_COLOR_TS:
52   case PC_CCU_RESOLVE_TS:
53      need_seqno = true;
54      break;
55   default:
56      break;
57   }
58
59   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, need_seqno ? 4 : 1);
60   tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(event));
61   if (need_seqno) {
62      tu_cs_emit_qw(cs, global_iova(cmd, seqno_dummy));
63      tu_cs_emit(cs, 0);
64   }
65}
66
/* Emit the hardware packets/events that realize the requested set of flush
 * bits. The emission order below is deliberate: CCU flushes must land before
 * CCU invalidates (see inline comment), cache events come next, and the
 * WAIT_MEM_WRITES / WFI / WFM waits are emitted last.
 */
static void
tu6_emit_flushes(struct tu_cmd_buffer *cmd_buffer,
                 struct tu_cs *cs,
                 enum tu_cmd_flush_bits flushes)
{
   /* Debug overrides: FLUSHALL forces every flush/invalidate; SYNCDRAW adds
    * full synchronization on every call.
    */
   if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_FLUSHALL))
      flushes |= TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE;

   if (unlikely(cmd_buffer->device->physical_device->instance->debug_flags & TU_DEBUG_SYNCDRAW))
      flushes |= TU_CMD_FLAG_WAIT_MEM_WRITES |
                 TU_CMD_FLAG_WAIT_FOR_IDLE |
                 TU_CMD_FLAG_WAIT_FOR_ME;

   /* Experiments show that invalidating CCU while it still has data in it
    * doesn't work, so make sure to always flush before invalidating in case
    * any data remains that hasn't yet been made available through a barrier.
    * However it does seem to work for UCHE.
    */
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR |
                  TU_CMD_FLAG_CCU_INVALIDATE_COLOR))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_COLOR_TS);
   if (flushes & (TU_CMD_FLAG_CCU_FLUSH_DEPTH |
                  TU_CMD_FLAG_CCU_INVALIDATE_DEPTH))
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_FLUSH_DEPTH_TS);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_COLOR)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_COLOR);
   if (flushes & TU_CMD_FLAG_CCU_INVALIDATE_DEPTH)
      tu6_emit_event_write(cmd_buffer, cs, PC_CCU_INVALIDATE_DEPTH);
   if (flushes & TU_CMD_FLAG_CACHE_FLUSH)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_FLUSH_TS);
   if (flushes & TU_CMD_FLAG_CACHE_INVALIDATE)
      tu6_emit_event_write(cmd_buffer, cs, CACHE_INVALIDATE);
   if (flushes & TU_CMD_FLAG_WAIT_MEM_WRITES)
      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   /* A WFI is also forced after CCU flushes on parts with the CCU flush
    * hardware bug.
    */
   if ((flushes & TU_CMD_FLAG_WAIT_FOR_IDLE) ||
       (cmd_buffer->device->physical_device->info->a6xx.has_ccu_flush_bug &&
        (flushes & (TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CCU_FLUSH_DEPTH))))
      tu_cs_emit_wfi(cs);
   if (flushes & TU_CMD_FLAG_WAIT_FOR_ME)
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
}
108
109/* "Normal" cache flushes, that don't require any special handling */
110
111static void
112tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer,
113                    struct tu_cs *cs)
114{
115   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.cache.flush_bits);
116   cmd_buffer->state.cache.flush_bits = 0;
117}
118
119/* Renderpass cache flushes */
120
121void
122tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer,
123                               struct tu_cs *cs)
124{
125   if (!cmd_buffer->state.renderpass_cache.flush_bits &&
126       likely(!cmd_buffer->device->physical_device->instance->debug_flags))
127      return;
128   tu6_emit_flushes(cmd_buffer, cs, cmd_buffer->state.renderpass_cache.flush_bits);
129   cmd_buffer->state.renderpass_cache.flush_bits = 0;
130}
131
132/* Cache flushes for things that use the color/depth read/write path (i.e.
133 * blits and draws). This deals with changing CCU state as well as the usual
134 * cache flushing.
135 */
136
/* Emit cache flushes plus any CCU reconfiguration needed to move to
 * `ccu_state` (GMEM vs. sysmem CCU layout). Must be called before any work
 * that reads/writes color or depth through the CCU in the new mode.
 */
void
tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                        struct tu_cs *cs,
                        enum tu_cmd_ccu_state ccu_state)
{
   enum tu_cmd_flush_bits flushes = cmd_buffer->state.cache.flush_bits;

   assert(ccu_state != TU_CMD_CCU_UNKNOWN);

   /* Changing CCU state must involve invalidating the CCU. In sysmem mode,
    * the CCU may also contain data that we haven't flushed out yet, so we
    * also need to flush. Also, in order to program RB_CCU_CNTL, we need to
    * emit a WFI as it isn't pipelined.
    */
   if (ccu_state != cmd_buffer->state.ccu_state) {
      if (cmd_buffer->state.ccu_state != TU_CMD_CCU_GMEM) {
         flushes |=
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH;
         /* These flushes happen now, so any barrier that would later
          * request them is already satisfied — drop them from pending.
          */
         cmd_buffer->state.cache.pending_flush_bits &= ~(
            TU_CMD_FLAG_CCU_FLUSH_COLOR |
            TU_CMD_FLAG_CCU_FLUSH_DEPTH);
      }
      flushes |=
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE;
      cmd_buffer->state.cache.pending_flush_bits &= ~(
         TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
         TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
         TU_CMD_FLAG_WAIT_FOR_IDLE);
   }

   tu6_emit_flushes(cmd_buffer, cs, flushes);
   cmd_buffer->state.cache.flush_bits = 0;

   /* Reprogram RB_CCU_CNTL for the new mode: GMEM mode uses the GMEM color
    * offset, otherwise the bypass offset.
    */
   if (ccu_state != cmd_buffer->state.ccu_state) {
      struct tu_physical_device *phys_dev = cmd_buffer->device->physical_device;
      tu_cs_emit_regs(cs,
                      A6XX_RB_CCU_CNTL(.color_offset =
                                          ccu_state == TU_CMD_CCU_GMEM ?
                                          phys_dev->ccu_offset_gmem :
                                          phys_dev->ccu_offset_bypass,
                                       .gmem = ccu_state == TU_CMD_CCU_GMEM));
      cmd_buffer->state.ccu_state = ccu_state;
   }
}
184
/* Program the depth/stencil buffer state (RB, GRAS and LRZ registers) for
 * the given subpass. With no depth/stencil attachment, everything is
 * programmed to the "none"/zero state.
 */
static void
tu6_emit_zs(struct tu_cmd_buffer *cmd,
            const struct tu_subpass *subpass,
            struct tu_cs *cs)
{
   const uint32_t a = subpass->depth_stencil_attachment.attachment;
   if (a == VK_ATTACHMENT_UNUSED) {
      /* No depth/stencil: clear out the buffer info, LRZ and stencil regs. */
      tu_cs_emit_regs(cs,
                      A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE),
                      A6XX_RB_DEPTH_BUFFER_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(0),
                      A6XX_RB_DEPTH_BUFFER_BASE(0),
                      A6XX_RB_DEPTH_BUFFER_BASE_GMEM(0));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE));

      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      tu_cs_emit_regs(cs, A6XX_RB_STENCIL_INFO(0));

      return;
   }

   const struct tu_image_view *iview = cmd->state.attachments[a];
   const struct tu_render_pass_attachment *attachment =
      &cmd->state.pass->attachments[a];
   enum a6xx_depth_format fmt = tu6_pipe2depth(attachment->format);

   /* RB_DEPTH_BUFFER_INFO..BASE_GMEM: info word, image address/pitch via
    * tu_cs_image_ref(), then the attachment's GMEM offset.
    */
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6);
   tu_cs_emit(cs, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt).value);
   tu_cs_image_ref(cs, iview, 0);
   tu_cs_emit(cs, attachment->gmem_offset);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt));

   /* UBWC flags buffer for the depth attachment. */
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3);
   tu_cs_image_flag_ref(cs, iview, 0);

   /* LRZ (low-resolution Z) buffer lives at lrz_offset within the image BO. */
   tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_BUFFER_BASE(.bo = iview->image->bo,
                                                 .bo_offset = iview->image->bo_offset + iview->image->lrz_offset),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = iview->image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE());

   if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
       attachment->format == VK_FORMAT_S8_UINT) {

      /* Stencil is stored separately: D32S8 uses the dedicated stencil
       * plane/offset, pure S8 reuses the image's main plane.
       */
      tu_cs_emit_pkt4(cs, REG_A6XX_RB_STENCIL_INFO, 6);
      tu_cs_emit(cs, A6XX_RB_STENCIL_INFO(.separate_stencil = true).value);
      if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         tu_cs_image_stencil_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset_stencil);
      } else {
         tu_cs_image_ref(cs, iview, 0);
         tu_cs_emit(cs, attachment->gmem_offset);
      }
   } else {
      tu_cs_emit_regs(cs,
                     A6XX_RB_STENCIL_INFO(0));
   }
}
250
/* Program the color attachment (MRT) state for the given subpass: per-MRT
 * buffer info, UBWC flag buffers, sRGB enables, layer count, and the
 * feedback-loop workaround for sysmem mode.
 */
static void
tu6_emit_mrt(struct tu_cmd_buffer *cmd,
             const struct tu_subpass *subpass,
             struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   for (uint32_t i = 0; i < subpass->color_count; ++i) {
      uint32_t a = subpass->color_attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      const struct tu_image_view *iview = cmd->state.attachments[a];

      /* RB_MRT_BUF_INFO(i): precomputed info word, image address/pitch,
       * then the attachment's GMEM offset.
       */
      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_BUF_INFO(i), 6);
      tu_cs_emit(cs, iview->RB_MRT_BUF_INFO);
      tu_cs_image_ref(cs, iview, 0);
      tu_cs_emit(cs, cmd->state.pass->attachments[a].gmem_offset);

      tu_cs_emit_regs(cs,
                      A6XX_SP_FS_MRT_REG(i, .dword = iview->SP_FS_MRT_REG));

      /* UBWC flags buffer for this MRT. */
      tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER_ADDR(i), 3);
      tu_cs_image_flag_ref(cs, iview, 0);
   }

   tu_cs_emit_regs(cs,
                   A6XX_RB_SRGB_CNTL(.dword = subpass->srgb_cntl));
   tu_cs_emit_regs(cs,
                   A6XX_SP_SRGB_CNTL(.dword = subpass->srgb_cntl));

   /* With multiview, the view mask determines the highest layer written. */
   unsigned layers = MAX2(fb->layers, util_logbase2(subpass->multiview_mask) + 1);
   tu_cs_emit_regs(cs, A6XX_GRAS_MAX_LAYER_INDEX(layers - 1));

   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                        A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));

   /* If there is a feedback loop, then the shader can read the previous value
    * of a pixel being written out. It can also write some components and then
    * read different components without a barrier in between. This is a
    * problem in sysmem mode with UBWC, because the main buffer and flags
    * buffer can get out-of-sync if only one is flushed. We fix this by
    * setting the SINGLE_PRIM_MODE field to the same value that the blob does
    * for advanced_blend in sysmem mode if a feedback loop is detected.
    */
   if (subpass->feedback) {
      tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
      tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
                           A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
                           A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(
                              FLUSH_PER_OVERLAP_AND_OVERWRITE));
      tu_cond_exec_end(cs);
   }
}
305
306void
307tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits vk_samples,
308              enum a5xx_line_mode line_mode)
309{
310   const enum a3xx_msaa_samples samples = tu_msaa_samples(vk_samples);
311   bool msaa_disable = (samples == MSAA_ONE) || (line_mode == BRESENHAM);
312
313   tu_cs_emit_regs(cs,
314                   A6XX_SP_TP_RAS_MSAA_CNTL(samples),
315                   A6XX_SP_TP_DEST_MSAA_CNTL(.samples = samples,
316                                             .msaa_disable = msaa_disable));
317
318   tu_cs_emit_regs(cs,
319                   A6XX_GRAS_RAS_MSAA_CNTL(samples),
320                   A6XX_GRAS_DEST_MSAA_CNTL(.samples = samples,
321                                            .msaa_disable = msaa_disable));
322
323   tu_cs_emit_regs(cs,
324                   A6XX_RB_RAS_MSAA_CNTL(samples),
325                   A6XX_RB_DEST_MSAA_CNTL(.samples = samples,
326                                          .msaa_disable = msaa_disable));
327
328   tu_cs_emit_regs(cs,
329                   A6XX_RB_MSAA_CNTL(samples));
330}
331
/* Program the tile (bin) size into GRAS and RB. `flags` carries the
 * mode-specific control bits for GRAS_BIN_CONTROL / RB_BIN_CONTROL.
 */
static void
tu6_emit_bin_size(struct tu_cs *cs,
                  uint32_t bin_w, uint32_t bin_h, uint32_t flags)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_BIN_CONTROL(.binw = bin_w,
                                         .binh = bin_h,
                                         .dword = flags));

   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL(.binw = bin_w,
                                       .binh = bin_h,
                                       .dword = flags));

   /* no flag for RB_BIN_CONTROL2... */
   tu_cs_emit_regs(cs,
                   A6XX_RB_BIN_CONTROL2(.binw = bin_w,
                                        .binh = bin_h));
}
351
/* Emit RB_RENDER_CNTL for either the binning pass or the rendering pass,
 * using CP_REG_WRITE tracking where the hardware supports it so the value
 * can be patched per render mode.
 */
static void
tu6_emit_render_cntl(struct tu_cmd_buffer *cmd,
                     const struct tu_subpass *subpass,
                     struct tu_cs *cs,
                     bool binning)
{
   /* NOTE(review): RB_RENDER_CNTL is programmed differently for the binning
    * pass (only the BINNING bit, and only when CP_REG_WRITE tracking is
    * available).
    */
   bool no_track = !cmd->device->physical_device->info->a6xx.has_cp_reg_write;
   uint32_t cntl = 0;
   cntl |= A6XX_RB_RENDER_CNTL_CCUSINGLECACHELINESIZE(2);
   if (binning) {
      /* Without CP_REG_WRITE there is nothing to emit for binning. */
      if (no_track)
         return;
      cntl |= A6XX_RB_RENDER_CNTL_BINNING;
   } else {
      /* Collect which MRTs (and depth) have UBWC enabled, as the render
       * control register needs a flag bit per compressed attachment.
       */
      uint32_t mrts_ubwc_enable = 0;
      for (uint32_t i = 0; i < subpass->color_count; ++i) {
         uint32_t a = subpass->color_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         const struct tu_image_view *iview = cmd->state.attachments[a];
         if (iview->ubwc_enabled)
            mrts_ubwc_enable |= 1 << i;
      }

      cntl |= A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable);

      const uint32_t a = subpass->depth_stencil_attachment.attachment;
      if (a != VK_ATTACHMENT_UNUSED) {
         const struct tu_image_view *iview = cmd->state.attachments[a];
         if (iview->ubwc_enabled)
            cntl |= A6XX_RB_RENDER_CNTL_FLAG_DEPTH;
      }

      /* No tracker available: write the register directly and be done. */
      if (no_track) {
         tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CNTL, 1);
         tu_cs_emit(cs, cntl);
         return;
      }

      /* In the !binning case, we need to set RB_RENDER_CNTL in the draw_cs
       * in order to set it correctly for the different subpasses. However,
       * that means the packets we're emitting also happen during binning. So
       * we need to guard the write on !BINNING at CP execution time.
       */
      tu_cs_reserve(cs, 3 + 4);
      tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
      tu_cs_emit(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                     CP_COND_REG_EXEC_0_GMEM | CP_COND_REG_EXEC_0_SYSMEM);
      tu_cs_emit(cs, CP_COND_REG_EXEC_1_DWORDS(4));
   }

   /* Tracked write: the CP remembers this as the current RENDER_CNTL value. */
   tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
   tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL));
   tu_cs_emit(cs, REG_A6XX_RB_RENDER_CNTL);
   tu_cs_emit(cs, cntl);
}
410
411static void
412tu6_emit_blit_scissor(struct tu_cmd_buffer *cmd, struct tu_cs *cs, bool align)
413{
414   struct tu_physical_device *phys_dev = cmd->device->physical_device;
415   const VkRect2D *render_area = &cmd->state.render_area;
416
417   /* Avoid assertion fails with an empty render area at (0, 0) where the
418    * subtraction below wraps around. Empty render areas should be forced to
419    * the sysmem path by use_sysmem_rendering(). It's not even clear whether
420    * an empty scissor here works, and the blob seems to force sysmem too as
421    * it sets something wrong (non-empty) for the scissor.
422    */
423   if (render_area->extent.width == 0 ||
424       render_area->extent.height == 0)
425      return;
426
427   uint32_t x1 = render_area->offset.x;
428   uint32_t y1 = render_area->offset.y;
429   uint32_t x2 = x1 + render_area->extent.width - 1;
430   uint32_t y2 = y1 + render_area->extent.height - 1;
431
432   if (align) {
433      x1 = x1 & ~(phys_dev->info->gmem_align_w - 1);
434      y1 = y1 & ~(phys_dev->info->gmem_align_h - 1);
435      x2 = ALIGN_POT(x2 + 1, phys_dev->info->gmem_align_w) - 1;
436      y2 = ALIGN_POT(y2 + 1, phys_dev->info->gmem_align_h) - 1;
437   }
438
439   tu_cs_emit_regs(cs,
440                   A6XX_RB_BLIT_SCISSOR_TL(.x = x1, .y = y1),
441                   A6XX_RB_BLIT_SCISSOR_BR(.x = x2, .y = y2));
442}
443
/* Program the window scissor (and matching 2D resolve bounds) to the
 * inclusive rectangle (x1,y1)-(x2,y2).
 */
void
tu6_emit_window_scissor(struct tu_cs *cs,
                        uint32_t x1,
                        uint32_t y1,
                        uint32_t x2,
                        uint32_t y2)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1),
                   A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2));

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1),
                   A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2));
}
459
/* Program the window offset (tile origin) into all blocks that need it:
 * RB, SP and SP_TP.
 */
void
tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1)
{
   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET2(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_WINDOW_OFFSET(.x = x1, .y = y1));

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_WINDOW_OFFSET(.x = x1, .y = y1));
}
475
476void
477tu6_apply_depth_bounds_workaround(struct tu_device *device,
478                                  uint32_t *rb_depth_cntl)
479{
480   if (!device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk)
481      return;
482
483   /* On some GPUs it is necessary to enable z test for depth bounds test when
484    * UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is required to
485    * pass z test. Relevant tests:
486    *  dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
487    *  dEQP-VK.dynamic_state.ds_state.depth_bounds_1
488    */
489   *rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
490                     A6XX_RB_DEPTH_CNTL_ZFUNC(FUNC_ALWAYS);
491}
492
493static void
494tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
495{
496   uint32_t enable_mask;
497   switch (id) {
498   case TU_DRAW_STATE_PROGRAM:
499   case TU_DRAW_STATE_VI:
500   case TU_DRAW_STATE_FS_CONST:
501   /* The blob seems to not enable this (DESC_SETS_LOAD) for binning, even
502    * when resources would actually be used in the binning shader.
503    * Presumably the overhead of prefetching the resources isn't
504    * worth it.
505    */
506   case TU_DRAW_STATE_DESC_SETS_LOAD:
507      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
508                    CP_SET_DRAW_STATE__0_SYSMEM;
509      break;
510   case TU_DRAW_STATE_PROGRAM_BINNING:
511   case TU_DRAW_STATE_VI_BINNING:
512      enable_mask = CP_SET_DRAW_STATE__0_BINNING;
513      break;
514   case TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM:
515      enable_mask = CP_SET_DRAW_STATE__0_GMEM;
516      break;
517   case TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM:
518      enable_mask = CP_SET_DRAW_STATE__0_SYSMEM;
519      break;
520   default:
521      enable_mask = CP_SET_DRAW_STATE__0_GMEM |
522                    CP_SET_DRAW_STATE__0_SYSMEM |
523                    CP_SET_DRAW_STATE__0_BINNING;
524      break;
525   }
526
527   STATIC_ASSERT(TU_DRAW_STATE_COUNT <= 32);
528
529   /* We need to reload the descriptors every time the descriptor sets
530    * change. However, the commands we send only depend on the pipeline
531    * because the whole point is to cache descriptors which are used by the
532    * pipeline. There's a problem here, in that the firmware has an
533    * "optimization" which skips executing groups that are set to the same
534    * value as the last draw. This means that if the descriptor sets change
535    * but not the pipeline, we'd try to re-execute the same buffer which
536    * the firmware would ignore and we wouldn't pre-load the new
537    * descriptors. Set the DIRTY bit to avoid this optimization
538    */
539   if (id == TU_DRAW_STATE_DESC_SETS_LOAD)
540      enable_mask |= CP_SET_DRAW_STATE__0_DIRTY;
541
542   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(state.size) |
543                  enable_mask |
544                  CP_SET_DRAW_STATE__0_GROUP_ID(id) |
545                  COND(!state.size, CP_SET_DRAW_STATE__0_DISABLE));
546   tu_cs_emit_qw(cs, state.iova);
547}
548
549static bool
550use_hw_binning(struct tu_cmd_buffer *cmd)
551{
552   const struct tu_framebuffer *fb = cmd->state.framebuffer;
553
554   /* XFB commands are emitted for BINNING || SYSMEM, which makes it incompatible
555    * with non-hw binning GMEM rendering. this is required because some of the
556    * XFB commands need to only be executed once
557    */
558   if (cmd->state.xfb_used)
559      return true;
560
561   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_NOBIN))
562      return false;
563
564   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_FORCEBIN))
565      return true;
566
567   return (fb->tile_count.width * fb->tile_count.height) > 2;
568}
569
570static bool
571use_sysmem_rendering(struct tu_cmd_buffer *cmd)
572{
573   if (unlikely(cmd->device->physical_device->instance->debug_flags & TU_DEBUG_SYSMEM))
574      return true;
575
576   /* can't fit attachments into gmem */
577   if (!cmd->state.pass->gmem_pixels)
578      return true;
579
580   if (cmd->state.framebuffer->layers > 1)
581      return true;
582
583   /* Use sysmem for empty render areas */
584   if (cmd->state.render_area.extent.width == 0 ||
585       cmd->state.render_area.extent.height == 0)
586      return true;
587
588   if (cmd->state.has_tess)
589      return true;
590
591   if (cmd->state.disable_gmem)
592      return true;
593
594   return false;
595}
596
/* Set up rendering of one tile (tx, ty): window scissor/offset for the
 * tile, plus either the per-tile visibility stream (hw binning) or a
 * visibility override (no binning).
 */
static void
tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
                     struct tu_cs *cs,
                     uint32_t tx, uint32_t ty, uint32_t pipe, uint32_t slot)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM));

   /* Tile bounds in framebuffer coordinates, clamped to the max viewport. */
   const uint32_t x1 = fb->tile0.width * tx;
   const uint32_t y1 = fb->tile0.height * ty;
   const uint32_t x2 = MIN2(x1 + fb->tile0.width - 1, MAX_VIEWPORT_SIZE - 1);
   const uint32_t y2 = MIN2(y1 + fb->tile0.height - 1, MAX_VIEWPORT_SIZE - 1);
   tu6_emit_window_scissor(cs, x1, y1, x2, y2);
   tu6_emit_window_offset(cs, x1, y1);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

   if (use_hw_binning(cmd)) {
      tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);

      /* Point the CP at this tile's slot in the visibility (draw/prim)
       * streams produced by the binning pass.
       */
      tu_cs_emit_pkt7(cs, CP_SET_BIN_DATA5_OFFSET, 4);
      tu_cs_emit(cs, fb->pipe_sizes[pipe] |
                     CP_SET_BIN_DATA5_0_VSC_N(slot));
      tu_cs_emit(cs, pipe * cmd->vsc_draw_strm_pitch);
      tu_cs_emit(cs, pipe * 4);
      tu_cs_emit(cs, pipe * cmd->vsc_prim_strm_pitch);

      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   } else {
      /* No binning data: override visibility so everything is drawn. */
      tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
      tu_cs_emit(cs, 0x1);

      tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
      tu_cs_emit(cs, 0x0);
   }
}
642
643static void
644tu6_emit_sysmem_resolve(struct tu_cmd_buffer *cmd,
645                        struct tu_cs *cs,
646                        uint32_t layer_mask,
647                        uint32_t a,
648                        uint32_t gmem_a)
649{
650   const struct tu_framebuffer *fb = cmd->state.framebuffer;
651   const struct tu_image_view *dst = cmd->state.attachments[a];
652   const struct tu_image_view *src = cmd->state.attachments[gmem_a];
653
654   tu_resolve_sysmem(cmd, cs, src, dst, layer_mask, fb->layers, &cmd->state.render_area);
655}
656
/* Emit end-of-subpass multisample resolves for the sysmem rendering path,
 * flushing the CCU first since the resolves go through the 2D (CP_BLIT)
 * engine.
 */
static void
tu6_emit_sysmem_resolves(struct tu_cmd_buffer *cmd,
                         struct tu_cs *cs,
                         const struct tu_subpass *subpass)
{
   if (subpass->resolve_attachments) {
      /* From the documentation for vkCmdNextSubpass, section 7.4 "Render Pass
       * Commands":
       *
       *    End-of-subpass multisample resolves are treated as color
       *    attachment writes for the purposes of synchronization.
       *    This applies to resolve operations for both color and
       *    depth/stencil attachments. That is, they are considered to
       *    execute in the VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
       *    pipeline stage and their writes are synchronized with
       *    VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT. Synchronization between
       *    rendering within a subpass and any resolve operations at the end
       *    of the subpass occurs automatically, without need for explicit
       *    dependencies or pipeline barriers. However, if the resolve
       *    attachment is also used in a different subpass, an explicit
       *    dependency is needed.
       *
       * We use the CP_BLIT path for sysmem resolves, which is really a
       * transfer command, so we have to manually flush similar to the gmem
       * resolve case. However, a flush afterwards isn't needed because of the
       * last sentence and the fact that we're in sysmem mode.
       */
      tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_COLOR_TS);
      if (subpass->resolve_depth_stencil)
         tu6_emit_event_write(cmd, cs, PC_CCU_FLUSH_DEPTH_TS);

      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

      /* Wait for the flushes to land before using the 2D engine */
      tu_cs_emit_wfi(cs);

      for (unsigned i = 0; i < subpass->resolve_count; i++) {
         uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);

         tu6_emit_sysmem_resolve(cmd, cs, subpass->multiview_mask, a, gmem_a);
      }
   }
}
704
705static void
706tu6_emit_tile_store(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
707{
708   const struct tu_render_pass *pass = cmd->state.pass;
709   const struct tu_subpass *subpass = &pass->subpasses[pass->subpass_count-1];
710
711   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
712   tu_cs_emit(cs, 0x0);
713
714   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
715   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE));
716
717   tu6_emit_blit_scissor(cmd, cs, true);
718
719   for (uint32_t a = 0; a < pass->attachment_count; ++a) {
720      if (pass->attachments[a].gmem_offset >= 0)
721         tu_store_gmem_attachment(cmd, cs, a, a);
722   }
723
724   if (subpass->resolve_attachments) {
725      for (unsigned i = 0; i < subpass->resolve_count; i++) {
726         uint32_t a = subpass->resolve_attachments[i].attachment;
727         if (a != VK_ATTACHMENT_UNUSED) {
728            uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);
729            tu_store_gmem_attachment(cmd, cs, a, gmem_a);
730         }
731      }
732   }
733}
734
/* Emit a CP_SET_DRAW_STATE that disables every draw-state group, and flag
 * the command buffer so all draw states are re-emitted on the next draw.
 */
void
tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
                     CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
                     CP_SET_DRAW_STATE__0_GROUP_ID(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
}
747
748static void
749tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
750{
751   struct tu_device *dev = cmd->device;
752   const struct tu_physical_device *phys_dev = dev->physical_device;
753
754   tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);
755
756   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
757         .vs_state = true,
758         .hs_state = true,
759         .ds_state = true,
760         .gs_state = true,
761         .fs_state = true,
762         .cs_state = true,
763         .gfx_ibo = true,
764         .cs_ibo = true,
765         .gfx_shared_const = true,
766         .cs_shared_const = true,
767         .gfx_bindless = 0x1f,
768         .cs_bindless = 0x1f));
769
770   tu_cs_emit_wfi(cs);
771
772   cmd->state.cache.pending_flush_bits &=
773      ~(TU_CMD_FLAG_WAIT_FOR_IDLE | TU_CMD_FLAG_CACHE_INVALIDATE);
774
775   tu_cs_emit_regs(cs,
776                   A6XX_RB_CCU_CNTL(.color_offset = phys_dev->ccu_offset_bypass));
777   cmd->state.ccu_state = TU_CMD_CCU_SYSMEM;
778   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
779   tu_cs_emit_write_reg(cs, REG_A6XX_SP_FLOAT_CNTL, 0);
780   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_AE00, 0);
781   tu_cs_emit_write_reg(cs, REG_A6XX_SP_PERFCTR_ENABLE, 0x3f);
782   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_UNKNOWN_B605, 0x44);
783   tu_cs_emit_write_reg(cs, REG_A6XX_TPL1_DBG_ECO_CNTL,
784                        phys_dev->info->a6xx.magic.TPL1_DBG_ECO_CNTL);
785   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
786   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE01, 0);
787
788   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9600, 0);
789   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_DBG_ECO_CNTL, 0x880);
790   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_UNKNOWN_BE04, 0);
791   tu_cs_emit_write_reg(cs, REG_A6XX_SP_CHICKEN_BITS, 0x00000410);
792   tu_cs_emit_write_reg(cs, REG_A6XX_SP_IBO_COUNT, 0);
793   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B182, 0);
794   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_SHARED_CONSTS, 0);
795   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
796   tu_cs_emit_write_reg(cs, REG_A6XX_UCHE_CLIENT_PF, 4);
797   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8E01, 0x0);
798   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_A9A8, 0);
799   tu_cs_emit_write_reg(cs, REG_A6XX_SP_MODE_CONTROL,
800                        A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4);
801
802   /* TODO: set A6XX_VFD_ADD_OFFSET_INSTANCE and fix ir3 to avoid adding base instance */
803   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX);
804   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
805   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x1f);
806
807   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_8110, 0);
808
809   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8818, 0);
810   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_8819, 0);
811   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881A, 0);
812   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881B, 0);
813   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881C, 0);
814   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881D, 0);
815   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_881E, 0);
816   tu_cs_emit_write_reg(cs, REG_A6XX_RB_UNKNOWN_88F0, 0);
817
818   tu_cs_emit_regs(cs, A6XX_VPC_POINT_COORD_INVERT(false));
819   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9300, 0);
820
821   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));
822
823   tu_cs_emit_write_reg(cs, REG_A6XX_SP_UNKNOWN_B183, 0);
824
825   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 0);
826   tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_UNKNOWN_80AF, 0);
827   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9210, 0);
828   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9211, 0);
829   tu_cs_emit_write_reg(cs, REG_A6XX_VPC_UNKNOWN_9602, 0);
830   tu_cs_emit_write_reg(cs, REG_A6XX_PC_UNKNOWN_9E72, 0);
831   tu_cs_emit_write_reg(cs, REG_A6XX_SP_TP_MODE_CNTL,
832                        0x000000a0 |
833                        A6XX_SP_TP_MODE_CNTL_ISAMMODE(ISAMMODE_GL));
834   tu_cs_emit_write_reg(cs, REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);
835
836   tu_cs_emit_write_reg(cs, REG_A6XX_VFD_MODE_CNTL, 0x00000000);
837
838   tu_cs_emit_write_reg(cs, REG_A6XX_PC_MODE_CNTL, 0x0000001f);
839
840   tu_cs_emit_regs(cs, A6XX_RB_ALPHA_CONTROL()); /* always disable alpha test */
841   tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL()); /* always disable dithering */
842
843   tu_disable_draw_states(cmd, cs);
844
845   tu_cs_emit_regs(cs,
846                   A6XX_SP_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
847                                                     .bo_offset = gb_offset(bcolor_builtin)));
848   tu_cs_emit_regs(cs,
849                   A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR(.bo = &dev->global_bo,
850                                                        .bo_offset = gb_offset(bcolor_builtin)));
851
852   /* VSC buffers:
853    * use vsc pitches from the largest values used so far with this device
854    * if there hasn't been overflow, there will already be a scratch bo
855    * allocated for these sizes
856    *
857    * if overflow is detected, the stream size is increased by 2x
858    */
859   mtx_lock(&dev->mutex);
860
861   struct tu6_global *global = dev->global_bo.map;
862
863   uint32_t vsc_draw_overflow = global->vsc_draw_overflow;
864   uint32_t vsc_prim_overflow = global->vsc_prim_overflow;
865
866   if (vsc_draw_overflow >= dev->vsc_draw_strm_pitch)
867      dev->vsc_draw_strm_pitch = (dev->vsc_draw_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
868
869   if (vsc_prim_overflow >= dev->vsc_prim_strm_pitch)
870      dev->vsc_prim_strm_pitch = (dev->vsc_prim_strm_pitch - VSC_PAD) * 2 + VSC_PAD;
871
872   cmd->vsc_prim_strm_pitch = dev->vsc_prim_strm_pitch;
873   cmd->vsc_draw_strm_pitch = dev->vsc_draw_strm_pitch;
874
875   mtx_unlock(&dev->mutex);
876
877   struct tu_bo *vsc_bo;
878   uint32_t size0 = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES +
879                    cmd->vsc_draw_strm_pitch * MAX_VSC_PIPES;
880
881   tu_get_scratch_bo(dev, size0 + MAX_VSC_PIPES * 4, &vsc_bo);
882
883   tu_cs_emit_regs(cs,
884                   A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = vsc_bo, .bo_offset = size0));
885   tu_cs_emit_regs(cs,
886                   A6XX_VSC_PRIM_STRM_ADDRESS(.bo = vsc_bo));
887   tu_cs_emit_regs(cs,
888                   A6XX_VSC_DRAW_STRM_ADDRESS(.bo = vsc_bo,
889                                              .bo_offset = cmd->vsc_prim_strm_pitch * MAX_VSC_PIPES));
890
891   tu_cs_sanity_check(cs);
892}
893
/* Program the VSC (visibility stream compressor) configuration for the
 * current framebuffer: bin dimensions and counts, the 32 per-pipe tile
 * assignment registers, and the pitch/limit of the per-pipe primitive and
 * draw streams (pitches were chosen in cmd buffer setup; see the VSC buffer
 * allocation in the hw-init path).
 */
static void
update_vsc_pipe(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   tu_cs_emit_regs(cs,
                   A6XX_VSC_BIN_SIZE(.width = fb->tile0.width,
                                     .height = fb->tile0.height));

   tu_cs_emit_regs(cs,
                   A6XX_VSC_BIN_COUNT(.nx = fb->tile_count.width,
                                      .ny = fb->tile_count.height));

   /* 32 VSC_PIPE_CONFIG registers, precomputed per-framebuffer. */
   tu_cs_emit_pkt4(cs, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32);
   tu_cs_emit_array(cs, fb->pipe_config, 32);

   /* LIMIT is pitch minus the padding reserved for overflow detection. */
   tu_cs_emit_regs(cs,
                   A6XX_VSC_PRIM_STRM_PITCH(cmd->vsc_prim_strm_pitch),
                   A6XX_VSC_PRIM_STRM_LIMIT(cmd->vsc_prim_strm_pitch - VSC_PAD));

   tu_cs_emit_regs(cs,
                   A6XX_VSC_DRAW_STRM_PITCH(cmd->vsc_draw_strm_pitch),
                   A6XX_VSC_DRAW_STRM_LIMIT(cmd->vsc_draw_strm_pitch - VSC_PAD));
}
918
/* For each used VSC pipe, emit conditional writes that detect draw/prim
 * visibility stream overflow: if the stream size register reaches the
 * allocated pitch (minus VSC_PAD), the offending pitch value is written to
 * the global buffer, where the CPU picks it up later to grow the streams
 * for subsequent command buffers.
 */
static void
emit_vsc_overflow_test(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const uint32_t used_pipe_count =
      fb->pipe_count.width * fb->pipe_count.height;

   for (int i = 0; i < used_pipe_count; i++) {
      /* if (VSC_DRAW_STRM_SIZE_REG(i) >= pitch - VSC_PAD)
       *    global->vsc_draw_overflow = pitch;
       */
      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
            CP_COND_WRITE5_0_WRITE_MEMORY);
      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i)));
      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_draw_strm_pitch - VSC_PAD));
      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
      tu_cs_emit_qw(cs, global_iova(cmd, vsc_draw_overflow));
      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_draw_strm_pitch));

      /* Same check for the primitive stream. */
      tu_cs_emit_pkt7(cs, CP_COND_WRITE5, 8);
      tu_cs_emit(cs, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) |
            CP_COND_WRITE5_0_WRITE_MEMORY);
      tu_cs_emit(cs, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i)));
      tu_cs_emit(cs, CP_COND_WRITE5_2_POLL_ADDR_HI(0));
      tu_cs_emit(cs, CP_COND_WRITE5_3_REF(cmd->vsc_prim_strm_pitch - VSC_PAD));
      tu_cs_emit(cs, CP_COND_WRITE5_4_MASK(~0));
      tu_cs_emit_qw(cs, global_iova(cmd, vsc_prim_overflow));
      tu_cs_emit(cs, CP_COND_WRITE5_7_WRITE_DATA(cmd->vsc_prim_strm_pitch));
   }

   /* Make sure the conditional memory writes have landed before anything
    * reads the overflow values.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
}
950
/* Emit the hardware binning pass: run the renderpass draw commands once in
 * BINNING mode so the VSC produces per-tile visibility streams, then test
 * for stream overflow and restore non-binning state.  The packet ordering
 * here is hardware-mandated; do not reorder.
 */
static void
tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   /* Binning covers the whole framebuffer, not a single tile. */
   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING));

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_regs(cs,
                   A6XX_VFD_MODE_CNTL(.render_mode = BINNING_PASS));

   update_vsc_pipe(cmd, cs);

   tu_cs_emit_regs(cs,
                   A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, UNK_2C);

   tu_cs_emit_regs(cs,
                   A6XX_RB_WINDOW_OFFSET(.x = 0, .y = 0));

   tu_cs_emit_regs(cs,
                   A6XX_SP_TP_WINDOW_OFFSET(.x = 0, .y = 0));

   trace_start_binning_ib(&cmd->trace, cs);

   /* emit IB to binning drawcmds: */
   tu_cs_emit_call(cs, &cmd->draw_cs);

   trace_end_binning_ib(&cmd->trace, cs);

   /* switching from binning pass to GMEM pass will cause a switch from
    * PROGRAM_BINNING to PROGRAM, which invalidates const state (XS_CONST states)
    * so make sure these states are re-emitted
    * (eventually these states shouldn't exist at all with shader prologue)
    * only VS and GS are invalidated, as FS isn't emitted in binning pass,
    * and we don't use HW binning when tesselation is used
    */
   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3);
   tu_cs_emit(cs, CP_SET_DRAW_STATE__0_COUNT(0) |
                  CP_SET_DRAW_STATE__0_DISABLE |
                  CP_SET_DRAW_STATE__0_GROUP_ID(TU_DRAW_STATE_SHADER_GEOM_CONST));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
   tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, UNK_2D);

   /* This flush is probably required because the VSC, which produces the
    * visibility stream, is a client of UCHE, whereas the CP needs to read the
    * visibility stream (without caching) to do draw skipping. The
    * WFI+WAIT_FOR_ME combination guarantees that the binning commands
    * submitted are finished before reading the VSC regs (in
    * emit_vsc_overflow_test) or the VSC_DATA buffer directly (implicitly as
    * part of draws).
    */
   tu6_emit_event_write(cmd, cs, CACHE_FLUSH_TS);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

   emit_vsc_overflow_test(cmd, cs);

   /* Leave binning mode before the real draw pass. */
   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x0);
}
1036
/* Build the FS texture descriptors used for input-attachment loads in this
 * subpass and return a draw state that binds them.  Two descriptors are
 * emitted per input attachment (see the workaround note below); when `gmem`
 * is set, descriptors are patched to read from tile memory instead of
 * sysmem.  On allocation failure, records the error in cmd->record_result
 * and returns an empty draw state.
 */
static struct tu_draw_state
tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
                          const struct tu_subpass *subpass,
                          bool gmem)
{
   /* note: we can probably emit input attachments just once for the whole
    * renderpass, this would avoid emitting both sysmem/gmem versions
    *
    * emit two texture descriptors for each input, as a workaround for
    * d24s8/d32s8, which can be sampled as both float (depth) and integer (stencil)
    * tu_shader lowers uint input attachment loads to use the 2nd descriptor
    * in the pair
    * TODO: a smarter workaround
    */

   if (!subpass->input_count)
      return (struct tu_draw_state) {};

   struct tu_cs_memory texture;
   VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return (struct tu_draw_state) {};
   }

   /* i/2 indexes the input attachment; odd i is the integer (stencil)
    * alias descriptor of the pair.
    */
   for (unsigned i = 0; i < subpass->input_count * 2; i++) {
      uint32_t a = subpass->input_attachments[i / 2].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      const struct tu_image_view *iview = cmd->state.attachments[a];
      const struct tu_render_pass_attachment *att =
         &cmd->state.pass->attachments[a];
      uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
      uint32_t gmem_offset = att->gmem_offset;
      uint32_t cpp = att->cpp;

      /* Start from the image view's regular descriptor, then patch. */
      memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4);

      if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
         /* note this works because spec says fb and input attachments
          * must use identity swizzle
          */
         dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
            A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
            A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
         if (!cmd->device->physical_device->info->a6xx.has_z24uint_s8uint) {
            /* No native z24uint_s8uint: read as rgba8 uint and swizzle the
             * stencil byte (W) into X.
             */
            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UINT) |
               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_W) |
               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
         } else {
            dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UINT_S8_UINT) |
               A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
         }
      }

      if (i % 2 == 1 && att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
         /* d32s8 keeps stencil in a separate plane: repoint the descriptor
          * at the stencil base/pitch and read it as 8-bit uint.
          */
         dst[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
         dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT);
         dst[2] &= ~(A6XX_TEX_CONST_2_PITCHALIGN__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
         dst[2] |= A6XX_TEX_CONST_2_PITCH(iview->stencil_PITCH << 6);
         dst[3] = 0;
         dst[4] = iview->stencil_base_addr;
         dst[5] = (dst[5] & 0xffff) | iview->stencil_base_addr >> 32;

         /* 1 byte/sample stencil; att->samples used as the per-pixel size
          * for the gmem pitch below.
          */
         cpp = att->samples;
         gmem_offset = att->gmem_offset_stencil;
      }

      if (!gmem || !subpass->input_attachments[i / 2].patch_input_gmem)
         continue;

      /* patched for gmem */
      dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
      dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
      dst[2] =
         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
         A6XX_TEX_CONST_2_PITCH(cmd->state.framebuffer->tile0.width * cpp);
      dst[3] = 0;
      dst[4] = cmd->device->physical_device->gmem_base + gmem_offset;
      dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
      for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
         dst[i] = 0;
   }

   /* Draw state: load the descriptors and point SP_FS_TEX_CONST at them. */
   struct tu_cs cs;
   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 9);

   tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
   tu_cs_emit_qw(&cs, texture.iova);

   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));

   tu_cs_emit_regs(&cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));

   assert(cs.cur == cs.end); /* validate draw state size */

   return ds;
}
1147
1148static void
1149tu_set_input_attachments(struct tu_cmd_buffer *cmd, const struct tu_subpass *subpass)
1150{
1151   struct tu_cs *cs = &cmd->draw_cs;
1152
1153   tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 6);
1154   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
1155                         tu_emit_input_attachments(cmd, subpass, true));
1156   tu_cs_emit_draw_state(cs, TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
1157                         tu_emit_input_attachments(cmd, subpass, false));
1158}
1159
1160static void
1161tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
1162                         const VkRenderPassBeginInfo *info)
1163{
1164   struct tu_cs *cs = &cmd->draw_cs;
1165
1166   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
1167
1168   tu6_emit_blit_scissor(cmd, cs, true);
1169
1170   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1171      tu_load_gmem_attachment(cmd, cs, i, false);
1172
1173   tu6_emit_blit_scissor(cmd, cs, false);
1174
1175   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1176      tu_clear_gmem_attachment(cmd, cs, i, info);
1177
1178   tu_cond_exec_end(cs);
1179
1180   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
1181
1182   for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
1183      tu_clear_sysmem_attachment(cmd, cs, i, info);
1184
1185   tu_cond_exec_end(cs);
1186}
1187
/* Set up direct-to-sysmem (bypass) rendering: full-framebuffer scissor,
 * bypass bin config, CCU in sysmem mode, and stream-out enabled since
 * there is only a single pass.  Packet ordering is hardware-mandated.
 */
static void
tu6_sysmem_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const struct tu_framebuffer *fb = cmd->state.framebuffer;

   assert(fb->width > 0 && fb->height > 0);
   tu6_emit_window_scissor(cs, 0, 0, fb->width - 1, fb->height - 1);
   tu6_emit_window_offset(cs, 0, 0);

   /* Bin size 0x0 with BUFFERS_IN_SYSMEM selects bypass rendering. */
   tu6_emit_bin_size(cs, 0, 0,
                     A6XX_RB_BIN_CONTROL_BUFFERS_LOCATION(BUFFERS_IN_SYSMEM));

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS));

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_SYSMEM);

   /* enable stream-out, with sysmem there is only one pass: */
   tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

   tu_cs_emit_pkt7(cs, CP_SET_VISIBILITY_OVERRIDE, 1);
   tu_cs_emit(cs, 0x1);

   tu_cs_emit_pkt7(cs, CP_SET_MODE, 1);
   tu_cs_emit(cs, 0x0);

   tu_cs_sanity_check(cs);
}
1221
/* Finish sysmem rendering: perform the last subpass's resolves, run the
 * draw epilogue, and flush LRZ state.
 */
static void
tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Do any resolves of the last subpass. These are handled in the
    * tile_store_cs in the gmem path.
    */
   tu6_emit_sysmem_resolves(cmd, cs, cmd->state.subpass);

   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_sanity_check(cs);
}
1239
/* Set up tiled (GMEM) rendering.  When hardware binning is usable this
 * runs the binning pass first and then switches the bin config into the
 * visibility-driven draw mode; otherwise all tiles render everything.
 * Packet ordering is hardware-mandated.
 */
static void
tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   struct tu_physical_device *phys_dev = cmd->device->physical_device;

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 0x0);

   tu_emit_cache_flush_ccu(cmd, cs, TU_CMD_CCU_GMEM);

   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   if (use_hw_binning(cmd)) {
      /* enable stream-out during binning pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
                        A6XX_RB_BIN_CONTROL_RENDER_MODE(BINNING_PASS) |
                        A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));

      tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, true);

      tu6_emit_binning_pass(cmd, cs);

      /* and disable stream-out for draw pass: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(true));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
                        A6XX_RB_BIN_CONTROL_FORCE_LRZ_WRITE_DIS |
                        A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));

      /* Leave the binning render mode set up by tu6_emit_binning_pass. */
      tu_cs_emit_regs(cs,
                      A6XX_VFD_MODE_CNTL(0));

      tu_cs_emit_regs(cs,
                      A6XX_PC_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));

      tu_cs_emit_regs(cs,
                      A6XX_VFD_POWER_CNTL(phys_dev->info->a6xx.magic.PC_POWER_CNTL));

      /* With binning, IB2 skipping lets the CP drop draws for invisible
       * tiles based on the visibility stream.
       */
      tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
      tu_cs_emit(cs, 0x1);
   } else {
      /* no binning pass, so enable stream-out for draw pass:: */
      tu_cs_emit_regs(cs, A6XX_VPC_SO_DISABLE(false));

      tu6_emit_bin_size(cs, fb->tile0.width, fb->tile0.height,
                        A6XX_RB_BIN_CONTROL_LRZ_FEEDBACK_ZMODE_MASK(0x6));
   }

   tu_cs_sanity_check(cs);
}
1293
1294static void
1295tu6_render_tile(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1296{
1297   tu_cs_emit_call(cs, &cmd->draw_cs);
1298
1299   if (use_hw_binning(cmd)) {
1300      tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
1301      tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS));
1302   }
1303
1304   tu_cs_emit_call(cs, &cmd->tile_store_cs);
1305
1306   if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end)) {
1307      tu_cs_emit_wfi(cs);
1308      tu_cs_emit_pkt7(&cmd->cs, CP_WAIT_FOR_ME, 0);
1309      u_trace_clone_append(cmd->trace_renderpass_start,
1310                           cmd->trace_renderpass_end,
1311                           &cmd->trace,
1312                           cs, tu_copy_timestamp_buffer);
1313   }
1314
1315   tu_cs_sanity_check(cs);
1316}
1317
/* Finish tiled rendering: run the draw epilogue, disable and flush LRZ,
 * and flush the CCU resolve path (PC_CCU_RESOLVE_TS gets a sequence number
 * written; see tu6_emit_event_write).
 */
static void
tu6_tile_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   tu_cs_emit_call(cs, &cmd->draw_epilogue_cs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_CNTL(0));

   tu6_emit_event_write(cmd, cs, LRZ_FLUSH);

   tu6_emit_event_write(cmd, cs, PC_CCU_RESOLVE_TS);

   tu_cs_sanity_check(cs);
}
1332
1333static void
1334tu_cmd_render_tiles(struct tu_cmd_buffer *cmd)
1335{
1336   const struct tu_framebuffer *fb = cmd->state.framebuffer;
1337
1338   tu6_tile_render_begin(cmd, &cmd->cs);
1339
1340   uint32_t pipe = 0;
1341   for (uint32_t py = 0; py < fb->pipe_count.height; py++) {
1342      for (uint32_t px = 0; px < fb->pipe_count.width; px++, pipe++) {
1343         uint32_t tx1 = px * fb->pipe0.width;
1344         uint32_t ty1 = py * fb->pipe0.height;
1345         uint32_t tx2 = MIN2(tx1 + fb->pipe0.width, fb->tile_count.width);
1346         uint32_t ty2 = MIN2(ty1 + fb->pipe0.height, fb->tile_count.height);
1347         uint32_t slot = 0;
1348         for (uint32_t ty = ty1; ty < ty2; ty++) {
1349            for (uint32_t tx = tx1; tx < tx2; tx++, slot++) {
1350               tu6_emit_tile_select(cmd, &cmd->cs, tx, ty, pipe, slot);
1351
1352               trace_start_draw_ib_gmem(&cmd->trace, &cmd->cs);
1353               tu6_render_tile(cmd, &cmd->cs);
1354               trace_end_draw_ib_gmem(&cmd->trace, &cmd->cs);
1355            }
1356         }
1357      }
1358   }
1359
1360   tu6_tile_render_end(cmd, &cmd->cs);
1361
1362   trace_end_render_pass(&cmd->trace, &cmd->cs, fb);
1363
1364   if (!u_trace_iterator_equal(cmd->trace_renderpass_start, cmd->trace_renderpass_end))
1365      u_trace_disable_event_range(cmd->trace_renderpass_start,
1366                                  cmd->trace_renderpass_end);
1367}
1368
1369static void
1370tu_cmd_render_sysmem(struct tu_cmd_buffer *cmd)
1371{
1372   tu6_sysmem_render_begin(cmd, &cmd->cs);
1373
1374   trace_start_draw_ib_sysmem(&cmd->trace, &cmd->cs);
1375
1376   tu_cs_emit_call(&cmd->cs, &cmd->draw_cs);
1377
1378   trace_end_draw_ib_sysmem(&cmd->trace, &cmd->cs);
1379
1380   tu6_sysmem_render_end(cmd, &cmd->cs);
1381
1382   trace_end_render_pass(&cmd->trace, &cmd->cs, cmd->state.framebuffer);
1383}
1384
/* Allocate and initialize a new command buffer from the device allocator
 * (note: the pool's allocator is NOT used for the allocation itself).
 * On success the buffer is linked into the pool's cmd_buffers list (or a
 * self-linked list head when pool is NULL) and its command streams are
 * created.  Returns VK_ERROR_OUT_OF_HOST_MEMORY or vk_command_buffer_init's
 * error on failure.
 */
static VkResult
tu_create_cmd_buffer(struct tu_device *device,
                     struct tu_cmd_pool *pool,
                     VkCommandBufferLevel level,
                     VkCommandBuffer *pCommandBuffer)
{
   struct tu_cmd_buffer *cmd_buffer;

   cmd_buffer = vk_zalloc2(&device->vk.alloc, NULL, sizeof(*cmd_buffer), 8,
                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
   if (result != VK_SUCCESS) {
      vk_free2(&device->vk.alloc, NULL, cmd_buffer);
      return result;
   }

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   if (pool) {
      list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
      cmd_buffer->queue_family_index = pool->queue_family_index;

   } else {
      /* Init the pool_link so we can safely call list_del when we destroy
       * the command buffer
       */
      list_inithead(&cmd_buffer->pool_link);
      cmd_buffer->queue_family_index = TU_QUEUE_GENERAL;
   }

   u_trace_init(&cmd_buffer->trace, &device->trace_context);

   /* Main stream plus draw/tile-store/epilogue streams, and a sub-stream
    * allocator for draw states and descriptor memory.
    */
   tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048);
   tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
   tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);

   *pCommandBuffer = tu_cmd_buffer_to_handle(cmd_buffer);

   return VK_SUCCESS;
}
1433
1434static void
1435tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
1436{
1437   list_del(&cmd_buffer->pool_link);
1438
1439   tu_cs_finish(&cmd_buffer->cs);
1440   tu_cs_finish(&cmd_buffer->draw_cs);
1441   tu_cs_finish(&cmd_buffer->tile_store_cs);
1442   tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
1443   tu_cs_finish(&cmd_buffer->sub_cs);
1444
1445   u_trace_fini(&cmd_buffer->trace);
1446
1447   vk_command_buffer_finish(&cmd_buffer->vk);
1448   vk_free2(&cmd_buffer->device->vk.alloc, &cmd_buffer->pool->alloc,
1449            cmd_buffer);
1450}
1451
1452static VkResult
1453tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
1454{
1455   vk_command_buffer_reset(&cmd_buffer->vk);
1456
1457   cmd_buffer->record_result = VK_SUCCESS;
1458
1459   tu_cs_reset(&cmd_buffer->cs);
1460   tu_cs_reset(&cmd_buffer->draw_cs);
1461   tu_cs_reset(&cmd_buffer->tile_store_cs);
1462   tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
1463   tu_cs_reset(&cmd_buffer->sub_cs);
1464
1465   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
1466      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
1467      memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
1468      cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET;
1469   }
1470
1471   u_trace_fini(&cmd_buffer->trace);
1472   u_trace_init(&cmd_buffer->trace, &cmd_buffer->device->trace_context);
1473
1474   cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
1475
1476   return cmd_buffer->record_result;
1477}
1478
/* vkAllocateCommandBuffers: recycle command buffers from the pool's free
 * list when possible, otherwise create new ones.  On any failure, all
 * buffers allocated so far are freed and the output array is zeroed, as
 * the spec requires.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_AllocateCommandBuffers(VkDevice _device,
                          const VkCommandBufferAllocateInfo *pAllocateInfo,
                          VkCommandBuffer *pCommandBuffers)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_cmd_pool, pool, pAllocateInfo->commandPool);

   VkResult result = VK_SUCCESS;
   uint32_t i;

   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {

      if (!list_is_empty(&pool->free_cmd_buffers)) {
         /* Recycle: move the buffer from the free list back to the live
          * list, reset it, and re-init its base vk object.
          */
         struct tu_cmd_buffer *cmd_buffer = list_first_entry(
            &pool->free_cmd_buffers, struct tu_cmd_buffer, pool_link);

         list_del(&cmd_buffer->pool_link);
         list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);

         result = tu_reset_cmd_buffer(cmd_buffer);
         cmd_buffer->level = pAllocateInfo->level;
         vk_command_buffer_finish(&cmd_buffer->vk);
         VkResult init_result =
            vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
         if (init_result != VK_SUCCESS)
            result = init_result;

         /* Set the handle even on failure so the cleanup below frees it. */
         pCommandBuffers[i] = tu_cmd_buffer_to_handle(cmd_buffer);
      } else {
         result = tu_create_cmd_buffer(device, pool, pAllocateInfo->level,
                                       &pCommandBuffers[i]);
      }
      if (result != VK_SUCCESS)
         break;
   }

   if (result != VK_SUCCESS) {
      tu_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i,
                            pCommandBuffers);

      /* From the Vulkan 1.0.66 spec:
       *
       * "vkAllocateCommandBuffers can be used to create multiple
       *  command buffers. If the creation of any of those command
       *  buffers fails, the implementation must destroy all
       *  successfully created command buffer objects from this
       *  command, set all entries of the pCommandBuffers array to
       *  NULL and return the error."
       */
      memset(pCommandBuffers, 0,
             sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
   }

   return result;
}
1535
1536VKAPI_ATTR void VKAPI_CALL
1537tu_FreeCommandBuffers(VkDevice device,
1538                      VkCommandPool commandPool,
1539                      uint32_t commandBufferCount,
1540                      const VkCommandBuffer *pCommandBuffers)
1541{
1542   for (uint32_t i = 0; i < commandBufferCount; i++) {
1543      TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
1544
1545      if (cmd_buffer) {
1546         if (cmd_buffer->pool) {
1547            list_del(&cmd_buffer->pool_link);
1548            list_addtail(&cmd_buffer->pool_link,
1549                         &cmd_buffer->pool->free_cmd_buffers);
1550         } else
1551            tu_cmd_buffer_destroy(cmd_buffer);
1552      }
1553   }
1554}
1555
/* vkResetCommandBuffer: delegate to the shared reset helper.  The flags
 * argument (e.g. RELEASE_RESOURCES_BIT) is currently ignored.
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_ResetCommandBuffer(VkCommandBuffer commandBuffer,
                      VkCommandBufferResetFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
   return tu_reset_cmd_buffer(cmd_buffer);
}
1563
/* Initialize the cache, assuming all necessary flushes have happened but *not*
 * invalidations.  Nothing needs flushing, but every cache is treated as
 * potentially stale, so all invalidates stay pending until first use.
 */
static void
tu_cache_init(struct tu_cache_state *cache)
{
   cache->flush_bits = 0;
   cache->pending_flush_bits = TU_CMD_FLAG_ALL_INVALIDATE;
}
1573
1574VKAPI_ATTR VkResult VKAPI_CALL
1575tu_BeginCommandBuffer(VkCommandBuffer commandBuffer,
1576                      const VkCommandBufferBeginInfo *pBeginInfo)
1577{
1578   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
1579   VkResult result = VK_SUCCESS;
1580
1581   if (cmd_buffer->status != TU_CMD_BUFFER_STATUS_INITIAL) {
1582      /* If the command buffer has already been resetted with
1583       * vkResetCommandBuffer, no need to do it again.
1584       */
1585      result = tu_reset_cmd_buffer(cmd_buffer);
1586      if (result != VK_SUCCESS)
1587         return result;
1588   }
1589
1590   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
1591   cmd_buffer->state.index_size = 0xff; /* dirty restart index */
1592   cmd_buffer->state.line_mode = RECTANGULAR;
1593
1594   tu_cache_init(&cmd_buffer->state.cache);
1595   tu_cache_init(&cmd_buffer->state.renderpass_cache);
1596   cmd_buffer->usage_flags = pBeginInfo->flags;
1597
1598   tu_cs_begin(&cmd_buffer->cs);
1599   tu_cs_begin(&cmd_buffer->draw_cs);
1600   tu_cs_begin(&cmd_buffer->tile_store_cs);
1601   tu_cs_begin(&cmd_buffer->draw_epilogue_cs);
1602
1603   /* setup initial configuration into command buffer */
1604   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1605      switch (cmd_buffer->queue_family_index) {
1606      case TU_QUEUE_GENERAL:
1607         tu6_init_hw(cmd_buffer, &cmd_buffer->cs);
1608         break;
1609      default:
1610         break;
1611      }
1612   } else if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1613      assert(pBeginInfo->pInheritanceInfo);
1614
1615      vk_foreach_struct(ext, pBeginInfo->pInheritanceInfo) {
1616         switch (ext->sType) {
1617         case VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT: {
1618            const VkCommandBufferInheritanceConditionalRenderingInfoEXT *cond_rend = (void *) ext;
1619            cmd_buffer->state.predication_active = cond_rend->conditionalRenderingEnable;
1620            break;
1621         default:
1622            break;
1623         }
1624         }
1625      }
1626
1627      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1628         cmd_buffer->state.pass = tu_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
1629         cmd_buffer->state.subpass =
1630            &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
1631      } else {
1632         /* When executing in the middle of another command buffer, the CCU
1633          * state is unknown.
1634          */
1635         cmd_buffer->state.ccu_state = TU_CMD_CCU_UNKNOWN;
1636      }
1637   }
1638
1639   cmd_buffer->status = TU_CMD_BUFFER_STATUS_RECORDING;
1640
1641   return VK_SUCCESS;
1642}
1643
/* Core Vulkan 1.0 entry point, implemented as a thin wrapper around the
 * EXT variant: NULL pSizes means "use the rest of each buffer" and NULL
 * pStrides means "keep the pipeline-provided strides".
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
                        uint32_t firstBinding,
                        uint32_t bindingCount,
                        const VkBuffer *pBuffers,
                        const VkDeviceSize *pOffsets)
{
   tu_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount,
                               pBuffers, pOffsets, NULL, NULL);
}
1654
/* Bind vertex buffers (and optionally per-binding sizes/strides) by
 * updating the CPU-side shadow in cmd->state.vb and re-emitting the
 * whole vertex-buffer draw state into sub_cs.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer,
                            uint32_t firstBinding,
                            uint32_t bindingCount,
                            const VkBuffer* pBuffers,
                            const VkDeviceSize* pOffsets,
                            const VkDeviceSize* pSizes,
                            const VkDeviceSize* pStrides)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs cs;
   /* TODO: track a "max_vb" value for the cmdbuf to save a bit of memory  */
   cmd->state.vertex_buffers.iova = tu_cs_draw_state(&cmd->sub_cs, &cs, 4 * MAX_VBS).iova;

   /* Update only the bindings the app passed in; a NULL buffer clears the
    * binding, otherwise size defaults to the rest of the buffer when
    * pSizes is absent.
    */
   for (uint32_t i = 0; i < bindingCount; i++) {
      if (pBuffers[i] == VK_NULL_HANDLE) {
         cmd->state.vb[firstBinding + i].base = 0;
         cmd->state.vb[firstBinding + i].size = 0;
      } else {
         struct tu_buffer *buf = tu_buffer_from_handle(pBuffers[i]);
         cmd->state.vb[firstBinding + i].base = tu_buffer_iova(buf) + pOffsets[i];
         cmd->state.vb[firstBinding + i].size = pSizes ? pSizes[i] : (buf->size - pOffsets[i]);
      }

      if (pStrides)
         cmd->state.vb[firstBinding + i].stride = pStrides[i];
   }

   /* Re-emit base/size for all MAX_VBS slots into the freshly allocated
    * draw state (its effective size is trimmed at pipeline-bind time).
    */
   for (uint32_t i = 0; i < MAX_VBS; i++) {
      tu_cs_emit_regs(&cs,
                      A6XX_VFD_FETCH_BASE(i, .qword = cmd->state.vb[i].base),
                      A6XX_VFD_FETCH_SIZE(i, cmd->state.vb[i].size));
   }

   cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;

   /* Dynamic strides get their own draw state (2 dwords per VB). */
   if (pStrides) {
      cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].iova =
         tu_cs_draw_state(&cmd->sub_cs, &cs, 2 * MAX_VBS).iova;

      for (uint32_t i = 0; i < MAX_VBS; i++)
         tu_cs_emit_regs(&cs, A6XX_VFD_FETCH_STRIDE(i, cmd->state.vb[i].stride));

      cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
   }
}
1701
/* Bind the index buffer: record its GPU address and limits, and emit the
 * primitive-restart index register when the index size changes.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
                      VkBuffer buffer,
                      VkDeviceSize offset,
                      VkIndexType indexType)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, buffer);

   /* Translate the Vulkan index type into the HW index-size encoding, the
    * shift that turns a byte count into an index count, and the matching
    * restart-index value.
    */
   uint32_t index_size, index_shift, restart_index;

   switch (indexType) {
   case VK_INDEX_TYPE_UINT16:
      index_size = INDEX4_SIZE_16_BIT;
      index_shift = 1;
      restart_index = 0xffff;
      break;
   case VK_INDEX_TYPE_UINT32:
      index_size = INDEX4_SIZE_32_BIT;
      index_shift = 2;
      restart_index = 0xffffffff;
      break;
   case VK_INDEX_TYPE_UINT8_EXT:
      index_size = INDEX4_SIZE_8_BIT;
      index_shift = 0;
      restart_index = 0xff;
      break;
   default:
      unreachable("invalid VkIndexType");
   }

   /* initialize/update the restart index; state.index_size starts out as
    * 0xff (see tu_BeginCommandBuffer) so the first bind always emits it.
    */
   if (cmd->state.index_size != index_size)
      tu_cs_emit_regs(&cmd->draw_cs, A6XX_PC_RESTART_INDEX(restart_index));

   assert(buf->size >= offset);

   /* Stash address and limits; they are consumed later by the draw path
    * rather than emitted here.
    */
   cmd->state.index_va = buf->bo->iova + buf->bo_offset + offset;
   cmd->state.max_index_count = (buf->size - offset) >> index_shift;
   cmd->state.index_size = index_size;
}
1745
/* Bind descriptor sets for the given bind point: record them in the
 * per-bind-point descriptor state, patch dynamic UBO/SSBO descriptors with
 * the supplied dynamic offsets, then point the HW bindless base registers
 * at each set (the internal dynamic-descriptor set lives in slot MAX_SETS).
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                         VkPipelineBindPoint pipelineBindPoint,
                         VkPipelineLayout _layout,
                         uint32_t firstSet,
                         uint32_t descriptorSetCount,
                         const VkDescriptorSet *pDescriptorSets,
                         uint32_t dynamicOffsetCount,
                         const uint32_t *pDynamicOffsets)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline_layout, layout, _layout);
   unsigned dyn_idx = 0;

   struct tu_descriptor_state *descriptors_state =
      tu_get_descriptors_state(cmd, pipelineBindPoint);

   for (unsigned i = 0; i < descriptorSetCount; ++i) {
      unsigned idx = i + firstSet;
      TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);

      descriptors_state->sets[idx] = set;

      for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
         /* update the contents of the dynamic descriptor set */
         unsigned src_idx = j;
         unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
         assert(dyn_idx < dynamicOffsetCount);

         uint32_t *dst =
            &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
         uint32_t *src =
            &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
         uint32_t offset = pDynamicOffsets[dyn_idx];

         /* Patch the storage/uniform descriptors right away. */
         if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
            /* Dynamic UBO: the 64-bit address lives in dwords 0-1.
             * Note: we can assume here that the addition won't roll over and
             * change the SIZE field.
             */
            uint64_t va = src[0] | ((uint64_t)src[1] << 32);
            va += offset;
            dst[0] = va;
            dst[1] = va >> 32;
         } else {
            /* Dynamic SSBO: copy the whole descriptor, then patch the
             * address in dwords 4-5.
             * Note: A6XX_IBO_5_DEPTH is always 0 */
            memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
            uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
            va += offset;
            dst[4] = va;
            dst[5] = va >> 32;
         }
      }
   }
   assert(dyn_idx == dynamicOffsetCount);

   uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_invalidate_value;
   uint64_t addr[MAX_SETS + 1] = {};
   struct tu_cs *cs, state_cs;

   /* Gather the bindless base address for every bound set.  The low bits
    * of the address appear to be HW-required flags — TODO confirm against
    * the a6xx register docs.
    */
   for (uint32_t i = 0; i < MAX_SETS; i++) {
      struct tu_descriptor_set *set = descriptors_state->sets[i];
      if (set)
         addr[i] = set->va | 3;
   }

   if (layout->dynamic_offset_count) {
      /* allocate and fill out dynamic descriptor set */
      struct tu_cs_memory dynamic_desc_set;
      VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_count,
                                    A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
      if (result != VK_SUCCESS) {
         /* Defer the failure: vkEndCommandBuffer reports record_result. */
         cmd->record_result = result;
         return;
      }

      memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
             layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
      addr[MAX_SETS] = dynamic_desc_set.iova | 3;
   }

   /* Graphics and compute use different register blocks; graphics also
    * routes the emit through a draw state so it replays per tile.
    */
   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
      hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
      hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_GFX_BINDLESS(0x1f);

      cmd->state.desc_sets = tu_cs_draw_state(&cmd->sub_cs, &state_cs, 24);
      cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS;
      cs = &state_cs;
   } else {
      assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);

      sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
      hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
      hlsq_invalidate_value = A6XX_HLSQ_INVALIDATE_CMD_CS_BINDLESS(0x1f);

      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
      cs = &cmd->cs;
   }

   /* Write all (MAX_SETS + 1) qword bases to both SP and HLSQ copies of
    * the bindless base registers, then invalidate the bindless state.
    */
   tu_cs_emit_pkt4(cs, sp_bindless_base_reg, 10);
   tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
   tu_cs_emit_pkt4(cs, hlsq_bindless_base_reg, 10);
   tu_cs_emit_array(cs, (const uint32_t*) addr, 10);
   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(.dword = hlsq_invalidate_value));

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
      assert(cs->cur == cs->end); /* validate draw state size */
      /* note: this also avoids emitting draw states before renderpass clears,
       * which may use the 3D clear path (for MSAA cases)
       */
      if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
         tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
         tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
      }
   }
}
1863
/* Push descriptors: back the per-bind-point push_set with freshly
 * allocated sub_cs memory, apply the writes, then bind it like a regular
 * set via tu_CmdBindDescriptorSets.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer,
                           VkPipelineBindPoint pipelineBindPoint,
                           VkPipelineLayout _layout,
                           uint32_t _set,
                           uint32_t descriptorWriteCount,
                           const VkWriteDescriptorSet *pDescriptorWrites)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
   struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
   struct tu_descriptor_set *set =
      &tu_get_descriptors_state(cmd, pipelineBindPoint)->push_set;

   /* Allocate layout->size bytes, rounded up to whole descriptor slots. */
   struct tu_cs_memory set_mem;
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
                                 A6XX_TEX_CONST_DWORDS, &set_mem);
   if (result != VK_SUCCESS) {
      /* Defer the failure: vkEndCommandBuffer reports record_result. */
      cmd->record_result = result;
      return;
   }

   /* preserve previous content if the layout is the same: */
   if (set->layout == layout)
      memcpy(set_mem.map, set->mapped_ptr, layout->size);

   set->layout = layout;
   set->mapped_ptr = set_mem.map;
   set->va = set_mem.iova;

   tu_update_descriptor_sets(cmd->device, tu_descriptor_set_to_handle(set),
                             descriptorWriteCount, pDescriptorWrites, 0, NULL);

   tu_CmdBindDescriptorSets(commandBuffer, pipelineBindPoint, _layout, _set,
                            1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
                            0, NULL);
}
1902
/* Template variant of push descriptors: identical to
 * tu_CmdPushDescriptorSetKHR except the writes come from an update
 * template applied to pData, and the bind point comes from the template.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
                                       VkDescriptorUpdateTemplate descriptorUpdateTemplate,
                                       VkPipelineLayout _layout,
                                       uint32_t _set,
                                       const void* pData)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline_layout, pipe_layout, _layout);
   TU_FROM_HANDLE(tu_descriptor_update_template, templ, descriptorUpdateTemplate);
   struct tu_descriptor_set_layout *layout = pipe_layout->set[_set].layout;
   struct tu_descriptor_set *set =
      &tu_get_descriptors_state(cmd, templ->bind_point)->push_set;

   /* Allocate layout->size bytes, rounded up to whole descriptor slots. */
   struct tu_cs_memory set_mem;
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 DIV_ROUND_UP(layout->size, A6XX_TEX_CONST_DWORDS * 4),
                                 A6XX_TEX_CONST_DWORDS, &set_mem);
   if (result != VK_SUCCESS) {
      /* Defer the failure: vkEndCommandBuffer reports record_result. */
      cmd->record_result = result;
      return;
   }

   /* preserve previous content if the layout is the same: */
   if (set->layout == layout)
      memcpy(set_mem.map, set->mapped_ptr, layout->size);

   set->layout = layout;
   set->mapped_ptr = set_mem.map;
   set->va = set_mem.iova;

   tu_update_descriptor_set_with_template(cmd->device, set, descriptorUpdateTemplate, pData);

   tu_CmdBindDescriptorSets(commandBuffer, templ->bind_point, _layout, _set,
                            1, (VkDescriptorSet[]) { tu_descriptor_set_to_handle(set) },
                            0, NULL);
}
1940
/* Bind transform-feedback buffers: emit VPC_SO_BUFFER_BASE/SIZE for each
 * binding, guarded so the commands only execute in sysmem or binning
 * passes.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
                                      uint32_t firstBinding,
                                      uint32_t bindingCount,
                                      const VkBuffer *pBuffers,
                                      const VkDeviceSize *pOffsets,
                                      const VkDeviceSize *pSizes)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* using COND_REG_EXEC for xfb commands matches the blob behavior
    * presumably there isn't any benefit using a draw state when the
    * condition is (SYSMEM | BINNING)
    */
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                          CP_COND_REG_EXEC_0_SYSMEM |
                          CP_COND_REG_EXEC_0_BINNING);

   for (uint32_t i = 0; i < bindingCount; i++) {
      TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
      /* NOTE(review): base/size are computed from the underlying BO
       * without accounting for buf->bo_offset or buf->size — confirm this
       * matches how tu_buffer is laid out (suballocated buffers would make
       * this wrong).
       */
      uint64_t iova = buf->bo->iova + pOffsets[i];
      uint32_t size = buf->bo->size - pOffsets[i];
      uint32_t idx = i + firstBinding;

      if (pSizes && pSizes[i] != VK_WHOLE_SIZE)
         size = pSizes[i];

      /* BUFFER_BASE is 32-byte aligned, add remaining offset to BUFFER_OFFSET */
      uint32_t offset = iova & 0x1f;
      iova &= ~(uint64_t) 0x1f;

      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUFFER_BASE(idx), 3);
      tu_cs_emit_qw(cs, iova);
      tu_cs_emit(cs, size + offset);

      /* Remember the sub-32-byte offset; Begin/EndTransformFeedback add or
       * subtract it when converting between HW offsets and counter-buffer
       * values.
       */
      cmd->state.streamout_offset[idx] = offset;
   }

   tu_cond_exec_end(cs);
}
1982
/* Start transform feedback: program VPC_SO_BUFFER_OFFSET for each buffer,
 * either from the recorded alignment offset or, when a counter buffer is
 * given, by loading the saved counter from memory and adding the
 * alignment offset on the GPU.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                                uint32_t firstCounterBuffer,
                                uint32_t counterBufferCount,
                                const VkBuffer *pCounterBuffers,
                                const VkDeviceSize *pCounterBufferOffsets)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* xfb state only matters for sysmem and binning passes. */
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                          CP_COND_REG_EXEC_0_SYSMEM |
                          CP_COND_REG_EXEC_0_BINNING);

   /* TODO: only update offset for active buffers */
   for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++)
      tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, cmd->state.streamout_offset[i]));

   for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
      uint32_t idx = firstCounterBuffer + i;
      uint32_t offset = cmd->state.streamout_offset[idx];
      uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;

      if (!pCounterBuffers[i])
         continue;

      TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);

      /* Load the saved byte counter straight into the HW offset register. */
      tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
      tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
                     CP_MEM_TO_REG_0_UNK31 |
                     CP_MEM_TO_REG_0_CNT(1));
      tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset);

      /* Re-apply the 32-byte-alignment offset on top of the loaded counter
       * (RMW: reg = (reg & 0xffffffff) + offset).
       */
      if (offset) {
         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
         tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(idx)) |
                        CP_REG_RMW_0_SRC1_ADD);
         tu_cs_emit(cs, 0xffffffff);
         tu_cs_emit(cs, offset);
      }
   }

   tu_cond_exec_end(cs);
}
2028
/* End transform feedback: flush the HW streamout counters into the global
 * flush_base scratch area, then (per counter buffer) read them back,
 * convert to bytes, undo the alignment offset, and store the result into
 * the app's counter buffer.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
                              uint32_t firstCounterBuffer,
                              uint32_t counterBufferCount,
                              const VkBuffer *pCounterBuffers,
                              const VkDeviceSize *pCounterBufferOffsets)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* xfb state only matters for sysmem and binning passes. */
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(RENDER_MODE) |
                          CP_COND_REG_EXEC_0_SYSMEM |
                          CP_COND_REG_EXEC_0_BINNING);

   /* TODO: only flush buffers that need to be flushed */
   for (uint32_t i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
      /* note: FLUSH_BASE is always the same, so it could go in init_hw()? */
      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2);
      tu_cs_emit_qw(cs, global_iova(cmd, flush_base[i]));
      tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i);
   }

   for (uint32_t i = 0; i < (pCounterBuffers ? counterBufferCount : 0); i++) {
      uint32_t idx = firstCounterBuffer + i;
      uint32_t offset = cmd->state.streamout_offset[idx];
      uint64_t counter_buffer_offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0u;

      if (!pCounterBuffers[i])
         continue;

      TU_FROM_HANDLE(tu_buffer, buf, pCounterBuffers[i]);

      /* VPC_SO_FLUSH_BASE has dwords counter, but counter should be in bytes */
      tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
      tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
                     CP_MEM_TO_REG_0_SHIFT_BY_2 |
                     0x40000 | /* ??? */
                     CP_MEM_TO_REG_0_UNK31 |
                     CP_MEM_TO_REG_0_CNT(1));
      tu_cs_emit_qw(cs, global_iova(cmd, flush_base[idx]));

      /* Subtract the 32-byte-alignment offset that BindTransformFeedback
       * folded into the HW offset, so the stored counter is app-relative.
       */
      if (offset) {
         tu_cs_emit_pkt7(cs, CP_REG_RMW, 3);
         tu_cs_emit(cs, CP_REG_RMW_0_DST_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
                        CP_REG_RMW_0_SRC1_ADD);
         tu_cs_emit(cs, 0xffffffff);
         tu_cs_emit(cs, -offset);
      }

      /* Write the byte counter back to the app's counter buffer. */
      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_SCRATCH_REG(0)) |
                     CP_REG_TO_MEM_0_CNT(1));
      tu_cs_emit_qw(cs, buf->bo->iova + counter_buffer_offset);
   }

   tu_cond_exec_end(cs);

   cmd->state.xfb_used = true;
}
2088
2089VKAPI_ATTR void VKAPI_CALL
2090tu_CmdPushConstants(VkCommandBuffer commandBuffer,
2091                    VkPipelineLayout layout,
2092                    VkShaderStageFlags stageFlags,
2093                    uint32_t offset,
2094                    uint32_t size,
2095                    const void *pValues)
2096{
2097   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2098   memcpy((void*) cmd->push_constants + offset, pValues, size);
2099   cmd->state.dirty |= TU_CMD_DIRTY_SHADER_CONSTS;
2100}
2101
2102/* Flush everything which has been made available but we haven't actually
2103 * flushed yet.
2104 */
2105static void
2106tu_flush_all_pending(struct tu_cache_state *cache)
2107{
2108   cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
2109   cache->pending_flush_bits &= ~TU_CMD_FLAG_ALL_FLUSH;
2110}
2111
/* Finish recording: emit the defensive end-of-command-buffer cache
 * flushes, close all command streams, and report any error that was
 * deferred into record_result during recording (e.g. sub_cs allocation
 * failures).
 */
VKAPI_ATTR VkResult VKAPI_CALL
tu_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);

   /* We currently flush CCU at the end of the command buffer, like
    * what the blob does. There's implicit synchronization around every
    * vkQueueSubmit, but the kernel only flushes the UCHE, and we don't
    * know yet if this command buffer will be the last in the submit so we
    * have to defensively flush everything else.
    *
    * TODO: We could definitely do better than this, since these flushes
    * aren't required by Vulkan, but we'd need kernel support to do that.
    * Ideally, we'd like the kernel to flush everything afterwards, so that we
    * wouldn't have to do any flushes here, and when submitting multiple
    * command buffers there wouldn't be any unnecessary flushes in between.
    */
   if (cmd_buffer->state.pass) {
      /* Ending inside a render pass (secondary with RENDER_PASS_CONTINUE):
       * flushes go into draw_cs via the renderpass cache.
       */
      tu_flush_all_pending(&cmd_buffer->state.renderpass_cache);
      tu_emit_cache_flush_renderpass(cmd_buffer, &cmd_buffer->draw_cs);
   } else {
      tu_flush_all_pending(&cmd_buffer->state.cache);
      cmd_buffer->state.cache.flush_bits |=
         TU_CMD_FLAG_CCU_FLUSH_COLOR |
         TU_CMD_FLAG_CCU_FLUSH_DEPTH;
      tu_emit_cache_flush(cmd_buffer, &cmd_buffer->cs);
   }

   tu_cs_end(&cmd_buffer->cs);
   tu_cs_end(&cmd_buffer->draw_cs);
   tu_cs_end(&cmd_buffer->tile_store_cs);
   tu_cs_end(&cmd_buffer->draw_epilogue_cs);

   cmd_buffer->status = TU_CMD_BUFFER_STATUS_EXECUTABLE;

   return cmd_buffer->record_result;
}
2149
2150static struct tu_cs
2151tu_cmd_dynamic_state(struct tu_cmd_buffer *cmd, uint32_t id, uint32_t size)
2152{
2153   struct tu_cs cs;
2154
2155   assert(id < ARRAY_SIZE(cmd->state.dynamic_state));
2156   cmd->state.dynamic_state[id] = tu_cs_draw_state(&cmd->sub_cs, &cs, size);
2157
2158   /* note: this also avoids emitting draw states before renderpass clears,
2159    * which may use the 3D clear path (for MSAA cases)
2160    */
2161   if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)
2162      return cs;
2163
2164   tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
2165   tu_cs_emit_draw_state(&cmd->draw_cs, TU_DRAW_STATE_DYNAMIC + id, cmd->state.dynamic_state[id]);
2166
2167   return cs;
2168}
2169
/* Bind a compute or graphics pipeline.  Compute is simple (emit the
 * program state IB immediately); graphics records the pipeline, emits its
 * static draw states, and reconciles pipeline-owned register bits and
 * draw-state sizes with the dynamic state tracked in cmd->state.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
                   VkPipelineBindPoint pipelineBindPoint,
                   VkPipeline _pipeline)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);

   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) {
      cmd->state.compute_pipeline = pipeline;
      tu_cs_emit_state_ib(&cmd->cs, pipeline->program.state);
      return;
   }

   assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS);

   cmd->state.pipeline = pipeline;
   cmd->state.dirty |= TU_CMD_DIRTY_DESC_SETS_LOAD | TU_CMD_DIRTY_SHADER_CONSTS |
                       TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_VS_PARAMS;

   /* note: this also avoids emitting draw states before renderpass clears,
    * which may use the 3D clear path (for MSAA cases)
    */
   if (!(cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE)) {
      struct tu_cs *cs = &cmd->draw_cs;
      /* mask = dynamic-state slots the pipeline does NOT treat as dynamic,
       * i.e. slots whose contents come baked into the pipeline.
       */
      uint32_t mask = ~pipeline->dynamic_state_mask & BITFIELD_MASK(TU_DYNAMIC_STATE_COUNT);

      tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (7 + util_bitcount(mask)));
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);

      u_foreach_bit(i, mask)
         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i, pipeline->dynamic_state[i]);
   }

   if (cmd->state.line_mode != pipeline->line_mode) {
      cmd->state.line_mode = pipeline->line_mode;

      /* We have to disable MSAA when bresenham lines are used, this is
       * a hardware limitation and spec allows it:
       *
       *    When Bresenham lines are being rasterized, sample locations may
       *    all be treated as being at the pixel center (this may affect
       *    attribute and depth interpolation).
       */
      if (cmd->state.subpass && cmd->state.subpass->samples) {
         tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples, cmd->state.line_mode);
      }
   }

   /* the vertex_buffers draw state always contains all the currently
    * bound vertex buffers. update its size to only emit the vbs which
    * are actually used by the pipeline
    * note there is a HW optimization which makes it so the draw state
    * is not re-executed completely when only the size changes
    */
   if (cmd->state.vertex_buffers.size != pipeline->num_vbs * 4) {
      cmd->state.vertex_buffers.size = pipeline->num_vbs * 4;
      cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
   }

   /* same size-trim for the dynamic VB-stride draw state, when used */
   if ((pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE)) &&
       cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size != pipeline->num_vbs * 2) {
      cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE].size = pipeline->num_vbs * 2;
      cmd->state.dirty |= TU_CMD_DIRTY_VB_STRIDE;
   }

#define UPDATE_REG(X, Y) {                                           \
   /* note: would be better to have pipeline bits already masked */  \
   uint32_t pipeline_bits = pipeline->X & pipeline->X##_mask;        \
   if ((cmd->state.X & pipeline->X##_mask) != pipeline_bits) {       \
      cmd->state.X &= ~pipeline->X##_mask;                           \
      cmd->state.X |= pipeline_bits;                                 \
      cmd->state.dirty |= TU_CMD_DIRTY_##Y;                          \
   }                                                                 \
   if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_##Y)))  \
      cmd->state.dirty &= ~TU_CMD_DIRTY_##Y;                         \
}

   /* these registers can have bits set from both pipeline and dynamic state
    * this updates the bits set by the pipeline
    * if the pipeline doesn't use a dynamic state for the register, then
    * the relevant dirty bit is cleared to avoid overriding the non-dynamic
    * state with a dynamic state the next draw.
    */
   UPDATE_REG(gras_su_cntl, GRAS_SU_CNTL);
   UPDATE_REG(rb_depth_cntl, RB_DEPTH_CNTL);
   UPDATE_REG(rb_stencil_cntl, RB_STENCIL_CNTL);
   UPDATE_REG(pc_raster_cntl, RASTERIZER_DISCARD);
   UPDATE_REG(vpc_unknown_9107, RASTERIZER_DISCARD);
#undef UPDATE_REG

   if (pipeline->rb_depth_cntl_disable)
      cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
}
2270
2271VKAPI_ATTR void VKAPI_CALL
2272tu_CmdSetViewport(VkCommandBuffer commandBuffer,
2273                  uint32_t firstViewport,
2274                  uint32_t viewportCount,
2275                  const VkViewport *pViewports)
2276{
2277   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2278   struct tu_cs cs;
2279
2280   memcpy(&cmd->state.viewport[firstViewport], pViewports, viewportCount * sizeof(*pViewports));
2281   cmd->state.max_viewport = MAX2(cmd->state.max_viewport, firstViewport + viewportCount);
2282
2283   cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * cmd->state.max_viewport);
2284   tu6_emit_viewport(&cs, cmd->state.viewport, cmd->state.max_viewport);
2285}
2286
2287VKAPI_ATTR void VKAPI_CALL
2288tu_CmdSetScissor(VkCommandBuffer commandBuffer,
2289                 uint32_t firstScissor,
2290                 uint32_t scissorCount,
2291                 const VkRect2D *pScissors)
2292{
2293   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2294   struct tu_cs cs;
2295
2296   memcpy(&cmd->state.scissor[firstScissor], pScissors, scissorCount * sizeof(*pScissors));
2297   cmd->state.max_scissor = MAX2(cmd->state.max_scissor, firstScissor + scissorCount);
2298
2299   cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * cmd->state.max_scissor);
2300   tu6_emit_scissor(&cs, cmd->state.scissor, cmd->state.max_scissor);
2301}
2302
2303VKAPI_ATTR void VKAPI_CALL
2304tu_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
2305{
2306   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2307
2308   cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
2309   cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(lineWidth / 2.0f);
2310
2311   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
2312}
2313
2314VKAPI_ATTR void VKAPI_CALL
2315tu_CmdSetDepthBias(VkCommandBuffer commandBuffer,
2316                   float depthBiasConstantFactor,
2317                   float depthBiasClamp,
2318                   float depthBiasSlopeFactor)
2319{
2320   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2321   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BIAS, 4);
2322
2323   tu6_emit_depth_bias(&cs, depthBiasConstantFactor, depthBiasClamp, depthBiasSlopeFactor);
2324}
2325
2326VKAPI_ATTR void VKAPI_CALL
2327tu_CmdSetBlendConstants(VkCommandBuffer commandBuffer,
2328                        const float blendConstants[4])
2329{
2330   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2331   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5);
2332
2333   tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
2334   tu_cs_emit_array(&cs, (const uint32_t *) blendConstants, 4);
2335}
2336
2337VKAPI_ATTR void VKAPI_CALL
2338tu_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2339                     float minDepthBounds,
2340                     float maxDepthBounds)
2341{
2342   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2343   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3);
2344
2345   tu_cs_emit_regs(&cs,
2346                   A6XX_RB_Z_BOUNDS_MIN(minDepthBounds),
2347                   A6XX_RB_Z_BOUNDS_MAX(maxDepthBounds));
2348}
2349
2350void
2351update_stencil_mask(uint32_t *value, VkStencilFaceFlags face, uint32_t mask)
2352{
2353   if (face & VK_STENCIL_FACE_FRONT_BIT)
2354      *value = (*value & 0xff00) | (mask & 0xff);
2355   if (face & VK_STENCIL_FACE_BACK_BIT)
2356      *value = (*value & 0xff) | (mask & 0xff) << 8;
2357}
2358
2359VKAPI_ATTR void VKAPI_CALL
2360tu_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer,
2361                            VkStencilFaceFlags faceMask,
2362                            uint32_t compareMask)
2363{
2364   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2365   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2);
2366
2367   update_stencil_mask(&cmd->state.dynamic_stencil_mask, faceMask, compareMask);
2368
2369   tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.dword = cmd->state.dynamic_stencil_mask));
2370}
2371
2372VKAPI_ATTR void VKAPI_CALL
2373tu_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer,
2374                          VkStencilFaceFlags faceMask,
2375                          uint32_t writeMask)
2376{
2377   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2378   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2);
2379
2380   update_stencil_mask(&cmd->state.dynamic_stencil_wrmask, faceMask, writeMask);
2381
2382   tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = cmd->state.dynamic_stencil_wrmask));
2383
2384   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
2385}
2386
2387VKAPI_ATTR void VKAPI_CALL
2388tu_CmdSetStencilReference(VkCommandBuffer commandBuffer,
2389                          VkStencilFaceFlags faceMask,
2390                          uint32_t reference)
2391{
2392   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2393   struct tu_cs cs = tu_cmd_dynamic_state(cmd, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2);
2394
2395   update_stencil_mask(&cmd->state.dynamic_stencil_ref, faceMask, reference);
2396
2397   tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.dword = cmd->state.dynamic_stencil_ref));
2398}
2399
2400VKAPI_ATTR void VKAPI_CALL
2401tu_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
2402                            const VkSampleLocationsInfoEXT* pSampleLocationsInfo)
2403{
2404   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2405   struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS, 9);
2406
2407   assert(pSampleLocationsInfo);
2408
2409   tu6_emit_sample_locations(&cs, pSampleLocationsInfo);
2410}
2411
2412VKAPI_ATTR void VKAPI_CALL
2413tu_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
2414{
2415   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2416
2417   cmd->state.gras_su_cntl &=
2418      ~(A6XX_GRAS_SU_CNTL_CULL_FRONT | A6XX_GRAS_SU_CNTL_CULL_BACK);
2419
2420   if (cullMode & VK_CULL_MODE_FRONT_BIT)
2421      cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
2422   if (cullMode & VK_CULL_MODE_BACK_BIT)
2423      cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
2424
2425   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
2426}
2427
2428VKAPI_ATTR void VKAPI_CALL
2429tu_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
2430{
2431   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2432
2433   cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
2434
2435   if (frontFace == VK_FRONT_FACE_CLOCKWISE)
2436      cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
2437
2438   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
2439}
2440
2441VKAPI_ATTR void VKAPI_CALL
2442tu_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,
2443                              VkPrimitiveTopology primitiveTopology)
2444{
2445   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2446
2447   cmd->state.primtype = tu6_primtype(primitiveTopology);
2448}
2449
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer,
                              uint32_t viewportCount,
                              const VkViewport* pViewports)
{
   /* VK_EXT_extended_dynamic_state entry point: equivalent to
    * vkCmdSetViewport with firstViewport == 0.
    */
   tu_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
}
2457
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer,
                             uint32_t scissorCount,
                             const VkRect2D* pScissors)
{
   /* VK_EXT_extended_dynamic_state entry point: equivalent to
    * vkCmdSetScissor with firstScissor == 0.
    */
   tu_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
}
2465
2466VKAPI_ATTR void VKAPI_CALL
2467tu_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer,
2468                            VkBool32 depthTestEnable)
2469{
2470   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2471
2472   cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
2473
2474   if (depthTestEnable)
2475      cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
2476
2477   cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
2478}
2479
2480VKAPI_ATTR void VKAPI_CALL
2481tu_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer,
2482                             VkBool32 depthWriteEnable)
2483{
2484   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2485
2486   cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2487
2488   if (depthWriteEnable)
2489      cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
2490
2491   cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
2492}
2493
2494VKAPI_ATTR void VKAPI_CALL
2495tu_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer,
2496                           VkCompareOp depthCompareOp)
2497{
2498   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2499
2500   cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
2501
2502   cmd->state.rb_depth_cntl |=
2503      A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(depthCompareOp));
2504
2505   cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
2506}
2507
2508VKAPI_ATTR void VKAPI_CALL
2509tu_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer,
2510                                  VkBool32 depthBoundsTestEnable)
2511{
2512   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2513
2514   cmd->state.rb_depth_cntl &= ~A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
2515
2516   if (depthBoundsTestEnable)
2517      cmd->state.rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;
2518
2519   cmd->state.dirty |= TU_CMD_DIRTY_RB_DEPTH_CNTL;
2520}
2521
2522VKAPI_ATTR void VKAPI_CALL
2523tu_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer,
2524                              VkBool32 stencilTestEnable)
2525{
2526   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2527
2528   cmd->state.rb_stencil_cntl &= ~(
2529      A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2530      A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2531      A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
2532
2533   if (stencilTestEnable) {
2534      cmd->state.rb_stencil_cntl |=
2535         A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
2536         A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
2537         A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
2538   }
2539
2540   cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;
2541}
2542
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer,
                      VkStencilFaceFlags faceMask,
                      VkStencilOp failOp,
                      VkStencilOp passOp,
                      VkStencilOp depthFailOp,
                      VkCompareOp compareOp)
{
   /* Update the per-face stencil op fields of the cached RB_STENCIL_CONTROL
    * value.  Front-face and back-face (_BF) fields live in the same
    * register; only the face(s) selected by faceMask are rewritten.
    */
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
      /* Clear then re-pack the four front-face fields. */
      cmd->state.rb_stencil_cntl &= ~(
         A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
         A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
         A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
         A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK);

      cmd->state.rb_stencil_cntl |=
         A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(depthFailOp));
   }

   if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
      /* Clear then re-pack the four back-face fields. */
      cmd->state.rb_stencil_cntl &= ~(
         A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
         A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
         A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
         A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);

      cmd->state.rb_stencil_cntl |=
         A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(compareOp)) |
         A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(failOp)) |
         A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(passOp)) |
         A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(depthFailOp));
   }

   cmd->state.dirty |= TU_CMD_DIRTY_RB_STENCIL_CNTL;
}
2583
2584VKAPI_ATTR void VKAPI_CALL
2585tu_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer,
2586                            VkBool32 depthBiasEnable)
2587{
2588   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2589
2590   cmd->state.gras_su_cntl &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
2591   if (depthBiasEnable)
2592      cmd->state.gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
2593
2594   cmd->state.dirty |= TU_CMD_DIRTY_GRAS_SU_CNTL;
2595}
2596
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer,
                                   VkBool32 primitiveRestartEnable)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   /* Record the dynamic primitive-restart enable in command-buffer state;
    * no dirty bit is set here.
    */
   cmd->state.primitive_restart_enable = primitiveRestartEnable;
}
2605
2606VKAPI_ATTR void VKAPI_CALL
2607tu_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer,
2608                                    VkBool32 rasterizerDiscardEnable)
2609{
2610   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2611
2612   cmd->state.pc_raster_cntl &= ~A6XX_PC_RASTER_CNTL_DISCARD;
2613   cmd->state.vpc_unknown_9107 &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
2614   if (rasterizerDiscardEnable) {
2615      cmd->state.pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
2616      cmd->state.vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
2617   }
2618
2619   cmd->state.dirty |= TU_CMD_DIRTY_RASTERIZER_DISCARD;
2620}
2621
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer,
                    VkLogicOp logicOp)
{
   /* TODO: dynamic logic op (VK_EXT_extended_dynamic_state2) is not
    * implemented yet.
    */
   tu_stub();
}
2628
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer,
                               uint32_t patchControlPoints)
{
   /* TODO: dynamic patch control points (VK_EXT_extended_dynamic_state2)
    * is not implemented yet.
    */
   tu_stub();
}
2635
VKAPI_ATTR void VKAPI_CALL
tu_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,
                        uint32_t lineStippleFactor,
                        uint16_t lineStipplePattern)
{
   /* TODO: line stipple (VK_EXT_line_rasterization) is not implemented
    * yet.
    */
   tu_stub();
}
2643
/* Given the accesses performed before a barrier (src_mask) and the accesses
 * performed after it (dst_mask), accumulate the flush/invalidate bits that
 * must eventually be emitted.  Flushes for prior writes are deferred into
 * cache->pending_flush_bits until a later access actually needs them; this
 * call then promotes the matching pending bits into cache->flush_bits.
 */
static void
tu_flush_for_access(struct tu_cache_state *cache,
                    enum tu_cmd_access_mask src_mask,
                    enum tu_cmd_access_mask dst_mask)
{
   enum tu_cmd_flush_bits flush_bits = 0;

   /* A sysmem write means any later cached read may need to invalidate
    * first, so queue all invalidates as pending.
    */
   if (src_mask & TU_ACCESS_SYSMEM_WRITE) {
      cache->pending_flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
   }

   if (src_mask & TU_ACCESS_CP_WRITE) {
      /* Flush the CP write queue.
       */
      cache->pending_flush_bits |=
         TU_CMD_FLAG_WAIT_MEM_WRITES |
         TU_CMD_FLAG_ALL_INVALIDATE;
   }

/* A write through cache 'domain' queues a pending flush of that cache plus
 * pending invalidates of every *other* cache (the writing cache itself does
 * not need to re-invalidate its own data).
 */
#define SRC_FLUSH(domain, flush, invalidate) \
   if (src_mask & TU_ACCESS_##domain##_WRITE) {                      \
      cache->pending_flush_bits |= TU_CMD_FLAG_##flush |             \
         (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate);   \
   }

   SRC_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
   SRC_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
   SRC_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)

#undef SRC_FLUSH

/* Incoherent writes flush their own cache immediately (not deferred),
 * while invalidates of the other caches remain pending.
 */
#define SRC_INCOHERENT_FLUSH(domain, flush, invalidate)              \
   if (src_mask & TU_ACCESS_##domain##_INCOHERENT_WRITE) {           \
      flush_bits |= TU_CMD_FLAG_##flush;                             \
      cache->pending_flush_bits |=                                   \
         (TU_CMD_FLAG_ALL_INVALIDATE & ~TU_CMD_FLAG_##invalidate);   \
   }

   SRC_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
   SRC_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)

#undef SRC_INCOHERENT_FLUSH

   /* Treat host & sysmem write accesses the same, since the kernel implicitly
    * drains the queue before signalling completion to the host.
    */
   if (dst_mask & (TU_ACCESS_SYSMEM_READ | TU_ACCESS_SYSMEM_WRITE)) {
      flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_ALL_FLUSH;
   }

/* A read/write through cache 'domain' realizes any pending invalidate of
 * that cache and any pending flushes of the *other* caches (data already in
 * this cache does not need flushing to be visible to it).
 */
#define DST_FLUSH(domain, flush, invalidate) \
   if (dst_mask & (TU_ACCESS_##domain##_READ |                 \
                   TU_ACCESS_##domain##_WRITE)) {              \
      flush_bits |= cache->pending_flush_bits &                \
         (TU_CMD_FLAG_##invalidate |                           \
          (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush));     \
   }

   DST_FLUSH(UCHE, CACHE_FLUSH, CACHE_INVALIDATE)
   DST_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
   DST_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)

#undef DST_FLUSH

/* Incoherent reads always invalidate their own cache (unconditionally,
 * not just when pending), plus any pending flushes of the other caches.
 */
#define DST_INCOHERENT_FLUSH(domain, flush, invalidate) \
   if (dst_mask & (TU_ACCESS_##domain##_INCOHERENT_READ |      \
                   TU_ACCESS_##domain##_INCOHERENT_WRITE)) {   \
      flush_bits |= TU_CMD_FLAG_##invalidate |                 \
          (cache->pending_flush_bits &                         \
           (TU_CMD_FLAG_ALL_FLUSH & ~TU_CMD_FLAG_##flush));    \
   }

   DST_INCOHERENT_FLUSH(CCU_COLOR, CCU_FLUSH_COLOR, CCU_INVALIDATE_COLOR)
   DST_INCOHERENT_FLUSH(CCU_DEPTH, CCU_FLUSH_DEPTH, CCU_INVALIDATE_DEPTH)

#undef DST_INCOHERENT_FLUSH

   /* Promote the realized bits and drop them from the pending set. */
   cache->flush_bits |= flush_bits;
   cache->pending_flush_bits &= ~flush_bits;
}
2724
/* Record a WAIT_FOR_IDLE in the cache's flush bits when ordering src_stage
 * before dst_stage requires one (i.e. the destination stage is earlier in
 * the pipeline than the source stage).
 */
static void
tu_flush_for_stage(struct tu_cache_state *cache,
                   enum tu_stage src_stage, enum tu_stage dst_stage)
{
   /* As far as we know, flushes take place in the last stage so if there are
    * any pending flushes then we have to move down the source stage, because
    * the data only becomes available when the flush finishes. In particular
    * this can matter when the CP writes something and we need to invalidate
    * UCHE to read it.
    */
   if (cache->flush_bits & (TU_CMD_FLAG_ALL_FLUSH | TU_CMD_FLAG_ALL_INVALIDATE))
      src_stage = TU_STAGE_PS;

   /* Note: if the destination stage is the CP, then the CP also has to wait
    * for any WFI's to finish. This is already done for draw calls, including
    * before indirect param reads, for the most part, so we just need to WFI.
    *
    * Transform feedback counters are read via CP_MEM_TO_REG, which implicitly
    * does CP_WAIT_FOR_ME, but we still need a WFI if the GPU writes it.
    *
    * Currently we read the draw predicate using CP_MEM_TO_MEM, which
    * also implicitly does CP_WAIT_FOR_ME. However CP_DRAW_PRED_SET does *not*
    * implicitly do CP_WAIT_FOR_ME, it seems to only wait for counters to
    * complete since it's written for DX11 where you can only predicate on the
    * result of a query object. So if we implement 64-bit comparisons in the
    * future, or if CP_DRAW_PRED_SET grows the capability to do 32-bit
    * comparisons, then this will have to be dealt with.
    */
   if (src_stage > dst_stage)
      cache->flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
}
2756
/* Translate a Vulkan access mask into the driver's internal cache-access
 * mask.  'gmem' indicates the access happens inside a GMEM render pass, in
 * which case CCU color/depth traffic can be treated as sysmem (see the
 * comment mid-function).  VK_ACCESS_MEMORY_READ/WRITE_BIT conservatively
 * match every group, since they stand for "any access".
 */
static enum tu_cmd_access_mask
vk2tu_access(VkAccessFlags flags, bool gmem)
{
   enum tu_cmd_access_mask mask = 0;

   if (flags &
       (VK_ACCESS_INDIRECT_COMMAND_READ_BIT | /* Read performed by CP */
        VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT | /* Read performed by CP */
        VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | /* Read performed by CP */
        VK_ACCESS_MEMORY_READ_BIT)) {
      mask |= TU_ACCESS_SYSMEM_READ;
   }

   if (flags &
       (VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_CP_WRITE;
   }

   if (flags &
       (VK_ACCESS_HOST_READ_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_SYSMEM_READ;
   }

   if (flags &
       (VK_ACCESS_HOST_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_SYSMEM_WRITE;
   }

   if (flags &
       (VK_ACCESS_INDEX_READ_BIT | /* Read performed by PC, I think */
        VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | /* Read performed by VFD */
        VK_ACCESS_UNIFORM_READ_BIT | /* Read performed by SP */
        /* TODO: Is there a no-cache bit for textures so that we can ignore
         * these?
         */
        VK_ACCESS_INPUT_ATTACHMENT_READ_BIT | /* Read performed by TP */
        VK_ACCESS_SHADER_READ_BIT | /* Read perfomed by SP/TP */
        VK_ACCESS_MEMORY_READ_BIT)) {
      mask |= TU_ACCESS_UCHE_READ;
   }

   if (flags &
       (VK_ACCESS_SHADER_WRITE_BIT | /* Write performed by SP */
        VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | /* Write performed by VPC */
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      mask |= TU_ACCESS_UCHE_WRITE;
   }

   /* When using GMEM, the CCU is always flushed automatically to GMEM, and
    * then GMEM is flushed to sysmem. Furthermore, we already had to flush any
    * previous writes in sysmem mode when transitioning to GMEM. Therefore we
    * can ignore CCU and pretend that color attachments and transfers use
    * sysmem directly.
    */

   if (flags &
       (VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
        VK_ACCESS_COLOR_ATTACHMENT_READ_NONCOHERENT_BIT_EXT |
        VK_ACCESS_MEMORY_READ_BIT)) {
      if (gmem)
         mask |= TU_ACCESS_SYSMEM_READ;
      else
         mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_READ;
   }

   if (flags &
       (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
        VK_ACCESS_MEMORY_READ_BIT)) {
      if (gmem)
         mask |= TU_ACCESS_SYSMEM_READ;
      else
         mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_READ;
   }

   if (flags &
       (VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      if (gmem) {
         mask |= TU_ACCESS_SYSMEM_WRITE;
      } else {
         mask |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
      }
   }

   if (flags &
       (VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      if (gmem) {
         mask |= TU_ACCESS_SYSMEM_WRITE;
      } else {
         mask |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;
      }
   }

   /* Transfer writes go through the color CCU (not the incoherent path). */
   if (flags &
       (VK_ACCESS_TRANSFER_WRITE_BIT |
        VK_ACCESS_MEMORY_WRITE_BIT)) {
      if (gmem) {
         mask |= TU_ACCESS_SYSMEM_WRITE;
      } else {
         mask |= TU_ACCESS_CCU_COLOR_WRITE;
      }
   }

   if (flags &
       (VK_ACCESS_TRANSFER_READ_BIT | /* Access performed by TP */
        VK_ACCESS_MEMORY_READ_BIT)) {
      mask |= TU_ACCESS_UCHE_READ;
   }

   return mask;
}
2872
/* Map a single Vulkan pipeline-stage bit to the driver's internal stage
 * enum.  'dst' selects the conservative direction for ambiguous stages:
 * for a barrier destination we want the earliest plausible stage, for a
 * source the latest.
 */
static enum tu_stage
vk2tu_single_stage(VkPipelineStageFlags vk_stage, bool dst)
{
   switch (vk_stage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
   case VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT:
   case VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT:
      return TU_STAGE_CP;
   case VK_PIPELINE_STAGE_VERTEX_INPUT_BIT:
      return TU_STAGE_FE;
   case VK_PIPELINE_STAGE_VERTEX_SHADER_BIT:
   case VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT:
   case VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT:
   case VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT:
      return TU_STAGE_SP_VS;
   case VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT:
   case VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT:
      return TU_STAGE_SP_PS;
   case VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT: /* Yes, really */
   /* See comment in TU_STAGE_GRAS about early fragment tests */
   case VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT:
   case VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT:
   case VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT:
   case VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT:
      return TU_STAGE_PS;

   case VK_PIPELINE_STAGE_TRANSFER_BIT:
      /* Blits read in SP_PS and write in PS, in both 2d and 3d cases */
      return dst ? TU_STAGE_SP_PS : TU_STAGE_PS;

   case VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT:
   case VK_PIPELINE_STAGE_ALL_COMMANDS_BIT:
      /* Be conservative */
      return dst ? TU_STAGE_CP : TU_STAGE_PS;

   case VK_PIPELINE_STAGE_HOST_BIT:
      return dst ? TU_STAGE_PS : TU_STAGE_CP;
   }

   unreachable("unknown pipeline stage");
}
2914
2915static enum tu_stage
2916vk2tu_src_stage(VkPipelineStageFlags vk_stages)
2917{
2918   enum tu_stage stage = TU_STAGE_CP;
2919   u_foreach_bit (bit, vk_stages) {
2920      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, false);
2921      stage = MAX2(stage, new_stage);
2922   }
2923
2924   return stage;
2925}
2926
2927static enum tu_stage
2928vk2tu_dst_stage(VkPipelineStageFlags vk_stages)
2929{
2930   enum tu_stage stage = TU_STAGE_PS;
2931   u_foreach_bit (bit, vk_stages) {
2932      enum tu_stage new_stage = vk2tu_single_stage(1ull << bit, true);
2933      stage = MIN2(stage, new_stage);
2934   }
2935
2936   return stage;
2937}
2938
/* Splice the given secondary command buffers into this primary.  Secondary
 * draw streams are appended to draw_cs/draw_epilogue_cs when inside a
 * render pass; otherwise their top-level streams are appended to cs.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdExecuteCommands(VkCommandBuffer commandBuffer,
                      uint32_t commandBufferCount,
                      const VkCommandBuffer *pCmdBuffers)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   VkResult result;

   assert(commandBufferCount > 0);

   /* Emit any pending flushes. */
   if (cmd->state.pass) {
      tu_flush_all_pending(&cmd->state.renderpass_cache);
      tu_emit_cache_flush_renderpass(cmd, &cmd->draw_cs);
   } else {
      tu_flush_all_pending(&cmd->state.cache);
      tu_emit_cache_flush(cmd, &cmd->cs);
   }

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      TU_FROM_HANDLE(tu_cmd_buffer, secondary, pCmdBuffers[i]);

      if (secondary->usage_flags &
          VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         /* A render-pass-continue secondary records only draw streams. */
         assert(tu_cs_is_empty(&secondary->cs));

         result = tu_cs_add_entries(&cmd->draw_cs, &secondary->draw_cs);
         if (result != VK_SUCCESS) {
            cmd->record_result = result;
            break;
         }

         result = tu_cs_add_entries(&cmd->draw_epilogue_cs,
               &secondary->draw_epilogue_cs);
         if (result != VK_SUCCESS) {
            cmd->record_result = result;
            break;
         }

         /* Propagate flags that affect how the primary's render pass is
          * finalized.
          */
         if (secondary->state.has_tess)
            cmd->state.has_tess = true;
         if (secondary->state.has_subpass_predication)
            cmd->state.has_subpass_predication = true;
         if (secondary->state.disable_gmem)
            cmd->state.disable_gmem = true;
      } else {
         assert(tu_cs_is_empty(&secondary->draw_cs));
         assert(tu_cs_is_empty(&secondary->draw_epilogue_cs));

         tu_cs_add_entries(&cmd->cs, &secondary->cs);
      }

      cmd->state.index_size = secondary->state.index_size; /* for restart index update */
   }
   cmd->state.dirty = ~0u; /* TODO: set dirty only what needs to be */

   if (cmd->state.pass) {
      /* After a secondary command buffer is executed, LRZ is not valid
       * until it is cleared again.
       */
      cmd->state.lrz.valid = false;
   }

   /* After executing secondary command buffers, there may have been arbitrary
    * flushes executed, so when we encounter a pipeline barrier with a
    * srcMask, we have to assume that we need to invalidate. Therefore we need
    * to re-initialize the cache with all pending invalidate bits set.
    */
   if (cmd->state.pass) {
      tu_cache_init(&cmd->state.renderpass_cache);
   } else {
      tu_cache_init(&cmd->state.cache);
   }
}
3013
3014VKAPI_ATTR VkResult VKAPI_CALL
3015tu_CreateCommandPool(VkDevice _device,
3016                     const VkCommandPoolCreateInfo *pCreateInfo,
3017                     const VkAllocationCallbacks *pAllocator,
3018                     VkCommandPool *pCmdPool)
3019{
3020   TU_FROM_HANDLE(tu_device, device, _device);
3021   struct tu_cmd_pool *pool;
3022
3023   pool = vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
3024                          VK_OBJECT_TYPE_COMMAND_POOL);
3025   if (pool == NULL)
3026      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
3027
3028   if (pAllocator)
3029      pool->alloc = *pAllocator;
3030   else
3031      pool->alloc = device->vk.alloc;
3032
3033   list_inithead(&pool->cmd_buffers);
3034   list_inithead(&pool->free_cmd_buffers);
3035
3036   pool->queue_family_index = pCreateInfo->queueFamilyIndex;
3037
3038   *pCmdPool = tu_cmd_pool_to_handle(pool);
3039
3040   return VK_SUCCESS;
3041}
3042
3043VKAPI_ATTR void VKAPI_CALL
3044tu_DestroyCommandPool(VkDevice _device,
3045                      VkCommandPool commandPool,
3046                      const VkAllocationCallbacks *pAllocator)
3047{
3048   TU_FROM_HANDLE(tu_device, device, _device);
3049   TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
3050
3051   if (!pool)
3052      return;
3053
3054   list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
3055                            &pool->cmd_buffers, pool_link)
3056   {
3057      tu_cmd_buffer_destroy(cmd_buffer);
3058   }
3059
3060   list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
3061                            &pool->free_cmd_buffers, pool_link)
3062   {
3063      tu_cmd_buffer_destroy(cmd_buffer);
3064   }
3065
3066   vk_object_free(&device->vk, pAllocator, pool);
3067}
3068
3069VKAPI_ATTR VkResult VKAPI_CALL
3070tu_ResetCommandPool(VkDevice device,
3071                    VkCommandPool commandPool,
3072                    VkCommandPoolResetFlags flags)
3073{
3074   TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
3075   VkResult result;
3076
3077   list_for_each_entry(struct tu_cmd_buffer, cmd_buffer, &pool->cmd_buffers,
3078                       pool_link)
3079   {
3080      result = tu_reset_cmd_buffer(cmd_buffer);
3081      if (result != VK_SUCCESS)
3082         return result;
3083   }
3084
3085   return VK_SUCCESS;
3086}
3087
3088VKAPI_ATTR void VKAPI_CALL
3089tu_TrimCommandPool(VkDevice device,
3090                   VkCommandPool commandPool,
3091                   VkCommandPoolTrimFlags flags)
3092{
3093   TU_FROM_HANDLE(tu_cmd_pool, pool, commandPool);
3094
3095   if (!pool)
3096      return;
3097
3098   list_for_each_entry_safe(struct tu_cmd_buffer, cmd_buffer,
3099                            &pool->free_cmd_buffers, pool_link)
3100   {
3101      tu_cmd_buffer_destroy(cmd_buffer);
3102   }
3103}
3104
/* Apply a render-pass subpass barrier: translate its access masks into
 * cache flush state and its stage masks into a possible WFI.  'external'
 * selects the outside-renderpass cache (for the initial/final dependency)
 * versus the renderpass cache.
 */
static void
tu_subpass_barrier(struct tu_cmd_buffer *cmd_buffer,
                   const struct tu_subpass_barrier *barrier,
                   bool external)
{
   /* Note: we don't know until the end of the subpass whether we'll use
    * sysmem, so assume sysmem here to be safe.
    */
   struct tu_cache_state *cache =
      external ? &cmd_buffer->state.cache : &cmd_buffer->state.renderpass_cache;
   enum tu_cmd_access_mask src_flags =
      vk2tu_access(barrier->src_access_mask, false);
   enum tu_cmd_access_mask dst_flags =
      vk2tu_access(barrier->dst_access_mask, false);

   /* Attachment layout transitions flagged by the render pass count as
    * incoherent CCU writes on top of the barrier's own access mask.
    */
   if (barrier->incoherent_ccu_color)
      src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
   if (barrier->incoherent_ccu_depth)
      src_flags |= TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE;

   tu_flush_for_access(cache, src_flags, dst_flags);

   enum tu_stage src_stage = vk2tu_src_stage(barrier->src_stage_mask);
   enum tu_stage dst_stage = vk2tu_dst_stage(barrier->dst_stage_mask);
   tu_flush_for_stage(cache, src_stage, dst_stage);
}
3131
VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
                       const VkRenderPassBeginInfo *pRenderPassBegin,
                       const VkSubpassBeginInfo *pSubpassBeginInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_render_pass, pass, pRenderPassBegin->renderPass);
   TU_FROM_HANDLE(tu_framebuffer, fb, pRenderPassBegin->framebuffer);

   /* With imageless framebuffers the attachments arrive in the pNext chain
    * instead of the framebuffer object.
    */
   const struct VkRenderPassAttachmentBeginInfo *pAttachmentInfo =
      vk_find_struct_const(pRenderPassBegin->pNext,
                           RENDER_PASS_ATTACHMENT_BEGIN_INFO);

   cmd->state.pass = pass;
   cmd->state.subpass = pass->subpasses;
   cmd->state.framebuffer = fb;
   cmd->state.render_area = pRenderPassBegin->renderArea;

   /* Per-renderpass image-view array; sourced from either the pNext chain
    * or the framebuffer below.
    */
   cmd->state.attachments =
      vk_alloc(&cmd->pool->alloc, pass->attachment_count *
               sizeof(cmd->state.attachments[0]), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!cmd->state.attachments) {
      cmd->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
      return;
   }

   for (unsigned i = 0; i < pass->attachment_count; i++) {
      cmd->state.attachments[i] = pAttachmentInfo ?
         tu_image_view_from_handle(pAttachmentInfo->pAttachments[i]) :
         cmd->state.framebuffer->attachments[i].attachment;
   }

   trace_start_render_pass(&cmd->trace, &cmd->cs);

   /* Note: because this is external, any flushes will happen before draw_cs
    * gets called. However deferred flushes could have to happen later as part
    * of the subpass.
    */
   tu_subpass_barrier(cmd, &pass->subpasses[0].start_barrier, true);
   cmd->state.renderpass_cache.pending_flush_bits =
      cmd->state.cache.pending_flush_bits;
   cmd->state.renderpass_cache.flush_bits = 0;

   if (pass->subpasses[0].feedback_invalidate)
      cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;

   /* Track LRZ valid state */
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      struct tu_image *image = cmd->state.attachments[a]->image;
      /* if image has lrz and it isn't a stencil-only clear: */
      if (image->lrz_height &&
          (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT))) {
         cmd->state.lrz.image = image;
         cmd->state.lrz.valid = true;
         cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;

         tu6_clear_lrz(cmd, &cmd->cs, image, &pRenderPassBegin->pClearValues[a]);

         /* Clearing writes via CCU color in the PS stage, and LRZ is read via
          * UCHE in the earlier GRAS stage.
          */
         cmd->state.cache.flush_bits |=
            TU_CMD_FLAG_CCU_FLUSH_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
            TU_CMD_FLAG_WAIT_FOR_IDLE;
      } else {
         cmd->state.lrz.valid = false;
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }

   cmd->trace_renderpass_start = u_trace_end_iterator(&cmd->trace);

   tu_emit_renderpass_begin(cmd, pRenderPassBegin);

   /* Emit the first subpass's static framebuffer state into draw_cs. */
   tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
   tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
   if (cmd->state.subpass->samples)
      tu6_emit_msaa(&cmd->draw_cs, cmd->state.subpass->samples, cmd->state.line_mode);
   tu6_emit_render_cntl(cmd, cmd->state.subpass, &cmd->draw_cs, false);

   tu_set_input_attachments(cmd, cmd->state.subpass);
}
3218
/* End the current subpass and start the next one: resolve the ended
 * subpass's attachments (separately for the GMEM and sysmem render paths),
 * apply the next subpass's start barrier, and re-emit the per-subpass
 * render target state.
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdNextSubpass2(VkCommandBuffer commandBuffer,
                   const VkSubpassBeginInfo *pSubpassBeginInfo,
                   const VkSubpassEndInfo *pSubpassEndInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   const struct tu_render_pass *pass = cmd->state.pass;
   struct tu_cs *cs = &cmd->draw_cs;

   /* Advance to the next subpass; 'subpass' keeps the one being ended so
    * its resolves can be emitted below.
    */
   const struct tu_subpass *subpass = cmd->state.subpass++;

   /* Track LRZ valid state
    *
    * TODO: Improve this tracking for keeping the state of the past depth/stencil images,
    * so if they become active again, we reuse its old state.
    */
   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;

   /* GMEM path: store resolve destinations out to memory, and reload any
    * resolved attachment that still has a GMEM allocation.
    */
   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);

   if (subpass->resolve_attachments) {
      tu6_emit_blit_scissor(cmd, cs, true);

      for (unsigned i = 0; i < subpass->resolve_count; i++) {
         uint32_t a = subpass->resolve_attachments[i].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;

         uint32_t gmem_a = tu_subpass_get_attachment_to_resolve(subpass, i);

         tu_store_gmem_attachment(cmd, cs, a, gmem_a);

         if (pass->attachments[a].gmem_offset < 0)
            continue;

         /* TODO:
          * check if the resolved attachment is needed by later subpasses,
          * if it is, should be doing a GMEM->GMEM resolve instead of GMEM->MEM->GMEM..
          */
         tu_finishme("missing GMEM->GMEM resolve path\n");
         tu_load_gmem_attachment(cmd, cs, a, true);
      }
   }

   tu_cond_exec_end(cs);

   /* Sysmem path: resolves are plain blits. */
   tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);

   tu6_emit_sysmem_resolves(cmd, cs, subpass);

   tu_cond_exec_end(cs);

   /* Handle dependencies for the next subpass */
   tu_subpass_barrier(cmd, &cmd->state.subpass->start_barrier, false);

   if (cmd->state.subpass->feedback_invalidate)
      cmd->state.renderpass_cache.flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;

   /* emit mrt/zs/msaa/ubwc state for the subpass that is starting */
   tu6_emit_zs(cmd, cmd->state.subpass, cs);
   tu6_emit_mrt(cmd, cmd->state.subpass, cs);
   if (cmd->state.subpass->samples)
      tu6_emit_msaa(cs, cmd->state.subpass->samples, cmd->state.line_mode);
   tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);

   tu_set_input_attachments(cmd, cmd->state.subpass);
}
3287
3288static uint32_t
3289tu6_user_consts_size(const struct tu_pipeline *pipeline,
3290                     struct tu_descriptor_state *descriptors_state,
3291                     gl_shader_stage type)
3292{
3293   const struct tu_program_descriptor_linkage *link =
3294      &pipeline->program.link[type];
3295   const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
3296   uint32_t dwords = 0;
3297
3298   if (link->push_consts.count > 0) {
3299      unsigned num_units = link->push_consts.count;
3300      dwords += 4 + num_units * 4;
3301   }
3302
3303   for (uint32_t i = 0; i < state->num_enabled; i++) {
3304      uint32_t size = state->range[i].end - state->range[i].start;
3305
3306      size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
3307
3308      if (size == 0)
3309         continue;
3310
3311      if (!state->range[i].ubo.bindless)
3312         continue;
3313
3314      uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
3315         descriptors_state->dynamic_descriptors :
3316         descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
3317      unsigned block = state->range[i].ubo.block;
3318      uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
3319      uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
3320      desc_size = desc_size > state->range[i].start ?
3321         desc_size - state->range[i].start : 0;
3322
3323      if (desc_size < size) {
3324         uint32_t zero_size = size - desc_size;
3325         dwords += 4 + zero_size / 4;
3326         size = desc_size;
3327      }
3328
3329      if (size > 0) {
3330         dwords += 4;
3331      }
3332   }
3333
3334   return dwords;
3335}
3336
3337static void
3338tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
3339                     struct tu_descriptor_state *descriptors_state,
3340                     gl_shader_stage type,
3341                     uint32_t *push_constants)
3342{
3343   const struct tu_program_descriptor_linkage *link =
3344      &pipeline->program.link[type];
3345   const struct ir3_const_state *const_state = &link->const_state;
3346   const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
3347
3348   if (link->push_consts.count > 0) {
3349      unsigned num_units = link->push_consts.count;
3350      unsigned offset = link->push_consts.lo;
3351      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_units * 4);
3352      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
3353            CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3354            CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3355            CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3356            CP_LOAD_STATE6_0_NUM_UNIT(num_units));
3357      tu_cs_emit(cs, 0);
3358      tu_cs_emit(cs, 0);
3359      for (unsigned i = 0; i < num_units * 4; i++)
3360         tu_cs_emit(cs, push_constants[i + offset * 4]);
3361   }
3362
3363   for (uint32_t i = 0; i < state->num_enabled; i++) {
3364      uint32_t size = state->range[i].end - state->range[i].start;
3365      uint32_t offset = state->range[i].start;
3366
3367      /* and even if the start of the const buffer is before
3368       * first_immediate, the end may not be:
3369       */
3370      size = MIN2(size, (16 * link->constlen) - state->range[i].offset);
3371
3372      if (size == 0)
3373         continue;
3374
3375      /* things should be aligned to vec4: */
3376      debug_assert((state->range[i].offset % 16) == 0);
3377      debug_assert((size % 16) == 0);
3378      debug_assert((offset % 16) == 0);
3379
3380      /* Dig out the descriptor from the descriptor state and read the VA from
3381       * it.  All our UBOs are bindless with the exception of the NIR
3382       * constant_data, which is uploaded once in the pipeline.
3383       */
3384      if (!state->range[i].ubo.bindless) {
3385         assert(state->range[i].ubo.block == const_state->constant_data_ubo);
3386         continue;
3387      }
3388
3389      uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
3390         descriptors_state->dynamic_descriptors :
3391         descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
3392      unsigned block = state->range[i].ubo.block;
3393      uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
3394      uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
3395      uint32_t desc_size = (desc[1] >> A6XX_UBO_1_SIZE__SHIFT) * 16;
3396      desc_size = desc_size > state->range[i].start ?
3397         desc_size - state->range[i].start : 0;
3398
3399      /* Handle null UBO descriptors and out-of-range UBO reads by filling the
3400       * rest with 0, simulating what reading with ldc would do. This behavior
3401       * is required by VK_EXT_robustness2.
3402       */
3403      if (desc_size < size) {
3404         uint32_t zero_size = size - desc_size;
3405         uint32_t zero_offset = state->range[i].offset + desc_size;
3406         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + zero_size / 4);
3407         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(zero_offset / 16) |
3408               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3409               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
3410               CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3411               CP_LOAD_STATE6_0_NUM_UNIT(zero_size / 16));
3412         tu_cs_emit_qw(cs, 0);
3413         for (unsigned i = 0; i < zero_size / 4; i++) {
3414            tu_cs_emit(cs, 0);
3415         }
3416         size = desc_size;
3417      }
3418
3419      if (size > 0) {
3420         assert(va);
3421         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
3422         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(state->range[i].offset / 16) |
3423               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
3424               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
3425               CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
3426               CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
3427         tu_cs_emit_qw(cs, va + offset);
3428      }
3429   }
3430}
3431
3432static struct tu_draw_state
3433tu6_emit_consts(struct tu_cmd_buffer *cmd,
3434                const struct tu_pipeline *pipeline,
3435                struct tu_descriptor_state *descriptors_state,
3436                gl_shader_stage type)
3437{
3438   uint32_t dwords = tu6_user_consts_size(pipeline, descriptors_state, type);
3439   if (dwords == 0)
3440      return (struct tu_draw_state) {};
3441
3442   struct tu_cs cs;
3443   tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
3444
3445   tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
3446
3447   return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
3448}
3449
3450static struct tu_draw_state
3451tu6_emit_consts_geom(struct tu_cmd_buffer *cmd,
3452                      const struct tu_pipeline *pipeline,
3453                      struct tu_descriptor_state *descriptors_state)
3454{
3455   uint32_t dwords = 0;
3456
3457   for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
3458      dwords += tu6_user_consts_size(pipeline, descriptors_state, type);
3459
3460   if (dwords == 0)
3461      return (struct tu_draw_state) {};
3462
3463   struct tu_cs cs;
3464   tu_cs_begin_sub_stream(&cmd->sub_cs, dwords, &cs);
3465
3466   for (uint32_t type = MESA_SHADER_VERTEX; type < MESA_SHADER_FRAGMENT; type++)
3467      tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
3468
3469   return tu_cs_end_draw_state(&cmd->sub_cs, &cs);
3470}
3471
3472static uint64_t
3473get_tess_param_bo_size(const struct tu_pipeline *pipeline,
3474                       uint32_t draw_count)
3475{
3476   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
3477    * Still not sure what to do here, so just allocate a reasonably large
3478    * BO and hope for the best for now. */
3479   if (!draw_count)
3480      draw_count = 2048;
3481
3482   /* the tess param BO is pipeline->tess.param_stride bytes per patch,
3483    * which includes both the per-vertex outputs and per-patch outputs
3484    * build_primitive_map in ir3 calculates this stride
3485    */
3486   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
3487   uint32_t num_patches = draw_count / verts_per_patch;
3488   return num_patches * pipeline->tess.param_stride;
3489}
3490
3491static uint64_t
3492get_tess_factor_bo_size(const struct tu_pipeline *pipeline,
3493                        uint32_t draw_count)
3494{
3495   /* TODO: For indirect draws, we can't compute the BO size ahead of time.
3496    * Still not sure what to do here, so just allocate a reasonably large
3497    * BO and hope for the best for now. */
3498   if (!draw_count)
3499      draw_count = 2048;
3500
3501   /* Each distinct patch gets its own tess factor output. */
3502   uint32_t verts_per_patch = pipeline->ia.primtype - DI_PT_PATCHES0;
3503   uint32_t num_patches = draw_count / verts_per_patch;
3504   uint32_t factor_stride;
3505   switch (pipeline->tess.patch_type) {
3506   case IR3_TESS_ISOLINES:
3507      factor_stride = 12;
3508      break;
3509   case IR3_TESS_TRIANGLES:
3510      factor_stride = 20;
3511      break;
3512   case IR3_TESS_QUADS:
3513      factor_stride = 28;
3514      break;
3515   default:
3516      unreachable("bad tessmode");
3517   }
3518   return factor_stride * num_patches;
3519}
3520
/* Allocate the tess param/factor scratch BO (when either tess stage reads
 * its address from the const file) and build a draw state that loads the
 * two addresses into the HS and DS const files.
 *
 * On success *state receives the draw state; *factor_iova receives the
 * tess factor base address only when a BO was allocated (callers
 * pre-initialize it to 0).
 */
static VkResult
tu6_emit_tess_consts(struct tu_cmd_buffer *cmd,
                     uint32_t draw_count,
                     const struct tu_pipeline *pipeline,
                     struct tu_draw_state *state,
                     uint64_t *factor_iova)
{
   struct tu_cs cs;
   VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 16, &cs);
   if (result != VK_SUCCESS)
      return result;

   /* The addresses are only consumed if the stage's constlen actually
    * covers the register ir3 reserved for them.
    */
   const struct tu_program_descriptor_linkage *hs_link =
      &pipeline->program.link[MESA_SHADER_TESS_CTRL];
   bool hs_uses_bo = pipeline->tess.hs_bo_regid < hs_link->constlen;

   const struct tu_program_descriptor_linkage *ds_link =
      &pipeline->program.link[MESA_SHADER_TESS_EVAL];
   bool ds_uses_bo = pipeline->tess.ds_bo_regid < ds_link->constlen;

   /* Layout within the single scratch BO: factors first, params after. */
   uint64_t tess_factor_size = get_tess_factor_bo_size(pipeline, draw_count);
   uint64_t tess_param_size = get_tess_param_bo_size(pipeline, draw_count);
   uint64_t tess_bo_size =  tess_factor_size + tess_param_size;
   if ((hs_uses_bo || ds_uses_bo) && tess_bo_size > 0) {
      struct tu_bo *tess_bo;
      result = tu_get_scratch_bo(cmd->device, tess_bo_size, &tess_bo);
      if (result != VK_SUCCESS)
         return result;

      uint64_t tess_factor_iova = tess_bo->iova;
      uint64_t tess_param_iova = tess_factor_iova + tess_factor_size;

      /* Load {param_iova, factor_iova} as one const unit into the HS. */
      if (hs_uses_bo) {
         tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
         tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.hs_bo_regid) |
               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_HS_SHADER) |
               CP_LOAD_STATE6_0_NUM_UNIT(1));
         tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
         tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
         tu_cs_emit_qw(&cs, tess_param_iova);
         tu_cs_emit_qw(&cs, tess_factor_iova);
      }

      /* Same two addresses for the DS. */
      if (ds_uses_bo) {
         tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
         tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(pipeline->tess.ds_bo_regid) |
               CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_DS_SHADER) |
               CP_LOAD_STATE6_0_NUM_UNIT(1));
         tu_cs_emit(&cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
         tu_cs_emit(&cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
         tu_cs_emit_qw(&cs, tess_param_iova);
         tu_cs_emit_qw(&cs, tess_factor_iova);
      }

      *factor_iova = tess_factor_iova;
   }
   *state = tu_cs_end_draw_state(&cmd->sub_cs, &cs);
   return VK_SUCCESS;
}
3584
3585static enum tu_lrz_direction
3586tu6_lrz_depth_mode(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
3587                   VkCompareOp depthCompareOp,
3588                   bool *invalidate_lrz)
3589{
3590   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
3591
3592   /* LRZ does not support some depth modes. */
3593   switch (depthCompareOp) {
3594   case VK_COMPARE_OP_ALWAYS:
3595   case VK_COMPARE_OP_NOT_EQUAL:
3596      *invalidate_lrz = true;
3597      gras_lrz_cntl->lrz_write = false;
3598      break;
3599   case VK_COMPARE_OP_EQUAL:
3600   case VK_COMPARE_OP_NEVER:
3601      gras_lrz_cntl->lrz_write = false;
3602      break;
3603   case VK_COMPARE_OP_GREATER:
3604   case VK_COMPARE_OP_GREATER_OR_EQUAL:
3605      lrz_direction = TU_LRZ_GREATER;
3606      gras_lrz_cntl->greater = true;
3607      break;
3608   case VK_COMPARE_OP_LESS:
3609   case VK_COMPARE_OP_LESS_OR_EQUAL:
3610      lrz_direction = TU_LRZ_LESS;
3611      break;
3612   default:
3613      unreachable("bad VK_COMPARE_OP value or uninitialized");
3614      break;
3615   };
3616
3617   return lrz_direction;
3618}
3619
3620/* update lrz state based on stencil-test func:
3621 *
3622 * Conceptually the order of the pipeline is:
3623 *
3624 *
3625 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
3626 *                              |                |
3627 *                       if wrmask != 0     if wrmask != 0
3628 *                              |                |
3629 *                              v                v
3630 *                        Stencil-Write      Depth-Write
3631 *
3632 * Because Stencil-Test can have side effects (Stencil-Write) prior
3633 * to depth test, in this case we potentially need to disable early
3634 * lrz-test. See:
3635 *
3636 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
3637 */
3638static void
3639tu6_lrz_stencil_op(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
3640                   VkCompareOp func,
3641                   bool stencil_write,
3642                   bool *invalidate_lrz)
3643{
3644   switch (func) {
3645   case VK_COMPARE_OP_ALWAYS:
3646      /* nothing to do for LRZ, but for stencil test when stencil-
3647       * write is enabled, we need to disable lrz-test, since
3648       * conceptually stencil test and write happens before depth-test.
3649       */
3650      if (stencil_write) {
3651         gras_lrz_cntl->enable = false;
3652         gras_lrz_cntl->z_test_enable = false;
3653         *invalidate_lrz = true;
3654      }
3655      break;
3656   case VK_COMPARE_OP_NEVER:
3657      /* fragment never passes, disable lrz_write for this draw. */
3658      gras_lrz_cntl->lrz_write = false;
3659      break;
3660   default:
3661      /* whether the fragment passes or not depends on result
3662       * of stencil test, which we cannot know when doing binning
3663       * pass.
3664       */
3665      gras_lrz_cntl->lrz_write = false;
3666      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
3667       * effects from stencil test we need to disable lrz-test.
3668       */
3669      if (stencil_write) {
3670         gras_lrz_cntl->enable = false;
3671         gras_lrz_cntl->z_test_enable = false;
3672         *invalidate_lrz = true;
3673      }
3674      break;
3675   }
3676}
3677
/* Compute the effective GRAS_LRZ_CNTL value for the current draw from the
 * pipeline's LRZ flags and the dynamic depth/stencil state. Side effects:
 * updates cmd->state.lrz.prev_direction and may clear cmd->state.lrz.valid.
 * 'a' is the subpass's depth/stencil attachment index (may be
 * VK_ATTACHMENT_UNUSED, in which case a zeroed value is returned).
 */
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   struct tu_pipeline *pipeline = cmd->state.pipeline;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };
   bool invalidate_lrz = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_LRZ;
   bool force_disable_write = pipeline->lrz.force_disable_mask & TU_LRZ_FORCE_DISABLE_WRITE;
   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;

   /* Start from the dynamic RB_DEPTH_CNTL state. */
   gras_lrz_cntl.enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
   gras_lrz_cntl.lrz_write = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
   gras_lrz_cntl.z_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
   gras_lrz_cntl.z_bounds_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE;

   VkCompareOp depth_compare_op = (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
   lrz_direction = tu6_lrz_depth_mode(&gras_lrz_cntl, depth_compare_op, &invalidate_lrz);

   /* LRZ doesn't transition properly between GREATER* and LESS* depth compare ops */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      invalidate_lrz = true;
   }

   cmd->state.lrz.prev_direction = lrz_direction;

   /* Invalidate LRZ and disable write if stencil test is enabled */
   bool stencil_test_enable = cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
   if (stencil_test_enable) {
      /* Write masks come from dynamic state when the pipeline declares
       * VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, otherwise from the pipeline.
       * Low byte is front-face, high byte back-face.
       */
      bool stencil_front_writemask =
         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
         (cmd->state.dynamic_stencil_wrmask & 0xff) :
         (pipeline->stencil_wrmask & 0xff);

      bool stencil_back_writemask =
         (pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
         ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
         (pipeline->stencil_wrmask & 0xff00) >> 8;

      VkCompareOp stencil_front_compare_op =
         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC__SHIFT;

      VkCompareOp stencil_back_compare_op =
         (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FUNC_BF__SHIFT;

      /* Apply the LRZ restrictions of each face's stencil func in turn. */
      tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_front_compare_op,
                         stencil_front_writemask, &invalidate_lrz);

      tu6_lrz_stencil_op(&gras_lrz_cntl, stencil_back_compare_op,
                         stencil_back_writemask, &invalidate_lrz);
   }

   if (force_disable_write)
      gras_lrz_cntl.lrz_write = false;

   if (invalidate_lrz) {
      cmd->state.lrz.valid = false;
   }

   /* In case no depth attachment or invalid, we clear the gras_lrz_cntl register */
   if (a == VK_ATTACHMENT_UNUSED || !cmd->state.lrz.valid)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}
3744
3745static struct tu_draw_state
3746tu6_build_lrz(struct tu_cmd_buffer *cmd)
3747{
3748   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
3749   struct tu_cs lrz_cs;
3750   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &lrz_cs, 4);
3751
3752   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state(cmd, a);
3753
3754   tu_cs_emit_regs(&lrz_cs, A6XX_GRAS_LRZ_CNTL(
3755      .enable = gras_lrz_cntl.enable,
3756      .greater = gras_lrz_cntl.greater,
3757      .lrz_write = gras_lrz_cntl.lrz_write,
3758      .z_test_enable = gras_lrz_cntl.z_test_enable,
3759      .z_bounds_enable = gras_lrz_cntl.z_bounds_enable));
3760   tu_cs_emit_regs(&lrz_cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
3761
3762   return ds;
3763}
3764
3765static bool
3766tu6_writes_depth(struct tu_cmd_buffer *cmd, bool depth_test_enable)
3767{
3768   bool depth_write_enable =
3769      cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
3770
3771   VkCompareOp depth_compare_op =
3772      (cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_ZFUNC__MASK) >> A6XX_RB_DEPTH_CNTL_ZFUNC__SHIFT;
3773
3774   bool depth_compare_op_writes = depth_compare_op != VK_COMPARE_OP_NEVER;
3775
3776   return depth_test_enable && depth_write_enable && depth_compare_op_writes;
3777}
3778
3779static bool
3780tu6_writes_stencil(struct tu_cmd_buffer *cmd)
3781{
3782   bool stencil_test_enable =
3783      cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE;
3784
3785   bool stencil_front_writemask =
3786      (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
3787      (cmd->state.dynamic_stencil_wrmask & 0xff) :
3788      (cmd->state.pipeline->stencil_wrmask & 0xff);
3789
3790   bool stencil_back_writemask =
3791      (cmd->state.pipeline->dynamic_state_mask & BIT(VK_DYNAMIC_STATE_STENCIL_WRITE_MASK)) ?
3792      ((cmd->state.dynamic_stencil_wrmask & 0xff00) >> 8) :
3793      (cmd->state.pipeline->stencil_wrmask & 0xff00) >> 8;
3794
3795   VkStencilOp front_fail_op =
3796      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL__SHIFT;
3797   VkStencilOp front_pass_op =
3798      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS__SHIFT;
3799   VkStencilOp front_depth_fail_op =
3800      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL__SHIFT;
3801   VkStencilOp back_fail_op =
3802      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_FAIL_BF__SHIFT;
3803   VkStencilOp back_pass_op =
3804      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZPASS_BF__SHIFT;
3805   VkStencilOp back_depth_fail_op =
3806      (cmd->state.rb_stencil_cntl & A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK) >> A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__SHIFT;
3807
3808   bool stencil_front_op_writes =
3809      front_pass_op != VK_STENCIL_OP_KEEP &&
3810      front_fail_op != VK_STENCIL_OP_KEEP &&
3811      front_depth_fail_op != VK_STENCIL_OP_KEEP;
3812
3813   bool stencil_back_op_writes =
3814      back_pass_op != VK_STENCIL_OP_KEEP &&
3815      back_fail_op != VK_STENCIL_OP_KEEP &&
3816      back_depth_fail_op != VK_STENCIL_OP_KEEP;
3817
3818   return stencil_test_enable &&
3819      ((stencil_front_writemask && stencil_front_op_writes) ||
3820       (stencil_back_writemask && stencil_back_op_writes));
3821}
3822
3823static struct tu_draw_state
3824tu6_build_depth_plane_z_mode(struct tu_cmd_buffer *cmd)
3825{
3826   struct tu_cs cs;
3827   struct tu_draw_state ds = tu_cs_draw_state(&cmd->sub_cs, &cs, 4);
3828
3829   enum a6xx_ztest_mode zmode = A6XX_EARLY_Z;
3830   bool depth_test_enable = cmd->state.rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE;
3831   bool depth_write = tu6_writes_depth(cmd, depth_test_enable);
3832   bool stencil_write = tu6_writes_stencil(cmd);
3833
3834   if (cmd->state.pipeline->lrz.fs_has_kill &&
3835       (depth_write || stencil_write)) {
3836      zmode = cmd->state.lrz.valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z;
3837   }
3838
3839   if (cmd->state.pipeline->lrz.force_late_z || !depth_test_enable)
3840      zmode = A6XX_LATE_Z;
3841
3842   /* User defined early tests take precedence above all else */
3843   if (cmd->state.pipeline->lrz.early_fragment_tests)
3844      zmode = A6XX_EARLY_Z;
3845
3846   tu_cs_emit_pkt4(&cs, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1);
3847   tu_cs_emit(&cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_Z_MODE(zmode));
3848
3849   tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1);
3850   tu_cs_emit(&cs, A6XX_RB_DEPTH_PLANE_CNTL_Z_MODE(zmode));
3851   return ds;
3852}
3853
3854static VkResult
3855tu6_draw_common(struct tu_cmd_buffer *cmd,
3856                struct tu_cs *cs,
3857                bool indexed,
3858                /* note: draw_count is 0 for indirect */
3859                uint32_t draw_count)
3860{
3861   const struct tu_pipeline *pipeline = cmd->state.pipeline;
3862   VkResult result;
3863
3864   tu_emit_cache_flush_renderpass(cmd, cs);
3865
3866   bool primitive_restart_enabled = pipeline->ia.primitive_restart;
3867   if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE))
3868      primitive_restart_enabled = cmd->state.primitive_restart_enable;
3869
3870   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0(
3871         .primitive_restart =
3872               primitive_restart_enabled && indexed,
3873         .provoking_vtx_last = pipeline->provoking_vertex_last,
3874         .tess_upper_left_domain_origin =
3875               pipeline->tess.upper_left_domain_origin));
3876
3877   bool has_tess =
3878         pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
3879
3880   /* Early exit if there is nothing to emit, saves CPU cycles */
3881   if (!(cmd->state.dirty & ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD) &&
3882       !has_tess)
3883      return VK_SUCCESS;
3884
3885   bool dirty_lrz = cmd->state.dirty & (TU_CMD_DIRTY_LRZ | TU_CMD_DIRTY_RB_DEPTH_CNTL | TU_CMD_DIRTY_RB_STENCIL_CNTL);
3886
3887   struct tu_descriptor_state *descriptors_state =
3888      &cmd->descriptors[VK_PIPELINE_BIND_POINT_GRAPHICS];
3889
3890   if (dirty_lrz) {
3891      cmd->state.lrz.state = tu6_build_lrz(cmd);
3892      cmd->state.depth_plane_state = tu6_build_depth_plane_z_mode(cmd);
3893   }
3894
3895   if (cmd->state.dirty & TU_CMD_DIRTY_RASTERIZER_DISCARD) {
3896      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4);
3897      tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = cmd->state.pc_raster_cntl));
3898      tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = cmd->state.vpc_unknown_9107));
3899   }
3900
3901   if (cmd->state.dirty & TU_CMD_DIRTY_GRAS_SU_CNTL) {
3902      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2);
3903      tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = cmd->state.gras_su_cntl));
3904   }
3905
3906   if (cmd->state.dirty & TU_CMD_DIRTY_RB_DEPTH_CNTL) {
3907      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2);
3908      uint32_t rb_depth_cntl = cmd->state.rb_depth_cntl;
3909
3910      if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE) ||
3911          (rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE))
3912         rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
3913
3914      if ((rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE) &&
3915          !(rb_depth_cntl & A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE))
3916         tu6_apply_depth_bounds_workaround(cmd->device, &rb_depth_cntl);
3917
3918      if (pipeline->rb_depth_cntl_disable)
3919         rb_depth_cntl = 0;
3920
3921      tu_cs_emit_regs(&cs, A6XX_RB_DEPTH_CNTL(.dword = rb_depth_cntl));
3922   }
3923
3924   if (cmd->state.dirty & TU_CMD_DIRTY_RB_STENCIL_CNTL) {
3925      struct tu_cs cs = tu_cmd_dynamic_state(cmd, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2);
3926      tu_cs_emit_regs(&cs, A6XX_RB_STENCIL_CONTROL(.dword = cmd->state.rb_stencil_cntl));
3927   }
3928
3929   if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
3930      cmd->state.shader_const[0] =
3931         tu6_emit_consts_geom(cmd, pipeline, descriptors_state);
3932      cmd->state.shader_const[1] =
3933         tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_FRAGMENT);
3934   }
3935
3936   struct tu_draw_state tess_consts = {};
3937   if (has_tess) {
3938      uint64_t tess_factor_iova = 0;
3939
3940      cmd->state.has_tess = true;
3941      result = tu6_emit_tess_consts(cmd, draw_count, pipeline, &tess_consts, &tess_factor_iova);
3942      if (result != VK_SUCCESS)
3943         return result;
3944
3945      /* this sequence matches what the blob does before every tess draw
3946       * PC_TESSFACTOR_ADDR_LO is a non-context register and needs a wfi
3947       * before writing to it
3948       */
3949      tu_cs_emit_wfi(cs);
3950
3951      tu_cs_emit_regs(cs, A6XX_PC_TESSFACTOR_ADDR(.qword = tess_factor_iova));
3952
3953      tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
3954      tu_cs_emit(cs, draw_count);
3955   }
3956
3957   /* for the first draw in a renderpass, re-emit all the draw states
3958    *
3959    * and if a draw-state disabling path (CmdClearAttachments 3D fallback) was
3960    * used, then draw states must be re-emitted. note however this only happens
3961    * in the sysmem path, so this can be skipped this for the gmem path (TODO)
3962    *
3963    * the two input attachment states are excluded because secondary command
3964    * buffer doesn't have a state ib to restore it, and not re-emitting them
3965    * is OK since CmdClearAttachments won't disable/overwrite them
3966    */
3967   if (cmd->state.dirty & TU_CMD_DIRTY_DRAW_STATE) {
3968      tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3969
3970      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_CONFIG, pipeline->program.config_state);
3971      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM, pipeline->program.state);
3972      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_PROGRAM_BINNING, pipeline->program.binning_state);
3973      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
3974      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI, pipeline->vi.state);
3975      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VI_BINNING, pipeline->vi.binning_state);
3976      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_RAST, pipeline->rast_state);
3977      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_BLEND, pipeline->blend_state);
3978      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
3979      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
3980      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets);
3981      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
3982      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
3983      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
3984      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
3985      tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);
3986
3987      for (uint32_t i = 0; i < ARRAY_SIZE(cmd->state.dynamic_state); i++) {
3988         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + i,
3989                               ((pipeline->dynamic_state_mask & BIT(i)) ?
3990                                cmd->state.dynamic_state[i] :
3991                                pipeline->dynamic_state[i]));
3992      }
3993   } else {
3994      /* emit draw states that were just updated
3995       * note we eventually don't want to have to emit anything here
3996       */
3997      bool emit_binding_stride = false;
3998      uint32_t draw_state_count =
3999         has_tess +
4000         ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 2 : 0) +
4001         ((cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD) ? 1 : 0) +
4002         ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
4003         ((cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS) ? 1 : 0) +
4004         (dirty_lrz ? 2 : 0);
4005
4006      if ((cmd->state.dirty & TU_CMD_DIRTY_VB_STRIDE) &&
4007          (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
4008         emit_binding_stride = true;
4009         draw_state_count += 1;
4010      }
4011
4012      if (draw_state_count > 0)
4013         tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * draw_state_count);
4014
      /* We may need to re-emit tess consts if the current draw call is
       * sufficiently larger than the last draw call. */
4017      if (has_tess)
4018         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_TESS, tess_consts);
4019      if (cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) {
4020         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_SHADER_GEOM_CONST, cmd->state.shader_const[0]);
4021         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const[1]);
4022      }
4023      if (cmd->state.dirty & TU_CMD_DIRTY_DESC_SETS_LOAD)
4024         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DESC_SETS_LOAD, pipeline->load_state);
4025      if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
4026         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers);
4027      if (emit_binding_stride) {
4028         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_VB_STRIDE,
4029                               cmd->state.dynamic_state[TU_DYNAMIC_STATE_VB_STRIDE]);
4030      }
4031      if (cmd->state.dirty & TU_CMD_DIRTY_VS_PARAMS)
4032         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_VS_PARAMS, cmd->state.vs_params);
4033
4034      if (dirty_lrz) {
4035         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_LRZ, cmd->state.lrz.state);
4036         tu_cs_emit_draw_state(cs, TU_DRAW_STATE_DEPTH_PLANE, cmd->state.depth_plane_state);
4037      }
4038   }
4039
4040   tu_cs_sanity_check(cs);
4041
4042   /* There are too many graphics dirty bits to list here, so just list the
4043    * bits to preserve instead. The only things not emitted here are
4044    * compute-related state.
4045    */
4046   cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;
4047   return VK_SUCCESS;
4048}
4049
4050static uint32_t
4051tu_draw_initiator(struct tu_cmd_buffer *cmd, enum pc_di_src_sel src_sel)
4052{
4053   const struct tu_pipeline *pipeline = cmd->state.pipeline;
4054   enum pc_di_primtype primtype = pipeline->ia.primtype;
4055
4056   if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY)) {
4057      if (primtype < DI_PT_PATCHES0) {
4058         /* If tesselation used, only VK_PRIMITIVE_TOPOLOGY_PATCH_LIST can be
4059          * set via vkCmdSetPrimitiveTopologyEXT, but primtype is already
4060          * calculated at the pipeline creation based on control points
4061          * for each patch.
4062          *
4063          * Just use the primtype as is for the case.
4064          */
4065         primtype = cmd->state.primtype;
4066      }
4067   }
4068
4069   uint32_t initiator =
4070      CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(primtype) |
4071      CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(src_sel) |
4072      CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(cmd->state.index_size) |
4073      CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY);
4074
4075   if (pipeline->active_stages & VK_SHADER_STAGE_GEOMETRY_BIT)
4076      initiator |= CP_DRAW_INDX_OFFSET_0_GS_ENABLE;
4077
4078   switch (pipeline->tess.patch_type) {
4079   case IR3_TESS_TRIANGLES:
4080      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_TRIANGLES) |
4081                   CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
4082      break;
4083   case IR3_TESS_ISOLINES:
4084      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_ISOLINES) |
4085                   CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
4086      break;
4087   case IR3_TESS_NONE:
4088      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS);
4089      break;
4090   case IR3_TESS_QUADS:
4091      initiator |= CP_DRAW_INDX_OFFSET_0_PATCH_TYPE(TESS_QUADS) |
4092                   CP_DRAW_INDX_OFFSET_0_TESS_ENABLE;
4093      break;
4094   }
4095   return initiator;
4096}
4097
4098
4099static uint32_t
4100vs_params_offset(struct tu_cmd_buffer *cmd)
4101{
4102   const struct tu_program_descriptor_linkage *link =
4103      &cmd->state.pipeline->program.link[MESA_SHADER_VERTEX];
4104   const struct ir3_const_state *const_state = &link->const_state;
4105
4106   if (const_state->offsets.driver_param >= link->constlen)
4107      return 0;
4108
4109   /* this layout is required by CP_DRAW_INDIRECT_MULTI */
4110   STATIC_ASSERT(IR3_DP_DRAWID == 0);
4111   STATIC_ASSERT(IR3_DP_VTXID_BASE == 1);
4112   STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
4113
4114   /* 0 means disabled for CP_DRAW_INDIRECT_MULTI */
4115   assert(const_state->offsets.driver_param != 0);
4116
4117   return const_state->offsets.driver_param;
4118}
4119
4120static void
4121tu6_emit_empty_vs_params(struct tu_cmd_buffer *cmd)
4122{
4123   if (cmd->state.vs_params.iova) {
4124      cmd->state.vs_params = (struct tu_draw_state) {};
4125      cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
4126   }
4127}
4128
/* Emit VFD_INDEX_OFFSET/VFD_INSTANCE_START_OFFSET and, when the VS actually
 * reads them, the driver-param constants for vertex_offset/first_instance
 * into a fresh sub-stream, recorded in cmd->state.vs_params.
 */
static void
tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
                   uint32_t vertex_offset,
                   uint32_t first_instance)
{
   /* Beside re-emitting params when they are changed, we should re-emit
    * them after constants are invalidated via HLSQ_INVALIDATE_CMD.
    */
   if (!(cmd->state.dirty & (TU_CMD_DIRTY_DRAW_STATE | TU_CMD_DIRTY_VS_PARAMS)) &&
       vertex_offset == cmd->state.last_vs_params.vertex_offset &&
       first_instance == cmd->state.last_vs_params.first_instance) {
      return;
   }

   /* 0 means the VS does not read the driver params (see vs_params_offset) */
   uint32_t offset = vs_params_offset(cmd);

   struct tu_cs cs;
   /* 3 dwords for the two VFD regs, plus 8 for the CP_LOAD_STATE6 packet */
   VkResult result = tu_cs_begin_sub_stream(&cmd->sub_cs, 3 + (offset ? 8 : 0), &cs);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   tu_cs_emit_regs(&cs,
                   A6XX_VFD_INDEX_OFFSET(vertex_offset),
                   A6XX_VFD_INSTANCE_START_OFFSET(first_instance));

   if (offset) {
      /* Upload one vec4 of driver params at 'offset'; the dword layout
       * (DRAWID = 0, VTXID_BASE = 1, INSTID_BASE = 2) is pinned by the
       * STATIC_ASSERTs in vs_params_offset().
       */
      tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6_GEOM, 3 + 4);
      tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
            CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
            CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
            CP_LOAD_STATE6_0_STATE_BLOCK(SB6_VS_SHADER) |
            CP_LOAD_STATE6_0_NUM_UNIT(1));
      tu_cs_emit(&cs, 0);
      tu_cs_emit(&cs, 0);

      tu_cs_emit(&cs, 0);              /* IR3_DP_DRAWID: 0 for direct draws */
      tu_cs_emit(&cs, vertex_offset);  /* IR3_DP_VTXID_BASE */
      tu_cs_emit(&cs, first_instance); /* IR3_DP_INSTID_BASE */
      tu_cs_emit(&cs, 0);              /* pad to a full vec4 */
   }

   cmd->state.last_vs_params.vertex_offset = vertex_offset;
   cmd->state.last_vs_params.first_instance = first_instance;

   struct tu_cs_entry entry = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
   /* tu_draw_state is {iova, size-in-dwords} */
   cmd->state.vs_params = (struct tu_draw_state) {entry.bo->iova + entry.offset, entry.size / 4};

   cmd->state.dirty |= TU_CMD_DIRTY_VS_PARAMS;
}
4180
VKAPI_ATTR void VKAPI_CALL
tu_CmdDraw(VkCommandBuffer commandBuffer,
           uint32_t vertexCount,
           uint32_t instanceCount,
           uint32_t firstVertex,
           uint32_t firstInstance)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* make firstVertex/firstInstance available to the VS as driver params */
   tu6_emit_vs_params(cmd, firstVertex, firstInstance);

   /* indexed = false; draw_count sizes the tess consts if tess is used */
   tu6_draw_common(cmd, cs, false, vertexCount);

   /* direct (auto-index) draw: initiator, instance count, vertex count */
   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
   tu_cs_emit(cs, instanceCount);
   tu_cs_emit(cs, vertexCount);
}
4200
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
                  uint32_t indexCount,
                  uint32_t instanceCount,
                  uint32_t firstIndex,
                  int32_t vertexOffset,
                  uint32_t firstInstance)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* the signed vertexOffset is passed through the uint32_t parameter
    * (two's-complement reinterpretation)
    */
   tu6_emit_vs_params(cmd, vertexOffset, firstInstance);

   /* indexed = true so index-buffer-related state is considered */
   tu6_draw_common(cmd, cs, true, indexCount);

   /* indexed draw fetching indices via DMA from the bound index buffer
    * (index_va/max_index_count set by CmdBindIndexBuffer)
    */
   tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 7);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
   tu_cs_emit(cs, instanceCount);
   tu_cs_emit(cs, indexCount);
   tu_cs_emit(cs, firstIndex);
   tu_cs_emit_qw(cs, cmd->state.index_va);
   tu_cs_emit(cs, cmd->state.max_index_count);
}
4224
4225/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes
4226 * do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if
4227 * pending for these opcodes. This may result in a few extra WAIT_FOR_ME's
4228 * with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's
4229 * before draw opcodes that don't need it.
4230 */
4231static void
4232draw_wfm(struct tu_cmd_buffer *cmd)
4233{
4234   cmd->state.renderpass_cache.flush_bits |=
4235      cmd->state.renderpass_cache.pending_flush_bits & TU_CMD_FLAG_WAIT_FOR_ME;
4236   cmd->state.renderpass_cache.pending_flush_bits &= ~TU_CMD_FLAG_WAIT_FOR_ME;
4237}
4238
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
                   VkBuffer _buffer,
                   VkDeviceSize offset,
                   uint32_t drawCount,
                   uint32_t stride)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* draw params come from the indirect buffer, so clear the direct-draw
    * vs_params state
    */
   tu6_emit_empty_vs_params(cmd);

   /* see draw_wfm(): on affected firmware the CP doesn't wait for WFIs
    * before reading the indirect parameters
    */
   if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
      draw_wfm(cmd);

   tu6_draw_common(cmd, cs, false, 0);

   /* DST_OFF tells the CP where to write the DRAWID/VTXID_BASE/INSTID_BASE
    * driver params (0 = disabled, see vs_params_offset())
    */
   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 6);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL) |
                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
   tu_cs_emit(cs, drawCount);
   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
   tu_cs_emit(cs, stride);
}
4265
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
                          VkBuffer _buffer,
                          VkDeviceSize offset,
                          uint32_t drawCount,
                          uint32_t stride)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* draw params come from the indirect buffer, not from direct vs_params */
   tu6_emit_empty_vs_params(cmd);

   /* see draw_wfm(): on affected firmware the CP doesn't wait for WFIs
    * before reading the indirect parameters
    */
   if (cmd->device->physical_device->info->a6xx.indirect_draw_wfm_quirk)
      draw_wfm(cmd);

   tu6_draw_common(cmd, cs, true, 0);

   /* indexed indirect variant: adds index buffer iova + max index count */
   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 9);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED) |
                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
   tu_cs_emit(cs, drawCount);
   tu_cs_emit_qw(cs, cmd->state.index_va);
   tu_cs_emit(cs, cmd->state.max_index_count);
   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
   tu_cs_emit(cs, stride);
}
4294
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
                        VkBuffer _buffer,
                        VkDeviceSize offset,
                        VkBuffer countBuffer,
                        VkDeviceSize countBufferOffset,
                        uint32_t drawCount,
                        uint32_t stride)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
   TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* draw params come from the indirect buffer, not from direct vs_params */
   tu6_emit_empty_vs_params(cmd);

   /* It turns out that the firmware we have for a650 only partially fixed the
    * problem with CP_DRAW_INDIRECT_MULTI not waiting for WFI's to complete
    * before reading indirect parameters. It waits for WFI's before reading
    * the draw parameters, but after reading the indirect count :(.
    */
   draw_wfm(cmd);

   tu6_draw_common(cmd, cs, false, 0);

   /* like tu_CmdDrawIndirect but drawCount is capped by the value the CP
    * reads from the count buffer
    */
   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 8);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_INDEX));
   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT) |
                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
   tu_cs_emit(cs, drawCount);
   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
   tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);
   tu_cs_emit(cs, stride);
}
4329
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
                               VkBuffer _buffer,
                               VkDeviceSize offset,
                               VkBuffer countBuffer,
                               VkDeviceSize countBufferOffset,
                               uint32_t drawCount,
                               uint32_t stride)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, _buffer);
   TU_FROM_HANDLE(tu_buffer, count_buf, countBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* draw params come from the indirect buffer, not from direct vs_params */
   tu6_emit_empty_vs_params(cmd);

   /* unconditional: see the a650 firmware note in tu_CmdDrawIndirectCount */
   draw_wfm(cmd);

   tu6_draw_common(cmd, cs, true, 0);

   /* indexed + indirect-count: index buffer info followed by the indirect
    * parameter buffer and the count buffer
    */
   tu_cs_emit_pkt7(cs, CP_DRAW_INDIRECT_MULTI, 11);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_DMA));
   tu_cs_emit(cs, A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDIRECT_COUNT_INDEXED) |
                  A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(vs_params_offset(cmd)));
   tu_cs_emit(cs, drawCount);
   tu_cs_emit_qw(cs, cmd->state.index_va);
   tu_cs_emit(cs, cmd->state.max_index_count);
   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + offset);
   tu_cs_emit_qw(cs, count_buf->bo->iova + count_buf->bo_offset + countBufferOffset);
   tu_cs_emit(cs, stride);
}
4361
VKAPI_ATTR void VKAPI_CALL
tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
                               uint32_t instanceCount,
                               uint32_t firstInstance,
                               VkBuffer _counterBuffer,
                               VkDeviceSize counterBufferOffset,
                               uint32_t counterOffset,
                               uint32_t vertexStride)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_buffer, buf, _counterBuffer);
   struct tu_cs *cs = &cmd->draw_cs;

   /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO.
    * Plus, for the common case where the counter buffer is written by
    * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to
    * complete which means we need a WAIT_FOR_ME anyway.
    */
   draw_wfm(cmd);

   /* vertex offset is always 0 for transform-feedback draws */
   tu6_emit_vs_params(cmd, 0, firstInstance);

   tu6_draw_common(cmd, cs, false, 0);

   /* CP_DRAW_AUTO derives the vertex count from the xfb counter buffer:
    * (counter - counterOffset) / vertexStride
    */
   tu_cs_emit_pkt7(cs, CP_DRAW_AUTO, 6);
   tu_cs_emit(cs, tu_draw_initiator(cmd, DI_SRC_SEL_AUTO_XFB));
   tu_cs_emit(cs, instanceCount);
   tu_cs_emit_qw(cs, buf->bo->iova + buf->bo_offset + counterBufferOffset);
   tu_cs_emit(cs, counterOffset);
   tu_cs_emit(cs, vertexStride);
}
4393
struct tu_dispatch_info
{
   /**
    * Size of the grid, in workgroups ("blocks") per dimension. Ignored
    * when indirect is set.
    */
   uint32_t blocks[3];

   /**
    * A starting workgroup offset for the grid (vkCmdDispatchBase). If
    * unaligned is set, the offset must still be aligned.
    */
   uint32_t offsets[3];
   /**
    * Whether it's an unaligned compute dispatch.
    */
   bool unaligned;

   /**
    * Indirect compute parameters resource; NULL for a direct dispatch.
    */
   struct tu_buffer *indirect;
   /* byte offset of the VkDispatchIndirectCommand within 'indirect' */
   uint64_t indirect_offset;
};
4417
/* Upload the compute driver params (workgroup counts, base group offsets,
 * subgroup size/shift) to the constant file at the shader's driver_param
 * offset. Direct dispatches embed the values; indirect dispatches source
 * the workgroup counts from memory.
 */
static void
tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs, struct tu_pipeline *pipeline,
                              const struct tu_dispatch_info *info)
{
   gl_shader_stage type = MESA_SHADER_COMPUTE;
   const struct tu_program_descriptor_linkage *link =
      &pipeline->program.link[type];
   const struct ir3_const_state *const_state = &link->const_state;
   uint32_t offset = const_state->offsets.driver_param;
   unsigned subgroup_size = pipeline->compute.subgroup_size;
   unsigned subgroup_shift = util_logbase2(subgroup_size);

   /* shader doesn't read any driver params (dead-code eliminated) */
   if (link->constlen <= offset)
      return;

   /* constlen/offset are vec4 units, driver params are counted in dwords */
   uint32_t num_consts = MIN2(const_state->num_driver_params,
                              (link->constlen - offset) * 4);

   if (!info->indirect) {
      /* direct dispatch: all params are known, upload them inline */
      uint32_t driver_params[12] = {
         [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
         [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
         [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
         [IR3_DP_BASE_GROUP_X] = info->offsets[0],
         [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
         [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
         [IR3_DP_SUBGROUP_SIZE] = subgroup_size,
         [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
      };

      assert(num_consts <= ARRAY_SIZE(driver_params));

      /* push constants */
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      uint32_t i;
      for (i = 0; i < num_consts; i++)
         tu_cs_emit(cs, driver_params[i]);
   } else if (!(info->indirect_offset & 0xf)) {
      /* indirect dispatch with a 16-byte-aligned offset: load the
       * NumWorkGroups vec4 straight from the indirect buffer
       */
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
      tu_cs_emit_qw(cs, tu_buffer_iova(info->indirect) + info->indirect_offset);
   } else {
      /* Vulkan guarantees only 4 byte alignment for indirect_offset.
       * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
       */

      uint64_t indirect_iova = tu_buffer_iova(info->indirect) + info->indirect_offset;

      /* stage the three dwords into the 16-byte-aligned global bo slot */
      for (uint32_t i = 0; i < 3; i++) {
         tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
         tu_cs_emit(cs, 0);
         tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[i]));
         tu_cs_emit_qw(cs, indirect_iova + i * 4);
      }

      /* make the copies visible before CP_LOAD_STATE6 reads them */
      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
      tu6_emit_event_write(cmd, cs, CACHE_INVALIDATE);

      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
      tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
   }

   /* Fill out IR3_DP_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for indirect
    * dispatch.
    */
   if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
      /* DST_OFF is in vec4 units, IR3_DP_* are dword indices, hence the /4 */
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                 CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
      tu_cs_emit_qw(cs, 0);
      tu_cs_emit(cs, 0); /* BASE_GROUP_X */
      tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
      tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
      tu_cs_emit(cs, subgroup_size);
      if (num_consts > IR3_DP_LOCAL_GROUP_SIZE_X) {
         assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
         tu_cs_emit(cs, subgroup_shift);
      }
   }
}
4521
/* Record a compute dispatch (direct or indirect) into cmd->cs. */
static void
tu_dispatch(struct tu_cmd_buffer *cmd,
            const struct tu_dispatch_info *info)
{
   /* a zero-sized direct grid does nothing */
   if (!info->indirect &&
       (info->blocks[0] == 0 || info->blocks[1] == 0 || info->blocks[2] == 0))
      return;

   struct tu_cs *cs = &cmd->cs;
   struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
   struct tu_descriptor_state *descriptors_state =
      &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];

   /* TODO: We could probably flush less if we add a compute_flush_bits
    * bitfield.
    */
   tu_emit_cache_flush(cmd, cs);

   /* note: no reason to have this in a separate IB */
   tu_cs_emit_state_ib(cs,
         tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE));

   tu_emit_compute_driver_params(cmd, cs, pipeline, info);

   /* re-emit descriptor loads only when marked dirty */
   if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD)
      tu_cs_emit_state_ib(cs, pipeline->load_state);

   cmd->state.dirty &= ~TU_CMD_DIRTY_COMPUTE_DESC_SETS_LOAD;

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));

   /* NDRANGE regs: local size fields are minus-one encoded; global size is
    * local size * workgroup count. NOTE(review): for indirect dispatch
    * num_groups (info->blocks) is zero here, so globalsize_* programs as 0 --
    * presumably CP_EXEC_CS_INDIRECT supplies the real size; confirm.
    */
   const uint32_t *local_size = pipeline->compute.local_size;
   const uint32_t *num_groups = info->blocks;
   tu_cs_emit_regs(cs,
                   A6XX_HLSQ_CS_NDRANGE_0(.kerneldim = 3,
                                          .localsizex = local_size[0] - 1,
                                          .localsizey = local_size[1] - 1,
                                          .localsizez = local_size[2] - 1),
                   A6XX_HLSQ_CS_NDRANGE_1(.globalsize_x = local_size[0] * num_groups[0]),
                   A6XX_HLSQ_CS_NDRANGE_2(.globaloff_x = 0),
                   A6XX_HLSQ_CS_NDRANGE_3(.globalsize_y = local_size[1] * num_groups[1]),
                   A6XX_HLSQ_CS_NDRANGE_4(.globaloff_y = 0),
                   A6XX_HLSQ_CS_NDRANGE_5(.globalsize_z = local_size[2] * num_groups[2]),
                   A6XX_HLSQ_CS_NDRANGE_6(.globaloff_z = 0));

   tu_cs_emit_regs(cs,
                   A6XX_HLSQ_CS_KERNEL_GROUP_X(1),
                   A6XX_HLSQ_CS_KERNEL_GROUP_Y(1),
                   A6XX_HLSQ_CS_KERNEL_GROUP_Z(1));

   trace_start_compute(&cmd->trace, cs);

   if (info->indirect) {
      uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;

      /* workgroup counts are read by the CP from 'iova' at execution time */
      tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
      tu_cs_emit(cs, 0x00000000);
      tu_cs_emit_qw(cs, iova);
      tu_cs_emit(cs,
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
   } else {
      tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
      tu_cs_emit(cs, 0x00000000);
      tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
      tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
      tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
   }

   trace_end_compute(&cmd->trace, cs,
                     info->indirect != NULL,
                     local_size[0], local_size[1], local_size[2],
                     info->blocks[0], info->blocks[1], info->blocks[2]);

   tu_cs_emit_wfi(cs);
}
4600
4601VKAPI_ATTR void VKAPI_CALL
4602tu_CmdDispatchBase(VkCommandBuffer commandBuffer,
4603                   uint32_t base_x,
4604                   uint32_t base_y,
4605                   uint32_t base_z,
4606                   uint32_t x,
4607                   uint32_t y,
4608                   uint32_t z)
4609{
4610   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4611   struct tu_dispatch_info info = {};
4612
4613   info.blocks[0] = x;
4614   info.blocks[1] = y;
4615   info.blocks[2] = z;
4616
4617   info.offsets[0] = base_x;
4618   info.offsets[1] = base_y;
4619   info.offsets[2] = base_z;
4620   tu_dispatch(cmd_buffer, &info);
4621}
4622
VKAPI_ATTR void VKAPI_CALL
tu_CmdDispatch(VkCommandBuffer commandBuffer,
               uint32_t x,
               uint32_t y,
               uint32_t z)
{
   /* vkCmdDispatch is vkCmdDispatchBase with a zero base workgroup offset */
   tu_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
}
4631
4632VKAPI_ATTR void VKAPI_CALL
4633tu_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4634                       VkBuffer _buffer,
4635                       VkDeviceSize offset)
4636{
4637   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4638   TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
4639   struct tu_dispatch_info info = {};
4640
4641   info.indirect = buffer;
4642   info.indirect_offset = offset;
4643
4644   tu_dispatch(cmd_buffer, &info);
4645}
4646
VKAPI_ATTR void VKAPI_CALL
tu_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
                     const VkSubpassEndInfoKHR *pSubpassEndInfo)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);

   /* record the tile store commands into their own CS before ending */
   tu6_emit_tile_store(cmd_buffer, &cmd_buffer->tile_store_cs);

   tu_cs_end(&cmd_buffer->draw_cs);
   tu_cs_end(&cmd_buffer->tile_store_cs);
   tu_cs_end(&cmd_buffer->draw_epilogue_cs);

   cmd_buffer->trace_renderpass_end = u_trace_end_iterator(&cmd_buffer->trace);

   /* execute the recorded renderpass either directly (sysmem) or tiled
    * (gmem)
    */
   if (use_sysmem_rendering(cmd_buffer))
      tu_cmd_render_sysmem(cmd_buffer);
   else
      tu_cmd_render_tiles(cmd_buffer);

   /* Outside of renderpasses we assume all draw states are disabled. We do
    * this outside the draw CS for the normal case where 3d gmem stores aren't
    * used.
    */
   tu_disable_draw_states(cmd_buffer, &cmd_buffer->cs);

   /* discard draw_cs and draw_epilogue_cs entries now that the tiles are
      rendered */
   tu_cs_discard_entries(&cmd_buffer->draw_cs);
   tu_cs_begin(&cmd_buffer->draw_cs);
   tu_cs_discard_entries(&cmd_buffer->tile_store_cs);
   tu_cs_begin(&cmd_buffer->tile_store_cs);
   tu_cs_discard_entries(&cmd_buffer->draw_epilogue_cs);
   tu_cs_begin(&cmd_buffer->draw_epilogue_cs);

   /* carry renderpass-scoped pending flushes over to the outside-renderpass
    * cache state before applying the pass's end barrier
    */
   cmd_buffer->state.cache.pending_flush_bits |=
      cmd_buffer->state.renderpass_cache.pending_flush_bits;
   tu_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier, true);

   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);

   /* clear all renderpass-scoped state */
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass = NULL;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.attachments = NULL;
   cmd_buffer->state.has_tess = false;
   cmd_buffer->state.has_subpass_predication = false;
   cmd_buffer->state.disable_gmem = false;

   /* LRZ is not valid next time we use it */
   cmd_buffer->state.lrz.valid = false;
   cmd_buffer->state.dirty |= TU_CMD_DIRTY_LRZ;
}
4699
/* Barrier parameters shared by the pipeline-barrier and wait-events paths
 * (both funnel into tu_barrier()).
 */
struct tu_barrier_info
{
   /* events to wait on before the barrier; presumably 0/NULL when invoked
    * from vkCmdPipelineBarrier -- set by callers outside this view
    */
   uint32_t eventCount;
   const VkEvent *pEvents;
   VkPipelineStageFlags srcStageMask;
   VkPipelineStageFlags dstStageMask;
};
4707
/* Shared implementation of vkCmdPipelineBarrier and vkCmdWaitEvents.
 *
 * Accumulates the src/dst access masks from all three barrier-struct arrays,
 * translates them into internal cache-access flags, records the required
 * flushes/invalidates in the active cache state, and finally emits a
 * CP_WAIT_REG_MEM for each event in info->pEvents.  Inside a render pass the
 * commands go to draw_cs and the renderpass cache state; otherwise to the
 * main cs and cache.
 */
static void
tu_barrier(struct tu_cmd_buffer *cmd,
           uint32_t memoryBarrierCount,
           const VkMemoryBarrier *pMemoryBarriers,
           uint32_t bufferMemoryBarrierCount,
           const VkBufferMemoryBarrier *pBufferMemoryBarriers,
           uint32_t imageMemoryBarrierCount,
           const VkImageMemoryBarrier *pImageMemoryBarriers,
           const struct tu_barrier_info *info)
{
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
   VkAccessFlags srcAccessMask = 0;
   VkAccessFlags dstAccessMask = 0;

   if (cmd->state.pass) {
      const VkPipelineStageFlags framebuffer_space_stages =
         VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
         VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;

      /* We cannot have non-by-region "fb-space to fb-space" barriers.
       *
       * From the Vulkan 1.2.185 spec, section 7.6.1 "Subpass Self-dependency":
       *
       *    If the source and destination stage masks both include
       *    framebuffer-space stages, then dependencyFlags must include
       *    VK_DEPENDENCY_BY_REGION_BIT.
       *    [...]
       *    Each of the synchronization scopes and access scopes of a
       *    vkCmdPipelineBarrier2KHR or vkCmdPipelineBarrier command inside
       *    a render pass instance must be a subset of the scopes of one of
       *    the self-dependencies for the current subpass.
       *
       *    If the self-dependency has VK_DEPENDENCY_BY_REGION_BIT or
       *    VK_DEPENDENCY_VIEW_LOCAL_BIT set, then so must the pipeline barrier.
       *
       * By-region barriers are ok for gmem. All other barriers would involve
       * vtx stages which are NOT ok for gmem rendering.
       * See dep_invalid_for_gmem().
       */
      if ((info->srcStageMask & ~framebuffer_space_stages) ||
          (info->dstStageMask & ~framebuffer_space_stages)) {
         cmd->state.disable_gmem = true;
      }
   }

   /* Union the access masks from every barrier struct; the hardware flushes
    * are derived from the combined masks, not per-barrier.
    */
   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
      srcAccessMask |= pMemoryBarriers[i].srcAccessMask;
      dstAccessMask |= pMemoryBarriers[i].dstAccessMask;
   }

   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
      srcAccessMask |= pBufferMemoryBarriers[i].srcAccessMask;
      dstAccessMask |= pBufferMemoryBarriers[i].dstAccessMask;
   }

   enum tu_cmd_access_mask src_flags = 0;
   enum tu_cmd_access_mask dst_flags = 0;

   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
      VkImageLayout old_layout = pImageMemoryBarriers[i].oldLayout;
      if (old_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
         /* The underlying memory for this image may have been used earlier
          * within the same queue submission for a different image, which
          * means that there may be old, stale cache entries which are in the
          * "wrong" location, which could cause problems later after writing
          * to the image. We don't want these entries being flushed later and
          * overwriting the actual image, so we need to flush the CCU.
          *
          * NOTE(review): only the color CCU is flagged here — confirm
          * whether depth-format images transitioning from UNDEFINED also
          * need a depth-CCU incoherent-write flush.
          */
         src_flags |= TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE;
      }
      srcAccessMask |= pImageMemoryBarriers[i].srcAccessMask;
      dstAccessMask |= pImageMemoryBarriers[i].dstAccessMask;
   }

   /* Inside a renderpass, we don't know yet whether we'll be using sysmem
    * so we have to use the sysmem flushes.
    */
   bool gmem = cmd->state.ccu_state == TU_CMD_CCU_GMEM &&
      !cmd->state.pass;
   src_flags |= vk2tu_access(srcAccessMask, gmem);
   dst_flags |= vk2tu_access(dstAccessMask, gmem);

   struct tu_cache_state *cache =
      cmd->state.pass  ? &cmd->state.renderpass_cache : &cmd->state.cache;
   tu_flush_for_access(cache, src_flags, dst_flags);

   enum tu_stage src_stage = vk2tu_src_stage(info->srcStageMask);
   enum tu_stage dst_stage = vk2tu_dst_stage(info->dstStageMask);
   tu_flush_for_stage(cache, src_stage, dst_stage);

   /* For vkCmdWaitEvents: poll each event's memory word until it equals 1
    * (the value written by vkCmdSetEvent).
    */
   for (uint32_t i = 0; i < info->eventCount; i++) {
      TU_FROM_HANDLE(tu_event, event, info->pEvents[i]);

      tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
      tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                     CP_WAIT_REG_MEM_0_POLL_MEMORY);
      tu_cs_emit_qw(cs, event->bo.iova); /* POLL_ADDR_LO/HI */
      tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(1));
      tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0u));
      tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(20));
   }
}
4812
4813VKAPI_ATTR void VKAPI_CALL
4814tu_CmdPipelineBarrier(VkCommandBuffer commandBuffer,
4815                      VkPipelineStageFlags srcStageMask,
4816                      VkPipelineStageFlags dstStageMask,
4817                      VkDependencyFlags dependencyFlags,
4818                      uint32_t memoryBarrierCount,
4819                      const VkMemoryBarrier *pMemoryBarriers,
4820                      uint32_t bufferMemoryBarrierCount,
4821                      const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4822                      uint32_t imageMemoryBarrierCount,
4823                      const VkImageMemoryBarrier *pImageMemoryBarriers)
4824{
4825   TU_FROM_HANDLE(tu_cmd_buffer, cmd_buffer, commandBuffer);
4826   struct tu_barrier_info info;
4827
4828   info.eventCount = 0;
4829   info.pEvents = NULL;
4830   info.srcStageMask = srcStageMask;
4831   info.dstStageMask = dstStageMask;
4832
4833   tu_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
4834              bufferMemoryBarrierCount, pBufferMemoryBarriers,
4835              imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4836}
4837
4838static void
4839write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
4840            VkPipelineStageFlags stageMask, unsigned value)
4841{
4842   struct tu_cs *cs = &cmd->cs;
4843
4844   /* vkCmdSetEvent/vkCmdResetEvent cannot be called inside a render pass */
4845   assert(!cmd->state.pass);
4846
4847   tu_emit_cache_flush(cmd, cs);
4848
4849   /* Flags that only require a top-of-pipe event. DrawIndirect parameters are
4850    * read by the CP, so the draw indirect stage counts as top-of-pipe too.
4851    */
4852   VkPipelineStageFlags top_of_pipe_flags =
4853      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
4854      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
4855
4856   if (!(stageMask & ~top_of_pipe_flags)) {
4857      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 3);
4858      tu_cs_emit_qw(cs, event->bo.iova); /* ADDR_LO/HI */
4859      tu_cs_emit(cs, value);
4860   } else {
4861      /* Use a RB_DONE_TS event to wait for everything to complete. */
4862      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
4863      tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS));
4864      tu_cs_emit_qw(cs, event->bo.iova);
4865      tu_cs_emit(cs, value);
4866   }
4867}
4868
4869VKAPI_ATTR void VKAPI_CALL
4870tu_CmdSetEvent(VkCommandBuffer commandBuffer,
4871               VkEvent _event,
4872               VkPipelineStageFlags stageMask)
4873{
4874   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4875   TU_FROM_HANDLE(tu_event, event, _event);
4876
4877   write_event(cmd, event, stageMask, 1);
4878}
4879
4880VKAPI_ATTR void VKAPI_CALL
4881tu_CmdResetEvent(VkCommandBuffer commandBuffer,
4882                 VkEvent _event,
4883                 VkPipelineStageFlags stageMask)
4884{
4885   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4886   TU_FROM_HANDLE(tu_event, event, _event);
4887
4888   write_event(cmd, event, stageMask, 0);
4889}
4890
4891VKAPI_ATTR void VKAPI_CALL
4892tu_CmdWaitEvents(VkCommandBuffer commandBuffer,
4893                 uint32_t eventCount,
4894                 const VkEvent *pEvents,
4895                 VkPipelineStageFlags srcStageMask,
4896                 VkPipelineStageFlags dstStageMask,
4897                 uint32_t memoryBarrierCount,
4898                 const VkMemoryBarrier *pMemoryBarriers,
4899                 uint32_t bufferMemoryBarrierCount,
4900                 const VkBufferMemoryBarrier *pBufferMemoryBarriers,
4901                 uint32_t imageMemoryBarrierCount,
4902                 const VkImageMemoryBarrier *pImageMemoryBarriers)
4903{
4904   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4905   struct tu_barrier_info info;
4906
4907   info.eventCount = eventCount;
4908   info.pEvents = pEvents;
4909   info.srcStageMask = srcStageMask;
4910   info.dstStageMask = dstStageMask;
4911
4912   tu_barrier(cmd, memoryBarrierCount, pMemoryBarriers,
4913              bufferMemoryBarrierCount, pBufferMemoryBarriers,
4914              imageMemoryBarrierCount, pImageMemoryBarriers, &info);
4915}
4916
4917VKAPI_ATTR void VKAPI_CALL
4918tu_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
4919{
4920   /* No-op */
4921}
4922
4923
/* vkCmdBeginConditionalRenderingEXT: start predicating subsequent draws on
 * the 32-bit value at the given buffer offset (non-zero means draw, unless
 * VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT flips the test).
 */
VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
                                   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);

   cmd->state.predication_active = true;
   if (cmd->state.pass)
      cmd->state.has_subpass_predication = true;

   /* Inside a render pass the packets go into draw_cs, otherwise into the
    * main command stream.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
   tu_cs_emit(cs, 1);

   /* Wait for any writes to the predicate to land */
   if (cmd->state.pass)
      tu_emit_cache_flush_renderpass(cmd, cs);
   else
      tu_emit_cache_flush(cmd, cs);

   TU_FROM_HANDLE(tu_buffer, buf, pConditionalRenderingBegin->buffer);
   uint64_t iova = tu_buffer_iova(buf) + pConditionalRenderingBegin->offset;

   /* qcom doesn't support 32-bit reference values, only 64-bit, but Vulkan
    * mandates 32-bit comparisons. Our workaround is to copy the reference
    * value to the low 32-bits of a location where the high 32 bits are known
    * to be 0 and then compare that.
    */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   tu_cs_emit(cs, 0); /* CP_MEM_TO_MEM control dword */
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
   tu_cs_emit_qw(cs, iova);

   /* Make sure the copy above has landed before CP_DRAW_PRED_SET reads the
    * predicate location.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);

   bool inv = pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_SET, 3);
   tu_cs_emit(cs, CP_DRAW_PRED_SET_0_SRC(PRED_SRC_MEM) |
                  CP_DRAW_PRED_SET_0_TEST(inv ? EQ_0_PASS : NE_0_PASS));
   tu_cs_emit_qw(cs, global_iova(cmd, predicate));
}
4967
4968VKAPI_ATTR void VKAPI_CALL
4969tu_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
4970{
4971   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
4972
4973   cmd->state.predication_active = false;
4974
4975   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
4976
4977   tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_GLOBAL, 1);
4978   tu_cs_emit(cs, 0);
4979}
4980
4981