/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_meta.h"
#include "radv_private.h"
#include "radv_radeon_winsys.h"
#include "radv_shader.h"
#include "sid.h"
#include "vk_format.h"
#include "vk_util.h"

#include "ac_debug.h"

#include "util/fast_idiv_by_const.h"

enum {
   RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
   RADV_PREFETCH_VS = (1 << 1),
   RADV_PREFETCH_TCS = (1 << 2),
   RADV_PREFETCH_TES = (1 << 3),
   RADV_PREFETCH_GS = (1 << 4),
   RADV_PREFETCH_PS = (1 << 5),
   RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES |
                            RADV_PREFETCH_GS | RADV_PREFETCH_PS)
};

enum {
   RADV_RT_STAGE_BITS = (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
                         VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
                         VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR)
};

static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer,
                                         struct radv_image *image, VkImageLayout src_layout,
                                         bool src_render_loop, VkImageLayout dst_layout,
                                         bool dst_render_loop, uint32_t src_family,
                                         uint32_t dst_family, const VkImageSubresourceRange *range,
                                         struct radv_sample_locations_state *sample_locs);

static void radv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size);

const struct radv_dynamic_state default_dynamic_state = {
   .viewport =
      {
         .count = 0,
      },
   .scissor =
      {
         .count = 0,
      },
   .line_width = 1.0f,
   .depth_bias =
      {
         .bias = 0.0f,
         .clamp = 0.0f,
         .slope = 0.0f,
      },
   .blend_constants = {0.0f, 0.0f, 0.0f, 0.0f},
   .depth_bounds =
      {
         .min = 0.0f,
         .max = 1.0f,
      },
   .stencil_compare_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_write_mask =
      {
         .front = ~0u,
         .back = ~0u,
      },
   .stencil_reference =
      {
         .front = 0u,
         .back = 0u,
      },
   .line_stipple =
      {
         .factor = 0u,
         .pattern = 0u,
      },
   .cull_mode = 0u,
   .front_face = 0u,
   .primitive_topology = 0u,
   .fragment_shading_rate =
      {
         .size = {1u, 1u},
         .combiner_ops = {VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
                          VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR},
      },
   .depth_bias_enable = 0u,
   .primitive_restart_enable = 0u,
   .rasterizer_discard_enable = 0u,
   .logic_op = 0u,
   .color_write_enable = 0xffffffffu,
};

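/**
 * Copy the dynamic state selected by src->mask from a pipeline into the
 * command buffer, but only write fields whose value actually changes, and
 * flag each changed field in cmd_buffer->state.dirty so the next draw only
 * re-emits the affected registers.
 */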
static void
radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
{
   struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
   uint64_t copy_mask = src->mask;
   uint64_t dest_mask = 0;

   dest->discard_rectangle.count = src->discard_rectangle.count;
   dest->sample_location.count = src->sample_location.count;

   if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
      if (dest->viewport.count != src->viewport.count) {
         dest->viewport.count = src->viewport.count;
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }

      if (memcmp(&dest->viewport.viewports, &src->viewport.viewports,
                 src->viewport.count * sizeof(VkViewport))) {
         typed_memcpy(dest->viewport.viewports, src->viewport.viewports, src->viewport.count);
         typed_memcpy(dest->viewport.xform, src->viewport.xform, src->viewport.count);
         dest_mask |= RADV_DYNAMIC_VIEWPORT;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SCISSOR) {
      if (dest->scissor.count != src->scissor.count) {
         dest->scissor.count = src->scissor.count;
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }

      if (memcmp(&dest->scissor.scissors, &src->scissor.scissors,
                 src->scissor.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->scissor.scissors, src->scissor.scissors, src->scissor.count);
         dest_mask |= RADV_DYNAMIC_SCISSOR;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_WIDTH) {
      if (dest->line_width != src->line_width) {
         dest->line_width = src->line_width;
         dest_mask |= RADV_DYNAMIC_LINE_WIDTH;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS) {
      if (memcmp(&dest->depth_bias, &src->depth_bias, sizeof(src->depth_bias))) {
         dest->depth_bias = src->depth_bias;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
      if (memcmp(&dest->blend_constants, &src->blend_constants, sizeof(src->blend_constants))) {
         typed_memcpy(dest->blend_constants, src->blend_constants, 4);
         dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS) {
      if (memcmp(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds))) {
         dest->depth_bounds = src->depth_bounds;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
      if (memcmp(&dest->stencil_compare_mask, &src->stencil_compare_mask,
                 sizeof(src->stencil_compare_mask))) {
         dest->stencil_compare_mask = src->stencil_compare_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
      if (memcmp(&dest->stencil_write_mask, &src->stencil_write_mask,
                 sizeof(src->stencil_write_mask))) {
         dest->stencil_write_mask = src->stencil_write_mask;
         dest_mask |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_REFERENCE) {
      if (memcmp(&dest->stencil_reference, &src->stencil_reference,
                 sizeof(src->stencil_reference))) {
         dest->stencil_reference = src->stencil_reference;
         dest_mask |= RADV_DYNAMIC_STENCIL_REFERENCE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
      if (memcmp(&dest->discard_rectangle.rectangles, &src->discard_rectangle.rectangles,
                 src->discard_rectangle.count * sizeof(VkRect2D))) {
         typed_memcpy(dest->discard_rectangle.rectangles, src->discard_rectangle.rectangles,
                      src->discard_rectangle.count);
         dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
      if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
          dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
          dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
          memcmp(&dest->sample_location.locations, &src->sample_location.locations,
                 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
         dest->sample_location.per_pixel = src->sample_location.per_pixel;
         dest->sample_location.grid_size = src->sample_location.grid_size;
         typed_memcpy(dest->sample_location.locations, src->sample_location.locations,
                      src->sample_location.count);
         dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LINE_STIPPLE) {
      if (memcmp(&dest->line_stipple, &src->line_stipple, sizeof(src->line_stipple))) {
         dest->line_stipple = src->line_stipple;
         dest_mask |= RADV_DYNAMIC_LINE_STIPPLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_CULL_MODE) {
      if (dest->cull_mode != src->cull_mode) {
         dest->cull_mode = src->cull_mode;
         dest_mask |= RADV_DYNAMIC_CULL_MODE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRONT_FACE) {
      if (dest->front_face != src->front_face) {
         dest->front_face = src->front_face;
         dest_mask |= RADV_DYNAMIC_FRONT_FACE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
      if (dest->primitive_topology != src->primitive_topology) {
         dest->primitive_topology = src->primitive_topology;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
      if (dest->depth_test_enable != src->depth_test_enable) {
         dest->depth_test_enable = src->depth_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
      if (dest->depth_write_enable != src->depth_write_enable) {
         dest->depth_write_enable = src->depth_write_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
      if (dest->depth_compare_op != src->depth_compare_op) {
         dest->depth_compare_op = src->depth_compare_op;
         dest_mask |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
      if (dest->depth_bounds_test_enable != src->depth_bounds_test_enable) {
         dest->depth_bounds_test_enable = src->depth_bounds_test_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
      if (dest->stencil_test_enable != src->stencil_test_enable) {
         dest->stencil_test_enable = src->stencil_test_enable;
         dest_mask |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_STENCIL_OP) {
      if (memcmp(&dest->stencil_op, &src->stencil_op, sizeof(src->stencil_op))) {
         dest->stencil_op = src->stencil_op;
         dest_mask |= RADV_DYNAMIC_STENCIL_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
      if (memcmp(&dest->fragment_shading_rate, &src->fragment_shading_rate,
                 sizeof(src->fragment_shading_rate))) {
         dest->fragment_shading_rate = src->fragment_shading_rate;
         dest_mask |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
      if (dest->depth_bias_enable != src->depth_bias_enable) {
         dest->depth_bias_enable = src->depth_bias_enable;
         dest_mask |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
      if (dest->primitive_restart_enable != src->primitive_restart_enable) {
         dest->primitive_restart_enable = src->primitive_restart_enable;
         dest_mask |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
      if (dest->rasterizer_discard_enable != src->rasterizer_discard_enable) {
         dest->rasterizer_discard_enable = src->rasterizer_discard_enable;
         dest_mask |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
      }
   }

   if (copy_mask & RADV_DYNAMIC_LOGIC_OP) {
      if (dest->logic_op != src->logic_op) {
         dest->logic_op = src->logic_op;
         dest_mask |= RADV_DYNAMIC_LOGIC_OP;
      }
   }

   if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
      if (dest->color_write_enable != src->color_write_enable) {
         dest->color_write_enable = src->color_write_enable;
         dest_mask |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
      }
   }

   cmd_buffer->state.dirty |= dest_mask;
}

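/**
 * Cache the streamout strides (in dwords) and the enabled-buffer mask from
 * the pipeline's streamout shader; skipped when the device uses NGG
 * streamout, which is programmed through a different path.
 */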
static void
radv_bind_streamout_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radv_shader_info *info;

   if (!pipeline->streamout_shader || cmd_buffer->device->physical_device->use_ngg_streamout)
      return;

   info = &pipeline->streamout_shader->info;
   for (int i = 0; i < MAX_SO_BUFFERS; i++)
      so->stride_in_dw[i] = info->so.strides[i];

   so->enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
}

bool
radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
          cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
}

enum ring_type
radv_queue_family_to_ring(int f)
{
   switch (f) {
   case RADV_QUEUE_GENERAL:
      return RING_GFX;
   case RADV_QUEUE_COMPUTE:
      return RING_COMPUTE;
   case RADV_QUEUE_TRANSFER:
      return RING_DMA;
   default:
      unreachable("Unknown queue family");
   }
}

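/**
 * Emit a WRITE_DATA packet that makes the selected engine (e.g. V_370_PFP
 * or V_370_ME) write 'count' dwords from 'data' to GPU address 'va'.
 */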
static void
radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                            unsigned count, const uint32_t *data)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);

   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(engine_sel));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);
   radeon_emit_array(cs, data, count);
}

static void
radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va,
                     unsigned size)
{
   uint32_t *zeroes = alloca(size);
   memset(zeroes, 0, size);
   radv_emit_write_data_packet(cmd_buffer, engine_sel, va, size / 4, zeroes);
}

static void
radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   list_del(&cmd_buffer->pool_link);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   if (cmd_buffer->upload.upload_bo)
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, cmd_buffer->upload.upload_bo);

   if (cmd_buffer->cs)
      cmd_buffer->device->ws->cs_destroy(cmd_buffer->cs);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
      vk_object_base_finish(&cmd_buffer->descriptors[i].push_set.set.base);
   }

   vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);

   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
}

static VkResult
radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool,
                       VkCommandBufferLevel level, VkCommandBuffer *pCommandBuffer)
{
   struct radv_cmd_buffer *cmd_buffer;
   unsigned ring;
   cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result =
      vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
   if (result != VK_SUCCESS) {
      /* cmd_buffer->pool is not initialized yet at this point. */
      vk_free(&pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer->device = device;
   cmd_buffer->pool = pool;
   cmd_buffer->level = level;

   list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
   cmd_buffer->queue_family_index = pool->queue_family_index;

   ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

   cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
   if (!cmd_buffer->cs) {
      radv_destroy_cmd_buffer(cmd_buffer);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base,
                       VK_OBJECT_TYPE_DESCRIPTOR_SET);

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
      vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base,
                          VK_OBJECT_TYPE_DESCRIPTOR_SET);

   *pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);

   list_inithead(&cmd_buffer->upload.list);

   return VK_SUCCESS;
}

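/**
 * Return a command buffer to the initial state: free all upload BOs, clear
 * the cached descriptor and scratch state, and re-allocate (and zero) the
 * small fence area plus, on GFX9, the per-render-backend EOP bug scratch
 * used by cache-flush packets on the graphics queue.
 */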
static VkResult
radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
{
   vk_command_buffer_reset(&cmd_buffer->vk);

   cmd_buffer->device->ws->cs_reset(cmd_buffer->cs);

   list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list)
   {
      cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo);
      list_del(&up->list);
      free(up);
   }

   cmd_buffer->push_constant_stages = 0;
   cmd_buffer->scratch_size_per_wave_needed = 0;
   cmd_buffer->scratch_waves_wanted = 0;
   cmd_buffer->compute_scratch_size_per_wave_needed = 0;
   cmd_buffer->compute_scratch_waves_wanted = 0;
   cmd_buffer->esgs_ring_size_needed = 0;
   cmd_buffer->gsvs_ring_size_needed = 0;
   cmd_buffer->tess_rings_needed = false;
   cmd_buffer->gds_needed = false;
   cmd_buffer->gds_oa_needed = false;
   cmd_buffer->sample_positions_needed = false;

   if (cmd_buffer->upload.upload_bo)
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
   cmd_buffer->upload.offset = 0;

   cmd_buffer->record_result = VK_SUCCESS;

   memset(cmd_buffer->vertex_bindings, 0, sizeof(cmd_buffer->vertex_bindings));

   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
      cmd_buffer->descriptors[i].dirty = 0;
      cmd_buffer->descriptors[i].valid = 0;
      cmd_buffer->descriptors[i].push_dirty = false;
   }

   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
       cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
      unsigned num_db = cmd_buffer->device->physical_device->rad_info.max_render_backends;
      unsigned fence_offset, eop_bug_offset;
      void *fence_ptr;

      radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
      memset(fence_ptr, 0, 8);

      cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
      cmd_buffer->gfx9_fence_va += fence_offset;

      radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);

      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
         /* Allocate a buffer for the EOP bug on GFX9. */
         radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
         memset(fence_ptr, 0, 16 * num_db);
         cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
         cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;

         radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
      }
   }

   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_INITIAL;

   return cmd_buffer->record_result;
}

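/**
 * Replace the upload buffer with one large enough for 'min_needed' bytes
 * (at least 16 KiB, and at least double the old size). The old BO is kept
 * on upload.list because already-recorded packets may still reference it;
 * it is only freed on reset or destroy.
 */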
static bool
radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
{
   uint64_t new_size;
   struct radeon_winsys_bo *bo = NULL;
   struct radv_cmd_buffer_upload *upload;
   struct radv_device *device = cmd_buffer->device;

   new_size = MAX2(min_needed, 16 * 1024);
   new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);

   VkResult result =
      device->ws->buffer_create(device->ws, new_size, 4096, device->ws->cs_domain(device->ws),
                                RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                   RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
                                RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, &bo);

   if (result != VK_SUCCESS) {
      cmd_buffer->record_result = result;
      return false;
   }

   radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
   if (cmd_buffer->upload.upload_bo) {
      upload = malloc(sizeof(*upload));

      if (!upload) {
         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
         device->ws->buffer_destroy(device->ws, bo);
         return false;
      }

      memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
      list_add(&upload->list, &cmd_buffer->upload.list);
   }

   cmd_buffer->upload.upload_bo = bo;
   cmd_buffer->upload.size = new_size;
   cmd_buffer->upload.offset = 0;
   cmd_buffer->upload.map = device->ws->buffer_map(cmd_buffer->upload.upload_bo);

   if (!cmd_buffer->upload.map) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
      return false;
   }

   return true;
}

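/**
 * Suballocate 'size' bytes (which must be a multiple of 4) from the upload
 * buffer, growing it if necessary, and return both the buffer offset and a
 * CPU pointer to the allocation.
 */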
bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size,
                             unsigned *out_offset, void **ptr)
{
   assert(size % 4 == 0);

   struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;

   /* Align to the scalar cache line size if it results in this allocation
    * being placed in fewer of them.
    */
   unsigned offset = cmd_buffer->upload.offset;
   unsigned line_size = rad_info->chip_class >= GFX10 ? 64 : 32;
   unsigned gap = align(offset, line_size) - offset;
   if ((size & (line_size - 1)) > gap)
      offset = align(offset, line_size);

   if (offset + size > cmd_buffer->upload.size) {
      if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
         return false;
      offset = 0;
   }

   *out_offset = offset;
   *ptr = cmd_buffer->upload.map + offset;

   cmd_buffer->upload.offset = offset + size;
   return true;
}

bool
radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data,
                            unsigned *out_offset)
{
   uint8_t *ptr;

   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
      return false;

   if (ptr)
      memcpy(ptr, data, size);

   return true;
}

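/**
 * Write an incrementing trace ID into the trace BO and emit a matching
 * NOP-encoded trace point, so a hang can be narrowed down to the last
 * packet that actually executed.
 */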
void
radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
      va += 4;

   ++cmd_buffer->state.trace_id;
   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id);

   radeon_check_space(cmd_buffer->device->ws, cs, 2);

   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
   radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
}

static void
radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags)
{
   if (unlikely(cmd_buffer->device->thread_trace.bo)) {
      radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
   }

   if (cmd_buffer->device->instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
      enum rgp_flush_bits sqtt_flush_bits = 0;
      assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));

      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);

      /* Force wait for graphics or compute engines to be idle. */
      si_cs_emit_cache_flush(cmd_buffer->cs,
                             cmd_buffer->device->physical_device->rad_info.chip_class,
                             &cmd_buffer->gfx9_fence_idx, cmd_buffer->gfx9_fence_va,
                             radv_cmd_buffer_uses_mec(cmd_buffer), flags, &sqtt_flush_bits,
                             cmd_buffer->gfx9_eop_bug_va);
   }

   if (unlikely(cmd_buffer->device->trace_bo))
      radv_cmd_buffer_trace_emit(cmd_buffer);
}

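/**
 * Record the bound pipeline pointer in the trace BO (at offset 8 for the
 * graphics ring, 16 for compute) so it can be recovered from a hang report.
 */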
static void
radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   struct radv_device *device = cmd_buffer->device;
   enum ring_type ring;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);

   ring = radv_queue_family_to_ring(cmd_buffer->queue_family_index);

   switch (ring) {
   case RING_GFX:
      va += 8;
      break;
   case RING_COMPUTE:
      va += 16;
      break;
   default:
      assert(!"invalid ring type");
   }

   uint64_t pipeline_address = (uintptr_t)pipeline;
   data[0] = pipeline_address;
   data[1] = pipeline_address >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

static void
radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
{
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[2];
   uint64_t va;

   va = radv_buffer_get_va(device->trace_bo);
   va += 24;

   data[0] = vb_ptr;
   data[1] = vb_ptr >> 32;

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, 2, data);
}

void
radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
                        struct radv_descriptor_set *set, unsigned idx)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->sets[idx] = set;

   descriptors_state->valid |= (1u << idx); /* active descriptors */
   descriptors_state->dirty |= (1u << idx);
}

static void
radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
{
   struct radv_descriptor_state *descriptors_state =
      radv_get_descriptors_state(cmd_buffer, bind_point);
   struct radv_device *device = cmd_buffer->device;
   uint32_t data[MAX_SETS * 2] = {0};
   uint64_t va;
   va = radv_buffer_get_va(device->trace_bo) + 32;

   u_foreach_bit(i, descriptors_state->valid)
   {
      struct radv_descriptor_set *set = descriptors_state->sets[i];
      data[i * 2] = (uint64_t)(uintptr_t)set;
      data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
   }

   radv_emit_write_data_packet(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data);
}

struct radv_userdata_info *
radv_lookup_user_sgpr(struct radv_pipeline *pipeline, gl_shader_stage stage, int idx)
{
   struct radv_shader_variant *shader = radv_get_shader(pipeline, stage);
   return &shader->info.user_sgprs_locs.shader_data[idx];
}

static void
radv_emit_userdata_address(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                           gl_shader_stage stage, int idx, uint64_t va)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   assert(loc->num_sgprs == 1);

   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
                            false);
}

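/**
 * Re-emit the user SGPR pointers for every descriptor set that is both
 * dirty and valid and that the given stage actually uses, batching
 * consecutive sets into a single shader-pointer packet.
 */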
static void
radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                              struct radv_descriptor_state *descriptors_state,
                              gl_shader_stage stage)
{
   struct radv_device *device = cmd_buffer->device;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sh_base = pipeline->user_data_0[stage];
   struct radv_userdata_locations *locs = &pipeline->shaders[stage]->info.user_sgprs_locs;
   unsigned mask = locs->descriptor_sets_enabled;

   mask &= descriptors_state->dirty & descriptors_state->valid;

   while (mask) {
      int start, count;

      u_bit_scan_consecutive_range(&mask, &start, &count);

      struct radv_userdata_info *loc = &locs->descriptor_sets[start];
      unsigned sh_offset = sh_base + loc->sgpr_idx * 4;

      radv_emit_shader_pointer_head(cs, sh_offset, count, true);
      for (int i = 0; i < count; i++) {
         struct radv_descriptor_set *set = descriptors_state->sets[start + i];

         radv_emit_shader_pointer_body(device, cs, set->header.va, true);
      }
   }
}

/**
 * Convert the user sample locations to hardware sample locations (the values
 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
 */
static void
radv_convert_user_sample_locs(struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
                              VkOffset2D *sample_locs)
{
   uint32_t x_offset = x % state->grid_size.width;
   uint32_t y_offset = y % state->grid_size.height;
   uint32_t num_samples = (uint32_t)state->per_pixel;
   VkSampleLocationEXT *user_locs;
   uint32_t pixel_offset;

   pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;

   assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
   user_locs = &state->locations[pixel_offset];

   for (uint32_t i = 0; i < num_samples; i++) {
      float shifted_pos_x = user_locs[i].x - 0.5;
      float shifted_pos_y = user_locs[i].y - 0.5;

      int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
      int32_t scaled_pos_y = floorf(shifted_pos_y * 16);

      sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
      sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
   }
}

/**
 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
 * locations.
 */
static void
radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs,
                               uint32_t *sample_locs_pixel)
{
   for (uint32_t i = 0; i < num_samples; i++) {
      uint32_t sample_reg_idx = i / 4;
      uint32_t sample_loc_idx = i % 4;
      int32_t pos_x = sample_locs[i].x;
      int32_t pos_y = sample_locs[i].y;

      uint32_t shift_x = 8 * sample_loc_idx;
      uint32_t shift_y = shift_x + 4;

      sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
      sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
   }
}

/**
 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
 * sample locations.
 */
static uint64_t
radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs,
                               uint32_t num_samples)
{
   uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
   uint32_t sample_mask = num_samples - 1;
   uint32_t *distances = alloca(num_samples * sizeof(*distances));
   uint64_t centroid_priority = 0;

   /* Compute the distances from center for each sample. */
   for (int i = 0; i < num_samples; i++) {
      distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
   }

   /* Compute the centroid priorities by looking at the distances array. */
   for (int i = 0; i < num_samples; i++) {
      uint32_t min_idx = 0;

      for (int j = 1; j < num_samples; j++) {
         if (distances[j] < distances[min_idx])
            min_idx = j;
      }

      centroid_priorities[i] = min_idx;
      distances[min_idx] = 0xffffffff;
   }

   /* Compute the final centroid priority. */
   for (int i = 0; i < 8; i++) {
      centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
   }

   return centroid_priority << 32 | centroid_priority;
}

/**
 * Emit the sample locations that are specified with VK_EXT_sample_locations.
 */
static void
radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_sample_locations_state *sample_location = &cmd_buffer->state.dynamic.sample_location;
   uint32_t num_samples = (uint32_t)sample_location->per_pixel;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t sample_locs_pixel[4][2] = {0};
   VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
   uint32_t max_sample_dist = 0;
   uint64_t centroid_priority;

   if (!cmd_buffer->state.dynamic.sample_location.count)
      return;

   /* Convert the user sample locations to hardware sample locations. */
   radv_convert_user_sample_locs(sample_location, 0, 0, sample_locs[0]);
   radv_convert_user_sample_locs(sample_location, 1, 0, sample_locs[1]);
   radv_convert_user_sample_locs(sample_location, 0, 1, sample_locs[2]);
   radv_convert_user_sample_locs(sample_location, 1, 1, sample_locs[3]);

   /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
   for (uint32_t i = 0; i < 4; i++) {
      radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
   }

   /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
   centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);

   /* Compute the maximum sample distance from the specified locations. */
   for (unsigned i = 0; i < 4; ++i) {
      for (uint32_t j = 0; j < num_samples; j++) {
         VkOffset2D offset = sample_locs[i][j];
         max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
      }
   }

   /* Emit the specified user sample locations. */
   switch (num_samples) {
   case 2:
   case 4:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      break;
   case 8:
      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                             sample_locs_pixel[0][0]);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0,
                             sample_locs_pixel[1][0]);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0,
                             sample_locs_pixel[2][0]);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0,
                             sample_locs_pixel[3][0]);
      radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1,
                             sample_locs_pixel[0][1]);
      radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1,
                             sample_locs_pixel[1][1]);
      radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1,
                             sample_locs_pixel[2][1]);
      radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1,
                             sample_locs_pixel[3][1]);
      break;
   default:
      unreachable("invalid number of samples");
   }

   /* Emit the maximum sample distance and the centroid priority. */
   radeon_set_context_reg_rmw(cs, R_028BE0_PA_SC_AA_CONFIG,
                              S_028BE0_MAX_SAMPLE_DIST(max_sample_dist), ~C_028BE0_MAX_SAMPLE_DIST);

   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                             gl_shader_stage stage, int idx, uint32_t *values)
{
   struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx);
   uint32_t base_reg = pipeline->user_data_0[stage];
   if (loc->sgpr_idx == -1)
      return;

   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 2 + loc->num_sgprs);

   radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
   radeon_emit_array(cmd_buffer->cs, values, loc->num_sgprs);
}

static void
radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   int num_samples = pipeline->graphics.ms.num_samples;
   struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

   if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.needs_sample_positions)
      cmd_buffer->sample_positions_needed = true;

   if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
      return;

   radv_emit_default_sample_locations(cmd_buffer->cs, num_samples);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_update_binning_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
{
   const struct radv_pipeline *old_pipeline = cmd_buffer->state.emitted_pipeline;

   if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
      return;

   if (old_pipeline &&
       old_pipeline->graphics.binning.pa_sc_binner_cntl_0 ==
          pipeline->graphics.binning.pa_sc_binner_cntl_0)
      return;

   bool binning_flush = false;
   if (cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA12 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_VEGA20 ||
       cmd_buffer->device->physical_device->rad_info.family == CHIP_RAVEN2 ||
       cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
      binning_flush = !old_pipeline ||
                      G_028C44_BINNING_MODE(old_pipeline->graphics.binning.pa_sc_binner_cntl_0) !=
                         G_028C44_BINNING_MODE(pipeline->graphics.binning.pa_sc_binner_cntl_0);
   }

   radeon_set_context_reg(cmd_buffer->cs, R_028C44_PA_SC_BINNER_CNTL_0,
                          pipeline->graphics.binning.pa_sc_binner_cntl_0 |
                             S_028C44_FLUSH_ON_BINNING_TRANSITION(!!binning_flush));

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

static void
radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *shader)
{
   uint64_t va;

   if (!shader)
      return;

   va = radv_shader_variant_get_va(shader);

   si_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
}

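/**
 * Prefetch shader binaries and vertex buffer descriptors into L2 through CP
 * DMA. When vertex_stage_only is set, only the VS binary and the VBO
 * descriptors are prefetched so the first draw can start as soon as
 * possible; the remaining stages are picked up by a later call.
 */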
static void
radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
                      bool vertex_stage_only)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t mask = state->prefetch_L2_mask;

   if (vertex_stage_only) {
      /* Fast prefetch path for starting draws as soon as possible.
       */
      mask = state->prefetch_L2_mask & (RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS);
   }

   if (mask & RADV_PREFETCH_VS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_VERTEX]);

   if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
      si_cp_dma_prefetch(cmd_buffer, state->vb_va, pipeline->vb_desc_alloc_size);

   if (mask & RADV_PREFETCH_TCS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_CTRL]);

   if (mask & RADV_PREFETCH_TES)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_TESS_EVAL]);

   if (mask & RADV_PREFETCH_GS) {
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_GEOMETRY]);
      if (radv_pipeline_has_gs_copy_shader(pipeline))
         radv_emit_shader_prefetch(cmd_buffer, pipeline->gs_copy_shader);
   }

   if (mask & RADV_PREFETCH_PS)
      radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_FRAGMENT]);

   state->prefetch_L2_mask &= ~mask;
}

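/**
 * Program the RB+ down-conversion state (SX_PS_DOWNCONVERT,
 * SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL) from the color attachment
 * formats, the pipeline's SPI export formats and the color write mask.
 */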
11217ec681f3Smrgstatic void
11227ec681f3Smrgradv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
11237ec681f3Smrg{
11247ec681f3Smrg   if (!cmd_buffer->device->physical_device->rad_info.rbplus_allowed)
11257ec681f3Smrg      return;
11267ec681f3Smrg
11277ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
11287ec681f3Smrg   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
11297ec681f3Smrg
11307ec681f3Smrg   unsigned sx_ps_downconvert = 0;
11317ec681f3Smrg   unsigned sx_blend_opt_epsilon = 0;
11327ec681f3Smrg   unsigned sx_blend_opt_control = 0;
11337ec681f3Smrg
11347ec681f3Smrg   if (!cmd_buffer->state.attachments || !subpass)
11357ec681f3Smrg      return;
11367ec681f3Smrg
11377ec681f3Smrg   for (unsigned i = 0; i < subpass->color_count; ++i) {
11387ec681f3Smrg      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         /* We don't set the DISABLE bits, because the HW can't have holes
          * in the SPI color format list, so the export format for unused
          * attachments is set to 32-bit 1-component instead. */
11417ec681f3Smrg         sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
11427ec681f3Smrg         continue;
11437ec681f3Smrg      }
11447ec681f3Smrg
11457ec681f3Smrg      int idx = subpass->color_attachments[i].attachment;
11467ec681f3Smrg      struct radv_color_buffer_info *cb = &cmd_buffer->state.attachments[idx].cb;
11477ec681f3Smrg
11487ec681f3Smrg      unsigned format = G_028C70_FORMAT(cb->cb_color_info);
11497ec681f3Smrg      unsigned swap = G_028C70_COMP_SWAP(cb->cb_color_info);
11507ec681f3Smrg      uint32_t spi_format = (pipeline->graphics.col_format >> (i * 4)) & 0xf;
11517ec681f3Smrg      uint32_t colormask = (pipeline->graphics.cb_target_mask >> (i * 4)) & 0xf;
11527ec681f3Smrg
11537ec681f3Smrg      bool has_alpha, has_rgb;
11547ec681f3Smrg
      /* Determine whether the RGB and alpha channels are present. */
11567ec681f3Smrg      has_alpha = !G_028C74_FORCE_DST_ALPHA_1(cb->cb_color_attrib);
11577ec681f3Smrg
11587ec681f3Smrg      if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
11597ec681f3Smrg         has_rgb = !has_alpha;
11607ec681f3Smrg      else
11617ec681f3Smrg         has_rgb = true;
11627ec681f3Smrg
11637ec681f3Smrg      /* Check the colormask and export format. */
11647ec681f3Smrg      if (!(colormask & 0x7))
11657ec681f3Smrg         has_rgb = false;
11667ec681f3Smrg      if (!(colormask & 0x8))
11677ec681f3Smrg         has_alpha = false;
11687ec681f3Smrg
11697ec681f3Smrg      if (spi_format == V_028714_SPI_SHADER_ZERO) {
11707ec681f3Smrg         has_rgb = false;
11717ec681f3Smrg         has_alpha = false;
11727ec681f3Smrg      }
11737ec681f3Smrg
11747ec681f3Smrg      /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
11757ec681f3Smrg       * optimization, even though it has no alpha. */
11767ec681f3Smrg      if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
11777ec681f3Smrg         has_alpha = true;
11787ec681f3Smrg
11797ec681f3Smrg      /* Disable value checking for disabled channels. */
11807ec681f3Smrg      if (!has_rgb)
11817ec681f3Smrg         sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
11827ec681f3Smrg      if (!has_alpha)
11837ec681f3Smrg         sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
11847ec681f3Smrg
11857ec681f3Smrg      /* Enable down-conversion for 32bpp and smaller formats. */
11867ec681f3Smrg      switch (format) {
11877ec681f3Smrg      case V_028C70_COLOR_8:
11887ec681f3Smrg      case V_028C70_COLOR_8_8:
11897ec681f3Smrg      case V_028C70_COLOR_8_8_8_8:
11907ec681f3Smrg         /* For 1 and 2-channel formats, use the superset thereof. */
11917ec681f3Smrg         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
11927ec681f3Smrg             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
11937ec681f3Smrg             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
11947ec681f3Smrg            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
11957ec681f3Smrg            sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
11967ec681f3Smrg         }
11977ec681f3Smrg         break;
11987ec681f3Smrg
11997ec681f3Smrg      case V_028C70_COLOR_5_6_5:
12007ec681f3Smrg         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
12017ec681f3Smrg            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
12027ec681f3Smrg            sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
12037ec681f3Smrg         }
12047ec681f3Smrg         break;
12057ec681f3Smrg
12067ec681f3Smrg      case V_028C70_COLOR_1_5_5_5:
12077ec681f3Smrg         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
12087ec681f3Smrg            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
12097ec681f3Smrg            sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
12107ec681f3Smrg         }
12117ec681f3Smrg         break;
12127ec681f3Smrg
12137ec681f3Smrg      case V_028C70_COLOR_4_4_4_4:
12147ec681f3Smrg         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
12157ec681f3Smrg            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
12167ec681f3Smrg            sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
12177ec681f3Smrg         }
12187ec681f3Smrg         break;
12197ec681f3Smrg
12207ec681f3Smrg      case V_028C70_COLOR_32:
12217ec681f3Smrg         if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
12227ec681f3Smrg            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
12237ec681f3Smrg         else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
12247ec681f3Smrg            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
12257ec681f3Smrg         break;
12267ec681f3Smrg
12277ec681f3Smrg      case V_028C70_COLOR_16:
12287ec681f3Smrg      case V_028C70_COLOR_16_16:
12297ec681f3Smrg         /* For 1-channel formats, use the superset thereof. */
12307ec681f3Smrg         if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
12317ec681f3Smrg             spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
12327ec681f3Smrg             spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
12337ec681f3Smrg             spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
12347ec681f3Smrg            if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
12357ec681f3Smrg               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
12367ec681f3Smrg            else
12377ec681f3Smrg               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
12387ec681f3Smrg         }
12397ec681f3Smrg         break;
12407ec681f3Smrg
12417ec681f3Smrg      case V_028C70_COLOR_10_11_11:
12427ec681f3Smrg         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
12437ec681f3Smrg            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
12447ec681f3Smrg         break;
12457ec681f3Smrg
12467ec681f3Smrg      case V_028C70_COLOR_2_10_10_10:
12477ec681f3Smrg         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
12487ec681f3Smrg            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
12497ec681f3Smrg            sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
12507ec681f3Smrg         }
12517ec681f3Smrg         break;
12527ec681f3Smrg      case V_028C70_COLOR_5_9_9_9:
12537ec681f3Smrg         if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
12547ec681f3Smrg            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
12557ec681f3Smrg         break;
12567ec681f3Smrg      }
12577ec681f3Smrg   }
12587ec681f3Smrg
12597ec681f3Smrg   /* Do not set the DISABLE bits for the unused attachments, as that
12607ec681f3Smrg    * breaks dual source blending in SkQP and does not seem to improve
12617ec681f3Smrg    * performance. */
12627ec681f3Smrg
12637ec681f3Smrg   if (sx_ps_downconvert == cmd_buffer->state.last_sx_ps_downconvert &&
12647ec681f3Smrg       sx_blend_opt_epsilon == cmd_buffer->state.last_sx_blend_opt_epsilon &&
12657ec681f3Smrg       sx_blend_opt_control == cmd_buffer->state.last_sx_blend_opt_control)
12667ec681f3Smrg      return;
12677ec681f3Smrg
12687ec681f3Smrg   radeon_set_context_reg_seq(cmd_buffer->cs, R_028754_SX_PS_DOWNCONVERT, 3);
12697ec681f3Smrg   radeon_emit(cmd_buffer->cs, sx_ps_downconvert);
12707ec681f3Smrg   radeon_emit(cmd_buffer->cs, sx_blend_opt_epsilon);
12717ec681f3Smrg   radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
12727ec681f3Smrg
12737ec681f3Smrg   cmd_buffer->state.context_roll_without_scissor_emitted = true;
12747ec681f3Smrg
12757ec681f3Smrg   cmd_buffer->state.last_sx_ps_downconvert = sx_ps_downconvert;
12767ec681f3Smrg   cmd_buffer->state.last_sx_blend_opt_epsilon = sx_blend_opt_epsilon;
12777ec681f3Smrg   cmd_buffer->state.last_sx_blend_opt_control = sx_blend_opt_control;
12787ec681f3Smrg}
127901e04c3fSmrg
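/* With primitive binning, more than one context/persistent state can live
 * in a single bin batch. When the fragment shader or the color write mask
 * changes in that configuration, a BREAK_BATCH event is emitted so the
 * binner closes the current batch instead of mixing old and new state.
 */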
12807ec681f3Smrgstatic void
12817ec681f3Smrgradv_emit_batch_break_on_new_ps(struct radv_cmd_buffer *cmd_buffer)
12827ec681f3Smrg{
12837ec681f3Smrg   if (!cmd_buffer->device->pbb_allowed)
12847ec681f3Smrg      return;
128501e04c3fSmrg
12867ec681f3Smrg   struct radv_binning_settings settings =
12877ec681f3Smrg      radv_get_binning_settings(cmd_buffer->device->physical_device);
12887ec681f3Smrg   bool break_for_new_ps =
12897ec681f3Smrg      (!cmd_buffer->state.emitted_pipeline ||
12907ec681f3Smrg       cmd_buffer->state.emitted_pipeline->shaders[MESA_SHADER_FRAGMENT] !=
12917ec681f3Smrg          cmd_buffer->state.pipeline->shaders[MESA_SHADER_FRAGMENT]) &&
12927ec681f3Smrg      (settings.context_states_per_bin > 1 || settings.persistent_states_per_bin > 1);
12937ec681f3Smrg   bool break_for_new_cb_target_mask =
12947ec681f3Smrg      (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE) &&
12957ec681f3Smrg      settings.context_states_per_bin > 1;
129601e04c3fSmrg
12977ec681f3Smrg   if (!break_for_new_ps && !break_for_new_cb_target_mask)
12987ec681f3Smrg      return;
129901e04c3fSmrg
13007ec681f3Smrg   radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
13017ec681f3Smrg   radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
13027ec681f3Smrg}
130301e04c3fSmrg
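/* Emit a graphics pipeline at bind time. The pipeline's command stream is
 * pre-built at pipeline creation; this function mostly diffs against the
 * previously emitted pipeline to decide which dynamic states must be
 * re-emitted and whether the pre-built context registers changed at all.
 */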
13047ec681f3Smrgstatic void
13057ec681f3Smrgradv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
13067ec681f3Smrg{
13077ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
13087ec681f3Smrg
13097ec681f3Smrg   if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
13107ec681f3Smrg      return;
13117ec681f3Smrg
13127ec681f3Smrg   radv_update_multisample_state(cmd_buffer, pipeline);
13137ec681f3Smrg   radv_update_binning_state(cmd_buffer, pipeline);
13147ec681f3Smrg
13157ec681f3Smrg   cmd_buffer->scratch_size_per_wave_needed =
13167ec681f3Smrg      MAX2(cmd_buffer->scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
13177ec681f3Smrg   cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, pipeline->max_waves);
13187ec681f3Smrg
13197ec681f3Smrg   if (!cmd_buffer->state.emitted_pipeline ||
13207ec681f3Smrg       cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband !=
13217ec681f3Smrg          pipeline->graphics.can_use_guardband)
13227ec681f3Smrg      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
13237ec681f3Smrg
13247ec681f3Smrg   if (!cmd_buffer->state.emitted_pipeline ||
13257ec681f3Smrg       cmd_buffer->state.emitted_pipeline->graphics.pa_su_sc_mode_cntl !=
13267ec681f3Smrg          pipeline->graphics.pa_su_sc_mode_cntl)
13277ec681f3Smrg      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE |
13287ec681f3Smrg                                 RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
13297ec681f3Smrg                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
13307ec681f3Smrg
13317ec681f3Smrg   if (!cmd_buffer->state.emitted_pipeline ||
13327ec681f3Smrg       cmd_buffer->state.emitted_pipeline->graphics.pa_cl_clip_cntl !=
13337ec681f3Smrg          pipeline->graphics.pa_cl_clip_cntl)
13347ec681f3Smrg      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
13357ec681f3Smrg
13367ec681f3Smrg   if (!cmd_buffer->state.emitted_pipeline ||
13377ec681f3Smrg       cmd_buffer->state.emitted_pipeline->graphics.cb_color_control !=
13387ec681f3Smrg       pipeline->graphics.cb_color_control)
13397ec681f3Smrg      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
13407ec681f3Smrg
13417ec681f3Smrg   if (!cmd_buffer->state.emitted_pipeline)
13427ec681f3Smrg      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY |
13437ec681f3Smrg                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS |
13447ec681f3Smrg                                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS |
13457ec681f3Smrg                                 RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
13467ec681f3Smrg
13477ec681f3Smrg   if (!cmd_buffer->state.emitted_pipeline ||
13487ec681f3Smrg       cmd_buffer->state.emitted_pipeline->graphics.db_depth_control !=
13497ec681f3Smrg          pipeline->graphics.db_depth_control)
13507ec681f3Smrg      cmd_buffer->state.dirty |=
13517ec681f3Smrg         RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
13527ec681f3Smrg         RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
13537ec681f3Smrg         RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
13547ec681f3Smrg
13557ec681f3Smrg   if (!cmd_buffer->state.emitted_pipeline)
13567ec681f3Smrg      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
13577ec681f3Smrg
13587ec681f3Smrg   if (!cmd_buffer->state.emitted_pipeline ||
13597ec681f3Smrg       cmd_buffer->state.emitted_pipeline->graphics.cb_target_mask !=
13607ec681f3Smrg       pipeline->graphics.cb_target_mask) {
13617ec681f3Smrg      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
13627ec681f3Smrg   }
13637ec681f3Smrg
13647ec681f3Smrg   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
13657ec681f3Smrg
13667ec681f3Smrg   if (pipeline->graphics.has_ngg_culling &&
13677ec681f3Smrg       pipeline->graphics.last_vgt_api_stage != MESA_SHADER_GEOMETRY &&
13687ec681f3Smrg       !cmd_buffer->state.last_nggc_settings) {
13697ec681f3Smrg      /* The already emitted RSRC2 contains the LDS required for NGG culling.
13707ec681f3Smrg       * Culling is currently disabled, so re-emit RSRC2 to reduce LDS usage.
13717ec681f3Smrg       * API GS always needs LDS, so this isn't useful there.
13727ec681f3Smrg       */
13737ec681f3Smrg      struct radv_shader_variant *v = pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
13747ec681f3Smrg      radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
13757ec681f3Smrg                        (v->config.rsrc2 & C_00B22C_LDS_SIZE) |
13767ec681f3Smrg                        S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling));
13777ec681f3Smrg   }
13787ec681f3Smrg
13797ec681f3Smrg   if (!cmd_buffer->state.emitted_pipeline ||
13807ec681f3Smrg       cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
13817ec681f3Smrg       cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
13827ec681f3Smrg       memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf, pipeline->ctx_cs.buf,
13837ec681f3Smrg              pipeline->ctx_cs.cdw * 4)) {
13847ec681f3Smrg      radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
13857ec681f3Smrg      cmd_buffer->state.context_roll_without_scissor_emitted = true;
13867ec681f3Smrg   }
13877ec681f3Smrg
13887ec681f3Smrg   radv_emit_batch_break_on_new_ps(cmd_buffer);
13897ec681f3Smrg
13907ec681f3Smrg   for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
13917ec681f3Smrg      if (!pipeline->shaders[i])
13927ec681f3Smrg         continue;
13937ec681f3Smrg
13947ec681f3Smrg      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->shaders[i]->bo);
13957ec681f3Smrg   }
13967ec681f3Smrg
13977ec681f3Smrg   if (radv_pipeline_has_gs_copy_shader(pipeline))
13987ec681f3Smrg      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->gs_copy_shader->bo);
13997ec681f3Smrg
14007ec681f3Smrg   if (unlikely(cmd_buffer->device->trace_bo))
14017ec681f3Smrg      radv_save_pipeline(cmd_buffer, pipeline);
14027ec681f3Smrg
14037ec681f3Smrg   cmd_buffer->state.emitted_pipeline = pipeline;
14047ec681f3Smrg
14057ec681f3Smrg   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
140601e04c3fSmrg}
140701e04c3fSmrg
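/* The hardware applies the viewport transform as
 *
 *    screen.xyz = xform.translate.xyz + ndc.xyz * xform.scale.xyz
 *
 * e.g. a 100x100 viewport at offset (0,0) would use scale.xy = (50, 50)
 * and translate.xy = (50, 50) (illustrative values; the depth scale and
 * translate depend on minDepth/maxDepth and the configured depth mode).
 */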
140801e04c3fSmrgstatic void
140901e04c3fSmrgradv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
141001e04c3fSmrg{
14117ec681f3Smrg   const struct radv_viewport_state *viewport = &cmd_buffer->state.dynamic.viewport;
14127ec681f3Smrg   int i;
14137ec681f3Smrg   const unsigned count = viewport->count;
14147ec681f3Smrg
14157ec681f3Smrg   assert(count);
14167ec681f3Smrg   radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, count * 6);
14177ec681f3Smrg
14187ec681f3Smrg   for (i = 0; i < count; i++) {
14197ec681f3Smrg      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[0]));
14207ec681f3Smrg      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[0]));
14217ec681f3Smrg      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[1]));
14227ec681f3Smrg      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[1]));
14237ec681f3Smrg      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].scale[2]));
14247ec681f3Smrg      radeon_emit(cmd_buffer->cs, fui(viewport->xform[i].translate[2]));
14257ec681f3Smrg   }
14267ec681f3Smrg
14277ec681f3Smrg   radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, count * 2);
14287ec681f3Smrg   for (i = 0; i < count; i++) {
14297ec681f3Smrg      float zmin = MIN2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
14307ec681f3Smrg      float zmax = MAX2(viewport->viewports[i].minDepth, viewport->viewports[i].maxDepth);
14317ec681f3Smrg      radeon_emit(cmd_buffer->cs, fui(zmin));
14327ec681f3Smrg      radeon_emit(cmd_buffer->cs, fui(zmax));
14337ec681f3Smrg   }
143401e04c3fSmrg}
143501e04c3fSmrg
143601e04c3fSmrgstatic void
143701e04c3fSmrgradv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
143801e04c3fSmrg{
14397ec681f3Smrg   uint32_t count = cmd_buffer->state.dynamic.scissor.count;
144001e04c3fSmrg
14417ec681f3Smrg   si_write_scissors(cmd_buffer->cs, 0, count, cmd_buffer->state.dynamic.scissor.scissors,
14427ec681f3Smrg                     cmd_buffer->state.dynamic.viewport.viewports,
14437ec681f3Smrg                     cmd_buffer->state.emitted_pipeline->graphics.can_use_guardband);
1444ed98bd31Smaya
14457ec681f3Smrg   cmd_buffer->state.context_roll_without_scissor_emitted = false;
144601e04c3fSmrg}
144701e04c3fSmrg
144801e04c3fSmrgstatic void
144901e04c3fSmrgradv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
145001e04c3fSmrg{
14517ec681f3Smrg   if (!cmd_buffer->state.dynamic.discard_rectangle.count)
14527ec681f3Smrg      return;
145301e04c3fSmrg
14547ec681f3Smrg   radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL,
14557ec681f3Smrg                              cmd_buffer->state.dynamic.discard_rectangle.count * 2);
14567ec681f3Smrg   for (unsigned i = 0; i < cmd_buffer->state.dynamic.discard_rectangle.count; ++i) {
14577ec681f3Smrg      VkRect2D rect = cmd_buffer->state.dynamic.discard_rectangle.rectangles[i];
14587ec681f3Smrg      radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
14597ec681f3Smrg      radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
14607ec681f3Smrg                                     S_028214_BR_Y(rect.offset.y + rect.extent.height));
14617ec681f3Smrg   }
146201e04c3fSmrg}
146301e04c3fSmrg
146401e04c3fSmrgstatic void
146501e04c3fSmrgradv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
146601e04c3fSmrg{
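   /* PA_SU_LINE_CNTL.WIDTH holds the half line width in 12.4 fixed point,
    * so the width in pixels is multiplied by 8, i.e. (width / 2) * 16.
    */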
14677ec681f3Smrg   unsigned width = cmd_buffer->state.dynamic.line_width * 8;
146801e04c3fSmrg
14697ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
14707ec681f3Smrg                          S_028A08_WIDTH(CLAMP(width, 0, 0xFFFF)));
147101e04c3fSmrg}
147201e04c3fSmrg
147301e04c3fSmrgstatic void
147401e04c3fSmrgradv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
147501e04c3fSmrg{
14767ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
147701e04c3fSmrg
14787ec681f3Smrg   radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
14797ec681f3Smrg   radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->blend_constants, 4);
148001e04c3fSmrg}
148101e04c3fSmrg
148201e04c3fSmrgstatic void
148301e04c3fSmrgradv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
148401e04c3fSmrg{
14857ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
148601e04c3fSmrg
14877ec681f3Smrg   radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
14887ec681f3Smrg   radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->stencil_reference.front) |
14897ec681f3Smrg                                  S_028430_STENCILMASK(d->stencil_compare_mask.front) |
14907ec681f3Smrg                                  S_028430_STENCILWRITEMASK(d->stencil_write_mask.front) |
14917ec681f3Smrg                                  S_028430_STENCILOPVAL(1));
14927ec681f3Smrg   radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->stencil_reference.back) |
14937ec681f3Smrg                                  S_028434_STENCILMASK_BF(d->stencil_compare_mask.back) |
14947ec681f3Smrg                                  S_028434_STENCILWRITEMASK_BF(d->stencil_write_mask.back) |
14957ec681f3Smrg                                  S_028434_STENCILOPVAL_BF(1));
149601e04c3fSmrg}
149701e04c3fSmrg
149801e04c3fSmrgstatic void
149901e04c3fSmrgradv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
150001e04c3fSmrg{
15017ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
150201e04c3fSmrg
15037ec681f3Smrg   radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
15047ec681f3Smrg   radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.min));
15057ec681f3Smrg   radeon_emit(cmd_buffer->cs, fui(d->depth_bounds.max));
150601e04c3fSmrg}
150701e04c3fSmrg
150801e04c3fSmrgstatic void
150901e04c3fSmrgradv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
151001e04c3fSmrg{
15117ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
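   /* The polygon offset scale registers appear to take the slope factor in
    * units of 1/16th of a subpixel, hence the 16.0f multiplier on the API
    * value.
    */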
15127ec681f3Smrg   unsigned slope = fui(d->depth_bias.slope * 16.0f);
15137ec681f3Smrg
15147ec681f3Smrg   radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
15157ec681f3Smrg   radeon_emit(cmd_buffer->cs, fui(d->depth_bias.clamp)); /* CLAMP */
15167ec681f3Smrg   radeon_emit(cmd_buffer->cs, slope);                    /* FRONT SCALE */
15177ec681f3Smrg   radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* FRONT OFFSET */
15187ec681f3Smrg   radeon_emit(cmd_buffer->cs, slope);                    /* BACK SCALE */
15197ec681f3Smrg   radeon_emit(cmd_buffer->cs, fui(d->depth_bias.bias));  /* BACK OFFSET */
15207ec681f3Smrg}
15217ec681f3Smrg
15227ec681f3Smrgstatic void
15237ec681f3Smrgradv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
15247ec681f3Smrg{
15257ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
15267ec681f3Smrg   uint32_t auto_reset_cntl = 1;
15277ec681f3Smrg
15287ec681f3Smrg   if (d->primitive_topology == V_008958_DI_PT_LINESTRIP)
15297ec681f3Smrg      auto_reset_cntl = 2;
15307ec681f3Smrg
15317ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
15327ec681f3Smrg                          S_028A0C_LINE_PATTERN(d->line_stipple.pattern) |
15337ec681f3Smrg                             S_028A0C_REPEAT_COUNT(d->line_stipple.factor - 1) |
15347ec681f3Smrg                             S_028A0C_AUTO_RESET_CNTL(auto_reset_cntl));
15357ec681f3Smrg}
15367ec681f3Smrg
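/* Register fields throughout this file are updated with the sid.h
 * clear-then-set pattern; for the CULL_FRONT field below, for example:
 *
 *    reg &= C_028814_CULL_FRONT;     (clear the field: complement mask)
 *    reg |= S_028814_CULL_FRONT(v);  (shift the new value into place)
 *    v = G_028814_CULL_FRONT(reg);   (extract the field again)
 */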
15377ec681f3Smrgstatic void
15387ec681f3Smrgradv_emit_culling(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
15397ec681f3Smrg{
15407ec681f3Smrg   unsigned pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
15417ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
15427ec681f3Smrg
15437ec681f3Smrg   pa_su_sc_mode_cntl &= C_028814_CULL_FRONT &
15447ec681f3Smrg                         C_028814_CULL_BACK &
15457ec681f3Smrg                         C_028814_FACE &
15467ec681f3Smrg                         C_028814_POLY_OFFSET_FRONT_ENABLE &
15477ec681f3Smrg                         C_028814_POLY_OFFSET_BACK_ENABLE &
15487ec681f3Smrg                         C_028814_POLY_OFFSET_PARA_ENABLE;
15497ec681f3Smrg
15507ec681f3Smrg   pa_su_sc_mode_cntl |= S_028814_CULL_FRONT(!!(d->cull_mode & VK_CULL_MODE_FRONT_BIT)) |
15517ec681f3Smrg                         S_028814_CULL_BACK(!!(d->cull_mode & VK_CULL_MODE_BACK_BIT)) |
15527ec681f3Smrg                         S_028814_FACE(d->front_face) |
15537ec681f3Smrg                         S_028814_POLY_OFFSET_FRONT_ENABLE(d->depth_bias_enable) |
15547ec681f3Smrg                         S_028814_POLY_OFFSET_BACK_ENABLE(d->depth_bias_enable) |
15557ec681f3Smrg                         S_028814_POLY_OFFSET_PARA_ENABLE(d->depth_bias_enable);
15567ec681f3Smrg
15577ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
15587ec681f3Smrg}
15597ec681f3Smrg
15607ec681f3Smrgstatic void
15617ec681f3Smrgradv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
15627ec681f3Smrg{
15637ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
15647ec681f3Smrg
15657ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
15667ec681f3Smrg      radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cmd_buffer->cs,
15677ec681f3Smrg                                 R_030908_VGT_PRIMITIVE_TYPE, 1, d->primitive_topology);
15687ec681f3Smrg   } else {
15697ec681f3Smrg      radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->primitive_topology);
15707ec681f3Smrg   }
15717ec681f3Smrg}
15727ec681f3Smrg
15737ec681f3Smrgstatic void
15747ec681f3Smrgradv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer, uint64_t states)
15757ec681f3Smrg{
15767ec681f3Smrg   unsigned db_depth_control = cmd_buffer->state.pipeline->graphics.db_depth_control;
15777ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
15787ec681f3Smrg
15797ec681f3Smrg   db_depth_control &= C_028800_Z_ENABLE &
15807ec681f3Smrg                       C_028800_Z_WRITE_ENABLE &
15817ec681f3Smrg                       C_028800_ZFUNC &
15827ec681f3Smrg                       C_028800_DEPTH_BOUNDS_ENABLE &
15837ec681f3Smrg                       C_028800_STENCIL_ENABLE &
15847ec681f3Smrg                       C_028800_BACKFACE_ENABLE &
15857ec681f3Smrg                       C_028800_STENCILFUNC &
15867ec681f3Smrg                       C_028800_STENCILFUNC_BF;
15877ec681f3Smrg
15887ec681f3Smrg   db_depth_control |= S_028800_Z_ENABLE(d->depth_test_enable ? 1 : 0) |
15897ec681f3Smrg                       S_028800_Z_WRITE_ENABLE(d->depth_write_enable ? 1 : 0) |
15907ec681f3Smrg                       S_028800_ZFUNC(d->depth_compare_op) |
15917ec681f3Smrg                       S_028800_DEPTH_BOUNDS_ENABLE(d->depth_bounds_test_enable ? 1 : 0) |
15927ec681f3Smrg                       S_028800_STENCIL_ENABLE(d->stencil_test_enable ? 1 : 0) |
15937ec681f3Smrg                       S_028800_BACKFACE_ENABLE(d->stencil_test_enable ? 1 : 0) |
15947ec681f3Smrg                       S_028800_STENCILFUNC(d->stencil_op.front.compare_op) |
15957ec681f3Smrg                       S_028800_STENCILFUNC_BF(d->stencil_op.back.compare_op);
15967ec681f3Smrg
15977ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
15987ec681f3Smrg}
15997ec681f3Smrg
16007ec681f3Smrgstatic void
16017ec681f3Smrgradv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
16027ec681f3Smrg{
16037ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
16047ec681f3Smrg
16057ec681f3Smrg   radeon_set_context_reg(
16067ec681f3Smrg      cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL,
16077ec681f3Smrg      S_02842C_STENCILFAIL(si_translate_stencil_op(d->stencil_op.front.fail_op)) |
16087ec681f3Smrg         S_02842C_STENCILZPASS(si_translate_stencil_op(d->stencil_op.front.pass_op)) |
16097ec681f3Smrg         S_02842C_STENCILZFAIL(si_translate_stencil_op(d->stencil_op.front.depth_fail_op)) |
16107ec681f3Smrg         S_02842C_STENCILFAIL_BF(si_translate_stencil_op(d->stencil_op.back.fail_op)) |
16117ec681f3Smrg         S_02842C_STENCILZPASS_BF(si_translate_stencil_op(d->stencil_op.back.pass_op)) |
16127ec681f3Smrg         S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(d->stencil_op.back.depth_fail_op)));
16137ec681f3Smrg}
16147ec681f3Smrg
16157ec681f3Smrgstatic void
16167ec681f3Smrgradv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
16177ec681f3Smrg{
16187ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
16197ec681f3Smrg   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
16207ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
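   /* The per-draw rate fields hold log2 of the fragment size per axis; the
    * size is clamped to 2 because 2x2 appears to be the largest per-draw
    * shading rate supported here.
    */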
16217ec681f3Smrg   uint32_t rate_x = MIN2(2, d->fragment_shading_rate.size.width) - 1;
16227ec681f3Smrg   uint32_t rate_y = MIN2(2, d->fragment_shading_rate.size.height) - 1;
16237ec681f3Smrg   uint32_t pa_cl_vrs_cntl = pipeline->graphics.vrs.pa_cl_vrs_cntl;
16247ec681f3Smrg   uint32_t vertex_comb_mode = d->fragment_shading_rate.combiner_ops[0];
16257ec681f3Smrg   uint32_t htile_comb_mode = d->fragment_shading_rate.combiner_ops[1];
16267ec681f3Smrg
16277ec681f3Smrg   if (subpass && !subpass->vrs_attachment) {
16287ec681f3Smrg      /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
16297ec681f3Smrg       * can cheat by tweaking the different combiner modes.
16307ec681f3Smrg       */
16317ec681f3Smrg      switch (htile_comb_mode) {
16327ec681f3Smrg      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
16337ec681f3Smrg         /* The result of min(A, 1x1) is always 1x1. */
16347ec681f3Smrg         FALLTHROUGH;
16357ec681f3Smrg      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
16367ec681f3Smrg         /* Force the per-draw VRS rate to 1x1. */
16377ec681f3Smrg         rate_x = rate_y = 0;
16387ec681f3Smrg
         /* As the results of min(A, 1x1) and replace(A, 1x1) are always
          * 1x1, set the vertex rate combiner mode to passthrough.
          */
16427ec681f3Smrg         vertex_comb_mode = V_028848_VRS_COMB_MODE_PASSTHRU;
16437ec681f3Smrg         break;
16447ec681f3Smrg      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
16457ec681f3Smrg         /* The result of max(A, 1x1) is always A. */
16467ec681f3Smrg         FALLTHROUGH;
16477ec681f3Smrg      case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
         /* Nothing to do here because the SAMPLE_ITER combiner mode should
          * already be passthrough. */
16497ec681f3Smrg         break;
16507ec681f3Smrg      default:
16517ec681f3Smrg         break;
16527ec681f3Smrg      }
16537ec681f3Smrg   }
16547ec681f3Smrg
   /* Emit the per-draw VRS rate, which is the first combiner. */
16567ec681f3Smrg   radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE,
16577ec681f3Smrg                          S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
16587ec681f3Smrg
16597ec681f3Smrg   /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
16607ec681f3Smrg    * draw rate and the vertex rate.
16617ec681f3Smrg    */
16627ec681f3Smrg   pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(vertex_comb_mode);
16637ec681f3Smrg
16647ec681f3Smrg   /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
16657ec681f3Smrg    * rate.
16667ec681f3Smrg    */
16677ec681f3Smrg   pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
16687ec681f3Smrg
16697ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
16707ec681f3Smrg}
16717ec681f3Smrg
16727ec681f3Smrgstatic void
16737ec681f3Smrgradv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
16747ec681f3Smrg{
16757ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
16767ec681f3Smrg
16777ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
16787ec681f3Smrg      radeon_set_uconfig_reg(cmd_buffer->cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN,
16797ec681f3Smrg                             d->primitive_restart_enable);
16807ec681f3Smrg   } else {
16817ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
16827ec681f3Smrg                             d->primitive_restart_enable);
16837ec681f3Smrg   }
16847ec681f3Smrg}
16857ec681f3Smrg
16867ec681f3Smrgstatic void
16877ec681f3Smrgradv_emit_rasterizer_discard_enable(struct radv_cmd_buffer *cmd_buffer)
16887ec681f3Smrg{
16897ec681f3Smrg   unsigned pa_cl_clip_cntl = cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl;
16907ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
16917ec681f3Smrg
16927ec681f3Smrg   pa_cl_clip_cntl &= C_028810_DX_RASTERIZATION_KILL;
16937ec681f3Smrg   pa_cl_clip_cntl |= S_028810_DX_RASTERIZATION_KILL(d->rasterizer_discard_enable);
16947ec681f3Smrg
16957ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL, pa_cl_clip_cntl);
16967ec681f3Smrg}
16977ec681f3Smrg
16987ec681f3Smrgstatic void
16997ec681f3Smrgradv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
17007ec681f3Smrg{
17017ec681f3Smrg   unsigned cb_color_control = cmd_buffer->state.pipeline->graphics.cb_color_control;
17027ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
17037ec681f3Smrg
17047ec681f3Smrg   cb_color_control &= C_028808_ROP3;
17057ec681f3Smrg   cb_color_control |= S_028808_ROP3(d->logic_op);
17067ec681f3Smrg
17077ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
17087ec681f3Smrg}
170901e04c3fSmrg
17107ec681f3Smrgstatic void
17117ec681f3Smrgradv_emit_color_write_enable(struct radv_cmd_buffer *cmd_buffer)
17127ec681f3Smrg{
17137ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
17147ec681f3Smrg   struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
171501e04c3fSmrg
17167ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK,
17177ec681f3Smrg                          pipeline->graphics.cb_target_mask & d->color_write_enable);
171801e04c3fSmrg}
171901e04c3fSmrg
172001e04c3fSmrgstatic void
17217ec681f3Smrgradv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index,
17227ec681f3Smrg                         struct radv_color_buffer_info *cb, struct radv_image_view *iview,
17237ec681f3Smrg                         VkImageLayout layout, bool in_render_loop, bool disable_dcc)
17247ec681f3Smrg{
17257ec681f3Smrg   bool is_vi = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8;
17267ec681f3Smrg   uint32_t cb_color_info = cb->cb_color_info;
17277ec681f3Smrg   struct radv_image *image = iview->image;
17287ec681f3Smrg
17297ec681f3Smrg   if (!radv_layout_dcc_compressed(
17307ec681f3Smrg          cmd_buffer->device, image, iview->base_mip, layout, in_render_loop,
17317ec681f3Smrg          radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
17327ec681f3Smrg                                       cmd_buffer->queue_family_index)) ||
17337ec681f3Smrg       disable_dcc) {
17347ec681f3Smrg      cb_color_info &= C_028C70_DCC_ENABLE;
17357ec681f3Smrg   }
17367ec681f3Smrg
17377ec681f3Smrg   if (!radv_layout_fmask_compressed(
17387ec681f3Smrg          cmd_buffer->device, image, layout,
17397ec681f3Smrg          radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
17407ec681f3Smrg                                       cmd_buffer->queue_family_index))) {
17417ec681f3Smrg      cb_color_info &= C_028C70_COMPRESSION;
17427ec681f3Smrg   }
17437ec681f3Smrg
17447ec681f3Smrg   if (radv_image_is_tc_compat_cmask(image) && (radv_is_fmask_decompress_pipeline(cmd_buffer) ||
17457ec681f3Smrg                                                radv_is_dcc_decompress_pipeline(cmd_buffer))) {
17467ec681f3Smrg      /* If this bit is set, the FMASK decompression operation
17477ec681f3Smrg       * doesn't occur (DCC_COMPRESS also implies FMASK_DECOMPRESS).
17487ec681f3Smrg       */
17497ec681f3Smrg      cb_color_info &= C_028C70_FMASK_COMPRESS_1FRAG_ONLY;
17507ec681f3Smrg   }
17517ec681f3Smrg
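   /* The CB register layout is generation specific: GFX10+ moved the high
    * address bits and the extra attribute words to dedicated registers,
    * GFX9 packs 64-bit bases as BASE/BASE_HI pairs, and older chips use
    * the original pitch/slice based layout.
    */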
17527ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
17537ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
17547ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_base);
17557ec681f3Smrg      radeon_emit(cmd_buffer->cs, 0);
17567ec681f3Smrg      radeon_emit(cmd_buffer->cs, 0);
17577ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_view);
17587ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb_color_info);
17597ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
17607ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
17617ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
17627ec681f3Smrg      radeon_emit(cmd_buffer->cs, 0);
17637ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
17647ec681f3Smrg      radeon_emit(cmd_buffer->cs, 0);
17657ec681f3Smrg
17667ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->cb_dcc_base);
17677ec681f3Smrg
17687ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
17697ec681f3Smrg                             cb->cb_color_base >> 32);
17707ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
17717ec681f3Smrg                             cb->cb_color_cmask >> 32);
17727ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
17737ec681f3Smrg                             cb->cb_color_fmask >> 32);
17747ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
17757ec681f3Smrg                             cb->cb_dcc_base >> 32);
17767ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4,
17777ec681f3Smrg                             cb->cb_color_attrib2);
17787ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4,
17797ec681f3Smrg                             cb->cb_color_attrib3);
17807ec681f3Smrg   } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
17817ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
17827ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_base);
17837ec681f3Smrg      radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->cb_color_base >> 32));
17847ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib2);
17857ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_view);
17867ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb_color_info);
17877ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
17887ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
17897ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
17907ec681f3Smrg      radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->cb_color_cmask >> 32));
17917ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
17927ec681f3Smrg      radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->cb_color_fmask >> 32));
17937ec681f3Smrg
17947ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
17957ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_dcc_base);
17967ec681f3Smrg      radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->cb_dcc_base >> 32));
17977ec681f3Smrg
17987ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4,
17997ec681f3Smrg                             cb->cb_mrt_epitch);
18007ec681f3Smrg   } else {
18017ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
18027ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_base);
18037ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_pitch);
18047ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_slice);
18057ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_view);
18067ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb_color_info);
18077ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_attrib);
18087ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_dcc_control);
18097ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask);
18107ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_cmask_slice);
18117ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask);
18127ec681f3Smrg      radeon_emit(cmd_buffer->cs, cb->cb_color_fmask_slice);
18137ec681f3Smrg
18147ec681f3Smrg      if (is_vi) { /* DCC BASE */
18157ec681f3Smrg         radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c,
18167ec681f3Smrg                                cb->cb_dcc_base);
18177ec681f3Smrg      }
18187ec681f3Smrg   }
18197ec681f3Smrg
18207ec681f3Smrg   if (G_028C70_DCC_ENABLE(cb_color_info)) {
18217ec681f3Smrg      /* Drawing with DCC enabled also compresses colorbuffers. */
18227ec681f3Smrg      VkImageSubresourceRange range = {
18237ec681f3Smrg         .aspectMask = iview->aspect_mask,
18247ec681f3Smrg         .baseMipLevel = iview->base_mip,
18257ec681f3Smrg         .levelCount = iview->level_count,
18267ec681f3Smrg         .baseArrayLayer = iview->base_layer,
18277ec681f3Smrg         .layerCount = iview->layer_count,
18287ec681f3Smrg      };
18297ec681f3Smrg
18307ec681f3Smrg      radv_update_dcc_metadata(cmd_buffer, image, &range, true);
18317ec681f3Smrg   }
183201e04c3fSmrg}
183301e04c3fSmrg
183401e04c3fSmrgstatic void
18357ec681f3Smrgradv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
18367ec681f3Smrg                             const struct radv_image_view *iview, VkImageLayout layout,
18377ec681f3Smrg                             bool in_render_loop, bool requires_cond_exec)
18387ec681f3Smrg{
18397ec681f3Smrg   const struct radv_image *image = iview->image;
18407ec681f3Smrg   uint32_t db_z_info = ds->db_z_info;
18417ec681f3Smrg   uint32_t db_z_info_reg;
18427ec681f3Smrg
18437ec681f3Smrg   if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug ||
18447ec681f3Smrg       !radv_image_is_tc_compat_htile(image))
18457ec681f3Smrg      return;
18467ec681f3Smrg
18477ec681f3Smrg   if (!radv_layout_is_htile_compressed(
18487ec681f3Smrg          cmd_buffer->device, image, layout, in_render_loop,
18497ec681f3Smrg          radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
18507ec681f3Smrg                                       cmd_buffer->queue_family_index))) {
18517ec681f3Smrg      db_z_info &= C_028040_TILE_SURFACE_ENABLE;
18527ec681f3Smrg   }
18537ec681f3Smrg
18547ec681f3Smrg   db_z_info &= C_028040_ZRANGE_PRECISION;
18557ec681f3Smrg
18567ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
18577ec681f3Smrg      db_z_info_reg = R_028038_DB_Z_INFO;
18587ec681f3Smrg   } else {
18597ec681f3Smrg      db_z_info_reg = R_028040_DB_Z_INFO;
18607ec681f3Smrg   }
18617ec681f3Smrg
   /* When we don't know the last fast clear value, we need to emit a
    * conditional packet that skips the following SET_CONTEXT_REG packet
    * when the stored predicate is zero.
    */
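   /* COND_EXEC reads the 64-bit predicate at `va`; when it is zero, the CP
    * skips the requested number of following dwords (3 here, the size of
    * the SET_CONTEXT_REG packet below).
    */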
18667ec681f3Smrg   if (requires_cond_exec) {
18677ec681f3Smrg      uint64_t va = radv_get_tc_compat_zrange_va(image, iview->base_mip);
18687ec681f3Smrg
18697ec681f3Smrg      radeon_emit(cmd_buffer->cs, PKT3(PKT3_COND_EXEC, 3, 0));
18707ec681f3Smrg      radeon_emit(cmd_buffer->cs, va);
18717ec681f3Smrg      radeon_emit(cmd_buffer->cs, va >> 32);
18727ec681f3Smrg      radeon_emit(cmd_buffer->cs, 0);
18737ec681f3Smrg      radeon_emit(cmd_buffer->cs, 3); /* SET_CONTEXT_REG size */
18747ec681f3Smrg   }
18757ec681f3Smrg
18767ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
187701e04c3fSmrg}
187801e04c3fSmrg
187901e04c3fSmrgstatic void
18807ec681f3Smrgradv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
18817ec681f3Smrg                      struct radv_image_view *iview, VkImageLayout layout, bool in_render_loop)
18827ec681f3Smrg{
18837ec681f3Smrg   const struct radv_image *image = iview->image;
18847ec681f3Smrg   uint32_t db_z_info = ds->db_z_info;
18857ec681f3Smrg   uint32_t db_stencil_info = ds->db_stencil_info;
18867ec681f3Smrg
18877ec681f3Smrg   if (!radv_layout_is_htile_compressed(
18887ec681f3Smrg          cmd_buffer->device, image, layout, in_render_loop,
18897ec681f3Smrg          radv_image_queue_family_mask(image, cmd_buffer->queue_family_index,
18907ec681f3Smrg                                       cmd_buffer->queue_family_index))) {
18917ec681f3Smrg      db_z_info &= C_028040_TILE_SURFACE_ENABLE;
18927ec681f3Smrg      db_stencil_info |= S_028044_TILE_STENCIL_DISABLE(1);
18937ec681f3Smrg   }
18947ec681f3Smrg
18957ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->db_depth_view);
18967ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, ds->db_htile_surface);
18977ec681f3Smrg
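   /* Like the color buffers, the DB registers have a generation specific
    * layout: GFX10+ uses read bases with separate _HI registers, GFX9
    * splits Z/stencil into INFO/INFO2 pairs, and older chips use the flat
    * DB_DEPTH_INFO sequence.
    */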
18987ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
18997ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
19007ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->db_depth_size);
19017ec681f3Smrg
19027ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
19037ec681f3Smrg      radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
19047ec681f3Smrg      radeon_emit(cmd_buffer->cs, db_z_info);
19057ec681f3Smrg      radeon_emit(cmd_buffer->cs, db_stencil_info);
19067ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
19077ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
19087ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);
19097ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);
19107ec681f3Smrg
19117ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
19127ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
19137ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
19147ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_z_read_base >> 32);
19157ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base >> 32);
19167ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_htile_data_base >> 32);
19177ec681f3Smrg   } else if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
19187ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
19197ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_htile_data_base);
19207ec681f3Smrg      radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(ds->db_htile_data_base >> 32));
19217ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_depth_size);
19227ec681f3Smrg
19237ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
19247ec681f3Smrg      radeon_emit(cmd_buffer->cs, db_z_info);          /* DB_Z_INFO */
19257ec681f3Smrg      radeon_emit(cmd_buffer->cs, db_stencil_info);    /* DB_STENCIL_INFO */
19267ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_z_read_base); /* DB_Z_READ_BASE */
19277ec681f3Smrg      radeon_emit(cmd_buffer->cs,
19287ec681f3Smrg                  S_028044_BASE_HI(ds->db_z_read_base >> 32)); /* DB_Z_READ_BASE_HI */
19297ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);   /* DB_STENCIL_READ_BASE */
19307ec681f3Smrg      radeon_emit(cmd_buffer->cs,
19317ec681f3Smrg                  S_02804C_BASE_HI(ds->db_stencil_read_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
19327ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_z_write_base);              /* DB_Z_WRITE_BASE */
19337ec681f3Smrg      radeon_emit(cmd_buffer->cs,
19347ec681f3Smrg                  S_028054_BASE_HI(ds->db_z_write_base >> 32)); /* DB_Z_WRITE_BASE_HI */
19357ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base);   /* DB_STENCIL_WRITE_BASE */
19367ec681f3Smrg      radeon_emit(cmd_buffer->cs,
19377ec681f3Smrg                  S_02805C_BASE_HI(ds->db_stencil_write_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
19387ec681f3Smrg
19397ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
19407ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_z_info2);
19417ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_stencil_info2);
19427ec681f3Smrg   } else {
19437ec681f3Smrg      radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, ds->db_htile_data_base);
19447ec681f3Smrg
19457ec681f3Smrg      radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
19467ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_depth_info);         /* R_02803C_DB_DEPTH_INFO */
19477ec681f3Smrg      radeon_emit(cmd_buffer->cs, db_z_info);                 /* R_028040_DB_Z_INFO */
19487ec681f3Smrg      radeon_emit(cmd_buffer->cs, db_stencil_info);           /* R_028044_DB_STENCIL_INFO */
19497ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_z_read_base);        /* R_028048_DB_Z_READ_BASE */
19507ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_stencil_read_base);  /* R_02804C_DB_STENCIL_READ_BASE */
19517ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_z_write_base);       /* R_028050_DB_Z_WRITE_BASE */
19527ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_stencil_write_base); /* R_028054_DB_STENCIL_WRITE_BASE */
19537ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_depth_size);         /* R_028058_DB_DEPTH_SIZE */
19547ec681f3Smrg      radeon_emit(cmd_buffer->cs, ds->db_depth_slice);        /* R_02805C_DB_DEPTH_SLICE */
19557ec681f3Smrg   }
19567ec681f3Smrg
19577ec681f3Smrg   /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
19587ec681f3Smrg   radv_update_zrange_precision(cmd_buffer, ds, iview, layout, in_render_loop, true);
19597ec681f3Smrg
19607ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
19617ec681f3Smrg                          ds->pa_su_poly_offset_db_fmt_cntl);
196201e04c3fSmrg}
196301e04c3fSmrg
196401e04c3fSmrg/**
196501e04c3fSmrg * Update the fast clear depth/stencil values if the image is bound as a
196601e04c3fSmrg * depth/stencil buffer.
196701e04c3fSmrg */
196801e04c3fSmrgstatic void
196901e04c3fSmrgradv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
19707ec681f3Smrg                                const struct radv_image_view *iview,
19717ec681f3Smrg                                VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
197201e04c3fSmrg{
19737ec681f3Smrg   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
19747ec681f3Smrg   const struct radv_image *image = iview->image;
19757ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
19767ec681f3Smrg   uint32_t att_idx;
197701e04c3fSmrg
19787ec681f3Smrg   if (!cmd_buffer->state.attachments || !subpass)
19797ec681f3Smrg      return;
198001e04c3fSmrg
19817ec681f3Smrg   if (!subpass->depth_stencil_attachment)
19827ec681f3Smrg      return;
198301e04c3fSmrg
19847ec681f3Smrg   att_idx = subpass->depth_stencil_attachment->attachment;
19857ec681f3Smrg   if (cmd_buffer->state.attachments[att_idx].iview->image != image)
19867ec681f3Smrg      return;
198701e04c3fSmrg
19887ec681f3Smrg   if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
19897ec681f3Smrg      radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
19907ec681f3Smrg      radeon_emit(cs, ds_clear_value.stencil);
19917ec681f3Smrg      radeon_emit(cs, fui(ds_clear_value.depth));
19927ec681f3Smrg   } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
19937ec681f3Smrg      radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
19947ec681f3Smrg   } else {
19957ec681f3Smrg      assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
19967ec681f3Smrg      radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
19977ec681f3Smrg   }
199801e04c3fSmrg
19997ec681f3Smrg   /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
20007ec681f3Smrg    * only needed when clearing Z to 0.0.
20017ec681f3Smrg    */
20027ec681f3Smrg   if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
20037ec681f3Smrg      VkImageLayout layout = subpass->depth_stencil_attachment->layout;
20047ec681f3Smrg      bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
200501e04c3fSmrg
20067ec681f3Smrg      radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.attachments[att_idx].ds, iview,
20077ec681f3Smrg                                   layout, in_render_loop, false);
20087ec681f3Smrg   }
2009ed98bd31Smaya
20107ec681f3Smrg   cmd_buffer->state.context_roll_without_scissor_emitted = true;
201101e04c3fSmrg}
201201e04c3fSmrg
201301e04c3fSmrg/**
201401e04c3fSmrg * Set the clear depth/stencil values to the image's metadata.
201501e04c3fSmrg */
201601e04c3fSmrgstatic void
20177ec681f3Smrgradv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
20187ec681f3Smrg                           const VkImageSubresourceRange *range,
20197ec681f3Smrg                           VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
20207ec681f3Smrg{
20217ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
20227ec681f3Smrg   uint32_t level_count = radv_get_levelCount(image, range);
20237ec681f3Smrg
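   /* Both paths below use WRITE_DATA packets, which are laid out as:
    *
    *    [0] PKT3 header (opcode + dword count)
    *    [1] control word (DST_SEL / WR_CONFIRM / ENGINE_SEL)
    *    [2] destination VA, low 32 bits
    *    [3] destination VA, high 32 bits
    *    [4+] payload dwords written to memory
    */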
20247ec681f3Smrg   if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
20257ec681f3Smrg      uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
20267ec681f3Smrg
      /* Use a single WRITE_DATA packet when both aspects are updated,
       * which is the fastest path. */
20287ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + 2 * level_count, cmd_buffer->state.predicating));
20297ec681f3Smrg      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
20307ec681f3Smrg      radeon_emit(cs, va);
20317ec681f3Smrg      radeon_emit(cs, va >> 32);
20327ec681f3Smrg
20337ec681f3Smrg      for (uint32_t l = 0; l < level_count; l++) {
20347ec681f3Smrg         radeon_emit(cs, ds_clear_value.stencil);
20357ec681f3Smrg         radeon_emit(cs, fui(ds_clear_value.depth));
20367ec681f3Smrg      }
20377ec681f3Smrg   } else {
20387ec681f3Smrg      /* Otherwise we need one WRITE_DATA packet per level. */
20397ec681f3Smrg      for (uint32_t l = 0; l < level_count; l++) {
20407ec681f3Smrg         uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
20417ec681f3Smrg         unsigned value;
20427ec681f3Smrg
20437ec681f3Smrg         if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
20447ec681f3Smrg            value = fui(ds_clear_value.depth);
20457ec681f3Smrg            va += 4;
20467ec681f3Smrg         } else {
20477ec681f3Smrg            assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
20487ec681f3Smrg            value = ds_clear_value.stencil;
20497ec681f3Smrg         }
20507ec681f3Smrg
20517ec681f3Smrg         radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, cmd_buffer->state.predicating));
20527ec681f3Smrg         radeon_emit(cs,
20537ec681f3Smrg                     S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
20547ec681f3Smrg         radeon_emit(cs, va);
20557ec681f3Smrg         radeon_emit(cs, va >> 32);
20567ec681f3Smrg         radeon_emit(cs, value);
20577ec681f3Smrg      }
20587ec681f3Smrg   }
205901e04c3fSmrg}

/**
 * Update the TC-compat metadata value for this image.
 */
static void
radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                                   const VkImageSubresourceRange *range, uint32_t value)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   if (!cmd_buffer->device->physical_device->rad_info.has_tc_compat_zrange_bug)
      return;

   uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
   uint32_t level_count = radv_get_levelCount(image, range);

   radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + level_count, cmd_buffer->state.predicating));
   radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
   radeon_emit(cs, va);
   radeon_emit(cs, va >> 32);

   for (uint32_t l = 0; l < level_count; l++)
      radeon_emit(cs, value);
}

static void
radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer,
                                      const struct radv_image_view *iview,
                                      VkClearDepthStencilValue ds_clear_value)
{
   VkImageSubresourceRange range = {
      .aspectMask = iview->aspect_mask,
      .baseMipLevel = iview->base_mip,
      .levelCount = iview->level_count,
      .baseArrayLayer = iview->base_layer,
      .layerCount = iview->layer_count,
   };
   uint32_t cond_val;

   /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
    * depth clear value is 0.0f.
    */
   cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;

   radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
}
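
/* Illustrative note (an assumption about how this metadata is consumed, since
 * the consumer lives elsewhere in the driver): each per-level value written
 * above is expected to serve as a predicate for a conditional register write
 * that patches DB_Z_INFO.ZRANGE_PRECISION, so UINT_MAX effectively means "the
 * last clear was 0.0f, apply the workaround" and 0 means "leave it alone".
 */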

/**
 * Update the clear depth/stencil values for this image.
 */
void
radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
                              const struct radv_image_view *iview,
                              VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
{
   VkImageSubresourceRange range = {
      .aspectMask = iview->aspect_mask,
      .baseMipLevel = iview->base_mip,
      .levelCount = iview->level_count,
      .baseArrayLayer = iview->base_layer,
      .layerCount = iview->layer_count,
   };
   struct radv_image *image = iview->image;

   assert(radv_htile_enabled(image, range.baseMipLevel));

   radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);

   if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
   }

   radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
}

/**
 * Load the clear depth/stencil values from the image's metadata.
 */
static void
radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const struct radv_image *image = iview->image;
   VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
   uint64_t va = radv_get_ds_clear_value_va(image, iview->base_mip);
   unsigned reg_offset = 0, reg_count = 0;

   assert(radv_image_has_htile(image));

   if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
      ++reg_count;
   } else {
      ++reg_offset;
      va += 4;
   }
   if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
      ++reg_count;

   uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;

   if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
      radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
      radeon_emit(cs, reg_count);
   } else {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
                         (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, reg >> 2);
      radeon_emit(cs, 0);

      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
      radeon_emit(cs, 0);
   }
}
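
/* Worked example of the aspect handling above (derived directly from the
 * code): DB_STENCIL_CLEAR and DB_DEPTH_CLEAR are adjacent context registers,
 * matching the stencil/depth dword pair in the metadata slot, so:
 *
 *    depth + stencil: reg_offset = 0, reg_count = 2 (load both registers)
 *    depth only:      reg_offset = 1, reg_count = 1 (DB_DEPTH_CLEAR, va += 4)
 *    stencil only:    reg_offset = 0, reg_count = 1 (DB_STENCIL_CLEAR)
 */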

/*
 * With DCC, some clear colors don't require a CMASK (fast-clear) eliminate
 * pass before the image is used as a texture. This sets a predicate value
 * that determines whether the CMASK eliminate is required.
 */
void
radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                         const VkImageSubresourceRange *range, bool value)
{
   if (!image->fce_pred_offset)
      return;

   uint64_t pred_val = value;
   uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
   uint32_t level_count = radv_get_levelCount(image, range);
   uint32_t count = 2 * level_count;

   radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cmd_buffer->cs,
               S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
   radeon_emit(cmd_buffer->cs, va);
   radeon_emit(cmd_buffer->cs, va >> 32);

   for (uint32_t l = 0; l < level_count; l++) {
      radeon_emit(cmd_buffer->cs, pred_val);
      radeon_emit(cmd_buffer->cs, pred_val >> 32);
   }
}

/**
 * Update the DCC predicate to reflect the compression state.
 */
void
radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                         const VkImageSubresourceRange *range, bool value)
{
   if (image->dcc_pred_offset == 0)
      return;

   uint64_t pred_val = value;
   uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
   uint32_t level_count = radv_get_levelCount(image, range);
   uint32_t count = 2 * level_count;

   assert(radv_dcc_enabled(image, range->baseMipLevel));

   radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
   radeon_emit(cmd_buffer->cs,
               S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
   radeon_emit(cmd_buffer->cs, va);
   radeon_emit(cmd_buffer->cs, va >> 32);

   for (uint32_t l = 0; l < level_count; l++) {
      radeon_emit(cmd_buffer->cs, pred_val);
      radeon_emit(cmd_buffer->cs, pred_val >> 32);
   }
}
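
/* Illustrative note on the two predicate writers above: each mip level gets a
 * full 64-bit predicate slot (pred_val emitted as two dwords, low then high),
 * which is why the packet payload is 2 * level_count dwords. The assumption
 * here, since the consumer lives elsewhere, is that these slots feed
 * SET_PREDICATION, which operates on 64-bit memory values.
 */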

/**
 * Update the fast clear color values if the image is bound as a color buffer.
 */
static void
radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                                   int cb_idx, uint32_t color_values[2])
{
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t att_idx;

   if (!cmd_buffer->state.attachments || !subpass)
      return;

   att_idx = subpass->color_attachments[cb_idx].attachment;
   if (att_idx == VK_ATTACHMENT_UNUSED)
      return;

   if (cmd_buffer->state.attachments[att_idx].iview->image != image)
      return;

   radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
   radeon_emit(cs, color_values[0]);
   radeon_emit(cs, color_values[1]);

   cmd_buffer->state.context_roll_without_scissor_emitted = true;
}

/**
 * Set the clear color values to the image's metadata.
 */
static void
radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
                              const VkImageSubresourceRange *range, uint32_t color_values[2])
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint32_t level_count = radv_get_levelCount(image, range);
   uint32_t count = 2 * level_count;

   assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));

   if (radv_image_has_clear_value(image)) {
      uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);

      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, cmd_buffer->state.predicating));
      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);

      for (uint32_t l = 0; l < level_count; l++) {
         radeon_emit(cs, color_values[0]);
         radeon_emit(cs, color_values[1]);
      }
   } else {
      /* The image has no clear-value metadata, so the only value we can
       * accept here is the default (zero) clear color.
       */
      assert(color_values[0] == 0 && color_values[1] == 0);
   }
}
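
/* For illustration: like the depth/stencil path, each level's fast-clear
 * color occupies an 8-byte metadata slot holding the two packed
 * CB_COLOR*_CLEAR_WORD dwords, so a single WRITE_DATA covers the whole mip
 * range with 2 * level_count payload dwords.
 */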

/**
 * Update the clear color values for this image.
 */
void
radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
                                 const struct radv_image_view *iview, int cb_idx,
                                 uint32_t color_values[2])
{
   struct radv_image *image = iview->image;
   VkImageSubresourceRange range = {
      .aspectMask = iview->aspect_mask,
      .baseMipLevel = iview->base_mip,
      .levelCount = iview->level_count,
      .baseArrayLayer = iview->base_layer,
      .layerCount = iview->layer_count,
   };

   assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->base_mip));

   /* We don't need to update the clear value for images that are fast cleared with the
    * comp-to-single mode because the hardware gets the value from the image directly.
    */
   if (iview->image->support_comp_to_single)
      return;

   radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);

   radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
}

/**
 * Load the clear color values from the image's metadata.
 */
static void
radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview,
                               int cb_idx)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_image *image = iview->image;

   if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->base_mip))
      return;

   if (iview->image->support_comp_to_single)
      return;

   if (!radv_image_has_clear_value(image)) {
      uint32_t color_values[2] = {0, 0};
      radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
      return;
   }

   uint64_t va = radv_image_get_fast_clear_va(image, iview->base_mip);
   uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;

   if (cmd_buffer->device->physical_device->rad_info.has_load_ctx_reg_pkt) {
      radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
      radeon_emit(cs, 2);
   } else {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
                         COPY_DATA_COUNT_SEL);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
      radeon_emit(cs, reg >> 2);
      radeon_emit(cs, 0);

      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
      radeon_emit(cs, 0);
   }
}

/* GFX9+ metadata cache flushing workaround. Metadata cache coherency is
 * broken if the CB caches data of multiple mips of the same image at the
 * same time.
 *
 * Insert some flushes to avoid this.
 */
static void
radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;
   bool color_mip_changed = false;

   /* The entire workaround is not applicable before GFX9. */
   if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
      return;

   if (!framebuffer)
      return;

   for (int i = 0; i < subpass->color_count; ++i) {
      int idx = subpass->color_attachments[i].attachment;
      if (idx == VK_ATTACHMENT_UNUSED)
         continue;

      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;

      if ((radv_image_has_CB_metadata(iview->image) ||
           radv_dcc_enabled(iview->image, iview->base_mip) ||
           radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
          cmd_buffer->state.cb_mip[i] != iview->base_mip)
         color_mip_changed = true;

      cmd_buffer->state.cb_mip[i] = iview->base_mip;
   }

   if (color_mip_changed) {
      cmd_buffer->state.flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   }
}
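
/* Example of the hazard this works around (illustrative): if one subpass
 * renders to mip 1 of an image with CB metadata and a later subpass in the
 * same command buffer renders to mip 0, the recorded cb_mip[i] (1) differs
 * from the new base_mip (0), so a CB + CB-metadata flush is queued before
 * the CB can touch the second mip.
 */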

/* This function performs the mip-change flushes if any render target was left
 * on a non-zero mip level. That way we can assume at the start of the next
 * cmd_buffer that rendering to mip 0 doesn't need any flushes; since that is
 * the most common case, this saves some flushes. */
static void
radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
{
   /* The entire workaround is not applicable before GFX9. */
   if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
      return;

   bool need_color_mip_flush = false;
   for (unsigned i = 0; i < 8; ++i) {
      if (cmd_buffer->state.cb_mip[i]) {
         need_color_mip_flush = true;
         break;
      }
   }

   if (need_color_mip_flush) {
      cmd_buffer->state.flush_bits |=
         RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
   }

   memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
}

static struct radv_image *
radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_device *device = cmd_buffer->device;

   if (!device->vrs.image) {
      VkResult result;

      /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
      result = radv_device_init_vrs_state(device);
      if (result != VK_SUCCESS) {
         cmd_buffer->record_result = result;
         return NULL;
      }
   }

   return device->vrs.image;
}

static void
radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
{
   int i;
   struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   /* this may happen for inherited secondary recording */
   if (!framebuffer)
      return;

   for (i = 0; i < 8; ++i) {
      if (i >= subpass->color_count ||
          subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED) {
         radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
                                S_028C70_FORMAT(V_028C70_COLOR_INVALID));
         continue;
      }

      int idx = subpass->color_attachments[i].attachment;
      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
      VkImageLayout layout = subpass->color_attachments[i].layout;
      bool in_render_loop = subpass->color_attachments[i].in_render_loop;

      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, iview->image->bo);

      assert(iview->aspect_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
                                   VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
      radv_emit_fb_color_state(cmd_buffer, i, &cmd_buffer->state.attachments[idx].cb, iview, layout,
                               in_render_loop, cmd_buffer->state.attachments[idx].disable_dcc);

      radv_load_color_clear_metadata(cmd_buffer, iview, i);
   }

   if (subpass->depth_stencil_attachment) {
      int idx = subpass->depth_stencil_attachment->attachment;
      VkImageLayout layout = subpass->depth_stencil_attachment->layout;
      bool in_render_loop = subpass->depth_stencil_attachment->in_render_loop;
      struct radv_image_view *iview = cmd_buffer->state.attachments[idx].iview;
      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
                         cmd_buffer->state.attachments[idx].iview->image->bo);

      radv_emit_fb_ds_state(cmd_buffer, &cmd_buffer->state.attachments[idx].ds, iview, layout,
                            in_render_loop);

      if (radv_layout_is_htile_compressed(
             cmd_buffer->device, iview->image, layout, in_render_loop,
             radv_image_queue_family_mask(iview->image, cmd_buffer->queue_family_index,
                                          cmd_buffer->queue_family_index))) {
         /* Only load the depth/stencil fast clear values when
          * compressed rendering is enabled.
          */
         radv_load_ds_clear_metadata(cmd_buffer, iview);
      }
   } else if (subpass->vrs_attachment && cmd_buffer->device->vrs.image) {
      /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
       * bind our internal depth buffer that contains the VRS data as part of HTILE.
       */
      VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
      struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;
      struct radv_image *image = cmd_buffer->device->vrs.image;
      struct radv_ds_buffer_info ds;
      struct radv_image_view iview;

      radv_image_view_init(&iview, cmd_buffer->device,
                           &(VkImageViewCreateInfo){
                              .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
                              .image = radv_image_to_handle(image),
                              .viewType = radv_meta_get_view_type(image),
                              .format = image->vk_format,
                              .subresourceRange =
                                 {
                                    .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
                                    .baseMipLevel = 0,
                                    .levelCount = 1,
                                    .baseArrayLayer = 0,
                                    .layerCount = 1,
                                 },
                           },
                           NULL);

      radv_initialise_vrs_surface(image, htile_buffer, &ds);

      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, htile_buffer->bo);

      radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, layout, false);

      radv_image_view_finish(&iview);
   } else {
      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9)
         radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
      else
         radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);

      radeon_emit(cmd_buffer->cs, S_028040_FORMAT(V_028040_Z_INVALID));       /* DB_Z_INFO */
      radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
   }
   radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
                          S_028208_BR_X(framebuffer->width) | S_028208_BR_Y(framebuffer->height));

   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX8) {
      bool disable_constant_encode =
         cmd_buffer->device->physical_device->rad_info.has_dcc_constant_encode;
      enum chip_class chip_class = cmd_buffer->device->physical_device->rad_info.chip_class;
      uint8_t watermark = chip_class >= GFX10 ? 6 : 4;

      radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
                             S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(chip_class <= GFX9) |
                                S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
                                S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
   }

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
}

static void
radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer, bool indirect)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->index_type != state->last_index_type) {
      if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
         radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
                                    R_03090C_VGT_INDEX_TYPE, 2, state->index_type);
      } else {
         radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
         radeon_emit(cs, state->index_type);
      }

      state->last_index_type = state->index_type;
   }

   /* For direct indexed draws we use DRAW_INDEX_2, which already includes
    * the index_va and max_index_count. */
   if (!indirect)
      return;

   radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
   radeon_emit(cs, state->index_va);
   radeon_emit(cs, state->index_va >> 32);

   radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
   radeon_emit(cs, state->max_index_count);

   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
}
260701e04c3fSmrg
26087ec681f3Smrgvoid
26097ec681f3Smrgradv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
26107ec681f3Smrg{
26117ec681f3Smrg   bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
26127ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
26137ec681f3Smrg   uint32_t pa_sc_mode_cntl_1 = pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
26147ec681f3Smrg   uint32_t db_count_control;
26157ec681f3Smrg
26167ec681f3Smrg   if (!cmd_buffer->state.active_occlusion_queries) {
26177ec681f3Smrg      if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
26187ec681f3Smrg         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
26197ec681f3Smrg             pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
26207ec681f3Smrg            /* Re-enable out-of-order rasterization if the
26217ec681f3Smrg             * bound pipeline supports it and if it's has
26227ec681f3Smrg             * been disabled before starting any perfect
26237ec681f3Smrg             * occlusion queries.
26247ec681f3Smrg             */
26257ec681f3Smrg            radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
26267ec681f3Smrg         }
26277ec681f3Smrg      }
26287ec681f3Smrg      db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
26297ec681f3Smrg   } else {
26307ec681f3Smrg      const struct radv_subpass *subpass = cmd_buffer->state.subpass;
26317ec681f3Smrg      uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
26327ec681f3Smrg      bool gfx10_perfect =
26337ec681f3Smrg         cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10 && has_perfect_queries;
26347ec681f3Smrg
26357ec681f3Smrg      if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
26367ec681f3Smrg         /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
26377ec681f3Smrg          * covered tiles, discards, and early depth testing. For more details,
26387ec681f3Smrg          * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
26397ec681f3Smrg         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
26407ec681f3Smrg                            S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
26417ec681f3Smrg                            S_028004_SAMPLE_RATE(sample_rate) | S_028004_ZPASS_ENABLE(1) |
26427ec681f3Smrg                            S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
26437ec681f3Smrg
26447ec681f3Smrg         if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
26457ec681f3Smrg             pipeline->graphics.disable_out_of_order_rast_for_occlusion && has_perfect_queries) {
26467ec681f3Smrg            /* If the bound pipeline has enabled
26477ec681f3Smrg             * out-of-order rasterization, we should
26487ec681f3Smrg             * disable it before starting any perfect
26497ec681f3Smrg             * occlusion queries.
26507ec681f3Smrg             */
26517ec681f3Smrg            pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
26527ec681f3Smrg
26537ec681f3Smrg            radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
26547ec681f3Smrg         }
26557ec681f3Smrg      } else {
26567ec681f3Smrg         db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) | S_028004_SAMPLE_RATE(sample_rate);
26577ec681f3Smrg      }
26587ec681f3Smrg   }
26597ec681f3Smrg
26607ec681f3Smrg   radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_COUNT_CONTROL, db_count_control);
26617ec681f3Smrg
26627ec681f3Smrg   cmd_buffer->state.context_roll_without_scissor_emitted = true;
26637ec681f3Smrg}

unsigned
radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
{
   /* instance_rate_vs_prologs is a flattened array of arrays of arrays of
    * different sizes, or equivalently a single array sorted in ascending
    * order using:
    * - total number of attributes
    * - number of instanced attributes
    * - index of first instanced attribute
    */

   /* From total number of attributes to offset. */
   static const uint16_t total_to_offset[16] = {0,   1,   4,   10,  20,  35,  56,  84,
                                                120, 165, 220, 286, 364, 455, 560, 680};
   unsigned start_index = total_to_offset[num_attributes - 1];

   /* From number of instanced attributes to offset. This would require a different LUT depending on
    * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
    * attributes.
    */
   static const uint8_t count_to_offset_total16[16] = {0,   16,  31,  45,  58,  70,  81,  91,
                                                       100, 108, 115, 121, 126, 130, 133, 135};
   unsigned count = util_bitcount(instance_rate_inputs);
   unsigned offset_from_start_index =
      count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));

   unsigned first = ffs(instance_rate_inputs) - 1;
   return start_index + offset_from_start_index + first;
}
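
/* Worked example (illustrative): num_attributes = 4, instance_rate_inputs =
 * 0b0110 (bits 1 and 2). start_index = total_to_offset[3] = 10; count = 2, so
 * offset_from_start_index = count_to_offset_total16[1] - (16 - 4) * (2 - 1)
 * = 16 - 12 = 4; first = ffs(0b0110) - 1 = 1. Resulting index: 10 + 4 + 1 = 15.
 * Note the caller only takes this path when the instanced attributes form one
 * consecutive bit range, which is what makes this flat indexing valid.
 */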

union vs_prolog_key_header {
   struct {
      uint32_t key_size : 8;
      uint32_t num_attributes : 6;
      uint32_t as_ls : 1;
      uint32_t is_ngg : 1;
      uint32_t wave32 : 1;
      uint32_t next_stage : 3;
      uint32_t instance_rate_inputs : 1;
      uint32_t alpha_adjust_lo : 1;
      uint32_t alpha_adjust_hi : 1;
      uint32_t misaligned_mask : 1;
      uint32_t post_shuffle : 1;
      uint32_t nontrivial_divisors : 1;
      /* Ensures the padding bits are zeroed, so that keys hash and compare
       * deterministically even though the bits themselves are unused.
       */
      uint32_t padding0 : 6;
   };
   uint32_t v;
};
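
/* Illustrative layout of a complete prolog hash key, as built by
 * lookup_vs_prolog() below: word 0 is the header; it is followed, in this
 * fixed order and only when the corresponding header bit is set, by
 * instance_rate_inputs, nontrivial_divisors, misaligned_mask together with
 * the packed per-attribute format bytes (padded to a dword boundary) and
 * post_shuffle, then alpha_adjust_lo and alpha_adjust_hi. header.key_size
 * records the total size in bytes, so hashing and comparison only ever
 * touch the words actually present.
 */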

uint32_t
radv_hash_vs_prolog(const void *key_)
{
   const uint32_t *key = key_;
   union vs_prolog_key_header header;
   header.v = key[0];
   return _mesa_hash_data(key, header.key_size);
}

bool
radv_cmp_vs_prolog(const void *a_, const void *b_)
{
   const uint32_t *a = a_;
   const uint32_t *b = b_;
   if (a[0] != b[0])
      return false;

   union vs_prolog_key_header header;
   header.v = a[0];
   return memcmp(a, b, header.key_size) == 0;
}

static struct radv_shader_prolog *
lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
                 uint32_t *nontrivial_divisors)
{
   STATIC_ASSERT(sizeof(union vs_prolog_key_header) == 4);
   assert(vs_shader->info.vs.dynamic_inputs);

   const struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
   const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   struct radv_device *device = cmd_buffer->device;

   unsigned num_attributes = pipeline->last_vertex_attrib_bit;
   uint32_t attribute_mask = BITFIELD_MASK(num_attributes);

   uint32_t instance_rate_inputs = state->instance_rate_inputs & attribute_mask;
   *nontrivial_divisors = state->nontrivial_divisors & attribute_mask;
   enum chip_class chip = device->physical_device->rad_info.chip_class;
   const uint32_t misaligned_mask = chip == GFX6 || chip >= GFX10 ? cmd_buffer->state.vbo_misaligned_mask : 0;

   /* try to use a pre-compiled prolog first */
   struct radv_shader_prolog *prolog = NULL;
   if (pipeline->can_use_simple_input &&
       (!vs_shader->info.vs.as_ls || !instance_rate_inputs) &&
       !misaligned_mask && !state->alpha_adjust_lo && !state->alpha_adjust_hi) {
      if (!instance_rate_inputs) {
         prolog = device->simple_vs_prologs[num_attributes - 1];
      } else if (num_attributes <= 16 && !*nontrivial_divisors &&
                 util_bitcount(instance_rate_inputs) ==
                    (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
         unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
         prolog = device->instance_rate_vs_prologs[index];
      }
   }
   if (prolog)
      return prolog;

   /* if we couldn't use a pre-compiled prolog, find one in the cache or create one */
   uint32_t key_words[16];
   unsigned key_size = 1;

   struct radv_vs_prolog_key key;
   key.state = state;
   key.num_attributes = num_attributes;
   key.misaligned_mask = misaligned_mask;
   /* The instance ID input VGPR is placed differently when as_ls=true. */
   key.as_ls = vs_shader->info.vs.as_ls && instance_rate_inputs;
   key.is_ngg = vs_shader->info.is_ngg;
   key.wave32 = vs_shader->info.wave_size == 32;
   key.next_stage = pipeline->next_vertex_stage;

   union vs_prolog_key_header header;
   header.v = 0;
   header.num_attributes = num_attributes;
   header.as_ls = key.as_ls;
   header.is_ngg = key.is_ngg;
   header.wave32 = key.wave32;
   header.next_stage = key.next_stage;

   if (instance_rate_inputs & ~*nontrivial_divisors) {
      header.instance_rate_inputs = true;
      key_words[key_size++] = instance_rate_inputs;
   }
   if (*nontrivial_divisors) {
      header.nontrivial_divisors = true;
      key_words[key_size++] = *nontrivial_divisors;
   }
   if (misaligned_mask) {
      header.misaligned_mask = true;
      key_words[key_size++] = misaligned_mask;

      uint8_t *formats = (uint8_t *)&key_words[key_size];
      unsigned num_formats = 0;
      u_foreach_bit(index, misaligned_mask) formats[num_formats++] = state->formats[index];
      while (num_formats & 0x3)
         formats[num_formats++] = 0;
      key_size += num_formats / 4u;

      if (state->post_shuffle & attribute_mask) {
         header.post_shuffle = true;
         key_words[key_size++] = state->post_shuffle & attribute_mask;
      }
   }
   if (state->alpha_adjust_lo & attribute_mask) {
      header.alpha_adjust_lo = true;
      key_words[key_size++] = state->alpha_adjust_lo & attribute_mask;
   }
   if (state->alpha_adjust_hi & attribute_mask) {
      header.alpha_adjust_hi = true;
      key_words[key_size++] = state->alpha_adjust_hi & attribute_mask;
   }

   header.key_size = key_size * sizeof(key_words[0]);
   key_words[0] = header.v;

   uint32_t hash = radv_hash_vs_prolog(key_words);

   if (cmd_buffer->state.emitted_vs_prolog &&
       cmd_buffer->state.emitted_vs_prolog_key_hash == hash &&
       radv_cmp_vs_prolog(key_words, cmd_buffer->state.emitted_vs_prolog_key))
      return cmd_buffer->state.emitted_vs_prolog;

   u_rwlock_rdlock(&device->vs_prologs_lock);
   struct hash_entry *prolog_entry =
      _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
   u_rwlock_rdunlock(&device->vs_prologs_lock);

   if (!prolog_entry) {
      u_rwlock_wrlock(&device->vs_prologs_lock);
      prolog_entry = _mesa_hash_table_search_pre_hashed(device->vs_prologs, hash, key_words);
      if (prolog_entry) {
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return prolog_entry->data;
      }

      prolog = radv_create_vs_prolog(device, &key);
      uint32_t *key2 = malloc(key_size * 4);
      if (!prolog || !key2) {
         radv_prolog_destroy(device, prolog);
         free(key2);
         u_rwlock_wrunlock(&device->vs_prologs_lock);
         return NULL;
      }
      memcpy(key2, key_words, key_size * 4);
      _mesa_hash_table_insert_pre_hashed(device->vs_prologs, hash, key2, prolog);

      u_rwlock_wrunlock(&device->vs_prologs_lock);
      return prolog;
   }

   return prolog_entry->data;
}

static void
emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
                 struct radv_shader_prolog *prolog, bool pipeline_is_dirty)
{
   /* no need to re-emit anything in this case */
   if (cmd_buffer->state.emitted_vs_prolog == prolog && !pipeline_is_dirty)
      return;

   enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   uint64_t prolog_va = radv_buffer_get_va(prolog->bo) + prolog->alloc->offset;

   assert(cmd_buffer->state.emitted_pipeline == cmd_buffer->state.pipeline);
   assert(vs_shader->info.num_input_sgprs <= prolog->num_preserved_sgprs);

   uint32_t rsrc1 = vs_shader->config.rsrc1;
   if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(vs_shader->config.rsrc1))
      rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);

   /* The main shader must not use fewer VGPRs than the prolog, otherwise shared vgprs might not
    * work.
    */
   assert(G_00B848_VGPRS(vs_shader->config.rsrc1) >= G_00B848_VGPRS(prolog->rsrc1));

   unsigned pgm_lo_reg = R_00B120_SPI_SHADER_PGM_LO_VS;
   unsigned rsrc1_reg = R_00B128_SPI_SHADER_PGM_RSRC1_VS;
   if (vs_shader->info.is_ngg || pipeline->shaders[MESA_SHADER_GEOMETRY] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ? R_00B320_SPI_SHADER_PGM_LO_ES : R_00B210_SPI_SHADER_PGM_LO_ES;
      rsrc1_reg = R_00B228_SPI_SHADER_PGM_RSRC1_GS;
   } else if (pipeline->shaders[MESA_SHADER_TESS_CTRL] == vs_shader) {
      pgm_lo_reg = chip >= GFX10 ? R_00B520_SPI_SHADER_PGM_LO_LS : R_00B410_SPI_SHADER_PGM_LO_LS;
      rsrc1_reg = R_00B428_SPI_SHADER_PGM_RSRC1_HS;
   } else if (vs_shader->info.vs.as_ls) {
      pgm_lo_reg = R_00B520_SPI_SHADER_PGM_LO_LS;
      rsrc1_reg = R_00B528_SPI_SHADER_PGM_RSRC1_LS;
   } else if (vs_shader->info.vs.as_es) {
      pgm_lo_reg = R_00B320_SPI_SHADER_PGM_LO_ES;
      rsrc1_reg = R_00B328_SPI_SHADER_PGM_RSRC1_ES;
   }

   radeon_set_sh_reg_seq(cmd_buffer->cs, pgm_lo_reg, 2);
   radeon_emit(cmd_buffer->cs, prolog_va >> 8);
   radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(prolog_va >> 40));

   if (chip < GFX10)
      radeon_set_sh_reg(cmd_buffer->cs, rsrc1_reg, rsrc1);
   else
      assert(rsrc1 == vs_shader->config.rsrc1);

   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, prolog->bo);
}

static void
emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_variant *vs_shader,
                   uint32_t nontrivial_divisors, bool pipeline_is_dirty)
{
   /* no need to re-emit anything in this case */
   if (!nontrivial_divisors && !pipeline_is_dirty && cmd_buffer->state.emitted_vs_prolog &&
       !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
      return;

   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
   uint64_t input_va = radv_shader_variant_get_va(vs_shader);

   if (nontrivial_divisors) {
      unsigned inputs_offset;
      uint32_t *inputs;
      unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
         return;

      *(inputs++) = input_va;
      *(inputs++) = input_va >> 32;

      u_foreach_bit(index, nontrivial_divisors)
      {
         uint32_t div = state->divisors[index];
         if (div == 0) {
            *(inputs++) = 0;
            *(inputs++) = 1;
         } else if (util_is_power_of_two_or_zero(div)) {
            *(inputs++) = util_logbase2(div) | (1 << 8);
            *(inputs++) = 0xffffffffu;
         } else {
            struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
            *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
            *(inputs++) = info.multiplier;
         }
      }

      input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
   }

   struct radv_userdata_info *loc =
      &vs_shader->info.user_sgprs_locs.shader_data[AC_UD_VS_PROLOG_INPUTS];
   uint32_t base_reg = cmd_buffer->state.pipeline->user_data_0[MESA_SHADER_VERTEX];
   assert(loc->sgpr_idx != -1);
   assert(loc->num_sgprs == 2);
   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
                            input_va, true);
}
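
/* Illustrative layout of the buffer uploaded above when any divisor is
 * nontrivial: two dwords holding the main shader's VA, then one 8-byte pair
 * per nontrivial divisor, encoded as follows:
 *
 *    divisor 0:       {0, 1}
 *    power-of-two d:  {log2(d) | (1 << 8), 0xffffffff}
 *    any other d:     {pre_shift | (increment << 8) | (post_shift << 16),
 *                      multiplier}   (from util_compute_fast_udiv_info)
 *
 * The VS_PROLOG_INPUTS SGPR pair then holds either this buffer's VA or, when
 * all divisors are trivial, the main shader's VA directly.
 */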

static void
radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);

   if (!vs_shader->info.vs.has_prolog)
      return;

   uint32_t nontrivial_divisors;
   struct radv_shader_prolog *prolog =
      lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
   if (!prolog) {
      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
      return;
   }
   emit_prolog_regs(cmd_buffer, vs_shader, prolog, pipeline_is_dirty);
   emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors, pipeline_is_dirty);

   cmd_buffer->state.emitted_vs_prolog = prolog;
}

static void
radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
{
   uint64_t states =
      cmd_buffer->state.dirty & cmd_buffer->state.emitted_pipeline->graphics.needed_dynamic_state;

   if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
      radv_emit_viewport(cmd_buffer);

   if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
       !cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
      radv_emit_scissor(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
      radv_emit_line_width(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS)
      radv_emit_blend_constants(cmd_buffer);

   if (states &
       (RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK |
        RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK))
      radv_emit_stencil(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS)
      radv_emit_depth_bounds(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS)
      radv_emit_depth_bias(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE)
      radv_emit_discard_rectangle(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)
      radv_emit_sample_locations(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE)
      radv_emit_line_stipple(cmd_buffer);

   if (states & (RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
                 RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE))
      radv_emit_culling(cmd_buffer, states);

   if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY)
      radv_emit_primitive_topology(cmd_buffer);

   if (states &
       (RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE |
        RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP | RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE |
        RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE | RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP))
      radv_emit_depth_control(cmd_buffer, states);

   if (states & RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP)
      radv_emit_stencil_control(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE)
      radv_emit_fragment_shading_rate(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
      radv_emit_primitive_restart_enable(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE)
      radv_emit_rasterizer_discard_enable(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP)
      radv_emit_logic_op(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE)
      radv_emit_color_write_enable(cmd_buffer);

   if (states & RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT)
      radv_emit_vertex_input(cmd_buffer, pipeline_is_dirty);

   cmd_buffer->state.dirty &= ~states;
}
306701e04c3fSmrg
30687ec681f3Smrgstatic void
30697ec681f3Smrgradv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
30707ec681f3Smrg{
30717ec681f3Smrg   struct radv_descriptor_state *descriptors_state =
30727ec681f3Smrg      radv_get_descriptors_state(cmd_buffer, bind_point);
30737ec681f3Smrg   struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
30747ec681f3Smrg   unsigned bo_offset;
307501e04c3fSmrg
30767ec681f3Smrg   if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr,
30777ec681f3Smrg                                    &bo_offset))
30787ec681f3Smrg      return;
307901e04c3fSmrg
30807ec681f3Smrg   set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
30817ec681f3Smrg   set->header.va += bo_offset;
30827ec681f3Smrg}
308301e04c3fSmrg
30847ec681f3Smrgstatic void
30857ec681f3Smrgradv_flush_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
30867ec681f3Smrg                                    struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
30877ec681f3Smrg{
30887ec681f3Smrg   struct radv_descriptor_state *descriptors_state =
30897ec681f3Smrg      radv_get_descriptors_state(cmd_buffer, bind_point);
30907ec681f3Smrg   uint32_t size = MAX_SETS * 4;
30917ec681f3Smrg   uint32_t offset;
30927ec681f3Smrg   void *ptr;
30937ec681f3Smrg
30947ec681f3Smrg   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
30957ec681f3Smrg      return;
30967ec681f3Smrg
30977ec681f3Smrg   for (unsigned i = 0; i < MAX_SETS; i++) {
30987ec681f3Smrg      uint32_t *uptr = ((uint32_t *)ptr) + i;
30997ec681f3Smrg      uint64_t set_va = 0;
31007ec681f3Smrg      struct radv_descriptor_set *set = descriptors_state->sets[i];
31017ec681f3Smrg      if (descriptors_state->valid & (1u << i))
31027ec681f3Smrg         set_va = set->header.va;
31037ec681f3Smrg      uptr[0] = set_va & 0xffffffff;
31047ec681f3Smrg   }
31057ec681f3Smrg
31067ec681f3Smrg   uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
31077ec681f3Smrg   va += offset;
31087ec681f3Smrg
31097ec681f3Smrg   if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
31107ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_VERTEX])
31117ec681f3Smrg         radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX,
31127ec681f3Smrg                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
31137ec681f3Smrg
31147ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_FRAGMENT])
31157ec681f3Smrg         radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
31167ec681f3Smrg                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
31177ec681f3Smrg
31187ec681f3Smrg      if (radv_pipeline_has_gs(pipeline))
31197ec681f3Smrg         radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_GEOMETRY,
31207ec681f3Smrg                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
31217ec681f3Smrg
31227ec681f3Smrg      if (radv_pipeline_has_tess(pipeline))
31237ec681f3Smrg         radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_CTRL,
31247ec681f3Smrg                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
31257ec681f3Smrg
31267ec681f3Smrg      if (radv_pipeline_has_tess(pipeline))
31277ec681f3Smrg         radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_TESS_EVAL,
31287ec681f3Smrg                                    AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
31297ec681f3Smrg   } else {
31307ec681f3Smrg      radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_COMPUTE,
31317ec681f3Smrg                                 AC_UD_INDIRECT_DESCRIPTOR_SETS, va);
31327ec681f3Smrg   }
31337ec681f3Smrg}
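                /* For illustration: the table uploaded above is a flat array of
                 * MAX_SETS 32-bit entries, one per descriptor set slot, conceptually:
                 *
                 *    dwords[i] = (valid & (1u << i)) ? (uint32_t)sets[i]->header.va : 0;
                 *
                 * Shaders reach descriptor sets indirectly by loading from this table
                 * through the AC_UD_INDIRECT_DESCRIPTOR_SETS user SGPR. Only the low
                 * 32 bits of each VA are stored, which assumes descriptor memory lives
                 * in a VA range whose upper bits are already known to the shader.
                 */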
313401e04c3fSmrg
31357ec681f3Smrgstatic void
31367ec681f3Smrgradv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
31377ec681f3Smrg                       struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
31387ec681f3Smrg{
31397ec681f3Smrg   struct radv_descriptor_state *descriptors_state =
31407ec681f3Smrg      radv_get_descriptors_state(cmd_buffer, bind_point);
31417ec681f3Smrg   bool flush_indirect_descriptors;
314201e04c3fSmrg
31437ec681f3Smrg   if (!descriptors_state->dirty)
31447ec681f3Smrg      return;
314501e04c3fSmrg
31467ec681f3Smrg   if (descriptors_state->push_dirty)
31477ec681f3Smrg      radv_flush_push_descriptors(cmd_buffer, bind_point);
314801e04c3fSmrg
31497ec681f3Smrg   flush_indirect_descriptors = pipeline && pipeline->need_indirect_descriptor_sets;
315001e04c3fSmrg
31517ec681f3Smrg   if (flush_indirect_descriptors)
31527ec681f3Smrg      radv_flush_indirect_descriptor_sets(cmd_buffer, pipeline, bind_point);
315301e04c3fSmrg
31547ec681f3Smrg   ASSERTED unsigned cdw_max =
31557ec681f3Smrg      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MAX_SETS * MESA_SHADER_STAGES * 4);
315601e04c3fSmrg
31577ec681f3Smrg   if (pipeline) {
31587ec681f3Smrg      if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
31597ec681f3Smrg         radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state,
31607ec681f3Smrg                                       MESA_SHADER_COMPUTE);
31617ec681f3Smrg      } else {
31627ec681f3Smrg         radv_foreach_stage(stage, stages)
31637ec681f3Smrg         {
31647ec681f3Smrg            if (!pipeline->shaders[stage])
31657ec681f3Smrg               continue;
316601e04c3fSmrg
31677ec681f3Smrg            radv_emit_descriptor_pointers(cmd_buffer, pipeline, descriptors_state, stage);
31687ec681f3Smrg         }
31697ec681f3Smrg      }
31707ec681f3Smrg   }
317101e04c3fSmrg
31727ec681f3Smrg   descriptors_state->dirty = 0;
31737ec681f3Smrg   descriptors_state->push_dirty = false;
317401e04c3fSmrg
31757ec681f3Smrg   assert(cmd_buffer->cs->cdw <= cdw_max);
317601e04c3fSmrg
31777ec681f3Smrg   if (unlikely(cmd_buffer->device->trace_bo))
31787ec681f3Smrg      radv_save_descriptors(cmd_buffer, bind_point);
317901e04c3fSmrg}
318001e04c3fSmrg
31817ec681f3Smrgstatic bool
31827ec681f3Smrgradv_shader_loads_push_constants(struct radv_pipeline *pipeline, gl_shader_stage stage)
318301e04c3fSmrg{
31847ec681f3Smrg   struct radv_userdata_info *loc =
31857ec681f3Smrg      radv_lookup_user_sgpr(pipeline, stage, AC_UD_PUSH_CONSTANTS);
31867ec681f3Smrg   return loc->sgpr_idx != -1;
31877ec681f3Smrg}
318801e04c3fSmrg
31897ec681f3Smrgstatic void
31907ec681f3Smrgradv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
31917ec681f3Smrg                     struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
31927ec681f3Smrg{
31937ec681f3Smrg   struct radv_descriptor_state *descriptors_state =
31947ec681f3Smrg      radv_get_descriptors_state(cmd_buffer, bind_point);
31957ec681f3Smrg   struct radv_shader_variant *shader, *prev_shader;
31967ec681f3Smrg   bool need_push_constants = false;
31977ec681f3Smrg   unsigned offset;
31987ec681f3Smrg   void *ptr;
31997ec681f3Smrg   uint64_t va;
32007ec681f3Smrg   uint32_t internal_stages;
32017ec681f3Smrg   uint32_t dirty_stages = 0;
32027ec681f3Smrg
32037ec681f3Smrg   stages &= cmd_buffer->push_constant_stages;
32047ec681f3Smrg   if (!stages || (!pipeline->push_constant_size && !pipeline->dynamic_offset_count))
32057ec681f3Smrg      return;
32067ec681f3Smrg
32077ec681f3Smrg   internal_stages = stages;
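                   /* Note: RADV executes ray tracing pipelines on the compute bind
                    * point, so the two alias the same hardware state. Flushing
                    * constants for one must therefore re-dirty the stages of the
                    * other, which is what the dirty_stages values below implement.
                    */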
32087ec681f3Smrg   switch (bind_point) {
32097ec681f3Smrg   case VK_PIPELINE_BIND_POINT_GRAPHICS:
32107ec681f3Smrg      break;
32117ec681f3Smrg   case VK_PIPELINE_BIND_POINT_COMPUTE:
32127ec681f3Smrg      dirty_stages = RADV_RT_STAGE_BITS;
32137ec681f3Smrg      break;
32147ec681f3Smrg   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
32157ec681f3Smrg      internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
32167ec681f3Smrg      dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
32177ec681f3Smrg      break;
32187ec681f3Smrg   default:
32197ec681f3Smrg      unreachable("Unhandled bind point");
32207ec681f3Smrg   }
32217ec681f3Smrg
32227ec681f3Smrg   radv_foreach_stage(stage, internal_stages)
32237ec681f3Smrg   {
32247ec681f3Smrg      shader = radv_get_shader(pipeline, stage);
32257ec681f3Smrg      if (!shader)
32267ec681f3Smrg         continue;
32277ec681f3Smrg
32287ec681f3Smrg      need_push_constants |= radv_shader_loads_push_constants(pipeline, stage);
32297ec681f3Smrg
32307ec681f3Smrg      uint8_t base = shader->info.min_push_constant_used / 4;
32317ec681f3Smrg
32327ec681f3Smrg      radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, AC_UD_INLINE_PUSH_CONSTANTS,
32337ec681f3Smrg                                   (uint32_t *)&cmd_buffer->push_constants[base * 4]);
32347ec681f3Smrg   }
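                   /* Worked example (hypothetical values): if a shader's
                    * min_push_constant_used is 16 bytes, base is 4 dwords, so the
                    * inline constants emitted above start at push_constants[16];
                    * only the range the shader actually reads goes into user SGPRs.
                    */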
32357ec681f3Smrg
32367ec681f3Smrg   if (need_push_constants) {
32377ec681f3Smrg      if (!radv_cmd_buffer_upload_alloc(
32387ec681f3Smrg             cmd_buffer, pipeline->push_constant_size + 16 * pipeline->dynamic_offset_count, &offset,
32397ec681f3Smrg             &ptr))
32407ec681f3Smrg         return;
32417ec681f3Smrg
32427ec681f3Smrg      memcpy(ptr, cmd_buffer->push_constants, pipeline->push_constant_size);
32437ec681f3Smrg      memcpy((char *)ptr + pipeline->push_constant_size, descriptors_state->dynamic_buffers,
32447ec681f3Smrg             16 * pipeline->dynamic_offset_count);
32457ec681f3Smrg
32467ec681f3Smrg      va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
32477ec681f3Smrg      va += offset;
32487ec681f3Smrg
32497ec681f3Smrg      ASSERTED unsigned cdw_max =
32507ec681f3Smrg         radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, MESA_SHADER_STAGES * 4);
32517ec681f3Smrg
32527ec681f3Smrg      prev_shader = NULL;
32537ec681f3Smrg      radv_foreach_stage(stage, internal_stages)
32547ec681f3Smrg      {
32557ec681f3Smrg         shader = radv_get_shader(pipeline, stage);
32567ec681f3Smrg
32577ec681f3Smrg         /* Avoid redundantly emitting the address for merged stages. */
32587ec681f3Smrg         if (shader && shader != prev_shader) {
32597ec681f3Smrg            radv_emit_userdata_address(cmd_buffer, pipeline, stage, AC_UD_PUSH_CONSTANTS, va);
32607ec681f3Smrg
32617ec681f3Smrg            prev_shader = shader;
32627ec681f3Smrg         }
32637ec681f3Smrg      }
32647ec681f3Smrg      assert(cmd_buffer->cs->cdw <= cdw_max);
32657ec681f3Smrg   }
32667ec681f3Smrg
32677ec681f3Smrg   cmd_buffer->push_constant_stages &= ~stages;
32687ec681f3Smrg   cmd_buffer->push_constant_stages |= dirty_stages;
32697ec681f3Smrg}
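                /* For illustration, the upload built above has this layout:
                 *
                 *    bytes [0, push_constant_size)      raw push constant data
                 *    then 16 bytes per dynamic offset   dynamic buffer descriptors
                 *
                 * with AC_UD_PUSH_CONSTANTS pointing each stage's user SGPR at the
                 * base VA of this allocation.
                 */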
32707ec681f3Smrg
32717ec681f3Smrgenum radv_dst_sel {
32727ec681f3Smrg   DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
32737ec681f3Smrg                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
32747ec681f3Smrg   DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) |
32757ec681f3Smrg                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
32767ec681f3Smrg   DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
32777ec681f3Smrg                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
32787ec681f3Smrg   DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
32797ec681f3Smrg                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1),
32807ec681f3Smrg   DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
32817ec681f3Smrg                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
32827ec681f3Smrg   DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
32837ec681f3Smrg                  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W),
32847ec681f3Smrg};
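                /* For illustration: each DST_SEL_* value above encodes the destination
                 * swizzle of a buffer resource descriptor, e.g. DST_SEL_X001 makes a
                 * one-channel fetch return (x, 0, 0, 1), and DST_SEL_ZYXW swaps X and Z
                 * (used below for post-shuffled, BGRA-style vertex formats).
                 */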
328501e04c3fSmrg
32867ec681f3Smrgstatic const uint32_t data_format_dst_sel[] = {
32877ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001,
32887ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001,
32897ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001,
32907ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01,
32917ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001,
32927ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01,
32937ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1,
32947ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1,
32957ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW,
32967ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW,
32977ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW,
32987ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01,
32997ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW,
33007ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1,
33017ec681f3Smrg   [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW,
33027ec681f3Smrg};
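                /* The table above maps each hardware buffer data format to the swizzle
                 * that fills its missing components with (0, 0, 0, 1); e.g. a two-channel
                 * format such as 16_16 expands to (x, y, 0, 1).
                 */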
330301e04c3fSmrg
33047ec681f3Smrgstatic void
33057ec681f3Smrgradv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
33067ec681f3Smrg{
33077ec681f3Smrg   if ((pipeline_is_dirty || (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)) &&
33087ec681f3Smrg       cmd_buffer->state.pipeline->vb_desc_usage_mask) {
33097ec681f3Smrg      struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
33107ec681f3Smrg      struct radv_shader_variant *vs_shader = radv_get_shader(pipeline, MESA_SHADER_VERTEX);
33117ec681f3Smrg      enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
33127ec681f3Smrg      unsigned vb_offset;
33137ec681f3Smrg      void *vb_ptr;
33147ec681f3Smrg      unsigned desc_index = 0;
33157ec681f3Smrg      uint32_t mask = pipeline->vb_desc_usage_mask;
33167ec681f3Smrg      uint64_t va;
33177ec681f3Smrg      struct radv_vs_input_state *vs_state =
33187ec681f3Smrg         vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL;
33197ec681f3Smrg
33207ec681f3Smrg      /* Allocate some descriptor state for vertex buffers. */
33217ec681f3Smrg      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, pipeline->vb_desc_alloc_size, &vb_offset, &vb_ptr))
33227ec681f3Smrg         return;
33237ec681f3Smrg
33247ec681f3Smrg      assert(!vs_state || pipeline->use_per_attribute_vb_descs);
33257ec681f3Smrg
33267ec681f3Smrg      while (mask) {
33277ec681f3Smrg         unsigned i = u_bit_scan(&mask);
33287ec681f3Smrg         uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
33297ec681f3Smrg         uint32_t offset, rsrc_word3;
33307ec681f3Smrg         unsigned binding =
33317ec681f3Smrg            vs_state ? cmd_buffer->state.dynamic_vs_input.bindings[i]
33327ec681f3Smrg                     : (pipeline->use_per_attribute_vb_descs ? pipeline->attrib_bindings[i] : i);
33337ec681f3Smrg         struct radv_buffer *buffer = cmd_buffer->vertex_bindings[binding].buffer;
33347ec681f3Smrg         unsigned num_records;
33357ec681f3Smrg         unsigned stride;
33367ec681f3Smrg
33377ec681f3Smrg         if (vs_state) {
33387ec681f3Smrg            unsigned format = vs_state->formats[i];
33397ec681f3Smrg            unsigned dfmt = format & 0xf;
33407ec681f3Smrg            unsigned nfmt = (format >> 4) & 0x7;
33417ec681f3Smrg
33427ec681f3Smrg            rsrc_word3 =
33437ec681f3Smrg               vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt];
33447ec681f3Smrg
33457ec681f3Smrg            if (chip >= GFX10)
33467ec681f3Smrg               rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt));
33477ec681f3Smrg            else
33487ec681f3Smrg               rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt);
33497ec681f3Smrg         } else {
33507ec681f3Smrg            if (chip >= GFX10)
33517ec681f3Smrg               rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT);
33527ec681f3Smrg            else
33537ec681f3Smrg               rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
33547ec681f3Smrg                            S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
33557ec681f3Smrg         }
33567ec681f3Smrg
33577ec681f3Smrg         if (!buffer) {
33587ec681f3Smrg            if (vs_state) {
33597ec681f3Smrg               /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
33607ec681f3Smrg                * to include the format/word3 so that the alpha channel is 1 for formats without an
33617ec681f3Smrg                * alpha channel.
33627ec681f3Smrg                */
33637ec681f3Smrg               desc[0] = 0;
33647ec681f3Smrg               desc[1] = S_008F04_STRIDE(16);
33657ec681f3Smrg               desc[2] = 0;
33667ec681f3Smrg               desc[3] = rsrc_word3;
33677ec681f3Smrg            } else {
33687ec681f3Smrg               memset(desc, 0, 4 * 4);
33697ec681f3Smrg            }
33707ec681f3Smrg            continue;
33717ec681f3Smrg         }
33727ec681f3Smrg
33737ec681f3Smrg         va = radv_buffer_get_va(buffer->bo);
33747ec681f3Smrg
33757ec681f3Smrg         offset = cmd_buffer->vertex_bindings[binding].offset;
33767ec681f3Smrg         va += offset + buffer->offset;
33777ec681f3Smrg         if (vs_state)
33787ec681f3Smrg            va += vs_state->offsets[i];
33797ec681f3Smrg
33807ec681f3Smrg         if (cmd_buffer->vertex_bindings[binding].size) {
33817ec681f3Smrg            num_records = cmd_buffer->vertex_bindings[binding].size;
33827ec681f3Smrg         } else {
33837ec681f3Smrg            num_records = buffer->size - offset;
33847ec681f3Smrg         }
33857ec681f3Smrg
33867ec681f3Smrg         if (pipeline->graphics.uses_dynamic_stride) {
33877ec681f3Smrg            stride = cmd_buffer->vertex_bindings[binding].stride;
33887ec681f3Smrg         } else {
33897ec681f3Smrg            stride = pipeline->binding_stride[binding];
33907ec681f3Smrg         }
33917ec681f3Smrg
33927ec681f3Smrg         if (pipeline->use_per_attribute_vb_descs) {
33937ec681f3Smrg            uint32_t attrib_end = vs_state ? vs_state->offsets[i] + vs_state->format_sizes[i]
33947ec681f3Smrg                                           : pipeline->attrib_ends[i];
33957ec681f3Smrg
33967ec681f3Smrg            if (num_records < attrib_end) {
33977ec681f3Smrg               num_records = 0; /* not enough space for one vertex */
33987ec681f3Smrg            } else if (stride == 0) {
33997ec681f3Smrg               num_records = 1; /* only one vertex */
34007ec681f3Smrg            } else {
34017ec681f3Smrg               num_records = (num_records - attrib_end) / stride + 1;
34027ec681f3Smrg               /* If attrib_offset>stride, then the compiler will increase the vertex index by
34037ec681f3Smrg                * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
34047ec681f3Smrg                * only allowed with static strides.
34057ec681f3Smrg                */
34067ec681f3Smrg               num_records += pipeline->attrib_index_offset[i];
34077ec681f3Smrg            }
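                            /* Worked example (hypothetical values): with num_records =
                             * 64 bytes, attrib_end = 12 and stride = 16, vertices 0..3
                             * can be fetched, and indeed (64 - 12) / 16 + 1 = 4 records.
                             */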
34087ec681f3Smrg
34097ec681f3Smrg            /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
34107ec681f3Smrg             * into bytes in that case. GFX8 always uses bytes.
34117ec681f3Smrg             */
34127ec681f3Smrg            if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
34137ec681f3Smrg               num_records = (num_records - 1) * stride + attrib_end;
34147ec681f3Smrg            } else if (!num_records) {
34157ec681f3Smrg               /* On GFX9, it seems bounds checking is disabled if both
34167ec681f3Smrg                * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
34177ec681f3Smrg                * GFX10.3 but it doesn't hurt.
34187ec681f3Smrg                */
34197ec681f3Smrg               if (vs_state) {
34207ec681f3Smrg                  desc[0] = 0;
34217ec681f3Smrg                  desc[1] = S_008F04_STRIDE(16);
34227ec681f3Smrg                  desc[2] = 0;
34237ec681f3Smrg                  desc[3] = rsrc_word3;
34247ec681f3Smrg               } else {
34257ec681f3Smrg                  memset(desc, 0, 16);
34267ec681f3Smrg               }
34277ec681f3Smrg               continue;
34287ec681f3Smrg            }
34297ec681f3Smrg         } else {
34307ec681f3Smrg            if (chip != GFX8 && stride)
34317ec681f3Smrg               num_records = DIV_ROUND_UP(num_records, stride);
34327ec681f3Smrg         }
34337ec681f3Smrg
34347ec681f3Smrg         if (chip >= GFX10) {
34357ec681f3Smrg            /* OOB_SELECT chooses the out-of-bounds check:
34367ec681f3Smrg             * - 1: index >= NUM_RECORDS (Structured)
34377ec681f3Smrg             * - 3: offset >= NUM_RECORDS (Raw)
34387ec681f3Smrg             */
34397ec681f3Smrg            int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
34407ec681f3Smrg            rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(1);
34417ec681f3Smrg         }
34427ec681f3Smrg
34437ec681f3Smrg         desc[0] = va;
34447ec681f3Smrg         desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
34457ec681f3Smrg         desc[2] = num_records;
34467ec681f3Smrg         desc[3] = rsrc_word3;
34477ec681f3Smrg      }
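                      /* For illustration: each descriptor written above is a 4-dword
                       * buffer resource (V#): words 0-1 hold the 48-bit base address and
                       * the stride, word 2 the record count used for bounds checking, and
                       * word 3 the dst swizzle, format and (GFX10+) out-of-bounds mode.
                       */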
34487ec681f3Smrg
34497ec681f3Smrg      va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
34507ec681f3Smrg      va += vb_offset;
34517ec681f3Smrg
34527ec681f3Smrg      radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_VERTEX, AC_UD_VS_VERTEX_BUFFERS,
34537ec681f3Smrg                                 va);
34547ec681f3Smrg
34557ec681f3Smrg      cmd_buffer->state.vb_va = va;
34567ec681f3Smrg      cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
34577ec681f3Smrg
34587ec681f3Smrg      if (unlikely(cmd_buffer->device->trace_bo))
34597ec681f3Smrg         radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
34607ec681f3Smrg   }
34617ec681f3Smrg   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
346201e04c3fSmrg}
346301e04c3fSmrg
346401e04c3fSmrgstatic void
34657ec681f3Smrgradv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
346601e04c3fSmrg{
34677ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
34687ec681f3Smrg   struct radv_userdata_info *loc;
34697ec681f3Smrg   uint32_t base_reg;
347001e04c3fSmrg
34717ec681f3Smrg   for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
34727ec681f3Smrg      if (!radv_get_shader(pipeline, stage))
34737ec681f3Smrg         continue;
347401e04c3fSmrg
34757ec681f3Smrg      loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_STREAMOUT_BUFFERS);
34767ec681f3Smrg      if (loc->sgpr_idx == -1)
34777ec681f3Smrg         continue;
347801e04c3fSmrg
34797ec681f3Smrg      base_reg = pipeline->user_data_0[stage];
348001e04c3fSmrg
34817ec681f3Smrg      radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
34827ec681f3Smrg                               false);
34837ec681f3Smrg   }
348401e04c3fSmrg
34857ec681f3Smrg   if (radv_pipeline_has_gs_copy_shader(pipeline)) {
34867ec681f3Smrg      loc = &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_STREAMOUT_BUFFERS];
34877ec681f3Smrg      if (loc->sgpr_idx != -1) {
34887ec681f3Smrg         base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
348901e04c3fSmrg
34907ec681f3Smrg         radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4,
34917ec681f3Smrg                                  va, false);
34927ec681f3Smrg      }
34937ec681f3Smrg   }
349401e04c3fSmrg}
349501e04c3fSmrg
34967ec681f3Smrgstatic void
34977ec681f3Smrgradv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
349801e04c3fSmrg{
34997ec681f3Smrg   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
35007ec681f3Smrg      struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
35017ec681f3Smrg      struct radv_streamout_state *so = &cmd_buffer->state.streamout;
35027ec681f3Smrg      unsigned so_offset;
35037ec681f3Smrg      void *so_ptr;
35047ec681f3Smrg      uint64_t va;
350501e04c3fSmrg
35067ec681f3Smrg      /* Allocate some descriptor state for streamout buffers. */
35077ec681f3Smrg      if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
35087ec681f3Smrg         return;
350901e04c3fSmrg
35107ec681f3Smrg      for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
35117ec681f3Smrg         struct radv_buffer *buffer = sb[i].buffer;
35127ec681f3Smrg         uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
351301e04c3fSmrg
35147ec681f3Smrg         if (!(so->enabled_mask & (1 << i)))
35157ec681f3Smrg            continue;
351601e04c3fSmrg
35177ec681f3Smrg         va = radv_buffer_get_va(buffer->bo) + buffer->offset;
351801e04c3fSmrg
35197ec681f3Smrg         va += sb[i].offset;
352001e04c3fSmrg
35217ec681f3Smrg         /* Set the descriptor.
35227ec681f3Smrg          *
35237ec681f3Smrg          * On GFX8, the format must be non-INVALID, otherwise
35247ec681f3Smrg          * the buffer will be considered not bound and store
35257ec681f3Smrg          * instructions will be no-ops.
35267ec681f3Smrg          */
35277ec681f3Smrg         uint32_t size = 0xffffffff;
35287ec681f3Smrg
35297ec681f3Smrg         /* Compute the correct buffer size for NGG streamout
35307ec681f3Smrg          * because it's used to determine the max emit per
35317ec681f3Smrg          * buffer.
35327ec681f3Smrg          */
35337ec681f3Smrg         if (cmd_buffer->device->physical_device->use_ngg_streamout)
35347ec681f3Smrg            size = buffer->size - sb[i].offset;
353501e04c3fSmrg
35367ec681f3Smrg         uint32_t rsrc_word3 =
35377ec681f3Smrg            S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
35387ec681f3Smrg            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
353901e04c3fSmrg
35407ec681f3Smrg         if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
35417ec681f3Smrg            rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
35427ec681f3Smrg                          S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
35437ec681f3Smrg         } else {
35447ec681f3Smrg            rsrc_word3 |= S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
35457ec681f3Smrg         }
354601e04c3fSmrg
35477ec681f3Smrg         desc[0] = va;
35487ec681f3Smrg         desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
35497ec681f3Smrg         desc[2] = size;
35507ec681f3Smrg         desc[3] = rsrc_word3;
35517ec681f3Smrg      }
355201e04c3fSmrg
35537ec681f3Smrg      va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
35547ec681f3Smrg      va += so_offset;
355501e04c3fSmrg
35567ec681f3Smrg      radv_emit_streamout_buffers(cmd_buffer, va);
35577ec681f3Smrg   }
355801e04c3fSmrg
35597ec681f3Smrg   cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
356001e04c3fSmrg}
356101e04c3fSmrg
356201e04c3fSmrgstatic void
35637ec681f3Smrgradv_flush_ngg_gs_state(struct radv_cmd_buffer *cmd_buffer)
356401e04c3fSmrg{
35657ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
35667ec681f3Smrg   struct radv_userdata_info *loc;
35677ec681f3Smrg   uint32_t ngg_gs_state = 0;
35687ec681f3Smrg   uint32_t base_reg;
356901e04c3fSmrg
35707ec681f3Smrg   if (!radv_pipeline_has_gs(pipeline) || !pipeline->graphics.is_ngg)
35717ec681f3Smrg      return;
357201e04c3fSmrg
35737ec681f3Smrg   /* By default, NGG GS queries are disabled, but they are enabled if the
35747ec681f3Smrg    * command buffer has active GDS queries or if it's a secondary command
35757ec681f3Smrg    * buffer that inherits the number of generated primitives.
35767ec681f3Smrg    */
35777ec681f3Smrg   if (cmd_buffer->state.active_pipeline_gds_queries ||
35787ec681f3Smrg       (cmd_buffer->state.inherited_pipeline_statistics &
35797ec681f3Smrg        VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT))
35807ec681f3Smrg      ngg_gs_state = 1;
3581ed98bd31Smaya
35827ec681f3Smrg   loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_GEOMETRY, AC_UD_NGG_GS_STATE);
35837ec681f3Smrg   base_reg = pipeline->user_data_0[MESA_SHADER_GEOMETRY];
35847ec681f3Smrg   assert(loc->sgpr_idx != -1);
358501e04c3fSmrg
35867ec681f3Smrg   radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, ngg_gs_state);
358701e04c3fSmrg}
358801e04c3fSmrg
35897ec681f3Smrgstatic void
35907ec681f3Smrgradv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, bool pipeline_is_dirty)
359101e04c3fSmrg{
35927ec681f3Smrg   radv_flush_vertex_descriptors(cmd_buffer, pipeline_is_dirty);
35937ec681f3Smrg   radv_flush_streamout_descriptors(cmd_buffer);
35947ec681f3Smrg   radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline,
35957ec681f3Smrg                          VK_PIPELINE_BIND_POINT_GRAPHICS);
35967ec681f3Smrg   radv_flush_constants(cmd_buffer, VK_SHADER_STAGE_ALL_GRAPHICS, cmd_buffer->state.pipeline,
35977ec681f3Smrg                        VK_PIPELINE_BIND_POINT_GRAPHICS);
35987ec681f3Smrg   radv_flush_ngg_gs_state(cmd_buffer);
359901e04c3fSmrg}
360001e04c3fSmrg
36017ec681f3Smrgstruct radv_draw_info {
36027ec681f3Smrg   /**
36037ec681f3Smrg    * Number of vertices.
36047ec681f3Smrg    */
36057ec681f3Smrg   uint32_t count;
36067ec681f3Smrg
36077ec681f3Smrg   /**
36087ec681f3Smrg    * First instance id.
36097ec681f3Smrg    */
36107ec681f3Smrg   uint32_t first_instance;
36117ec681f3Smrg
36127ec681f3Smrg   /**
36137ec681f3Smrg    * Number of instances.
36147ec681f3Smrg    */
36157ec681f3Smrg   uint32_t instance_count;
36167ec681f3Smrg
36177ec681f3Smrg   /**
36187ec681f3Smrg    * Whether it's an indexed draw.
36197ec681f3Smrg    */
36207ec681f3Smrg   bool indexed;
36217ec681f3Smrg
36227ec681f3Smrg   /**
36237ec681f3Smrg    * Indirect draw parameters resource.
36247ec681f3Smrg    */
36257ec681f3Smrg   struct radv_buffer *indirect;
36267ec681f3Smrg   uint64_t indirect_offset;
36277ec681f3Smrg   uint32_t stride;
36287ec681f3Smrg
36297ec681f3Smrg   /**
36307ec681f3Smrg    * Draw count parameters resource.
36317ec681f3Smrg    */
36327ec681f3Smrg   struct radv_buffer *count_buffer;
36337ec681f3Smrg   uint64_t count_buffer_offset;
36347ec681f3Smrg
36357ec681f3Smrg   /**
36367ec681f3Smrg    * Stream output parameters resource.
36377ec681f3Smrg    */
36387ec681f3Smrg   struct radv_buffer *strmout_buffer;
36397ec681f3Smrg   uint64_t strmout_buffer_offset;
36407ec681f3Smrg};
364101e04c3fSmrg
36427ec681f3Smrgstatic uint32_t
36437ec681f3Smrgradv_get_primitive_reset_index(struct radv_cmd_buffer *cmd_buffer)
36447ec681f3Smrg{
36457ec681f3Smrg   switch (cmd_buffer->state.index_type) {
36467ec681f3Smrg   case V_028A7C_VGT_INDEX_8:
36477ec681f3Smrg      return 0xffu;
36487ec681f3Smrg   case V_028A7C_VGT_INDEX_16:
36497ec681f3Smrg      return 0xffffu;
36507ec681f3Smrg   case V_028A7C_VGT_INDEX_32:
36517ec681f3Smrg      return 0xffffffffu;
36527ec681f3Smrg   default:
36537ec681f3Smrg      unreachable("invalid index type");
36547ec681f3Smrg   }
36557ec681f3Smrg}
365601e04c3fSmrg
36577ec681f3Smrgstatic void
36587ec681f3Smrgsi_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw,
36597ec681f3Smrg                           bool indirect_draw, bool count_from_stream_output,
36607ec681f3Smrg                           uint32_t draw_vertex_count)
36617ec681f3Smrg{
36627ec681f3Smrg   struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
36637ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
36647ec681f3Smrg   unsigned topology = state->dynamic.primitive_topology;
36657ec681f3Smrg   bool prim_restart_enable = state->dynamic.primitive_restart_enable;
36667ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
36677ec681f3Smrg   unsigned ia_multi_vgt_param;
36687ec681f3Smrg
36697ec681f3Smrg   ia_multi_vgt_param =
36707ec681f3Smrg      si_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
36717ec681f3Smrg                                draw_vertex_count, topology, prim_restart_enable);
36727ec681f3Smrg
36737ec681f3Smrg   if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
36747ec681f3Smrg      if (info->chip_class == GFX9) {
36757ec681f3Smrg         radeon_set_uconfig_reg_idx(cmd_buffer->device->physical_device, cs,
36767ec681f3Smrg                                    R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
36777ec681f3Smrg      } else if (info->chip_class >= GFX7) {
36787ec681f3Smrg         radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
36797ec681f3Smrg      } else {
36807ec681f3Smrg         radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
36817ec681f3Smrg      }
36827ec681f3Smrg      state->last_ia_multi_vgt_param = ia_multi_vgt_param;
36837ec681f3Smrg   }
368401e04c3fSmrg}
368501e04c3fSmrg
36867ec681f3Smrgstatic void
36877ec681f3Smrgradv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
368801e04c3fSmrg{
36897ec681f3Smrg   struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
36907ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
36917ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
369201e04c3fSmrg
36937ec681f3Smrg   /* Draw state. */
36947ec681f3Smrg   if (info->chip_class < GFX10) {
36957ec681f3Smrg      si_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
36967ec681f3Smrg                                 !!draw_info->strmout_buffer,
36977ec681f3Smrg                                 draw_info->indirect ? 0 : draw_info->count);
36987ec681f3Smrg   }
369901e04c3fSmrg
37007ec681f3Smrg   if (state->dynamic.primitive_restart_enable) {
37017ec681f3Smrg      uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
3702ed98bd31Smaya
37037ec681f3Smrg      if (primitive_reset_index != state->last_primitive_reset_index) {
37047ec681f3Smrg         radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
37057ec681f3Smrg         state->last_primitive_reset_index = primitive_reset_index;
37067ec681f3Smrg      }
37077ec681f3Smrg   }
370801e04c3fSmrg
37097ec681f3Smrg   if (draw_info->strmout_buffer) {
37107ec681f3Smrg      uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
371101e04c3fSmrg
37127ec681f3Smrg      va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
371301e04c3fSmrg
37147ec681f3Smrg      radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
371501e04c3fSmrg
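                      /* Copy the streamout BUFFER_FILLED_SIZE from memory into
                       * VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE so the hardware can
                       * derive the vertex count for draw-opaque, i.e.
                       * vkCmdDrawIndirectByteCountEXT.
                       */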
37167ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
37177ec681f3Smrg      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
37187ec681f3Smrg                         COPY_DATA_WR_CONFIRM);
37197ec681f3Smrg      radeon_emit(cs, va);
37207ec681f3Smrg      radeon_emit(cs, va >> 32);
37217ec681f3Smrg      radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
37227ec681f3Smrg      radeon_emit(cs, 0); /* unused */
3723ed98bd31Smaya
37247ec681f3Smrg      radv_cs_add_buffer(cmd_buffer->device->ws, cs, draw_info->strmout_buffer->bo);
37257ec681f3Smrg   }
37267ec681f3Smrg}
372701e04c3fSmrg
37287ec681f3Smrgstatic void
37297ec681f3Smrgradv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags src_stage_mask)
37307ec681f3Smrg{
37317ec681f3Smrg   if (src_stage_mask &
37327ec681f3Smrg       (VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT |
37337ec681f3Smrg        VK_PIPELINE_STAGE_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
37347ec681f3Smrg        VK_PIPELINE_STAGE_RAY_TRACING_SHADER_BIT_KHR | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
37357ec681f3Smrg        VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
37367ec681f3Smrg      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
37377ec681f3Smrg   }
37387ec681f3Smrg
37397ec681f3Smrg   if (src_stage_mask &
37407ec681f3Smrg       (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
37417ec681f3Smrg        VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
37427ec681f3Smrg        VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT |
37437ec681f3Smrg        VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_ALL_COMMANDS_BIT)) {
37447ec681f3Smrg      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
37457ec681f3Smrg   } else if (src_stage_mask &
37467ec681f3Smrg              (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
37477ec681f3Smrg               VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
37487ec681f3Smrg               VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
37497ec681f3Smrg               VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
37507ec681f3Smrg               VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
37517ec681f3Smrg               VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT)) {
37527ec681f3Smrg      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
37537ec681f3Smrg   }
375401e04c3fSmrg}
375501e04c3fSmrg
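                /* Whether flushing buffer data out of L2 can be skipped: true on GFX9,
                 * and on GFX10+ when the RBs (CB/DB) are coherent with L2, so L2 already
                 * holds the data that the other clients will read.
                 */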
37567ec681f3Smrgstatic bool
37577ec681f3Smrgcan_skip_buffer_l2_flushes(struct radv_device *device)
375801e04c3fSmrg{
37597ec681f3Smrg   return device->physical_device->rad_info.chip_class == GFX9 ||
37607ec681f3Smrg          (device->physical_device->rad_info.chip_class >= GFX10 &&
37617ec681f3Smrg           !device->physical_device->rad_info.tcc_rb_non_coherent);
376201e04c3fSmrg}
376301e04c3fSmrg
37647ec681f3Smrg/*
37657ec681f3Smrg * In Vulkan, barriers comprise two kinds of operations:
37667ec681f3Smrg *
37677ec681f3Smrg * - availability (implemented with radv_src_access_flush)
37687ec681f3Smrg * - visibility (implemented with radv_dst_access_flush)
37697ec681f3Smrg *
37707ec681f3Smrg * For a memory operation to observe the result of a previous memory operation,
37717ec681f3Smrg * one needs to do an availability operation on the source memory and then a
37727ec681f3Smrg * visibility operation on the target memory.
37737ec681f3Smrg *
37747ec681f3Smrg * The complication is that the availability and visibility operations do not
37757ec681f3Smrg * need to be in the same barrier.
37767ec681f3Smrg *
37777ec681f3Smrg * The cleanest way to implement this is to define the availability operation
37787ec681f3Smrg * as bringing the caches to a "state of rest", in which none of the caches
37797ec681f3Smrg * below that level are dirty.
37807ec681f3Smrg *
37817ec681f3Smrg * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
37827ec681f3Smrg *
37837ec681f3Smrg * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
37847ec681f3Smrg * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
37857ec681f3Smrg * images. However, given the existence of memory barriers which do not specify
37867ec681f3Smrg * the image/buffer, it often devolves to just VRAM/GTT anyway.
37877ec681f3Smrg *
37887ec681f3Smrg * To help reduce the invalidations for GPUs that have L2 coherency between the
37897ec681f3Smrg * RB and the shader caches, we always flush/invalidate L2 on the src side, as
37907ec681f3Smrg * we can use our knowledge of past usage to optimize flushes away.
37917ec681f3Smrg */
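                /* Example (illustrative): a barrier with srcAccessMask =
                 * VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT and dstAccessMask =
                 * VK_ACCESS_SHADER_READ_BIT maps to a CB flush (plus CB metadata, if
                 * present) on the src side and VCACHE (plus possibly L2) invalidations
                 * on the dst side, as computed by the two functions below.
                 */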
379201e04c3fSmrg
37937ec681f3Smrgenum radv_cmd_flush_bits
37947ec681f3Smrgradv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags src_flags,
37957ec681f3Smrg                      const struct radv_image *image)
37967ec681f3Smrg{
37977ec681f3Smrg   bool has_CB_meta = true, has_DB_meta = true;
37987ec681f3Smrg   bool image_is_coherent = image ? image->l2_coherent : false;
37997ec681f3Smrg   enum radv_cmd_flush_bits flush_bits = 0;
38007ec681f3Smrg
38017ec681f3Smrg   if (image) {
38027ec681f3Smrg      if (!radv_image_has_CB_metadata(image))
38037ec681f3Smrg         has_CB_meta = false;
38047ec681f3Smrg      if (!radv_image_has_htile(image))
38057ec681f3Smrg         has_DB_meta = false;
38067ec681f3Smrg   }
38077ec681f3Smrg
38087ec681f3Smrg   u_foreach_bit(b, src_flags)
38097ec681f3Smrg   {
38107ec681f3Smrg      switch ((VkAccessFlagBits)(1 << b)) {
38117ec681f3Smrg      case VK_ACCESS_SHADER_WRITE_BIT:
38127ec681f3Smrg         /* Since the STORAGE bit isn't set, we know that this is a meta operation.
38137ec681f3Smrg          * On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
38147ec681f3Smrg          * set the CB/DB flush here. */
38157ec681f3Smrg         if (image && !(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
38167ec681f3Smrg            if (vk_format_is_depth_or_stencil(image->vk_format)) {
38177ec681f3Smrg               flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
38187ec681f3Smrg            } else {
38197ec681f3Smrg               flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
38207ec681f3Smrg            }
38217ec681f3Smrg         }
38227ec681f3Smrg
38237ec681f3Smrg         /* This is valid even for the rb_noncoherent_dirty case, because with how we account for
38247ec681f3Smrg          * dirtiness, if the cache isn't dirty it doesn't contain the data at all and hence
38257ec681f3Smrg          * doesn't need invalidating. */
38267ec681f3Smrg         if (!image_is_coherent)
38277ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_WB_L2;
38287ec681f3Smrg         break;
38297ec681f3Smrg      case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
38307ec681f3Smrg      case VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT:
38317ec681f3Smrg      case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
38327ec681f3Smrg         if (!image_is_coherent)
38337ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_WB_L2;
38347ec681f3Smrg         break;
38357ec681f3Smrg      case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
38367ec681f3Smrg         flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
38377ec681f3Smrg         if (has_CB_meta)
38387ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
38397ec681f3Smrg         break;
38407ec681f3Smrg      case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
38417ec681f3Smrg         flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
38427ec681f3Smrg         if (has_DB_meta)
38437ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
38447ec681f3Smrg         break;
38457ec681f3Smrg      case VK_ACCESS_TRANSFER_WRITE_BIT:
38467ec681f3Smrg         flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
38477ec681f3Smrg
38487ec681f3Smrg         if (!image_is_coherent)
38497ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_INV_L2;
38507ec681f3Smrg         if (has_CB_meta)
38517ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
38527ec681f3Smrg         if (has_DB_meta)
38537ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
38547ec681f3Smrg         break;
38557ec681f3Smrg      case VK_ACCESS_MEMORY_WRITE_BIT:
38567ec681f3Smrg         flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
38577ec681f3Smrg
38587ec681f3Smrg         if (!image_is_coherent)
38597ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_INV_L2;
38607ec681f3Smrg         if (has_CB_meta)
38617ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
38627ec681f3Smrg         if (has_DB_meta)
38637ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
38647ec681f3Smrg         break;
38657ec681f3Smrg      default:
38667ec681f3Smrg         break;
38677ec681f3Smrg      }
38687ec681f3Smrg   }
38697ec681f3Smrg   return flush_bits;
38707ec681f3Smrg}
38717ec681f3Smrg
38727ec681f3Smrgenum radv_cmd_flush_bits
38737ec681f3Smrgradv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkAccessFlags dst_flags,
38747ec681f3Smrg                      const struct radv_image *image)
38757ec681f3Smrg{
38767ec681f3Smrg   bool has_CB_meta = true, has_DB_meta = true;
38777ec681f3Smrg   enum radv_cmd_flush_bits flush_bits = 0;
38787ec681f3Smrg   bool flush_CB = true, flush_DB = true;
38797ec681f3Smrg   bool image_is_coherent = image ? image->l2_coherent : false;
38807ec681f3Smrg
38817ec681f3Smrg   if (image) {
38827ec681f3Smrg      if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
38837ec681f3Smrg         flush_CB = false;
38847ec681f3Smrg         flush_DB = false;
38857ec681f3Smrg      }
38867ec681f3Smrg
38877ec681f3Smrg      if (!radv_image_has_CB_metadata(image))
38887ec681f3Smrg         has_CB_meta = false;
38897ec681f3Smrg      if (!radv_image_has_htile(image))
38907ec681f3Smrg         has_DB_meta = false;
38917ec681f3Smrg   }
38927ec681f3Smrg
38937ec681f3Smrg   /* None of the L2 invalidations below apply to the CB/DB caches. So if there are no incoherent
38947ec681f3Smrg    * images in the L2 cache in CB/DB mode, the data is already usable from all the other L2 clients. */
38957ec681f3Smrg   image_is_coherent |=
38967ec681f3Smrg      can_skip_buffer_l2_flushes(cmd_buffer->device) && !cmd_buffer->state.rb_noncoherent_dirty;
38977ec681f3Smrg
38987ec681f3Smrg   u_foreach_bit(b, dst_flags)
38997ec681f3Smrg   {
39007ec681f3Smrg      switch ((VkAccessFlagBits)(1 << b)) {
39017ec681f3Smrg      case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
39027ec681f3Smrg      case VK_ACCESS_INDEX_READ_BIT:
39037ec681f3Smrg      case VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT:
39047ec681f3Smrg         break;
39057ec681f3Smrg      case VK_ACCESS_UNIFORM_READ_BIT:
39067ec681f3Smrg         flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
39077ec681f3Smrg         break;
39087ec681f3Smrg      case VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT:
39097ec681f3Smrg      case VK_ACCESS_INPUT_ATTACHMENT_READ_BIT:
39107ec681f3Smrg      case VK_ACCESS_TRANSFER_READ_BIT:
39117ec681f3Smrg      case VK_ACCESS_TRANSFER_WRITE_BIT:
39127ec681f3Smrg         flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
39137ec681f3Smrg
39147ec681f3Smrg         if (has_CB_meta || has_DB_meta)
39157ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
39167ec681f3Smrg         if (!image_is_coherent)
39177ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_INV_L2;
39187ec681f3Smrg         break;
39197ec681f3Smrg      case VK_ACCESS_SHADER_READ_BIT:
39207ec681f3Smrg         flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
39217ec681f3Smrg         /* Unlike LLVM, ACO uses SMEM for SSBOs, so we have to
39227ec681f3Smrg          * invalidate the scalar cache. */
39237ec681f3Smrg         if (!cmd_buffer->device->physical_device->use_llvm && !image)
39247ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
39257ec681f3Smrg
39267ec681f3Smrg         if (has_CB_meta || has_DB_meta)
39277ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
39287ec681f3Smrg         if (!image_is_coherent)
39297ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_INV_L2;
39307ec681f3Smrg         break;
39317ec681f3Smrg      case VK_ACCESS_ACCELERATION_STRUCTURE_READ_BIT_KHR:
39327ec681f3Smrg         flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
39337ec681f3Smrg         if (cmd_buffer->device->physical_device->rad_info.chip_class < GFX9)
39347ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_INV_L2;
39357ec681f3Smrg         break;
39367ec681f3Smrg      case VK_ACCESS_SHADER_WRITE_BIT:
39377ec681f3Smrg      case VK_ACCESS_ACCELERATION_STRUCTURE_WRITE_BIT_KHR:
39387ec681f3Smrg         break;
39397ec681f3Smrg      case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
39407ec681f3Smrg      case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
39417ec681f3Smrg         if (flush_CB)
39427ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
39437ec681f3Smrg         if (has_CB_meta)
39447ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
39457ec681f3Smrg         break;
39467ec681f3Smrg      case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
39477ec681f3Smrg      case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
39487ec681f3Smrg         if (flush_DB)
39497ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
39507ec681f3Smrg         if (has_DB_meta)
39517ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
39527ec681f3Smrg         break;
39537ec681f3Smrg      case VK_ACCESS_MEMORY_READ_BIT:
39547ec681f3Smrg      case VK_ACCESS_MEMORY_WRITE_BIT:
39557ec681f3Smrg         flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
39567ec681f3Smrg         if (!image_is_coherent)
39577ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_INV_L2;
39587ec681f3Smrg         if (flush_CB)
39597ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
39607ec681f3Smrg         if (has_CB_meta)
39617ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
39627ec681f3Smrg         if (flush_DB)
39637ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
39647ec681f3Smrg         if (has_DB_meta)
39657ec681f3Smrg            flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
39667ec681f3Smrg         break;
39677ec681f3Smrg      default:
39687ec681f3Smrg         break;
39697ec681f3Smrg      }
39707ec681f3Smrg   }
39717ec681f3Smrg   return flush_bits;
39727ec681f3Smrg}
3973ed98bd31Smaya
39747ec681f3Smrgvoid
39757ec681f3Smrgradv_emit_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier)
39767ec681f3Smrg{
39777ec681f3Smrg   struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
39787ec681f3Smrg   if (fb && !fb->imageless) {
39797ec681f3Smrg      for (int i = 0; i < fb->attachment_count; ++i) {
39807ec681f3Smrg         cmd_buffer->state.flush_bits |=
39817ec681f3Smrg            radv_src_access_flush(cmd_buffer, barrier->src_access_mask, fb->attachments[i]->image);
39827ec681f3Smrg      }
39837ec681f3Smrg   } else {
39847ec681f3Smrg      cmd_buffer->state.flush_bits |=
39857ec681f3Smrg         radv_src_access_flush(cmd_buffer, barrier->src_access_mask, NULL);
39867ec681f3Smrg   }
39877ec681f3Smrg
39887ec681f3Smrg   radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
39897ec681f3Smrg
39907ec681f3Smrg   if (fb && !fb->imageless) {
39917ec681f3Smrg      for (int i = 0; i < fb->attachment_count; ++i) {
39927ec681f3Smrg         cmd_buffer->state.flush_bits |=
39937ec681f3Smrg            radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, fb->attachments[i]->image);
39947ec681f3Smrg      }
39957ec681f3Smrg   } else {
39967ec681f3Smrg      cmd_buffer->state.flush_bits |=
39977ec681f3Smrg         radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask, NULL);
39987ec681f3Smrg   }
39997ec681f3Smrg}
40007ec681f3Smrg
40017ec681f3Smrguint32_t
40027ec681f3Smrgradv_get_subpass_id(struct radv_cmd_buffer *cmd_buffer)
40037ec681f3Smrg{
40047ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
40057ec681f3Smrg   uint32_t subpass_id = state->subpass - state->pass->subpasses;
40067ec681f3Smrg
40077ec681f3Smrg   /* The id of this subpass shouldn't exceed the number of subpasses in
40087ec681f3Smrg    * this render pass minus 1.
40097ec681f3Smrg    */
40107ec681f3Smrg   assert(subpass_id < state->pass->subpass_count);
40117ec681f3Smrg   return subpass_id;
40127ec681f3Smrg}
40137ec681f3Smrg
40147ec681f3Smrgstatic struct radv_sample_locations_state *
40157ec681f3Smrgradv_get_attachment_sample_locations(struct radv_cmd_buffer *cmd_buffer, uint32_t att_idx,
40167ec681f3Smrg                                     bool begin_subpass)
40177ec681f3Smrg{
40187ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
40197ec681f3Smrg   uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);
40207ec681f3Smrg   struct radv_image_view *view = state->attachments[att_idx].iview;
40217ec681f3Smrg
40227ec681f3Smrg   if (view->image->info.samples == 1)
40237ec681f3Smrg      return NULL;
40247ec681f3Smrg
40257ec681f3Smrg   if (state->pass->attachments[att_idx].first_subpass_idx == subpass_id) {
40267ec681f3Smrg      /* Return the initial sample locations if this is the initial
40277ec681f3Smrg       * layout transition of the given subpass attachment.
40287ec681f3Smrg       */
40297ec681f3Smrg      if (state->attachments[att_idx].sample_location.count > 0)
40307ec681f3Smrg         return &state->attachments[att_idx].sample_location;
40317ec681f3Smrg   } else {
40327ec681f3Smrg      /* Otherwise return the subpass sample locations if defined. */
40337ec681f3Smrg      if (state->subpass_sample_locs) {
40347ec681f3Smrg         /* Because the driver sets the current subpass before
40357ec681f3Smrg          * initial layout transitions, we should use the sample
40367ec681f3Smrg          * locations from the previous subpass to avoid an
40377ec681f3Smrg          * off-by-one problem. Otherwise, use the sample
40387ec681f3Smrg          * locations for the current subpass for final layout
40397ec681f3Smrg          * transitions.
40407ec681f3Smrg          */
40417ec681f3Smrg         if (begin_subpass)
40427ec681f3Smrg            subpass_id--;
40437ec681f3Smrg
40447ec681f3Smrg         for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
40457ec681f3Smrg            if (state->subpass_sample_locs[i].subpass_idx == subpass_id)
40467ec681f3Smrg               return &state->subpass_sample_locs[i].sample_location;
40477ec681f3Smrg         }
40487ec681f3Smrg      }
40497ec681f3Smrg   }
40507ec681f3Smrg
40517ec681f3Smrg   return NULL;
40527ec681f3Smrg}
405301e04c3fSmrg
40547ec681f3Smrgstatic void
40557ec681f3Smrgradv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
40567ec681f3Smrg                                     struct radv_subpass_attachment att, bool begin_subpass)
40577ec681f3Smrg{
40587ec681f3Smrg   unsigned idx = att.attachment;
40597ec681f3Smrg   struct radv_image_view *view = cmd_buffer->state.attachments[idx].iview;
40607ec681f3Smrg   struct radv_sample_locations_state *sample_locs;
40617ec681f3Smrg   VkImageSubresourceRange range;
40627ec681f3Smrg   range.aspectMask = view->aspect_mask;
40637ec681f3Smrg   range.baseMipLevel = view->base_mip;
40647ec681f3Smrg   range.levelCount = 1;
40657ec681f3Smrg   range.baseArrayLayer = view->base_layer;
40667ec681f3Smrg   range.layerCount = cmd_buffer->state.framebuffer->layers;
40677ec681f3Smrg
40687ec681f3Smrg   if (cmd_buffer->state.subpass->view_mask) {
40697ec681f3Smrg      /* If the current subpass uses multiview, the driver might have
40707ec681f3Smrg       * performed a fast color/depth clear to the whole image
40717ec681f3Smrg       * (including all layers). To make sure the driver will
40727ec681f3Smrg       * decompress the image correctly (if needed), we have to
40737ec681f3Smrg       * account for the "real" number of layers. If the view mask is
40747ec681f3Smrg       * sparse, this will decompress more layers than needed.
40757ec681f3Smrg       */
40767ec681f3Smrg      range.layerCount = util_last_bit(cmd_buffer->state.subpass->view_mask);
40777ec681f3Smrg   }
40787ec681f3Smrg
40797ec681f3Smrg   /* Get the subpass sample locations for the given attachment, if NULL
40807ec681f3Smrg    * is returned the driver will use the default HW locations.
40817ec681f3Smrg    */
40827ec681f3Smrg   sample_locs = radv_get_attachment_sample_locations(cmd_buffer, idx, begin_subpass);
40837ec681f3Smrg
40847ec681f3Smrg   /* Determine if the subpass uses separate depth/stencil layouts. */
40857ec681f3Smrg   bool uses_separate_depth_stencil_layouts = false;
40867ec681f3Smrg   if ((cmd_buffer->state.attachments[idx].current_layout !=
40877ec681f3Smrg        cmd_buffer->state.attachments[idx].current_stencil_layout) ||
40887ec681f3Smrg       (att.layout != att.stencil_layout)) {
40897ec681f3Smrg      uses_separate_depth_stencil_layouts = true;
40907ec681f3Smrg   }
40917ec681f3Smrg
40927ec681f3Smrg   /* For separate layouts, perform depth and stencil transitions
40937ec681f3Smrg    * separately.
40947ec681f3Smrg    */
40957ec681f3Smrg   if (uses_separate_depth_stencil_layouts &&
40967ec681f3Smrg       (range.aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
40977ec681f3Smrg      /* Depth-only transitions. */
40987ec681f3Smrg      range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
40997ec681f3Smrg      radv_handle_image_transition(cmd_buffer, view->image,
41007ec681f3Smrg                                   cmd_buffer->state.attachments[idx].current_layout,
41017ec681f3Smrg                                   cmd_buffer->state.attachments[idx].current_in_render_loop,
41027ec681f3Smrg                                   att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
41037ec681f3Smrg
41047ec681f3Smrg      /* Stencil-only transitions. */
41057ec681f3Smrg      range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
41067ec681f3Smrg      radv_handle_image_transition(
41077ec681f3Smrg         cmd_buffer, view->image, cmd_buffer->state.attachments[idx].current_stencil_layout,
41087ec681f3Smrg         cmd_buffer->state.attachments[idx].current_in_render_loop, att.stencil_layout,
41097ec681f3Smrg         att.in_render_loop, 0, 0, &range, sample_locs);
41107ec681f3Smrg   } else {
41117ec681f3Smrg      radv_handle_image_transition(cmd_buffer, view->image,
41127ec681f3Smrg                                   cmd_buffer->state.attachments[idx].current_layout,
41137ec681f3Smrg                                   cmd_buffer->state.attachments[idx].current_in_render_loop,
41147ec681f3Smrg                                   att.layout, att.in_render_loop, 0, 0, &range, sample_locs);
41157ec681f3Smrg   }
41167ec681f3Smrg
41177ec681f3Smrg   cmd_buffer->state.attachments[idx].current_layout = att.layout;
41187ec681f3Smrg   cmd_buffer->state.attachments[idx].current_stencil_layout = att.stencil_layout;
41197ec681f3Smrg   cmd_buffer->state.attachments[idx].current_in_render_loop = att.in_render_loop;
412001e04c3fSmrg}
412101e04c3fSmrg
41227ec681f3Smrgvoid
41237ec681f3Smrgradv_cmd_buffer_set_subpass(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass *subpass)
412401e04c3fSmrg{
41257ec681f3Smrg   cmd_buffer->state.subpass = subpass;
412601e04c3fSmrg
41277ec681f3Smrg   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
41287ec681f3Smrg}
4129ed98bd31Smaya
41307ec681f3Smrgstatic VkResult
41317ec681f3Smrgradv_cmd_state_setup_sample_locations(struct radv_cmd_buffer *cmd_buffer,
41327ec681f3Smrg                                      struct radv_render_pass *pass,
41337ec681f3Smrg                                      const VkRenderPassBeginInfo *info)
41347ec681f3Smrg{
41357ec681f3Smrg   const struct VkRenderPassSampleLocationsBeginInfoEXT *sample_locs =
41367ec681f3Smrg      vk_find_struct_const(info->pNext, RENDER_PASS_SAMPLE_LOCATIONS_BEGIN_INFO_EXT);
41377ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
41387ec681f3Smrg
41397ec681f3Smrg   if (!sample_locs) {
41407ec681f3Smrg      state->subpass_sample_locs = NULL;
41417ec681f3Smrg      return VK_SUCCESS;
41427ec681f3Smrg   }
41437ec681f3Smrg
41447ec681f3Smrg   for (uint32_t i = 0; i < sample_locs->attachmentInitialSampleLocationsCount; i++) {
41457ec681f3Smrg      const VkAttachmentSampleLocationsEXT *att_sample_locs =
41467ec681f3Smrg         &sample_locs->pAttachmentInitialSampleLocations[i];
41477ec681f3Smrg      uint32_t att_idx = att_sample_locs->attachmentIndex;
41487ec681f3Smrg      struct radv_image *image = cmd_buffer->state.attachments[att_idx].iview->image;
41497ec681f3Smrg
41507ec681f3Smrg      assert(vk_format_is_depth_or_stencil(image->vk_format));
41517ec681f3Smrg
41527ec681f3Smrg      /* From the Vulkan spec 1.1.108:
41537ec681f3Smrg       *
41547ec681f3Smrg       * "If the image referenced by the framebuffer attachment at
41557ec681f3Smrg       *  index attachmentIndex was not created with
41567ec681f3Smrg       *  VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT
41577ec681f3Smrg       *  then the values specified in sampleLocationsInfo are
41587ec681f3Smrg       *  ignored."
41597ec681f3Smrg       */
41607ec681f3Smrg      if (!(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT))
41617ec681f3Smrg         continue;
41627ec681f3Smrg
41637ec681f3Smrg      const VkSampleLocationsInfoEXT *sample_locs_info = &att_sample_locs->sampleLocationsInfo;
41647ec681f3Smrg
41657ec681f3Smrg      state->attachments[att_idx].sample_location.per_pixel =
41667ec681f3Smrg         sample_locs_info->sampleLocationsPerPixel;
41677ec681f3Smrg      state->attachments[att_idx].sample_location.grid_size =
41687ec681f3Smrg         sample_locs_info->sampleLocationGridSize;
41697ec681f3Smrg      state->attachments[att_idx].sample_location.count = sample_locs_info->sampleLocationsCount;
41707ec681f3Smrg      typed_memcpy(&state->attachments[att_idx].sample_location.locations[0],
41717ec681f3Smrg                   sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
41727ec681f3Smrg   }
41737ec681f3Smrg
41747ec681f3Smrg   state->subpass_sample_locs =
41757ec681f3Smrg      vk_alloc(&cmd_buffer->pool->alloc,
41767ec681f3Smrg               sample_locs->postSubpassSampleLocationsCount * sizeof(state->subpass_sample_locs[0]),
41777ec681f3Smrg               8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
41787ec681f3Smrg   if (state->subpass_sample_locs == NULL) {
41797ec681f3Smrg      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
41807ec681f3Smrg      return cmd_buffer->record_result;
41817ec681f3Smrg   }
41827ec681f3Smrg
41837ec681f3Smrg   state->num_subpass_sample_locs = sample_locs->postSubpassSampleLocationsCount;
41847ec681f3Smrg
41857ec681f3Smrg   for (uint32_t i = 0; i < sample_locs->postSubpassSampleLocationsCount; i++) {
41867ec681f3Smrg      const VkSubpassSampleLocationsEXT *subpass_sample_locs_info =
41877ec681f3Smrg         &sample_locs->pPostSubpassSampleLocations[i];
41887ec681f3Smrg      const VkSampleLocationsInfoEXT *sample_locs_info =
41897ec681f3Smrg         &subpass_sample_locs_info->sampleLocationsInfo;
41907ec681f3Smrg
41917ec681f3Smrg      state->subpass_sample_locs[i].subpass_idx = subpass_sample_locs_info->subpassIndex;
41927ec681f3Smrg      state->subpass_sample_locs[i].sample_location.per_pixel =
41937ec681f3Smrg         sample_locs_info->sampleLocationsPerPixel;
41947ec681f3Smrg      state->subpass_sample_locs[i].sample_location.grid_size =
41957ec681f3Smrg         sample_locs_info->sampleLocationGridSize;
41967ec681f3Smrg      state->subpass_sample_locs[i].sample_location.count = sample_locs_info->sampleLocationsCount;
41977ec681f3Smrg      typed_memcpy(&state->subpass_sample_locs[i].sample_location.locations[0],
41987ec681f3Smrg                   sample_locs_info->pSampleLocations, sample_locs_info->sampleLocationsCount);
41997ec681f3Smrg   }
42007ec681f3Smrg
42017ec681f3Smrg   return VK_SUCCESS;
42027ec681f3Smrg}
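
/*
 * Illustrative sketch (not part of the driver): given the
 * subpass_sample_locs array filled in above, looking up the sample
 * locations for a particular subpass is a linear scan over
 * num_subpass_sample_locs entries. The field names match the
 * radv_cmd_state fields used above; the helper itself is hypothetical.
 */
static const struct radv_sample_locations_state *
radv_example_find_subpass_sample_locs(const struct radv_cmd_state *state,
                                      uint32_t subpass_idx)
{
   for (uint32_t i = 0; i < state->num_subpass_sample_locs; i++) {
      if (state->subpass_sample_locs[i].subpass_idx == subpass_idx)
         return &state->subpass_sample_locs[i].sample_location;
   }
   return NULL; /* the subpass uses the default sample locations */
}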
420301e04c3fSmrg
42047ec681f3Smrgstatic VkResult
42057ec681f3Smrgradv_cmd_state_setup_attachments(struct radv_cmd_buffer *cmd_buffer, struct radv_render_pass *pass,
42067ec681f3Smrg                                 const VkRenderPassBeginInfo *info,
42077ec681f3Smrg                                 const struct radv_extra_render_pass_begin_info *extra)
42087ec681f3Smrg{
42097ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
42107ec681f3Smrg   const struct VkRenderPassAttachmentBeginInfo *attachment_info = NULL;
42117ec681f3Smrg
42127ec681f3Smrg   if (info) {
42137ec681f3Smrg      attachment_info = vk_find_struct_const(info->pNext, RENDER_PASS_ATTACHMENT_BEGIN_INFO);
42147ec681f3Smrg   }
42157ec681f3Smrg
42167ec681f3Smrg   if (pass->attachment_count == 0) {
42177ec681f3Smrg      state->attachments = NULL;
42187ec681f3Smrg      return VK_SUCCESS;
42197ec681f3Smrg   }
42207ec681f3Smrg
42217ec681f3Smrg   state->attachments =
42227ec681f3Smrg      vk_alloc(&cmd_buffer->pool->alloc, pass->attachment_count * sizeof(state->attachments[0]), 8,
42237ec681f3Smrg               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
42247ec681f3Smrg   if (state->attachments == NULL) {
42257ec681f3Smrg      cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
42267ec681f3Smrg      return cmd_buffer->record_result;
42277ec681f3Smrg   }
42287ec681f3Smrg
42297ec681f3Smrg   for (uint32_t i = 0; i < pass->attachment_count; ++i) {
42307ec681f3Smrg      struct radv_render_pass_attachment *att = &pass->attachments[i];
42317ec681f3Smrg      VkImageAspectFlags att_aspects = vk_format_aspects(att->format);
42327ec681f3Smrg      VkImageAspectFlags clear_aspects = 0;
42337ec681f3Smrg
42347ec681f3Smrg      if (att_aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
42357ec681f3Smrg         /* color attachment */
42367ec681f3Smrg         if (att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
42377ec681f3Smrg            clear_aspects |= VK_IMAGE_ASPECT_COLOR_BIT;
42387ec681f3Smrg         }
42397ec681f3Smrg      } else {
42407ec681f3Smrg         /* depthstencil attachment */
42417ec681f3Smrg         if ((att_aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
42427ec681f3Smrg             att->load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
42437ec681f3Smrg            clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
42447ec681f3Smrg            if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
42457ec681f3Smrg                att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_DONT_CARE)
42467ec681f3Smrg               clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
42477ec681f3Smrg         }
42487ec681f3Smrg         if ((att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT) &&
42497ec681f3Smrg             att->stencil_load_op == VK_ATTACHMENT_LOAD_OP_CLEAR) {
42507ec681f3Smrg            clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
42517ec681f3Smrg         }
42527ec681f3Smrg      }
42537ec681f3Smrg
42547ec681f3Smrg      state->attachments[i].pending_clear_aspects = clear_aspects;
42557ec681f3Smrg      state->attachments[i].cleared_views = 0;
42567ec681f3Smrg      if (clear_aspects && info) {
42577ec681f3Smrg         assert(info->clearValueCount > i);
42587ec681f3Smrg         state->attachments[i].clear_value = info->pClearValues[i];
42597ec681f3Smrg      }
42607ec681f3Smrg
42617ec681f3Smrg      state->attachments[i].current_layout = att->initial_layout;
42627ec681f3Smrg      state->attachments[i].current_in_render_loop = false;
42637ec681f3Smrg      state->attachments[i].current_stencil_layout = att->stencil_initial_layout;
42647ec681f3Smrg      state->attachments[i].disable_dcc = extra && extra->disable_dcc;
42657ec681f3Smrg      state->attachments[i].sample_location.count = 0;
42667ec681f3Smrg
42677ec681f3Smrg      struct radv_image_view *iview;
42687ec681f3Smrg      if (attachment_info && attachment_info->attachmentCount > i) {
42697ec681f3Smrg         iview = radv_image_view_from_handle(attachment_info->pAttachments[i]);
42707ec681f3Smrg      } else {
42717ec681f3Smrg         iview = state->framebuffer->attachments[i];
42727ec681f3Smrg      }
42737ec681f3Smrg
42747ec681f3Smrg      state->attachments[i].iview = iview;
42757ec681f3Smrg      if (iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
42767ec681f3Smrg         radv_initialise_ds_surface(cmd_buffer->device, &state->attachments[i].ds, iview);
42777ec681f3Smrg      } else {
42787ec681f3Smrg         radv_initialise_color_surface(cmd_buffer->device, &state->attachments[i].cb, iview);
42797ec681f3Smrg      }
42807ec681f3Smrg   }
42817ec681f3Smrg
42827ec681f3Smrg   return VK_SUCCESS;
42837ec681f3Smrg}
42847ec681f3Smrg
42857ec681f3SmrgVkResult
42867ec681f3Smrgradv_AllocateCommandBuffers(VkDevice _device, const VkCommandBufferAllocateInfo *pAllocateInfo,
42877ec681f3Smrg                            VkCommandBuffer *pCommandBuffers)
42887ec681f3Smrg{
42897ec681f3Smrg   RADV_FROM_HANDLE(radv_device, device, _device);
42907ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_pool, pool, pAllocateInfo->commandPool);
42917ec681f3Smrg
42927ec681f3Smrg   VkResult result = VK_SUCCESS;
42937ec681f3Smrg   uint32_t i;
42947ec681f3Smrg
42957ec681f3Smrg   for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
42967ec681f3Smrg
42977ec681f3Smrg      if (!list_is_empty(&pool->free_cmd_buffers)) {
42987ec681f3Smrg         struct radv_cmd_buffer *cmd_buffer =
42997ec681f3Smrg            list_first_entry(&pool->free_cmd_buffers, struct radv_cmd_buffer, pool_link);
43007ec681f3Smrg
43017ec681f3Smrg         list_del(&cmd_buffer->pool_link);
43027ec681f3Smrg         list_addtail(&cmd_buffer->pool_link, &pool->cmd_buffers);
43037ec681f3Smrg
43047ec681f3Smrg         result = radv_reset_cmd_buffer(cmd_buffer);
43057ec681f3Smrg         cmd_buffer->level = pAllocateInfo->level;
43067ec681f3Smrg         vk_command_buffer_finish(&cmd_buffer->vk);
43077ec681f3Smrg         VkResult init_result =
43087ec681f3Smrg            vk_command_buffer_init(&cmd_buffer->vk, &device->vk);
43097ec681f3Smrg         if (init_result != VK_SUCCESS)
43107ec681f3Smrg            result = init_result;
43117ec681f3Smrg
43127ec681f3Smrg         pCommandBuffers[i] = radv_cmd_buffer_to_handle(cmd_buffer);
43137ec681f3Smrg      } else {
43147ec681f3Smrg         result = radv_create_cmd_buffer(device, pool, pAllocateInfo->level, &pCommandBuffers[i]);
43157ec681f3Smrg      }
43167ec681f3Smrg      if (result != VK_SUCCESS)
43177ec681f3Smrg         break;
43187ec681f3Smrg   }
43197ec681f3Smrg
43207ec681f3Smrg   if (result != VK_SUCCESS) {
43217ec681f3Smrg      radv_FreeCommandBuffers(_device, pAllocateInfo->commandPool, i, pCommandBuffers);
43227ec681f3Smrg
43237ec681f3Smrg      /* From the Vulkan 1.0.66 spec:
43247ec681f3Smrg       *
43257ec681f3Smrg       * "vkAllocateCommandBuffers can be used to create multiple
43267ec681f3Smrg       *  command buffers. If the creation of any of those command
43277ec681f3Smrg       *  buffers fails, the implementation must destroy all
43287ec681f3Smrg       *  successfully created command buffer objects from this
43297ec681f3Smrg       *  command, set all entries of the pCommandBuffers array to
43307ec681f3Smrg       *  NULL and return the error."
43317ec681f3Smrg       */
43327ec681f3Smrg      memset(pCommandBuffers, 0, sizeof(*pCommandBuffers) * pAllocateInfo->commandBufferCount);
43337ec681f3Smrg   }
43347ec681f3Smrg
43357ec681f3Smrg   return result;
433601e04c3fSmrg}
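
/*
 * Illustrative application-side sketch (not part of the driver): the
 * spec rule quoted above means a caller can rely on every entry of
 * pCommandBuffers being NULL after a failed allocation, so no partial
 * cleanup is needed on the application side. All names below are
 * hypothetical.
 */
static VkResult
radv_example_allocate_four(VkDevice device, VkCommandPool pool,
                           VkCommandBuffer cmd_bufs[4])
{
   const VkCommandBufferAllocateInfo info = {
      .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
      .commandPool = pool,
      .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
      .commandBufferCount = 4,
   };
   /* On failure, cmd_bufs[0..3] are all NULL per the rule above. */
   return vkAllocateCommandBuffers(device, &info, cmd_bufs);
}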
433701e04c3fSmrg
43387ec681f3Smrgvoid
43397ec681f3Smrgradv_FreeCommandBuffers(VkDevice device, VkCommandPool commandPool, uint32_t commandBufferCount,
43407ec681f3Smrg                        const VkCommandBuffer *pCommandBuffers)
434101e04c3fSmrg{
43427ec681f3Smrg   for (uint32_t i = 0; i < commandBufferCount; i++) {
43437ec681f3Smrg      RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, pCommandBuffers[i]);
434401e04c3fSmrg
43457ec681f3Smrg      if (cmd_buffer) {
43467ec681f3Smrg         if (cmd_buffer->pool) {
43477ec681f3Smrg            list_del(&cmd_buffer->pool_link);
43487ec681f3Smrg            list_addtail(&cmd_buffer->pool_link, &cmd_buffer->pool->free_cmd_buffers);
         } else {
            radv_destroy_cmd_buffer(cmd_buffer);
         }
43517ec681f3Smrg      }
43527ec681f3Smrg   }
43537ec681f3Smrg}
435401e04c3fSmrg
43557ec681f3SmrgVkResult
43567ec681f3Smrgradv_ResetCommandBuffer(VkCommandBuffer commandBuffer, VkCommandBufferResetFlags flags)
43577ec681f3Smrg{
43587ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
43597ec681f3Smrg   return radv_reset_cmd_buffer(cmd_buffer);
436001e04c3fSmrg}
436101e04c3fSmrg
43627ec681f3SmrgVkResult
43637ec681f3Smrgradv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
436401e04c3fSmrg{
43657ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
43667ec681f3Smrg   VkResult result = VK_SUCCESS;
43677ec681f3Smrg
43687ec681f3Smrg   if (cmd_buffer->status != RADV_CMD_BUFFER_STATUS_INITIAL) {
      /* If the command buffer has already been reset with
       * vkResetCommandBuffer, there is no need to do it again.
       */
43727ec681f3Smrg      result = radv_reset_cmd_buffer(cmd_buffer);
43737ec681f3Smrg      if (result != VK_SUCCESS)
43747ec681f3Smrg         return result;
43757ec681f3Smrg   }
43767ec681f3Smrg
43777ec681f3Smrg   memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
43787ec681f3Smrg   cmd_buffer->state.last_primitive_reset_en = -1;
43797ec681f3Smrg   cmd_buffer->state.last_index_type = -1;
43807ec681f3Smrg   cmd_buffer->state.last_num_instances = -1;
43817ec681f3Smrg   cmd_buffer->state.last_vertex_offset = -1;
43827ec681f3Smrg   cmd_buffer->state.last_first_instance = -1;
43837ec681f3Smrg   cmd_buffer->state.last_drawid = -1;
43847ec681f3Smrg   cmd_buffer->state.predication_type = -1;
43857ec681f3Smrg   cmd_buffer->state.last_sx_ps_downconvert = -1;
43867ec681f3Smrg   cmd_buffer->state.last_sx_blend_opt_epsilon = -1;
43877ec681f3Smrg   cmd_buffer->state.last_sx_blend_opt_control = -1;
43887ec681f3Smrg   cmd_buffer->state.last_nggc_settings = -1;
43897ec681f3Smrg   cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
43907ec681f3Smrg   cmd_buffer->usage_flags = pBeginInfo->flags;
43917ec681f3Smrg
43927ec681f3Smrg   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
43937ec681f3Smrg       (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
43947ec681f3Smrg      assert(pBeginInfo->pInheritanceInfo);
43957ec681f3Smrg      cmd_buffer->state.framebuffer =
43967ec681f3Smrg         radv_framebuffer_from_handle(pBeginInfo->pInheritanceInfo->framebuffer);
43977ec681f3Smrg      cmd_buffer->state.pass =
43987ec681f3Smrg         radv_render_pass_from_handle(pBeginInfo->pInheritanceInfo->renderPass);
43997ec681f3Smrg
44007ec681f3Smrg      struct radv_subpass *subpass =
44017ec681f3Smrg         &cmd_buffer->state.pass->subpasses[pBeginInfo->pInheritanceInfo->subpass];
44027ec681f3Smrg
44037ec681f3Smrg      if (cmd_buffer->state.framebuffer) {
44047ec681f3Smrg         result = radv_cmd_state_setup_attachments(cmd_buffer, cmd_buffer->state.pass, NULL, NULL);
44057ec681f3Smrg         if (result != VK_SUCCESS)
44067ec681f3Smrg            return result;
44077ec681f3Smrg      }
44087ec681f3Smrg
44097ec681f3Smrg      cmd_buffer->state.inherited_pipeline_statistics =
44107ec681f3Smrg         pBeginInfo->pInheritanceInfo->pipelineStatistics;
44117ec681f3Smrg
44127ec681f3Smrg      radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
44137ec681f3Smrg   }
4414ed98bd31Smaya
44157ec681f3Smrg   if (unlikely(cmd_buffer->device->trace_bo))
44167ec681f3Smrg      radv_cmd_buffer_trace_emit(cmd_buffer);
441701e04c3fSmrg
44187ec681f3Smrg   radv_describe_begin_cmd_buffer(cmd_buffer);
441901e04c3fSmrg
44207ec681f3Smrg   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
44217ec681f3Smrg
44227ec681f3Smrg   return result;
442301e04c3fSmrg}
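
/*
 * Illustrative sketch (not part of the driver): the block of -1 writes
 * in radv_BeginCommandBuffer above primes the "last seen" caches with a
 * value no valid state can take, so the first draw after
 * vkBeginCommandBuffer always re-emits these registers instead of
 * trusting stale state. The helper is hypothetical.
 */
static void
radv_example_init_sentinels(int32_t *cached_regs, unsigned count)
{
   for (unsigned i = 0; i < count; i++)
      cached_regs[i] = -1; /* "unknown": first comparison always fails */
}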
442401e04c3fSmrg
44257ec681f3Smrgvoid
44267ec681f3Smrgradv_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, uint32_t firstBinding,
44277ec681f3Smrg                          uint32_t bindingCount, const VkBuffer *pBuffers,
44287ec681f3Smrg                          const VkDeviceSize *pOffsets)
442901e04c3fSmrg{
44307ec681f3Smrg   radv_CmdBindVertexBuffers2EXT(commandBuffer, firstBinding, bindingCount, pBuffers, pOffsets,
44317ec681f3Smrg                                 NULL, NULL);
44327ec681f3Smrg}
4433ed98bd31Smaya
44347ec681f3Smrgvoid
44357ec681f3Smrgradv_CmdBindVertexBuffers2EXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
44367ec681f3Smrg                              uint32_t bindingCount, const VkBuffer *pBuffers,
44377ec681f3Smrg                              const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
44387ec681f3Smrg                              const VkDeviceSize *pStrides)
44397ec681f3Smrg{
44407ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
44417ec681f3Smrg   struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
44427ec681f3Smrg   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;
44437ec681f3Smrg   bool changed = false;
44447ec681f3Smrg
   /* We have to defer setting up the vertex buffers since we need the
    * buffer strides from the pipeline. */
44477ec681f3Smrg
44487ec681f3Smrg   assert(firstBinding + bindingCount <= MAX_VBS);
44497ec681f3Smrg   cmd_buffer->state.vbo_misaligned_mask = state->misaligned_mask;
44507ec681f3Smrg   enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
44517ec681f3Smrg   for (uint32_t i = 0; i < bindingCount; i++) {
44527ec681f3Smrg      RADV_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
44537ec681f3Smrg      uint32_t idx = firstBinding + i;
44547ec681f3Smrg      VkDeviceSize size = pSizes ? pSizes[i] : 0;
44557ec681f3Smrg      VkDeviceSize stride = pStrides ? pStrides[i] : 0;
44567ec681f3Smrg
44577ec681f3Smrg      /* pSizes and pStrides are optional. */
44587ec681f3Smrg      if (!changed && (vb[idx].buffer != buffer || vb[idx].offset != pOffsets[i] ||
44597ec681f3Smrg                       vb[idx].size != size || (pStrides && vb[idx].stride != stride))) {
44607ec681f3Smrg         changed = true;
44617ec681f3Smrg      }
44627ec681f3Smrg
44637ec681f3Smrg      vb[idx].buffer = buffer;
44647ec681f3Smrg      vb[idx].offset = pOffsets[i];
44657ec681f3Smrg      vb[idx].size = size;
      /* If pStrides is NULL, it must not overwrite the strides specified
       * by CmdSetVertexInputEXT. */
44677ec681f3Smrg
44687ec681f3Smrg      if (chip == GFX6 || chip >= GFX10) {
44697ec681f3Smrg         const uint32_t bit = 1u << idx;
44707ec681f3Smrg         if (!buffer) {
44717ec681f3Smrg            cmd_buffer->state.vbo_misaligned_mask &= ~bit;
44727ec681f3Smrg            cmd_buffer->state.vbo_bound_mask &= ~bit;
44737ec681f3Smrg         } else {
44747ec681f3Smrg            cmd_buffer->state.vbo_bound_mask |= bit;
44757ec681f3Smrg            if (pStrides && vb[idx].stride != stride) {
44767ec681f3Smrg               if (stride & state->format_align_req_minus_1[idx])
44777ec681f3Smrg                  cmd_buffer->state.vbo_misaligned_mask |= bit;
44787ec681f3Smrg               else
44797ec681f3Smrg                  cmd_buffer->state.vbo_misaligned_mask &= ~bit;
44807ec681f3Smrg            }
44817ec681f3Smrg            if (state->possibly_misaligned_mask & bit &&
44827ec681f3Smrg                (vb[idx].offset + state->offsets[idx]) & state->format_align_req_minus_1[idx])
44837ec681f3Smrg               cmd_buffer->state.vbo_misaligned_mask |= bit;
44847ec681f3Smrg         }
44857ec681f3Smrg      }
44867ec681f3Smrg
44877ec681f3Smrg      if (pStrides)
44887ec681f3Smrg         vb[idx].stride = stride;
44897ec681f3Smrg
44907ec681f3Smrg      if (buffer) {
44917ec681f3Smrg         radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, vb[idx].buffer->bo);
44927ec681f3Smrg      }
44937ec681f3Smrg   }
44947ec681f3Smrg
44957ec681f3Smrg   if (!changed) {
44967ec681f3Smrg      /* No state changes. */
44977ec681f3Smrg      return;
44987ec681f3Smrg   }
44997ec681f3Smrg
45007ec681f3Smrg   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
45017ec681f3Smrg                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
45027ec681f3Smrg}
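
/*
 * Illustrative sketch (not part of the driver): the misaligned-mask
 * updates above rely on the usual power-of-two trick. For an alignment
 * requirement A (a power of two), "x & (A - 1)" is non-zero exactly
 * when x is not a multiple of A; format_align_req_minus_1 stores A - 1
 * directly so the check is a single AND per binding. The helper is
 * hypothetical.
 */
static bool
radv_example_is_misaligned(uint64_t offset, uint32_t align_minus_1)
{
   /* e.g. a 4-byte requirement stores 3; offset 6: 6 & 3 == 2 -> misaligned */
   return (offset & align_minus_1) != 0;
}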
450301e04c3fSmrg
45047ec681f3Smrgstatic uint32_t
45057ec681f3Smrgvk_to_index_type(VkIndexType type)
45067ec681f3Smrg{
45077ec681f3Smrg   switch (type) {
45087ec681f3Smrg   case VK_INDEX_TYPE_UINT8_EXT:
45097ec681f3Smrg      return V_028A7C_VGT_INDEX_8;
45107ec681f3Smrg   case VK_INDEX_TYPE_UINT16:
45117ec681f3Smrg      return V_028A7C_VGT_INDEX_16;
45127ec681f3Smrg   case VK_INDEX_TYPE_UINT32:
45137ec681f3Smrg      return V_028A7C_VGT_INDEX_32;
45147ec681f3Smrg   default:
45157ec681f3Smrg      unreachable("invalid index type");
45167ec681f3Smrg   }
45177ec681f3Smrg}
451801e04c3fSmrg
45197ec681f3Smrgstatic uint32_t
45207ec681f3Smrgradv_get_vgt_index_size(uint32_t type)
45217ec681f3Smrg{
45227ec681f3Smrg   switch (type) {
45237ec681f3Smrg   case V_028A7C_VGT_INDEX_8:
45247ec681f3Smrg      return 1;
45257ec681f3Smrg   case V_028A7C_VGT_INDEX_16:
45267ec681f3Smrg      return 2;
45277ec681f3Smrg   case V_028A7C_VGT_INDEX_32:
45287ec681f3Smrg      return 4;
45297ec681f3Smrg   default:
45307ec681f3Smrg      unreachable("invalid index type");
45317ec681f3Smrg   }
453201e04c3fSmrg}
453301e04c3fSmrg
45347ec681f3Smrgvoid
45357ec681f3Smrgradv_CmdBindIndexBuffer(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
45367ec681f3Smrg                        VkIndexType indexType)
453701e04c3fSmrg{
45387ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
45397ec681f3Smrg   RADV_FROM_HANDLE(radv_buffer, index_buffer, buffer);
4540ed98bd31Smaya
45417ec681f3Smrg   if (cmd_buffer->state.index_buffer == index_buffer && cmd_buffer->state.index_offset == offset &&
45427ec681f3Smrg       cmd_buffer->state.index_type == indexType) {
45437ec681f3Smrg      /* No state changes. */
45447ec681f3Smrg      return;
45457ec681f3Smrg   }
454601e04c3fSmrg
45477ec681f3Smrg   cmd_buffer->state.index_buffer = index_buffer;
45487ec681f3Smrg   cmd_buffer->state.index_offset = offset;
45497ec681f3Smrg   cmd_buffer->state.index_type = vk_to_index_type(indexType);
45507ec681f3Smrg   cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
45517ec681f3Smrg   cmd_buffer->state.index_va += index_buffer->offset + offset;
455201e04c3fSmrg
45537ec681f3Smrg   int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
45547ec681f3Smrg   cmd_buffer->state.max_index_count = (index_buffer->size - offset) / index_size;
45557ec681f3Smrg   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
45567ec681f3Smrg   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
455701e04c3fSmrg}
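
/*
 * Illustrative sketch (not part of the driver): max_index_count above
 * is simply the number of whole indices that fit in the buffer past the
 * bound offset. For a 256-byte buffer bound at offset 4 with
 * VK_INDEX_TYPE_UINT16 (2 bytes per index) this yields (256 - 4) / 2 =
 * 126. The helper is hypothetical.
 */
static uint32_t
radv_example_max_index_count(uint64_t buffer_size, uint64_t offset,
                             uint32_t index_size)
{
   return (uint32_t)((buffer_size - offset) / index_size);
}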
455801e04c3fSmrg
45597ec681f3Smrgstatic void
45607ec681f3Smrgradv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
45617ec681f3Smrg                         struct radv_descriptor_set *set, unsigned idx)
456201e04c3fSmrg{
45637ec681f3Smrg   struct radeon_winsys *ws = cmd_buffer->device->ws;
456401e04c3fSmrg
45657ec681f3Smrg   radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
4566ed98bd31Smaya
45677ec681f3Smrg   assert(set);
45687ec681f3Smrg
45697ec681f3Smrg   if (!cmd_buffer->device->use_global_bo_list) {
45707ec681f3Smrg      for (unsigned j = 0; j < set->header.buffer_count; ++j)
45717ec681f3Smrg         if (set->descriptors[j])
45727ec681f3Smrg            radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
45737ec681f3Smrg   }
45747ec681f3Smrg
45757ec681f3Smrg   if (set->header.bo)
45767ec681f3Smrg      radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
457701e04c3fSmrg}
457801e04c3fSmrg
45797ec681f3Smrgvoid
45807ec681f3Smrgradv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
45817ec681f3Smrg                           VkPipelineLayout _layout, uint32_t firstSet, uint32_t descriptorSetCount,
45827ec681f3Smrg                           const VkDescriptorSet *pDescriptorSets, uint32_t dynamicOffsetCount,
45837ec681f3Smrg                           const uint32_t *pDynamicOffsets)
45847ec681f3Smrg{
45857ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
45867ec681f3Smrg   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
45877ec681f3Smrg   unsigned dyn_idx = 0;
45887ec681f3Smrg
45897ec681f3Smrg   const bool no_dynamic_bounds =
45907ec681f3Smrg      cmd_buffer->device->instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
45917ec681f3Smrg   struct radv_descriptor_state *descriptors_state =
45927ec681f3Smrg      radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
45937ec681f3Smrg
45947ec681f3Smrg   for (unsigned i = 0; i < descriptorSetCount; ++i) {
45957ec681f3Smrg      unsigned set_idx = i + firstSet;
45967ec681f3Smrg      RADV_FROM_HANDLE(radv_descriptor_set, set, pDescriptorSets[i]);
45977ec681f3Smrg
      /* If the set is already bound, we only need to update the
       * (potentially changed) dynamic offsets. */
46007ec681f3Smrg      if (descriptors_state->sets[set_idx] != set ||
46017ec681f3Smrg          !(descriptors_state->valid & (1u << set_idx))) {
46027ec681f3Smrg         radv_bind_descriptor_set(cmd_buffer, pipelineBindPoint, set, set_idx);
46037ec681f3Smrg      }
46047ec681f3Smrg
46057ec681f3Smrg      for (unsigned j = 0; j < layout->set[set_idx].dynamic_offset_count; ++j, ++dyn_idx) {
         unsigned idx = j + layout->set[set_idx].dynamic_offset_start;
46077ec681f3Smrg         uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
46087ec681f3Smrg         assert(dyn_idx < dynamicOffsetCount);
46097ec681f3Smrg
46107ec681f3Smrg         struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
46117ec681f3Smrg
46127ec681f3Smrg         if (!range->va) {
46137ec681f3Smrg            memset(dst, 0, 4 * 4);
46147ec681f3Smrg         } else {
46157ec681f3Smrg            uint64_t va = range->va + pDynamicOffsets[dyn_idx];
46167ec681f3Smrg            dst[0] = va;
46177ec681f3Smrg            dst[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
46187ec681f3Smrg            dst[2] = no_dynamic_bounds ? 0xffffffffu : range->size;
46197ec681f3Smrg            dst[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
46207ec681f3Smrg                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
46217ec681f3Smrg
46227ec681f3Smrg            if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10) {
46237ec681f3Smrg               dst[3] |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
46247ec681f3Smrg                         S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
46257ec681f3Smrg            } else {
46267ec681f3Smrg               dst[3] |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
46277ec681f3Smrg                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
46287ec681f3Smrg            }
46297ec681f3Smrg         }
46307ec681f3Smrg
46317ec681f3Smrg         cmd_buffer->push_constant_stages |= layout->set[set_idx].dynamic_offset_stages;
46327ec681f3Smrg      }
46337ec681f3Smrg   }
46347ec681f3Smrg}
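
/*
 * Illustrative sketch (not part of the driver): the dynamic buffer
 * descriptors written above split a 64-bit GPU virtual address across
 * two dwords: the low 32 bits go in word 0 and the high bits in word 1
 * (field-packed via S_008F04_BASE_ADDRESS_HI in the real code), with
 * words 2 and 3 carrying the size and format. The plain-C equivalent of
 * the address split is below; the helper is hypothetical.
 */
static void
radv_example_pack_va(uint64_t va, uint32_t *dst)
{
   dst[0] = (uint32_t)va;         /* low 32 bits of the address */
   dst[1] = (uint32_t)(va >> 32); /* high bits, field-packed on HW */
}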
463501e04c3fSmrg
46367ec681f3Smrgstatic bool
46377ec681f3Smrgradv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
46387ec681f3Smrg                              struct radv_descriptor_set_layout *layout,
46397ec681f3Smrg                              VkPipelineBindPoint bind_point)
46407ec681f3Smrg{
46417ec681f3Smrg   struct radv_descriptor_state *descriptors_state =
46427ec681f3Smrg      radv_get_descriptors_state(cmd_buffer, bind_point);
46437ec681f3Smrg   set->header.size = layout->size;
46447ec681f3Smrg   set->header.layout = layout;
464501e04c3fSmrg
46467ec681f3Smrg   if (descriptors_state->push_set.capacity < set->header.size) {
46477ec681f3Smrg      size_t new_size = MAX2(set->header.size, 1024);
46487ec681f3Smrg      new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
46497ec681f3Smrg      new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
465001e04c3fSmrg
46517ec681f3Smrg      free(set->header.mapped_ptr);
46527ec681f3Smrg      set->header.mapped_ptr = malloc(new_size);
465301e04c3fSmrg
46547ec681f3Smrg      if (!set->header.mapped_ptr) {
46557ec681f3Smrg         descriptors_state->push_set.capacity = 0;
46567ec681f3Smrg         cmd_buffer->record_result = VK_ERROR_OUT_OF_HOST_MEMORY;
46577ec681f3Smrg         return false;
46587ec681f3Smrg      }
465901e04c3fSmrg
46607ec681f3Smrg      descriptors_state->push_set.capacity = new_size;
46617ec681f3Smrg   }
466201e04c3fSmrg
46637ec681f3Smrg   return true;
46647ec681f3Smrg}
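
/*
 * Illustrative sketch (not part of the driver): the capacity policy
 * above grows the push set storage geometrically (at least doubling,
 * never below 1024 bytes) and clamps it to 96 * MAX_PUSH_DESCRIPTORS,
 * presumably the worst-case set size the driver assumes. The helper is
 * hypothetical.
 */
static size_t
radv_example_grow_capacity(size_t needed, size_t current, size_t max_size)
{
   size_t new_size = needed > 1024 ? needed : 1024;  /* MAX2(needed, 1024) */
   if (new_size < 2 * current)
      new_size = 2 * current;                        /* MAX2(.., 2 * cur) */
   return new_size < max_size ? new_size : max_size; /* MIN2(.., max) */
}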
466501e04c3fSmrg
46667ec681f3Smrgvoid
46677ec681f3Smrgradv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
46687ec681f3Smrg                              VkPipelineBindPoint pipelineBindPoint, VkPipelineLayout _layout,
46697ec681f3Smrg                              uint32_t set, uint32_t descriptorWriteCount,
46707ec681f3Smrg                              const VkWriteDescriptorSet *pDescriptorWrites)
46717ec681f3Smrg{
46727ec681f3Smrg   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
46737ec681f3Smrg   struct radv_descriptor_set *push_set =
46747ec681f3Smrg      (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
46757ec681f3Smrg   unsigned bo_offset;
467601e04c3fSmrg
46777ec681f3Smrg   assert(set == 0);
46787ec681f3Smrg   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
467901e04c3fSmrg
46807ec681f3Smrg   push_set->header.size = layout->set[set].layout->size;
46817ec681f3Smrg   push_set->header.layout = layout->set[set].layout;
468201e04c3fSmrg
46837ec681f3Smrg   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
46847ec681f3Smrg                                     (void **)&push_set->header.mapped_ptr))
46857ec681f3Smrg      return;
468601e04c3fSmrg
46877ec681f3Smrg   push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
46887ec681f3Smrg   push_set->header.va += bo_offset;
468901e04c3fSmrg
46907ec681f3Smrg   radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
46917ec681f3Smrg                               radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
46927ec681f3Smrg                               pDescriptorWrites, 0, NULL);
469301e04c3fSmrg
46947ec681f3Smrg   radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
469501e04c3fSmrg}
469601e04c3fSmrg
46977ec681f3Smrgvoid
46987ec681f3Smrgradv_CmdPushDescriptorSetKHR(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
46997ec681f3Smrg                             VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
47007ec681f3Smrg                             const VkWriteDescriptorSet *pDescriptorWrites)
470101e04c3fSmrg{
47027ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
47037ec681f3Smrg   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
47047ec681f3Smrg   struct radv_descriptor_state *descriptors_state =
47057ec681f3Smrg      radv_get_descriptors_state(cmd_buffer, pipelineBindPoint);
47067ec681f3Smrg   struct radv_descriptor_set *push_set =
47077ec681f3Smrg      (struct radv_descriptor_set *)&descriptors_state->push_set.set;
470801e04c3fSmrg
47097ec681f3Smrg   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
471001e04c3fSmrg
47117ec681f3Smrg   if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
47127ec681f3Smrg                                      pipelineBindPoint))
47137ec681f3Smrg      return;
471401e04c3fSmrg
   /* Assert that there are no inline uniform block updates when calling
    * vkCmdPushDescriptorSetKHR(), because the Vulkan spec forbids them
    * here.
    */
47187ec681f3Smrg   for (int i = 0; i < descriptorWriteCount; i++) {
47197ec681f3Smrg      ASSERTED const VkWriteDescriptorSet *writeset = &pDescriptorWrites[i];
47207ec681f3Smrg      assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
47217ec681f3Smrg   }
472201e04c3fSmrg
47237ec681f3Smrg   radv_update_descriptor_sets(cmd_buffer->device, cmd_buffer,
47247ec681f3Smrg                               radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
47257ec681f3Smrg                               pDescriptorWrites, 0, NULL);
472601e04c3fSmrg
47277ec681f3Smrg   radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
47287ec681f3Smrg   descriptors_state->push_dirty = true;
472901e04c3fSmrg}
473001e04c3fSmrg
47317ec681f3Smrgvoid
47327ec681f3Smrgradv_CmdPushDescriptorSetWithTemplateKHR(VkCommandBuffer commandBuffer,
47337ec681f3Smrg                                         VkDescriptorUpdateTemplate descriptorUpdateTemplate,
47347ec681f3Smrg                                         VkPipelineLayout _layout, uint32_t set, const void *pData)
473501e04c3fSmrg{
47367ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
47377ec681f3Smrg   RADV_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
47387ec681f3Smrg   RADV_FROM_HANDLE(radv_descriptor_update_template, templ, descriptorUpdateTemplate);
47397ec681f3Smrg   struct radv_descriptor_state *descriptors_state =
47407ec681f3Smrg      radv_get_descriptors_state(cmd_buffer, templ->bind_point);
47417ec681f3Smrg   struct radv_descriptor_set *push_set =
47427ec681f3Smrg      (struct radv_descriptor_set *)&descriptors_state->push_set.set;
474301e04c3fSmrg
47447ec681f3Smrg   assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
474501e04c3fSmrg
47467ec681f3Smrg   if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[set].layout,
47477ec681f3Smrg                                      templ->bind_point))
47487ec681f3Smrg      return;
474901e04c3fSmrg
47507ec681f3Smrg   radv_update_descriptor_set_with_template(cmd_buffer->device, cmd_buffer, push_set,
47517ec681f3Smrg                                            descriptorUpdateTemplate, pData);
475201e04c3fSmrg
47537ec681f3Smrg   radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, set);
47547ec681f3Smrg   descriptors_state->push_dirty = true;
475501e04c3fSmrg}
475601e04c3fSmrg
47577ec681f3Smrgvoid
47587ec681f3Smrgradv_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout,
47597ec681f3Smrg                      VkShaderStageFlags stageFlags, uint32_t offset, uint32_t size,
47607ec681f3Smrg                      const void *pValues)
476101e04c3fSmrg{
47627ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
47637ec681f3Smrg   memcpy(cmd_buffer->push_constants + offset, pValues, size);
47647ec681f3Smrg   cmd_buffer->push_constant_stages |= stageFlags;
476501e04c3fSmrg}
476601e04c3fSmrg
47677ec681f3SmrgVkResult
47687ec681f3Smrgradv_EndCommandBuffer(VkCommandBuffer commandBuffer)
476901e04c3fSmrg{
47707ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
477101e04c3fSmrg
47727ec681f3Smrg   radv_emit_mip_change_flush_default(cmd_buffer);
477301e04c3fSmrg
47747ec681f3Smrg   if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER) {
47757ec681f3Smrg      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX6)
47767ec681f3Smrg         cmd_buffer->state.flush_bits |=
47777ec681f3Smrg            RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
477801e04c3fSmrg
      /* Make sure to sync all pending active queries at the end of the
       * command buffer.
       */
47827ec681f3Smrg      cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
4783ed98bd31Smaya
      /* Flush noncoherent images on GFX9+ so we can assume they're clean at
       * the start of a command buffer.
       */
47877ec681f3Smrg      if (cmd_buffer->state.rb_noncoherent_dirty && can_skip_buffer_l2_flushes(cmd_buffer->device))
47887ec681f3Smrg         cmd_buffer->state.flush_bits |= radv_src_access_flush(
47897ec681f3Smrg            cmd_buffer,
47907ec681f3Smrg            VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT,
47917ec681f3Smrg            NULL);
4792ed98bd31Smaya
47937ec681f3Smrg      /* Since NGG streamout uses GDS, we need to make GDS idle when
47947ec681f3Smrg       * we leave the IB, otherwise another process might overwrite
47957ec681f3Smrg       * it while our shaders are busy.
47967ec681f3Smrg       */
47977ec681f3Smrg      if (cmd_buffer->gds_needed)
47987ec681f3Smrg         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
47997ec681f3Smrg
48007ec681f3Smrg      si_emit_cache_flush(cmd_buffer);
48017ec681f3Smrg   }
4802ed98bd31Smaya
48037ec681f3Smrg   /* Make sure CP DMA is idle at the end of IBs because the kernel
48047ec681f3Smrg    * doesn't wait for it.
48057ec681f3Smrg    */
48067ec681f3Smrg   si_cp_dma_wait_for_idle(cmd_buffer);
4807ed98bd31Smaya
48087ec681f3Smrg   radv_describe_end_cmd_buffer(cmd_buffer);
4809ed98bd31Smaya
48107ec681f3Smrg   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
48117ec681f3Smrg   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
4812ed98bd31Smaya
48137ec681f3Smrg   VkResult result = cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs);
48147ec681f3Smrg   if (result != VK_SUCCESS)
48157ec681f3Smrg      return vk_error(cmd_buffer, result);
4816ed98bd31Smaya
48177ec681f3Smrg   cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
4818ed98bd31Smaya
48197ec681f3Smrg   return cmd_buffer->record_result;
4820ed98bd31Smaya}
4821ed98bd31Smaya
4822ed98bd31Smayastatic void
48237ec681f3Smrgradv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
4824ed98bd31Smaya{
48257ec681f3Smrg   if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
48267ec681f3Smrg      return;
48277ec681f3Smrg
48287ec681f3Smrg   assert(!pipeline->ctx_cs.cdw);
4829ed98bd31Smaya
48307ec681f3Smrg   cmd_buffer->state.emitted_compute_pipeline = pipeline;
4831ed98bd31Smaya
48327ec681f3Smrg   radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
48337ec681f3Smrg   radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
4834ed98bd31Smaya
48357ec681f3Smrg   cmd_buffer->compute_scratch_size_per_wave_needed =
48367ec681f3Smrg      MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, pipeline->scratch_bytes_per_wave);
48377ec681f3Smrg   cmd_buffer->compute_scratch_waves_wanted =
48387ec681f3Smrg      MAX2(cmd_buffer->compute_scratch_waves_wanted, pipeline->max_waves);
4839ed98bd31Smaya
48407ec681f3Smrg   radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
48417ec681f3Smrg                      pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
48427ec681f3Smrg
48437ec681f3Smrg   if (unlikely(cmd_buffer->device->trace_bo))
48447ec681f3Smrg      radv_save_pipeline(cmd_buffer, pipeline);
4845ed98bd31Smaya}
4846ed98bd31Smaya
48477ec681f3Smrgstatic void
48487ec681f3Smrgradv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
484901e04c3fSmrg{
48507ec681f3Smrg   struct radv_descriptor_state *descriptors_state =
48517ec681f3Smrg      radv_get_descriptors_state(cmd_buffer, bind_point);
485201e04c3fSmrg
48537ec681f3Smrg   descriptors_state->dirty |= descriptors_state->valid;
48547ec681f3Smrg}
485501e04c3fSmrg
48567ec681f3Smrgvoid
48577ec681f3Smrgradv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
48587ec681f3Smrg                     VkPipeline _pipeline)
48597ec681f3Smrg{
48607ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
48617ec681f3Smrg   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
48627ec681f3Smrg
48637ec681f3Smrg   switch (pipelineBindPoint) {
48647ec681f3Smrg   case VK_PIPELINE_BIND_POINT_COMPUTE:
48657ec681f3Smrg      if (cmd_buffer->state.compute_pipeline == pipeline)
48667ec681f3Smrg         return;
48677ec681f3Smrg      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
48687ec681f3Smrg
48697ec681f3Smrg      cmd_buffer->state.compute_pipeline = pipeline;
48707ec681f3Smrg      cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
48717ec681f3Smrg      break;
48727ec681f3Smrg   case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
48737ec681f3Smrg      if (cmd_buffer->state.rt_pipeline == pipeline)
48747ec681f3Smrg         return;
48757ec681f3Smrg      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
48767ec681f3Smrg
48777ec681f3Smrg      cmd_buffer->state.rt_pipeline = pipeline;
48787ec681f3Smrg      cmd_buffer->push_constant_stages |=
48797ec681f3Smrg         (VK_SHADER_STAGE_RAYGEN_BIT_KHR | VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
48807ec681f3Smrg          VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR | VK_SHADER_STAGE_MISS_BIT_KHR |
48817ec681f3Smrg          VK_SHADER_STAGE_INTERSECTION_BIT_KHR | VK_SHADER_STAGE_CALLABLE_BIT_KHR);
48827ec681f3Smrg      radv_set_rt_stack_size(cmd_buffer, cmd_buffer->state.rt_stack_size);
48837ec681f3Smrg      break;
48847ec681f3Smrg   case VK_PIPELINE_BIND_POINT_GRAPHICS:
48857ec681f3Smrg      if (cmd_buffer->state.pipeline == pipeline)
48867ec681f3Smrg         return;
48877ec681f3Smrg      radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
48887ec681f3Smrg
48897ec681f3Smrg      bool vtx_emit_count_changed =
48907ec681f3Smrg         !pipeline || !cmd_buffer->state.pipeline ||
48917ec681f3Smrg         cmd_buffer->state.pipeline->graphics.vtx_emit_num != pipeline->graphics.vtx_emit_num ||
48927ec681f3Smrg         cmd_buffer->state.pipeline->graphics.vtx_base_sgpr != pipeline->graphics.vtx_base_sgpr;
48937ec681f3Smrg      cmd_buffer->state.pipeline = pipeline;
48947ec681f3Smrg      if (!pipeline)
48957ec681f3Smrg         break;
48967ec681f3Smrg
48977ec681f3Smrg      cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
48987ec681f3Smrg      cmd_buffer->push_constant_stages |= pipeline->active_stages;
48997ec681f3Smrg
      /* The new vertex shader might not use the same user SGPRs. */
49017ec681f3Smrg      if (vtx_emit_count_changed) {
49027ec681f3Smrg         cmd_buffer->state.last_first_instance = -1;
49037ec681f3Smrg         cmd_buffer->state.last_vertex_offset = -1;
49047ec681f3Smrg         cmd_buffer->state.last_drawid = -1;
49057ec681f3Smrg      }
49067ec681f3Smrg
49077ec681f3Smrg      /* Prefetch all pipeline shaders at first draw time. */
49087ec681f3Smrg      cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
49097ec681f3Smrg
49107ec681f3Smrg      if (cmd_buffer->device->physical_device->rad_info.has_vgt_flush_ngg_legacy_bug &&
49117ec681f3Smrg          cmd_buffer->state.emitted_pipeline &&
49127ec681f3Smrg          cmd_buffer->state.emitted_pipeline->graphics.is_ngg &&
49137ec681f3Smrg          !cmd_buffer->state.pipeline->graphics.is_ngg) {
49147ec681f3Smrg         /* Transitioning from NGG to legacy GS requires
49157ec681f3Smrg          * VGT_FLUSH on GFX10 and Sienna Cichlid. VGT_FLUSH
49167ec681f3Smrg          * is also emitted at the beginning of IBs when legacy
49177ec681f3Smrg          * GS ring pointers are set.
49187ec681f3Smrg          */
49197ec681f3Smrg         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
49207ec681f3Smrg      }
49217ec681f3Smrg
49227ec681f3Smrg      radv_bind_dynamic_state(cmd_buffer, &pipeline->dynamic_state);
49237ec681f3Smrg      radv_bind_streamout_state(cmd_buffer, pipeline);
49247ec681f3Smrg
49257ec681f3Smrg      if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
49267ec681f3Smrg         cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
49277ec681f3Smrg      if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
49287ec681f3Smrg         cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
49297ec681f3Smrg
49307ec681f3Smrg      if (radv_pipeline_has_tess(pipeline))
49317ec681f3Smrg         cmd_buffer->tess_rings_needed = true;
49327ec681f3Smrg      break;
49337ec681f3Smrg   default:
49347ec681f3Smrg      assert(!"invalid bind point");
49357ec681f3Smrg      break;
49367ec681f3Smrg   }
493701e04c3fSmrg}
493801e04c3fSmrg
49397ec681f3Smrgvoid
49407ec681f3Smrgradv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
49417ec681f3Smrg                    const VkViewport *pViewports)
494201e04c3fSmrg{
49437ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
49447ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
49457ec681f3Smrg   ASSERTED const uint32_t total_count = firstViewport + viewportCount;
49467ec681f3Smrg
49477ec681f3Smrg   assert(firstViewport < MAX_VIEWPORTS);
49487ec681f3Smrg   assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
49497ec681f3Smrg
49507ec681f3Smrg   if (total_count <= state->dynamic.viewport.count &&
49517ec681f3Smrg       !memcmp(state->dynamic.viewport.viewports + firstViewport, pViewports,
49527ec681f3Smrg               viewportCount * sizeof(*pViewports))) {
49537ec681f3Smrg      return;
49547ec681f3Smrg   }
49557ec681f3Smrg
49567ec681f3Smrg   if (state->dynamic.viewport.count < total_count)
49577ec681f3Smrg      state->dynamic.viewport.count = total_count;
49587ec681f3Smrg
49597ec681f3Smrg   memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
49607ec681f3Smrg          viewportCount * sizeof(*pViewports));
49617ec681f3Smrg   for (unsigned i = 0; i < viewportCount; i++) {
49627ec681f3Smrg      radv_get_viewport_xform(&pViewports[i],
49637ec681f3Smrg                              state->dynamic.viewport.xform[i + firstViewport].scale,
49647ec681f3Smrg                              state->dynamic.viewport.xform[i + firstViewport].translate);
49657ec681f3Smrg   }
49667ec681f3Smrg
49677ec681f3Smrg   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_VIEWPORT;
496801e04c3fSmrg}
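
/*
 * Illustrative sketch (not part of the driver): the early-out above is
 * the generic redundant-state filter used by most vkCmdSet* entry
 * points in this file: compare the incoming range against the cached
 * copy and skip the dirty bit when nothing changed. The helper is
 * hypothetical.
 */
static bool
radv_example_range_changed(const void *cached, const void *incoming,
                           size_t size)
{
   return memcmp(cached, incoming, size) != 0; /* true -> must re-emit */
}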
496901e04c3fSmrg
49707ec681f3Smrgvoid
49717ec681f3Smrgradv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
49727ec681f3Smrg                   const VkRect2D *pScissors)
497301e04c3fSmrg{
49747ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
49757ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
49767ec681f3Smrg   ASSERTED const uint32_t total_count = firstScissor + scissorCount;
49777ec681f3Smrg
49787ec681f3Smrg   assert(firstScissor < MAX_SCISSORS);
49797ec681f3Smrg   assert(total_count >= 1 && total_count <= MAX_SCISSORS);
49807ec681f3Smrg
49817ec681f3Smrg   if (total_count <= state->dynamic.scissor.count &&
49827ec681f3Smrg       !memcmp(state->dynamic.scissor.scissors + firstScissor, pScissors,
49837ec681f3Smrg               scissorCount * sizeof(*pScissors))) {
49847ec681f3Smrg      return;
49857ec681f3Smrg   }
498601e04c3fSmrg
49877ec681f3Smrg   if (state->dynamic.scissor.count < total_count)
49887ec681f3Smrg      state->dynamic.scissor.count = total_count;
49897ec681f3Smrg
49907ec681f3Smrg   memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
49917ec681f3Smrg          scissorCount * sizeof(*pScissors));
49927ec681f3Smrg
49937ec681f3Smrg   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SCISSOR;
499401e04c3fSmrg}
499501e04c3fSmrg
49967ec681f3Smrgvoid
49977ec681f3Smrgradv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
499801e04c3fSmrg{
49997ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
50007ec681f3Smrg
50017ec681f3Smrg   if (cmd_buffer->state.dynamic.line_width == lineWidth)
50027ec681f3Smrg      return;
50037ec681f3Smrg
50047ec681f3Smrg   cmd_buffer->state.dynamic.line_width = lineWidth;
50057ec681f3Smrg   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH;
500601e04c3fSmrg}
500701e04c3fSmrg
50087ec681f3Smrgvoid
50097ec681f3Smrgradv_CmdSetDepthBias(VkCommandBuffer commandBuffer, float depthBiasConstantFactor,
50107ec681f3Smrg                     float depthBiasClamp, float depthBiasSlopeFactor)
501101e04c3fSmrg{
50127ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
50137ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
501401e04c3fSmrg
50157ec681f3Smrg   if (state->dynamic.depth_bias.bias == depthBiasConstantFactor &&
50167ec681f3Smrg       state->dynamic.depth_bias.clamp == depthBiasClamp &&
50177ec681f3Smrg       state->dynamic.depth_bias.slope == depthBiasSlopeFactor) {
50187ec681f3Smrg      return;
50197ec681f3Smrg   }
502001e04c3fSmrg
50217ec681f3Smrg   state->dynamic.depth_bias.bias = depthBiasConstantFactor;
50227ec681f3Smrg   state->dynamic.depth_bias.clamp = depthBiasClamp;
50237ec681f3Smrg   state->dynamic.depth_bias.slope = depthBiasSlopeFactor;
50247ec681f3Smrg
50257ec681f3Smrg   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
502601e04c3fSmrg}
502701e04c3fSmrg
50287ec681f3Smrgvoid
50297ec681f3Smrgradv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
503001e04c3fSmrg{
50317ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
50327ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
50337ec681f3Smrg
50347ec681f3Smrg   if (!memcmp(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4))
50357ec681f3Smrg      return;
50367ec681f3Smrg
50377ec681f3Smrg   memcpy(state->dynamic.blend_constants, blendConstants, sizeof(float) * 4);
50387ec681f3Smrg
50397ec681f3Smrg   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_BLEND_CONSTANTS;
504001e04c3fSmrg}
504101e04c3fSmrg
50427ec681f3Smrgvoid
50437ec681f3Smrgradv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
504401e04c3fSmrg{
50457ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
50467ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
504701e04c3fSmrg
50487ec681f3Smrg   if (state->dynamic.depth_bounds.min == minDepthBounds &&
50497ec681f3Smrg       state->dynamic.depth_bounds.max == maxDepthBounds) {
50507ec681f3Smrg      return;
50517ec681f3Smrg   }
505201e04c3fSmrg
50537ec681f3Smrg   state->dynamic.depth_bounds.min = minDepthBounds;
50547ec681f3Smrg   state->dynamic.depth_bounds.max = maxDepthBounds;
50557ec681f3Smrg
50567ec681f3Smrg   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS;
505701e04c3fSmrg}
505801e04c3fSmrg
50597ec681f3Smrgvoid
50607ec681f3Smrgradv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
50617ec681f3Smrg                              uint32_t compareMask)
506201e04c3fSmrg{
50637ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
50647ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
50657ec681f3Smrg   bool front_same = state->dynamic.stencil_compare_mask.front == compareMask;
50667ec681f3Smrg   bool back_same = state->dynamic.stencil_compare_mask.back == compareMask;
506701e04c3fSmrg
50687ec681f3Smrg   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
50697ec681f3Smrg       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
50707ec681f3Smrg      return;
50717ec681f3Smrg   }
507201e04c3fSmrg
50737ec681f3Smrg   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
50747ec681f3Smrg      state->dynamic.stencil_compare_mask.front = compareMask;
50757ec681f3Smrg   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
50767ec681f3Smrg      state->dynamic.stencil_compare_mask.back = compareMask;
5077ed98bd31Smaya
50787ec681f3Smrg   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK;
50797ec681f3Smrg}
508001e04c3fSmrg
50817ec681f3Smrgvoid
50827ec681f3Smrgradv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
50837ec681f3Smrg                            uint32_t writeMask)
50847ec681f3Smrg{
50857ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
50867ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
50877ec681f3Smrg   bool front_same = state->dynamic.stencil_write_mask.front == writeMask;
50887ec681f3Smrg   bool back_same = state->dynamic.stencil_write_mask.back == writeMask;
508901e04c3fSmrg
50907ec681f3Smrg   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
50917ec681f3Smrg       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
50927ec681f3Smrg      return;
50937ec681f3Smrg   }
509401e04c3fSmrg
50957ec681f3Smrg   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
50967ec681f3Smrg      state->dynamic.stencil_write_mask.front = writeMask;
50977ec681f3Smrg   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
50987ec681f3Smrg      state->dynamic.stencil_write_mask.back = writeMask;
509901e04c3fSmrg
51007ec681f3Smrg   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_WRITE_MASK;
510101e04c3fSmrg}
510201e04c3fSmrg
51037ec681f3Smrgvoid
51047ec681f3Smrgradv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
51057ec681f3Smrg                            uint32_t reference)
510601e04c3fSmrg{
51077ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
51087ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
51097ec681f3Smrg   bool front_same = state->dynamic.stencil_reference.front == reference;
51107ec681f3Smrg   bool back_same = state->dynamic.stencil_reference.back == reference;
511101e04c3fSmrg
51127ec681f3Smrg   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
51137ec681f3Smrg       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same)) {
51147ec681f3Smrg      return;
51157ec681f3Smrg   }
511601e04c3fSmrg
51177ec681f3Smrg   if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
51187ec681f3Smrg      cmd_buffer->state.dynamic.stencil_reference.front = reference;
51197ec681f3Smrg   if (faceMask & VK_STENCIL_FACE_BACK_BIT)
51207ec681f3Smrg      cmd_buffer->state.dynamic.stencil_reference.back = reference;
512101e04c3fSmrg
51227ec681f3Smrg   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_REFERENCE;
51237ec681f3Smrg}
5124ed98bd31Smaya
51257ec681f3Smrgvoid
51267ec681f3Smrgradv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
51277ec681f3Smrg                               uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
51287ec681f3Smrg{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;

   assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
   assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);

   if (!memcmp(state->dynamic.discard_rectangle.rectangles + firstDiscardRectangle,
               pDiscardRectangles, discardRectangleCount * sizeof(*pDiscardRectangles))) {
      return;
   }

   typed_memcpy(&state->dynamic.discard_rectangle.rectangles[firstDiscardRectangle],
                pDiscardRectangles, discardRectangleCount);

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DISCARD_RECTANGLE;
}

void
radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer,
                              const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);

   state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
   state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
   state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
   typed_memcpy(&state->dynamic.sample_location.locations[0],
                pSampleLocationsInfo->pSampleLocations, pSampleLocationsInfo->sampleLocationsCount);

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS;
}

void
radv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor,
                          uint16_t lineStipplePattern)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.line_stipple.factor == lineStippleFactor &&
       state->dynamic.line_stipple.pattern == lineStipplePattern)
      return;

   state->dynamic.line_stipple.factor = lineStippleFactor;
   state->dynamic.line_stipple.pattern = lineStipplePattern;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LINE_STIPPLE;
}

void
radv_CmdSetCullModeEXT(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.cull_mode == cullMode)
      return;

   state->dynamic.cull_mode = cullMode;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_CULL_MODE;
}

void
radv_CmdSetFrontFaceEXT(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.front_face == frontFace)
      return;

   state->dynamic.front_face = frontFace;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE;
}

void
radv_CmdSetPrimitiveTopologyEXT(VkCommandBuffer commandBuffer,
                                VkPrimitiveTopology primitiveTopology)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   unsigned primitive_topology = si_translate_prim(primitiveTopology);

   if (state->dynamic.primitive_topology == primitive_topology)
      return;

   state->dynamic.primitive_topology = primitive_topology;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
}

void
radv_CmdSetViewportWithCountEXT(VkCommandBuffer commandBuffer, uint32_t viewportCount,
                                const VkViewport *pViewports)
{
   radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
}

void
radv_CmdSetScissorWithCountEXT(VkCommandBuffer commandBuffer, uint32_t scissorCount,
                               const VkRect2D *pScissors)
{
   radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
}

void
radv_CmdSetDepthTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_test_enable == depthTestEnable)
      return;

   state->dynamic.depth_test_enable = depthTestEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE;
}

void
radv_CmdSetDepthWriteEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_write_enable == depthWriteEnable)
      return;

   state->dynamic.depth_write_enable = depthWriteEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE;
}

void
radv_CmdSetDepthCompareOpEXT(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_compare_op == depthCompareOp)
      return;

   state->dynamic.depth_compare_op = depthCompareOp;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP;
}

void
radv_CmdSetDepthBoundsTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bounds_test_enable == depthBoundsTestEnable)
      return;

   state->dynamic.depth_bounds_test_enable = depthBoundsTestEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
}

void
radv_CmdSetStencilTestEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.stencil_test_enable == stencilTestEnable)
      return;

   state->dynamic.stencil_test_enable = stencilTestEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE;
}

void
radv_CmdSetStencilOpEXT(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask,
                        VkStencilOp failOp, VkStencilOp passOp, VkStencilOp depthFailOp,
                        VkCompareOp compareOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   bool front_same = state->dynamic.stencil_op.front.fail_op == failOp &&
                     state->dynamic.stencil_op.front.pass_op == passOp &&
                     state->dynamic.stencil_op.front.depth_fail_op == depthFailOp &&
                     state->dynamic.stencil_op.front.compare_op == compareOp;
   bool back_same = state->dynamic.stencil_op.back.fail_op == failOp &&
                    state->dynamic.stencil_op.back.pass_op == passOp &&
                    state->dynamic.stencil_op.back.depth_fail_op == depthFailOp &&
                    state->dynamic.stencil_op.back.compare_op == compareOp;

   if ((!(faceMask & VK_STENCIL_FACE_FRONT_BIT) || front_same) &&
       (!(faceMask & VK_STENCIL_FACE_BACK_BIT) || back_same))
      return;

   if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
      state->dynamic.stencil_op.front.fail_op = failOp;
      state->dynamic.stencil_op.front.pass_op = passOp;
      state->dynamic.stencil_op.front.depth_fail_op = depthFailOp;
      state->dynamic.stencil_op.front.compare_op = compareOp;
   }

   if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
      state->dynamic.stencil_op.back.fail_op = failOp;
      state->dynamic.stencil_op.back.pass_op = passOp;
      state->dynamic.stencil_op.back.depth_fail_op = depthFailOp;
      state->dynamic.stencil_op.back.compare_op = compareOp;
   }

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
}

void
radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
                                  const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.fragment_shading_rate.size.width == pFragmentSize->width &&
       state->dynamic.fragment_shading_rate.size.height == pFragmentSize->height &&
       state->dynamic.fragment_shading_rate.combiner_ops[0] == combinerOps[0] &&
       state->dynamic.fragment_shading_rate.combiner_ops[1] == combinerOps[1])
      return;

   state->dynamic.fragment_shading_rate.size = *pFragmentSize;
   for (unsigned i = 0; i < 2; i++)
      state->dynamic.fragment_shading_rate.combiner_ops[i] = combinerOps[i];

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_FRAGMENT_SHADING_RATE;
}

void
radv_CmdSetDepthBiasEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.depth_bias_enable == depthBiasEnable)
      return;

   state->dynamic.depth_bias_enable = depthBiasEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE;
}

void
radv_CmdSetPrimitiveRestartEnableEXT(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.primitive_restart_enable == primitiveRestartEnable)
      return;

   state->dynamic.primitive_restart_enable = primitiveRestartEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
}

void
radv_CmdSetRasterizerDiscardEnableEXT(VkCommandBuffer commandBuffer,
                                      VkBool32 rasterizerDiscardEnable)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;

   if (state->dynamic.rasterizer_discard_enable == rasterizerDiscardEnable)
      return;

   state->dynamic.rasterizer_discard_enable = rasterizerDiscardEnable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
}

void
radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
{
   /* not implemented */
}

void
radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   unsigned logic_op = si_translate_blend_logic_op(logicOp);

   if (state->dynamic.logic_op == logic_op)
      return;

   state->dynamic.logic_op = logic_op;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_LOGIC_OP;
}

void
radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
                               const VkBool32 *pColorWriteEnables)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_cmd_state *state = &cmd_buffer->state;
   uint32_t color_write_enable = 0;

   assert(attachmentCount < MAX_RTS);

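   /* Each attachment contributes a 4-bit RGBA write mask to the packed word;
    * e.g. enabling only attachments 0 and 2 yields
    * color_write_enable == 0xf0f.
    */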
   for (uint32_t i = 0; i < attachmentCount; i++) {
      color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
   }

   if (state->dynamic.color_write_enable == color_write_enable)
      return;

   state->dynamic.color_write_enable = color_write_enable;

   state->dirty |= RADV_CMD_DIRTY_DYNAMIC_COLOR_WRITE_ENABLE;
}

void
radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
                          const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
                          uint32_t vertexAttributeDescriptionCount,
                          const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_vs_input_state *state = &cmd_buffer->state.dynamic_vs_input;

   const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
   for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
      bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];

   cmd_buffer->state.vbo_misaligned_mask = 0;

   memset(state, 0, sizeof(*state));

   enum chip_class chip = cmd_buffer->device->physical_device->rad_info.chip_class;
   for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
      const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
      const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
      unsigned loc = attrib->location;
      const struct util_format_description *format_desc = vk_format_description(attrib->format);
      unsigned nfmt, dfmt;
      bool post_shuffle;
      enum radv_vs_input_alpha_adjust alpha_adjust;

      state->attribute_mask |= 1u << loc;
      state->bindings[loc] = attrib->binding;
      if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
         state->instance_rate_inputs |= 1u << loc;
         state->divisors[loc] = binding->divisor;
         if (binding->divisor != 1)
            state->nontrivial_divisors |= 1u << loc;
      }
      cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
      state->offsets[loc] = attrib->offset;

      radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc,
                                   &dfmt, &nfmt, &post_shuffle, &alpha_adjust);

      state->formats[loc] = dfmt | (nfmt << 4);
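      /* Vertex fetch alignment requirement: formats with 32-bit (or wider)
       * channels only need dword alignment, everything else must be aligned
       * to its full block size.
       */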
      const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 :
                                               (format_desc->block.bits / 8u - 1);
      state->format_align_req_minus_1[loc] = format_align_req_minus_1;
      state->format_sizes[loc] = format_desc->block.bits / 8u;

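      /* GFX6 and GFX10+ require vertex fetches to honor the format's
       * alignment. Track attributes whose binding stride or offset breaks it
       * so a slower but safe fetch path can be used instead (assumption:
       * per-component loads set up by the shader prolog).
       */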
      if (chip == GFX6 || chip >= GFX10) {
         struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
         unsigned bit = 1u << loc;
         if (binding->stride & format_align_req_minus_1) {
            state->misaligned_mask |= bit;
            if (cmd_buffer->state.vbo_bound_mask & bit)
               cmd_buffer->state.vbo_misaligned_mask |= bit;
         } else {
            state->possibly_misaligned_mask |= bit;
            if (cmd_buffer->state.vbo_bound_mask & bit &&
                ((vb[attrib->binding].offset + state->offsets[loc]) & format_align_req_minus_1))
               cmd_buffer->state.vbo_misaligned_mask |= bit;
         }
      }

      if (alpha_adjust) {
         state->alpha_adjust_lo |= (alpha_adjust & 0x1) << loc;
         state->alpha_adjust_hi |= (alpha_adjust >> 1) << loc;
      }

      if (post_shuffle)
         state->post_shuffle |= 1u << loc;
   }

   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER |
                              RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT;
}

void
radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount,
                        const VkCommandBuffer *pCmdBuffers)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);

   assert(commandBufferCount > 0);

   radv_emit_mip_change_flush_default(primary);

   /* Emit pending flushes on primary prior to executing secondary */
   si_emit_cache_flush(primary);

   /* Make sure CP DMA is idle on primary prior to executing secondary. */
   si_cp_dma_wait_for_idle(primary);

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      RADV_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
      bool allow_ib2 = true;

      if (secondary->device->physical_device->rad_info.chip_class == GFX7 &&
          secondary->state.uses_draw_indirect_multi) {
         /* Do not launch an IB2 for secondary command buffers that contain
          * DRAW_{INDEX}_INDIRECT_MULTI on GFX7 because it's illegal and hangs the GPU.
          */
         allow_ib2 = false;
      }

      if (secondary->queue_family_index == RADV_QUEUE_COMPUTE) {
         /* IB2 packets are not supported on compute queues according to PAL. */
         allow_ib2 = false;
      }

      primary->scratch_size_per_wave_needed =
         MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
      primary->scratch_waves_wanted =
         MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
      primary->compute_scratch_size_per_wave_needed =
         MAX2(primary->compute_scratch_size_per_wave_needed,
              secondary->compute_scratch_size_per_wave_needed);
      primary->compute_scratch_waves_wanted =
         MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);

      if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
         primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
      if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
         primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
      if (secondary->tess_rings_needed)
         primary->tess_rings_needed = true;
      if (secondary->sample_positions_needed)
         primary->sample_positions_needed = true;
      if (secondary->gds_needed)
         primary->gds_needed = true;

      if (!secondary->state.framebuffer && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
         /* Emit the framebuffer state from primary if secondary
          * has been recorded without a framebuffer, otherwise
          * fast color/depth clears can't work.
          */
         radv_emit_fb_mip_change_flush(primary);
         radv_emit_framebuffer_state(primary);
      }

      primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);

      /* When the secondary command buffer is compute only we don't
       * need to re-emit the current graphics pipeline.
       */
      if (secondary->state.emitted_pipeline) {
         primary->state.emitted_pipeline = secondary->state.emitted_pipeline;
      }

      /* When the secondary command buffer is graphics only we don't
       * need to re-emit the current compute pipeline.
       */
      if (secondary->state.emitted_compute_pipeline) {
         primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
      }

      /* Only re-emit the draw packets when needed. */
      if (secondary->state.last_primitive_reset_en != -1) {
         primary->state.last_primitive_reset_en = secondary->state.last_primitive_reset_en;
      }

      if (secondary->state.last_primitive_reset_index) {
         primary->state.last_primitive_reset_index = secondary->state.last_primitive_reset_index;
      }

      if (secondary->state.last_ia_multi_vgt_param) {
         primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
      }

      primary->state.last_first_instance = secondary->state.last_first_instance;
      primary->state.last_num_instances = secondary->state.last_num_instances;
      primary->state.last_drawid = secondary->state.last_drawid;
      primary->state.last_vertex_offset = secondary->state.last_vertex_offset;
      primary->state.last_sx_ps_downconvert = secondary->state.last_sx_ps_downconvert;
      primary->state.last_sx_blend_opt_epsilon = secondary->state.last_sx_blend_opt_epsilon;
      primary->state.last_sx_blend_opt_control = secondary->state.last_sx_blend_opt_control;

      if (secondary->state.last_index_type != -1) {
         primary->state.last_index_type = secondary->state.last_index_type;
      }

      primary->state.last_nggc_settings = secondary->state.last_nggc_settings;
      primary->state.last_nggc_settings_sgpr_idx = secondary->state.last_nggc_settings_sgpr_idx;
      primary->state.last_nggc_skip = secondary->state.last_nggc_skip;
   }

   /* After executing commands from secondary buffers we have to mark some
    * states dirty again.
    */
   primary->state.dirty |=
      RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_DYNAMIC_ALL;
   radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
   radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
}

VkResult
radv_CreateCommandPool(VkDevice _device, const VkCommandPoolCreateInfo *pCreateInfo,
                       const VkAllocationCallbacks *pAllocator, VkCommandPool *pCmdPool)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   struct radv_cmd_pool *pool;

   pool =
      vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_COMMAND_POOL);

   if (pAllocator)
      pool->alloc = *pAllocator;
   else
      pool->alloc = device->vk.alloc;

   list_inithead(&pool->cmd_buffers);
   list_inithead(&pool->free_cmd_buffers);

   pool->queue_family_index = pCreateInfo->queueFamilyIndex;

   *pCmdPool = radv_cmd_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
radv_DestroyCommandPool(VkDevice _device, VkCommandPool commandPool,
                        const VkAllocationCallbacks *pAllocator)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);

   if (!pool)
      return;

   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
   {
      radv_destroy_cmd_buffer(cmd_buffer);
   }

   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
   {
      radv_destroy_cmd_buffer(cmd_buffer);
   }

   vk_object_base_finish(&pool->base);
   vk_free2(&device->vk.alloc, pAllocator, pool);
}

VkResult
radv_ResetCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolResetFlags flags)
{
   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);
   VkResult result;

   list_for_each_entry(struct radv_cmd_buffer, cmd_buffer, &pool->cmd_buffers, pool_link)
   {
      result = radv_reset_cmd_buffer(cmd_buffer);
      if (result != VK_SUCCESS)
         return result;
   }

   return VK_SUCCESS;
}

void
radv_TrimCommandPool(VkDevice device, VkCommandPool commandPool, VkCommandPoolTrimFlags flags)
{
   RADV_FROM_HANDLE(radv_cmd_pool, pool, commandPool);

   if (!pool)
      return;

   list_for_each_entry_safe(struct radv_cmd_buffer, cmd_buffer, &pool->free_cmd_buffers, pool_link)
   {
      radv_destroy_cmd_buffer(cmd_buffer);
   }
}

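/* Begin the given subpass: flush its start barrier, perform the attachment
 * layout transitions, copy VRS rates into HTILE when a VRS attachment is
 * bound, and emit any subpass clears.
 */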
static void
radv_cmd_buffer_begin_subpass(struct radv_cmd_buffer *cmd_buffer, uint32_t subpass_id)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radv_subpass *subpass = &state->pass->subpasses[subpass_id];

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096);

   radv_emit_subpass_barrier(cmd_buffer, &subpass->start_barrier);

   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);

   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);

   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      radv_handle_subpass_image_transition(cmd_buffer, subpass->attachments[i], true);
   }

   if (subpass->vrs_attachment) {
      int idx = subpass->vrs_attachment->attachment;
      struct radv_image_view *vrs_iview = cmd_buffer->state.attachments[idx].iview;

      if (subpass->depth_stencil_attachment) {
         /* When a subpass uses a VRS attachment and a depth/stencil attachment, we just need to
          * copy the VRS rates to the HTILE buffer of the attachment.
          */
         int ds_idx = subpass->depth_stencil_attachment->attachment;
         struct radv_image_view *ds_iview = cmd_buffer->state.attachments[ds_idx].iview;
         struct radv_image *ds_image = ds_iview->image;

         VkExtent2D extent = {
            .width = ds_image->info.width,
            .height = ds_image->info.height,
         };

         /* HTILE buffer */
         uint64_t htile_offset = ds_image->offset + ds_image->planes[0].surface.meta_offset;
         uint64_t htile_size = ds_image->planes[0].surface.meta_slice_size;
         struct radv_buffer htile_buffer;

         radv_buffer_init(&htile_buffer, cmd_buffer->device, ds_image->bo, htile_size, htile_offset);

         /* Copy the VRS rates to the HTILE buffer. */
         radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, &htile_buffer, true);

         radv_buffer_finish(&htile_buffer);
      } else {
         /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have
          * to copy the VRS rates to our internal HTILE buffer.
          */
         struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
         struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);

         if (ds_image) {
            /* HTILE buffer */
            struct radv_buffer *htile_buffer = cmd_buffer->device->vrs.buffer;

            VkExtent2D extent = {
               .width = MIN2(fb->width, ds_image->info.width),
               .height = MIN2(fb->height, ds_image->info.height),
            };

            /* Copy the VRS rates to the HTILE buffer. */
            radv_copy_vrs_htile(cmd_buffer, vrs_iview->image, &extent, ds_image, htile_buffer, false);
         }
      }
   }

   radv_describe_barrier_end(cmd_buffer);

   radv_cmd_buffer_clear_subpass(cmd_buffer);

   assert(cmd_buffer->cs->cdw <= cdw_max);
}

static void
radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
{
   const struct radv_subpass *subpass = cmd_buffer->state.subpass;

   /* Have to be conservative in cmdbuffers with inherited attachments. */
   if (!cmd_buffer->state.attachments) {
      cmd_buffer->state.rb_noncoherent_dirty = true;
      return;
   }

   for (uint32_t i = 0; i < subpass->color_count; ++i) {
      const uint32_t a = subpass->color_attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;
      if (!cmd_buffer->state.attachments[a].iview->image->l2_coherent) {
         cmd_buffer->state.rb_noncoherent_dirty = true;
         return;
      }
   }
   if (subpass->depth_stencil_attachment &&
       !cmd_buffer->state.attachments[subpass->depth_stencil_attachment->attachment]
           .iview->image->l2_coherent)
      cmd_buffer->state.rb_noncoherent_dirty = true;
}

void
radv_cmd_buffer_restore_subpass(struct radv_cmd_buffer *cmd_buffer,
                                const struct radv_subpass *subpass)
{
   radv_mark_noncoherent_rb(cmd_buffer);
   radv_cmd_buffer_set_subpass(cmd_buffer, subpass);
}

static void
radv_cmd_buffer_end_subpass(struct radv_cmd_buffer *cmd_buffer)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   const struct radv_subpass *subpass = state->subpass;
   uint32_t subpass_id = radv_get_subpass_id(cmd_buffer);

   radv_cmd_buffer_resolve_subpass(cmd_buffer);

   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);

   for (uint32_t i = 0; i < subpass->attachment_count; ++i) {
      const uint32_t a = subpass->attachments[i].attachment;
      if (a == VK_ATTACHMENT_UNUSED)
         continue;

      if (state->pass->attachments[a].last_subpass_idx != subpass_id)
         continue;

      VkImageLayout layout = state->pass->attachments[a].final_layout;
      VkImageLayout stencil_layout = state->pass->attachments[a].stencil_final_layout;
      struct radv_subpass_attachment att = {a, layout, stencil_layout};
      radv_handle_subpass_image_transition(cmd_buffer, att, false);
   }

   radv_describe_barrier_end(cmd_buffer);
}

void
radv_cmd_buffer_begin_render_pass(struct radv_cmd_buffer *cmd_buffer,
                                  const VkRenderPassBeginInfo *pRenderPassBegin,
                                  const struct radv_extra_render_pass_begin_info *extra_info)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pRenderPassBegin->renderPass);
   RADV_FROM_HANDLE(radv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);
   VkResult result;

   cmd_buffer->state.framebuffer = framebuffer;
   cmd_buffer->state.pass = pass;
   cmd_buffer->state.render_area = pRenderPassBegin->renderArea;

   result = radv_cmd_state_setup_attachments(cmd_buffer, pass, pRenderPassBegin, extra_info);
   if (result != VK_SUCCESS)
      return;

   result = radv_cmd_state_setup_sample_locations(cmd_buffer, pass, pRenderPassBegin);
   if (result != VK_SUCCESS)
      return;
}

void
radv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
                         const VkRenderPassBeginInfo *pRenderPassBeginInfo,
                         const VkSubpassBeginInfo *pSubpassBeginInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_cmd_buffer_begin_render_pass(cmd_buffer, pRenderPassBeginInfo, NULL);

   radv_cmd_buffer_begin_subpass(cmd_buffer, 0);
}

void
radv_CmdNextSubpass2(VkCommandBuffer commandBuffer, const VkSubpassBeginInfo *pSubpassBeginInfo,
                     const VkSubpassEndInfo *pSubpassEndInfo)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   radv_mark_noncoherent_rb(cmd_buffer);

   uint32_t prev_subpass = radv_get_subpass_id(cmd_buffer);
   radv_cmd_buffer_end_subpass(cmd_buffer);
   radv_cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
}

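/* Write "index" to the AC_UD_VIEW_INDEX user SGPR of every active stage
 * (including the GS copy shader) so that multiview draws address the right
 * view.
 */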
static void
radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
{
   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
   for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
      if (!radv_get_shader(pipeline, stage))
         continue;

      struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, AC_UD_VIEW_INDEX);
      if (loc->sgpr_idx == -1)
         continue;
      uint32_t base_reg = pipeline->user_data_0[stage];
      radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
   }
   if (radv_pipeline_has_gs_copy_shader(pipeline)) {
      struct radv_userdata_info *loc =
         &pipeline->gs_copy_shader->info.user_sgprs_locs.shader_data[AC_UD_VIEW_INDEX];
      if (loc->sgpr_idx != -1) {
         uint32_t base_reg = R_00B130_SPI_SHADER_USER_DATA_VS_0;
         radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, index);
      }
   }
}

static void
radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count,
                         uint32_t use_opaque)
{
   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
   radeon_emit(cmd_buffer->cs, vertex_count);
   radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
}

/**
 * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
 *
 * The starting address "index_va" may point anywhere within the index buffer. The number of
 * indices allocated in the index buffer *past that point* is specified by "max_index_count".
 * Hardware uses this information to return 0 for out-of-bounds reads.
 */
static void
radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va,
                                 uint32_t max_index_count, uint32_t index_count, bool not_eop)
{
   radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
   radeon_emit(cmd_buffer->cs, max_index_count);
   radeon_emit(cmd_buffer->cs, index_va);
   radeon_emit(cmd_buffer->cs, index_va >> 32);
   radeon_emit(cmd_buffer->cs, index_count);
   /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
    * can be changed between draws and GS fast launch must be disabled.
    * NOT_EOP doesn't work on gfx9 and older.
    */
   radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
}

/* MUST inline this function to avoid massive perf loss in drawoverhead */
ALWAYS_INLINE static void
radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed,
                                  uint32_t draw_count, uint64_t count_va, uint32_t stride)
{
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
   bool draw_id_enable = cmd_buffer->state.pipeline->graphics.uses_drawid;
   uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
   uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
   bool predicating = cmd_buffer->state.predicating;
   assert(base_reg);

   /* just reset draw state for vertex data */
   cmd_buffer->state.last_first_instance = -1;
   cmd_buffer->state.last_num_instances = -1;
   cmd_buffer->state.last_drawid = -1;
   cmd_buffer->state.last_vertex_offset = -1;

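   /* The indirect draw packets reference user SGPRs by their dword offset
    * from the SH register base.
    */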
   vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
   if (cmd_buffer->state.pipeline->graphics.uses_baseinstance)
      start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
   if (draw_id_enable)
      draw_id_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;

   if (draw_count == 1 && !count_va && !draw_id_enable) {
      radeon_emit(cs,
                  PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
      radeon_emit(cs, 0);
      radeon_emit(cs, vertex_offset_reg);
      radeon_emit(cs, start_instance_reg);
      radeon_emit(cs, di_src_sel);
   } else {
      radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8,
                           predicating));
      radeon_emit(cs, 0);
      radeon_emit(cs, vertex_offset_reg);
      radeon_emit(cs, start_instance_reg);
      radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) |
                         S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
      radeon_emit(cs, draw_count); /* count */
      radeon_emit(cs, count_va);   /* count_addr */
      radeon_emit(cs, count_va >> 32);
      radeon_emit(cs, stride); /* stride */
      radeon_emit(cs, di_src_sel);

      cmd_buffer->state.uses_draw_indirect_multi = true;
   }
}

static inline void
radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer,
                                   const struct radv_draw_info *info, const uint32_t vertex_offset)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance;
   const bool uses_drawid = state->pipeline->graphics.uses_drawid;
   radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr,
                         state->pipeline->graphics.vtx_emit_num);

   radeon_emit(cs, vertex_offset);
   state->last_vertex_offset = vertex_offset;
   if (uses_drawid) {
      radeon_emit(cs, 0);
      state->last_drawid = 0;
   }
   if (uses_baseinstance) {
      radeon_emit(cs, info->first_instance);
      state->last_first_instance = info->first_instance;
   }
}

ALWAYS_INLINE static void
radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
                          const uint32_t vertex_offset)
{
   const struct radv_cmd_state *state = &cmd_buffer->state;
   const bool uses_baseinstance = state->pipeline->graphics.uses_baseinstance;
   const bool uses_drawid = state->pipeline->graphics.uses_drawid;

   /* this looks very dumb, but it allows the compiler to optimize better and yields
    * ~3-4% perf increase in drawoverhead
    */
   if (vertex_offset != state->last_vertex_offset) {
      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
   } else if (uses_drawid && 0 != state->last_drawid) {
      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
   } else if (uses_baseinstance && info->first_instance != state->last_first_instance) {
      radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
   }
}

ALWAYS_INLINE static void
radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   radeon_set_sh_reg_seq(cs, state->pipeline->graphics.vtx_base_sgpr, 1 + !!drawid);
   radeon_emit(cs, vertex_offset);
   state->last_vertex_offset = vertex_offset;
   if (drawid)
      radeon_emit(cs, drawid);
}

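/* Emit direct indexed draws. The paths below specialize on whether the
 * pipeline consumes gl_DrawID and on whether all draws share one vertex
 * offset, so the per-draw user SGPR traffic stays minimal.
 */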
ALWAYS_INLINE static void
radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer,
                               const struct radv_draw_info *info,
                               uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo,
                               uint32_t stride,
                               const int32_t *vertexOffset)
{
   struct radv_cmd_state *state = &cmd_buffer->state;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   const int index_size = radv_get_vgt_index_size(state->index_type);
   unsigned i = 0;
   const bool uses_drawid = state->pipeline->graphics.uses_drawid;
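   /* NOT_EOP merging allows only user VGPRs to change between draws, so it is
    * unsafe when the draw id user SGPR must be rewritten, and it only works on
    * GFX10+.
    */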
60937ec681f3Smrg   const bool can_eop = !uses_drawid && cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10;
60947ec681f3Smrg
60957ec681f3Smrg   if (uses_drawid) {
60967ec681f3Smrg      if (vertexOffset) {
60977ec681f3Smrg         radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
60987ec681f3Smrg         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
60997ec681f3Smrg            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
61007ec681f3Smrg
61017ec681f3Smrg            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
61027ec681f3Smrg            if (!remaining_indexes &&
61037ec681f3Smrg                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
61047ec681f3Smrg               continue;
61057ec681f3Smrg
61067ec681f3Smrg            if (i > 0)
61077ec681f3Smrg               radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i);
61087ec681f3Smrg
61097ec681f3Smrg            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
61107ec681f3Smrg
61117ec681f3Smrg            if (!state->subpass->view_mask) {
61127ec681f3Smrg               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
61137ec681f3Smrg            } else {
61147ec681f3Smrg               u_foreach_bit(view, state->subpass->view_mask) {
61157ec681f3Smrg                  radv_emit_view_index(cmd_buffer, view);
61167ec681f3Smrg
61177ec681f3Smrg                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
61187ec681f3Smrg               }
61197ec681f3Smrg            }
61207ec681f3Smrg         }
61217ec681f3Smrg      } else {
61227ec681f3Smrg         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
61237ec681f3Smrg            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
61247ec681f3Smrg
61257ec681f3Smrg            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
61267ec681f3Smrg            if (!remaining_indexes &&
61277ec681f3Smrg                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
61287ec681f3Smrg               continue;
61297ec681f3Smrg
61307ec681f3Smrg            if (i > 0) {
61317ec681f3Smrg               if (state->last_vertex_offset != draw->vertexOffset)
61327ec681f3Smrg                  radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
61337ec681f3Smrg               else
61347ec681f3Smrg                  radeon_set_sh_reg(cs, state->pipeline->graphics.vtx_base_sgpr + sizeof(uint32_t), i);
61357ec681f3Smrg            } else
61367ec681f3Smrg               radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
61377ec681f3Smrg
61387ec681f3Smrg            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
61397ec681f3Smrg
61407ec681f3Smrg            if (!state->subpass->view_mask) {
61417ec681f3Smrg               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
61427ec681f3Smrg            } else {
61437ec681f3Smrg               u_foreach_bit(view, state->subpass->view_mask) {
61447ec681f3Smrg                  radv_emit_view_index(cmd_buffer, view);
61457ec681f3Smrg
61467ec681f3Smrg                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
61477ec681f3Smrg               }
61487ec681f3Smrg            }
61497ec681f3Smrg         }
61507ec681f3Smrg      }
61517ec681f3Smrg      if (drawCount > 1) {
61527ec681f3Smrg         state->last_drawid = drawCount - 1;
61537ec681f3Smrg      }
61547ec681f3Smrg   } else {
61557ec681f3Smrg      if (vertexOffset) {
61567ec681f3Smrg         if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX10) {
61577ec681f3Smrg            /* GFX10 has a bug that consecutive draw packets with NOT_EOP must not have
61587ec681f3Smrg             * count == 0 for the last draw that doesn't have NOT_EOP.
61597ec681f3Smrg             */
61607ec681f3Smrg            while (drawCount > 1) {
61617ec681f3Smrg               const VkMultiDrawIndexedInfoEXT *last = (const VkMultiDrawIndexedInfoEXT*)(((const uint8_t*)minfo) + (drawCount - 1) * stride);
61627ec681f3Smrg               if (last->indexCount)
61637ec681f3Smrg                  break;
61647ec681f3Smrg               drawCount--;
61657ec681f3Smrg            }
61667ec681f3Smrg         }
61677ec681f3Smrg
61687ec681f3Smrg         radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
61697ec681f3Smrg         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
61707ec681f3Smrg            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
61717ec681f3Smrg
61727ec681f3Smrg            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
61737ec681f3Smrg            if (!remaining_indexes &&
61747ec681f3Smrg                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
61757ec681f3Smrg               continue;
61767ec681f3Smrg
61777ec681f3Smrg            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
61787ec681f3Smrg
61797ec681f3Smrg            if (!state->subpass->view_mask) {
61807ec681f3Smrg               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && i < drawCount - 1);
61817ec681f3Smrg            } else {
61827ec681f3Smrg               u_foreach_bit(view, state->subpass->view_mask) {
61837ec681f3Smrg                  radv_emit_view_index(cmd_buffer, view);
61847ec681f3Smrg
61857ec681f3Smrg                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
61867ec681f3Smrg               }
61877ec681f3Smrg            }
61887ec681f3Smrg         }
61897ec681f3Smrg      } else {
61907ec681f3Smrg         vk_foreach_multi_draw_indexed(draw, i, minfo, drawCount, stride) {
61917ec681f3Smrg            const uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
61927ec681f3Smrg
61937ec681f3Smrg            /* Skip draw calls with 0-sized index buffers if the GPU can't handle them */
61947ec681f3Smrg            if (!remaining_indexes &&
61957ec681f3Smrg                cmd_buffer->device->physical_device->rad_info.has_zero_index_buffer_bug)
61967ec681f3Smrg               continue;
61977ec681f3Smrg
61987ec681f3Smrg            const VkMultiDrawIndexedInfoEXT *next = (const VkMultiDrawIndexedInfoEXT*)(i < drawCount - 1 ? ((uint8_t*)draw + stride) : NULL);
61997ec681f3Smrg            const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
62007ec681f3Smrg            radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
62017ec681f3Smrg
62027ec681f3Smrg            const uint64_t index_va = state->index_va + draw->firstIndex * index_size;
62037ec681f3Smrg
62047ec681f3Smrg            if (!state->subpass->view_mask) {
62057ec681f3Smrg               radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, can_eop && !offset_changes && i < drawCount - 1);
62067ec681f3Smrg            } else {
62077ec681f3Smrg               u_foreach_bit(view, state->subpass->view_mask) {
62087ec681f3Smrg                  radv_emit_view_index(cmd_buffer, view);
62097ec681f3Smrg
62107ec681f3Smrg                  radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
62117ec681f3Smrg               }
62127ec681f3Smrg            }
62137ec681f3Smrg         }
62147ec681f3Smrg      }
62157ec681f3Smrg      if (drawCount > 1) {
62167ec681f3Smrg         state->last_drawid = drawCount - 1;
62177ec681f3Smrg      }
62187ec681f3Smrg   }
62197ec681f3Smrg}
62207ec681f3Smrg
62217ec681f3SmrgALWAYS_INLINE static void
62227ec681f3Smrgradv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
62237ec681f3Smrg                              uint32_t drawCount, const VkMultiDrawInfoEXT *minfo,
62247ec681f3Smrg                              uint32_t use_opaque, uint32_t stride)
62257ec681f3Smrg{
62267ec681f3Smrg   unsigned i = 0;
62277ec681f3Smrg   const uint32_t view_mask = cmd_buffer->state.subpass->view_mask;
62287ec681f3Smrg   const bool uses_drawid = cmd_buffer->state.pipeline->graphics.uses_drawid;
62297ec681f3Smrg   uint32_t last_start = 0;
62307ec681f3Smrg
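7ec681f3Smrg   /* The first iteration emits the full vertex user SGPRs; subsequent
7ec681f3Smrg    * iterations only update the base vertex and, if used, the draw id.
7ec681f3Smrg    */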
62317ec681f3Smrg   vk_foreach_multi_draw(draw, i, minfo, drawCount, stride) {
62327ec681f3Smrg      if (!i)
62337ec681f3Smrg         radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
62347ec681f3Smrg      else
62357ec681f3Smrg         radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
62367ec681f3Smrg
62377ec681f3Smrg      if (!view_mask) {
62387ec681f3Smrg         radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
62397ec681f3Smrg      } else {
62407ec681f3Smrg         u_foreach_bit(view, view_mask) {
62417ec681f3Smrg            radv_emit_view_index(cmd_buffer, view);
62427ec681f3Smrg            radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
62437ec681f3Smrg         }
62447ec681f3Smrg      }
62457ec681f3Smrg      last_start = draw->firstVertex;
62467ec681f3Smrg   }
62477ec681f3Smrg   if (drawCount > 1) {
62487ec681f3Smrg      struct radv_cmd_state *state = &cmd_buffer->state;
62497ec681f3Smrg      state->last_vertex_offset = last_start;
62507ec681f3Smrg      if (uses_drawid)
62517ec681f3Smrg         state->last_drawid = drawCount - 1;
62527ec681f3Smrg   }
62537ec681f3Smrg}
62547ec681f3Smrg
62557ec681f3Smrgstatic void
62567ec681f3Smrgradv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer,
62577ec681f3Smrg                                const struct radv_draw_info *info)
62587ec681f3Smrg{
62597ec681f3Smrg   const struct radv_cmd_state *state = &cmd_buffer->state;
62607ec681f3Smrg   struct radeon_winsys *ws = cmd_buffer->device->ws;
62617ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
62627ec681f3Smrg   const uint64_t va =
62637ec681f3Smrg      radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
62647ec681f3Smrg   const uint64_t count_va = info->count_buffer
62657ec681f3Smrg                                ? radv_buffer_get_va(info->count_buffer->bo) +
62667ec681f3Smrg                                     info->count_buffer->offset + info->count_buffer_offset
62677ec681f3Smrg                                : 0;
62687ec681f3Smrg
62697ec681f3Smrg   radv_cs_add_buffer(ws, cs, info->indirect->bo);
62707ec681f3Smrg
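7ec681f3Smrg   /* SET_BASE with base index 1 sets the base address used by the
7ec681f3Smrg    * indirect draw packets emitted below.
7ec681f3Smrg    */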
62717ec681f3Smrg   radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
62727ec681f3Smrg   radeon_emit(cs, 1);
62737ec681f3Smrg   radeon_emit(cs, va);
62747ec681f3Smrg   radeon_emit(cs, va >> 32);
62757ec681f3Smrg
62767ec681f3Smrg   if (info->count_buffer) {
62777ec681f3Smrg      radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
62787ec681f3Smrg   }
62797ec681f3Smrg
62807ec681f3Smrg   if (!state->subpass->view_mask) {
62817ec681f3Smrg      radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
62827ec681f3Smrg                                        info->stride);
62837ec681f3Smrg   } else {
62847ec681f3Smrg      u_foreach_bit(i, state->subpass->view_mask) {
62867ec681f3Smrg         radv_emit_view_index(cmd_buffer, i);
62877ec681f3Smrg
62887ec681f3Smrg         radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va,
62897ec681f3Smrg                                           info->stride);
62907ec681f3Smrg      }
62917ec681f3Smrg   }
62927ec681f3Smrg}
62937ec681f3Smrg
62947ec681f3Smrg/*
62957ec681f3Smrg * Vega and Raven have a bug which triggers if there are multiple hardware
62967ec681f3Smrg * contexts active at the same time with different scissor values.
62977ec681f3Smrg *
62987ec681f3Smrg * There are two possible workarounds:
62997ec681f3Smrg * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
63007ec681f3Smrg *    there is only ever one active set of scissor values at the same time.
63017ec681f3Smrg *
63027ec681f3Smrg * 2) Whenever the hardware switches contexts we have to set the scissor
63037ec681f3Smrg *    registers again even if it is a no-op. That way the new context gets
63047ec681f3Smrg *    the correct scissor values.
63057ec681f3Smrg *
63067ec681f3Smrg * This implements option 2. radv_need_late_scissor_emission needs to
63077ec681f3Smrg * return true on affected HW if radv_emit_all_graphics_states sets
63087ec681f3Smrg * any context registers.
63097ec681f3Smrg */
63107ec681f3Smrgstatic bool
63117ec681f3Smrgradv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
63127ec681f3Smrg                                const struct radv_draw_info *info)
63137ec681f3Smrg{
63147ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
63157ec681f3Smrg
63167ec681f3Smrg   if (!cmd_buffer->device->physical_device->rad_info.has_gfx9_scissor_bug)
63177ec681f3Smrg      return false;
63187ec681f3Smrg
63197ec681f3Smrg   if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
63207ec681f3Smrg      return true;
63217ec681f3Smrg
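7ec681f3Smrg   /* Consider all non-dynamic dirty bits, plus only those dynamic states
7ec681f3Smrg    * that the bound pipeline actually uses.
7ec681f3Smrg    */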
63227ec681f3Smrg   uint64_t used_states =
63237ec681f3Smrg      cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
63247ec681f3Smrg
63257ec681f3Smrg   /* Index, vertex and streamout buffers don't change context regs, and
63267ec681f3Smrg    * pipeline is already handled.
63277ec681f3Smrg    */
63287ec681f3Smrg   used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER |
63297ec681f3Smrg                    RADV_CMD_DIRTY_DYNAMIC_VERTEX_INPUT | RADV_CMD_DIRTY_STREAMOUT_BUFFER |
63307ec681f3Smrg                    RADV_CMD_DIRTY_PIPELINE);
63317ec681f3Smrg
63327ec681f3Smrg   if (cmd_buffer->state.dirty & used_states)
63337ec681f3Smrg      return true;
63347ec681f3Smrg
63357ec681f3Smrg   uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
63367ec681f3Smrg
63377ec681f3Smrg   if (info->indexed && state->dynamic.primitive_restart_enable &&
63387ec681f3Smrg       primitive_reset_index != state->last_primitive_reset_index)
63397ec681f3Smrg      return true;
63407ec681f3Smrg
63417ec681f3Smrg   return false;
63427ec681f3Smrg}
63437ec681f3Smrg
63447ec681f3Smrgenum {
63457ec681f3Smrg   ngg_cull_none = 0,
63467ec681f3Smrg   ngg_cull_front_face = 1,
63477ec681f3Smrg   ngg_cull_back_face = 2,
63487ec681f3Smrg   ngg_cull_face_is_ccw = 4,
63497ec681f3Smrg   ngg_cull_small_primitives = 8,
63507ec681f3Smrg};
63517ec681f3Smrg
63527ec681f3SmrgALWAYS_INLINE static bool
63537ec681f3Smrgradv_skip_ngg_culling(bool has_tess, const unsigned vtx_cnt,
63547ec681f3Smrg                      bool indirect)
63557ec681f3Smrg{
63567ec681f3Smrg   /* If we have to draw only a few vertices, we get better latency if
63577ec681f3Smrg    * we disable NGG culling.
63587ec681f3Smrg    *
63597ec681f3Smrg    * When tessellation is used, what matters is the number of tessellated
63607ec681f3Smrg    * vertices, so let's always assume it's not a small draw.
63617ec681f3Smrg    */
63627ec681f3Smrg   return !has_tess && !indirect && vtx_cnt < 128;
63637ec681f3Smrg}
63647ec681f3Smrg
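7ec681f3Smrg/* Returns the NGG culling settings as a bitfield: the low bits hold the
7ec681f3Smrg * ngg_cull_* flags above and bits 24+ hold the small primitive precision
7ec681f3Smrg * exponent, matching the layout the culling code reads from its SGPR.
7ec681f3Smrg */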
63657ec681f3SmrgALWAYS_INLINE static uint32_t
63667ec681f3Smrgradv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
63677ec681f3Smrg{
63687ec681f3Smrg   const struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
63697ec681f3Smrg   const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
63707ec681f3Smrg
63717ec681f3Smrg   /* Cull every triangle when rasterizer discard is enabled. */
63727ec681f3Smrg   if (d->rasterizer_discard_enable ||
63737ec681f3Smrg       G_028810_DX_RASTERIZATION_KILL(cmd_buffer->state.pipeline->graphics.pa_cl_clip_cntl))
63747ec681f3Smrg      return ngg_cull_front_face | ngg_cull_back_face;
63757ec681f3Smrg
63767ec681f3Smrg   uint32_t pa_su_sc_mode_cntl = cmd_buffer->state.pipeline->graphics.pa_su_sc_mode_cntl;
63777ec681f3Smrg   uint32_t nggc_settings = ngg_cull_none;
63787ec681f3Smrg
63797ec681f3Smrg   /* The culling code needs to know whether the front face is CW or CCW. */
63807ec681f3Smrg   bool ccw = (pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_FRONT_FACE)
63817ec681f3Smrg              ? d->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE
63827ec681f3Smrg              : G_028814_FACE(pa_su_sc_mode_cntl) == 0;
63837ec681f3Smrg
63847ec681f3Smrg   /* Take inverted viewport into account. */
63857ec681f3Smrg   ccw ^= vp_y_inverted;
63867ec681f3Smrg
63877ec681f3Smrg   if (ccw)
63887ec681f3Smrg      nggc_settings |= ngg_cull_face_is_ccw;
63897ec681f3Smrg
63907ec681f3Smrg   /* Face culling settings. */
63917ec681f3Smrg   if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
63927ec681f3Smrg         ? (d->cull_mode & VK_CULL_MODE_FRONT_BIT)
63937ec681f3Smrg         : G_028814_CULL_FRONT(pa_su_sc_mode_cntl))
63947ec681f3Smrg      nggc_settings |= ngg_cull_front_face;
63957ec681f3Smrg   if ((pipeline->graphics.needed_dynamic_state & RADV_DYNAMIC_CULL_MODE)
63967ec681f3Smrg         ? (d->cull_mode & VK_CULL_MODE_BACK_BIT)
63977ec681f3Smrg         : G_028814_CULL_BACK(pa_su_sc_mode_cntl))
63987ec681f3Smrg      nggc_settings |= ngg_cull_back_face;
63997ec681f3Smrg
64007ec681f3Smrg   /* Small primitive culling is only valid when conservative overestimation is not used. */
64017ec681f3Smrg   if (!pipeline->graphics.uses_conservative_overestimate) {
64027ec681f3Smrg      nggc_settings |= ngg_cull_small_primitives;
64037ec681f3Smrg
64047ec681f3Smrg      /* small_prim_precision = num_samples / subpixel_quant, where the subpixel
64057ec681f3Smrg       * quantization here is 2^8 = 256. num_samples is also always a power of
64067ec681f3Smrg       * two, so the small prim precision can only be a power of two between
64077ec681f3Smrg       * 2^-5 and 2^-8, therefore it's enough to remember the exponent.
64087ec681f3Smrg       */
64097ec681f3Smrg      unsigned subpixel_bits = 256;
7ec681f3Smrg      int32_t small_prim_precision_log2 =
7ec681f3Smrg         util_logbase2(pipeline->graphics.ms.num_samples) - util_logbase2(subpixel_bits);
64107ec681f3Smrg      nggc_settings |= ((uint32_t) small_prim_precision_log2 << 24u);
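7ec681f3Smrg      /* e.g. num_samples = 4: log2(4) - log2(256) = -6, i.e. a precision of 2^-6. */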
64117ec681f3Smrg   }
64127ec681f3Smrg
64137ec681f3Smrg   return nggc_settings;
64147ec681f3Smrg}
64157ec681f3Smrg
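7ec681f3Smrg/* Emits the NGG culling state: the culling settings SGPR, the viewport
7ec681f3Smrg * scale/translate SGPRs used by the culling code, and an adjusted RSRC2
7ec681f3Smrg * (LDS size) when culling is toggled on or off.
7ec681f3Smrg */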
64167ec681f3Smrgstatic void
64177ec681f3Smrgradv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
64187ec681f3Smrg{
64197ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
64207ec681f3Smrg   const unsigned stage = pipeline->graphics.last_vgt_api_stage;
64217ec681f3Smrg   const bool nggc_supported = pipeline->graphics.has_ngg_culling;
64227ec681f3Smrg
64237ec681f3Smrg   if (!nggc_supported && !cmd_buffer->state.last_nggc_settings) {
64247ec681f3Smrg      /* Current shader doesn't support culling and culling was already disabled:
64257ec681f3Smrg       * No further steps needed, just remember the SGPR's location is not set.
64267ec681f3Smrg       */
64277ec681f3Smrg      cmd_buffer->state.last_nggc_settings_sgpr_idx = -1;
64287ec681f3Smrg      return;
64297ec681f3Smrg   }
64307ec681f3Smrg
64317ec681f3Smrg   /* Check dirty flags:
64327ec681f3Smrg    * - Dirty pipeline: SGPR index may have changed (we have to re-emit if changed).
64337ec681f3Smrg    * - Dirty dynamic flags: culling settings may have changed.
64347ec681f3Smrg    */
64357ec681f3Smrg   const bool dirty =
64367ec681f3Smrg      cmd_buffer->state.dirty &
64377ec681f3Smrg      (RADV_CMD_DIRTY_PIPELINE |
64387ec681f3Smrg       RADV_CMD_DIRTY_DYNAMIC_CULL_MODE | RADV_CMD_DIRTY_DYNAMIC_FRONT_FACE |
64397ec681f3Smrg       RADV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT);
64407ec681f3Smrg
64417ec681f3Smrg   /* Check small draw status:
64427ec681f3Smrg    * For small draw calls, we disable culling by setting the SGPR to 0.
64437ec681f3Smrg    */
64447ec681f3Smrg   const bool skip =
64457ec681f3Smrg      radv_skip_ngg_culling(stage == MESA_SHADER_TESS_EVAL, draw_info->count, draw_info->indirect);
64467ec681f3Smrg
64477ec681f3Smrg   /* See if anything changed. */
64487ec681f3Smrg   if (!dirty && skip == cmd_buffer->state.last_nggc_skip)
64497ec681f3Smrg      return;
64507ec681f3Smrg
64517ec681f3Smrg   /* Remember small draw state. */
64527ec681f3Smrg   cmd_buffer->state.last_nggc_skip = skip;
64537ec681f3Smrg   const struct radv_shader_variant *v = pipeline->shaders[stage];
64547ec681f3Smrg   assert(v->info.has_ngg_culling == nggc_supported);
64557ec681f3Smrg
64567ec681f3Smrg   /* Find the user SGPR. */
64577ec681f3Smrg   const uint32_t base_reg = pipeline->user_data_0[stage];
64587ec681f3Smrg   const int8_t nggc_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_CULLING_SETTINGS].sgpr_idx;
64597ec681f3Smrg   assert(!nggc_supported || nggc_sgpr_idx != -1);
64607ec681f3Smrg
64617ec681f3Smrg   /* Get viewport transform. */
64627ec681f3Smrg   float vp_scale[2], vp_translate[2];
64637ec681f3Smrg   memcpy(vp_scale, cmd_buffer->state.dynamic.viewport.xform[0].scale, 2 * sizeof(float));
64647ec681f3Smrg   memcpy(vp_translate, cmd_buffer->state.dynamic.viewport.xform[0].translate, 2 * sizeof(float));
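7ec681f3Smrg   /* This is equivalent to checking vp_scale[1] < 0, i.e. a negative-height
7ec681f3Smrg    * (inverted) viewport.
7ec681f3Smrg    */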
64657ec681f3Smrg   bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
64667ec681f3Smrg
64677ec681f3Smrg   /* Get current culling settings. */
64687ec681f3Smrg   uint32_t nggc_settings = nggc_supported && !skip
64697ec681f3Smrg                            ? radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted)
64707ec681f3Smrg                            : ngg_cull_none;
64717ec681f3Smrg
64727ec681f3Smrg   bool emit_viewport = nggc_settings &&
64737ec681f3Smrg                        (cmd_buffer->state.dirty & RADV_CMD_DIRTY_DYNAMIC_VIEWPORT ||
64747ec681f3Smrg                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx ||
64757ec681f3Smrg                         !cmd_buffer->state.last_nggc_settings);
64767ec681f3Smrg
64777ec681f3Smrg   if (emit_viewport) {
64787ec681f3Smrg      /* Correction for inverted Y */
64797ec681f3Smrg      if (vp_y_inverted) {
64807ec681f3Smrg         vp_scale[1] = -vp_scale[1];
64817ec681f3Smrg         vp_translate[1] = -vp_translate[1];
64827ec681f3Smrg      }
64837ec681f3Smrg
64847ec681f3Smrg      /* Correction for number of samples per pixel. */
64857ec681f3Smrg      for (unsigned i = 0; i < 2; ++i) {
64867ec681f3Smrg         vp_scale[i] *= (float) pipeline->graphics.ms.num_samples;
64877ec681f3Smrg         vp_translate[i] *= (float) pipeline->graphics.ms.num_samples;
64887ec681f3Smrg      }
64897ec681f3Smrg
64907ec681f3Smrg      uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
64917ec681f3Smrg      const int8_t vp_sgpr_idx = v->info.user_sgprs_locs.shader_data[AC_UD_NGG_VIEWPORT].sgpr_idx;
64927ec681f3Smrg      assert(vp_sgpr_idx != -1);
64937ec681f3Smrg      radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + vp_sgpr_idx * 4, 4);
64947ec681f3Smrg      radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
64957ec681f3Smrg   }
64967ec681f3Smrg
64977ec681f3Smrg   bool emit_settings = nggc_supported &&
64987ec681f3Smrg                        (cmd_buffer->state.last_nggc_settings != nggc_settings ||
64997ec681f3Smrg                         cmd_buffer->state.last_nggc_settings_sgpr_idx != nggc_sgpr_idx);
65007ec681f3Smrg
65017ec681f3Smrg   /* This needs to be emitted when culling is turned on
65027ec681f3Smrg    * and when it's already on but some settings change.
65037ec681f3Smrg    */
65047ec681f3Smrg   if (emit_settings) {
65057ec681f3Smrg      assert(nggc_sgpr_idx >= 0);
65067ec681f3Smrg      radeon_set_sh_reg(cmd_buffer->cs, base_reg + nggc_sgpr_idx * 4, nggc_settings);
65077ec681f3Smrg   }
65087ec681f3Smrg
65097ec681f3Smrg   /* These only need to be emitted when culling is turned on or off,
65107ec681f3Smrg    * but not when it stays on and just some settings change.
65117ec681f3Smrg    */
65127ec681f3Smrg   if (!!cmd_buffer->state.last_nggc_settings != !!nggc_settings) {
65137ec681f3Smrg      uint32_t rsrc2 = v->config.rsrc2;
65147ec681f3Smrg
65157ec681f3Smrg      if (!nggc_settings) {
65167ec681f3Smrg         /* Allocate less LDS when culling is disabled. (But GS always needs it.) */
65177ec681f3Smrg         if (stage != MESA_SHADER_GEOMETRY)
65187ec681f3Smrg            rsrc2 = (rsrc2 & C_00B22C_LDS_SIZE) | S_00B22C_LDS_SIZE(v->info.num_lds_blocks_when_not_culling);
65197ec681f3Smrg      }
65207ec681f3Smrg
65217ec681f3Smrg      /* When the pipeline is dirty and not yet emitted, don't write it here
65227ec681f3Smrg       * because radv_emit_graphics_pipeline will overwrite this register.
65237ec681f3Smrg       */
65247ec681f3Smrg      if (!(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
65257ec681f3Smrg          cmd_buffer->state.emitted_pipeline == pipeline) {
65267ec681f3Smrg         radeon_set_sh_reg(cmd_buffer->cs, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, rsrc2);
65277ec681f3Smrg      }
65287ec681f3Smrg   }
65297ec681f3Smrg
65307ec681f3Smrg   cmd_buffer->state.last_nggc_settings = nggc_settings;
65317ec681f3Smrg   cmd_buffer->state.last_nggc_settings_sgpr_idx = nggc_sgpr_idx;
65327ec681f3Smrg}
65337ec681f3Smrg
65347ec681f3Smrgstatic void
65357ec681f3Smrgradv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
65367ec681f3Smrg                              bool pipeline_is_dirty)
65377ec681f3Smrg{
65387ec681f3Smrg   bool late_scissor_emission;
65397ec681f3Smrg
65407ec681f3Smrg   if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
65417ec681f3Smrg       cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
65427ec681f3Smrg      radv_emit_rbplus_state(cmd_buffer);
65437ec681f3Smrg
65447ec681f3Smrg   if (cmd_buffer->device->physical_device->use_ngg_culling &&
65457ec681f3Smrg       cmd_buffer->state.pipeline->graphics.is_ngg)
65467ec681f3Smrg      radv_emit_ngg_culling_state(cmd_buffer, info);
65477ec681f3Smrg
65487ec681f3Smrg   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
65497ec681f3Smrg      radv_emit_graphics_pipeline(cmd_buffer);
65507ec681f3Smrg
65517ec681f3Smrg   /* This must happen before cmd_buffer->state.dirty is cleared
65527ec681f3Smrg    * (excluding RADV_CMD_DIRTY_PIPELINE) and after
65537ec681f3Smrg    * cmd_buffer->state.context_roll_without_scissor_emitted is set. */
65547ec681f3Smrg   late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info);
65557ec681f3Smrg
65567ec681f3Smrg   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
65577ec681f3Smrg      radv_emit_framebuffer_state(cmd_buffer);
65587ec681f3Smrg
65597ec681f3Smrg   if (info->indexed) {
65607ec681f3Smrg      if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
65617ec681f3Smrg         radv_emit_index_buffer(cmd_buffer, info->indirect);
65627ec681f3Smrg   } else {
65637ec681f3Smrg      /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
65647ec681f3Smrg       * so the state must be re-emitted before the next indexed
65657ec681f3Smrg       * draw.
65667ec681f3Smrg       */
65677ec681f3Smrg      if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
65687ec681f3Smrg         cmd_buffer->state.last_index_type = -1;
65697ec681f3Smrg         cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
65707ec681f3Smrg      }
65717ec681f3Smrg   }
65727ec681f3Smrg
65737ec681f3Smrg   radv_cmd_buffer_flush_dynamic_state(cmd_buffer, pipeline_is_dirty);
65747ec681f3Smrg
65757ec681f3Smrg   radv_emit_draw_registers(cmd_buffer, info);
65767ec681f3Smrg
65777ec681f3Smrg   if (late_scissor_emission)
65787ec681f3Smrg      radv_emit_scissor(cmd_buffer);
65797ec681f3Smrg}
65807ec681f3Smrg
65817ec681f3Smrg/* MUST inline this function to avoid massive perf loss in drawoverhead */
65827ec681f3SmrgALWAYS_INLINE static bool
65837ec681f3Smrgradv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount)
65847ec681f3Smrg{
65857ec681f3Smrg   const bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
65867ec681f3Smrg   const bool pipeline_is_dirty = (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
65877ec681f3Smrg                                  cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
65887ec681f3Smrg
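7ec681f3Smrg   /* Reserve worst-case command-stream space up front: 4096 dwords of state
7ec681f3Smrg    * emission plus headroom for each additional draw in a multi-draw.
7ec681f3Smrg    */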
65897ec681f3Smrg   ASSERTED const unsigned cdw_max =
65907ec681f3Smrg      radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
65917ec681f3Smrg
65927ec681f3Smrg   if (likely(!info->indirect)) {
65937ec681f3Smrg      /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
65947ec681f3Smrg       * no workaround for indirect draws, but we can at least skip
65957ec681f3Smrg       * direct draws.
65967ec681f3Smrg       */
65977ec681f3Smrg      if (unlikely(!info->instance_count))
65987ec681f3Smrg         return false;
65997ec681f3Smrg
66007ec681f3Smrg      /* Handle count == 0. */
66017ec681f3Smrg      if (unlikely(!info->count && !info->strmout_buffer))
66027ec681f3Smrg         return false;
66037ec681f3Smrg   }
66047ec681f3Smrg
66057ec681f3Smrg   /* Need to apply this workaround early as it can set flush flags. */
66067ec681f3Smrg   if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
66077ec681f3Smrg      radv_emit_fb_mip_change_flush(cmd_buffer);
66087ec681f3Smrg
66097ec681f3Smrg   /* Use optimal packet order based on whether we need to sync the
66107ec681f3Smrg    * pipeline.
66117ec681f3Smrg    */
66127ec681f3Smrg   if (cmd_buffer->state.flush_bits &
66137ec681f3Smrg       (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
66147ec681f3Smrg        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
66157ec681f3Smrg      /* If we have to wait for idle, set all states first, so that
66167ec681f3Smrg       * all SET packets are processed in parallel with previous draw
66177ec681f3Smrg       * calls. Then upload descriptors, set shader pointers, and
66187ec681f3Smrg       * draw, and prefetch at the end. This ensures that the time
66197ec681f3Smrg       * the CUs are idle is very short. (there are only SET_SH
66207ec681f3Smrg       * packets between the wait and the draw)
66217ec681f3Smrg       */
66227ec681f3Smrg      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
66237ec681f3Smrg      si_emit_cache_flush(cmd_buffer);
66247ec681f3Smrg      /* <-- CUs are idle here --> */
66257ec681f3Smrg
66267ec681f3Smrg      radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
66277ec681f3Smrg   } else {
66287ec681f3Smrg      /* If we don't wait for idle, start prefetches first, then set
66297ec681f3Smrg       * states, and draw at the end.
66307ec681f3Smrg       */
66317ec681f3Smrg      si_emit_cache_flush(cmd_buffer);
66327ec681f3Smrg
66337ec681f3Smrg      if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
66347ec681f3Smrg         /* Only prefetch the vertex shader and VBO descriptors
66357ec681f3Smrg          * in order to start the draw as soon as possible.
66367ec681f3Smrg          */
66377ec681f3Smrg         radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, true);
66387ec681f3Smrg      }
66397ec681f3Smrg
66407ec681f3Smrg      radv_upload_graphics_shader_descriptors(cmd_buffer, pipeline_is_dirty);
66417ec681f3Smrg
66427ec681f3Smrg      radv_emit_all_graphics_states(cmd_buffer, info, pipeline_is_dirty);
66437ec681f3Smrg   }
66447ec681f3Smrg
66457ec681f3Smrg   radv_describe_draw(cmd_buffer);
66467ec681f3Smrg   if (likely(!info->indirect)) {
66477ec681f3Smrg      struct radv_cmd_state *state = &cmd_buffer->state;
66487ec681f3Smrg      struct radeon_cmdbuf *cs = cmd_buffer->cs;
66497ec681f3Smrg      assert(state->pipeline->graphics.vtx_base_sgpr);
66507ec681f3Smrg      if (state->last_num_instances != info->instance_count) {
66517ec681f3Smrg         radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
66527ec681f3Smrg         radeon_emit(cs, info->instance_count);
66537ec681f3Smrg         state->last_num_instances = info->instance_count;
66547ec681f3Smrg      }
66557ec681f3Smrg   }
66567ec681f3Smrg   assert(cmd_buffer->cs->cdw <= cdw_max);
66577ec681f3Smrg
66587ec681f3Smrg   return true;
66597ec681f3Smrg}
66607ec681f3Smrg
66617ec681f3Smrgstatic void
66627ec681f3Smrgradv_after_draw(struct radv_cmd_buffer *cmd_buffer)
66637ec681f3Smrg{
66647ec681f3Smrg   const struct radeon_info *rad_info = &cmd_buffer->device->physical_device->rad_info;
66657ec681f3Smrg   bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
66667ec681f3Smrg   /* Start prefetches after the draw has been started. Both will
66677ec681f3Smrg    * run in parallel, but starting the draw first is more
66687ec681f3Smrg    * important.
66697ec681f3Smrg    */
66707ec681f3Smrg   if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
66717ec681f3Smrg      radv_emit_prefetch_L2(cmd_buffer, cmd_buffer->state.pipeline, false);
66727ec681f3Smrg   }
66737ec681f3Smrg
66747ec681f3Smrg   /* Workaround for a VGT hang when streamout is enabled.
66757ec681f3Smrg    * It must be done after drawing.
66767ec681f3Smrg    */
66777ec681f3Smrg   if (cmd_buffer->state.streamout.streamout_enabled &&
66787ec681f3Smrg       (rad_info->family == CHIP_HAWAII || rad_info->family == CHIP_TONGA ||
66797ec681f3Smrg        rad_info->family == CHIP_FIJI)) {
66807ec681f3Smrg      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
66817ec681f3Smrg   }
66827ec681f3Smrg
66837ec681f3Smrg   radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH);
66847ec681f3Smrg}
66857ec681f3Smrg
66867ec681f3Smrgvoid
66877ec681f3Smrgradv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
66887ec681f3Smrg             uint32_t firstVertex, uint32_t firstInstance)
66897ec681f3Smrg{
66907ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
66917ec681f3Smrg   struct radv_draw_info info;
66927ec681f3Smrg
66937ec681f3Smrg   info.count = vertexCount;
66947ec681f3Smrg   info.instance_count = instanceCount;
66957ec681f3Smrg   info.first_instance = firstInstance;
66967ec681f3Smrg   info.strmout_buffer = NULL;
66977ec681f3Smrg   info.indirect = NULL;
66987ec681f3Smrg   info.indexed = false;
66997ec681f3Smrg
67007ec681f3Smrg   if (!radv_before_draw(cmd_buffer, &info, 1))
67017ec681f3Smrg      return;
67027ec681f3Smrg   const VkMultiDrawInfoEXT minfo = { firstVertex, vertexCount };
67037ec681f3Smrg   radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
67047ec681f3Smrg   radv_after_draw(cmd_buffer);
67057ec681f3Smrg}
67067ec681f3Smrg
67077ec681f3Smrgvoid
67087ec681f3Smrgradv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
67097ec681f3Smrg                     uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
67107ec681f3Smrg{
67117ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
67127ec681f3Smrg   struct radv_draw_info info;
67137ec681f3Smrg
67147ec681f3Smrg   if (!drawCount)
67157ec681f3Smrg      return;
67167ec681f3Smrg
67177ec681f3Smrg   info.count = pVertexInfo->vertexCount;
67187ec681f3Smrg   info.instance_count = instanceCount;
67197ec681f3Smrg   info.first_instance = firstInstance;
67207ec681f3Smrg   info.strmout_buffer = NULL;
67217ec681f3Smrg   info.indirect = NULL;
67227ec681f3Smrg   info.indexed = false;
67237ec681f3Smrg
67247ec681f3Smrg   if (!radv_before_draw(cmd_buffer, &info, drawCount))
67257ec681f3Smrg      return;
67267ec681f3Smrg   radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
67277ec681f3Smrg   radv_after_draw(cmd_buffer);
67287ec681f3Smrg}
67297ec681f3Smrg
67307ec681f3Smrgvoid
67317ec681f3Smrgradv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
67327ec681f3Smrg                    uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance)
67337ec681f3Smrg{
67347ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
67357ec681f3Smrg   struct radv_draw_info info;
67367ec681f3Smrg
67377ec681f3Smrg   info.indexed = true;
67387ec681f3Smrg   info.count = indexCount;
67397ec681f3Smrg   info.instance_count = instanceCount;
67407ec681f3Smrg   info.first_instance = firstInstance;
67417ec681f3Smrg   info.strmout_buffer = NULL;
67427ec681f3Smrg   info.indirect = NULL;
67437ec681f3Smrg
67447ec681f3Smrg   if (!radv_before_draw(cmd_buffer, &info, 1))
67457ec681f3Smrg      return;
67467ec681f3Smrg   const VkMultiDrawIndexedInfoEXT minfo = { firstIndex, indexCount, vertexOffset };
67477ec681f3Smrg   radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
67487ec681f3Smrg   radv_after_draw(cmd_buffer);
67497ec681f3Smrg}
67507ec681f3Smrg
67517ec681f3Smrgvoid
7ec681f3Smrgradv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
67527ec681f3Smrg                            const VkMultiDrawIndexedInfoEXT *pIndexInfo, uint32_t instanceCount,
7ec681f3Smrg                            uint32_t firstInstance, uint32_t stride, const int32_t *pVertexOffset)
67537ec681f3Smrg{
67547ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
67557ec681f3Smrg   struct radv_draw_info info;
67567ec681f3Smrg
67577ec681f3Smrg   if (!drawCount)
67587ec681f3Smrg      return;
67597ec681f3Smrg
67607ec681f3Smrg   const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
67617ec681f3Smrg   info.indexed = true;
67627ec681f3Smrg   info.count = minfo->indexCount;
67637ec681f3Smrg   info.instance_count = instanceCount;
67647ec681f3Smrg   info.first_instance = firstInstance;
67657ec681f3Smrg   info.strmout_buffer = NULL;
67667ec681f3Smrg   info.indirect = NULL;
67677ec681f3Smrg
67687ec681f3Smrg   if (!radv_before_draw(cmd_buffer, &info, drawCount))
67697ec681f3Smrg      return;
67707ec681f3Smrg   radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
67717ec681f3Smrg   radv_after_draw(cmd_buffer);
67727ec681f3Smrg}
67737ec681f3Smrg
67747ec681f3Smrgvoid
67757ec681f3Smrgradv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
67767ec681f3Smrg                     uint32_t drawCount, uint32_t stride)
67777ec681f3Smrg{
67787ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
67797ec681f3Smrg   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
67807ec681f3Smrg   struct radv_draw_info info;
67817ec681f3Smrg
67827ec681f3Smrg   info.count = drawCount;
67837ec681f3Smrg   info.indirect = buffer;
67847ec681f3Smrg   info.indirect_offset = offset;
67857ec681f3Smrg   info.stride = stride;
67867ec681f3Smrg   info.strmout_buffer = NULL;
67877ec681f3Smrg   info.count_buffer = NULL;
67887ec681f3Smrg   info.indexed = false;
67897ec681f3Smrg   info.instance_count = 0;
67907ec681f3Smrg
67917ec681f3Smrg   if (!radv_before_draw(cmd_buffer, &info, 1))
67927ec681f3Smrg      return;
67937ec681f3Smrg   radv_emit_indirect_draw_packets(cmd_buffer, &info);
67947ec681f3Smrg   radv_after_draw(cmd_buffer);
67957ec681f3Smrg}
67967ec681f3Smrg
67977ec681f3Smrgvoid
67987ec681f3Smrgradv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
67997ec681f3Smrg                            uint32_t drawCount, uint32_t stride)
68007ec681f3Smrg{
68017ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
68027ec681f3Smrg   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
68037ec681f3Smrg   struct radv_draw_info info;
68047ec681f3Smrg
68057ec681f3Smrg   info.indexed = true;
68067ec681f3Smrg   info.count = drawCount;
68077ec681f3Smrg   info.indirect = buffer;
68087ec681f3Smrg   info.indirect_offset = offset;
68097ec681f3Smrg   info.stride = stride;
68107ec681f3Smrg   info.count_buffer = NULL;
68117ec681f3Smrg   info.strmout_buffer = NULL;
68127ec681f3Smrg   info.instance_count = 0;
68137ec681f3Smrg
68147ec681f3Smrg   if (!radv_before_draw(cmd_buffer, &info, 1))
68157ec681f3Smrg      return;
68167ec681f3Smrg   radv_emit_indirect_draw_packets(cmd_buffer, &info);
68177ec681f3Smrg   radv_after_draw(cmd_buffer);
68187ec681f3Smrg}
68197ec681f3Smrg
68207ec681f3Smrgvoid
68217ec681f3Smrgradv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
68227ec681f3Smrg                          VkBuffer _countBuffer, VkDeviceSize countBufferOffset,
68237ec681f3Smrg                          uint32_t maxDrawCount, uint32_t stride)
68247ec681f3Smrg{
68257ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
68267ec681f3Smrg   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
68277ec681f3Smrg   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
68287ec681f3Smrg   struct radv_draw_info info;
68297ec681f3Smrg
68307ec681f3Smrg   info.count = maxDrawCount;
68317ec681f3Smrg   info.indirect = buffer;
68327ec681f3Smrg   info.indirect_offset = offset;
68337ec681f3Smrg   info.count_buffer = count_buffer;
68347ec681f3Smrg   info.count_buffer_offset = countBufferOffset;
68357ec681f3Smrg   info.stride = stride;
68367ec681f3Smrg   info.strmout_buffer = NULL;
68377ec681f3Smrg   info.indexed = false;
68387ec681f3Smrg   info.instance_count = 0;
68397ec681f3Smrg
68407ec681f3Smrg   if (!radv_before_draw(cmd_buffer, &info, 1))
68417ec681f3Smrg      return;
68427ec681f3Smrg   radv_emit_indirect_draw_packets(cmd_buffer, &info);
68437ec681f3Smrg   radv_after_draw(cmd_buffer);
68447ec681f3Smrg}
68457ec681f3Smrg
68467ec681f3Smrgvoid
68477ec681f3Smrgradv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
68487ec681f3Smrg                                 VkDeviceSize offset, VkBuffer _countBuffer,
68497ec681f3Smrg                                 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
68507ec681f3Smrg                                 uint32_t stride)
68517ec681f3Smrg{
68527ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
68537ec681f3Smrg   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
68547ec681f3Smrg   RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
68557ec681f3Smrg   struct radv_draw_info info;
68567ec681f3Smrg
68577ec681f3Smrg   info.indexed = true;
68587ec681f3Smrg   info.count = maxDrawCount;
68597ec681f3Smrg   info.indirect = buffer;
68607ec681f3Smrg   info.indirect_offset = offset;
68617ec681f3Smrg   info.count_buffer = count_buffer;
68627ec681f3Smrg   info.count_buffer_offset = countBufferOffset;
68637ec681f3Smrg   info.stride = stride;
68647ec681f3Smrg   info.strmout_buffer = NULL;
68657ec681f3Smrg   info.instance_count = 0;
68667ec681f3Smrg
68677ec681f3Smrg   if (!radv_before_draw(cmd_buffer, &info, 1))
68687ec681f3Smrg      return;
68697ec681f3Smrg   radv_emit_indirect_draw_packets(cmd_buffer, &info);
68707ec681f3Smrg   radv_after_draw(cmd_buffer);
68717ec681f3Smrg}
68727ec681f3Smrg
68737ec681f3Smrgstruct radv_dispatch_info {
68747ec681f3Smrg   /**
68757ec681f3Smrg    * Determine the layout of the grid (in block units) to be used.
68767ec681f3Smrg    */
68777ec681f3Smrg   uint32_t blocks[3];
68787ec681f3Smrg
68797ec681f3Smrg   /**
68807ec681f3Smrg    * A starting offset for the grid. Even if unaligned is set, the offset
68817ec681f3Smrg    * must still be aligned to the shader's block size.
68827ec681f3Smrg    */
68837ec681f3Smrg   uint32_t offsets[3];
68847ec681f3Smrg   /**
68857ec681f3Smrg    * Whether it's an unaligned compute dispatch.
68867ec681f3Smrg    */
68877ec681f3Smrg   bool unaligned;
68887ec681f3Smrg
68897ec681f3Smrg   /**
68907ec681f3Smrg    * Indirect compute parameters resource.
68917ec681f3Smrg    */
68927ec681f3Smrg   struct radeon_winsys_bo *indirect;
68937ec681f3Smrg   uint64_t va;
68947ec681f3Smrg};
68957ec681f3Smrg
68967ec681f3Smrgstatic void
68977ec681f3Smrgradv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline,
68987ec681f3Smrg                           const struct radv_dispatch_info *info)
68997ec681f3Smrg{
69007ec681f3Smrg   struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
69017ec681f3Smrg   unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
69027ec681f3Smrg   struct radeon_winsys *ws = cmd_buffer->device->ws;
69037ec681f3Smrg   bool predicating = cmd_buffer->state.predicating;
69047ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
69057ec681f3Smrg   struct radv_userdata_info *loc;
69067ec681f3Smrg
69077ec681f3Smrg   radv_describe_dispatch(cmd_buffer, info->blocks[0], info->blocks[1], info->blocks[2]);
69087ec681f3Smrg
69097ec681f3Smrg   loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_GRID_SIZE);
69107ec681f3Smrg
69117ec681f3Smrg   ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 25);
69127ec681f3Smrg
69137ec681f3Smrg   if (compute_shader->info.wave_size == 32) {
69147ec681f3Smrg      assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
69157ec681f3Smrg      dispatch_initiator |= S_00B800_CS_W32_EN(1);
69167ec681f3Smrg   }
69177ec681f3Smrg
69187ec681f3Smrg   if (info->indirect) {
69197ec681f3Smrg      radv_cs_add_buffer(ws, cs, info->indirect);
69207ec681f3Smrg
69217ec681f3Smrg      if (loc->sgpr_idx != -1) {
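7ec681f3Smrg         /* Copy the three grid dimensions from the indirect buffer into the
7ec681f3Smrg          * grid-size user SGPRs so the shader can read them.
7ec681f3Smrg          */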
69227ec681f3Smrg         for (unsigned i = 0; i < 3; ++i) {
69237ec681f3Smrg            radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
69247ec681f3Smrg            radeon_emit(cs,
69257ec681f3Smrg                        COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG));
69267ec681f3Smrg            radeon_emit(cs, (info->va + 4 * i));
69277ec681f3Smrg            radeon_emit(cs, (info->va + 4 * i) >> 32);
69287ec681f3Smrg            radeon_emit(cs, ((R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4) >> 2) + i);
69297ec681f3Smrg            radeon_emit(cs, 0);
69307ec681f3Smrg         }
69317ec681f3Smrg      }
69327ec681f3Smrg
69337ec681f3Smrg      if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
69347ec681f3Smrg         radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) | PKT3_SHADER_TYPE_S(1));
69357ec681f3Smrg         radeon_emit(cs, info->va);
69367ec681f3Smrg         radeon_emit(cs, info->va >> 32);
69377ec681f3Smrg         radeon_emit(cs, dispatch_initiator);
69387ec681f3Smrg      } else {
69397ec681f3Smrg         radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
69407ec681f3Smrg         radeon_emit(cs, 1);
69417ec681f3Smrg         radeon_emit(cs, info->va);
69427ec681f3Smrg         radeon_emit(cs, info->va >> 32);
69437ec681f3Smrg
69447ec681f3Smrg         radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
69457ec681f3Smrg         radeon_emit(cs, 0);
69467ec681f3Smrg         radeon_emit(cs, dispatch_initiator);
69477ec681f3Smrg      }
69487ec681f3Smrg   } else {
69497ec681f3Smrg      unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
69507ec681f3Smrg      unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
69517ec681f3Smrg
69527ec681f3Smrg      if (info->unaligned) {
69537ec681f3Smrg         unsigned *cs_block_size = compute_shader->info.cs.block_size;
69547ec681f3Smrg         unsigned remainder[3];
69557ec681f3Smrg
69567ec681f3Smrg         /* If a dimension is already aligned, the partial thread count
69577ec681f3Smrg          * is an entire block, not 0.
69587ec681f3Smrg          */
69597ec681f3Smrg         remainder[0] = blocks[0] + cs_block_size[0] - align_u32_npot(blocks[0], cs_block_size[0]);
69607ec681f3Smrg         remainder[1] = blocks[1] + cs_block_size[1] - align_u32_npot(blocks[1], cs_block_size[1]);
69617ec681f3Smrg         remainder[2] = blocks[2] + cs_block_size[2] - align_u32_npot(blocks[2], cs_block_size[2]);
69627ec681f3Smrg
69637ec681f3Smrg         blocks[0] = round_up_u32(blocks[0], cs_block_size[0]);
69647ec681f3Smrg         blocks[1] = round_up_u32(blocks[1], cs_block_size[1]);
69657ec681f3Smrg         blocks[2] = round_up_u32(blocks[2], cs_block_size[2]);
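7ec681f3Smrg         /* e.g. blocks[0] = 70 with cs_block_size[0] = 64 yields two full
7ec681f3Smrg          * thread groups in X with remainder[0] = 6 partial threads.
7ec681f3Smrg          */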
69667ec681f3Smrg
69677ec681f3Smrg         for (unsigned i = 0; i < 3; ++i) {
69687ec681f3Smrg            assert(offsets[i] % cs_block_size[i] == 0);
69697ec681f3Smrg            offsets[i] /= cs_block_size[i];
69707ec681f3Smrg         }
69717ec681f3Smrg
69727ec681f3Smrg         radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
69737ec681f3Smrg         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[0]) |
69747ec681f3Smrg                            S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
69757ec681f3Smrg         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[1]) |
69767ec681f3Smrg                            S_00B81C_NUM_THREAD_PARTIAL(remainder[1]));
69777ec681f3Smrg         radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(cs_block_size[2]) |
69787ec681f3Smrg                            S_00B81C_NUM_THREAD_PARTIAL(remainder[2]));
69797ec681f3Smrg
69807ec681f3Smrg         dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
69817ec681f3Smrg      }
69827ec681f3Smrg
69837ec681f3Smrg      if (loc->sgpr_idx != -1) {
69847ec681f3Smrg         assert(loc->num_sgprs == 3);
69857ec681f3Smrg
69867ec681f3Smrg         radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
69877ec681f3Smrg         radeon_emit(cs, blocks[0]);
69887ec681f3Smrg         radeon_emit(cs, blocks[1]);
69897ec681f3Smrg         radeon_emit(cs, blocks[2]);
69907ec681f3Smrg      }
69917ec681f3Smrg
69927ec681f3Smrg      if (offsets[0] || offsets[1] || offsets[2]) {
69937ec681f3Smrg         radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
69947ec681f3Smrg         radeon_emit(cs, offsets[0]);
69957ec681f3Smrg         radeon_emit(cs, offsets[1]);
69967ec681f3Smrg         radeon_emit(cs, offsets[2]);
69977ec681f3Smrg
69987ec681f3Smrg         /* The blocks in the packet are not counts but end values. */
69997ec681f3Smrg         for (unsigned i = 0; i < 3; ++i)
70007ec681f3Smrg            blocks[i] += offsets[i];
70017ec681f3Smrg      } else {
70027ec681f3Smrg         dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
70037ec681f3Smrg      }
70047ec681f3Smrg
70057ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
70067ec681f3Smrg      radeon_emit(cs, blocks[0]);
70077ec681f3Smrg      radeon_emit(cs, blocks[1]);
70087ec681f3Smrg      radeon_emit(cs, blocks[2]);
70097ec681f3Smrg      radeon_emit(cs, dispatch_initiator);
70107ec681f3Smrg   }
70117ec681f3Smrg
70127ec681f3Smrg   assert(cmd_buffer->cs->cdw <= cdw_max);
70137ec681f3Smrg}
70147ec681f3Smrg
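7ec681f3Smrg/* Flushes descriptors and push constants for a compute-like bind point;
7ec681f3Smrg * ray tracing pipelines run as compute shaders, but their constants
7ec681f3Smrg * cover all RT stages.
7ec681f3Smrg */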
70157ec681f3Smrgstatic void
70167ec681f3Smrgradv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer,
70177ec681f3Smrg                                       struct radv_pipeline *pipeline,
70187ec681f3Smrg                                       VkPipelineBindPoint bind_point)
70197ec681f3Smrg{
70207ec681f3Smrg   radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, pipeline, bind_point);
70217ec681f3Smrg   radv_flush_constants(cmd_buffer,
70227ec681f3Smrg                        bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
70237ec681f3Smrg                           ? RADV_RT_STAGE_BITS
70247ec681f3Smrg                           : VK_SHADER_STAGE_COMPUTE_BIT,
70257ec681f3Smrg                        pipeline, bind_point);
70267ec681f3Smrg}
70277ec681f3Smrg
70287ec681f3Smrgstatic void
70297ec681f3Smrgradv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
70307ec681f3Smrg              struct radv_pipeline *pipeline, VkPipelineBindPoint bind_point)
70317ec681f3Smrg{
70327ec681f3Smrg   bool has_prefetch = cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7;
70337ec681f3Smrg   bool pipeline_is_dirty = pipeline && pipeline != cmd_buffer->state.emitted_compute_pipeline;
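7ec681f3Smrg   /* On chips with the CS regalloc hang bug, large dispatches are
7ec681f3Smrg    * serialized by forcing a partial flush before (and again after, below).
7ec681f3Smrg    */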
70347ec681f3Smrg   bool cs_regalloc_hang = cmd_buffer->device->physical_device->rad_info.has_cs_regalloc_hang_bug &&
70357ec681f3Smrg                           info->blocks[0] * info->blocks[1] * info->blocks[2] > 256;
70367ec681f3Smrg
70377ec681f3Smrg   if (cs_regalloc_hang)
70387ec681f3Smrg      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
70397ec681f3Smrg                                      RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
70407ec681f3Smrg
70417ec681f3Smrg   if (cmd_buffer->state.flush_bits &
70427ec681f3Smrg       (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
70437ec681f3Smrg        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
70447ec681f3Smrg      /* If we have to wait for idle, set all states first, so that
70457ec681f3Smrg       * all SET packets are processed in parallel with previous draw
70467ec681f3Smrg       * calls. Then upload descriptors, set shader pointers, and
70477ec681f3Smrg       * dispatch, and prefetch at the end. This ensures that the
70487ec681f3Smrg       * time the CUs are idle is very short. (there are only SET_SH
70497ec681f3Smrg       * packets between the wait and the draw)
70507ec681f3Smrg       */
70517ec681f3Smrg      radv_emit_compute_pipeline(cmd_buffer, pipeline);
70527ec681f3Smrg      si_emit_cache_flush(cmd_buffer);
70537ec681f3Smrg      /* <-- CUs are idle here --> */
70547ec681f3Smrg
70557ec681f3Smrg      radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
70567ec681f3Smrg
70577ec681f3Smrg      radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
70587ec681f3Smrg      /* <-- CUs are busy here --> */
70597ec681f3Smrg
70607ec681f3Smrg      /* Start prefetches after the dispatch has been started. Both
70617ec681f3Smrg       * will run in parallel, but starting the dispatch first is
70627ec681f3Smrg       * more important.
70637ec681f3Smrg       */
70647ec681f3Smrg      if (has_prefetch && pipeline_is_dirty) {
70657ec681f3Smrg         radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]);
70667ec681f3Smrg      }
70677ec681f3Smrg   } else {
70687ec681f3Smrg      /* If we don't wait for idle, start prefetches first, then set
70697ec681f3Smrg       * states, and dispatch at the end.
70707ec681f3Smrg       */
70717ec681f3Smrg      si_emit_cache_flush(cmd_buffer);
70727ec681f3Smrg
70737ec681f3Smrg      if (has_prefetch && pipeline_is_dirty) {
70747ec681f3Smrg         radv_emit_shader_prefetch(cmd_buffer, pipeline->shaders[MESA_SHADER_COMPUTE]);
70757ec681f3Smrg      }
70767ec681f3Smrg
70777ec681f3Smrg      radv_upload_compute_shader_descriptors(cmd_buffer, pipeline, bind_point);
70787ec681f3Smrg
70797ec681f3Smrg      radv_emit_compute_pipeline(cmd_buffer, pipeline);
70807ec681f3Smrg      radv_emit_dispatch_packets(cmd_buffer, pipeline, info);
70817ec681f3Smrg   }
70827ec681f3Smrg
70837ec681f3Smrg   if (pipeline_is_dirty) {
70847ec681f3Smrg      /* Raytracing uses compute shaders but has separate bind points and pipelines.
70857ec681f3Smrg       * So if we set compute userdata & shader registers we should dirty the raytracing
70867ec681f3Smrg       * ones and the other way around.
70877ec681f3Smrg       *
70887ec681f3Smrg       * We only need to do this when the pipeline is dirty because when we switch between
70897ec681f3Smrg       * the two we always need to switch pipelines.
70907ec681f3Smrg       */
70917ec681f3Smrg      radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
70927ec681f3Smrg                                                     ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
70937ec681f3Smrg                                                     : VK_PIPELINE_BIND_POINT_COMPUTE);
70947ec681f3Smrg   }
70957ec681f3Smrg
70967ec681f3Smrg   if (cs_regalloc_hang)
70977ec681f3Smrg      cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
70987ec681f3Smrg
70997ec681f3Smrg   radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH);
71007ec681f3Smrg}
71017ec681f3Smrg
71027ec681f3Smrgstatic void
71037ec681f3Smrgradv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
71047ec681f3Smrg{
71057ec681f3Smrg   radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline,
71067ec681f3Smrg                 VK_PIPELINE_BIND_POINT_COMPUTE);
71077ec681f3Smrg}
71087ec681f3Smrg
71097ec681f3Smrgvoid
71107ec681f3Smrgradv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y,
71117ec681f3Smrg                     uint32_t base_z, uint32_t x, uint32_t y, uint32_t z)
71127ec681f3Smrg{
71137ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
71147ec681f3Smrg   struct radv_dispatch_info info = {0};
71157ec681f3Smrg
71167ec681f3Smrg   info.blocks[0] = x;
71177ec681f3Smrg   info.blocks[1] = y;
71187ec681f3Smrg   info.blocks[2] = z;
71197ec681f3Smrg
71207ec681f3Smrg   info.offsets[0] = base_x;
71217ec681f3Smrg   info.offsets[1] = base_y;
71227ec681f3Smrg   info.offsets[2] = base_z;
71237ec681f3Smrg   radv_compute_dispatch(cmd_buffer, &info);
71247ec681f3Smrg}
71257ec681f3Smrg
71267ec681f3Smrgvoid
71277ec681f3Smrgradv_CmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
71287ec681f3Smrg{
71297ec681f3Smrg   radv_CmdDispatchBase(commandBuffer, 0, 0, 0, x, y, z);
71307ec681f3Smrg}
71317ec681f3Smrg
71327ec681f3Smrgvoid
71337ec681f3Smrgradv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
71347ec681f3Smrg{
71357ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
71367ec681f3Smrg   RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
71377ec681f3Smrg   struct radv_dispatch_info info = {0};
71387ec681f3Smrg
71397ec681f3Smrg   info.indirect = buffer->bo;
71407ec681f3Smrg   info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
71417ec681f3Smrg
71427ec681f3Smrg   radv_compute_dispatch(cmd_buffer, &info);
71437ec681f3Smrg}
71447ec681f3Smrg
71457ec681f3Smrgvoid
71467ec681f3Smrgradv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
71477ec681f3Smrg{
71487ec681f3Smrg   struct radv_dispatch_info info = {0};
71497ec681f3Smrg
71507ec681f3Smrg   info.blocks[0] = x;
71517ec681f3Smrg   info.blocks[1] = y;
71527ec681f3Smrg   info.blocks[2] = z;
71537ec681f3Smrg   info.unaligned = 1;
71547ec681f3Smrg
71557ec681f3Smrg   radv_compute_dispatch(cmd_buffer, &info);
71567ec681f3Smrg}
71577ec681f3Smrg
71587ec681f3Smrgvoid
71597ec681f3Smrgradv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
71607ec681f3Smrg{
71617ec681f3Smrg   struct radv_dispatch_info info = {0};
71627ec681f3Smrg
71637ec681f3Smrg   info.indirect = bo;
71647ec681f3Smrg   info.va = va;
71657ec681f3Smrg
71667ec681f3Smrg   radv_compute_dispatch(cmd_buffer, &info);
71677ec681f3Smrg}
71687ec681f3Smrg
71697ec681f3Smrgstatic void
71707ec681f3Smrgradv_rt_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
71717ec681f3Smrg{
71727ec681f3Smrg   radv_dispatch(cmd_buffer, info, cmd_buffer->state.rt_pipeline,
71737ec681f3Smrg                 VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
71747ec681f3Smrg}
71757ec681f3Smrg
71767ec681f3Smrgstatic bool
71777ec681f3Smrgradv_rt_bind_tables(struct radv_cmd_buffer *cmd_buffer,
71787ec681f3Smrg                    const VkStridedDeviceAddressRegionKHR *tables)
71797ec681f3Smrg{
71807ec681f3Smrg   struct radv_pipeline *pipeline = cmd_buffer->state.rt_pipeline;
71817ec681f3Smrg   uint32_t base_reg;
71827ec681f3Smrg   void *ptr;
71837ec681f3Smrg   uint32_t *desc_ptr;
71847ec681f3Smrg   uint32_t offset;
71857ec681f3Smrg
71867ec681f3Smrg   if (!radv_cmd_buffer_upload_alloc(cmd_buffer, 64, &offset, &ptr))
71877ec681f3Smrg      return false;
71887ec681f3Smrg
71897ec681f3Smrg   desc_ptr = ptr;
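7ec681f3Smrg   /* Write the four SBT regions (raygen, miss, hit, callable) as 16-byte
7ec681f3Smrg    * records: address low/high dword, stride, and a zero pad.
7ec681f3Smrg    */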
71907ec681f3Smrg   for (unsigned i = 0; i < 4; ++i, desc_ptr += 4) {
71917ec681f3Smrg      desc_ptr[0] = tables[i].deviceAddress;
71927ec681f3Smrg      desc_ptr[1] = tables[i].deviceAddress >> 32;
71937ec681f3Smrg      desc_ptr[2] = tables[i].stride;
71947ec681f3Smrg      desc_ptr[3] = 0;
71957ec681f3Smrg   }
71967ec681f3Smrg
71977ec681f3Smrg   uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
71987ec681f3Smrg   struct radv_userdata_info *loc =
71997ec681f3Smrg      radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_SBT_DESCRIPTORS);
72007ec681f3Smrg   if (loc->sgpr_idx == -1)
72017ec681f3Smrg      return true;
72027ec681f3Smrg
72037ec681f3Smrg   base_reg = pipeline->user_data_0[MESA_SHADER_COMPUTE];
72047ec681f3Smrg   radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, va,
72057ec681f3Smrg                            false);
72067ec681f3Smrg   return true;
72077ec681f3Smrg}
72087ec681f3Smrg
72097ec681f3Smrgvoid
72107ec681f3Smrgradv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer,
72117ec681f3Smrg                     const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
72127ec681f3Smrg                     const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
72137ec681f3Smrg                     const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
72147ec681f3Smrg                     const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
72157ec681f3Smrg                     uint32_t width, uint32_t height, uint32_t depth)
72167ec681f3Smrg{
72177ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
72187ec681f3Smrg   struct radv_dispatch_info info = {0};
72197ec681f3Smrg
72207ec681f3Smrg   info.blocks[0] = width;
72217ec681f3Smrg   info.blocks[1] = height;
72227ec681f3Smrg   info.blocks[2] = depth;
72237ec681f3Smrg   info.unaligned = 1;
72247ec681f3Smrg
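7ec681f3Smrg   /* This order must match the SBT record layout written by
7ec681f3Smrg    * radv_rt_bind_tables: raygen, miss, hit, callable.
7ec681f3Smrg    */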
72257ec681f3Smrg   const VkStridedDeviceAddressRegionKHR tables[] = {
72267ec681f3Smrg      *pRaygenShaderBindingTable,
72277ec681f3Smrg      *pMissShaderBindingTable,
72287ec681f3Smrg      *pHitShaderBindingTable,
72297ec681f3Smrg      *pCallableShaderBindingTable,
72307ec681f3Smrg   };
72317ec681f3Smrg
72327ec681f3Smrg   if (!radv_rt_bind_tables(cmd_buffer, tables)) {
72337ec681f3Smrg      return;
72347ec681f3Smrg   }
72357ec681f3Smrg
72367ec681f3Smrg   struct radv_userdata_info *loc = radv_lookup_user_sgpr(
72377ec681f3Smrg      cmd_buffer->state.rt_pipeline, MESA_SHADER_COMPUTE, AC_UD_CS_RAY_LAUNCH_SIZE);
72387ec681f3Smrg
72397ec681f3Smrg   if (loc->sgpr_idx != -1) {
72407ec681f3Smrg      assert(loc->num_sgprs == 3);
72417ec681f3Smrg
72427ec681f3Smrg      radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B900_COMPUTE_USER_DATA_0 + loc->sgpr_idx * 4, 3);
72437ec681f3Smrg      radeon_emit(cmd_buffer->cs, width);
72447ec681f3Smrg      radeon_emit(cmd_buffer->cs, height);
72457ec681f3Smrg      radeon_emit(cmd_buffer->cs, depth);
72467ec681f3Smrg   }
72477ec681f3Smrg
72487ec681f3Smrg   radv_rt_dispatch(cmd_buffer, &info);
72497ec681f3Smrg}
72507ec681f3Smrg
72517ec681f3Smrgstatic void
72527ec681f3Smrgradv_set_rt_stack_size(struct radv_cmd_buffer *cmd_buffer, uint32_t size)
72537ec681f3Smrg{
72547ec681f3Smrg   unsigned wave_size = 0;
72557ec681f3Smrg   unsigned scratch_bytes_per_wave = 0;
72567ec681f3Smrg
72577ec681f3Smrg   if (cmd_buffer->state.rt_pipeline) {
72587ec681f3Smrg      scratch_bytes_per_wave = cmd_buffer->state.rt_pipeline->scratch_bytes_per_wave;
72597ec681f3Smrg      wave_size = cmd_buffer->state.rt_pipeline->shaders[MESA_SHADER_COMPUTE]->info.wave_size;
72607ec681f3Smrg   }
72617ec681f3Smrg
72627ec681f3Smrg   /* The hardware register is specified as a multiple of 256 dwords (1024 bytes). */
72637ec681f3Smrg   scratch_bytes_per_wave += align(size * wave_size, 1024);
72647ec681f3Smrg
72657ec681f3Smrg   cmd_buffer->compute_scratch_size_per_wave_needed =
72667ec681f3Smrg      MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
72677ec681f3Smrg}
72687ec681f3Smrg
72697ec681f3Smrgvoid
72707ec681f3Smrgradv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
72717ec681f3Smrg{
72727ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
72737ec681f3Smrg
72747ec681f3Smrg   radv_set_rt_stack_size(cmd_buffer, size);
72757ec681f3Smrg   cmd_buffer->state.rt_stack_size = size;
72767ec681f3Smrg}
72777ec681f3Smrg
72787ec681f3Smrgvoid
72797ec681f3Smrgradv_cmd_buffer_end_render_pass(struct radv_cmd_buffer *cmd_buffer)
72807ec681f3Smrg{
72817ec681f3Smrg   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
72827ec681f3Smrg   vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.subpass_sample_locs);
72837ec681f3Smrg
72847ec681f3Smrg   cmd_buffer->state.pass = NULL;
72857ec681f3Smrg   cmd_buffer->state.subpass = NULL;
72867ec681f3Smrg   cmd_buffer->state.attachments = NULL;
72877ec681f3Smrg   cmd_buffer->state.framebuffer = NULL;
72887ec681f3Smrg   cmd_buffer->state.subpass_sample_locs = NULL;
72897ec681f3Smrg}
72907ec681f3Smrg
72917ec681f3Smrgvoid
72927ec681f3Smrgradv_CmdEndRenderPass2(VkCommandBuffer commandBuffer, const VkSubpassEndInfo *pSubpassEndInfo)
72937ec681f3Smrg{
72947ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
72957ec681f3Smrg
72967ec681f3Smrg   radv_mark_noncoherent_rb(cmd_buffer);
72977ec681f3Smrg
72987ec681f3Smrg   radv_emit_subpass_barrier(cmd_buffer, &cmd_buffer->state.pass->end_barrier);
72997ec681f3Smrg
73007ec681f3Smrg   radv_cmd_buffer_end_subpass(cmd_buffer);
73017ec681f3Smrg
73027ec681f3Smrg   radv_cmd_buffer_end_render_pass(cmd_buffer);
73037ec681f3Smrg}
73047ec681f3Smrg
73057ec681f3Smrg/*
73067ec681f3Smrg * For HTILE we have the following interesting clear words:
73077ec681f3Smrg *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
73087ec681f3Smrg *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
73097ec681f3Smrg *   0xfffffff0: Clear depth to 1.0
73107ec681f3Smrg *   0x00000000: Clear depth to 0.0
73117ec681f3Smrg */
73127ec681f3Smrgstatic void
73137ec681f3Smrgradv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
73147ec681f3Smrg                      const VkImageSubresourceRange *range)
73157ec681f3Smrg{
73167ec681f3Smrg   struct radv_cmd_state *state = &cmd_buffer->state;
73177ec681f3Smrg   uint32_t htile_value = radv_get_htile_initial_value(cmd_buffer->device, image);
73187ec681f3Smrg   VkClearDepthStencilValue value = {0};
73197ec681f3Smrg   struct radv_barrier_data barrier = {0};
73207ec681f3Smrg
73217ec681f3Smrg   barrier.layout_transitions.init_mask_ram = 1;
73227ec681f3Smrg   radv_describe_layout_transition(cmd_buffer, &barrier);
73237ec681f3Smrg
73247ec681f3Smrg   /* When transitioning from LAYOUT_UNDEFINED, not everyone is consistent
73257ec681f3Smrg    * in considering previous rendering work for WAW hazards. */
73267ec681f3Smrg   state->flush_bits |=
73277ec681f3Smrg      radv_src_access_flush(cmd_buffer, VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
73287ec681f3Smrg
73297ec681f3Smrg   if (image->planes[0].surface.has_stencil &&
73307ec681f3Smrg       !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
73317ec681f3Smrg      /* Flush caches before performing a separate aspect initialization because it's a
73327ec681f3Smrg       * read-modify-write operation.
73337ec681f3Smrg       */
73347ec681f3Smrg      state->flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_SHADER_READ_BIT, image);
73357ec681f3Smrg   }
73367ec681f3Smrg
73377ec681f3Smrg   state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
73387ec681f3Smrg
73397ec681f3Smrg   radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
73407ec681f3Smrg
73417ec681f3Smrg   if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
73427ec681f3Smrg      /* Initialize the TC-compat metadata value to 0 because by
73437ec681f3Smrg       * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
73447ec681f3Smrg       * have to conditionally update its value when performing
73457ec681f3Smrg       * a fast depth clear.
73467ec681f3Smrg       */
73477ec681f3Smrg      radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
73487ec681f3Smrg   }
73497ec681f3Smrg}
73507ec681f3Smrg
73517ec681f3Smrgstatic void
73527ec681f3Smrgradv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
73537ec681f3Smrg                                   VkImageLayout src_layout, bool src_render_loop,
73547ec681f3Smrg                                   VkImageLayout dst_layout, bool dst_render_loop,
73557ec681f3Smrg                                   unsigned src_queue_mask, unsigned dst_queue_mask,
73567ec681f3Smrg                                   const VkImageSubresourceRange *range,
73577ec681f3Smrg                                   struct radv_sample_locations_state *sample_locs)
73587ec681f3Smrg{
73597ec681f3Smrg   struct radv_device *device = cmd_buffer->device;
73607ec681f3Smrg
73617ec681f3Smrg   if (!radv_htile_enabled(image, range->baseMipLevel))
73627ec681f3Smrg      return;
73637ec681f3Smrg
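   /* Three cases: transitions out of UNDEFINED (re)initialize HTILE;
    * transitions from a decompressed to a compressed layout also
    * reinitialize it; transitions from compressed to decompressed require
    * a depth/stencil expand, with DB flushes around it.
    */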
73647ec681f3Smrg   if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
73657ec681f3Smrg      radv_initialize_htile(cmd_buffer, image, range);
73667ec681f3Smrg   } else if (!radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
73677ec681f3Smrg                                               src_queue_mask) &&
73687ec681f3Smrg              radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
73697ec681f3Smrg                                              dst_queue_mask)) {
73707ec681f3Smrg      radv_initialize_htile(cmd_buffer, image, range);
73717ec681f3Smrg   } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_render_loop,
73727ec681f3Smrg                                              src_queue_mask) &&
73737ec681f3Smrg              !radv_layout_is_htile_compressed(device, image, dst_layout, dst_render_loop,
73747ec681f3Smrg                                               dst_queue_mask)) {
73757ec681f3Smrg      cmd_buffer->state.flush_bits |=
73767ec681f3Smrg         RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
73777ec681f3Smrg
73787ec681f3Smrg      radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
73797ec681f3Smrg
73807ec681f3Smrg      cmd_buffer->state.flush_bits |=
73817ec681f3Smrg         RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
73827ec681f3Smrg   }
73837ec681f3Smrg}
73847ec681f3Smrg
73857ec681f3Smrgstatic uint32_t
73867ec681f3Smrgradv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
73877ec681f3Smrg                const VkImageSubresourceRange *range, uint32_t value)
73887ec681f3Smrg{
73897ec681f3Smrg   struct radv_barrier_data barrier = {0};
73907ec681f3Smrg
73917ec681f3Smrg   barrier.layout_transitions.init_mask_ram = 1;
73927ec681f3Smrg   radv_describe_layout_transition(cmd_buffer, &barrier);
73937ec681f3Smrg
73947ec681f3Smrg   return radv_clear_cmask(cmd_buffer, image, range, value);
73957ec681f3Smrg}
73967ec681f3Smrg
73977ec681f3Smrguint32_t
73987ec681f3Smrgradv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
73997ec681f3Smrg                const VkImageSubresourceRange *range)
74007ec681f3Smrg{
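   /* FMASK clear values for the identity-expanded state, indexed by
    * log2(samples); e.g. 0xE4 is the identity mapping 3,2,1,0 in 2-bit
    * fields for 4x MSAA.
    */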
74017ec681f3Smrg   static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
74027ec681f3Smrg   uint32_t log2_samples = util_logbase2(image->info.samples);
74037ec681f3Smrg   uint32_t value = fmask_clear_values[log2_samples];
74047ec681f3Smrg   struct radv_barrier_data barrier = {0};
74057ec681f3Smrg
74067ec681f3Smrg   barrier.layout_transitions.init_mask_ram = 1;
74077ec681f3Smrg   radv_describe_layout_transition(cmd_buffer, &barrier);
74087ec681f3Smrg
74097ec681f3Smrg   return radv_clear_fmask(cmd_buffer, image, range, value);
74107ec681f3Smrg}
74117ec681f3Smrg
74127ec681f3Smrguint32_t
74137ec681f3Smrgradv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
74147ec681f3Smrg              const VkImageSubresourceRange *range, uint32_t value)
74157ec681f3Smrg{
74167ec681f3Smrg   struct radv_barrier_data barrier = {0};
74177ec681f3Smrg   uint32_t flush_bits = 0;
74187ec681f3Smrg   unsigned size = 0;
74197ec681f3Smrg
74207ec681f3Smrg   barrier.layout_transitions.init_mask_ram = 1;
74217ec681f3Smrg   radv_describe_layout_transition(cmd_buffer, &barrier);
74227ec681f3Smrg
74237ec681f3Smrg   flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
74247ec681f3Smrg
74257ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX8) {
74267ec681f3Smrg      /* When DCC is enabled with mipmaps, some levels might not
74277ec681f3Smrg       * support fast clears and we have to initialize them as "fully
74287ec681f3Smrg       * expanded".
74297ec681f3Smrg       */
74307ec681f3Smrg      /* Compute the size of all fast clearable DCC levels. */
74317ec681f3Smrg      for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
74327ec681f3Smrg         struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
74337ec681f3Smrg         unsigned dcc_fast_clear_size =
74347ec681f3Smrg            dcc_level->dcc_slice_fast_clear_size * image->info.array_size;
74357ec681f3Smrg
74367ec681f3Smrg         if (!dcc_fast_clear_size)
74377ec681f3Smrg            break;
74387ec681f3Smrg
74397ec681f3Smrg         size = dcc_level->dcc_offset + dcc_fast_clear_size;
74407ec681f3Smrg      }
74417ec681f3Smrg
74427ec681f3Smrg      /* Initialize the mipmap levels without DCC. */
74437ec681f3Smrg      if (size != image->planes[0].surface.meta_size) {
74447ec681f3Smrg         flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bo,
74457ec681f3Smrg                                        image->offset + image->planes[0].surface.meta_offset + size,
74467ec681f3Smrg                                        image->planes[0].surface.meta_size - size, 0xffffffff);
74477ec681f3Smrg      }
74487ec681f3Smrg   }
74497ec681f3Smrg
74507ec681f3Smrg   return flush_bits;
74517ec681f3Smrg}
74527ec681f3Smrg
74537ec681f3Smrg/**
74547ec681f3Smrg * Initialize DCC/FMASK/CMASK metadata for a color image.
74557ec681f3Smrg */
74567ec681f3Smrgstatic void
74577ec681f3Smrgradv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
74587ec681f3Smrg                               VkImageLayout src_layout, bool src_render_loop,
74597ec681f3Smrg                               VkImageLayout dst_layout, bool dst_render_loop,
74607ec681f3Smrg                               unsigned src_queue_mask, unsigned dst_queue_mask,
74617ec681f3Smrg                               const VkImageSubresourceRange *range)
74627ec681f3Smrg{
74637ec681f3Smrg   uint32_t flush_bits = 0;
74647ec681f3Smrg
74657ec681f3Smrg   /* When transitioning from LAYOUT_UNDEFINED, not everyone is
74667ec681f3Smrg    * consistent in considering previous rendering work for WAW hazards.
74677ec681f3Smrg    */
74687ec681f3Smrg   cmd_buffer->state.flush_bits |=
74697ec681f3Smrg      radv_src_access_flush(cmd_buffer, VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, image);
74707ec681f3Smrg
74717ec681f3Smrg   if (radv_image_has_cmask(image)) {
74727ec681f3Smrg      uint32_t value;
74737ec681f3Smrg
74747ec681f3Smrg      if (cmd_buffer->device->physical_device->rad_info.chip_class == GFX9) {
74757ec681f3Smrg         /* TODO: Fix clearing CMASK layers on GFX9. */
74767ec681f3Smrg         if (radv_image_is_tc_compat_cmask(image) ||
74777ec681f3Smrg             (radv_image_has_fmask(image) &&
74787ec681f3Smrg              radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel, dst_layout,
74797ec681f3Smrg                                         dst_render_loop, dst_queue_mask))) {
74807ec681f3Smrg            value = 0xccccccccu;
74817ec681f3Smrg         } else {
74827ec681f3Smrg            value = 0xffffffffu;
74837ec681f3Smrg         }
74847ec681f3Smrg      } else {
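         /* CMASK initialization values indexed by log2(samples), chosen so
          * that no level is treated as fast-cleared.
          */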
74857ec681f3Smrg         static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
74867ec681f3Smrg         uint32_t log2_samples = util_logbase2(image->info.samples);
74877ec681f3Smrg
74887ec681f3Smrg         value = cmask_clear_values[log2_samples];
74897ec681f3Smrg      }
74907ec681f3Smrg
74917ec681f3Smrg      flush_bits |= radv_init_cmask(cmd_buffer, image, range, value);
74927ec681f3Smrg   }
74937ec681f3Smrg
74947ec681f3Smrg   if (radv_image_has_fmask(image)) {
74957ec681f3Smrg      flush_bits |= radv_init_fmask(cmd_buffer, image, range);
74967ec681f3Smrg   }
74977ec681f3Smrg
74987ec681f3Smrg   if (radv_dcc_enabled(image, range->baseMipLevel)) {
74997ec681f3Smrg      uint32_t value = 0xffffffffu; /* Fully expanded mode. */
75007ec681f3Smrg
75017ec681f3Smrg      if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
75027ec681f3Smrg                                     dst_layout, dst_render_loop, dst_queue_mask)) {
75037ec681f3Smrg         value = 0u;
75047ec681f3Smrg      }
75057ec681f3Smrg
75067ec681f3Smrg      flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
75077ec681f3Smrg   }
75087ec681f3Smrg
75097ec681f3Smrg   if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
75107ec681f3Smrg      radv_update_fce_metadata(cmd_buffer, image, range, false);
75117ec681f3Smrg
75127ec681f3Smrg      uint32_t color_values[2] = {0};
75137ec681f3Smrg      radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
75147ec681f3Smrg   }
75157ec681f3Smrg
75167ec681f3Smrg   cmd_buffer->state.flush_bits |= flush_bits;
75177ec681f3Smrg}
75187ec681f3Smrg
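/* Keep the displayable DCC surface in sync: when an image is about to be
 * presented (or handed to a foreign queue), retile its DCC metadata into
 * the separate display DCC surface.
 */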
75197ec681f3Smrgstatic void
75207ec681f3Smrgradv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
75217ec681f3Smrg                       VkImageLayout src_layout, VkImageLayout dst_layout, unsigned dst_queue_mask)
75227ec681f3Smrg{
75237ec681f3Smrg   if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
75247ec681f3Smrg       (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR ||
75257ec681f3Smrg        (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
75267ec681f3Smrg      radv_retile_dcc(cmd_buffer, image);
75277ec681f3Smrg}
75287ec681f3Smrg
75297ec681f3Smrgstatic bool
75307ec681f3Smrgradv_image_need_retile(const struct radv_image *image)
75317ec681f3Smrg{
75327ec681f3Smrg   return image->planes[0].surface.display_dcc_offset &&
75337ec681f3Smrg          image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
75347ec681f3Smrg}
75357ec681f3Smrg
75367ec681f3Smrg/**
75377ec681f3Smrg * Handle color image transitions for DCC/FMASK/CMASK.
75387ec681f3Smrg */
75397ec681f3Smrgstatic void
75407ec681f3Smrgradv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
75417ec681f3Smrg                                   VkImageLayout src_layout, bool src_render_loop,
75427ec681f3Smrg                                   VkImageLayout dst_layout, bool dst_render_loop,
75437ec681f3Smrg                                   unsigned src_queue_mask, unsigned dst_queue_mask,
75447ec681f3Smrg                                   const VkImageSubresourceRange *range)
75457ec681f3Smrg{
75467ec681f3Smrg   bool dcc_decompressed = false, fast_clear_flushed = false;
75477ec681f3Smrg
75487ec681f3Smrg   if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) &&
75497ec681f3Smrg       !radv_dcc_enabled(image, range->baseMipLevel))
75507ec681f3Smrg      return;
75517ec681f3Smrg
75527ec681f3Smrg   if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
75537ec681f3Smrg      radv_init_color_image_metadata(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
75547ec681f3Smrg                                     dst_render_loop, src_queue_mask, dst_queue_mask, range);
75557ec681f3Smrg
75567ec681f3Smrg      if (radv_image_need_retile(image))
75577ec681f3Smrg         radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
75587ec681f3Smrg      return;
75597ec681f3Smrg   }
75607ec681f3Smrg
75617ec681f3Smrg   if (radv_dcc_enabled(image, range->baseMipLevel)) {
75627ec681f3Smrg      if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
75637ec681f3Smrg         cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
75647ec681f3Smrg      } else if (radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
75657ec681f3Smrg                                            src_layout, src_render_loop, src_queue_mask) &&
75667ec681f3Smrg                 !radv_layout_dcc_compressed(cmd_buffer->device, image, range->baseMipLevel,
75677ec681f3Smrg                                             dst_layout, dst_render_loop, dst_queue_mask)) {
75687ec681f3Smrg         radv_decompress_dcc(cmd_buffer, image, range);
75697ec681f3Smrg         dcc_decompressed = true;
75707ec681f3Smrg      } else if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
75717ec681f3Smrg                                            src_layout, src_render_loop, src_queue_mask) &&
75727ec681f3Smrg                 !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
75737ec681f3Smrg                                             dst_layout, dst_render_loop, dst_queue_mask)) {
75747ec681f3Smrg         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
75757ec681f3Smrg         fast_clear_flushed = true;
75767ec681f3Smrg      }
75777ec681f3Smrg
75787ec681f3Smrg      if (radv_image_need_retile(image))
75797ec681f3Smrg         radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
75807ec681f3Smrg   } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
75817ec681f3Smrg      if (radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
75827ec681f3Smrg                                     src_layout, src_render_loop, src_queue_mask) &&
75837ec681f3Smrg          !radv_layout_can_fast_clear(cmd_buffer->device, image, range->baseMipLevel,
75847ec681f3Smrg                                      dst_layout, dst_render_loop, dst_queue_mask)) {
75857ec681f3Smrg         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
75867ec681f3Smrg         fast_clear_flushed = true;
75877ec681f3Smrg      }
75887ec681f3Smrg   }
75897ec681f3Smrg
75907ec681f3Smrg   /* MSAA color decompress. */
75917ec681f3Smrg   if (radv_image_has_fmask(image) &&
75927ec681f3Smrg       (image->usage & (VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT)) &&
75937ec681f3Smrg       radv_layout_fmask_compressed(cmd_buffer->device, image, src_layout, src_queue_mask) &&
75947ec681f3Smrg       !radv_layout_fmask_compressed(cmd_buffer->device, image, dst_layout, dst_queue_mask)) {
75957ec681f3Smrg      if (radv_dcc_enabled(image, range->baseMipLevel) &&
75967ec681f3Smrg          !radv_image_use_dcc_image_stores(cmd_buffer->device, image) && !dcc_decompressed) {
75977ec681f3Smrg         /* A DCC decompress is required before expanding FMASK
75987ec681f3Smrg          * when DCC stores aren't supported to avoid being in
75997ec681f3Smrg          * a state where DCC is compressed and the main
76007ec681f3Smrg          * surface is uncompressed.
76017ec681f3Smrg          */
76027ec681f3Smrg         radv_decompress_dcc(cmd_buffer, image, range);
76037ec681f3Smrg      } else if (!fast_clear_flushed) {
76047ec681f3Smrg         /* An FMASK decompress is required before expanding
76057ec681f3Smrg          * FMASK.
76067ec681f3Smrg          */
76077ec681f3Smrg         radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
76087ec681f3Smrg      }
76097ec681f3Smrg
76107ec681f3Smrg      struct radv_barrier_data barrier = {0};
76117ec681f3Smrg      barrier.layout_transitions.fmask_color_expand = 1;
76127ec681f3Smrg      radv_describe_layout_transition(cmd_buffer, &barrier);
76137ec681f3Smrg
76147ec681f3Smrg      radv_expand_fmask_image_inplace(cmd_buffer, image, range);
76157ec681f3Smrg   }
76167ec681f3Smrg}
76177ec681f3Smrg
76187ec681f3Smrgstatic void
76197ec681f3Smrgradv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
76207ec681f3Smrg                             VkImageLayout src_layout, bool src_render_loop,
76217ec681f3Smrg                             VkImageLayout dst_layout, bool dst_render_loop, uint32_t src_family,
76227ec681f3Smrg                             uint32_t dst_family, const VkImageSubresourceRange *range,
76237ec681f3Smrg                             struct radv_sample_locations_state *sample_locs)
76247ec681f3Smrg{
76257ec681f3Smrg   if (image->exclusive && src_family != dst_family) {
76267ec681f3Smrg      /* This is an acquire or a release operation and there will be
76277ec681f3Smrg       * a corresponding release/acquire. Do the transition in the
76287ec681f3Smrg       * most flexible queue. */
76297ec681f3Smrg
76307ec681f3Smrg      assert(src_family == cmd_buffer->queue_family_index ||
76317ec681f3Smrg             dst_family == cmd_buffer->queue_family_index);
76327ec681f3Smrg
76337ec681f3Smrg      if (src_family == VK_QUEUE_FAMILY_EXTERNAL || src_family == VK_QUEUE_FAMILY_FOREIGN_EXT)
76347ec681f3Smrg         return;
76357ec681f3Smrg
76367ec681f3Smrg      if (cmd_buffer->queue_family_index == RADV_QUEUE_TRANSFER)
76377ec681f3Smrg         return;
76387ec681f3Smrg
76397ec681f3Smrg      if (cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE &&
76407ec681f3Smrg          (src_family == RADV_QUEUE_GENERAL || dst_family == RADV_QUEUE_GENERAL))
76417ec681f3Smrg         return;
76427ec681f3Smrg   }
76437ec681f3Smrg
76447ec681f3Smrg   unsigned src_queue_mask =
76457ec681f3Smrg      radv_image_queue_family_mask(image, src_family, cmd_buffer->queue_family_index);
76467ec681f3Smrg   unsigned dst_queue_mask =
76477ec681f3Smrg      radv_image_queue_family_mask(image, dst_family, cmd_buffer->queue_family_index);
76487ec681f3Smrg
76497ec681f3Smrg   if (src_layout == dst_layout && src_render_loop == dst_render_loop && src_queue_mask == dst_queue_mask)
76507ec681f3Smrg      return;
76517ec681f3Smrg
76527ec681f3Smrg   if (vk_format_has_depth(image->vk_format)) {
76537ec681f3Smrg      radv_handle_depth_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
76547ec681f3Smrg                                         dst_render_loop, src_queue_mask, dst_queue_mask, range,
76557ec681f3Smrg                                         sample_locs);
76567ec681f3Smrg   } else {
76577ec681f3Smrg      radv_handle_color_image_transition(cmd_buffer, image, src_layout, src_render_loop, dst_layout,
76587ec681f3Smrg                                         dst_render_loop, src_queue_mask, dst_queue_mask, range);
76597ec681f3Smrg   }
76607ec681f3Smrg}
76617ec681f3Smrg
76627ec681f3Smrgstruct radv_barrier_info {
76637ec681f3Smrg   enum rgp_barrier_reason reason;
76647ec681f3Smrg   uint32_t eventCount;
76657ec681f3Smrg   const VkEvent *pEvents;
76667ec681f3Smrg   VkPipelineStageFlags srcStageMask;
76677ec681f3Smrg   VkPipelineStageFlags dstStageMask;
76687ec681f3Smrg};
76697ec681f3Smrg
76707ec681f3Smrgstatic void
76717ec681f3Smrgradv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t memoryBarrierCount,
76727ec681f3Smrg             const VkMemoryBarrier *pMemoryBarriers, uint32_t bufferMemoryBarrierCount,
76737ec681f3Smrg             const VkBufferMemoryBarrier *pBufferMemoryBarriers, uint32_t imageMemoryBarrierCount,
76747ec681f3Smrg             const VkImageMemoryBarrier *pImageMemoryBarriers, const struct radv_barrier_info *info)
76757ec681f3Smrg{
76767ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
76777ec681f3Smrg   enum radv_cmd_flush_bits src_flush_bits = 0;
76787ec681f3Smrg   enum radv_cmd_flush_bits dst_flush_bits = 0;
76797ec681f3Smrg
76807ec681f3Smrg   if (cmd_buffer->state.subpass)
76817ec681f3Smrg      radv_mark_noncoherent_rb(cmd_buffer);
76827ec681f3Smrg
76837ec681f3Smrg   radv_describe_barrier_start(cmd_buffer, info->reason);
76847ec681f3Smrg
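   /* For CmdWaitEvents, make the CP poll each event's BO until it reads 1
    * (the value written by CmdSetEvent) before processing the barrier.
    */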
76857ec681f3Smrg   for (unsigned i = 0; i < info->eventCount; ++i) {
76867ec681f3Smrg      RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
76877ec681f3Smrg      uint64_t va = radv_buffer_get_va(event->bo);
76887ec681f3Smrg
76897ec681f3Smrg      radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
76907ec681f3Smrg
76917ec681f3Smrg      ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
76927ec681f3Smrg
76937ec681f3Smrg      radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
76947ec681f3Smrg      assert(cmd_buffer->cs->cdw <= cdw_max);
76957ec681f3Smrg   }
76967ec681f3Smrg
76977ec681f3Smrg   for (uint32_t i = 0; i < memoryBarrierCount; i++) {
76987ec681f3Smrg      src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask, NULL);
76997ec681f3Smrg      dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask, NULL);
77007ec681f3Smrg   }
77017ec681f3Smrg
77027ec681f3Smrg   for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
77037ec681f3Smrg      src_flush_bits |=
77047ec681f3Smrg         radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask, NULL);
77057ec681f3Smrg      dst_flush_bits |=
77067ec681f3Smrg         radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask, NULL);
77077ec681f3Smrg   }
77087ec681f3Smrg
77097ec681f3Smrg   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
77107ec681f3Smrg      RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
77117ec681f3Smrg
77127ec681f3Smrg      src_flush_bits |=
77137ec681f3Smrg         radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask, image);
77147ec681f3Smrg      dst_flush_bits |=
77157ec681f3Smrg         radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask, image);
77167ec681f3Smrg   }
77177ec681f3Smrg
77187ec681f3Smrg   /* The Vulkan spec 1.1.98 says:
77197ec681f3Smrg    *
77207ec681f3Smrg    * "An execution dependency with only
77217ec681f3Smrg    *  VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT in the destination stage mask
77227ec681f3Smrg    *  will only prevent that stage from executing in subsequently
77237ec681f3Smrg    *  submitted commands. As this stage does not perform any actual
77247ec681f3Smrg    *  execution, this is not observable - in effect, it does not delay
77257ec681f3Smrg    *  processing of subsequent commands. Similarly an execution dependency
77267ec681f3Smrg    *  with only VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT in the source stage mask
77277ec681f3Smrg    *  will effectively not wait for any prior commands to complete."
77287ec681f3Smrg    */
77297ec681f3Smrg   if (info->dstStageMask != VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT)
77307ec681f3Smrg      radv_stage_flush(cmd_buffer, info->srcStageMask);
77317ec681f3Smrg   cmd_buffer->state.flush_bits |= src_flush_bits;
77327ec681f3Smrg
77337ec681f3Smrg   for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
77347ec681f3Smrg      RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
77357ec681f3Smrg
77367ec681f3Smrg      const struct VkSampleLocationsInfoEXT *sample_locs_info =
77377ec681f3Smrg         vk_find_struct_const(pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
77387ec681f3Smrg      struct radv_sample_locations_state sample_locations = {0};
77397ec681f3Smrg
77407ec681f3Smrg      if (sample_locs_info) {
77417ec681f3Smrg         assert(image->flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
77427ec681f3Smrg         sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
77437ec681f3Smrg         sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
77447ec681f3Smrg         sample_locations.count = sample_locs_info->sampleLocationsCount;
77457ec681f3Smrg         typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
77467ec681f3Smrg                      sample_locs_info->sampleLocationsCount);
77477ec681f3Smrg      }
77487ec681f3Smrg
77497ec681f3Smrg      radv_handle_image_transition(
77507ec681f3Smrg         cmd_buffer, image, pImageMemoryBarriers[i].oldLayout,
77517ec681f3Smrg         false, /* Outside of a renderpass we are never in a renderloop */
77527ec681f3Smrg         pImageMemoryBarriers[i].newLayout,
77537ec681f3Smrg         false, /* Outside of a renderpass we are never in a renderloop */
77547ec681f3Smrg         pImageMemoryBarriers[i].srcQueueFamilyIndex, pImageMemoryBarriers[i].dstQueueFamilyIndex,
77557ec681f3Smrg         &pImageMemoryBarriers[i].subresourceRange, sample_locs_info ? &sample_locations : NULL);
77567ec681f3Smrg   }
77577ec681f3Smrg
77587ec681f3Smrg   /* Make sure CP DMA is idle because the driver might have performed a
77597ec681f3Smrg    * DMA operation for copying or filling buffers/images.
77607ec681f3Smrg    */
77617ec681f3Smrg   if (info->srcStageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
77627ec681f3Smrg      si_cp_dma_wait_for_idle(cmd_buffer);
77637ec681f3Smrg
77647ec681f3Smrg   cmd_buffer->state.flush_bits |= dst_flush_bits;
77657ec681f3Smrg
77667ec681f3Smrg   radv_describe_barrier_end(cmd_buffer);
77677ec681f3Smrg}
77687ec681f3Smrg
77697ec681f3Smrgvoid
77707ec681f3Smrgradv_CmdPipelineBarrier(VkCommandBuffer commandBuffer, VkPipelineStageFlags srcStageMask,
77717ec681f3Smrg                        VkPipelineStageFlags destStageMask, VkBool32 byRegion,
77727ec681f3Smrg                        uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
77737ec681f3Smrg                        uint32_t bufferMemoryBarrierCount,
77747ec681f3Smrg                        const VkBufferMemoryBarrier *pBufferMemoryBarriers,
77757ec681f3Smrg                        uint32_t imageMemoryBarrierCount,
77767ec681f3Smrg                        const VkImageMemoryBarrier *pImageMemoryBarriers)
77777ec681f3Smrg{
77787ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
77797ec681f3Smrg   struct radv_barrier_info info;
77807ec681f3Smrg
77817ec681f3Smrg   info.reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER;
77827ec681f3Smrg   info.eventCount = 0;
77837ec681f3Smrg   info.pEvents = NULL;
77847ec681f3Smrg   info.srcStageMask = srcStageMask;
77857ec681f3Smrg   info.dstStageMask = destStageMask;
77867ec681f3Smrg
77877ec681f3Smrg   radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
77887ec681f3Smrg                pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info);
77897ec681f3Smrg}
77907ec681f3Smrg
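/* Write 'value' to the event BO once all work implied by stageMask has
 * finished, using the cheapest mechanism that suffices: a PFP or ME write
 * for top-of-pipe/post-index-fetch stages, otherwise an EOP event
 * (PS_DONE, CS_DONE or BOTTOM_OF_PIPE_TS).
 */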
77917ec681f3Smrgstatic void
77927ec681f3Smrgwrite_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event,
77937ec681f3Smrg            VkPipelineStageFlags stageMask, unsigned value)
77947ec681f3Smrg{
77957ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
77967ec681f3Smrg   uint64_t va = radv_buffer_get_va(event->bo);
77977ec681f3Smrg
77987ec681f3Smrg   si_emit_cache_flush(cmd_buffer);
77997ec681f3Smrg
78007ec681f3Smrg   radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
78017ec681f3Smrg
78027ec681f3Smrg   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28);
78037ec681f3Smrg
78047ec681f3Smrg   /* Flags that only require a top-of-pipe event. */
78057ec681f3Smrg   VkPipelineStageFlags top_of_pipe_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
78067ec681f3Smrg
78077ec681f3Smrg   /* Flags that only require a post-index-fetch event. */
78087ec681f3Smrg   VkPipelineStageFlags post_index_fetch_flags =
78097ec681f3Smrg      top_of_pipe_flags | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
78107ec681f3Smrg
78117ec681f3Smrg   /* Flags that only require signaling post PS. */
78127ec681f3Smrg   VkPipelineStageFlags post_ps_flags =
78137ec681f3Smrg      post_index_fetch_flags | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
78147ec681f3Smrg      VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
78157ec681f3Smrg      VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
78167ec681f3Smrg      VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT |
78177ec681f3Smrg      VK_PIPELINE_STAGE_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR |
78187ec681f3Smrg      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
78197ec681f3Smrg
78207ec681f3Smrg   /* Flags that only require signaling post CS. */
78217ec681f3Smrg   VkPipelineStageFlags post_cs_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
78227ec681f3Smrg
78237ec681f3Smrg   /* Make sure CP DMA is idle because the driver might have performed a
78247ec681f3Smrg    * DMA operation for copying or filling buffers/images.
78257ec681f3Smrg    */
78267ec681f3Smrg   if (stageMask & (VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT))
78277ec681f3Smrg      si_cp_dma_wait_for_idle(cmd_buffer);
78287ec681f3Smrg
78297ec681f3Smrg   if (!(stageMask & ~top_of_pipe_flags)) {
78307ec681f3Smrg      /* Just need to sync the PFP engine. */
78317ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
78327ec681f3Smrg      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_PFP));
78337ec681f3Smrg      radeon_emit(cs, va);
78347ec681f3Smrg      radeon_emit(cs, va >> 32);
78357ec681f3Smrg      radeon_emit(cs, value);
78367ec681f3Smrg   } else if (!(stageMask & ~post_index_fetch_flags)) {
78377ec681f3Smrg      /* Sync ME because PFP reads index and indirect buffers. */
78387ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
78397ec681f3Smrg      radeon_emit(cs, S_370_DST_SEL(V_370_MEM) | S_370_WR_CONFIRM(1) | S_370_ENGINE_SEL(V_370_ME));
78407ec681f3Smrg      radeon_emit(cs, va);
78417ec681f3Smrg      radeon_emit(cs, va >> 32);
78427ec681f3Smrg      radeon_emit(cs, value);
78437ec681f3Smrg   } else {
78447ec681f3Smrg      unsigned event_type;
78457ec681f3Smrg
78467ec681f3Smrg      if (!(stageMask & ~post_ps_flags)) {
78477ec681f3Smrg         /* Sync previous fragment shaders. */
78487ec681f3Smrg         event_type = V_028A90_PS_DONE;
78497ec681f3Smrg      } else if (!(stageMask & ~post_cs_flags)) {
78507ec681f3Smrg         /* Sync previous compute shaders. */
78517ec681f3Smrg         event_type = V_028A90_CS_DONE;
78527ec681f3Smrg      } else {
78537ec681f3Smrg         /* Otherwise, sync all prior GPU work. */
78547ec681f3Smrg         event_type = V_028A90_BOTTOM_OF_PIPE_TS;
78557ec681f3Smrg      }
78567ec681f3Smrg
78577ec681f3Smrg      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
78587ec681f3Smrg                                 radv_cmd_buffer_uses_mec(cmd_buffer), event_type, 0,
78597ec681f3Smrg                                 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, value,
78607ec681f3Smrg                                 cmd_buffer->gfx9_eop_bug_va);
78617ec681f3Smrg   }
78627ec681f3Smrg
78637ec681f3Smrg   assert(cmd_buffer->cs->cdw <= cdw_max);
78647ec681f3Smrg}
78657ec681f3Smrg
78667ec681f3Smrgvoid
78677ec681f3Smrgradv_CmdSetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
78687ec681f3Smrg{
78697ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
78707ec681f3Smrg   RADV_FROM_HANDLE(radv_event, event, _event);
78717ec681f3Smrg
78727ec681f3Smrg   write_event(cmd_buffer, event, stageMask, 1);
78737ec681f3Smrg}
78747ec681f3Smrg
78757ec681f3Smrgvoid
78767ec681f3Smrgradv_CmdResetEvent(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags stageMask)
78777ec681f3Smrg{
78787ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
78797ec681f3Smrg   RADV_FROM_HANDLE(radv_event, event, _event);
78807ec681f3Smrg
78817ec681f3Smrg   write_event(cmd_buffer, event, stageMask, 0);
78827ec681f3Smrg}
78837ec681f3Smrg
78847ec681f3Smrgvoid
78857ec681f3Smrgradv_CmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
78867ec681f3Smrg                   VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
78877ec681f3Smrg                   uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
78887ec681f3Smrg                   uint32_t bufferMemoryBarrierCount,
78897ec681f3Smrg                   const VkBufferMemoryBarrier *pBufferMemoryBarriers,
78907ec681f3Smrg                   uint32_t imageMemoryBarrierCount,
78917ec681f3Smrg                   const VkImageMemoryBarrier *pImageMemoryBarriers)
78927ec681f3Smrg{
78937ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
78947ec681f3Smrg   struct radv_barrier_info info;
78957ec681f3Smrg
78967ec681f3Smrg   info.reason = RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS;
78977ec681f3Smrg   info.eventCount = eventCount;
78987ec681f3Smrg   info.pEvents = pEvents;
78997ec681f3Smrg   info.srcStageMask = 0;
   info.dstStageMask = 0; /* radv_barrier() reads this; don't leave it undefined. */
79007ec681f3Smrg
79017ec681f3Smrg   radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, bufferMemoryBarrierCount,
79027ec681f3Smrg                pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers, &info);
79037ec681f3Smrg}
79047ec681f3Smrg
79057ec681f3Smrgvoid
79067ec681f3Smrgradv_CmdSetDeviceMask(VkCommandBuffer commandBuffer, uint32_t deviceMask)
79077ec681f3Smrg{
79087ec681f3Smrg   /* No-op */
79097ec681f3Smrg}
79107ec681f3Smrg
79117ec681f3Smrg/* VK_EXT_conditional_rendering */
79127ec681f3Smrgvoid
79137ec681f3Smrgradv_CmdBeginConditionalRenderingEXT(
79147ec681f3Smrg   VkCommandBuffer commandBuffer,
79157ec681f3Smrg   const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
79167ec681f3Smrg{
79177ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
79187ec681f3Smrg   RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
79197ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
79207ec681f3Smrg   unsigned pred_op = PREDICATION_OP_BOOL32;
79217ec681f3Smrg   bool draw_visible = true;
79227ec681f3Smrg   uint64_t va;
79237ec681f3Smrg
79247ec681f3Smrg   va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset;
79257ec681f3Smrg
79267ec681f3Smrg   /* By default, if the 32-bit value at offset in buffer memory is zero,
79277ec681f3Smrg    * then the rendering commands are discarded, otherwise they are
79287ec681f3Smrg    * executed as normal. If the inverted flag is set, all commands are
79297ec681f3Smrg    * discarded if the value is non-zero.
79307ec681f3Smrg    */
79317ec681f3Smrg   if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
79327ec681f3Smrg      draw_visible = false;
79337ec681f3Smrg   }
79347ec681f3Smrg
79357ec681f3Smrg   si_emit_cache_flush(cmd_buffer);
79367ec681f3Smrg
79377ec681f3Smrg   if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL &&
79387ec681f3Smrg       !cmd_buffer->device->physical_device->rad_info.has_32bit_predication) {
79397ec681f3Smrg      uint64_t pred_value = 0, pred_va;
79407ec681f3Smrg      unsigned pred_offset;
79417ec681f3Smrg
79427ec681f3Smrg      /* From the Vulkan spec 1.1.107:
79437ec681f3Smrg       *
79447ec681f3Smrg       * "If the 32-bit value at offset in buffer memory is zero,
79457ec681f3Smrg       *  then the rendering commands are discarded, otherwise they
79467ec681f3Smrg       *  are executed as normal. If the value of the predicate in
79477ec681f3Smrg       *  buffer memory changes while conditional rendering is
79487ec681f3Smrg       *  active, the rendering commands may be discarded in an
79497ec681f3Smrg       *  implementation-dependent way. Some implementations may
79507ec681f3Smrg       *  latch the value of the predicate upon beginning conditional
79517ec681f3Smrg       *  rendering while others may read it before every rendering
79527ec681f3Smrg       *  command."
79537ec681f3Smrg       *
79547ec681f3Smrg       * But, the AMD hardware treats the predicate as a 64-bit
79557ec681f3Smrg       * value which means we need a workaround in the driver.
79567ec681f3Smrg       * Luckily, it's not required to support if the value changes
79577ec681f3Smrg       * when predication is active.
79587ec681f3Smrg       *
79597ec681f3Smrg       * The workaround is as follows:
79607ec681f3Smrg       * 1) allocate a 64-bit value in the upload BO and initialize it
79617ec681f3Smrg       *    to 0
79627ec681f3Smrg       * 2) copy the 32-bit predicate value to the upload BO
79637ec681f3Smrg       * 3) use the new allocated VA address for predication
79647ec681f3Smrg       *
79657ec681f3Smrg       * Based on the conditionalrender demo, it's faster to do the
79667ec681f3Smrg       * COPY_DATA in ME (+ sync PFP) instead of PFP.
79677ec681f3Smrg       */
79687ec681f3Smrg      radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
79697ec681f3Smrg
79707ec681f3Smrg      pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
79717ec681f3Smrg
79727ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
79737ec681f3Smrg      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
79747ec681f3Smrg                         COPY_DATA_WR_CONFIRM);
79757ec681f3Smrg      radeon_emit(cs, va);
79767ec681f3Smrg      radeon_emit(cs, va >> 32);
79777ec681f3Smrg      radeon_emit(cs, pred_va);
79787ec681f3Smrg      radeon_emit(cs, pred_va >> 32);
79797ec681f3Smrg
79807ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
79817ec681f3Smrg      radeon_emit(cs, 0);
79827ec681f3Smrg
79837ec681f3Smrg      va = pred_va;
79847ec681f3Smrg      pred_op = PREDICATION_OP_BOOL64;
79857ec681f3Smrg   }
79867ec681f3Smrg
79877ec681f3Smrg   /* Enable predication for this command buffer. */
79887ec681f3Smrg   si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
79897ec681f3Smrg   cmd_buffer->state.predicating = true;
79907ec681f3Smrg
79917ec681f3Smrg   /* Store conditional rendering user info. */
79927ec681f3Smrg   cmd_buffer->state.predication_type = draw_visible;
79937ec681f3Smrg   cmd_buffer->state.predication_op = pred_op;
79947ec681f3Smrg   cmd_buffer->state.predication_va = va;
79957ec681f3Smrg}
79967ec681f3Smrg
79977ec681f3Smrgvoid
79987ec681f3Smrgradv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
79997ec681f3Smrg{
80007ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
80017ec681f3Smrg
80027ec681f3Smrg   /* Disable predication for this command buffer. */
80037ec681f3Smrg   si_emit_set_predication_state(cmd_buffer, false, 0, 0);
80047ec681f3Smrg   cmd_buffer->state.predicating = false;
80057ec681f3Smrg
80067ec681f3Smrg   /* Reset conditional rendering user info. */
80077ec681f3Smrg   cmd_buffer->state.predication_type = -1;
80087ec681f3Smrg   cmd_buffer->state.predication_op = 0;
80097ec681f3Smrg   cmd_buffer->state.predication_va = 0;
80107ec681f3Smrg}
80117ec681f3Smrg
80127ec681f3Smrg/* VK_EXT_transform_feedback */
80137ec681f3Smrgvoid
80147ec681f3Smrgradv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding,
80157ec681f3Smrg                                        uint32_t bindingCount, const VkBuffer *pBuffers,
80167ec681f3Smrg                                        const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes)
80177ec681f3Smrg{
80187ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
80197ec681f3Smrg   struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
80207ec681f3Smrg   uint8_t enabled_mask = 0;
80217ec681f3Smrg
80227ec681f3Smrg   assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
80237ec681f3Smrg   for (uint32_t i = 0; i < bindingCount; i++) {
80247ec681f3Smrg      uint32_t idx = firstBinding + i;
80257ec681f3Smrg
80267ec681f3Smrg      sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
80277ec681f3Smrg      sb[idx].offset = pOffsets[i];
80287ec681f3Smrg
80297ec681f3Smrg      if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
80307ec681f3Smrg         sb[idx].size = sb[idx].buffer->size - sb[idx].offset;
80317ec681f3Smrg      } else {
80327ec681f3Smrg         sb[idx].size = pSizes[i];
80337ec681f3Smrg      }
80347ec681f3Smrg
80357ec681f3Smrg      radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
80367ec681f3Smrg
80377ec681f3Smrg      enabled_mask |= 1 << idx;
80387ec681f3Smrg   }
80397ec681f3Smrg
80407ec681f3Smrg   cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
80417ec681f3Smrg
80427ec681f3Smrg   cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
80437ec681f3Smrg}
80447ec681f3Smrg
80457ec681f3Smrgstatic void
80467ec681f3Smrgradv_emit_streamout_enable(struct radv_cmd_buffer *cmd_buffer)
80477ec681f3Smrg{
80487ec681f3Smrg   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
80497ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
80507ec681f3Smrg
80517ec681f3Smrg   radeon_set_context_reg_seq(cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
80527ec681f3Smrg   radeon_emit(cs, S_028B94_STREAMOUT_0_EN(so->streamout_enabled) | S_028B94_RAST_STREAM(0) |
80537ec681f3Smrg                      S_028B94_STREAMOUT_1_EN(so->streamout_enabled) |
80547ec681f3Smrg                      S_028B94_STREAMOUT_2_EN(so->streamout_enabled) |
80557ec681f3Smrg                      S_028B94_STREAMOUT_3_EN(so->streamout_enabled));
80567ec681f3Smrg   radeon_emit(cs, so->hw_enabled_mask & so->enabled_stream_buffers_mask);
80577ec681f3Smrg
80587ec681f3Smrg   cmd_buffer->state.context_roll_without_scissor_emitted = true;
80597ec681f3Smrg}
80607ec681f3Smrg
80617ec681f3Smrgstatic void
80627ec681f3Smrgradv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
80637ec681f3Smrg{
80647ec681f3Smrg   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
80657ec681f3Smrg   bool old_streamout_enabled = so->streamout_enabled;
80667ec681f3Smrg   uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
80677ec681f3Smrg
80687ec681f3Smrg   so->streamout_enabled = enable;
80697ec681f3Smrg
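   /* Replicate the 4-bit buffer-enable mask into all four stream slots of
    * VGT_STRMOUT_BUFFER_CONFIG.
    */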
80707ec681f3Smrg   so->hw_enabled_mask = so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) |
80717ec681f3Smrg                         (so->enabled_mask << 12);
80727ec681f3Smrg
80737ec681f3Smrg   if (!cmd_buffer->device->physical_device->use_ngg_streamout &&
80747ec681f3Smrg       ((old_streamout_enabled != so->streamout_enabled) ||
80757ec681f3Smrg        (old_hw_enabled_mask != so->hw_enabled_mask)))
80767ec681f3Smrg      radv_emit_streamout_enable(cmd_buffer);
80777ec681f3Smrg
80787ec681f3Smrg   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
80797ec681f3Smrg      cmd_buffer->gds_needed = true;
80807ec681f3Smrg      cmd_buffer->gds_oa_needed = true;
80817ec681f3Smrg   }
80827ec681f3Smrg}
80837ec681f3Smrg
80847ec681f3Smrgstatic void
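/* Make the VGT streamout offsets coherent: kick an SO_VGTSTREAMOUT_FLUSH
 * event and wait until CP_STRMOUT_CNTL reports OFFSET_UPDATE_DONE, so that
 * subsequent STRMOUT_BUFFER_UPDATE packets see up-to-date buffer offsets.
 */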
80857ec681f3Smrgradv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
80867ec681f3Smrg{
80877ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
80887ec681f3Smrg   unsigned reg_strmout_cntl;
80897ec681f3Smrg
80907ec681f3Smrg   /* The register is at different places on different ASICs. */
80917ec681f3Smrg   if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX7) {
80927ec681f3Smrg      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
80937ec681f3Smrg      radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
80947ec681f3Smrg   } else {
80957ec681f3Smrg      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
80967ec681f3Smrg      radeon_set_config_reg(cs, reg_strmout_cntl, 0);
80977ec681f3Smrg   }
80987ec681f3Smrg
80997ec681f3Smrg   radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
81007ec681f3Smrg   radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
81017ec681f3Smrg
81027ec681f3Smrg   radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
81037ec681f3Smrg   radeon_emit(cs,
81047ec681f3Smrg               WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
81057ec681f3Smrg   radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
81067ec681f3Smrg   radeon_emit(cs, 0);
81077ec681f3Smrg   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
81087ec681f3Smrg   radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
81097ec681f3Smrg   radeon_emit(cs, 4);                              /* poll interval */
81107ec681f3Smrg}
81117ec681f3Smrg
81127ec681f3Smrgstatic void
81137ec681f3Smrgradv_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
81147ec681f3Smrg                          uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
81157ec681f3Smrg                          const VkDeviceSize *pCounterBufferOffsets)
81177ec681f3Smrg{
81187ec681f3Smrg   struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
81197ec681f3Smrg   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
81207ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
81217ec681f3Smrg
81227ec681f3Smrg   radv_flush_vgt_streamout(cmd_buffer);
81237ec681f3Smrg
81247ec681f3Smrg   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
81257ec681f3Smrg   u_foreach_bit(i, so->enabled_mask)
81267ec681f3Smrg   {
81277ec681f3Smrg      int32_t counter_buffer_idx = i - firstCounterBuffer;
81287ec681f3Smrg      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
81297ec681f3Smrg         counter_buffer_idx = -1;
81307ec681f3Smrg
81317ec681f3Smrg      /* AMD GCN binds streamout buffers as shader resources.
81327ec681f3Smrg       * VGT only counts primitives and tells the shader through
81337ec681f3Smrg       * SGPRs what to do.
81347ec681f3Smrg       */
81357ec681f3Smrg      radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
81367ec681f3Smrg      radeon_emit(cs, sb[i].size >> 2);     /* BUFFER_SIZE (in DW) */
81377ec681f3Smrg      radeon_emit(cs, so->stride_in_dw[i]); /* VTX_STRIDE (in DW) */
81387ec681f3Smrg
81397ec681f3Smrg      cmd_buffer->state.context_roll_without_scissor_emitted = true;
81407ec681f3Smrg
81417ec681f3Smrg      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
81427ec681f3Smrg         /* The array of counter buffers is optional. */
81437ec681f3Smrg         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
81447ec681f3Smrg         uint64_t va = radv_buffer_get_va(buffer->bo);
81457ec681f3Smrg         uint64_t counter_buffer_offset = 0;
81467ec681f3Smrg
81477ec681f3Smrg         if (pCounterBufferOffsets)
81487ec681f3Smrg            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
81497ec681f3Smrg
81507ec681f3Smrg         va += buffer->offset + counter_buffer_offset;
81517ec681f3Smrg
81527ec681f3Smrg         /* Append */
81537ec681f3Smrg         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
81547ec681f3Smrg         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |   /* offset in bytes */
81557ec681f3Smrg                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
81567ec681f3Smrg         radeon_emit(cs, 0);                                                 /* unused */
81577ec681f3Smrg         radeon_emit(cs, 0);                                                 /* unused */
81587ec681f3Smrg         radeon_emit(cs, va);                                                /* src address lo */
81597ec681f3Smrg         radeon_emit(cs, va >> 32);                                          /* src address hi */
81607ec681f3Smrg
81617ec681f3Smrg         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
81627ec681f3Smrg      } else {
81637ec681f3Smrg         /* Start from the beginning. */
81647ec681f3Smrg         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
81657ec681f3Smrg         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
81667ec681f3Smrg                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
81677ec681f3Smrg         radeon_emit(cs, 0);                                                    /* unused */
81687ec681f3Smrg         radeon_emit(cs, 0);                                                    /* unused */
81697ec681f3Smrg         radeon_emit(cs, 0);                                                    /* unused */
81707ec681f3Smrg         radeon_emit(cs, 0);                                                    /* unused */
81717ec681f3Smrg      }
81727ec681f3Smrg   }
81737ec681f3Smrg
81747ec681f3Smrg   radv_set_streamout_enable(cmd_buffer, true);
81757ec681f3Smrg}
81767ec681f3Smrg
81777ec681f3Smrgstatic void
81787ec681f3Smrggfx10_emit_streamout_begin(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
81797ec681f3Smrg                           uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
81807ec681f3Smrg                           const VkDeviceSize *pCounterBufferOffsets)
81817ec681f3Smrg{
81827ec681f3Smrg   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
81837ec681f3Smrg   unsigned last_target = util_last_bit(so->enabled_mask) - 1;
81847ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
81857ec681f3Smrg
81867ec681f3Smrg   assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
81877ec681f3Smrg   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
81887ec681f3Smrg
81897ec681f3Smrg   /* Sync because the next streamout operation will overwrite GDS and we
81907ec681f3Smrg    * have to make sure it's idle.
81917ec681f3Smrg    * TODO: Improve by tracking if there is a streamout operation in
81927ec681f3Smrg    * flight.
81937ec681f3Smrg    */
81947ec681f3Smrg   cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
81957ec681f3Smrg   si_emit_cache_flush(cmd_buffer);
81967ec681f3Smrg
81977ec681f3Smrg   u_foreach_bit(i, so->enabled_mask)
81987ec681f3Smrg   {
81997ec681f3Smrg      int32_t counter_buffer_idx = i - firstCounterBuffer;
82007ec681f3Smrg      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
82017ec681f3Smrg         counter_buffer_idx = -1;
82027ec681f3Smrg
82037ec681f3Smrg      bool append =
82047ec681f3Smrg         counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
82057ec681f3Smrg      uint64_t va = 0;
82067ec681f3Smrg
82077ec681f3Smrg      if (append) {
82087ec681f3Smrg         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
82097ec681f3Smrg         uint64_t counter_buffer_offset = 0;
82107ec681f3Smrg
82117ec681f3Smrg         if (pCounterBufferOffsets)
82127ec681f3Smrg            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
82137ec681f3Smrg
82147ec681f3Smrg         va += radv_buffer_get_va(buffer->bo);
82157ec681f3Smrg         va += buffer->offset + counter_buffer_offset;
82167ec681f3Smrg
82177ec681f3Smrg         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
82187ec681f3Smrg      }
82197ec681f3Smrg
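      /* Seed the per-buffer offset in GDS (dword i): either load the saved
       * counter from memory (append) or reset it to 0. NGG streamout keeps
       * its running offsets in GDS.
       */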
82207ec681f3Smrg      radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
82217ec681f3Smrg      radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
82227ec681f3Smrg                         S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
82237ec681f3Smrg      radeon_emit(cs, va);
82247ec681f3Smrg      radeon_emit(cs, va >> 32);
82257ec681f3Smrg      radeon_emit(cs, 4 * i); /* destination in GDS */
82267ec681f3Smrg      radeon_emit(cs, 0);
82277ec681f3Smrg      radeon_emit(cs, S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
82287ec681f3Smrg   }
82297ec681f3Smrg
82307ec681f3Smrg   radv_set_streamout_enable(cmd_buffer, true);
82317ec681f3Smrg}
82327ec681f3Smrg
82337ec681f3Smrgvoid
82347ec681f3Smrgradv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
82357ec681f3Smrg                                  uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
82367ec681f3Smrg                                  const VkDeviceSize *pCounterBufferOffsets)
82377ec681f3Smrg{
82387ec681f3Smrg   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
82397ec681f3Smrg
82407ec681f3Smrg   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
82417ec681f3Smrg      gfx10_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount,
82427ec681f3Smrg                                 pCounterBuffers, pCounterBufferOffsets);
82437ec681f3Smrg   } else {
82447ec681f3Smrg      radv_emit_streamout_begin(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
82457ec681f3Smrg                                pCounterBufferOffsets);
82467ec681f3Smrg   }
82477ec681f3Smrg}
82487ec681f3Smrg
82497ec681f3Smrgstatic void
82507ec681f3Smrgradv_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
82517ec681f3Smrg                        uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
82527ec681f3Smrg                        const VkDeviceSize *pCounterBufferOffsets)
82537ec681f3Smrg{
82547ec681f3Smrg   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
82557ec681f3Smrg   struct radeon_cmdbuf *cs = cmd_buffer->cs;
82567ec681f3Smrg
82577ec681f3Smrg   radv_flush_vgt_streamout(cmd_buffer);
82587ec681f3Smrg
82597ec681f3Smrg   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
82607ec681f3Smrg   u_foreach_bit(i, so->enabled_mask)
82617ec681f3Smrg   {
82627ec681f3Smrg      int32_t counter_buffer_idx = i - firstCounterBuffer;
82637ec681f3Smrg      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
82647ec681f3Smrg         counter_buffer_idx = -1;
82657ec681f3Smrg
82667ec681f3Smrg      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
82677ec681f3Smrg         /* The array of counter buffers is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

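         /* Save the current buffer-filled size (in bytes) to the counter
          * buffer so a later vkCmdBeginTransformFeedbackEXT can resume
          * appending where this capture left off.
          */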
         radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
                            STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                            STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
         radeon_emit(cs, va);                                  /* dst address lo */
         radeon_emit(cs, va >> 32);                            /* dst address hi */
         radeon_emit(cs, 0);                                   /* unused */
         radeon_emit(cs, 0);                                   /* unused */

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }

      /* Deactivate transform feedback by zeroing the buffer size.
       * The counters (primitives generated, primitives emitted) may
       * be enabled even if there is no buffer bound. This ensures
       * that the primitives-emitted query won't increment.
       */
      radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);

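      /* Writing a context register rolls the hardware context; record the
       * roll so scissor state can be re-emitted where the hardware
       * (presumably the GFX9 scissor bug workaround) requires it.
       */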
      cmd_buffer->state.context_roll_without_scissor_emitted = true;
   }

   radv_set_streamout_enable(cmd_buffer, false);
}

static void
gfx10_emit_streamout_end(struct radv_cmd_buffer *cmd_buffer, uint32_t firstCounterBuffer,
                         uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                         const VkDeviceSize *pCounterBufferOffsets)
{
   struct radv_streamout_state *so = &cmd_buffer->state.streamout;
   struct radeon_cmdbuf *cs = cmd_buffer->cs;

   assert(cmd_buffer->device->physical_device->rad_info.chip_class >= GFX10);
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

   u_foreach_bit(i, so->enabled_mask)
   {
      int32_t counter_buffer_idx = i - firstCounterBuffer;
      if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
         counter_buffer_idx = -1;

      if (counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx]) {
         /* The counter buffer array is optional. */
         RADV_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
         uint64_t va = radv_buffer_get_va(buffer->bo);
         uint64_t counter_buffer_offset = 0;

         if (pCounterBufferOffsets)
            counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];

         va += buffer->offset + counter_buffer_offset;

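         /* Flush the GDS counter back to memory: a PS_DONE end-of-pipe
          * event copies one dword from GDS dword offset i into the counter
          * buffer through TC L2 once all prior work has completed.
          */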
         si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                    radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_PS_DONE, 0,
                                    EOP_DST_SEL_TC_L2, EOP_DATA_SEL_GDS, va, EOP_DATA_GDS(i, 1), 0);

         radv_cs_add_buffer(cmd_buffer->device->ws, cs, buffer->bo);
      }
   }

   radv_set_streamout_enable(cmd_buffer, false);
}

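/* Entry point for vkCmdEndTransformFeedbackEXT; mirrors the Begin entry
 * point in choosing between the NGG/GDS and legacy VGT paths.
 */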
void
radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
                                uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
                                const VkDeviceSize *pCounterBufferOffsets)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);

   if (cmd_buffer->device->physical_device->use_ngg_streamout) {
      gfx10_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                               pCounterBufferOffsets);
   } else {
      radv_emit_streamout_end(cmd_buffer, firstCounterBuffer, counterBufferCount, pCounterBuffers,
                              pCounterBufferOffsets);
   }
}

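/* Implements vkCmdDrawIndirectByteCountEXT: the vertex count is not given
 * by the caller but derived by the hardware from the byte count stored in
 * the counter buffer (filled size / vertexStride) via the DRAW_OPAQUE path
 * (USE_OPAQUE below).  An illustrative capture-then-draw sequence from the
 * API side (buffer and offset names are hypothetical):
 *
 *    vkCmdBeginTransformFeedbackEXT(cmd, 0, 1, &counterBuf, &counterOff);
 *    ... draws whose vertex outputs are captured to buffer 0 ...
 *    vkCmdEndTransformFeedbackEXT(cmd, 0, 1, &counterBuf, &counterOff);
 *    vkCmdDrawIndirectByteCountEXT(cmd, 1, 0, counterBuf, counterOff,
 *                                  0, vertexStride);
 */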
void
radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount,
                                 uint32_t firstInstance, VkBuffer _counterBuffer,
                                 VkDeviceSize counterBufferOffset, uint32_t counterOffset,
                                 uint32_t vertexStride)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
   struct radv_draw_info info;

   info.count = 0;
   info.instance_count = instanceCount;
   info.first_instance = firstInstance;
   info.strmout_buffer = counterBuffer;
   info.strmout_buffer_offset = counterBufferOffset;
   info.stride = vertexStride;
   info.indexed = false;
   info.indirect = NULL;

   if (!radv_before_draw(cmd_buffer, &info, 1))
      return;
   struct VkMultiDrawInfoEXT minfo = { 0, 0 };
   radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
   radv_after_draw(cmd_buffer);
}

/* VK_AMD_buffer_marker */
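/* Writes a 32-bit marker value to dstBuffer at dstOffset, either
 * immediately from the CP (top of pipe) or once all prior work has
 * completed (bottom of pipe); tools typically use such markers to narrow
 * down where a GPU hang occurred.
 */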
void
radv_CmdWriteBufferMarkerAMD(VkCommandBuffer commandBuffer, VkPipelineStageFlagBits pipelineStage,
                             VkBuffer dstBuffer, VkDeviceSize dstOffset, uint32_t marker)
{
   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   RADV_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
   struct radeon_cmdbuf *cs = cmd_buffer->cs;
   uint64_t va = radv_buffer_get_va(buffer->bo) + dstOffset;

   si_emit_cache_flush(cmd_buffer);

   ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 12);

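   /* If no stage beyond TOP_OF_PIPE is requested, the marker does not need
    * to wait for anything; write it directly with a CP COPY_DATA.
    */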
   if (!(pipelineStage & ~VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT)) {
      radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
      radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
                         COPY_DATA_WR_CONFIRM);
      radeon_emit(cs, marker);
      radeon_emit(cs, 0);
      radeon_emit(cs, va);
      radeon_emit(cs, va >> 32);
   } else {
      si_cs_emit_write_event_eop(cs, cmd_buffer->device->physical_device->rad_info.chip_class,
                                 radv_cmd_buffer_uses_mec(cmd_buffer), V_028A90_BOTTOM_OF_PIPE_TS,
                                 0, EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker,
                                 cmd_buffer->gfx9_eop_bug_va);
   }

   assert(cmd_buffer->cs->cdw <= cdw_max);
}