/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
2701e04c3fSmrg
287ec681f3Smrg#include "nir/nir.h"
297ec681f3Smrg#include "nir/nir_builder.h"
307ec681f3Smrg#include "nir/nir_xfb_info.h"
317ec681f3Smrg#include "spirv/nir_spirv.h"
327ec681f3Smrg#include "util/disk_cache.h"
3301e04c3fSmrg#include "util/mesa-sha1.h"
3401e04c3fSmrg#include "util/u_atomic.h"
357ec681f3Smrg#include "radv_cs.h"
3601e04c3fSmrg#include "radv_debug.h"
3701e04c3fSmrg#include "radv_private.h"
3801e04c3fSmrg#include "radv_shader.h"
3901e04c3fSmrg#include "vk_util.h"
4001e04c3fSmrg
4101e04c3fSmrg#include "util/debug.h"
427ec681f3Smrg#include "ac_binary.h"
4301e04c3fSmrg#include "ac_exp_param.h"
447ec681f3Smrg#include "ac_nir.h"
4501e04c3fSmrg#include "ac_shader_util.h"
467ec681f3Smrg#include "aco_interface.h"
477ec681f3Smrg#include "sid.h"
487ec681f3Smrg#include "vk_format.h"
4901e04c3fSmrg
/* Color-blend state computed at pipeline-creation time from
 * VkPipelineColorBlendStateCreateInfo.  Most fields are packed register
 * values, 4 bits per MRT unless noted otherwise.
 */
struct radv_blend_state {
   /* 4 bits per MRT: nibble non-zero when blending is enabled for it. */
   uint32_t blend_enable_4bit;
   /* 1 bit per MRT: the blend equation needs the source alpha channel. */
   uint32_t need_src_alpha;

   /* Per-MRT colorWriteMask nibbles (CB_TARGET_MASK). */
   uint32_t cb_target_mask;
   /* 4 bits per MRT, all set when the target has any write mask bit. */
   uint32_t cb_target_enabled_4bit;
   /* Per-MRT SX_MRT*_BLEND_OPT and CB_BLEND*_CONTROL register values. */
   uint32_t sx_mrt_blend_opt[8];
   uint32_t cb_blend_control[8];

   /* SPI_SHADER_COL_FORMAT: export format nibble per MRT. */
   uint32_t spi_shader_col_format;
   /* 1 bit per MRT: attachment is a pure-integer 8-bit / 10-bit format. */
   uint32_t col_format_is_int8;
   uint32_t col_format_is_int10;
   uint32_t cb_shader_mask;
   /* DB_ALPHA_TO_MASK: alpha-to-coverage dither offsets and enable. */
   uint32_t db_alpha_to_mask;

   /* 4 bits per MRT: channels whose blend equation commutes (MIN/MAX),
    * usable for out-of-order rasterization decisions. */
   uint32_t commutative_4bit;

   /* Export only MRT0 (set by custom/meta blend modes). */
   bool single_cb_enable;
   /* Dual-source blending is in use; only MRT0 is honored then. */
   bool mrt0_is_dual_src;
};
7001e04c3fSmrg
/* Result of analyzing a depth/stencil configuration for out-of-order
 * rasterization: which results stay invariant when fragments are
 * processed in an arbitrary order.
 */
struct radv_dsa_order_invariance {
   /* Whether the final result in Z/S buffers is guaranteed to be
    * invariant under changes to the order in which fragments arrive.
    */
   bool zs;

   /* Whether the set of fragments that pass the combined Z/S test is
    * guaranteed to be invariant under changes to the order in which
    * fragments arrive.
    */
   bool pass_set;
};
8301e04c3fSmrg
847ec681f3Smrgstatic bool
857ec681f3Smrgradv_is_state_dynamic(const VkGraphicsPipelineCreateInfo *pCreateInfo, VkDynamicState state)
867ec681f3Smrg{
877ec681f3Smrg   if (pCreateInfo->pDynamicState) {
887ec681f3Smrg      uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
897ec681f3Smrg      for (uint32_t i = 0; i < count; i++) {
907ec681f3Smrg         if (pCreateInfo->pDynamicState->pDynamicStates[i] == state)
917ec681f3Smrg            return true;
927ec681f3Smrg      }
937ec681f3Smrg   }
947ec681f3Smrg
957ec681f3Smrg   return false;
967ec681f3Smrg}
9701e04c3fSmrg
987ec681f3Smrgstatic const VkPipelineMultisampleStateCreateInfo *
997ec681f3Smrgradv_pipeline_get_multisample_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
1007ec681f3Smrg{
1017ec681f3Smrg   if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
1027ec681f3Smrg       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
1037ec681f3Smrg      return pCreateInfo->pMultisampleState;
1047ec681f3Smrg   return NULL;
1057ec681f3Smrg}
10601e04c3fSmrg
1077ec681f3Smrgstatic const VkPipelineTessellationStateCreateInfo *
1087ec681f3Smrgradv_pipeline_get_tessellation_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
10901e04c3fSmrg{
1107ec681f3Smrg   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
1117ec681f3Smrg      if (pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT ||
1127ec681f3Smrg          pCreateInfo->pStages[i].stage == VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) {
1137ec681f3Smrg         return pCreateInfo->pTessellationState;
1147ec681f3Smrg      }
1157ec681f3Smrg   }
1167ec681f3Smrg   return NULL;
1177ec681f3Smrg}
1187ec681f3Smrg
1197ec681f3Smrgstatic const VkPipelineDepthStencilStateCreateInfo *
1207ec681f3Smrgradv_pipeline_get_depth_stencil_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
1217ec681f3Smrg{
1227ec681f3Smrg   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1237ec681f3Smrg   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
1247ec681f3Smrg
1257ec681f3Smrg   if ((!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
1267ec681f3Smrg        subpass->depth_stencil_attachment) ||
1277ec681f3Smrg       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
1287ec681f3Smrg      return pCreateInfo->pDepthStencilState;
1297ec681f3Smrg   return NULL;
1307ec681f3Smrg}
13101e04c3fSmrg
1327ec681f3Smrgstatic const VkPipelineColorBlendStateCreateInfo *
1337ec681f3Smrgradv_pipeline_get_color_blend_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
1347ec681f3Smrg{
1357ec681f3Smrg   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
1367ec681f3Smrg   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
13701e04c3fSmrg
1387ec681f3Smrg   if ((!pCreateInfo->pRasterizationState->rasterizerDiscardEnable && subpass->has_color_att) ||
1397ec681f3Smrg       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT))
1407ec681f3Smrg      return pCreateInfo->pColorBlendState;
1417ec681f3Smrg   return NULL;
14201e04c3fSmrg}
14301e04c3fSmrg
1447ec681f3Smrgstatic bool
1457ec681f3Smrgradv_pipeline_has_ngg(const struct radv_pipeline *pipeline)
14601e04c3fSmrg{
1477ec681f3Smrg   if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_NONE)
1487ec681f3Smrg      return false;
14901e04c3fSmrg
1507ec681f3Smrg   struct radv_shader_variant *variant =
1517ec681f3Smrg      pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
15201e04c3fSmrg
1537ec681f3Smrg   return variant->info.is_ngg;
15401e04c3fSmrg}
15501e04c3fSmrg
1567ec681f3Smrgbool
1577ec681f3Smrgradv_pipeline_has_ngg_passthrough(const struct radv_pipeline *pipeline)
15801e04c3fSmrg{
1597ec681f3Smrg   if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_NONE)
1607ec681f3Smrg      return false;
16101e04c3fSmrg
1627ec681f3Smrg   assert(radv_pipeline_has_ngg(pipeline));
1637ec681f3Smrg
1647ec681f3Smrg   struct radv_shader_variant *variant =
1657ec681f3Smrg      pipeline->shaders[pipeline->graphics.last_vgt_api_stage];
1667ec681f3Smrg
1677ec681f3Smrg   return variant->info.is_ngg_passthrough;
16801e04c3fSmrg}
16901e04c3fSmrg
1707ec681f3Smrgbool
1717ec681f3Smrgradv_pipeline_has_gs_copy_shader(const struct radv_pipeline *pipeline)
1727ec681f3Smrg{
1737ec681f3Smrg   return !!pipeline->gs_copy_shader;
1747ec681f3Smrg}
1757ec681f3Smrg
1767ec681f3Smrgvoid
1777ec681f3Smrgradv_pipeline_destroy(struct radv_device *device, struct radv_pipeline *pipeline,
1787ec681f3Smrg                      const VkAllocationCallbacks *allocator)
1797ec681f3Smrg{
1807ec681f3Smrg   if (pipeline->type == RADV_PIPELINE_COMPUTE) {
1817ec681f3Smrg      free(pipeline->compute.rt_group_handles);
1827ec681f3Smrg      free(pipeline->compute.rt_stack_sizes);
1837ec681f3Smrg   } else if (pipeline->type == RADV_PIPELINE_LIBRARY) {
1847ec681f3Smrg      free(pipeline->library.groups);
1857ec681f3Smrg      free(pipeline->library.stages);
1867ec681f3Smrg   }
1877ec681f3Smrg
1887ec681f3Smrg   for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i)
1897ec681f3Smrg      if (pipeline->shaders[i])
1907ec681f3Smrg         radv_shader_variant_destroy(device, pipeline->shaders[i]);
1917ec681f3Smrg
1927ec681f3Smrg   if (pipeline->gs_copy_shader)
1937ec681f3Smrg      radv_shader_variant_destroy(device, pipeline->gs_copy_shader);
1947ec681f3Smrg
1957ec681f3Smrg   if (pipeline->cs.buf)
1967ec681f3Smrg      free(pipeline->cs.buf);
1977ec681f3Smrg
1987ec681f3Smrg   vk_object_base_finish(&pipeline->base);
1997ec681f3Smrg   vk_free2(&device->vk.alloc, allocator, pipeline);
2007ec681f3Smrg}
2017ec681f3Smrg
2027ec681f3Smrgvoid
2037ec681f3Smrgradv_DestroyPipeline(VkDevice _device, VkPipeline _pipeline,
2047ec681f3Smrg                     const VkAllocationCallbacks *pAllocator)
2057ec681f3Smrg{
2067ec681f3Smrg   RADV_FROM_HANDLE(radv_device, device, _device);
2077ec681f3Smrg   RADV_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
2087ec681f3Smrg
2097ec681f3Smrg   if (!_pipeline)
2107ec681f3Smrg      return;
2117ec681f3Smrg
2127ec681f3Smrg   radv_pipeline_destroy(device, pipeline, pAllocator);
2137ec681f3Smrg}
2147ec681f3Smrg
2157ec681f3Smrguint32_t
2167ec681f3Smrgradv_get_hash_flags(const struct radv_device *device, bool stats)
2177ec681f3Smrg{
2187ec681f3Smrg   uint32_t hash_flags = 0;
2197ec681f3Smrg
2207ec681f3Smrg   if (device->physical_device->use_ngg_culling)
2217ec681f3Smrg      hash_flags |= RADV_HASH_SHADER_USE_NGG_CULLING;
2227ec681f3Smrg   if (device->instance->perftest_flags & RADV_PERFTEST_FORCE_EMULATE_RT)
2237ec681f3Smrg      hash_flags |= RADV_HASH_SHADER_FORCE_EMULATE_RT;
2247ec681f3Smrg   if (device->physical_device->cs_wave_size == 32)
2257ec681f3Smrg      hash_flags |= RADV_HASH_SHADER_CS_WAVE32;
2267ec681f3Smrg   if (device->physical_device->ps_wave_size == 32)
2277ec681f3Smrg      hash_flags |= RADV_HASH_SHADER_PS_WAVE32;
2287ec681f3Smrg   if (device->physical_device->ge_wave_size == 32)
2297ec681f3Smrg      hash_flags |= RADV_HASH_SHADER_GE_WAVE32;
2307ec681f3Smrg   if (device->physical_device->use_llvm)
2317ec681f3Smrg      hash_flags |= RADV_HASH_SHADER_LLVM;
2327ec681f3Smrg   if (stats)
2337ec681f3Smrg      hash_flags |= RADV_HASH_SHADER_KEEP_STATISTICS;
2347ec681f3Smrg   if (device->robust_buffer_access) /* forces per-attribute vertex descriptors */
2357ec681f3Smrg      hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS;
2367ec681f3Smrg   if (device->robust_buffer_access2) /* affects load/store vectorizer */
2377ec681f3Smrg      hash_flags |= RADV_HASH_SHADER_ROBUST_BUFFER_ACCESS2;
2387ec681f3Smrg   return hash_flags;
2397ec681f3Smrg}
2407ec681f3Smrg
2417ec681f3Smrgstatic void
2427ec681f3Smrgradv_pipeline_init_scratch(const struct radv_device *device, struct radv_pipeline *pipeline)
2437ec681f3Smrg{
2447ec681f3Smrg   unsigned scratch_bytes_per_wave = 0;
2457ec681f3Smrg   unsigned max_waves = 0;
2467ec681f3Smrg
2477ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
2487ec681f3Smrg      if (pipeline->shaders[i] && pipeline->shaders[i]->config.scratch_bytes_per_wave) {
2497ec681f3Smrg         unsigned max_stage_waves = device->scratch_waves;
2507ec681f3Smrg
2517ec681f3Smrg         scratch_bytes_per_wave =
2527ec681f3Smrg            MAX2(scratch_bytes_per_wave, pipeline->shaders[i]->config.scratch_bytes_per_wave);
2537ec681f3Smrg
2547ec681f3Smrg         max_stage_waves =
2557ec681f3Smrg            MIN2(max_stage_waves, 4 * device->physical_device->rad_info.num_good_compute_units *
2567ec681f3Smrg                 radv_get_max_waves(device, pipeline->shaders[i], i));
2577ec681f3Smrg         max_waves = MAX2(max_waves, max_stage_waves);
2587ec681f3Smrg      }
2597ec681f3Smrg   }
2607ec681f3Smrg
2617ec681f3Smrg   pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
2627ec681f3Smrg   pipeline->max_waves = max_waves;
2637ec681f3Smrg}
2647ec681f3Smrg
2657ec681f3Smrgstatic uint32_t
2667ec681f3Smrgsi_translate_blend_function(VkBlendOp op)
2677ec681f3Smrg{
2687ec681f3Smrg   switch (op) {
2697ec681f3Smrg   case VK_BLEND_OP_ADD:
2707ec681f3Smrg      return V_028780_COMB_DST_PLUS_SRC;
2717ec681f3Smrg   case VK_BLEND_OP_SUBTRACT:
2727ec681f3Smrg      return V_028780_COMB_SRC_MINUS_DST;
2737ec681f3Smrg   case VK_BLEND_OP_REVERSE_SUBTRACT:
2747ec681f3Smrg      return V_028780_COMB_DST_MINUS_SRC;
2757ec681f3Smrg   case VK_BLEND_OP_MIN:
2767ec681f3Smrg      return V_028780_COMB_MIN_DST_SRC;
2777ec681f3Smrg   case VK_BLEND_OP_MAX:
2787ec681f3Smrg      return V_028780_COMB_MAX_DST_SRC;
2797ec681f3Smrg   default:
2807ec681f3Smrg      return 0;
2817ec681f3Smrg   }
2827ec681f3Smrg}
2837ec681f3Smrg
2847ec681f3Smrgstatic uint32_t
2857ec681f3Smrgsi_translate_blend_factor(VkBlendFactor factor)
2867ec681f3Smrg{
2877ec681f3Smrg   switch (factor) {
2887ec681f3Smrg   case VK_BLEND_FACTOR_ZERO:
2897ec681f3Smrg      return V_028780_BLEND_ZERO;
2907ec681f3Smrg   case VK_BLEND_FACTOR_ONE:
2917ec681f3Smrg      return V_028780_BLEND_ONE;
2927ec681f3Smrg   case VK_BLEND_FACTOR_SRC_COLOR:
2937ec681f3Smrg      return V_028780_BLEND_SRC_COLOR;
2947ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
2957ec681f3Smrg      return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
2967ec681f3Smrg   case VK_BLEND_FACTOR_DST_COLOR:
2977ec681f3Smrg      return V_028780_BLEND_DST_COLOR;
2987ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
2997ec681f3Smrg      return V_028780_BLEND_ONE_MINUS_DST_COLOR;
3007ec681f3Smrg   case VK_BLEND_FACTOR_SRC_ALPHA:
3017ec681f3Smrg      return V_028780_BLEND_SRC_ALPHA;
3027ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
3037ec681f3Smrg      return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
3047ec681f3Smrg   case VK_BLEND_FACTOR_DST_ALPHA:
3057ec681f3Smrg      return V_028780_BLEND_DST_ALPHA;
3067ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
3077ec681f3Smrg      return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
3087ec681f3Smrg   case VK_BLEND_FACTOR_CONSTANT_COLOR:
3097ec681f3Smrg      return V_028780_BLEND_CONSTANT_COLOR;
3107ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
3117ec681f3Smrg      return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
3127ec681f3Smrg   case VK_BLEND_FACTOR_CONSTANT_ALPHA:
3137ec681f3Smrg      return V_028780_BLEND_CONSTANT_ALPHA;
3147ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
3157ec681f3Smrg      return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
3167ec681f3Smrg   case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
3177ec681f3Smrg      return V_028780_BLEND_SRC_ALPHA_SATURATE;
3187ec681f3Smrg   case VK_BLEND_FACTOR_SRC1_COLOR:
3197ec681f3Smrg      return V_028780_BLEND_SRC1_COLOR;
3207ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
3217ec681f3Smrg      return V_028780_BLEND_INV_SRC1_COLOR;
3227ec681f3Smrg   case VK_BLEND_FACTOR_SRC1_ALPHA:
3237ec681f3Smrg      return V_028780_BLEND_SRC1_ALPHA;
3247ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
3257ec681f3Smrg      return V_028780_BLEND_INV_SRC1_ALPHA;
3267ec681f3Smrg   default:
3277ec681f3Smrg      return 0;
3287ec681f3Smrg   }
3297ec681f3Smrg}
3307ec681f3Smrg
3317ec681f3Smrgstatic uint32_t
3327ec681f3Smrgsi_translate_blend_opt_function(VkBlendOp op)
3337ec681f3Smrg{
3347ec681f3Smrg   switch (op) {
3357ec681f3Smrg   case VK_BLEND_OP_ADD:
3367ec681f3Smrg      return V_028760_OPT_COMB_ADD;
3377ec681f3Smrg   case VK_BLEND_OP_SUBTRACT:
3387ec681f3Smrg      return V_028760_OPT_COMB_SUBTRACT;
3397ec681f3Smrg   case VK_BLEND_OP_REVERSE_SUBTRACT:
3407ec681f3Smrg      return V_028760_OPT_COMB_REVSUBTRACT;
3417ec681f3Smrg   case VK_BLEND_OP_MIN:
3427ec681f3Smrg      return V_028760_OPT_COMB_MIN;
3437ec681f3Smrg   case VK_BLEND_OP_MAX:
3447ec681f3Smrg      return V_028760_OPT_COMB_MAX;
3457ec681f3Smrg   default:
3467ec681f3Smrg      return V_028760_OPT_COMB_BLEND_DISABLED;
3477ec681f3Smrg   }
3487ec681f3Smrg}
3497ec681f3Smrg
3507ec681f3Smrgstatic uint32_t
3517ec681f3Smrgsi_translate_blend_opt_factor(VkBlendFactor factor, bool is_alpha)
3527ec681f3Smrg{
3537ec681f3Smrg   switch (factor) {
3547ec681f3Smrg   case VK_BLEND_FACTOR_ZERO:
3557ec681f3Smrg      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
3567ec681f3Smrg   case VK_BLEND_FACTOR_ONE:
3577ec681f3Smrg      return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
3587ec681f3Smrg   case VK_BLEND_FACTOR_SRC_COLOR:
3597ec681f3Smrg      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
3607ec681f3Smrg                      : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
3617ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
3627ec681f3Smrg      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
3637ec681f3Smrg                      : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
3647ec681f3Smrg   case VK_BLEND_FACTOR_SRC_ALPHA:
3657ec681f3Smrg      return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
3667ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
3677ec681f3Smrg      return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
3687ec681f3Smrg   case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
3697ec681f3Smrg      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
3707ec681f3Smrg                      : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
3717ec681f3Smrg   default:
3727ec681f3Smrg      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
3737ec681f3Smrg   }
37401e04c3fSmrg}
37501e04c3fSmrg
37601e04c3fSmrg/**
37701e04c3fSmrg * Get rid of DST in the blend factors by commuting the operands:
37801e04c3fSmrg *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
37901e04c3fSmrg */
3807ec681f3Smrgstatic void
3817ec681f3Smrgsi_blend_remove_dst(VkBlendOp *func, VkBlendFactor *src_factor, VkBlendFactor *dst_factor,
3827ec681f3Smrg                    VkBlendFactor expected_dst, VkBlendFactor replacement_src)
3837ec681f3Smrg{
3847ec681f3Smrg   if (*src_factor == expected_dst && *dst_factor == VK_BLEND_FACTOR_ZERO) {
3857ec681f3Smrg      *src_factor = VK_BLEND_FACTOR_ZERO;
3867ec681f3Smrg      *dst_factor = replacement_src;
3877ec681f3Smrg
3887ec681f3Smrg      /* Commuting the operands requires reversing subtractions. */
3897ec681f3Smrg      if (*func == VK_BLEND_OP_SUBTRACT)
3907ec681f3Smrg         *func = VK_BLEND_OP_REVERSE_SUBTRACT;
3917ec681f3Smrg      else if (*func == VK_BLEND_OP_REVERSE_SUBTRACT)
3927ec681f3Smrg         *func = VK_BLEND_OP_SUBTRACT;
3937ec681f3Smrg   }
39401e04c3fSmrg}
39501e04c3fSmrg
3967ec681f3Smrgstatic bool
3977ec681f3Smrgsi_blend_factor_uses_dst(VkBlendFactor factor)
3987ec681f3Smrg{
3997ec681f3Smrg   return factor == VK_BLEND_FACTOR_DST_COLOR || factor == VK_BLEND_FACTOR_DST_ALPHA ||
4007ec681f3Smrg          factor == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
4017ec681f3Smrg          factor == VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA ||
4027ec681f3Smrg          factor == VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR;
4037ec681f3Smrg}
4047ec681f3Smrg
4057ec681f3Smrgstatic bool
4067ec681f3Smrgis_dual_src(VkBlendFactor factor)
4077ec681f3Smrg{
4087ec681f3Smrg   switch (factor) {
4097ec681f3Smrg   case VK_BLEND_FACTOR_SRC1_COLOR:
4107ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
4117ec681f3Smrg   case VK_BLEND_FACTOR_SRC1_ALPHA:
4127ec681f3Smrg   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
4137ec681f3Smrg      return true;
4147ec681f3Smrg   default:
4157ec681f3Smrg      return false;
4167ec681f3Smrg   }
4177ec681f3Smrg}
4187ec681f3Smrg
4197ec681f3Smrgstatic unsigned
4207ec681f3Smrgradv_choose_spi_color_format(const struct radv_device *device, VkFormat vk_format,
4217ec681f3Smrg                             bool blend_enable, bool blend_need_alpha)
4227ec681f3Smrg{
4237ec681f3Smrg   const struct util_format_description *desc = vk_format_description(vk_format);
4247ec681f3Smrg   bool use_rbplus = device->physical_device->rad_info.rbplus_allowed;
4257ec681f3Smrg   struct ac_spi_color_formats formats = {0};
4267ec681f3Smrg   unsigned format, ntype, swap;
4277ec681f3Smrg
4287ec681f3Smrg   format = radv_translate_colorformat(vk_format);
4297ec681f3Smrg   ntype = radv_translate_color_numformat(vk_format, desc,
4307ec681f3Smrg                                          vk_format_get_first_non_void_channel(vk_format));
4317ec681f3Smrg   swap = radv_translate_colorswap(vk_format, false);
4327ec681f3Smrg
4337ec681f3Smrg   ac_choose_spi_color_formats(format, swap, ntype, false, use_rbplus, &formats);
4347ec681f3Smrg
4357ec681f3Smrg   if (blend_enable && blend_need_alpha)
4367ec681f3Smrg      return formats.blend_alpha;
4377ec681f3Smrg   else if (blend_need_alpha)
4387ec681f3Smrg      return formats.alpha;
4397ec681f3Smrg   else if (blend_enable)
4407ec681f3Smrg      return formats.blend;
4417ec681f3Smrg   else
4427ec681f3Smrg      return formats.normal;
44301e04c3fSmrg}
44401e04c3fSmrg
44501e04c3fSmrgstatic bool
44601e04c3fSmrgformat_is_int8(VkFormat format)
44701e04c3fSmrg{
4487ec681f3Smrg   const struct util_format_description *desc = vk_format_description(format);
4497ec681f3Smrg   int channel = vk_format_get_first_non_void_channel(format);
45001e04c3fSmrg
4517ec681f3Smrg   return channel >= 0 && desc->channel[channel].pure_integer && desc->channel[channel].size == 8;
45201e04c3fSmrg}
45301e04c3fSmrg
45401e04c3fSmrgstatic bool
45501e04c3fSmrgformat_is_int10(VkFormat format)
45601e04c3fSmrg{
4577ec681f3Smrg   const struct util_format_description *desc = vk_format_description(format);
4587ec681f3Smrg
4597ec681f3Smrg   if (desc->nr_channels != 4)
4607ec681f3Smrg      return false;
4617ec681f3Smrg   for (unsigned i = 0; i < 4; i++) {
4627ec681f3Smrg      if (desc->channel[i].pure_integer && desc->channel[i].size == 10)
4637ec681f3Smrg         return true;
4647ec681f3Smrg   }
4657ec681f3Smrg   return false;
4667ec681f3Smrg}
46701e04c3fSmrg
/* Compute the SPI color export formats (SPI_SHADER_COL_FORMAT) and the
 * derived int8/int10 masks and CB shader mask for every color attachment
 * of the subpass, storing the results into *blend.
 */
static void
radv_pipeline_compute_spi_color_formats(const struct radv_pipeline *pipeline,
                                        const VkGraphicsPipelineCreateInfo *pCreateInfo,
                                        struct radv_blend_state *blend)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
   unsigned col_format = 0, is_int8 = 0, is_int10 = 0;
   unsigned num_targets;

   /* Only MRT0 is considered when single_cb_enable is set. */
   for (unsigned i = 0; i < (blend->single_cb_enable ? 1 : subpass->color_count); ++i) {
      unsigned cf;

      /* Unused or fully write-masked targets export nothing. */
      if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED ||
          !(blend->cb_target_mask & (0xfu << (i * 4)))) {
         cf = V_028714_SPI_SHADER_ZERO;
      } else {
         struct radv_render_pass_attachment *attachment =
            pass->attachments + subpass->color_attachments[i].attachment;
         bool blend_enable = blend->blend_enable_4bit & (0xfu << (i * 4));

         cf = radv_choose_spi_color_format(pipeline->device, attachment->format, blend_enable,
                                           blend->need_src_alpha & (1 << i));

         if (format_is_int8(attachment->format))
            is_int8 |= 1 << i;
         if (format_is_int10(attachment->format))
            is_int10 |= 1 << i;
      }

      /* Pack one 4-bit export format per target. */
      col_format |= cf << (4 * i);
   }

   if (!(col_format & 0xf) && blend->need_src_alpha & (1 << 0)) {
      /* When a subpass doesn't have any color attachments, write the
       * alpha channel of MRT0 when alpha coverage is enabled because
       * the depth attachment needs it.
       */
      col_format |= V_028714_SPI_SHADER_32_AR;
   }

   /* If the i-th target format is set, all previous target formats must
    * be non-zero to avoid hangs.
    */
   num_targets = (util_last_bit(col_format) + 3) / 4;
   for (unsigned i = 0; i < num_targets; i++) {
      if (!(col_format & (0xfu << (i * 4)))) {
         col_format |= V_028714_SPI_SHADER_32_R << (i * 4);
      }
   }

   /* The output for dual source blending should have the same format as
    * the first output.
    */
   if (blend->mrt0_is_dual_src) {
      assert(!(col_format >> 4));
      col_format |= (col_format & 0xf) << 4;
   }

   blend->cb_shader_mask = ac_get_cb_shader_mask(col_format);
   blend->spi_shader_col_format = col_format;
   blend->col_format_is_int8 = is_int8;
   blend->col_format_is_int10 = is_int10;
}
53201e04c3fSmrg
/*
 * One representative VkFormat per meta fragment-shader key.
 * Ordered so that for each i,
 * radv_format_meta_fs_key(radv_fs_key_format_exemplars[i]) == i.
 */
const VkFormat radv_fs_key_format_exemplars[NUM_META_FS_KEYS] = {
   VK_FORMAT_R32_SFLOAT,
   VK_FORMAT_R32G32_SFLOAT,
   VK_FORMAT_R8G8B8A8_UNORM,
   VK_FORMAT_R16G16B16A16_UNORM,
   VK_FORMAT_R16G16B16A16_SNORM,
   VK_FORMAT_R16G16B16A16_UINT,
   VK_FORMAT_R16G16B16A16_SINT,
   VK_FORMAT_R32G32B32A32_SFLOAT,
   VK_FORMAT_R8G8B8A8_UINT,
   VK_FORMAT_R8G8B8A8_SINT,
   VK_FORMAT_A2R10G10B10_UINT_PACK32,
   VK_FORMAT_A2R10G10B10_SINT_PACK32,
};
55101e04c3fSmrg
/* Map a color format to its index into radv_fs_key_format_exemplars[].
 * The index is derived from the SPI export format; the arithmetic below
 * relies on the V_028714_SPI_SHADER_* enum ordering matching the table
 * order (see the comment on radv_fs_key_format_exemplars).
 */
unsigned
radv_format_meta_fs_key(struct radv_device *device, VkFormat format)
{
   unsigned col_format = radv_choose_spi_color_format(device, format, false, false);
   assert(col_format != V_028714_SPI_SHADER_32_AR);

   bool is_int8 = format_is_int8(format);
   bool is_int10 = format_is_int10(format);

   /* UINT16/SINT16 exports are split further by 8-bit vs 10-bit
    * pure-integer attachment formats; these occupy slots 8-11. */
   if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int8)
      return 8;
   else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int8)
      return 9;
   else if (col_format == V_028714_SPI_SHADER_UINT16_ABGR && is_int10)
      return 10;
   else if (col_format == V_028714_SPI_SHADER_SINT16_ABGR && is_int10)
      return 11;
   else {
      if (col_format >= V_028714_SPI_SHADER_32_AR)
         --col_format; /* Skip V_028714_SPI_SHADER_32_AR  since there is no such VkFormat */

      --col_format; /* Skip V_028714_SPI_SHADER_ZERO */
      return col_format;
   }
}
57701e04c3fSmrg
57801e04c3fSmrgstatic void
5797ec681f3Smrgradv_blend_check_commutativity(struct radv_blend_state *blend, VkBlendOp op, VkBlendFactor src,
5807ec681f3Smrg                               VkBlendFactor dst, unsigned chanmask)
58101e04c3fSmrg{
5827ec681f3Smrg   /* Src factor is allowed when it does not depend on Dst. */
5837ec681f3Smrg   static const uint32_t src_allowed =
5847ec681f3Smrg      (1u << VK_BLEND_FACTOR_ONE) | (1u << VK_BLEND_FACTOR_SRC_COLOR) |
5857ec681f3Smrg      (1u << VK_BLEND_FACTOR_SRC_ALPHA) | (1u << VK_BLEND_FACTOR_SRC_ALPHA_SATURATE) |
5867ec681f3Smrg      (1u << VK_BLEND_FACTOR_CONSTANT_COLOR) | (1u << VK_BLEND_FACTOR_CONSTANT_ALPHA) |
5877ec681f3Smrg      (1u << VK_BLEND_FACTOR_SRC1_COLOR) | (1u << VK_BLEND_FACTOR_SRC1_ALPHA) |
5887ec681f3Smrg      (1u << VK_BLEND_FACTOR_ZERO) | (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR) |
5897ec681f3Smrg      (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA) |
5907ec681f3Smrg      (1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR) |
5917ec681f3Smrg      (1u << VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA) |
5927ec681f3Smrg      (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR) | (1u << VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA);
5937ec681f3Smrg
5947ec681f3Smrg   if (dst == VK_BLEND_FACTOR_ONE && (src_allowed & (1u << src))) {
5957ec681f3Smrg      /* Addition is commutative, but floating point addition isn't
5967ec681f3Smrg       * associative: subtle changes can be introduced via different
5977ec681f3Smrg       * rounding. Be conservative, only enable for min and max.
5987ec681f3Smrg       */
5997ec681f3Smrg      if (op == VK_BLEND_OP_MAX || op == VK_BLEND_OP_MIN)
6007ec681f3Smrg         blend->commutative_4bit |= chanmask;
6017ec681f3Smrg   }
6027ec681f3Smrg}
60301e04c3fSmrg
6047ec681f3Smrgstatic struct radv_blend_state
6057ec681f3Smrgradv_pipeline_init_blend_state(struct radv_pipeline *pipeline,
6067ec681f3Smrg                               const VkGraphicsPipelineCreateInfo *pCreateInfo,
6077ec681f3Smrg                               const struct radv_graphics_pipeline_create_info *extra)
6087ec681f3Smrg{
6097ec681f3Smrg   const VkPipelineColorBlendStateCreateInfo *vkblend =
6107ec681f3Smrg      radv_pipeline_get_color_blend_state(pCreateInfo);
6117ec681f3Smrg   const VkPipelineMultisampleStateCreateInfo *vkms =
6127ec681f3Smrg      radv_pipeline_get_multisample_state(pCreateInfo);
6137ec681f3Smrg   struct radv_blend_state blend = {0};
6147ec681f3Smrg   unsigned mode = V_028808_CB_NORMAL;
6157ec681f3Smrg   unsigned cb_color_control = 0;
6167ec681f3Smrg   int i;
6177ec681f3Smrg
6187ec681f3Smrg   if (extra && extra->custom_blend_mode) {
6197ec681f3Smrg      blend.single_cb_enable = true;
6207ec681f3Smrg      mode = extra->custom_blend_mode;
6217ec681f3Smrg   }
6227ec681f3Smrg
6237ec681f3Smrg   if (vkblend) {
6247ec681f3Smrg      if (vkblend->logicOpEnable)
6257ec681f3Smrg         cb_color_control |= S_028808_ROP3(si_translate_blend_logic_op(vkblend->logicOp));
6267ec681f3Smrg      else
6277ec681f3Smrg         cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
6287ec681f3Smrg   }
6297ec681f3Smrg
6307ec681f3Smrg   if (pipeline->device->instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING)
6317ec681f3Smrg   {
6327ec681f3Smrg      blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
6337ec681f3Smrg                               S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
6347ec681f3Smrg                               S_028B70_OFFSET_ROUND(0);
6357ec681f3Smrg   }
6367ec681f3Smrg   else
6377ec681f3Smrg   {
6387ec681f3Smrg      blend.db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
6397ec681f3Smrg                               S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
6407ec681f3Smrg                               S_028B70_OFFSET_ROUND(1);
6417ec681f3Smrg   }
6427ec681f3Smrg
6437ec681f3Smrg   if (vkms && vkms->alphaToCoverageEnable) {
6447ec681f3Smrg      blend.db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(1);
6457ec681f3Smrg      blend.need_src_alpha |= 0x1;
6467ec681f3Smrg   }
6477ec681f3Smrg
6487ec681f3Smrg   blend.cb_target_mask = 0;
6497ec681f3Smrg   if (vkblend) {
6507ec681f3Smrg      for (i = 0; i < vkblend->attachmentCount; i++) {
6517ec681f3Smrg         const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
6527ec681f3Smrg         unsigned blend_cntl = 0;
6537ec681f3Smrg         unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
6547ec681f3Smrg         VkBlendOp eqRGB = att->colorBlendOp;
6557ec681f3Smrg         VkBlendFactor srcRGB = att->srcColorBlendFactor;
6567ec681f3Smrg         VkBlendFactor dstRGB = att->dstColorBlendFactor;
6577ec681f3Smrg         VkBlendOp eqA = att->alphaBlendOp;
6587ec681f3Smrg         VkBlendFactor srcA = att->srcAlphaBlendFactor;
6597ec681f3Smrg         VkBlendFactor dstA = att->dstAlphaBlendFactor;
6607ec681f3Smrg
6617ec681f3Smrg         blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
6627ec681f3Smrg                                     S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
6637ec681f3Smrg
6647ec681f3Smrg         if (!att->colorWriteMask)
6657ec681f3Smrg            continue;
6667ec681f3Smrg
6677ec681f3Smrg         /* Ignore other blend targets if dual-source blending
6687ec681f3Smrg          * is enabled to prevent wrong behaviour.
6697ec681f3Smrg          */
6707ec681f3Smrg         if (blend.mrt0_is_dual_src)
6717ec681f3Smrg            continue;
6727ec681f3Smrg
6737ec681f3Smrg         blend.cb_target_mask |= (unsigned)att->colorWriteMask << (4 * i);
6747ec681f3Smrg         blend.cb_target_enabled_4bit |= 0xfu << (4 * i);
6757ec681f3Smrg         if (!att->blendEnable) {
6767ec681f3Smrg            blend.cb_blend_control[i] = blend_cntl;
6777ec681f3Smrg            continue;
6787ec681f3Smrg         }
6797ec681f3Smrg
6807ec681f3Smrg         if (is_dual_src(srcRGB) || is_dual_src(dstRGB) || is_dual_src(srcA) || is_dual_src(dstA))
6817ec681f3Smrg            if (i == 0)
6827ec681f3Smrg               blend.mrt0_is_dual_src = true;
6837ec681f3Smrg
6847ec681f3Smrg         if (eqRGB == VK_BLEND_OP_MIN || eqRGB == VK_BLEND_OP_MAX) {
6857ec681f3Smrg            srcRGB = VK_BLEND_FACTOR_ONE;
6867ec681f3Smrg            dstRGB = VK_BLEND_FACTOR_ONE;
6877ec681f3Smrg         }
6887ec681f3Smrg         if (eqA == VK_BLEND_OP_MIN || eqA == VK_BLEND_OP_MAX) {
6897ec681f3Smrg            srcA = VK_BLEND_FACTOR_ONE;
6907ec681f3Smrg            dstA = VK_BLEND_FACTOR_ONE;
6917ec681f3Smrg         }
6927ec681f3Smrg
6937ec681f3Smrg         radv_blend_check_commutativity(&blend, eqRGB, srcRGB, dstRGB, 0x7u << (4 * i));
6947ec681f3Smrg         radv_blend_check_commutativity(&blend, eqA, srcA, dstA, 0x8u << (4 * i));
6957ec681f3Smrg
6967ec681f3Smrg         /* Blending optimizations for RB+.
6977ec681f3Smrg          * These transformations don't change the behavior.
6987ec681f3Smrg          *
6997ec681f3Smrg          * First, get rid of DST in the blend factors:
7007ec681f3Smrg          *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
7017ec681f3Smrg          */
7027ec681f3Smrg         si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, VK_BLEND_FACTOR_DST_COLOR,
7037ec681f3Smrg                             VK_BLEND_FACTOR_SRC_COLOR);
7047ec681f3Smrg
7057ec681f3Smrg         si_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_COLOR,
7067ec681f3Smrg                             VK_BLEND_FACTOR_SRC_COLOR);
7077ec681f3Smrg
7087ec681f3Smrg         si_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_ALPHA,
7097ec681f3Smrg                             VK_BLEND_FACTOR_SRC_ALPHA);
7107ec681f3Smrg
7117ec681f3Smrg         /* Look up the ideal settings from tables. */
7127ec681f3Smrg         srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
7137ec681f3Smrg         dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
7147ec681f3Smrg         srcA_opt = si_translate_blend_opt_factor(srcA, true);
7157ec681f3Smrg         dstA_opt = si_translate_blend_opt_factor(dstA, true);
7167ec681f3Smrg
7177ec681f3Smrg         /* Handle interdependencies. */
7187ec681f3Smrg         if (si_blend_factor_uses_dst(srcRGB))
7197ec681f3Smrg            dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
7207ec681f3Smrg         if (si_blend_factor_uses_dst(srcA))
7217ec681f3Smrg            dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
7227ec681f3Smrg
7237ec681f3Smrg         if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
7247ec681f3Smrg             (dstRGB == VK_BLEND_FACTOR_ZERO || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
7257ec681f3Smrg              dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
7267ec681f3Smrg            dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
7277ec681f3Smrg
7287ec681f3Smrg         /* Set the final value. */
7297ec681f3Smrg         blend.sx_mrt_blend_opt[i] =
7307ec681f3Smrg            S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
7317ec681f3Smrg            S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
7327ec681f3Smrg            S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
7337ec681f3Smrg            S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));
7347ec681f3Smrg         blend_cntl |= S_028780_ENABLE(1);
7357ec681f3Smrg
7367ec681f3Smrg         blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
7377ec681f3Smrg         blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
7387ec681f3Smrg         blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
7397ec681f3Smrg         if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
7407ec681f3Smrg            blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
7417ec681f3Smrg            blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
7427ec681f3Smrg            blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
7437ec681f3Smrg            blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
7447ec681f3Smrg         }
7457ec681f3Smrg         blend.cb_blend_control[i] = blend_cntl;
7467ec681f3Smrg
7477ec681f3Smrg         blend.blend_enable_4bit |= 0xfu << (i * 4);
7487ec681f3Smrg
7497ec681f3Smrg         if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
7507ec681f3Smrg             srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
7517ec681f3Smrg             dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
7527ec681f3Smrg             srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA ||
7537ec681f3Smrg             dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
7547ec681f3Smrg            blend.need_src_alpha |= 1 << i;
7557ec681f3Smrg      }
7567ec681f3Smrg      for (i = vkblend->attachmentCount; i < 8; i++) {
7577ec681f3Smrg         blend.cb_blend_control[i] = 0;
7587ec681f3Smrg         blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
7597ec681f3Smrg                                     S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
7607ec681f3Smrg      }
7617ec681f3Smrg   }
7627ec681f3Smrg
7637ec681f3Smrg   if (pipeline->device->physical_device->rad_info.has_rbplus) {
7647ec681f3Smrg      /* Disable RB+ blend optimizations for dual source blending. */
7657ec681f3Smrg      if (blend.mrt0_is_dual_src) {
7667ec681f3Smrg         for (i = 0; i < 8; i++) {
7677ec681f3Smrg            blend.sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
7687ec681f3Smrg                                        S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
7697ec681f3Smrg         }
7707ec681f3Smrg      }
7717ec681f3Smrg
7727ec681f3Smrg      /* RB+ doesn't work with dual source blending, logic op and
7737ec681f3Smrg       * RESOLVE.
7747ec681f3Smrg       */
7757ec681f3Smrg      if (blend.mrt0_is_dual_src || (vkblend && vkblend->logicOpEnable) ||
7767ec681f3Smrg          mode == V_028808_CB_RESOLVE)
7777ec681f3Smrg         cb_color_control |= S_028808_DISABLE_DUAL_QUAD(1);
7787ec681f3Smrg   }
7797ec681f3Smrg
7807ec681f3Smrg   if (blend.cb_target_mask)
7817ec681f3Smrg      cb_color_control |= S_028808_MODE(mode);
7827ec681f3Smrg   else
7837ec681f3Smrg      cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
7847ec681f3Smrg
7857ec681f3Smrg   radv_pipeline_compute_spi_color_formats(pipeline, pCreateInfo, &blend);
7867ec681f3Smrg
7877ec681f3Smrg   pipeline->graphics.cb_color_control = cb_color_control;
7887ec681f3Smrg
7897ec681f3Smrg   return blend;
79001e04c3fSmrg}
79101e04c3fSmrg
7927ec681f3Smrgstatic uint32_t
7937ec681f3Smrgsi_translate_fill(VkPolygonMode func)
7947ec681f3Smrg{
7957ec681f3Smrg   switch (func) {
7967ec681f3Smrg   case VK_POLYGON_MODE_FILL:
7977ec681f3Smrg      return V_028814_X_DRAW_TRIANGLES;
7987ec681f3Smrg   case VK_POLYGON_MODE_LINE:
7997ec681f3Smrg      return V_028814_X_DRAW_LINES;
8007ec681f3Smrg   case VK_POLYGON_MODE_POINT:
8017ec681f3Smrg      return V_028814_X_DRAW_POINTS;
8027ec681f3Smrg   default:
8037ec681f3Smrg      assert(0);
8047ec681f3Smrg      return V_028814_X_DRAW_POINTS;
8057ec681f3Smrg   }
80601e04c3fSmrg}
80701e04c3fSmrg
8087ec681f3Smrgstatic uint8_t
8097ec681f3Smrgradv_pipeline_get_ps_iter_samples(const VkGraphicsPipelineCreateInfo *pCreateInfo)
8107ec681f3Smrg{
8117ec681f3Smrg   const VkPipelineMultisampleStateCreateInfo *vkms = pCreateInfo->pMultisampleState;
8127ec681f3Smrg   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
8137ec681f3Smrg   struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
8147ec681f3Smrg   uint32_t ps_iter_samples = 1;
8157ec681f3Smrg   uint32_t num_samples;
8167ec681f3Smrg
8177ec681f3Smrg   /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
8187ec681f3Smrg    *
8197ec681f3Smrg    * "If the VK_AMD_mixed_attachment_samples extension is enabled and the
8207ec681f3Smrg    *  subpass uses color attachments, totalSamples is the number of
8217ec681f3Smrg    *  samples of the color attachments. Otherwise, totalSamples is the
8227ec681f3Smrg    *  value of VkPipelineMultisampleStateCreateInfo::rasterizationSamples
8237ec681f3Smrg    *  specified at pipeline creation time."
8247ec681f3Smrg    */
8257ec681f3Smrg   if (subpass->has_color_att) {
8267ec681f3Smrg      num_samples = subpass->color_sample_count;
8277ec681f3Smrg   } else {
8287ec681f3Smrg      num_samples = vkms->rasterizationSamples;
8297ec681f3Smrg   }
8307ec681f3Smrg
8317ec681f3Smrg   if (vkms->sampleShadingEnable) {
8327ec681f3Smrg      ps_iter_samples = ceilf(vkms->minSampleShading * num_samples);
8337ec681f3Smrg      ps_iter_samples = util_next_power_of_two(ps_iter_samples);
8347ec681f3Smrg   }
8357ec681f3Smrg   return ps_iter_samples;
83601e04c3fSmrg}
83701e04c3fSmrg
83801e04c3fSmrgstatic bool
83901e04c3fSmrgradv_is_depth_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
84001e04c3fSmrg{
8417ec681f3Smrg   return pCreateInfo->depthTestEnable && pCreateInfo->depthWriteEnable &&
8427ec681f3Smrg          pCreateInfo->depthCompareOp != VK_COMPARE_OP_NEVER;
84301e04c3fSmrg}
84401e04c3fSmrg
84501e04c3fSmrgstatic bool
84601e04c3fSmrgradv_writes_stencil(const VkStencilOpState *state)
84701e04c3fSmrg{
8487ec681f3Smrg   return state->writeMask &&
8497ec681f3Smrg          (state->failOp != VK_STENCIL_OP_KEEP || state->passOp != VK_STENCIL_OP_KEEP ||
8507ec681f3Smrg           state->depthFailOp != VK_STENCIL_OP_KEEP);
85101e04c3fSmrg}
85201e04c3fSmrg
85301e04c3fSmrgstatic bool
85401e04c3fSmrgradv_is_stencil_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
85501e04c3fSmrg{
8567ec681f3Smrg   return pCreateInfo->stencilTestEnable &&
8577ec681f3Smrg          (radv_writes_stencil(&pCreateInfo->front) || radv_writes_stencil(&pCreateInfo->back));
85801e04c3fSmrg}
85901e04c3fSmrg
86001e04c3fSmrgstatic bool
86101e04c3fSmrgradv_is_ds_write_enabled(const VkPipelineDepthStencilStateCreateInfo *pCreateInfo)
86201e04c3fSmrg{
8637ec681f3Smrg   return radv_is_depth_write_enabled(pCreateInfo) || radv_is_stencil_write_enabled(pCreateInfo);
86401e04c3fSmrg}
86501e04c3fSmrg
86601e04c3fSmrgstatic bool
86701e04c3fSmrgradv_order_invariant_stencil_op(VkStencilOp op)
86801e04c3fSmrg{
8697ec681f3Smrg   /* REPLACE is normally order invariant, except when the stencil
8707ec681f3Smrg    * reference value is written by the fragment shader. Tracking this
8717ec681f3Smrg    * interaction does not seem worth the effort, so be conservative.
8727ec681f3Smrg    */
8737ec681f3Smrg   return op != VK_STENCIL_OP_INCREMENT_AND_CLAMP && op != VK_STENCIL_OP_DECREMENT_AND_CLAMP &&
8747ec681f3Smrg          op != VK_STENCIL_OP_REPLACE;
87501e04c3fSmrg}
87601e04c3fSmrg
87701e04c3fSmrgstatic bool
87801e04c3fSmrgradv_order_invariant_stencil_state(const VkStencilOpState *state)
87901e04c3fSmrg{
8807ec681f3Smrg   /* Compute whether, assuming Z writes are disabled, this stencil state
8817ec681f3Smrg    * is order invariant in the sense that the set of passing fragments as
8827ec681f3Smrg    * well as the final stencil buffer result does not depend on the order
8837ec681f3Smrg    * of fragments.
8847ec681f3Smrg    */
8857ec681f3Smrg   return !state->writeMask ||
8867ec681f3Smrg          /* The following assumes that Z writes are disabled. */
8877ec681f3Smrg          (state->compareOp == VK_COMPARE_OP_ALWAYS &&
8887ec681f3Smrg           radv_order_invariant_stencil_op(state->passOp) &&
8897ec681f3Smrg           radv_order_invariant_stencil_op(state->depthFailOp)) ||
8907ec681f3Smrg          (state->compareOp == VK_COMPARE_OP_NEVER &&
8917ec681f3Smrg           radv_order_invariant_stencil_op(state->failOp));
8927ec681f3Smrg}
8937ec681f3Smrg
8947ec681f3Smrgstatic bool
8957ec681f3Smrgradv_pipeline_has_dynamic_ds_states(const VkGraphicsPipelineCreateInfo *pCreateInfo)
8967ec681f3Smrg{
8977ec681f3Smrg   VkDynamicState ds_states[] = {
8987ec681f3Smrg      VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT, VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT,
8997ec681f3Smrg      VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT,  VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT,
9007ec681f3Smrg      VK_DYNAMIC_STATE_STENCIL_OP_EXT,
9017ec681f3Smrg   };
9027ec681f3Smrg
9037ec681f3Smrg   for (uint32_t i = 0; i < ARRAY_SIZE(ds_states); i++) {
9047ec681f3Smrg      if (radv_is_state_dynamic(pCreateInfo, ds_states[i]))
9057ec681f3Smrg         return true;
9067ec681f3Smrg   }
9077ec681f3Smrg
9087ec681f3Smrg   return false;
90901e04c3fSmrg}
91001e04c3fSmrg
91101e04c3fSmrgstatic bool
91201e04c3fSmrgradv_pipeline_out_of_order_rast(struct radv_pipeline *pipeline,
9137ec681f3Smrg                                const struct radv_blend_state *blend,
9147ec681f3Smrg                                const VkGraphicsPipelineCreateInfo *pCreateInfo)
9157ec681f3Smrg{
9167ec681f3Smrg   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
9177ec681f3Smrg   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
9187ec681f3Smrg   const VkPipelineDepthStencilStateCreateInfo *vkds =
9197ec681f3Smrg      radv_pipeline_get_depth_stencil_state(pCreateInfo);
9207ec681f3Smrg   const VkPipelineColorBlendStateCreateInfo *vkblend =
9217ec681f3Smrg      radv_pipeline_get_color_blend_state(pCreateInfo);
9227ec681f3Smrg   unsigned colormask = blend->cb_target_enabled_4bit;
9237ec681f3Smrg
9247ec681f3Smrg   if (!pipeline->device->physical_device->out_of_order_rast_allowed)
9257ec681f3Smrg      return false;
9267ec681f3Smrg
9277ec681f3Smrg   /* Be conservative if a logic operation is enabled with color buffers. */
9287ec681f3Smrg   if (colormask && vkblend && vkblend->logicOpEnable)
9297ec681f3Smrg      return false;
9307ec681f3Smrg
9317ec681f3Smrg   /* Be conservative if an extended dynamic depth/stencil state is
9327ec681f3Smrg    * enabled because the driver can't update out-of-order rasterization
9337ec681f3Smrg    * dynamically.
9347ec681f3Smrg    */
9357ec681f3Smrg   if (radv_pipeline_has_dynamic_ds_states(pCreateInfo))
9367ec681f3Smrg      return false;
9377ec681f3Smrg
9387ec681f3Smrg   /* Default depth/stencil invariance when no attachment is bound. */
9397ec681f3Smrg   struct radv_dsa_order_invariance dsa_order_invariant = {.zs = true, .pass_set = true};
9407ec681f3Smrg
9417ec681f3Smrg   if (vkds) {
9427ec681f3Smrg      struct radv_render_pass_attachment *attachment =
9437ec681f3Smrg         pass->attachments + subpass->depth_stencil_attachment->attachment;
9447ec681f3Smrg      bool has_stencil = vk_format_has_stencil(attachment->format);
9457ec681f3Smrg      struct radv_dsa_order_invariance order_invariance[2];
9467ec681f3Smrg      struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
9477ec681f3Smrg
9487ec681f3Smrg      /* Compute depth/stencil order invariance in order to know if
9497ec681f3Smrg       * it's safe to enable out-of-order.
9507ec681f3Smrg       */
9517ec681f3Smrg      bool zfunc_is_ordered = vkds->depthCompareOp == VK_COMPARE_OP_NEVER ||
9527ec681f3Smrg                              vkds->depthCompareOp == VK_COMPARE_OP_LESS ||
9537ec681f3Smrg                              vkds->depthCompareOp == VK_COMPARE_OP_LESS_OR_EQUAL ||
9547ec681f3Smrg                              vkds->depthCompareOp == VK_COMPARE_OP_GREATER ||
9557ec681f3Smrg                              vkds->depthCompareOp == VK_COMPARE_OP_GREATER_OR_EQUAL;
9567ec681f3Smrg
9577ec681f3Smrg      bool nozwrite_and_order_invariant_stencil =
9587ec681f3Smrg         !radv_is_ds_write_enabled(vkds) ||
9597ec681f3Smrg         (!radv_is_depth_write_enabled(vkds) && radv_order_invariant_stencil_state(&vkds->front) &&
9607ec681f3Smrg          radv_order_invariant_stencil_state(&vkds->back));
9617ec681f3Smrg
9627ec681f3Smrg      order_invariance[1].zs = nozwrite_and_order_invariant_stencil ||
9637ec681f3Smrg                               (!radv_is_stencil_write_enabled(vkds) && zfunc_is_ordered);
9647ec681f3Smrg      order_invariance[0].zs = !radv_is_depth_write_enabled(vkds) || zfunc_is_ordered;
9657ec681f3Smrg
9667ec681f3Smrg      order_invariance[1].pass_set =
9677ec681f3Smrg         nozwrite_and_order_invariant_stencil ||
9687ec681f3Smrg         (!radv_is_stencil_write_enabled(vkds) && (vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
9697ec681f3Smrg                                                   vkds->depthCompareOp == VK_COMPARE_OP_NEVER));
9707ec681f3Smrg      order_invariance[0].pass_set =
9717ec681f3Smrg         !radv_is_depth_write_enabled(vkds) || (vkds->depthCompareOp == VK_COMPARE_OP_ALWAYS ||
9727ec681f3Smrg                                                vkds->depthCompareOp == VK_COMPARE_OP_NEVER);
9737ec681f3Smrg
9747ec681f3Smrg      dsa_order_invariant = order_invariance[has_stencil];
9757ec681f3Smrg      if (!dsa_order_invariant.zs)
9767ec681f3Smrg         return false;
9777ec681f3Smrg
9787ec681f3Smrg      /* The set of PS invocations is always order invariant,
9797ec681f3Smrg       * except when early Z/S tests are requested.
9807ec681f3Smrg       */
9817ec681f3Smrg      if (ps && ps->info.ps.writes_memory && ps->info.ps.early_fragment_test &&
9827ec681f3Smrg          !dsa_order_invariant.pass_set)
9837ec681f3Smrg         return false;
9847ec681f3Smrg
9857ec681f3Smrg      /* Determine if out-of-order rasterization should be disabled
9867ec681f3Smrg       * when occlusion queries are used.
9877ec681f3Smrg       */
9887ec681f3Smrg      pipeline->graphics.disable_out_of_order_rast_for_occlusion = !dsa_order_invariant.pass_set;
9897ec681f3Smrg   }
9907ec681f3Smrg
9917ec681f3Smrg   /* No color buffers are enabled for writing. */
9927ec681f3Smrg   if (!colormask)
9937ec681f3Smrg      return true;
9947ec681f3Smrg
9957ec681f3Smrg   unsigned blendmask = colormask & blend->blend_enable_4bit;
9967ec681f3Smrg
9977ec681f3Smrg   if (blendmask) {
9987ec681f3Smrg      /* Only commutative blending. */
9997ec681f3Smrg      if (blendmask & ~blend->commutative_4bit)
10007ec681f3Smrg         return false;
10017ec681f3Smrg
10027ec681f3Smrg      if (!dsa_order_invariant.pass_set)
10037ec681f3Smrg         return false;
10047ec681f3Smrg   }
10057ec681f3Smrg
10067ec681f3Smrg   if (colormask & ~blendmask)
10077ec681f3Smrg      return false;
10087ec681f3Smrg
10097ec681f3Smrg   return true;
10107ec681f3Smrg}
10117ec681f3Smrg
10127ec681f3Smrgstatic const VkConservativeRasterizationModeEXT
10137ec681f3Smrgradv_get_conservative_raster_mode(const VkPipelineRasterizationStateCreateInfo *pCreateInfo)
10147ec681f3Smrg{
10157ec681f3Smrg   const VkPipelineRasterizationConservativeStateCreateInfoEXT *conservative_raster =
10167ec681f3Smrg      vk_find_struct_const(pCreateInfo->pNext,
10177ec681f3Smrg                           PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);
10187ec681f3Smrg
10197ec681f3Smrg   if (!conservative_raster)
10207ec681f3Smrg      return VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
10217ec681f3Smrg   return conservative_raster->conservativeRasterizationMode;
102201e04c3fSmrg}
102301e04c3fSmrg
102401e04c3fSmrgstatic void
102501e04c3fSmrgradv_pipeline_init_multisample_state(struct radv_pipeline *pipeline,
10267ec681f3Smrg                                     const struct radv_blend_state *blend,
10277ec681f3Smrg                                     const VkGraphicsPipelineCreateInfo *pCreateInfo)
10287ec681f3Smrg{
10297ec681f3Smrg   const VkPipelineMultisampleStateCreateInfo *vkms =
10307ec681f3Smrg      radv_pipeline_get_multisample_state(pCreateInfo);
10317ec681f3Smrg   struct radv_multisample_state *ms = &pipeline->graphics.ms;
10327ec681f3Smrg   unsigned num_tile_pipes = pipeline->device->physical_device->rad_info.num_tile_pipes;
10337ec681f3Smrg   const VkConservativeRasterizationModeEXT mode =
10347ec681f3Smrg      radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState);
10357ec681f3Smrg   bool out_of_order_rast = false;
10367ec681f3Smrg   int ps_iter_samples = 1;
10377ec681f3Smrg   uint32_t mask = 0xffff;
10387ec681f3Smrg
10397ec681f3Smrg   if (vkms) {
10407ec681f3Smrg      ms->num_samples = vkms->rasterizationSamples;
10417ec681f3Smrg
10427ec681f3Smrg      /* From the Vulkan 1.1.129 spec, 26.7. Sample Shading:
10437ec681f3Smrg       *
10447ec681f3Smrg       * "Sample shading is enabled for a graphics pipeline:
10457ec681f3Smrg       *
10467ec681f3Smrg       * - If the interface of the fragment shader entry point of the
10477ec681f3Smrg       *   graphics pipeline includes an input variable decorated
10487ec681f3Smrg       *   with SampleId or SamplePosition. In this case
10497ec681f3Smrg       *   minSampleShadingFactor takes the value 1.0.
10507ec681f3Smrg       * - Else if the sampleShadingEnable member of the
10517ec681f3Smrg       *   VkPipelineMultisampleStateCreateInfo structure specified
10527ec681f3Smrg       *   when creating the graphics pipeline is set to VK_TRUE. In
10537ec681f3Smrg       *   this case minSampleShadingFactor takes the value of
10547ec681f3Smrg       *   VkPipelineMultisampleStateCreateInfo::minSampleShading.
10557ec681f3Smrg       *
10567ec681f3Smrg       * Otherwise, sample shading is considered disabled."
10577ec681f3Smrg       */
10587ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.uses_sample_shading) {
10597ec681f3Smrg         ps_iter_samples = ms->num_samples;
10607ec681f3Smrg      } else {
10617ec681f3Smrg         ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo);
10627ec681f3Smrg      }
10637ec681f3Smrg   } else {
10647ec681f3Smrg      ms->num_samples = 1;
10657ec681f3Smrg   }
10667ec681f3Smrg
10677ec681f3Smrg   const struct VkPipelineRasterizationStateRasterizationOrderAMD *raster_order =
10687ec681f3Smrg      vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
10697ec681f3Smrg                           PIPELINE_RASTERIZATION_STATE_RASTERIZATION_ORDER_AMD);
10707ec681f3Smrg   if (raster_order && raster_order->rasterizationOrder == VK_RASTERIZATION_ORDER_RELAXED_AMD) {
10717ec681f3Smrg      /* Out-of-order rasterization is explicitly enabled by the
10727ec681f3Smrg       * application.
10737ec681f3Smrg       */
10747ec681f3Smrg      out_of_order_rast = true;
10757ec681f3Smrg   } else {
10767ec681f3Smrg      /* Determine if the driver can enable out-of-order
10777ec681f3Smrg       * rasterization internally.
10787ec681f3Smrg       */
10797ec681f3Smrg      out_of_order_rast = radv_pipeline_out_of_order_rast(pipeline, blend, pCreateInfo);
10807ec681f3Smrg   }
10817ec681f3Smrg
10827ec681f3Smrg   ms->pa_sc_aa_config = 0;
10837ec681f3Smrg   ms->db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
10847ec681f3Smrg                 S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
10857ec681f3Smrg
10867ec681f3Smrg   /* Adjust MSAA state if conservative rasterization is enabled. */
10877ec681f3Smrg   if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
10887ec681f3Smrg      ms->pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);
10897ec681f3Smrg
10907ec681f3Smrg      ms->db_eqaa |=
10917ec681f3Smrg         S_028804_ENABLE_POSTZ_OVERRASTERIZATION(1) | S_028804_OVERRASTERIZATION_AMOUNT(4);
10927ec681f3Smrg   }
10937ec681f3Smrg
10947ec681f3Smrg   ms->pa_sc_mode_cntl_1 =
10957ec681f3Smrg      S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
10967ec681f3Smrg      S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
10977ec681f3Smrg      S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
10987ec681f3Smrg      S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
10997ec681f3Smrg      /* always 1: */
11007ec681f3Smrg      S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
11017ec681f3Smrg      S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
11027ec681f3Smrg      S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
11037ec681f3Smrg   ms->pa_sc_mode_cntl_0 = S_028A48_ALTERNATE_RBS_PER_TILE(
11047ec681f3Smrg                              pipeline->device->physical_device->rad_info.chip_class >= GFX9) |
11057ec681f3Smrg                           S_028A48_VPORT_SCISSOR_ENABLE(1);
11067ec681f3Smrg
11077ec681f3Smrg   const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line = vk_find_struct_const(
11087ec681f3Smrg      pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
11097ec681f3Smrg   if (rast_line) {
11107ec681f3Smrg      ms->pa_sc_mode_cntl_0 |= S_028A48_LINE_STIPPLE_ENABLE(rast_line->stippledLineEnable);
11117ec681f3Smrg      if (rast_line->lineRasterizationMode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
11127ec681f3Smrg         /* From the Vulkan spec 1.1.129:
11137ec681f3Smrg          *
11147ec681f3Smrg          * "When VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT lines
11157ec681f3Smrg          *  are being rasterized, sample locations may all be
11167ec681f3Smrg          *  treated as being at the pixel center (this may
11177ec681f3Smrg          *  affect attribute and depth interpolation)."
11187ec681f3Smrg          */
11197ec681f3Smrg         ms->num_samples = 1;
11207ec681f3Smrg      }
11217ec681f3Smrg   }
11227ec681f3Smrg
11237ec681f3Smrg   if (ms->num_samples > 1) {
11247ec681f3Smrg      RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
11257ec681f3Smrg      struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
11267ec681f3Smrg      uint32_t z_samples =
11277ec681f3Smrg         subpass->depth_stencil_attachment ? subpass->depth_sample_count : ms->num_samples;
11287ec681f3Smrg      unsigned log_samples = util_logbase2(ms->num_samples);
11297ec681f3Smrg      unsigned log_z_samples = util_logbase2(z_samples);
11307ec681f3Smrg      unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
11317ec681f3Smrg      ms->pa_sc_mode_cntl_0 |= S_028A48_MSAA_ENABLE(1);
11327ec681f3Smrg      ms->db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
11337ec681f3Smrg                     S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
11347ec681f3Smrg                     S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
11357ec681f3Smrg                     S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
11367ec681f3Smrg      ms->pa_sc_aa_config |=
11377ec681f3Smrg         S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
11387ec681f3Smrg         S_028BE0_MAX_SAMPLE_DIST(radv_get_default_max_sample_dist(log_samples)) |
11397ec681f3Smrg         S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) | /* CM_R_028BE0_PA_SC_AA_CONFIG */
11407ec681f3Smrg         S_028BE0_COVERED_CENTROID_IS_CENTER(
11417ec681f3Smrg            pipeline->device->physical_device->rad_info.chip_class >= GFX10_3);
11427ec681f3Smrg      ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
11437ec681f3Smrg      if (ps_iter_samples > 1)
11447ec681f3Smrg         pipeline->graphics.spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
11457ec681f3Smrg   }
11467ec681f3Smrg
11477ec681f3Smrg   if (vkms && vkms->pSampleMask) {
11487ec681f3Smrg      mask = vkms->pSampleMask[0] & 0xffff;
11497ec681f3Smrg   }
11507ec681f3Smrg
11517ec681f3Smrg   ms->pa_sc_aa_mask[0] = mask | (mask << 16);
11527ec681f3Smrg   ms->pa_sc_aa_mask[1] = mask | (mask << 16);
11537ec681f3Smrg}
11547ec681f3Smrg
11557ec681f3Smrgstatic void
11567ec681f3Smrggfx103_pipeline_init_vrs_state(struct radv_pipeline *pipeline,
11577ec681f3Smrg                               const VkGraphicsPipelineCreateInfo *pCreateInfo)
11587ec681f3Smrg{
11597ec681f3Smrg   const VkPipelineMultisampleStateCreateInfo *vkms =
11607ec681f3Smrg      radv_pipeline_get_multisample_state(pCreateInfo);
11617ec681f3Smrg   struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
11627ec681f3Smrg   struct radv_multisample_state *ms = &pipeline->graphics.ms;
11637ec681f3Smrg   struct radv_vrs_state *vrs = &pipeline->graphics.vrs;
11647ec681f3Smrg
11657ec681f3Smrg   if (vkms && (vkms->sampleShadingEnable || ps->info.ps.uses_sample_shading ||
11667ec681f3Smrg                ps->info.ps.reads_sample_mask_in)) {
11677ec681f3Smrg      /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
11687ec681f3Smrg       *
11697ec681f3Smrg       * 1) sample shading is enabled or per-sample interpolation is
11707ec681f3Smrg       *    used by the fragment shader
11717ec681f3Smrg       * 2) the fragment shader reads gl_SampleMaskIn because the
11727ec681f3Smrg       *    16-bit sample coverage mask isn't enough for MSAA8x and
11737ec681f3Smrg       *    2x2 coarse shading isn't enough.
11747ec681f3Smrg       */
11757ec681f3Smrg      vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE);
11767ec681f3Smrg
11777ec681f3Smrg      /* Make sure sample shading is enabled even if only MSAA1x is
11787ec681f3Smrg       * used because the SAMPLE_ITER combiner is in passthrough
11797ec681f3Smrg       * mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate.
11807ec681f3Smrg       * The default VRS rate when sample shading is enabled is 1x1.
11817ec681f3Smrg       */
11827ec681f3Smrg      if (!G_028A4C_PS_ITER_SAMPLE(ms->pa_sc_mode_cntl_1))
11837ec681f3Smrg         ms->pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
11847ec681f3Smrg   } else {
11857ec681f3Smrg      vrs->pa_cl_vrs_cntl = S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
11867ec681f3Smrg   }
11877ec681f3Smrg
11887ec681f3Smrg   /* The primitive combiner is always passthrough. */
11897ec681f3Smrg   vrs->pa_cl_vrs_cntl |= S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_PASSTHRU);
119001e04c3fSmrg}
119101e04c3fSmrg
119201e04c3fSmrgstatic bool
119301e04c3fSmrgradv_prim_can_use_guardband(enum VkPrimitiveTopology topology)
119401e04c3fSmrg{
11957ec681f3Smrg   switch (topology) {
11967ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
11977ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
11987ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
11997ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
12007ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
12017ec681f3Smrg      return false;
12027ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
12037ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
12047ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
12057ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
12067ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
12077ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
12087ec681f3Smrg      return true;
12097ec681f3Smrg   default:
12107ec681f3Smrg      unreachable("unhandled primitive type");
12117ec681f3Smrg   }
121201e04c3fSmrg}
121301e04c3fSmrg
121401e04c3fSmrgstatic uint32_t
12157ec681f3Smrgsi_conv_gl_prim_to_gs_out(unsigned gl_prim)
12167ec681f3Smrg{
12177ec681f3Smrg   switch (gl_prim) {
12187ec681f3Smrg   case 0: /* GL_POINTS */
12197ec681f3Smrg      return V_028A6C_POINTLIST;
12207ec681f3Smrg   case 1:      /* GL_LINES */
12217ec681f3Smrg   case 3:      /* GL_LINE_STRIP */
12227ec681f3Smrg   case 0xA:    /* GL_LINE_STRIP_ADJACENCY_ARB */
12237ec681f3Smrg   case 0x8E7A: /* GL_ISOLINES */
12247ec681f3Smrg      return V_028A6C_LINESTRIP;
12257ec681f3Smrg
12267ec681f3Smrg   case 4:   /* GL_TRIANGLES */
12277ec681f3Smrg   case 0xc: /* GL_TRIANGLES_ADJACENCY_ARB */
12287ec681f3Smrg   case 5:   /* GL_TRIANGLE_STRIP */
12297ec681f3Smrg   case 7:   /* GL_QUADS */
12307ec681f3Smrg      return V_028A6C_TRISTRIP;
12317ec681f3Smrg   default:
12327ec681f3Smrg      assert(0);
12337ec681f3Smrg      return 0;
12347ec681f3Smrg   }
123501e04c3fSmrg}
123601e04c3fSmrg
12377ec681f3Smrgstatic uint64_t
12387ec681f3Smrgradv_dynamic_state_mask(VkDynamicState state)
123901e04c3fSmrg{
12407ec681f3Smrg   switch (state) {
12417ec681f3Smrg   case VK_DYNAMIC_STATE_VIEWPORT:
12427ec681f3Smrg   case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT_EXT:
12437ec681f3Smrg      return RADV_DYNAMIC_VIEWPORT;
12447ec681f3Smrg   case VK_DYNAMIC_STATE_SCISSOR:
12457ec681f3Smrg   case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT_EXT:
12467ec681f3Smrg      return RADV_DYNAMIC_SCISSOR;
12477ec681f3Smrg   case VK_DYNAMIC_STATE_LINE_WIDTH:
12487ec681f3Smrg      return RADV_DYNAMIC_LINE_WIDTH;
12497ec681f3Smrg   case VK_DYNAMIC_STATE_DEPTH_BIAS:
12507ec681f3Smrg      return RADV_DYNAMIC_DEPTH_BIAS;
12517ec681f3Smrg   case VK_DYNAMIC_STATE_BLEND_CONSTANTS:
12527ec681f3Smrg      return RADV_DYNAMIC_BLEND_CONSTANTS;
12537ec681f3Smrg   case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
12547ec681f3Smrg      return RADV_DYNAMIC_DEPTH_BOUNDS;
12557ec681f3Smrg   case VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK:
12567ec681f3Smrg      return RADV_DYNAMIC_STENCIL_COMPARE_MASK;
12577ec681f3Smrg   case VK_DYNAMIC_STATE_STENCIL_WRITE_MASK:
12587ec681f3Smrg      return RADV_DYNAMIC_STENCIL_WRITE_MASK;
12597ec681f3Smrg   case VK_DYNAMIC_STATE_STENCIL_REFERENCE:
12607ec681f3Smrg      return RADV_DYNAMIC_STENCIL_REFERENCE;
12617ec681f3Smrg   case VK_DYNAMIC_STATE_DISCARD_RECTANGLE_EXT:
12627ec681f3Smrg      return RADV_DYNAMIC_DISCARD_RECTANGLE;
12637ec681f3Smrg   case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
12647ec681f3Smrg      return RADV_DYNAMIC_SAMPLE_LOCATIONS;
12657ec681f3Smrg   case VK_DYNAMIC_STATE_LINE_STIPPLE_EXT:
12667ec681f3Smrg      return RADV_DYNAMIC_LINE_STIPPLE;
12677ec681f3Smrg   case VK_DYNAMIC_STATE_CULL_MODE_EXT:
12687ec681f3Smrg      return RADV_DYNAMIC_CULL_MODE;
12697ec681f3Smrg   case VK_DYNAMIC_STATE_FRONT_FACE_EXT:
12707ec681f3Smrg      return RADV_DYNAMIC_FRONT_FACE;
12717ec681f3Smrg   case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT:
12727ec681f3Smrg      return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
12737ec681f3Smrg   case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT:
12747ec681f3Smrg      return RADV_DYNAMIC_DEPTH_TEST_ENABLE;
12757ec681f3Smrg   case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT:
12767ec681f3Smrg      return RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
12777ec681f3Smrg   case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP_EXT:
12787ec681f3Smrg      return RADV_DYNAMIC_DEPTH_COMPARE_OP;
12797ec681f3Smrg   case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT:
12807ec681f3Smrg      return RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
12817ec681f3Smrg   case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT:
12827ec681f3Smrg      return RADV_DYNAMIC_STENCIL_TEST_ENABLE;
12837ec681f3Smrg   case VK_DYNAMIC_STATE_STENCIL_OP_EXT:
12847ec681f3Smrg      return RADV_DYNAMIC_STENCIL_OP;
12857ec681f3Smrg   case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT:
12867ec681f3Smrg      return RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE;
12877ec681f3Smrg   case VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR:
12887ec681f3Smrg      return RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
12897ec681f3Smrg   case VK_DYNAMIC_STATE_PATCH_CONTROL_POINTS_EXT:
12907ec681f3Smrg      return RADV_DYNAMIC_PATCH_CONTROL_POINTS;
12917ec681f3Smrg   case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT:
12927ec681f3Smrg      return RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
12937ec681f3Smrg   case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT:
12947ec681f3Smrg      return RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
12957ec681f3Smrg   case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
12967ec681f3Smrg      return RADV_DYNAMIC_LOGIC_OP;
12977ec681f3Smrg   case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE_EXT:
12987ec681f3Smrg      return RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
12997ec681f3Smrg   case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
13007ec681f3Smrg      return RADV_DYNAMIC_COLOR_WRITE_ENABLE;
13017ec681f3Smrg   case VK_DYNAMIC_STATE_VERTEX_INPUT_EXT:
13027ec681f3Smrg      return RADV_DYNAMIC_VERTEX_INPUT;
13037ec681f3Smrg   default:
13047ec681f3Smrg      unreachable("Unhandled dynamic state");
13057ec681f3Smrg   }
130601e04c3fSmrg}
130701e04c3fSmrg
13087ec681f3Smrgstatic bool
13097ec681f3Smrgradv_pipeline_is_blend_enabled(const VkGraphicsPipelineCreateInfo *pCreateInfo)
13107ec681f3Smrg{
13117ec681f3Smrg   const VkPipelineColorBlendStateCreateInfo *vkblend =
13127ec681f3Smrg      radv_pipeline_get_color_blend_state(pCreateInfo);
13137ec681f3Smrg
13147ec681f3Smrg   assert(vkblend);
13157ec681f3Smrg
13167ec681f3Smrg   for (uint32_t i = 0; i < vkblend->attachmentCount; i++) {
13177ec681f3Smrg      const VkPipelineColorBlendAttachmentState *att = &vkblend->pAttachments[i];
13187ec681f3Smrg      if (att->colorWriteMask && att->blendEnable)
13197ec681f3Smrg         return true;
13207ec681f3Smrg   }
13217ec681f3Smrg   return false;
132201e04c3fSmrg}
132301e04c3fSmrg
13247ec681f3Smrgstatic uint64_t
13257ec681f3Smrgradv_pipeline_needed_dynamic_state(const VkGraphicsPipelineCreateInfo *pCreateInfo)
13267ec681f3Smrg{
13277ec681f3Smrg   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
13287ec681f3Smrg   struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
13297ec681f3Smrg   uint64_t states = RADV_DYNAMIC_ALL;
13307ec681f3Smrg
13317ec681f3Smrg   /* If rasterization is disabled we do not care about any of the
13327ec681f3Smrg    * dynamic states, since they are all rasterization related only,
13337ec681f3Smrg    * except primitive topology, primitive restart enable, vertex
13347ec681f3Smrg    * binding stride and rasterization discard itself.
13357ec681f3Smrg    */
13367ec681f3Smrg   if (pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
13377ec681f3Smrg       !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE_EXT)) {
13387ec681f3Smrg      return RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
13397ec681f3Smrg             RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE |
13407ec681f3Smrg             RADV_DYNAMIC_VERTEX_INPUT;
13417ec681f3Smrg   }
13427ec681f3Smrg
13437ec681f3Smrg   if (!pCreateInfo->pRasterizationState->depthBiasEnable &&
13447ec681f3Smrg       !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE_EXT))
13457ec681f3Smrg      states &= ~RADV_DYNAMIC_DEPTH_BIAS;
13467ec681f3Smrg
13477ec681f3Smrg   if (!pCreateInfo->pDepthStencilState ||
13487ec681f3Smrg       (!pCreateInfo->pDepthStencilState->depthBoundsTestEnable &&
13497ec681f3Smrg        !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE_EXT)))
13507ec681f3Smrg      states &= ~RADV_DYNAMIC_DEPTH_BOUNDS;
13517ec681f3Smrg
13527ec681f3Smrg   if (!pCreateInfo->pDepthStencilState ||
13537ec681f3Smrg       (!pCreateInfo->pDepthStencilState->stencilTestEnable &&
13547ec681f3Smrg        !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE_EXT)))
13557ec681f3Smrg      states &= ~(RADV_DYNAMIC_STENCIL_COMPARE_MASK | RADV_DYNAMIC_STENCIL_WRITE_MASK |
13567ec681f3Smrg                  RADV_DYNAMIC_STENCIL_REFERENCE);
13577ec681f3Smrg
13587ec681f3Smrg   if (!vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT))
13597ec681f3Smrg      states &= ~RADV_DYNAMIC_DISCARD_RECTANGLE;
13607ec681f3Smrg
13617ec681f3Smrg   if (!pCreateInfo->pMultisampleState ||
13627ec681f3Smrg       !vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
13637ec681f3Smrg                             PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT))
13647ec681f3Smrg      states &= ~RADV_DYNAMIC_SAMPLE_LOCATIONS;
13657ec681f3Smrg
13667ec681f3Smrg   if (!pCreateInfo->pRasterizationState)
13677ec681f3Smrg      states &= ~RADV_DYNAMIC_LINE_STIPPLE;
13687ec681f3Smrg   else {
13697ec681f3Smrg      const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info = vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
13707ec681f3Smrg                                                                                                 PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
13717ec681f3Smrg      if (!rast_line_info || !rast_line_info->stippledLineEnable)
13727ec681f3Smrg         states &= ~RADV_DYNAMIC_LINE_STIPPLE;
13737ec681f3Smrg   }
13747ec681f3Smrg
13757ec681f3Smrg   if (!vk_find_struct_const(pCreateInfo->pNext,
13767ec681f3Smrg                             PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR) &&
13777ec681f3Smrg       !radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR))
13787ec681f3Smrg      states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
13797ec681f3Smrg
13807ec681f3Smrg   if (!subpass->has_color_att ||
13817ec681f3Smrg       !radv_pipeline_is_blend_enabled(pCreateInfo))
13827ec681f3Smrg      states &= ~RADV_DYNAMIC_BLEND_CONSTANTS;
13837ec681f3Smrg
13847ec681f3Smrg   if (!subpass->has_color_att)
13857ec681f3Smrg      states &= ~RADV_DYNAMIC_COLOR_WRITE_ENABLE;
13867ec681f3Smrg
13877ec681f3Smrg   return states;
13887ec681f3Smrg}
13897ec681f3Smrg
13907ec681f3Smrgstatic struct radv_ia_multi_vgt_param_helpers
13917ec681f3Smrgradv_compute_ia_multi_vgt_param_helpers(struct radv_pipeline *pipeline)
13927ec681f3Smrg{
13937ec681f3Smrg   struct radv_ia_multi_vgt_param_helpers ia_multi_vgt_param = {0};
13947ec681f3Smrg   const struct radv_device *device = pipeline->device;
13957ec681f3Smrg
13967ec681f3Smrg   if (radv_pipeline_has_tess(pipeline))
13977ec681f3Smrg      ia_multi_vgt_param.primgroup_size =
13987ec681f3Smrg         pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
13997ec681f3Smrg   else if (radv_pipeline_has_gs(pipeline))
14007ec681f3Smrg      ia_multi_vgt_param.primgroup_size = 64;
14017ec681f3Smrg   else
14027ec681f3Smrg      ia_multi_vgt_param.primgroup_size = 128; /* recommended without a GS */
14037ec681f3Smrg
14047ec681f3Smrg   /* GS requirement. */
14057ec681f3Smrg   ia_multi_vgt_param.partial_es_wave = false;
14067ec681f3Smrg   if (radv_pipeline_has_gs(pipeline) && device->physical_device->rad_info.chip_class <= GFX8)
14077ec681f3Smrg      if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
14087ec681f3Smrg         ia_multi_vgt_param.partial_es_wave = true;
14097ec681f3Smrg
14107ec681f3Smrg   ia_multi_vgt_param.ia_switch_on_eoi = false;
14117ec681f3Smrg   if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.ps.prim_id_input)
14127ec681f3Smrg      ia_multi_vgt_param.ia_switch_on_eoi = true;
14137ec681f3Smrg   if (radv_pipeline_has_gs(pipeline) && pipeline->shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id)
14147ec681f3Smrg      ia_multi_vgt_param.ia_switch_on_eoi = true;
14157ec681f3Smrg   if (radv_pipeline_has_tess(pipeline)) {
14167ec681f3Smrg      /* SWITCH_ON_EOI must be set if PrimID is used. */
14177ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
14187ec681f3Smrg          radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
14197ec681f3Smrg         ia_multi_vgt_param.ia_switch_on_eoi = true;
14207ec681f3Smrg   }
14217ec681f3Smrg
14227ec681f3Smrg   ia_multi_vgt_param.partial_vs_wave = false;
14237ec681f3Smrg   if (radv_pipeline_has_tess(pipeline)) {
14247ec681f3Smrg      /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
14257ec681f3Smrg      if ((device->physical_device->rad_info.family == CHIP_TAHITI ||
14267ec681f3Smrg           device->physical_device->rad_info.family == CHIP_PITCAIRN ||
14277ec681f3Smrg           device->physical_device->rad_info.family == CHIP_BONAIRE) &&
14287ec681f3Smrg          radv_pipeline_has_gs(pipeline))
14297ec681f3Smrg         ia_multi_vgt_param.partial_vs_wave = true;
14307ec681f3Smrg      /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */
14317ec681f3Smrg      if (device->physical_device->rad_info.has_distributed_tess) {
14327ec681f3Smrg         if (radv_pipeline_has_gs(pipeline)) {
14337ec681f3Smrg            if (device->physical_device->rad_info.chip_class <= GFX8)
14347ec681f3Smrg               ia_multi_vgt_param.partial_es_wave = true;
14357ec681f3Smrg         } else {
14367ec681f3Smrg            ia_multi_vgt_param.partial_vs_wave = true;
14377ec681f3Smrg         }
14387ec681f3Smrg      }
14397ec681f3Smrg   }
14407ec681f3Smrg
14417ec681f3Smrg   if (radv_pipeline_has_gs(pipeline)) {
14427ec681f3Smrg      /* On these chips there is the possibility of a hang if the
14437ec681f3Smrg       * pipeline uses a GS and partial_vs_wave is not set.
14447ec681f3Smrg       *
14457ec681f3Smrg       * This mostly does not hit 4-SE chips, as those typically set
14467ec681f3Smrg       * ia_switch_on_eoi and then partial_vs_wave is set for pipelines
14477ec681f3Smrg       * with GS due to another workaround.
14487ec681f3Smrg       *
14497ec681f3Smrg       * Reproducer: https://bugs.freedesktop.org/show_bug.cgi?id=109242
14507ec681f3Smrg       */
14517ec681f3Smrg      if (device->physical_device->rad_info.family == CHIP_TONGA ||
14527ec681f3Smrg          device->physical_device->rad_info.family == CHIP_FIJI ||
14537ec681f3Smrg          device->physical_device->rad_info.family == CHIP_POLARIS10 ||
14547ec681f3Smrg          device->physical_device->rad_info.family == CHIP_POLARIS11 ||
14557ec681f3Smrg          device->physical_device->rad_info.family == CHIP_POLARIS12 ||
14567ec681f3Smrg          device->physical_device->rad_info.family == CHIP_VEGAM) {
14577ec681f3Smrg         ia_multi_vgt_param.partial_vs_wave = true;
14587ec681f3Smrg      }
14597ec681f3Smrg   }
14607ec681f3Smrg
14617ec681f3Smrg   ia_multi_vgt_param.base =
14627ec681f3Smrg      S_028AA8_PRIMGROUP_SIZE(ia_multi_vgt_param.primgroup_size - 1) |
14637ec681f3Smrg      /* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
14647ec681f3Smrg      S_028AA8_MAX_PRIMGRP_IN_WAVE(device->physical_device->rad_info.chip_class == GFX8 ? 2 : 0) |
14657ec681f3Smrg      S_030960_EN_INST_OPT_BASIC(device->physical_device->rad_info.chip_class >= GFX9) |
14667ec681f3Smrg      S_030960_EN_INST_OPT_ADV(device->physical_device->rad_info.chip_class >= GFX9);
14677ec681f3Smrg
14687ec681f3Smrg   return ia_multi_vgt_param;
14697ec681f3Smrg}
14707ec681f3Smrg
14717ec681f3Smrgstatic void
14727ec681f3Smrgradv_pipeline_init_input_assembly_state(struct radv_pipeline *pipeline,
14737ec681f3Smrg                                        const VkGraphicsPipelineCreateInfo *pCreateInfo,
14747ec681f3Smrg                                        const struct radv_graphics_pipeline_create_info *extra)
14757ec681f3Smrg{
14767ec681f3Smrg   const VkPipelineInputAssemblyStateCreateInfo *ia_state = pCreateInfo->pInputAssemblyState;
14777ec681f3Smrg   struct radv_shader_variant *tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
14787ec681f3Smrg   struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
14797ec681f3Smrg
14807ec681f3Smrg   pipeline->graphics.can_use_guardband = radv_prim_can_use_guardband(ia_state->topology);
14817ec681f3Smrg
14827ec681f3Smrg   if (radv_pipeline_has_gs(pipeline)) {
14837ec681f3Smrg      if (si_conv_gl_prim_to_gs_out(gs->info.gs.output_prim) == V_028A6C_TRISTRIP)
14847ec681f3Smrg         pipeline->graphics.can_use_guardband = true;
14857ec681f3Smrg   } else if (radv_pipeline_has_tess(pipeline)) {
14867ec681f3Smrg      if (!tes->info.tes.point_mode &&
14877ec681f3Smrg          si_conv_gl_prim_to_gs_out(tes->info.tes.primitive_mode) == V_028A6C_TRISTRIP)
14887ec681f3Smrg         pipeline->graphics.can_use_guardband = true;
14897ec681f3Smrg   }
14907ec681f3Smrg
14917ec681f3Smrg   if (extra && extra->use_rectlist) {
14927ec681f3Smrg      pipeline->graphics.can_use_guardband = true;
14937ec681f3Smrg   }
14947ec681f3Smrg
14957ec681f3Smrg   pipeline->graphics.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param_helpers(pipeline);
14967ec681f3Smrg}
149701e04c3fSmrg
149801e04c3fSmrgstatic void
149901e04c3fSmrgradv_pipeline_init_dynamic_state(struct radv_pipeline *pipeline,
15007ec681f3Smrg                                 const VkGraphicsPipelineCreateInfo *pCreateInfo,
15017ec681f3Smrg                                 const struct radv_graphics_pipeline_create_info *extra)
15027ec681f3Smrg{
15037ec681f3Smrg   uint64_t needed_states = radv_pipeline_needed_dynamic_state(pCreateInfo);
15047ec681f3Smrg   uint64_t states = needed_states;
15057ec681f3Smrg   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
15067ec681f3Smrg   struct radv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
15077ec681f3Smrg
15087ec681f3Smrg   pipeline->dynamic_state = default_dynamic_state;
15097ec681f3Smrg   pipeline->graphics.needed_dynamic_state = needed_states;
15107ec681f3Smrg
15117ec681f3Smrg   if (pCreateInfo->pDynamicState) {
15127ec681f3Smrg      /* Remove all of the states that are marked as dynamic */
15137ec681f3Smrg      uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
15147ec681f3Smrg      for (uint32_t s = 0; s < count; s++)
15157ec681f3Smrg         states &= ~radv_dynamic_state_mask(pCreateInfo->pDynamicState->pDynamicStates[s]);
15167ec681f3Smrg   }
15177ec681f3Smrg
15187ec681f3Smrg   struct radv_dynamic_state *dynamic = &pipeline->dynamic_state;
15197ec681f3Smrg
15207ec681f3Smrg   if (needed_states & RADV_DYNAMIC_VIEWPORT) {
15217ec681f3Smrg      assert(pCreateInfo->pViewportState);
15227ec681f3Smrg
15237ec681f3Smrg      dynamic->viewport.count = pCreateInfo->pViewportState->viewportCount;
15247ec681f3Smrg      if (states & RADV_DYNAMIC_VIEWPORT) {
15257ec681f3Smrg         typed_memcpy(dynamic->viewport.viewports, pCreateInfo->pViewportState->pViewports,
15267ec681f3Smrg                      pCreateInfo->pViewportState->viewportCount);
15277ec681f3Smrg         for (unsigned i = 0; i < dynamic->viewport.count; i++)
15287ec681f3Smrg            radv_get_viewport_xform(&dynamic->viewport.viewports[i],
15297ec681f3Smrg                                    dynamic->viewport.xform[i].scale, dynamic->viewport.xform[i].translate);
15307ec681f3Smrg      }
15317ec681f3Smrg   }
15327ec681f3Smrg
15337ec681f3Smrg   if (needed_states & RADV_DYNAMIC_SCISSOR) {
15347ec681f3Smrg      dynamic->scissor.count = pCreateInfo->pViewportState->scissorCount;
15357ec681f3Smrg      if (states & RADV_DYNAMIC_SCISSOR) {
15367ec681f3Smrg         typed_memcpy(dynamic->scissor.scissors, pCreateInfo->pViewportState->pScissors,
15377ec681f3Smrg                      pCreateInfo->pViewportState->scissorCount);
15387ec681f3Smrg      }
15397ec681f3Smrg   }
15407ec681f3Smrg
15417ec681f3Smrg   if (states & RADV_DYNAMIC_LINE_WIDTH) {
15427ec681f3Smrg      assert(pCreateInfo->pRasterizationState);
15437ec681f3Smrg      dynamic->line_width = pCreateInfo->pRasterizationState->lineWidth;
15447ec681f3Smrg   }
15457ec681f3Smrg
15467ec681f3Smrg   if (states & RADV_DYNAMIC_DEPTH_BIAS) {
15477ec681f3Smrg      assert(pCreateInfo->pRasterizationState);
15487ec681f3Smrg      dynamic->depth_bias.bias = pCreateInfo->pRasterizationState->depthBiasConstantFactor;
15497ec681f3Smrg      dynamic->depth_bias.clamp = pCreateInfo->pRasterizationState->depthBiasClamp;
15507ec681f3Smrg      dynamic->depth_bias.slope = pCreateInfo->pRasterizationState->depthBiasSlopeFactor;
15517ec681f3Smrg   }
15527ec681f3Smrg
15537ec681f3Smrg   /* Section 9.2 of the Vulkan 1.0.15 spec says:
15547ec681f3Smrg    *
15557ec681f3Smrg    *    pColorBlendState is [...] NULL if the pipeline has rasterization
15567ec681f3Smrg    *    disabled or if the subpass of the render pass the pipeline is
15577ec681f3Smrg    *    created against does not use any color attachments.
15587ec681f3Smrg    */
15597ec681f3Smrg   if (states & RADV_DYNAMIC_BLEND_CONSTANTS) {
15607ec681f3Smrg      assert(pCreateInfo->pColorBlendState);
15617ec681f3Smrg      typed_memcpy(dynamic->blend_constants, pCreateInfo->pColorBlendState->blendConstants, 4);
15627ec681f3Smrg   }
15637ec681f3Smrg
15647ec681f3Smrg   if (states & RADV_DYNAMIC_CULL_MODE) {
15657ec681f3Smrg      dynamic->cull_mode = pCreateInfo->pRasterizationState->cullMode;
15667ec681f3Smrg   }
15677ec681f3Smrg
15687ec681f3Smrg   if (states & RADV_DYNAMIC_FRONT_FACE) {
15697ec681f3Smrg      dynamic->front_face = pCreateInfo->pRasterizationState->frontFace;
15707ec681f3Smrg   }
15717ec681f3Smrg
15727ec681f3Smrg   if (states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) {
15737ec681f3Smrg      dynamic->primitive_topology = si_translate_prim(pCreateInfo->pInputAssemblyState->topology);
15747ec681f3Smrg      if (extra && extra->use_rectlist) {
15757ec681f3Smrg         dynamic->primitive_topology = V_008958_DI_PT_RECTLIST;
15767ec681f3Smrg      }
15777ec681f3Smrg   }
15787ec681f3Smrg
15797ec681f3Smrg   /* If there is no depthstencil attachment, then don't read
15807ec681f3Smrg    * pDepthStencilState. The Vulkan spec states that pDepthStencilState may
15817ec681f3Smrg    * be NULL in this case. Even if pDepthStencilState is non-NULL, there is
15827ec681f3Smrg    * no need to override the depthstencil defaults in
15837ec681f3Smrg    * radv_pipeline::dynamic_state when there is no depthstencil attachment.
15847ec681f3Smrg    *
15857ec681f3Smrg    * Section 9.2 of the Vulkan 1.0.15 spec says:
15867ec681f3Smrg    *
15877ec681f3Smrg    *    pDepthStencilState is [...] NULL if the pipeline has rasterization
15887ec681f3Smrg    *    disabled or if the subpass of the render pass the pipeline is created
15897ec681f3Smrg    *    against does not use a depth/stencil attachment.
15907ec681f3Smrg    */
15917ec681f3Smrg   if (needed_states && subpass->depth_stencil_attachment) {
15927ec681f3Smrg      if (states & RADV_DYNAMIC_DEPTH_BOUNDS) {
15937ec681f3Smrg         dynamic->depth_bounds.min = pCreateInfo->pDepthStencilState->minDepthBounds;
15947ec681f3Smrg         dynamic->depth_bounds.max = pCreateInfo->pDepthStencilState->maxDepthBounds;
15957ec681f3Smrg      }
15967ec681f3Smrg
15977ec681f3Smrg      if (states & RADV_DYNAMIC_STENCIL_COMPARE_MASK) {
15987ec681f3Smrg         dynamic->stencil_compare_mask.front = pCreateInfo->pDepthStencilState->front.compareMask;
15997ec681f3Smrg         dynamic->stencil_compare_mask.back = pCreateInfo->pDepthStencilState->back.compareMask;
16007ec681f3Smrg      }
16017ec681f3Smrg
16027ec681f3Smrg      if (states & RADV_DYNAMIC_STENCIL_WRITE_MASK) {
16037ec681f3Smrg         dynamic->stencil_write_mask.front = pCreateInfo->pDepthStencilState->front.writeMask;
16047ec681f3Smrg         dynamic->stencil_write_mask.back = pCreateInfo->pDepthStencilState->back.writeMask;
16057ec681f3Smrg      }
16067ec681f3Smrg
16077ec681f3Smrg      if (states & RADV_DYNAMIC_STENCIL_REFERENCE) {
16087ec681f3Smrg         dynamic->stencil_reference.front = pCreateInfo->pDepthStencilState->front.reference;
16097ec681f3Smrg         dynamic->stencil_reference.back = pCreateInfo->pDepthStencilState->back.reference;
16107ec681f3Smrg      }
16117ec681f3Smrg
16127ec681f3Smrg      if (states & RADV_DYNAMIC_DEPTH_TEST_ENABLE) {
16137ec681f3Smrg         dynamic->depth_test_enable = pCreateInfo->pDepthStencilState->depthTestEnable;
16147ec681f3Smrg      }
16157ec681f3Smrg
16167ec681f3Smrg      if (states & RADV_DYNAMIC_DEPTH_WRITE_ENABLE) {
16177ec681f3Smrg         dynamic->depth_write_enable = pCreateInfo->pDepthStencilState->depthWriteEnable;
16187ec681f3Smrg      }
16197ec681f3Smrg
16207ec681f3Smrg      if (states & RADV_DYNAMIC_DEPTH_COMPARE_OP) {
16217ec681f3Smrg         dynamic->depth_compare_op = pCreateInfo->pDepthStencilState->depthCompareOp;
16227ec681f3Smrg      }
16237ec681f3Smrg
16247ec681f3Smrg      if (states & RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE) {
16257ec681f3Smrg         dynamic->depth_bounds_test_enable = pCreateInfo->pDepthStencilState->depthBoundsTestEnable;
16267ec681f3Smrg      }
16277ec681f3Smrg
16287ec681f3Smrg      if (states & RADV_DYNAMIC_STENCIL_TEST_ENABLE) {
16297ec681f3Smrg         dynamic->stencil_test_enable = pCreateInfo->pDepthStencilState->stencilTestEnable;
16307ec681f3Smrg      }
16317ec681f3Smrg
16327ec681f3Smrg      if (states & RADV_DYNAMIC_STENCIL_OP) {
16337ec681f3Smrg         dynamic->stencil_op.front.compare_op = pCreateInfo->pDepthStencilState->front.compareOp;
16347ec681f3Smrg         dynamic->stencil_op.front.fail_op = pCreateInfo->pDepthStencilState->front.failOp;
16357ec681f3Smrg         dynamic->stencil_op.front.pass_op = pCreateInfo->pDepthStencilState->front.passOp;
16367ec681f3Smrg         dynamic->stencil_op.front.depth_fail_op =
16377ec681f3Smrg            pCreateInfo->pDepthStencilState->front.depthFailOp;
16387ec681f3Smrg
16397ec681f3Smrg         dynamic->stencil_op.back.compare_op = pCreateInfo->pDepthStencilState->back.compareOp;
16407ec681f3Smrg         dynamic->stencil_op.back.fail_op = pCreateInfo->pDepthStencilState->back.failOp;
16417ec681f3Smrg         dynamic->stencil_op.back.pass_op = pCreateInfo->pDepthStencilState->back.passOp;
16427ec681f3Smrg         dynamic->stencil_op.back.depth_fail_op = pCreateInfo->pDepthStencilState->back.depthFailOp;
16437ec681f3Smrg      }
16447ec681f3Smrg   }
16457ec681f3Smrg
16467ec681f3Smrg   const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
16477ec681f3Smrg      vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
16487ec681f3Smrg   if (needed_states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
16497ec681f3Smrg      dynamic->discard_rectangle.count = discard_rectangle_info->discardRectangleCount;
16507ec681f3Smrg      if (states & RADV_DYNAMIC_DISCARD_RECTANGLE) {
16517ec681f3Smrg         typed_memcpy(dynamic->discard_rectangle.rectangles,
16527ec681f3Smrg                      discard_rectangle_info->pDiscardRectangles,
16537ec681f3Smrg                      discard_rectangle_info->discardRectangleCount);
16547ec681f3Smrg      }
16557ec681f3Smrg   }
16567ec681f3Smrg
16577ec681f3Smrg   if (needed_states & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
16587ec681f3Smrg      const VkPipelineSampleLocationsStateCreateInfoEXT *sample_location_info =
16597ec681f3Smrg         vk_find_struct_const(pCreateInfo->pMultisampleState->pNext,
16607ec681f3Smrg                              PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
16617ec681f3Smrg      /* If sampleLocationsEnable is VK_FALSE, the default sample
16627ec681f3Smrg       * locations are used and the values specified in
16637ec681f3Smrg       * sampleLocationsInfo are ignored.
16647ec681f3Smrg       */
16657ec681f3Smrg      if (sample_location_info->sampleLocationsEnable) {
16667ec681f3Smrg         const VkSampleLocationsInfoEXT *pSampleLocationsInfo =
16677ec681f3Smrg            &sample_location_info->sampleLocationsInfo;
16687ec681f3Smrg
16697ec681f3Smrg         assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
16707ec681f3Smrg
16717ec681f3Smrg         dynamic->sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
16727ec681f3Smrg         dynamic->sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
16737ec681f3Smrg         dynamic->sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
16747ec681f3Smrg         typed_memcpy(&dynamic->sample_location.locations[0],
16757ec681f3Smrg                      pSampleLocationsInfo->pSampleLocations,
16767ec681f3Smrg                      pSampleLocationsInfo->sampleLocationsCount);
16777ec681f3Smrg      }
16787ec681f3Smrg   }
16797ec681f3Smrg
16807ec681f3Smrg   const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_info = vk_find_struct_const(
16817ec681f3Smrg      pCreateInfo->pRasterizationState->pNext, PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
16827ec681f3Smrg   if (needed_states & RADV_DYNAMIC_LINE_STIPPLE) {
16837ec681f3Smrg      dynamic->line_stipple.factor = rast_line_info->lineStippleFactor;
16847ec681f3Smrg      dynamic->line_stipple.pattern = rast_line_info->lineStipplePattern;
16857ec681f3Smrg   }
16867ec681f3Smrg
16877ec681f3Smrg   if (!(states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE) ||
16887ec681f3Smrg       !(states & RADV_DYNAMIC_VERTEX_INPUT))
16897ec681f3Smrg      pipeline->graphics.uses_dynamic_stride = true;
16907ec681f3Smrg
16917ec681f3Smrg   const VkPipelineFragmentShadingRateStateCreateInfoKHR *shading_rate = vk_find_struct_const(
16927ec681f3Smrg      pCreateInfo->pNext, PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR);
16937ec681f3Smrg   if (states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE) {
16947ec681f3Smrg      dynamic->fragment_shading_rate.size = shading_rate->fragmentSize;
16957ec681f3Smrg      for (int i = 0; i < 2; i++)
16967ec681f3Smrg         dynamic->fragment_shading_rate.combiner_ops[i] = shading_rate->combinerOps[i];
16977ec681f3Smrg   }
16987ec681f3Smrg
16997ec681f3Smrg   if (states & RADV_DYNAMIC_DEPTH_BIAS_ENABLE) {
17007ec681f3Smrg      dynamic->depth_bias_enable = pCreateInfo->pRasterizationState->depthBiasEnable;
17017ec681f3Smrg   }
17027ec681f3Smrg
17037ec681f3Smrg   if (states & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE) {
17047ec681f3Smrg      dynamic->primitive_restart_enable =
17057ec681f3Smrg         !!pCreateInfo->pInputAssemblyState->primitiveRestartEnable;
17067ec681f3Smrg   }
17077ec681f3Smrg
17087ec681f3Smrg   if (states & RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
17097ec681f3Smrg      dynamic->rasterizer_discard_enable =
17107ec681f3Smrg         pCreateInfo->pRasterizationState->rasterizerDiscardEnable;
17117ec681f3Smrg   }
17127ec681f3Smrg
17137ec681f3Smrg   if (subpass->has_color_att && states & RADV_DYNAMIC_LOGIC_OP) {
17147ec681f3Smrg      if (pCreateInfo->pColorBlendState->logicOpEnable) {
17157ec681f3Smrg         dynamic->logic_op = si_translate_blend_logic_op(pCreateInfo->pColorBlendState->logicOp);
17167ec681f3Smrg      } else {
17177ec681f3Smrg         dynamic->logic_op = V_028808_ROP3_COPY;
17187ec681f3Smrg      }
17197ec681f3Smrg   }
17207ec681f3Smrg
17217ec681f3Smrg   if (states & RADV_DYNAMIC_COLOR_WRITE_ENABLE) {
17227ec681f3Smrg      const VkPipelineColorWriteCreateInfoEXT *color_write_info = vk_find_struct_const(
17237ec681f3Smrg         pCreateInfo->pColorBlendState->pNext, PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
17247ec681f3Smrg      if (color_write_info) {
17257ec681f3Smrg         dynamic->color_write_enable = 0;
17267ec681f3Smrg         for (uint32_t i = 0; i < color_write_info->attachmentCount; i++) {
17277ec681f3Smrg            dynamic->color_write_enable |=
17287ec681f3Smrg               color_write_info->pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
17297ec681f3Smrg         }
17307ec681f3Smrg      }
17317ec681f3Smrg   }
17327ec681f3Smrg
17337ec681f3Smrg   pipeline->dynamic_state.mask = states;
173401e04c3fSmrg}
173501e04c3fSmrg
173601e04c3fSmrgstatic void
17377ec681f3Smrgradv_pipeline_init_raster_state(struct radv_pipeline *pipeline,
17387ec681f3Smrg                                const VkGraphicsPipelineCreateInfo *pCreateInfo)
17397ec681f3Smrg{
17407ec681f3Smrg   const VkPipelineRasterizationStateCreateInfo *raster_info = pCreateInfo->pRasterizationState;
17417ec681f3Smrg   const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info =
17427ec681f3Smrg      vk_find_struct_const(raster_info->pNext,
17437ec681f3Smrg                           PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
17447ec681f3Smrg   bool provoking_vtx_last = false;
17457ec681f3Smrg
17467ec681f3Smrg   if (provoking_vtx_info &&
17477ec681f3Smrg       provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
17487ec681f3Smrg      provoking_vtx_last = true;
17497ec681f3Smrg   }
17507ec681f3Smrg
17517ec681f3Smrg   pipeline->graphics.pa_su_sc_mode_cntl =
17527ec681f3Smrg      S_028814_FACE(raster_info->frontFace) |
17537ec681f3Smrg      S_028814_CULL_FRONT(!!(raster_info->cullMode & VK_CULL_MODE_FRONT_BIT)) |
17547ec681f3Smrg      S_028814_CULL_BACK(!!(raster_info->cullMode & VK_CULL_MODE_BACK_BIT)) |
17557ec681f3Smrg      S_028814_POLY_MODE(raster_info->polygonMode != VK_POLYGON_MODE_FILL) |
17567ec681f3Smrg      S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(raster_info->polygonMode)) |
17577ec681f3Smrg      S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(raster_info->polygonMode)) |
17587ec681f3Smrg      S_028814_POLY_OFFSET_FRONT_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
17597ec681f3Smrg      S_028814_POLY_OFFSET_BACK_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
17607ec681f3Smrg      S_028814_POLY_OFFSET_PARA_ENABLE(raster_info->depthBiasEnable ? 1 : 0) |
17617ec681f3Smrg      S_028814_PROVOKING_VTX_LAST(provoking_vtx_last);
17627ec681f3Smrg
17637ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
17647ec681f3Smrg      /* It should also be set if PERPENDICULAR_ENDCAP_ENA is set. */
17657ec681f3Smrg      pipeline->graphics.pa_su_sc_mode_cntl |=
17667ec681f3Smrg         S_028814_KEEP_TOGETHER_ENABLE(raster_info->polygonMode != VK_POLYGON_MODE_FILL);
17677ec681f3Smrg   }
17687ec681f3Smrg
17697ec681f3Smrg   bool depth_clip_disable = raster_info->depthClampEnable;
17707ec681f3Smrg   const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
17717ec681f3Smrg      vk_find_struct_const(raster_info->pNext,
17727ec681f3Smrg                           PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
17737ec681f3Smrg   if (depth_clip_state) {
17747ec681f3Smrg      depth_clip_disable = !depth_clip_state->depthClipEnable;
17757ec681f3Smrg   }
17767ec681f3Smrg
17777ec681f3Smrg   pipeline->graphics.pa_cl_clip_cntl =
17787ec681f3Smrg      S_028810_DX_CLIP_SPACE_DEF(1) | // vulkan uses DX conventions.
17797ec681f3Smrg      S_028810_ZCLIP_NEAR_DISABLE(depth_clip_disable ? 1 : 0) |
17807ec681f3Smrg      S_028810_ZCLIP_FAR_DISABLE(depth_clip_disable ? 1 : 0) |
17817ec681f3Smrg      S_028810_DX_RASTERIZATION_KILL(raster_info->rasterizerDiscardEnable ? 1 : 0) |
17827ec681f3Smrg      S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
17837ec681f3Smrg
17847ec681f3Smrg   pipeline->graphics.uses_conservative_overestimate =
17857ec681f3Smrg      radv_get_conservative_raster_mode(pCreateInfo->pRasterizationState) ==
17867ec681f3Smrg         VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
17877ec681f3Smrg}
17887ec681f3Smrg
17897ec681f3Smrgstatic void
17907ec681f3Smrgradv_pipeline_init_depth_stencil_state(struct radv_pipeline *pipeline,
17917ec681f3Smrg                                       const VkGraphicsPipelineCreateInfo *pCreateInfo)
17927ec681f3Smrg{
17937ec681f3Smrg   const VkPipelineDepthStencilStateCreateInfo *ds_info =
17947ec681f3Smrg      radv_pipeline_get_depth_stencil_state(pCreateInfo);
17957ec681f3Smrg   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
17967ec681f3Smrg   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
17977ec681f3Smrg   struct radv_render_pass_attachment *attachment = NULL;
17987ec681f3Smrg   uint32_t db_depth_control = 0;
17997ec681f3Smrg
18007ec681f3Smrg   if (subpass->depth_stencil_attachment)
18017ec681f3Smrg      attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
18027ec681f3Smrg
18037ec681f3Smrg   bool has_depth_attachment = attachment && vk_format_has_depth(attachment->format);
18047ec681f3Smrg   bool has_stencil_attachment = attachment && vk_format_has_stencil(attachment->format);
18057ec681f3Smrg
18067ec681f3Smrg   if (ds_info) {
18077ec681f3Smrg      if (has_depth_attachment) {
18087ec681f3Smrg         db_depth_control = S_028800_Z_ENABLE(ds_info->depthTestEnable ? 1 : 0) |
18097ec681f3Smrg                            S_028800_Z_WRITE_ENABLE(ds_info->depthWriteEnable ? 1 : 0) |
18107ec681f3Smrg                            S_028800_ZFUNC(ds_info->depthCompareOp) |
18117ec681f3Smrg                            S_028800_DEPTH_BOUNDS_ENABLE(ds_info->depthBoundsTestEnable ? 1 : 0);
18127ec681f3Smrg      }
18137ec681f3Smrg
18147ec681f3Smrg      if (has_stencil_attachment && ds_info->stencilTestEnable) {
18157ec681f3Smrg         db_depth_control |= S_028800_STENCIL_ENABLE(1) | S_028800_BACKFACE_ENABLE(1);
18167ec681f3Smrg         db_depth_control |= S_028800_STENCILFUNC(ds_info->front.compareOp);
18177ec681f3Smrg         db_depth_control |= S_028800_STENCILFUNC_BF(ds_info->back.compareOp);
18187ec681f3Smrg      }
18197ec681f3Smrg   }
18207ec681f3Smrg
18217ec681f3Smrg   pipeline->graphics.db_depth_control = db_depth_control;
18227ec681f3Smrg}
18237ec681f3Smrg
18247ec681f3Smrgstatic void
18257ec681f3Smrggfx9_get_gs_info(const struct radv_pipeline_key *key, const struct radv_pipeline *pipeline,
18267ec681f3Smrg                 nir_shader **nir, struct radv_shader_info *infos, struct gfx9_gs_info *out)
18277ec681f3Smrg{
18287ec681f3Smrg   struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY];
18297ec681f3Smrg   struct radv_es_output_info *es_info;
18307ec681f3Smrg   bool has_tess = !!nir[MESA_SHADER_TESS_CTRL];
18317ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
18327ec681f3Smrg      es_info = has_tess ? &gs_info->tes.es_info : &gs_info->vs.es_info;
18337ec681f3Smrg   else
18347ec681f3Smrg      es_info = has_tess ? &infos[MESA_SHADER_TESS_EVAL].tes.es_info
18357ec681f3Smrg                         : &infos[MESA_SHADER_VERTEX].vs.es_info;
18367ec681f3Smrg
18377ec681f3Smrg   unsigned gs_num_invocations = MAX2(gs_info->gs.invocations, 1);
18387ec681f3Smrg   bool uses_adjacency;
18397ec681f3Smrg   switch (key->vs.topology) {
18407ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
18417ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
18427ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
18437ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
18447ec681f3Smrg      uses_adjacency = true;
18457ec681f3Smrg      break;
18467ec681f3Smrg   default:
18477ec681f3Smrg      uses_adjacency = false;
18487ec681f3Smrg      break;
18497ec681f3Smrg   }
18507ec681f3Smrg
18517ec681f3Smrg   /* All these are in dwords: */
18527ec681f3Smrg   /* We can't allow using the whole LDS, because GS waves compete with
18537ec681f3Smrg    * other shader stages for LDS space. */
18547ec681f3Smrg   const unsigned max_lds_size = 8 * 1024;
18557ec681f3Smrg   const unsigned esgs_itemsize = es_info->esgs_itemsize / 4;
18567ec681f3Smrg   unsigned esgs_lds_size;
18577ec681f3Smrg
18587ec681f3Smrg   /* All these are per subgroup: */
18597ec681f3Smrg   const unsigned max_out_prims = 32 * 1024;
18607ec681f3Smrg   const unsigned max_es_verts = 255;
18617ec681f3Smrg   const unsigned ideal_gs_prims = 64;
18627ec681f3Smrg   unsigned max_gs_prims, gs_prims;
18637ec681f3Smrg   unsigned min_es_verts, es_verts, worst_case_es_verts;
18647ec681f3Smrg
18657ec681f3Smrg   if (uses_adjacency || gs_num_invocations > 1)
18667ec681f3Smrg      max_gs_prims = 127 / gs_num_invocations;
18677ec681f3Smrg   else
18687ec681f3Smrg      max_gs_prims = 255;
18697ec681f3Smrg
18707ec681f3Smrg   /* MAX_PRIMS_PER_SUBGROUP = gs_prims * max_vert_out * gs_invocations.
18717ec681f3Smrg    * Make sure we don't go over the maximum value.
18727ec681f3Smrg    */
18737ec681f3Smrg   if (gs_info->gs.vertices_out > 0) {
18747ec681f3Smrg      max_gs_prims =
18757ec681f3Smrg         MIN2(max_gs_prims, max_out_prims / (gs_info->gs.vertices_out * gs_num_invocations));
18767ec681f3Smrg   }
18777ec681f3Smrg   assert(max_gs_prims > 0);
18787ec681f3Smrg
18797ec681f3Smrg   /* If the primitive has adjacency, halve the number of vertices
18807ec681f3Smrg    * that will be reused in multiple primitives.
18817ec681f3Smrg    */
18827ec681f3Smrg   min_es_verts = gs_info->gs.vertices_in / (uses_adjacency ? 2 : 1);
18837ec681f3Smrg
18847ec681f3Smrg   gs_prims = MIN2(ideal_gs_prims, max_gs_prims);
18857ec681f3Smrg   worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
18867ec681f3Smrg
18877ec681f3Smrg   /* Compute ESGS LDS size based on the worst case number of ES vertices
18887ec681f3Smrg    * needed to create the target number of GS prims per subgroup.
18897ec681f3Smrg    */
18907ec681f3Smrg   esgs_lds_size = esgs_itemsize * worst_case_es_verts;
18917ec681f3Smrg
18927ec681f3Smrg   /* If total LDS usage is too big, refactor partitions based on ratio
18937ec681f3Smrg    * of ESGS item sizes.
18947ec681f3Smrg    */
18957ec681f3Smrg   if (esgs_lds_size > max_lds_size) {
18967ec681f3Smrg      /* Our target GS Prims Per Subgroup was too large. Calculate
18977ec681f3Smrg       * the maximum number of GS Prims Per Subgroup that will fit
18987ec681f3Smrg       * into LDS, capped by the maximum that the hardware can support.
18997ec681f3Smrg       */
19007ec681f3Smrg      gs_prims = MIN2((max_lds_size / (esgs_itemsize * min_es_verts)), max_gs_prims);
19017ec681f3Smrg      assert(gs_prims > 0);
19027ec681f3Smrg      worst_case_es_verts = MIN2(min_es_verts * gs_prims, max_es_verts);
19037ec681f3Smrg
19047ec681f3Smrg      esgs_lds_size = esgs_itemsize * worst_case_es_verts;
19057ec681f3Smrg      assert(esgs_lds_size <= max_lds_size);
19067ec681f3Smrg   }
19077ec681f3Smrg
19087ec681f3Smrg   /* Now calculate remaining ESGS information. */
19097ec681f3Smrg   if (esgs_lds_size)
19107ec681f3Smrg      es_verts = MIN2(esgs_lds_size / esgs_itemsize, max_es_verts);
19117ec681f3Smrg   else
19127ec681f3Smrg      es_verts = max_es_verts;
19137ec681f3Smrg
19147ec681f3Smrg   /* Vertices for adjacency primitives are not always reused, so restore
19157ec681f3Smrg    * it for ES_VERTS_PER_SUBGRP.
19167ec681f3Smrg    */
19177ec681f3Smrg   min_es_verts = gs_info->gs.vertices_in;
19187ec681f3Smrg
19197ec681f3Smrg   /* For normal primitives, the VGT only checks if they are past the ES
19207ec681f3Smrg    * verts per subgroup after allocating a full GS primitive and if they
19217ec681f3Smrg    * are, kick off a new subgroup.  But if those additional ES verts are
19227ec681f3Smrg    * unique (e.g. not reused) we need to make sure there is enough LDS
19237ec681f3Smrg    * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP.
19247ec681f3Smrg    */
19257ec681f3Smrg   es_verts -= min_es_verts - 1;
19267ec681f3Smrg
19277ec681f3Smrg   uint32_t es_verts_per_subgroup = es_verts;
19287ec681f3Smrg   uint32_t gs_prims_per_subgroup = gs_prims;
19297ec681f3Smrg   uint32_t gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations;
19307ec681f3Smrg   uint32_t max_prims_per_subgroup = gs_inst_prims_in_subgroup * gs_info->gs.vertices_out;
19317ec681f3Smrg   out->lds_size = align(esgs_lds_size, 128) / 128;
19327ec681f3Smrg   out->vgt_gs_onchip_cntl = S_028A44_ES_VERTS_PER_SUBGRP(es_verts_per_subgroup) |
19337ec681f3Smrg                             S_028A44_GS_PRIMS_PER_SUBGRP(gs_prims_per_subgroup) |
19347ec681f3Smrg                             S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_inst_prims_in_subgroup);
19357ec681f3Smrg   out->vgt_gs_max_prims_per_subgroup = S_028A94_MAX_PRIMS_PER_SUBGROUP(max_prims_per_subgroup);
19367ec681f3Smrg   out->vgt_esgs_ring_itemsize = esgs_itemsize;
19377ec681f3Smrg   assert(max_prims_per_subgroup <= max_out_prims);
19387ec681f3Smrg
19397ec681f3Smrg   gl_shader_stage es_stage = has_tess ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
19407ec681f3Smrg   unsigned workgroup_size =
19417ec681f3Smrg      ac_compute_esgs_workgroup_size(
19427ec681f3Smrg         pipeline->device->physical_device->rad_info.chip_class, infos[es_stage].wave_size,
19437ec681f3Smrg         es_verts_per_subgroup, gs_inst_prims_in_subgroup);
19447ec681f3Smrg   infos[es_stage].workgroup_size = workgroup_size;
19457ec681f3Smrg   infos[MESA_SHADER_GEOMETRY].workgroup_size = workgroup_size;
19467ec681f3Smrg}
19477ec681f3Smrg
19487ec681f3Smrgstatic void
19497ec681f3Smrgclamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts, unsigned min_verts_per_prim,
19507ec681f3Smrg                         bool use_adjacency)
19517ec681f3Smrg{
19527ec681f3Smrg   unsigned max_reuse = max_esverts - min_verts_per_prim;
19537ec681f3Smrg   if (use_adjacency)
19547ec681f3Smrg      max_reuse /= 2;
19557ec681f3Smrg   *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
19567ec681f3Smrg}
19577ec681f3Smrg
19587ec681f3Smrgstatic unsigned
19597ec681f3Smrgradv_get_num_input_vertices(nir_shader **nir)
19607ec681f3Smrg{
19617ec681f3Smrg   if (nir[MESA_SHADER_GEOMETRY]) {
19627ec681f3Smrg      nir_shader *gs = nir[MESA_SHADER_GEOMETRY];
19637ec681f3Smrg
19647ec681f3Smrg      return gs->info.gs.vertices_in;
19657ec681f3Smrg   }
19667ec681f3Smrg
19677ec681f3Smrg   if (nir[MESA_SHADER_TESS_CTRL]) {
19687ec681f3Smrg      nir_shader *tes = nir[MESA_SHADER_TESS_EVAL];
19697ec681f3Smrg
19707ec681f3Smrg      if (tes->info.tess.point_mode)
19717ec681f3Smrg         return 1;
19727ec681f3Smrg      if (tes->info.tess.primitive_mode == GL_ISOLINES)
19737ec681f3Smrg         return 2;
19747ec681f3Smrg      return 3;
19757ec681f3Smrg   }
19767ec681f3Smrg
19777ec681f3Smrg   return 3;
19787ec681f3Smrg}
19797ec681f3Smrg
19807ec681f3Smrgstatic void
19817ec681f3Smrggfx10_emit_ge_pc_alloc(struct radeon_cmdbuf *cs, enum chip_class chip_class, uint32_t oversub_pc_lines)
19827ec681f3Smrg{
19837ec681f3Smrg   radeon_set_uconfig_reg(
19847ec681f3Smrg      cs, R_030980_GE_PC_ALLOC,
19857ec681f3Smrg      S_030980_OVERSUB_EN(oversub_pc_lines > 0) | S_030980_NUM_PC_LINES(oversub_pc_lines - 1));
19867ec681f3Smrg}
19877ec681f3Smrg
19887ec681f3Smrgstatic void
19897ec681f3Smrggfx10_get_ngg_info(const struct radv_pipeline_key *key, struct radv_pipeline *pipeline,
19907ec681f3Smrg                   nir_shader **nir, struct radv_shader_info *infos, struct gfx10_ngg_info *ngg)
19917ec681f3Smrg{
19927ec681f3Smrg   struct radv_shader_info *gs_info = &infos[MESA_SHADER_GEOMETRY];
19937ec681f3Smrg   struct radv_es_output_info *es_info =
19947ec681f3Smrg      nir[MESA_SHADER_TESS_CTRL] ? &gs_info->tes.es_info : &gs_info->vs.es_info;
19957ec681f3Smrg   unsigned gs_type = nir[MESA_SHADER_GEOMETRY] ? MESA_SHADER_GEOMETRY : MESA_SHADER_VERTEX;
19967ec681f3Smrg   unsigned max_verts_per_prim = radv_get_num_input_vertices(nir);
19977ec681f3Smrg   unsigned min_verts_per_prim = gs_type == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
19987ec681f3Smrg   unsigned gs_num_invocations = nir[MESA_SHADER_GEOMETRY] ? MAX2(gs_info->gs.invocations, 1) : 1;
19997ec681f3Smrg   bool uses_adjacency;
20007ec681f3Smrg   switch (key->vs.topology) {
20017ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
20027ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
20037ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
20047ec681f3Smrg   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
20057ec681f3Smrg      uses_adjacency = true;
20067ec681f3Smrg      break;
20077ec681f3Smrg   default:
20087ec681f3Smrg      uses_adjacency = false;
20097ec681f3Smrg      break;
20107ec681f3Smrg   }
20117ec681f3Smrg
20127ec681f3Smrg   /* All these are in dwords: */
20137ec681f3Smrg   /* We can't allow using the whole LDS, because GS waves compete with
20147ec681f3Smrg    * other shader stages for LDS space.
20157ec681f3Smrg    *
20167ec681f3Smrg    * TODO: We should really take the shader's internal LDS use into
20177ec681f3Smrg    *       account. The linker will fail if the size is greater than
20187ec681f3Smrg    *       8K dwords.
20197ec681f3Smrg    */
20207ec681f3Smrg   const unsigned max_lds_size = 8 * 1024 - 768;
20217ec681f3Smrg   const unsigned target_lds_size = max_lds_size;
20227ec681f3Smrg   unsigned esvert_lds_size = 0;
20237ec681f3Smrg   unsigned gsprim_lds_size = 0;
20247ec681f3Smrg
20257ec681f3Smrg   /* All these are per subgroup: */
20267ec681f3Smrg   const unsigned min_esverts =
20277ec681f3Smrg      pipeline->device->physical_device->rad_info.chip_class >= GFX10_3 ? 29 : 24;
20287ec681f3Smrg   bool max_vert_out_per_gs_instance = false;
20297ec681f3Smrg   unsigned max_esverts_base = 128;
20307ec681f3Smrg   unsigned max_gsprims_base = 128; /* default prim group size clamp */
20317ec681f3Smrg
20327ec681f3Smrg   /* Hardware has the following non-natural restrictions on the value
20337ec681f3Smrg    * of GE_CNTL.VERT_GRP_SIZE based on based on the primitive type of
20347ec681f3Smrg    * the draw:
20357ec681f3Smrg    *  - at most 252 for any line input primitive type
20367ec681f3Smrg    *  - at most 251 for any quad input primitive type
20377ec681f3Smrg    *  - at most 251 for triangle strips with adjacency (this happens to
20387ec681f3Smrg    *    be the natural limit for triangle *lists* with adjacency)
20397ec681f3Smrg    */
20407ec681f3Smrg   max_esverts_base = MIN2(max_esverts_base, 251 + max_verts_per_prim - 1);
20417ec681f3Smrg
20427ec681f3Smrg   if (gs_type == MESA_SHADER_GEOMETRY) {
20437ec681f3Smrg      unsigned max_out_verts_per_gsprim = gs_info->gs.vertices_out * gs_num_invocations;
20447ec681f3Smrg
20457ec681f3Smrg      if (max_out_verts_per_gsprim <= 256) {
20467ec681f3Smrg         if (max_out_verts_per_gsprim) {
20477ec681f3Smrg            max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
20487ec681f3Smrg         }
20497ec681f3Smrg      } else {
20507ec681f3Smrg         /* Use special multi-cycling mode in which each GS
20517ec681f3Smrg          * instance gets its own subgroup. Does not work with
20527ec681f3Smrg          * tessellation. */
20537ec681f3Smrg         max_vert_out_per_gs_instance = true;
20547ec681f3Smrg         max_gsprims_base = 1;
20557ec681f3Smrg         max_out_verts_per_gsprim = gs_info->gs.vertices_out;
20567ec681f3Smrg      }
20577ec681f3Smrg
20587ec681f3Smrg      esvert_lds_size = es_info->esgs_itemsize / 4;
20597ec681f3Smrg      gsprim_lds_size = (gs_info->gs.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
20607ec681f3Smrg   } else {
20617ec681f3Smrg      /* VS and TES. */
20627ec681f3Smrg      /* LDS size for passing data from GS to ES. */
20637ec681f3Smrg      struct radv_streamout_info *so_info = nir[MESA_SHADER_TESS_CTRL]
20647ec681f3Smrg                                               ? &infos[MESA_SHADER_TESS_EVAL].so
20657ec681f3Smrg                                               : &infos[MESA_SHADER_VERTEX].so;
20667ec681f3Smrg
20677ec681f3Smrg      if (so_info->num_outputs)
20687ec681f3Smrg         esvert_lds_size = 4 * so_info->num_outputs + 1;
20697ec681f3Smrg
20707ec681f3Smrg      /* GS stores Primitive IDs (one DWORD) into LDS at the address
20717ec681f3Smrg       * corresponding to the ES thread of the provoking vertex. All
20727ec681f3Smrg       * ES threads load and export PrimitiveID for their thread.
20737ec681f3Smrg       */
20747ec681f3Smrg      if (!nir[MESA_SHADER_TESS_CTRL] && infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id)
20757ec681f3Smrg         esvert_lds_size = MAX2(esvert_lds_size, 1);
20767ec681f3Smrg   }
20777ec681f3Smrg
20787ec681f3Smrg   unsigned max_gsprims = max_gsprims_base;
20797ec681f3Smrg   unsigned max_esverts = max_esverts_base;
20807ec681f3Smrg
20817ec681f3Smrg   if (esvert_lds_size)
20827ec681f3Smrg      max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
20837ec681f3Smrg   if (gsprim_lds_size)
20847ec681f3Smrg      max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
20857ec681f3Smrg
20867ec681f3Smrg   max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
20877ec681f3Smrg   clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
20887ec681f3Smrg   assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
20897ec681f3Smrg
20907ec681f3Smrg   if (esvert_lds_size || gsprim_lds_size) {
20917ec681f3Smrg      /* Now that we have a rough proportionality between esverts
20927ec681f3Smrg       * and gsprims based on the primitive type, scale both of them
20937ec681f3Smrg       * down simultaneously based on required LDS space.
20947ec681f3Smrg       *
20957ec681f3Smrg       * We could be smarter about this if we knew how much vertex
20967ec681f3Smrg       * reuse to expect.
20977ec681f3Smrg       */
20987ec681f3Smrg      unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
20997ec681f3Smrg      if (lds_total > target_lds_size) {
21007ec681f3Smrg         max_esverts = max_esverts * target_lds_size / lds_total;
21017ec681f3Smrg         max_gsprims = max_gsprims * target_lds_size / lds_total;
21027ec681f3Smrg
21037ec681f3Smrg         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
21047ec681f3Smrg         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
21057ec681f3Smrg         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
21067ec681f3Smrg      }
21077ec681f3Smrg   }
21087ec681f3Smrg
21097ec681f3Smrg   /* Round up towards full wave sizes for better ALU utilization. */
21107ec681f3Smrg   if (!max_vert_out_per_gs_instance) {
21117ec681f3Smrg      unsigned orig_max_esverts;
21127ec681f3Smrg      unsigned orig_max_gsprims;
21137ec681f3Smrg      unsigned wavesize;
21147ec681f3Smrg
21157ec681f3Smrg      if (gs_type == MESA_SHADER_GEOMETRY) {
21167ec681f3Smrg         wavesize = gs_info->wave_size;
21177ec681f3Smrg      } else {
21187ec681f3Smrg         wavesize = nir[MESA_SHADER_TESS_CTRL] ? infos[MESA_SHADER_TESS_EVAL].wave_size
21197ec681f3Smrg                                               : infos[MESA_SHADER_VERTEX].wave_size;
21207ec681f3Smrg      }
21217ec681f3Smrg
21227ec681f3Smrg      do {
21237ec681f3Smrg         orig_max_esverts = max_esverts;
21247ec681f3Smrg         orig_max_gsprims = max_gsprims;
21257ec681f3Smrg
21267ec681f3Smrg         max_esverts = align(max_esverts, wavesize);
21277ec681f3Smrg         max_esverts = MIN2(max_esverts, max_esverts_base);
21287ec681f3Smrg         if (esvert_lds_size)
21297ec681f3Smrg            max_esverts =
21307ec681f3Smrg               MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
21317ec681f3Smrg         max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
21327ec681f3Smrg
21337ec681f3Smrg         /* Hardware restriction: minimum value of max_esverts */
21347ec681f3Smrg         if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
21357ec681f3Smrg            max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
21367ec681f3Smrg         else
21377ec681f3Smrg            max_esverts = MAX2(max_esverts, min_esverts);
21387ec681f3Smrg
21397ec681f3Smrg         max_gsprims = align(max_gsprims, wavesize);
21407ec681f3Smrg         max_gsprims = MIN2(max_gsprims, max_gsprims_base);
21417ec681f3Smrg         if (gsprim_lds_size) {
21427ec681f3Smrg            /* Don't count unusable vertices to the LDS
21437ec681f3Smrg             * size. Those are vertices above the maximum
21447ec681f3Smrg             * number of vertices that can occur in the
21457ec681f3Smrg             * workgroup, which is e.g. max_gsprims * 3
21467ec681f3Smrg             * for triangles.
21477ec681f3Smrg             */
21487ec681f3Smrg            unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
21497ec681f3Smrg            max_gsprims = MIN2(max_gsprims,
21507ec681f3Smrg                               (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
21517ec681f3Smrg         }
21527ec681f3Smrg         clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, uses_adjacency);
21537ec681f3Smrg         assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
21547ec681f3Smrg      } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
21557ec681f3Smrg
21567ec681f3Smrg      /* Verify the restriction. */
21577ec681f3Smrg      if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
21587ec681f3Smrg         assert(max_esverts >= min_esverts - 1 + max_verts_per_prim);
21597ec681f3Smrg      else
21607ec681f3Smrg         assert(max_esverts >= min_esverts);
21617ec681f3Smrg   } else {
21627ec681f3Smrg      /* Hardware restriction: minimum value of max_esverts */
21637ec681f3Smrg      if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
21647ec681f3Smrg         max_esverts = MAX2(max_esverts, min_esverts - 1 + max_verts_per_prim);
21657ec681f3Smrg      else
21667ec681f3Smrg         max_esverts = MAX2(max_esverts, min_esverts);
21677ec681f3Smrg   }
21687ec681f3Smrg
21697ec681f3Smrg   unsigned max_out_vertices = max_vert_out_per_gs_instance ? gs_info->gs.vertices_out
21707ec681f3Smrg                               : gs_type == MESA_SHADER_GEOMETRY
21717ec681f3Smrg                                  ? max_gsprims * gs_num_invocations * gs_info->gs.vertices_out
21727ec681f3Smrg                                  : max_esverts;
21737ec681f3Smrg   assert(max_out_vertices <= 256);
21747ec681f3Smrg
21757ec681f3Smrg   unsigned prim_amp_factor = 1;
21767ec681f3Smrg   if (gs_type == MESA_SHADER_GEOMETRY) {
21777ec681f3Smrg      /* Number of output primitives per GS input primitive after
21787ec681f3Smrg       * GS instancing. */
21797ec681f3Smrg      prim_amp_factor = gs_info->gs.vertices_out;
21807ec681f3Smrg   }
21817ec681f3Smrg
21827ec681f3Smrg   /* On Gfx10, the GE only checks against the maximum number of ES verts
21837ec681f3Smrg    * after allocating a full GS primitive. So we need to ensure that
21847ec681f3Smrg    * whenever this check passes, there is enough space for a full
21857ec681f3Smrg    * primitive without vertex reuse.
21867ec681f3Smrg    */
21877ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class == GFX10)
21887ec681f3Smrg      ngg->hw_max_esverts = max_esverts - max_verts_per_prim + 1;
21897ec681f3Smrg   else
21907ec681f3Smrg      ngg->hw_max_esverts = max_esverts;
21917ec681f3Smrg
21927ec681f3Smrg   ngg->max_gsprims = max_gsprims;
21937ec681f3Smrg   ngg->max_out_verts = max_out_vertices;
21947ec681f3Smrg   ngg->prim_amp_factor = prim_amp_factor;
21957ec681f3Smrg   ngg->max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
21967ec681f3Smrg   ngg->ngg_emit_size = max_gsprims * gsprim_lds_size;
21977ec681f3Smrg   ngg->enable_vertex_grouping = true;
21987ec681f3Smrg
21997ec681f3Smrg   /* Don't count unusable vertices. */
22007ec681f3Smrg   ngg->esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) * esvert_lds_size * 4;
22017ec681f3Smrg
22027ec681f3Smrg   if (gs_type == MESA_SHADER_GEOMETRY) {
22037ec681f3Smrg      ngg->vgt_esgs_ring_itemsize = es_info->esgs_itemsize / 4;
22047ec681f3Smrg   } else {
22057ec681f3Smrg      ngg->vgt_esgs_ring_itemsize = 1;
22067ec681f3Smrg   }
22077ec681f3Smrg
22087ec681f3Smrg   assert(ngg->hw_max_esverts >= min_esverts); /* HW limitation */
22097ec681f3Smrg
22107ec681f3Smrg   gl_shader_stage es_stage = nir[MESA_SHADER_TESS_CTRL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
22117ec681f3Smrg   unsigned workgroup_size =
22127ec681f3Smrg      ac_compute_ngg_workgroup_size(
22137ec681f3Smrg         max_esverts, max_gsprims * gs_num_invocations, max_out_vertices, prim_amp_factor);
22147ec681f3Smrg   infos[MESA_SHADER_GEOMETRY].workgroup_size = workgroup_size;
22157ec681f3Smrg   infos[es_stage].workgroup_size = workgroup_size;
22167ec681f3Smrg}
22177ec681f3Smrg
22187ec681f3Smrgstatic void
22197ec681f3Smrgradv_pipeline_init_gs_ring_state(struct radv_pipeline *pipeline, const struct gfx9_gs_info *gs)
22207ec681f3Smrg{
22217ec681f3Smrg   struct radv_device *device = pipeline->device;
22227ec681f3Smrg   unsigned num_se = device->physical_device->rad_info.max_se;
22237ec681f3Smrg   unsigned wave_size = 64;
22247ec681f3Smrg   unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
22257ec681f3Smrg   /* On GFX6-GFX7, the value comes from VGT_GS_VERTEX_REUSE = 16.
22267ec681f3Smrg    * On GFX8+, the value comes from VGT_VERTEX_REUSE_BLOCK_CNTL = 30 (+2).
22277ec681f3Smrg    */
22287ec681f3Smrg   unsigned gs_vertex_reuse =
22297ec681f3Smrg      (device->physical_device->rad_info.chip_class >= GFX8 ? 32 : 16) * num_se;
22307ec681f3Smrg   unsigned alignment = 256 * num_se;
22317ec681f3Smrg   /* The maximum size is 63.999 MB per SE. */
22327ec681f3Smrg   unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
22337ec681f3Smrg   struct radv_shader_info *gs_info = &pipeline->shaders[MESA_SHADER_GEOMETRY]->info;
22347ec681f3Smrg
22357ec681f3Smrg   /* Calculate the minimum size. */
22367ec681f3Smrg   unsigned min_esgs_ring_size =
22377ec681f3Smrg      align(gs->vgt_esgs_ring_itemsize * 4 * gs_vertex_reuse * wave_size, alignment);
22387ec681f3Smrg   /* These are recommended sizes, not minimum sizes. */
22397ec681f3Smrg   unsigned esgs_ring_size =
22407ec681f3Smrg      max_gs_waves * 2 * wave_size * gs->vgt_esgs_ring_itemsize * 4 * gs_info->gs.vertices_in;
22417ec681f3Smrg   unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * gs_info->gs.max_gsvs_emit_size;
22427ec681f3Smrg
22437ec681f3Smrg   min_esgs_ring_size = align(min_esgs_ring_size, alignment);
22447ec681f3Smrg   esgs_ring_size = align(esgs_ring_size, alignment);
22457ec681f3Smrg   gsvs_ring_size = align(gsvs_ring_size, alignment);
22467ec681f3Smrg
22477ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class <= GFX8)
22487ec681f3Smrg      pipeline->graphics.esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
22497ec681f3Smrg
22507ec681f3Smrg   pipeline->graphics.gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
225101e04c3fSmrg}
225201e04c3fSmrg
225301e04c3fSmrgstruct radv_shader_variant *
22547ec681f3Smrgradv_get_shader(const struct radv_pipeline *pipeline, gl_shader_stage stage)
22557ec681f3Smrg{
22567ec681f3Smrg   if (stage == MESA_SHADER_VERTEX) {
22577ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_VERTEX])
22587ec681f3Smrg         return pipeline->shaders[MESA_SHADER_VERTEX];
22597ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
22607ec681f3Smrg         return pipeline->shaders[MESA_SHADER_TESS_CTRL];
22617ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_GEOMETRY])
22627ec681f3Smrg         return pipeline->shaders[MESA_SHADER_GEOMETRY];
22637ec681f3Smrg   } else if (stage == MESA_SHADER_TESS_EVAL) {
22647ec681f3Smrg      if (!radv_pipeline_has_tess(pipeline))
22657ec681f3Smrg         return NULL;
22667ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
22677ec681f3Smrg         return pipeline->shaders[MESA_SHADER_TESS_EVAL];
22687ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_GEOMETRY])
22697ec681f3Smrg         return pipeline->shaders[MESA_SHADER_GEOMETRY];
22707ec681f3Smrg   }
22717ec681f3Smrg   return pipeline->shaders[stage];
22727ec681f3Smrg}
227301e04c3fSmrg
22747ec681f3Smrgstatic const struct radv_vs_output_info *
22757ec681f3Smrgget_vs_output_info(const struct radv_pipeline *pipeline)
227601e04c3fSmrg{
22777ec681f3Smrg   if (radv_pipeline_has_gs(pipeline))
22787ec681f3Smrg      if (radv_pipeline_has_ngg(pipeline))
22797ec681f3Smrg         return &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.vs.outinfo;
22807ec681f3Smrg      else
22817ec681f3Smrg         return &pipeline->gs_copy_shader->info.vs.outinfo;
22827ec681f3Smrg   else if (radv_pipeline_has_tess(pipeline))
22837ec681f3Smrg      return &pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.outinfo;
22847ec681f3Smrg   else
22857ec681f3Smrg      return &pipeline->shaders[MESA_SHADER_VERTEX]->info.vs.outinfo;
22867ec681f3Smrg}
22877ec681f3Smrg
22887ec681f3Smrgstatic bool
22897ec681f3Smrgradv_nir_stage_uses_xfb(const nir_shader *nir)
22907ec681f3Smrg{
22917ec681f3Smrg   nir_xfb_info *xfb = nir_gather_xfb_info(nir, NULL);
22927ec681f3Smrg   bool uses_xfb = !!xfb;
22937ec681f3Smrg
22947ec681f3Smrg   ralloc_free(xfb);
22957ec681f3Smrg   return uses_xfb;
229601e04c3fSmrg}
229701e04c3fSmrg
/*
 * Cross-stage link step over the pipeline's NIR shaders.
 *
 * `shaders` is indexed by gl_shader_stage (NULL for absent stages).  The
 * pass builds a back-to-front ordering (last stage first), then:
 *  - links XFB varyings into the stage feeding the fragment shader,
 *  - scalarizes and shrinks inter-stage I/O so dead components can be
 *    eliminated,
 *  - demotes an unneeded PSIZ output to a temporary so it can be DCE'd,
 *  - removes unused varyings and compacts the remaining ones between each
 *    adjacent stage pair,
 *  - re-vectorizes I/O for stages whose I/O goes through LDS/rings.
 *
 * When `optimize_conservatively` is set, the aggressive cross-stage
 * optimizations are skipped and only minimal linking is done.
 */
static void
radv_link_shaders(struct radv_pipeline *pipeline,
                  const struct radv_pipeline_key *pipeline_key,
                  nir_shader **shaders,
                  bool optimize_conservatively)
{
   /* ordered_shaders[0] is the LAST stage; the array runs backwards,
    * so ordered_shaders[i - 1] is the consumer of ordered_shaders[i]. */
   nir_shader *ordered_shaders[MESA_SHADER_STAGES];
   int shader_count = 0;

   if (shaders[MESA_SHADER_FRAGMENT]) {
      ordered_shaders[shader_count++] = shaders[MESA_SHADER_FRAGMENT];
   }
   if (shaders[MESA_SHADER_GEOMETRY]) {
      ordered_shaders[shader_count++] = shaders[MESA_SHADER_GEOMETRY];
   }
   if (shaders[MESA_SHADER_TESS_EVAL]) {
      ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_EVAL];
   }
   if (shaders[MESA_SHADER_TESS_CTRL]) {
      ordered_shaders[shader_count++] = shaders[MESA_SHADER_TESS_CTRL];
   }
   if (shaders[MESA_SHADER_VERTEX]) {
      ordered_shaders[shader_count++] = shaders[MESA_SHADER_VERTEX];
   }
   if (shaders[MESA_SHADER_COMPUTE]) {
      ordered_shaders[shader_count++] = shaders[MESA_SHADER_COMPUTE];
   }

   /* Stages whose outputs must stay vectorized (LDS/on-chip I/O). */
   bool has_geom_tess = shaders[MESA_SHADER_GEOMETRY] || shaders[MESA_SHADER_TESS_CTRL];
   /* On GFX9+ the GS is merged with the preceding VS/TES HW stage. */
   bool merged_gs = shaders[MESA_SHADER_GEOMETRY] &&
                    pipeline->device->physical_device->rad_info.chip_class >= GFX9;

   if (!optimize_conservatively && shader_count > 1) {
      unsigned first = ordered_shaders[shader_count - 1]->info.stage;
      unsigned last = ordered_shaders[0]->info.stage;

      if (ordered_shaders[0]->info.stage == MESA_SHADER_FRAGMENT &&
          ordered_shaders[1]->info.has_transform_feedback_varyings)
         nir_link_xfb_varyings(ordered_shaders[1], ordered_shaders[0]);

      for (int i = 1; i < shader_count; ++i) {
         nir_lower_io_arrays_to_elements(ordered_shaders[i], ordered_shaders[i - 1]);
      }

      for (int i = 0; i < shader_count; ++i) {
         nir_variable_mode mask = 0;

         /* Don't scalarize the pipeline's external interface: the first
          * stage's inputs and the last stage's outputs stay untouched. */
         if (ordered_shaders[i]->info.stage != first)
            mask = mask | nir_var_shader_in;

         if (ordered_shaders[i]->info.stage != last)
            mask = mask | nir_var_shader_out;

         if (nir_lower_io_to_scalar_early(ordered_shaders[i], mask)) {
            /* Optimize the new vector code and then remove dead vars */
            nir_copy_prop(ordered_shaders[i]);
            nir_opt_shrink_vectors(ordered_shaders[i],
                                   !pipeline->device->instance->disable_shrink_image_store);

            if (ordered_shaders[i]->info.stage != last) {
               /* Optimize swizzled movs of load_const for
                * nir_link_opt_varyings's constant propagation
                */
               nir_opt_constant_folding(ordered_shaders[i]);
               /* For nir_link_opt_varyings's duplicate input opt */
               nir_opt_cse(ordered_shaders[i]);
            }

            /* Run copy-propagation to help remove dead
             * output variables (some shaders have useless
             * copies to/from an output), so compaction
             * later will be more effective.
             *
             * This will have been done earlier but it might
             * not have worked because the outputs were vector.
             */
            if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
               nir_opt_copy_prop_vars(ordered_shaders[i]);

            nir_opt_dce(ordered_shaders[i]);
            nir_remove_dead_variables(
               ordered_shaders[i], nir_var_function_temp | nir_var_shader_in | nir_var_shader_out,
               NULL);
         }
      }
   }

   /* PSIZ can only be dropped when transform feedback doesn't capture it. */
   bool uses_xfb = pipeline->graphics.last_vgt_api_stage != -1 &&
                   radv_nir_stage_uses_xfb(shaders[pipeline->graphics.last_vgt_api_stage]);
   if (!uses_xfb && !optimize_conservatively) {
      /* Remove PSIZ from shaders when it's not needed.
       * This is typically produced by translation layers like Zink or D9VK.
       */
      for (unsigned i = 0; i < shader_count; ++i) {
         shader_info *info = &ordered_shaders[i]->info;
         if (!(info->outputs_written & VARYING_BIT_PSIZ))
            continue;

         bool next_stage_needs_psiz =
            i != 0 && /* ordered_shaders is backwards, so next stage is: i - 1 */
            ordered_shaders[i - 1]->info.inputs_read & VARYING_BIT_PSIZ;
         /* Point topologies rasterize with the written point size, so the
          * last VGT stage must keep PSIZ in that case. */
         bool topology_uses_psiz =
            info->stage == pipeline->graphics.last_vgt_api_stage &&
            ((info->stage == MESA_SHADER_VERTEX && pipeline_key->vs.topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ||
             (info->stage == MESA_SHADER_TESS_EVAL && info->tess.point_mode) ||
             (info->stage == MESA_SHADER_GEOMETRY && info->gs.output_primitive == GL_POINTS));

         nir_variable *psiz_var =
               nir_find_variable_with_location(ordered_shaders[i], nir_var_shader_out, VARYING_SLOT_PSIZ);

         if (!next_stage_needs_psiz && !topology_uses_psiz && psiz_var) {
            /* Change PSIZ to a global variable which allows it to be DCE'd. */
            psiz_var->data.location = 0;
            psiz_var->data.mode = nir_var_shader_temp;

            info->outputs_written &= ~VARYING_BIT_PSIZ;
            nir_fixup_deref_modes(ordered_shaders[i]);
            nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_temp, NULL);
            nir_opt_dce(ordered_shaders[i]);
         }
      }
   }

   /* Per adjacent producer/consumer pair: optimize, prune and compact the
    * varyings between ordered_shaders[i] (producer) and [i - 1] (consumer). */
   for (int i = 1; !optimize_conservatively && (i < shader_count); ++i) {
      if (nir_link_opt_varyings(ordered_shaders[i], ordered_shaders[i - 1])) {
         nir_opt_constant_folding(ordered_shaders[i - 1]);
         nir_opt_algebraic(ordered_shaders[i - 1]);
         nir_opt_dce(ordered_shaders[i - 1]);
      }

      nir_remove_dead_variables(ordered_shaders[i], nir_var_shader_out, NULL);
      nir_remove_dead_variables(ordered_shaders[i - 1], nir_var_shader_in, NULL);

      bool progress = nir_remove_unused_varyings(ordered_shaders[i], ordered_shaders[i - 1]);

      nir_compact_varyings(ordered_shaders[i], ordered_shaders[i - 1], true);

      /* Re-vectorize outputs for stages whose I/O lives in LDS/rings. */
      if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL ||
          (ordered_shaders[i]->info.stage == MESA_SHADER_VERTEX && has_geom_tess) ||
          (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_EVAL && merged_gs)) {
         nir_lower_io_to_vector(ordered_shaders[i], nir_var_shader_out);
         if (ordered_shaders[i]->info.stage == MESA_SHADER_TESS_CTRL)
            nir_vectorize_tess_levels(ordered_shaders[i]);
         nir_opt_combine_stores(ordered_shaders[i], nir_var_shader_out);
      }
      if (ordered_shaders[i - 1]->info.stage == MESA_SHADER_GEOMETRY ||
          ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_CTRL ||
          ordered_shaders[i - 1]->info.stage == MESA_SHADER_TESS_EVAL) {
         nir_lower_io_to_vector(ordered_shaders[i - 1], nir_var_shader_in);
      }

      if (progress) {
         if (nir_lower_global_vars_to_local(ordered_shaders[i])) {
            ac_nir_lower_indirect_derefs(ordered_shaders[i],
                                         pipeline->device->physical_device->rad_info.chip_class);
            /* remove dead writes, which can remove input loads */
            nir_lower_vars_to_ssa(ordered_shaders[i]);
            nir_opt_dce(ordered_shaders[i]);
         }

         if (nir_lower_global_vars_to_local(ordered_shaders[i - 1])) {
            ac_nir_lower_indirect_derefs(ordered_shaders[i - 1],
                                         pipeline->device->physical_device->rad_info.chip_class);
         }
      }
   }
}
24657ec681f3Smrg
/*
 * Assign driver_location for every inter-stage varying and record the
 * number of linked I/O slots per stage pair in `infos`.
 *
 * On GFX9+ two API stages run as one HW stage (VS merged into TCS or GS,
 * TES merged into GS), so the VS/TES link counts are also copied into the
 * radv_shader_info entry of the stage they are merged into.  Only called
 * for graphics pipelines with a vertex shader.
 */
static void
radv_set_driver_locations(struct radv_pipeline *pipeline, nir_shader **shaders,
                          struct radv_shader_info infos[MESA_SHADER_STAGES])
{
   /* FS outputs map directly to MRT slots (location + dual-source index). */
   if (shaders[MESA_SHADER_FRAGMENT]) {
      nir_foreach_shader_out_variable(var, shaders[MESA_SHADER_FRAGMENT])
      {
         var->data.driver_location = var->data.location + var->data.index;
      }
   }

   if (!shaders[MESA_SHADER_VERTEX])
      return;

   bool has_tess = shaders[MESA_SHADER_TESS_CTRL];
   bool has_gs = shaders[MESA_SHADER_GEOMETRY];

   /* Merged stage for VS and TES */
   unsigned vs_info_idx = MESA_SHADER_VERTEX;
   unsigned tes_info_idx = MESA_SHADER_TESS_EVAL;

   if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
      /* These are merged into the next stage */
      vs_info_idx = has_tess ? MESA_SHADER_TESS_CTRL : MESA_SHADER_GEOMETRY;
      tes_info_idx = has_gs ? MESA_SHADER_GEOMETRY : MESA_SHADER_TESS_EVAL;
   }

   /* VS inputs (vertex attributes) use their API location directly. */
   nir_foreach_shader_in_variable (var, shaders[MESA_SHADER_VERTEX]) {
      var->data.driver_location = var->data.location;
   }

   if (has_tess) {
      /* Assign packed locations for VS->TCS and TCS->TES interfaces. */
      nir_linked_io_var_info vs2tcs = nir_assign_linked_io_var_locations(
         shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_TESS_CTRL]);
      nir_linked_io_var_info tcs2tes = nir_assign_linked_io_var_locations(
         shaders[MESA_SHADER_TESS_CTRL], shaders[MESA_SHADER_TESS_EVAL]);

      infos[MESA_SHADER_VERTEX].vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
      infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs = vs2tcs.num_linked_io_vars;
      infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs = tcs2tes.num_linked_io_vars;
      infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs = tcs2tes.num_linked_patch_io_vars;
      infos[MESA_SHADER_TESS_EVAL].tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
      infos[MESA_SHADER_TESS_EVAL].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;

      /* Copy data to merged stage */
      infos[vs_info_idx].vs.num_linked_outputs = vs2tcs.num_linked_io_vars;
      infos[tes_info_idx].tes.num_linked_inputs = tcs2tes.num_linked_io_vars;
      infos[tes_info_idx].tes.num_linked_patch_inputs = tcs2tes.num_linked_patch_io_vars;

      if (has_gs) {
         nir_linked_io_var_info tes2gs = nir_assign_linked_io_var_locations(
            shaders[MESA_SHADER_TESS_EVAL], shaders[MESA_SHADER_GEOMETRY]);

         infos[MESA_SHADER_TESS_EVAL].tes.num_linked_outputs = tes2gs.num_linked_io_vars;
         infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = tes2gs.num_linked_io_vars;

         /* Copy data to merged stage */
         infos[tes_info_idx].tes.num_linked_outputs = tes2gs.num_linked_io_vars;
      }
   } else if (has_gs) {
      /* No tessellation: only the VS->GS interface needs linking. */
      nir_linked_io_var_info vs2gs = nir_assign_linked_io_var_locations(
         shaders[MESA_SHADER_VERTEX], shaders[MESA_SHADER_GEOMETRY]);

      infos[MESA_SHADER_VERTEX].vs.num_linked_outputs = vs2gs.num_linked_io_vars;
      infos[MESA_SHADER_GEOMETRY].gs.num_linked_inputs = vs2gs.num_linked_io_vars;

      /* Copy data to merged stage */
      infos[vs_info_idx].vs.num_linked_outputs = vs2gs.num_linked_io_vars;
   }

   /* Outputs of the last pre-rasterization stage keep their API locations
    * (consumed by the FS or by fixed-function hardware). */
   assert(pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE);
   nir_foreach_shader_out_variable(var, shaders[pipeline->graphics.last_vgt_api_stage])
   {
      var->data.driver_location = var->data.location;
   }
}
254201e04c3fSmrg
2543ed98bd31Smayastatic uint32_t
2544ed98bd31Smayaradv_get_attrib_stride(const VkPipelineVertexInputStateCreateInfo *input_state,
25457ec681f3Smrg                       uint32_t attrib_binding)
2546ed98bd31Smaya{
25477ec681f3Smrg   for (uint32_t i = 0; i < input_state->vertexBindingDescriptionCount; i++) {
25487ec681f3Smrg      const VkVertexInputBindingDescription *input_binding =
25497ec681f3Smrg         &input_state->pVertexBindingDescriptions[i];
2550ed98bd31Smaya
25517ec681f3Smrg      if (input_binding->binding == attrib_binding)
25527ec681f3Smrg         return input_binding->stride;
25537ec681f3Smrg   }
2554ed98bd31Smaya
25557ec681f3Smrg   return 0;
2556ed98bd31Smaya}
255701e04c3fSmrg
/*
 * Build the radv_pipeline_key used for shader compilation and caching from
 * the graphics pipeline create info.
 *
 * The key captures everything outside the SPIR-V that affects codegen:
 * vertex input layout (formats, strides, divisors, alpha-adjust,
 * post-shuffle), tessellation patch size, MSAA state, color export
 * formats, topology/provoking vertex (GFX10+), and driver debug flags.
 */
static struct radv_pipeline_key
radv_generate_graphics_pipeline_key(const struct radv_pipeline *pipeline,
                                    const VkGraphicsPipelineCreateInfo *pCreateInfo,
                                    const struct radv_blend_state *blend)
{
   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
   bool uses_dynamic_stride = false;

   struct radv_pipeline_key key;
   memset(&key, 0, sizeof(key));

   if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
      key.optimisations_disabled = 1;

   key.has_multiview_view_index = !!subpass->view_mask;

   /* Scan dynamic states: fully dynamic vertex input makes the static
    * vertex-input part of the key irrelevant; dynamic stride only
    * overrides the per-binding stride. */
   if (pCreateInfo->pDynamicState) {
      uint32_t count = pCreateInfo->pDynamicState->dynamicStateCount;
      for (uint32_t i = 0; i < count; i++) {
         if (pCreateInfo->pDynamicState->pDynamicStates[i] == VK_DYNAMIC_STATE_VERTEX_INPUT_EXT) {
            key.vs.dynamic_input_state = true;
            /* we don't care about use_dynamic_stride in this case */
            break;
         } else if (pCreateInfo->pDynamicState->pDynamicStates[i] ==
                    VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT) {
            uses_dynamic_stride = true;
         }
      }
   }

   if (!key.vs.dynamic_input_state) {
      const VkPipelineVertexInputStateCreateInfo *input_state = pCreateInfo->pVertexInputState;
      const VkPipelineVertexInputDivisorStateCreateInfoEXT *divisor_state = vk_find_struct_const(
         input_state->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);

      /* Collect per-instance bindings and their divisors (default 1);
       * only entries whose bit is set in binding_input_rate are read. */
      uint32_t binding_input_rate = 0;
      uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS];
      for (unsigned i = 0; i < input_state->vertexBindingDescriptionCount; ++i) {
         if (input_state->pVertexBindingDescriptions[i].inputRate) {
            unsigned binding = input_state->pVertexBindingDescriptions[i].binding;
            binding_input_rate |= 1u << binding;
            instance_rate_divisors[binding] = 1;
         }
      }
      if (divisor_state) {
         for (unsigned i = 0; i < divisor_state->vertexBindingDivisorCount; ++i) {
            instance_rate_divisors[divisor_state->pVertexBindingDivisors[i].binding] =
               divisor_state->pVertexBindingDivisors[i].divisor;
         }
      }

      for (unsigned i = 0; i < input_state->vertexAttributeDescriptionCount; ++i) {
         const VkVertexInputAttributeDescription *desc =
            &input_state->pVertexAttributeDescriptions[i];
         const struct util_format_description *format_desc;
         unsigned location = desc->location;
         unsigned binding = desc->binding;
         unsigned num_format, data_format;
         bool post_shuffle;

         if (binding_input_rate & (1u << binding)) {
            key.vs.instance_rate_inputs |= 1u << location;
            key.vs.instance_rate_divisors[location] = instance_rate_divisors[binding];
         }

         /* Translate the Vulkan format to HW data/numeric formats, and
          * find out whether the fetch needs alpha-adjust or a
          * red/blue component swap (post-shuffle). */
         format_desc = vk_format_description(desc->format);
         radv_translate_vertex_format(pipeline->device->physical_device, desc->format, format_desc,
                                      &data_format, &num_format, &post_shuffle,
                                      &key.vs.vertex_alpha_adjust[location]);

         key.vs.vertex_attribute_formats[location] = data_format | (num_format << 4);
         key.vs.vertex_attribute_bindings[location] = desc->binding;
         key.vs.vertex_attribute_offsets[location] = desc->offset;

         const struct ac_data_format_info *dfmt_info = ac_get_data_format_info(data_format);
         unsigned attrib_align =
            dfmt_info->chan_byte_size ? dfmt_info->chan_byte_size : dfmt_info->element_size;

         /* If desc->offset is misaligned, then the buffer offset must be too. Just
          * skip updating vertex_binding_align in this case.
          */
         if (desc->offset % attrib_align == 0)
            key.vs.vertex_binding_align[desc->binding] =
               MAX2(key.vs.vertex_binding_align[desc->binding], attrib_align);

         if (!uses_dynamic_stride) {
            /* From the Vulkan spec 1.2.157:
             *
             * "If the bound pipeline state object was created
             *  with the
             *  VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT
             *  dynamic state enabled then pStrides[i] specifies
             *  the distance in bytes between two consecutive
             *  elements within the corresponding buffer. In this
             *  case the VkVertexInputBindingDescription::stride
             *  state from the pipeline state object is ignored."
             *
             * Make sure the vertex attribute stride is zero to
             * avoid computing a wrong offset if it's initialized
             * to something else than zero.
             */
            key.vs.vertex_attribute_strides[location] =
               radv_get_attrib_stride(input_state, desc->binding);
         }

         if (post_shuffle)
            key.vs.vertex_post_shuffle |= 1 << location;
      }
   }

   const VkPipelineTessellationStateCreateInfo *tess =
      radv_pipeline_get_tessellation_state(pCreateInfo);
   if (tess)
      key.tcs.tess_input_vertices = tess->patchControlPoints;

   const VkPipelineMultisampleStateCreateInfo *vkms =
      radv_pipeline_get_multisample_state(pCreateInfo);
   if (vkms && vkms->rasterizationSamples > 1) {
      uint32_t num_samples = vkms->rasterizationSamples;
      uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo);
      key.ps.num_samples = num_samples;
      key.ps.log2_ps_iter_samples = util_logbase2(ps_iter_samples);
   }

   key.ps.col_format = blend->spi_shader_col_format;
   /* Pre-GFX8 hardware needs the shader to know about int8/int10 color
    * buffer formats to export correctly. */
   if (pipeline->device->physical_device->rad_info.chip_class < GFX8) {
      key.ps.is_int8 = blend->col_format_is_int8;
      key.ps.is_int10 = blend->col_format_is_int10;
   }

   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
      key.vs.topology = pCreateInfo->pInputAssemblyState->topology;

      const VkPipelineRasterizationStateCreateInfo *raster_info = pCreateInfo->pRasterizationState;
      const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_info =
         vk_find_struct_const(raster_info->pNext,
                              PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
      if (provoking_vtx_info &&
          provoking_vtx_info->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
         key.vs.provoking_vtx_last = true;
      }
   }

   if (pipeline->device->instance->debug_flags & RADV_DEBUG_DISCARD_TO_DEMOTE)
      key.ps.lower_discard_to_demote = true;

   if (pipeline->device->instance->enable_mrt_output_nan_fixup)
      key.ps.enable_mrt_output_nan_fixup = true;

   key.ps.force_vrs = pipeline->device->force_vrs;

   if (pipeline->device->instance->debug_flags & RADV_DEBUG_INVARIANT_GEOM)
      key.invariant_geom = true;

   key.use_ngg = pipeline->device->physical_device->use_ngg;

   return key;
}
27177ec681f3Smrg
27187ec681f3Smrgstatic uint8_t
27197ec681f3Smrgradv_get_wave_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage,
27207ec681f3Smrg                   gl_shader_stage stage, const struct radv_shader_info *info)
27217ec681f3Smrg{
27227ec681f3Smrg   if (stage == MESA_SHADER_GEOMETRY && !info->is_ngg)
27237ec681f3Smrg      return 64;
27247ec681f3Smrg   else if (stage == MESA_SHADER_COMPUTE) {
27257ec681f3Smrg      return info->cs.subgroup_size;
27267ec681f3Smrg   } else if (stage == MESA_SHADER_FRAGMENT)
27277ec681f3Smrg      return device->physical_device->ps_wave_size;
27287ec681f3Smrg   else
27297ec681f3Smrg      return device->physical_device->ge_wave_size;
27307ec681f3Smrg}
27317ec681f3Smrg
27327ec681f3Smrgstatic uint8_t
27337ec681f3Smrgradv_get_ballot_bit_size(struct radv_device *device, const VkPipelineShaderStageCreateInfo *pStage,
27347ec681f3Smrg                         gl_shader_stage stage, const struct radv_shader_info *info)
27357ec681f3Smrg{
27367ec681f3Smrg   if (stage == MESA_SHADER_COMPUTE && info->cs.subgroup_size)
27377ec681f3Smrg      return info->cs.subgroup_size;
27387ec681f3Smrg   return 64;
27397ec681f3Smrg}
27407ec681f3Smrg
/*
 * Decide the NGG tuning for the last pre-rasterization stage (VS or TES):
 * whether to enable NGG culling, early primitive export, the LDS budget
 * when culling is off, and whether passthrough mode stays enabled.
 * Only applies when there is no geometry shader.
 */
static void
radv_determine_ngg_settings(struct radv_pipeline *pipeline,
                            const struct radv_pipeline_key *pipeline_key,
                            struct radv_shader_info *infos, nir_shader **nir)
{
   struct radv_device *device = pipeline->device;

   if (!nir[MESA_SHADER_GEOMETRY] && pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE) {
      uint64_t ps_inputs_read =
         nir[MESA_SHADER_FRAGMENT] ? nir[MESA_SHADER_FRAGMENT]->info.inputs_read : 0;
      gl_shader_stage es_stage = pipeline->graphics.last_vgt_api_stage;

      /* Vertices per output primitive: from the API topology for VS, from
       * the tessellation mode (points/isolines/triangles) for TES. */
      unsigned num_vertices_per_prim = si_conv_prim_to_gs_out(pipeline_key->vs.topology) + 1;
      if (es_stage == MESA_SHADER_TESS_EVAL)
         num_vertices_per_prim = nir[es_stage]->info.tess.point_mode                      ? 1
                                 : nir[es_stage]->info.tess.primitive_mode == GL_ISOLINES ? 2
                                                                                          : 3;

      infos[es_stage].has_ngg_culling = radv_consider_culling(
         device, nir[es_stage], ps_inputs_read, num_vertices_per_prim, &infos[es_stage]);

      /* Early prim export is only safe when the entrypoint is a single
       * block (no control flow). */
      nir_function_impl *impl = nir_shader_get_entrypoint(nir[es_stage]);
      infos[es_stage].has_ngg_early_prim_export = exec_list_is_singular(&impl->body);

      /* Invocations that process an input vertex */
      const struct gfx10_ngg_info *ngg_info = &infos[es_stage].ngg_info;
      unsigned max_vtx_in = MIN2(256, ngg_info->enable_vertex_grouping ? ngg_info->hw_max_esverts : num_vertices_per_prim * ngg_info->max_gsprims);

      unsigned lds_bytes_if_culling_off = 0;
      /* We need LDS space when VS needs to export the primitive ID. */
      if (es_stage == MESA_SHADER_VERTEX && infos[es_stage].vs.outinfo.export_prim_id)
         lds_bytes_if_culling_off = max_vtx_in * 4u;
      infos[es_stage].num_lds_blocks_when_not_culling =
         DIV_ROUND_UP(lds_bytes_if_culling_off,
                      device->physical_device->rad_info.lds_encode_granularity);

      /* NGG passthrough mode should be disabled when culling and when the vertex shader exports the
       * primitive ID.
       */
      infos[es_stage].is_ngg_passthrough = infos[es_stage].is_ngg_passthrough &&
                                           !infos[es_stage].has_ngg_culling &&
                                           !(es_stage == MESA_SHADER_VERTEX &&
                                             infos[es_stage].vs.outinfo.export_prim_id);
   }
}
27867ec681f3Smrg
27877ec681f3Smrgstatic void
27887ec681f3Smrgradv_fill_shader_info(struct radv_pipeline *pipeline,
27897ec681f3Smrg                      struct radv_pipeline_layout *pipeline_layout,
27907ec681f3Smrg                      const VkPipelineShaderStageCreateInfo **pStages,
27917ec681f3Smrg                      const struct radv_pipeline_key *pipeline_key,
27927ec681f3Smrg                      struct radv_shader_info *infos, nir_shader **nir)
27937ec681f3Smrg{
27947ec681f3Smrg   struct radv_device *device = pipeline->device;
27957ec681f3Smrg   unsigned active_stages = 0;
27967ec681f3Smrg   unsigned filled_stages = 0;
27977ec681f3Smrg
27987ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
27997ec681f3Smrg      if (nir[i])
28007ec681f3Smrg         active_stages |= (1 << i);
28017ec681f3Smrg   }
28027ec681f3Smrg
28037ec681f3Smrg   if (nir[MESA_SHADER_TESS_CTRL]) {
28047ec681f3Smrg      infos[MESA_SHADER_VERTEX].vs.as_ls = true;
28057ec681f3Smrg   }
28067ec681f3Smrg
28077ec681f3Smrg   if (nir[MESA_SHADER_GEOMETRY]) {
28087ec681f3Smrg      if (nir[MESA_SHADER_TESS_CTRL])
28097ec681f3Smrg         infos[MESA_SHADER_TESS_EVAL].tes.as_es = true;
28107ec681f3Smrg      else
28117ec681f3Smrg         infos[MESA_SHADER_VERTEX].vs.as_es = true;
28127ec681f3Smrg   }
28137ec681f3Smrg
28147ec681f3Smrg   if (pipeline_key->use_ngg) {
28157ec681f3Smrg      if (nir[MESA_SHADER_TESS_CTRL]) {
28167ec681f3Smrg         infos[MESA_SHADER_TESS_EVAL].is_ngg = true;
28177ec681f3Smrg      } else {
28187ec681f3Smrg         infos[MESA_SHADER_VERTEX].is_ngg = true;
28197ec681f3Smrg      }
28207ec681f3Smrg
28217ec681f3Smrg      if (nir[MESA_SHADER_TESS_CTRL] && nir[MESA_SHADER_GEOMETRY] &&
28227ec681f3Smrg          nir[MESA_SHADER_GEOMETRY]->info.gs.invocations *
28237ec681f3Smrg                nir[MESA_SHADER_GEOMETRY]->info.gs.vertices_out >
28247ec681f3Smrg             256) {
28257ec681f3Smrg         /* Fallback to the legacy path if tessellation is
28267ec681f3Smrg          * enabled with extreme geometry because
28277ec681f3Smrg          * EN_MAX_VERT_OUT_PER_GS_INSTANCE doesn't work and it
28287ec681f3Smrg          * might hang.
28297ec681f3Smrg          */
28307ec681f3Smrg         infos[MESA_SHADER_TESS_EVAL].is_ngg = false;
28317ec681f3Smrg      }
28327ec681f3Smrg
28337ec681f3Smrg      gl_shader_stage last_xfb_stage = MESA_SHADER_VERTEX;
28347ec681f3Smrg
28357ec681f3Smrg      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
28367ec681f3Smrg         if (nir[i])
28377ec681f3Smrg            last_xfb_stage = i;
28387ec681f3Smrg      }
28397ec681f3Smrg
28407ec681f3Smrg      bool uses_xfb = nir[last_xfb_stage] && radv_nir_stage_uses_xfb(nir[last_xfb_stage]);
28417ec681f3Smrg
28427ec681f3Smrg      if (!device->physical_device->use_ngg_streamout && uses_xfb) {
28437ec681f3Smrg         if (nir[MESA_SHADER_TESS_CTRL])
28447ec681f3Smrg           infos[MESA_SHADER_TESS_EVAL].is_ngg = false;
28457ec681f3Smrg         else
28467ec681f3Smrg           infos[MESA_SHADER_VERTEX].is_ngg = false;
28477ec681f3Smrg      }
28487ec681f3Smrg
28497ec681f3Smrg      /* Determine if the pipeline is eligible for the NGG passthrough
28507ec681f3Smrg       * mode. It can't be enabled for geometry shaders, for NGG
28517ec681f3Smrg       * streamout or for vertex shaders that export the primitive ID
28527ec681f3Smrg       * (this is checked later because we don't have the info here.)
28537ec681f3Smrg       */
28547ec681f3Smrg      if (!nir[MESA_SHADER_GEOMETRY] && !uses_xfb) {
28557ec681f3Smrg         if (nir[MESA_SHADER_TESS_CTRL] && infos[MESA_SHADER_TESS_EVAL].is_ngg) {
28567ec681f3Smrg            infos[MESA_SHADER_TESS_EVAL].is_ngg_passthrough = true;
28577ec681f3Smrg         } else if (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) {
28587ec681f3Smrg            infos[MESA_SHADER_VERTEX].is_ngg_passthrough = true;
28597ec681f3Smrg         }
28607ec681f3Smrg      }
28617ec681f3Smrg   }
28627ec681f3Smrg
28637ec681f3Smrg   if (nir[MESA_SHADER_FRAGMENT]) {
28647ec681f3Smrg      radv_nir_shader_info_init(&infos[MESA_SHADER_FRAGMENT]);
28657ec681f3Smrg      radv_nir_shader_info_pass(pipeline->device, nir[MESA_SHADER_FRAGMENT], pipeline_layout,
28667ec681f3Smrg                                pipeline_key, &infos[MESA_SHADER_FRAGMENT]);
28677ec681f3Smrg
28687ec681f3Smrg      assert(pipeline->graphics.last_vgt_api_stage != MESA_SHADER_NONE);
28697ec681f3Smrg      if (infos[MESA_SHADER_FRAGMENT].ps.prim_id_input) {
28707ec681f3Smrg         if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_VERTEX) {
28717ec681f3Smrg            infos[MESA_SHADER_VERTEX].vs.outinfo.export_prim_id = true;
28727ec681f3Smrg         } else if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
28737ec681f3Smrg            infos[MESA_SHADER_TESS_EVAL].tes.outinfo.export_prim_id = true;
28747ec681f3Smrg         } else {
28757ec681f3Smrg            assert(pipeline->graphics.last_vgt_api_stage == MESA_SHADER_GEOMETRY);
28767ec681f3Smrg         }
28777ec681f3Smrg      }
28787ec681f3Smrg
28797ec681f3Smrg      if (!!infos[MESA_SHADER_FRAGMENT].ps.num_input_clips_culls) {
28807ec681f3Smrg         if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_VERTEX) {
28817ec681f3Smrg            infos[MESA_SHADER_VERTEX].vs.outinfo.export_clip_dists = true;
28827ec681f3Smrg         } else if (pipeline->graphics.last_vgt_api_stage == MESA_SHADER_TESS_EVAL) {
28837ec681f3Smrg            infos[MESA_SHADER_TESS_EVAL].tes.outinfo.export_clip_dists = true;
28847ec681f3Smrg         } else {
28857ec681f3Smrg            assert(pipeline->graphics.last_vgt_api_stage == MESA_SHADER_GEOMETRY);
28867ec681f3Smrg            infos[MESA_SHADER_GEOMETRY].vs.outinfo.export_clip_dists = true;
28877ec681f3Smrg         }
28887ec681f3Smrg      }
28897ec681f3Smrg
28907ec681f3Smrg      filled_stages |= (1 << MESA_SHADER_FRAGMENT);
28917ec681f3Smrg   }
28927ec681f3Smrg
28937ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 &&
28947ec681f3Smrg       nir[MESA_SHADER_TESS_CTRL]) {
28957ec681f3Smrg      struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]};
28967ec681f3Smrg
28977ec681f3Smrg      radv_nir_shader_info_init(&infos[MESA_SHADER_TESS_CTRL]);
28987ec681f3Smrg
28997ec681f3Smrg      /* Copy data to merged stage. */
29007ec681f3Smrg      infos[MESA_SHADER_TESS_CTRL].vs.as_ls = true;
29017ec681f3Smrg
29027ec681f3Smrg      for (int i = 0; i < 2; i++) {
29037ec681f3Smrg         radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline_layout, pipeline_key,
29047ec681f3Smrg                                   &infos[MESA_SHADER_TESS_CTRL]);
29057ec681f3Smrg      }
29067ec681f3Smrg
29077ec681f3Smrg      filled_stages |= (1 << MESA_SHADER_VERTEX);
29087ec681f3Smrg      filled_stages |= (1 << MESA_SHADER_TESS_CTRL);
29097ec681f3Smrg   }
29107ec681f3Smrg
29117ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX9 &&
29127ec681f3Smrg       nir[MESA_SHADER_GEOMETRY]) {
29137ec681f3Smrg      gl_shader_stage pre_stage =
29147ec681f3Smrg         nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
29157ec681f3Smrg      struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]};
29167ec681f3Smrg
29177ec681f3Smrg      radv_nir_shader_info_init(&infos[MESA_SHADER_GEOMETRY]);
29187ec681f3Smrg
29197ec681f3Smrg      /* Copy data to merged stage. */
29207ec681f3Smrg      if (pre_stage == MESA_SHADER_VERTEX) {
29217ec681f3Smrg         infos[MESA_SHADER_GEOMETRY].vs.as_es = infos[MESA_SHADER_VERTEX].vs.as_es;
29227ec681f3Smrg      } else {
29237ec681f3Smrg         infos[MESA_SHADER_GEOMETRY].tes.as_es = infos[MESA_SHADER_TESS_EVAL].tes.as_es;
29247ec681f3Smrg      }
29257ec681f3Smrg      infos[MESA_SHADER_GEOMETRY].is_ngg = infos[pre_stage].is_ngg;
29267ec681f3Smrg      infos[MESA_SHADER_GEOMETRY].gs.es_type = pre_stage;
29277ec681f3Smrg
29287ec681f3Smrg      for (int i = 0; i < 2; i++) {
29297ec681f3Smrg         radv_nir_shader_info_pass(pipeline->device, combined_nir[i], pipeline_layout, pipeline_key,
29307ec681f3Smrg                                   &infos[MESA_SHADER_GEOMETRY]);
29317ec681f3Smrg      }
29327ec681f3Smrg
29337ec681f3Smrg      filled_stages |= (1 << pre_stage);
29347ec681f3Smrg      filled_stages |= (1 << MESA_SHADER_GEOMETRY);
29357ec681f3Smrg   }
29367ec681f3Smrg
29377ec681f3Smrg   active_stages ^= filled_stages;
29387ec681f3Smrg   while (active_stages) {
29397ec681f3Smrg      int i = u_bit_scan(&active_stages);
29407ec681f3Smrg      radv_nir_shader_info_init(&infos[i]);
29417ec681f3Smrg      radv_nir_shader_info_pass(pipeline->device, nir[i], pipeline_layout, pipeline_key, &infos[i]);
29427ec681f3Smrg   }
29437ec681f3Smrg
29447ec681f3Smrg   if (nir[MESA_SHADER_COMPUTE]) {
29457ec681f3Smrg      unsigned subgroup_size = pipeline_key->cs.compute_subgroup_size;
29467ec681f3Smrg      unsigned req_subgroup_size = subgroup_size;
29477ec681f3Smrg      bool require_full_subgroups = pipeline_key->cs.require_full_subgroups;
29487ec681f3Smrg
29497ec681f3Smrg      if (!subgroup_size)
29507ec681f3Smrg         subgroup_size = device->physical_device->cs_wave_size;
29517ec681f3Smrg
29527ec681f3Smrg      unsigned local_size = nir[MESA_SHADER_COMPUTE]->info.workgroup_size[0] *
29537ec681f3Smrg                            nir[MESA_SHADER_COMPUTE]->info.workgroup_size[1] *
29547ec681f3Smrg                            nir[MESA_SHADER_COMPUTE]->info.workgroup_size[2];
29557ec681f3Smrg
29567ec681f3Smrg      /* Games don't always request full subgroups when they should,
29577ec681f3Smrg       * which can cause bugs if cswave32 is enabled.
29587ec681f3Smrg       */
29597ec681f3Smrg      if (device->physical_device->cs_wave_size == 32 &&
29607ec681f3Smrg          nir[MESA_SHADER_COMPUTE]->info.cs.uses_wide_subgroup_intrinsics && !req_subgroup_size &&
29617ec681f3Smrg          local_size % RADV_SUBGROUP_SIZE == 0)
29627ec681f3Smrg         require_full_subgroups = true;
29637ec681f3Smrg
29647ec681f3Smrg      if (require_full_subgroups && !req_subgroup_size) {
29657ec681f3Smrg         /* don't use wave32 pretending to be wave64 */
29667ec681f3Smrg         subgroup_size = RADV_SUBGROUP_SIZE;
29677ec681f3Smrg      }
29687ec681f3Smrg
29697ec681f3Smrg      infos[MESA_SHADER_COMPUTE].cs.subgroup_size = subgroup_size;
29707ec681f3Smrg   }
29717ec681f3Smrg
29727ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
29737ec681f3Smrg      if (nir[i]) {
29747ec681f3Smrg         infos[i].wave_size = radv_get_wave_size(pipeline->device, pStages[i], i, &infos[i]);
29757ec681f3Smrg         infos[i].ballot_bit_size =
29767ec681f3Smrg            radv_get_ballot_bit_size(pipeline->device, pStages[i], i, &infos[i]);
29777ec681f3Smrg      }
29787ec681f3Smrg   }
29797ec681f3Smrg
29807ec681f3Smrg   /* PS always operates without workgroups. */
29817ec681f3Smrg   if (nir[MESA_SHADER_FRAGMENT])
29827ec681f3Smrg      infos[MESA_SHADER_FRAGMENT].workgroup_size = infos[MESA_SHADER_FRAGMENT].wave_size;
29837ec681f3Smrg
29847ec681f3Smrg   if (nir[MESA_SHADER_COMPUTE]) {
29857ec681f3Smrg      /* Variable workgroup size is not supported by Vulkan. */
29867ec681f3Smrg      assert(!nir[MESA_SHADER_COMPUTE]->info.workgroup_size_variable);
29877ec681f3Smrg
29887ec681f3Smrg      infos[MESA_SHADER_COMPUTE].workgroup_size =
29897ec681f3Smrg         ac_compute_cs_workgroup_size(
29907ec681f3Smrg            nir[MESA_SHADER_COMPUTE]->info.workgroup_size, false, UINT32_MAX);
29917ec681f3Smrg   }
29927ec681f3Smrg}
29937ec681f3Smrg
29947ec681f3Smrgstatic void
29957ec681f3Smrgmerge_tess_info(struct shader_info *tes_info, struct shader_info *tcs_info)
29967ec681f3Smrg{
29977ec681f3Smrg   /* The Vulkan 1.0.38 spec, section 21.1 Tessellator says:
29987ec681f3Smrg    *
29997ec681f3Smrg    *    "PointMode. Controls generation of points rather than triangles
30007ec681f3Smrg    *     or lines. This functionality defaults to disabled, and is
30017ec681f3Smrg    *     enabled if either shader stage includes the execution mode.
30027ec681f3Smrg    *
30037ec681f3Smrg    * and about Triangles, Quads, IsoLines, VertexOrderCw, VertexOrderCcw,
30047ec681f3Smrg    * PointMode, SpacingEqual, SpacingFractionalEven, SpacingFractionalOdd,
30057ec681f3Smrg    * and OutputVertices, it says:
30067ec681f3Smrg    *
30077ec681f3Smrg    *    "One mode must be set in at least one of the tessellation
30087ec681f3Smrg    *     shader stages."
30097ec681f3Smrg    *
30107ec681f3Smrg    * So, the fields can be set in either the TCS or TES, but they must
30117ec681f3Smrg    * agree if set in both.  Our backend looks at TES, so bitwise-or in
30127ec681f3Smrg    * the values from the TCS.
30137ec681f3Smrg    */
30147ec681f3Smrg   assert(tcs_info->tess.tcs_vertices_out == 0 || tes_info->tess.tcs_vertices_out == 0 ||
30157ec681f3Smrg          tcs_info->tess.tcs_vertices_out == tes_info->tess.tcs_vertices_out);
30167ec681f3Smrg   tes_info->tess.tcs_vertices_out |= tcs_info->tess.tcs_vertices_out;
30177ec681f3Smrg
30187ec681f3Smrg   assert(tcs_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
30197ec681f3Smrg          tes_info->tess.spacing == TESS_SPACING_UNSPECIFIED ||
30207ec681f3Smrg          tcs_info->tess.spacing == tes_info->tess.spacing);
30217ec681f3Smrg   tes_info->tess.spacing |= tcs_info->tess.spacing;
30227ec681f3Smrg
30237ec681f3Smrg   assert(tcs_info->tess.primitive_mode == 0 || tes_info->tess.primitive_mode == 0 ||
30247ec681f3Smrg          tcs_info->tess.primitive_mode == tes_info->tess.primitive_mode);
30257ec681f3Smrg   tes_info->tess.primitive_mode |= tcs_info->tess.primitive_mode;
30267ec681f3Smrg   tes_info->tess.ccw |= tcs_info->tess.ccw;
30277ec681f3Smrg   tes_info->tess.point_mode |= tcs_info->tess.point_mode;
30287ec681f3Smrg
30297ec681f3Smrg   /* Copy the merged info back to the TCS */
30307ec681f3Smrg   tcs_info->tess.tcs_vertices_out = tes_info->tess.tcs_vertices_out;
30317ec681f3Smrg   tcs_info->tess.spacing = tes_info->tess.spacing;
30327ec681f3Smrg   tcs_info->tess.primitive_mode = tes_info->tess.primitive_mode;
30337ec681f3Smrg   tcs_info->tess.ccw = tes_info->tess.ccw;
30347ec681f3Smrg   tcs_info->tess.point_mode = tes_info->tess.point_mode;
30357ec681f3Smrg}
30367ec681f3Smrg
30377ec681f3Smrgstatic void
30387ec681f3Smrggather_tess_info(struct radv_device *device, nir_shader **nir, struct radv_shader_info *infos,
30397ec681f3Smrg                 const struct radv_pipeline_key *pipeline_key)
30407ec681f3Smrg{
30417ec681f3Smrg   merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info);
30427ec681f3Smrg
30437ec681f3Smrg   unsigned tess_in_patch_size = pipeline_key->tcs.tess_input_vertices;
30447ec681f3Smrg   unsigned tess_out_patch_size = nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out;
30457ec681f3Smrg
30467ec681f3Smrg   /* Number of tessellation patches per workgroup processed by the current pipeline. */
30477ec681f3Smrg   unsigned num_patches = get_tcs_num_patches(
30487ec681f3Smrg      tess_in_patch_size, tess_out_patch_size,
30497ec681f3Smrg      infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs,
30507ec681f3Smrg      infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs,
30517ec681f3Smrg      infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs, device->tess_offchip_block_dw_size,
30527ec681f3Smrg      device->physical_device->rad_info.chip_class, device->physical_device->rad_info.family);
30537ec681f3Smrg
30547ec681f3Smrg   /* LDS size used by VS+TCS for storing TCS inputs and outputs. */
30557ec681f3Smrg   unsigned tcs_lds_size = calculate_tess_lds_size(
30567ec681f3Smrg      device->physical_device->rad_info.chip_class, tess_in_patch_size, tess_out_patch_size,
30577ec681f3Smrg      infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_inputs, num_patches,
30587ec681f3Smrg      infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_outputs,
30597ec681f3Smrg      infos[MESA_SHADER_TESS_CTRL].tcs.num_linked_patch_outputs);
30607ec681f3Smrg
30617ec681f3Smrg   infos[MESA_SHADER_TESS_CTRL].num_tess_patches = num_patches;
30627ec681f3Smrg   infos[MESA_SHADER_TESS_CTRL].tcs.num_lds_blocks = tcs_lds_size;
30637ec681f3Smrg   infos[MESA_SHADER_TESS_CTRL].tcs.tes_reads_tess_factors =
30647ec681f3Smrg      !!(nir[MESA_SHADER_TESS_EVAL]->info.inputs_read &
30657ec681f3Smrg         (VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER));
30667ec681f3Smrg   infos[MESA_SHADER_TESS_CTRL].tcs.tes_inputs_read = nir[MESA_SHADER_TESS_EVAL]->info.inputs_read;
30677ec681f3Smrg   infos[MESA_SHADER_TESS_CTRL].tcs.tes_patch_inputs_read =
30687ec681f3Smrg      nir[MESA_SHADER_TESS_EVAL]->info.patch_inputs_read;
30697ec681f3Smrg
30707ec681f3Smrg   infos[MESA_SHADER_TESS_EVAL].num_tess_patches = num_patches;
30717ec681f3Smrg   infos[MESA_SHADER_GEOMETRY].num_tess_patches = num_patches;
30727ec681f3Smrg   infos[MESA_SHADER_VERTEX].num_tess_patches = num_patches;
30737ec681f3Smrg   infos[MESA_SHADER_TESS_CTRL].tcs.tcs_vertices_out = tess_out_patch_size;
30747ec681f3Smrg   infos[MESA_SHADER_VERTEX].tcs.tcs_vertices_out = tess_out_patch_size;
30757ec681f3Smrg
30767ec681f3Smrg   if (!radv_use_llvm_for_stage(device, MESA_SHADER_VERTEX)) {
30777ec681f3Smrg      /* When the number of TCS input and output vertices are the same (typically 3):
30787ec681f3Smrg       * - There is an equal amount of LS and HS invocations
30797ec681f3Smrg       * - In case of merged LSHS shaders, the LS and HS halves of the shader
30807ec681f3Smrg       *   always process the exact same vertex. We can use this knowledge to optimize them.
30817ec681f3Smrg       *
30827ec681f3Smrg       * We don't set tcs_in_out_eq if the float controls differ because that might
30837ec681f3Smrg       * involve different float modes for the same block and our optimizer
30847ec681f3Smrg       * doesn't handle a instruction dominating another with a different mode.
30857ec681f3Smrg       */
30867ec681f3Smrg      infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq =
30877ec681f3Smrg         device->physical_device->rad_info.chip_class >= GFX9 &&
30887ec681f3Smrg         tess_in_patch_size == tess_out_patch_size &&
30897ec681f3Smrg         nir[MESA_SHADER_VERTEX]->info.float_controls_execution_mode ==
30907ec681f3Smrg            nir[MESA_SHADER_TESS_CTRL]->info.float_controls_execution_mode;
30917ec681f3Smrg
30927ec681f3Smrg      if (infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq)
30937ec681f3Smrg         infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask =
30947ec681f3Smrg            nir[MESA_SHADER_TESS_CTRL]->info.inputs_read &
30957ec681f3Smrg            nir[MESA_SHADER_VERTEX]->info.outputs_written &
30967ec681f3Smrg            ~nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_cross_invocation_inputs_read &
30977ec681f3Smrg            ~nir[MESA_SHADER_TESS_CTRL]->info.inputs_read_indirectly &
30987ec681f3Smrg            ~nir[MESA_SHADER_VERTEX]->info.outputs_accessed_indirectly;
30997ec681f3Smrg
31007ec681f3Smrg      /* Copy data to TCS so it can be accessed by the backend if they are merged. */
31017ec681f3Smrg      infos[MESA_SHADER_TESS_CTRL].vs.tcs_in_out_eq = infos[MESA_SHADER_VERTEX].vs.tcs_in_out_eq;
31027ec681f3Smrg      infos[MESA_SHADER_TESS_CTRL].vs.tcs_temp_only_input_mask =
31037ec681f3Smrg         infos[MESA_SHADER_VERTEX].vs.tcs_temp_only_input_mask;
31047ec681f3Smrg   }
31057ec681f3Smrg
31067ec681f3Smrg   for (gl_shader_stage s = MESA_SHADER_VERTEX; s <= MESA_SHADER_TESS_CTRL; ++s)
31077ec681f3Smrg      infos[s].workgroup_size =
31087ec681f3Smrg         ac_compute_lshs_workgroup_size(
31097ec681f3Smrg            device->physical_device->rad_info.chip_class, s,
31107ec681f3Smrg            num_patches, tess_in_patch_size, tess_out_patch_size);
31117ec681f3Smrg}
31127ec681f3Smrg
31137ec681f3Smrgstatic void
31147ec681f3Smrgradv_init_feedback(const VkPipelineCreationFeedbackCreateInfoEXT *ext)
31157ec681f3Smrg{
31167ec681f3Smrg   if (!ext)
31177ec681f3Smrg      return;
31187ec681f3Smrg
31197ec681f3Smrg   if (ext->pPipelineCreationFeedback) {
31207ec681f3Smrg      ext->pPipelineCreationFeedback->flags = 0;
31217ec681f3Smrg      ext->pPipelineCreationFeedback->duration = 0;
31227ec681f3Smrg   }
31237ec681f3Smrg
31247ec681f3Smrg   for (unsigned i = 0; i < ext->pipelineStageCreationFeedbackCount; ++i) {
31257ec681f3Smrg      ext->pPipelineStageCreationFeedbacks[i].flags = 0;
31267ec681f3Smrg      ext->pPipelineStageCreationFeedbacks[i].duration = 0;
31277ec681f3Smrg   }
312801e04c3fSmrg}
312901e04c3fSmrg
313001e04c3fSmrgstatic void
31317ec681f3Smrgradv_start_feedback(VkPipelineCreationFeedbackEXT *feedback)
31327ec681f3Smrg{
31337ec681f3Smrg   if (!feedback)
31347ec681f3Smrg      return;
31357ec681f3Smrg
31367ec681f3Smrg   feedback->duration -= radv_get_current_time();
31377ec681f3Smrg   feedback->flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT;
313801e04c3fSmrg}
313901e04c3fSmrg
314001e04c3fSmrgstatic void
31417ec681f3Smrgradv_stop_feedback(VkPipelineCreationFeedbackEXT *feedback, bool cache_hit)
31427ec681f3Smrg{
31437ec681f3Smrg   if (!feedback)
31447ec681f3Smrg      return;
31457ec681f3Smrg
31467ec681f3Smrg   feedback->duration += radv_get_current_time();
31477ec681f3Smrg   feedback->flags =
31487ec681f3Smrg      VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT_EXT |
31497ec681f3Smrg      (cache_hit ? VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT_EXT : 0);
31507ec681f3Smrg}
31517ec681f3Smrg
31527ec681f3Smrgstatic bool
31537ec681f3Smrgmem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
31547ec681f3Smrg                       unsigned num_components, nir_intrinsic_instr *low, nir_intrinsic_instr *high,
31557ec681f3Smrg                       void *data)
31567ec681f3Smrg{
31577ec681f3Smrg   if (num_components > 4)
31587ec681f3Smrg      return false;
31597ec681f3Smrg
31607ec681f3Smrg   /* >128 bit loads are split except with SMEM */
31617ec681f3Smrg   if (bit_size * num_components > 128)
31627ec681f3Smrg      return false;
31637ec681f3Smrg
31647ec681f3Smrg   uint32_t align;
31657ec681f3Smrg   if (align_offset)
31667ec681f3Smrg      align = 1 << (ffs(align_offset) - 1);
31677ec681f3Smrg   else
31687ec681f3Smrg      align = align_mul;
31697ec681f3Smrg
31707ec681f3Smrg   switch (low->intrinsic) {
31717ec681f3Smrg   case nir_intrinsic_load_global:
31727ec681f3Smrg   case nir_intrinsic_store_global:
31737ec681f3Smrg   case nir_intrinsic_store_ssbo:
31747ec681f3Smrg   case nir_intrinsic_load_ssbo:
31757ec681f3Smrg   case nir_intrinsic_load_ubo:
31767ec681f3Smrg   case nir_intrinsic_load_push_constant: {
31777ec681f3Smrg      unsigned max_components;
31787ec681f3Smrg      if (align % 4 == 0)
31797ec681f3Smrg         max_components = NIR_MAX_VEC_COMPONENTS;
31807ec681f3Smrg      else if (align % 2 == 0)
31817ec681f3Smrg         max_components = 16u / bit_size;
31827ec681f3Smrg      else
31837ec681f3Smrg         max_components = 8u / bit_size;
31847ec681f3Smrg      return (align % (bit_size / 8u)) == 0 && num_components <= max_components;
31857ec681f3Smrg   }
31867ec681f3Smrg   case nir_intrinsic_load_deref:
31877ec681f3Smrg   case nir_intrinsic_store_deref:
31887ec681f3Smrg      assert(nir_deref_mode_is(nir_src_as_deref(low->src[0]), nir_var_mem_shared));
31897ec681f3Smrg      FALLTHROUGH;
31907ec681f3Smrg   case nir_intrinsic_load_shared:
31917ec681f3Smrg   case nir_intrinsic_store_shared:
31927ec681f3Smrg      if (bit_size * num_components ==
31937ec681f3Smrg          96) { /* 96 bit loads require 128 bit alignment and are split otherwise */
31947ec681f3Smrg         return align % 16 == 0;
31957ec681f3Smrg      } else if (bit_size == 16 && (align % 4)) {
31967ec681f3Smrg         /* AMD hardware can't do 2-byte aligned f16vec2 loads, but they are useful for ALU
31977ec681f3Smrg          * vectorization, because our vectorizer requires the scalar IR to already contain vectors.
31987ec681f3Smrg          */
31997ec681f3Smrg         return (align % 2 == 0) && num_components <= 2;
32007ec681f3Smrg      } else {
32017ec681f3Smrg         if (num_components == 3) {
32027ec681f3Smrg            /* AMD hardware can't do 3-component loads except for 96-bit loads, handled above. */
32037ec681f3Smrg            return false;
32047ec681f3Smrg         }
32057ec681f3Smrg         unsigned req = bit_size * num_components;
32067ec681f3Smrg         if (req == 64 || req == 128) /* 64-bit and 128-bit loads can use ds_read2_b{32,64} */
32077ec681f3Smrg            req /= 2u;
32087ec681f3Smrg         return align % (req / 8u) == 0;
32097ec681f3Smrg      }
32107ec681f3Smrg   default:
32117ec681f3Smrg      return false;
32127ec681f3Smrg   }
32137ec681f3Smrg   return false;
32147ec681f3Smrg}
32157ec681f3Smrg
32167ec681f3Smrgstatic unsigned
32177ec681f3Smrglower_bit_size_callback(const nir_instr *instr, void *_)
32187ec681f3Smrg{
32197ec681f3Smrg   struct radv_device *device = _;
32207ec681f3Smrg   enum chip_class chip = device->physical_device->rad_info.chip_class;
32217ec681f3Smrg
32227ec681f3Smrg   if (instr->type != nir_instr_type_alu)
32237ec681f3Smrg      return 0;
32247ec681f3Smrg   nir_alu_instr *alu = nir_instr_as_alu(instr);
32257ec681f3Smrg
32267ec681f3Smrg   if (alu->dest.dest.ssa.bit_size & (8 | 16)) {
32277ec681f3Smrg      unsigned bit_size = alu->dest.dest.ssa.bit_size;
32287ec681f3Smrg      switch (alu->op) {
32297ec681f3Smrg      case nir_op_iabs:
32307ec681f3Smrg      case nir_op_bitfield_select:
32317ec681f3Smrg      case nir_op_imul_high:
32327ec681f3Smrg      case nir_op_umul_high:
32337ec681f3Smrg      case nir_op_ineg:
32347ec681f3Smrg      case nir_op_isign:
32357ec681f3Smrg         return 32;
32367ec681f3Smrg      case nir_op_imax:
32377ec681f3Smrg      case nir_op_umax:
32387ec681f3Smrg      case nir_op_imin:
32397ec681f3Smrg      case nir_op_umin:
32407ec681f3Smrg      case nir_op_ishr:
32417ec681f3Smrg      case nir_op_ushr:
32427ec681f3Smrg      case nir_op_ishl:
32437ec681f3Smrg      case nir_op_uadd_sat:
32447ec681f3Smrg         return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
32457ec681f3Smrg                                                                                            : 0;
32467ec681f3Smrg      case nir_op_iadd_sat:
32477ec681f3Smrg         return bit_size == 8 || !nir_dest_is_divergent(alu->dest.dest) ? 32 : 0;
32487ec681f3Smrg
32497ec681f3Smrg      default:
32507ec681f3Smrg         return 0;
32517ec681f3Smrg      }
32527ec681f3Smrg   }
32537ec681f3Smrg
32547ec681f3Smrg   if (nir_src_bit_size(alu->src[0].src) & (8 | 16)) {
32557ec681f3Smrg      unsigned bit_size = nir_src_bit_size(alu->src[0].src);
32567ec681f3Smrg      switch (alu->op) {
32577ec681f3Smrg      case nir_op_bit_count:
32587ec681f3Smrg      case nir_op_find_lsb:
32597ec681f3Smrg      case nir_op_ufind_msb:
32607ec681f3Smrg      case nir_op_i2b1:
32617ec681f3Smrg         return 32;
32627ec681f3Smrg      case nir_op_ilt:
32637ec681f3Smrg      case nir_op_ige:
32647ec681f3Smrg      case nir_op_ieq:
32657ec681f3Smrg      case nir_op_ine:
32667ec681f3Smrg      case nir_op_ult:
32677ec681f3Smrg      case nir_op_uge:
32687ec681f3Smrg         return (bit_size == 8 || !(chip >= GFX8 && nir_dest_is_divergent(alu->dest.dest))) ? 32
32697ec681f3Smrg                                                                                            : 0;
32707ec681f3Smrg      default:
32717ec681f3Smrg         return 0;
32727ec681f3Smrg      }
32737ec681f3Smrg   }
32747ec681f3Smrg
32757ec681f3Smrg   return 0;
32767ec681f3Smrg}
32777ec681f3Smrg
32787ec681f3Smrgstatic bool
32797ec681f3Smrgopt_vectorize_callback(const nir_instr *instr, void *_)
32807ec681f3Smrg{
32817ec681f3Smrg   assert(instr->type == nir_instr_type_alu);
32827ec681f3Smrg   nir_alu_instr *alu = nir_instr_as_alu(instr);
32837ec681f3Smrg   unsigned bit_size = alu->dest.dest.ssa.bit_size;
32847ec681f3Smrg   if (bit_size != 16)
32857ec681f3Smrg      return false;
32867ec681f3Smrg
32877ec681f3Smrg   switch (alu->op) {
32887ec681f3Smrg   case nir_op_fadd:
32897ec681f3Smrg   case nir_op_fsub:
32907ec681f3Smrg   case nir_op_fmul:
32917ec681f3Smrg   case nir_op_fneg:
32927ec681f3Smrg   case nir_op_fsat:
32937ec681f3Smrg   case nir_op_fmin:
32947ec681f3Smrg   case nir_op_fmax:
32957ec681f3Smrg   case nir_op_iadd:
32967ec681f3Smrg   case nir_op_isub:
32977ec681f3Smrg   case nir_op_imul:
32987ec681f3Smrg   case nir_op_imin:
32997ec681f3Smrg   case nir_op_imax:
33007ec681f3Smrg   case nir_op_umin:
33017ec681f3Smrg   case nir_op_umax:
33027ec681f3Smrg      return true;
33037ec681f3Smrg   case nir_op_ishl: /* TODO: in NIR, these have 32bit shift operands */
33047ec681f3Smrg   case nir_op_ishr: /* while Radeon needs 16bit operands when vectorized */
33057ec681f3Smrg   case nir_op_ushr:
33067ec681f3Smrg   default:
33077ec681f3Smrg      return false;
33087ec681f3Smrg   }
33097ec681f3Smrg}
33107ec681f3Smrg
33117ec681f3Smrgstatic nir_component_mask_t
33127ec681f3Smrgnon_uniform_access_callback(const nir_src *src, void *_)
33137ec681f3Smrg{
33147ec681f3Smrg   if (src->ssa->num_components == 1)
33157ec681f3Smrg      return 0x1;
33167ec681f3Smrg   return nir_chase_binding(*src).success ? 0x2 : 0x3;
33177ec681f3Smrg}
33187ec681f3Smrg
33197ec681f3SmrgVkResult
33207ec681f3Smrgradv_create_shaders(struct radv_pipeline *pipeline, struct radv_pipeline_layout *pipeline_layout,
33217ec681f3Smrg                    struct radv_device *device, struct radv_pipeline_cache *cache,
33227ec681f3Smrg                    const struct radv_pipeline_key *pipeline_key,
33237ec681f3Smrg                    const VkPipelineShaderStageCreateInfo **pStages,
33247ec681f3Smrg                    const VkPipelineCreateFlags flags, const uint8_t *custom_hash,
33257ec681f3Smrg                    VkPipelineCreationFeedbackEXT *pipeline_feedback,
33267ec681f3Smrg                    VkPipelineCreationFeedbackEXT **stage_feedbacks)
33277ec681f3Smrg{
33287ec681f3Smrg   struct vk_shader_module fs_m = {0};
33297ec681f3Smrg   struct vk_shader_module *modules[MESA_SHADER_STAGES] = {
33307ec681f3Smrg      0,
33317ec681f3Smrg   };
33327ec681f3Smrg   nir_shader *nir[MESA_SHADER_STAGES] = {0};
33337ec681f3Smrg   struct radv_shader_binary *binaries[MESA_SHADER_STAGES] = {NULL};
33347ec681f3Smrg   struct radv_shader_info infos[MESA_SHADER_STAGES] = {0};
33357ec681f3Smrg   unsigned char hash[20], gs_copy_hash[20];
33367ec681f3Smrg   bool keep_executable_info =
33377ec681f3Smrg      (flags & VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR) ||
33387ec681f3Smrg      device->keep_shader_info;
33397ec681f3Smrg   bool keep_statistic_info = (flags & VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR) ||
33407ec681f3Smrg                              (device->instance->debug_flags & RADV_DEBUG_DUMP_SHADER_STATS) ||
33417ec681f3Smrg                              device->keep_shader_info;
33427ec681f3Smrg   struct radv_pipeline_shader_stack_size **stack_sizes =
33437ec681f3Smrg      pipeline->type == RADV_PIPELINE_COMPUTE ? &pipeline->compute.rt_stack_sizes : NULL;
33447ec681f3Smrg   uint32_t *num_stack_sizes = stack_sizes ? &pipeline->compute.group_count : NULL;
33457ec681f3Smrg
33467ec681f3Smrg   radv_start_feedback(pipeline_feedback);
33477ec681f3Smrg
33487ec681f3Smrg   for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
33497ec681f3Smrg      if (pStages[i]) {
33507ec681f3Smrg         modules[i] = vk_shader_module_from_handle(pStages[i]->module);
33517ec681f3Smrg         if (modules[i]->nir)
33527ec681f3Smrg            _mesa_sha1_compute(modules[i]->nir->info.name, strlen(modules[i]->nir->info.name),
33537ec681f3Smrg                               modules[i]->sha1);
33547ec681f3Smrg
33557ec681f3Smrg         pipeline->active_stages |= mesa_to_vk_shader_stage(i);
33567ec681f3Smrg         if (i < MESA_SHADER_FRAGMENT)
33577ec681f3Smrg            pipeline->graphics.last_vgt_api_stage = i;
33587ec681f3Smrg      }
33597ec681f3Smrg   }
33607ec681f3Smrg
33617ec681f3Smrg   if (custom_hash)
33627ec681f3Smrg      memcpy(hash, custom_hash, 20);
33637ec681f3Smrg   else {
33647ec681f3Smrg      radv_hash_shaders(hash, pStages, pipeline_layout, pipeline_key,
33657ec681f3Smrg                        radv_get_hash_flags(device, keep_statistic_info));
33667ec681f3Smrg   }
33677ec681f3Smrg   memcpy(gs_copy_hash, hash, 20);
33687ec681f3Smrg   gs_copy_hash[0] ^= 1;
33697ec681f3Smrg
33707ec681f3Smrg   pipeline->pipeline_hash = *(uint64_t *)hash;
33717ec681f3Smrg
33727ec681f3Smrg   bool found_in_application_cache = true;
33737ec681f3Smrg   if (modules[MESA_SHADER_GEOMETRY] && !keep_executable_info) {
33747ec681f3Smrg      struct radv_shader_variant *variants[MESA_SHADER_STAGES] = {0};
33757ec681f3Smrg      radv_create_shader_variants_from_pipeline_cache(device, cache, gs_copy_hash, variants, NULL,
33767ec681f3Smrg                                                      NULL, &found_in_application_cache);
33777ec681f3Smrg      pipeline->gs_copy_shader = variants[MESA_SHADER_GEOMETRY];
33787ec681f3Smrg   }
33797ec681f3Smrg
33807ec681f3Smrg   if (!keep_executable_info &&
33817ec681f3Smrg       radv_create_shader_variants_from_pipeline_cache(device, cache, hash, pipeline->shaders,
33827ec681f3Smrg                                                       stack_sizes, num_stack_sizes,
33837ec681f3Smrg                                                       &found_in_application_cache) &&
33847ec681f3Smrg       (!modules[MESA_SHADER_GEOMETRY] || pipeline->gs_copy_shader ||
33857ec681f3Smrg        pipeline->shaders[MESA_SHADER_GEOMETRY]->info.is_ngg)) {
33867ec681f3Smrg      radv_stop_feedback(pipeline_feedback, found_in_application_cache);
33877ec681f3Smrg      return VK_SUCCESS;
33887ec681f3Smrg   }
33897ec681f3Smrg
33907ec681f3Smrg   if (flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_EXT) {
33917ec681f3Smrg      radv_stop_feedback(pipeline_feedback, found_in_application_cache);
33927ec681f3Smrg      return VK_PIPELINE_COMPILE_REQUIRED_EXT;
33937ec681f3Smrg   }
33947ec681f3Smrg
33957ec681f3Smrg   if (!modules[MESA_SHADER_FRAGMENT] && !modules[MESA_SHADER_COMPUTE]) {
33967ec681f3Smrg      nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL, "noop_fs");
33977ec681f3Smrg      fs_m = vk_shader_module_from_nir(fs_b.shader);
33987ec681f3Smrg      modules[MESA_SHADER_FRAGMENT] = &fs_m;
33997ec681f3Smrg   }
34007ec681f3Smrg
34017ec681f3Smrg   for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
34027ec681f3Smrg      const VkPipelineShaderStageCreateInfo *stage = pStages[i];
34037ec681f3Smrg
34047ec681f3Smrg      if (!modules[i])
34057ec681f3Smrg         continue;
34067ec681f3Smrg
34077ec681f3Smrg      radv_start_feedback(stage_feedbacks[i]);
34087ec681f3Smrg
34097ec681f3Smrg      nir[i] = radv_shader_compile_to_nir(device, modules[i], stage ? stage->pName : "main", i,
34107ec681f3Smrg                                          stage ? stage->pSpecializationInfo : NULL,
34117ec681f3Smrg                                          pipeline_layout, pipeline_key);
34127ec681f3Smrg
34137ec681f3Smrg      /* We don't want to alter meta shaders IR directly so clone it
34147ec681f3Smrg       * first.
34157ec681f3Smrg       */
34167ec681f3Smrg      if (nir[i]->info.name) {
34177ec681f3Smrg         nir[i] = nir_shader_clone(NULL, nir[i]);
34187ec681f3Smrg      }
34197ec681f3Smrg
34207ec681f3Smrg      radv_stop_feedback(stage_feedbacks[i], false);
34217ec681f3Smrg   }
34227ec681f3Smrg
34237ec681f3Smrg   bool optimize_conservatively = pipeline_key->optimisations_disabled;
34247ec681f3Smrg
34257ec681f3Smrg   radv_link_shaders(pipeline, pipeline_key, nir, optimize_conservatively);
34267ec681f3Smrg   radv_set_driver_locations(pipeline, nir, infos);
34277ec681f3Smrg
34287ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
34297ec681f3Smrg      if (nir[i]) {
34307ec681f3Smrg         radv_start_feedback(stage_feedbacks[i]);
34317ec681f3Smrg         radv_optimize_nir(device, nir[i], optimize_conservatively, false);
34327ec681f3Smrg
34337ec681f3Smrg         /* Gather info again, information such as outputs_read can be out-of-date. */
34347ec681f3Smrg         nir_shader_gather_info(nir[i], nir_shader_get_entrypoint(nir[i]));
34357ec681f3Smrg         radv_lower_io(device, nir[i]);
34367ec681f3Smrg
34377ec681f3Smrg         radv_stop_feedback(stage_feedbacks[i], false);
34387ec681f3Smrg      }
34397ec681f3Smrg   }
34407ec681f3Smrg
34417ec681f3Smrg   if (nir[MESA_SHADER_TESS_CTRL]) {
34427ec681f3Smrg      nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL],
34437ec681f3Smrg                               nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL);
34447ec681f3Smrg      gather_tess_info(device, nir, infos, pipeline_key);
34457ec681f3Smrg   }
34467ec681f3Smrg
34477ec681f3Smrg   radv_fill_shader_info(pipeline, pipeline_layout, pStages, pipeline_key, infos, nir);
34487ec681f3Smrg
34497ec681f3Smrg   bool pipeline_has_ngg = (nir[MESA_SHADER_VERTEX] && infos[MESA_SHADER_VERTEX].is_ngg) ||
34507ec681f3Smrg                           (nir[MESA_SHADER_TESS_EVAL] && infos[MESA_SHADER_TESS_EVAL].is_ngg);
34517ec681f3Smrg
34527ec681f3Smrg   if (pipeline_has_ngg) {
34537ec681f3Smrg      struct gfx10_ngg_info *ngg_info;
34547ec681f3Smrg
34557ec681f3Smrg      if (nir[MESA_SHADER_GEOMETRY])
34567ec681f3Smrg         ngg_info = &infos[MESA_SHADER_GEOMETRY].ngg_info;
34577ec681f3Smrg      else if (nir[MESA_SHADER_TESS_CTRL])
34587ec681f3Smrg         ngg_info = &infos[MESA_SHADER_TESS_EVAL].ngg_info;
34597ec681f3Smrg      else
34607ec681f3Smrg         ngg_info = &infos[MESA_SHADER_VERTEX].ngg_info;
34617ec681f3Smrg
34627ec681f3Smrg      gfx10_get_ngg_info(pipeline_key, pipeline, nir, infos, ngg_info);
34637ec681f3Smrg   } else if (nir[MESA_SHADER_GEOMETRY]) {
34647ec681f3Smrg      struct gfx9_gs_info *gs_info = &infos[MESA_SHADER_GEOMETRY].gs_ring_info;
34657ec681f3Smrg
34667ec681f3Smrg      gfx9_get_gs_info(pipeline_key, pipeline, nir, infos, gs_info);
34677ec681f3Smrg   } else {
34687ec681f3Smrg      gl_shader_stage hw_vs_api_stage =
34697ec681f3Smrg         nir[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
34707ec681f3Smrg      infos[hw_vs_api_stage].workgroup_size = infos[hw_vs_api_stage].wave_size;
34717ec681f3Smrg   }
34727ec681f3Smrg
34737ec681f3Smrg   radv_determine_ngg_settings(pipeline, pipeline_key, infos, nir);
34747ec681f3Smrg
34757ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
34767ec681f3Smrg      if (nir[i]) {
34777ec681f3Smrg         radv_start_feedback(stage_feedbacks[i]);
34787ec681f3Smrg
34797ec681f3Smrg         /* Wave and workgroup size should already be filled. */
34807ec681f3Smrg         assert(infos[i].wave_size && infos[i].workgroup_size);
34817ec681f3Smrg
34827ec681f3Smrg         if (!radv_use_llvm_for_stage(device, i)) {
34837ec681f3Smrg            nir_lower_non_uniform_access_options options = {
34847ec681f3Smrg               .types = nir_lower_non_uniform_ubo_access | nir_lower_non_uniform_ssbo_access |
34857ec681f3Smrg                        nir_lower_non_uniform_texture_access | nir_lower_non_uniform_image_access,
34867ec681f3Smrg               .callback = &non_uniform_access_callback,
34877ec681f3Smrg               .callback_data = NULL,
34887ec681f3Smrg            };
34897ec681f3Smrg            NIR_PASS_V(nir[i], nir_lower_non_uniform_access, &options);
34907ec681f3Smrg         }
34917ec681f3Smrg         NIR_PASS_V(nir[i], nir_lower_memory_model);
34927ec681f3Smrg
34937ec681f3Smrg         bool lower_to_scalar = false;
34947ec681f3Smrg
34957ec681f3Smrg         nir_load_store_vectorize_options vectorize_opts = {
34967ec681f3Smrg            .modes = nir_var_mem_ssbo | nir_var_mem_ubo | nir_var_mem_push_const |
34977ec681f3Smrg                     nir_var_mem_shared | nir_var_mem_global,
34987ec681f3Smrg            .callback = mem_vectorize_callback,
34997ec681f3Smrg            .robust_modes = 0,
35007ec681f3Smrg         };
35017ec681f3Smrg
35027ec681f3Smrg         if (device->robust_buffer_access2) {
35037ec681f3Smrg            vectorize_opts.robust_modes =
35047ec681f3Smrg               nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global | nir_var_mem_push_const;
35057ec681f3Smrg         }
35067ec681f3Smrg
35077ec681f3Smrg         if (nir_opt_load_store_vectorize(nir[i], &vectorize_opts)) {
35087ec681f3Smrg            NIR_PASS_V(nir[i], nir_copy_prop);
35097ec681f3Smrg            lower_to_scalar = true;
35107ec681f3Smrg
35117ec681f3Smrg            /* Gather info again, to update whether 8/16-bit are used. */
35127ec681f3Smrg            nir_shader_gather_info(nir[i], nir_shader_get_entrypoint(nir[i]));
35137ec681f3Smrg         }
35147ec681f3Smrg
35157ec681f3Smrg         lower_to_scalar |=
35167ec681f3Smrg            nir_opt_shrink_vectors(nir[i], !device->instance->disable_shrink_image_store);
35177ec681f3Smrg
35187ec681f3Smrg         if (lower_to_scalar)
35197ec681f3Smrg            nir_lower_alu_to_scalar(nir[i], NULL, NULL);
35207ec681f3Smrg
35217ec681f3Smrg         /* lower ALU operations */
35227ec681f3Smrg         nir_lower_int64(nir[i]);
35237ec681f3Smrg
35247ec681f3Smrg         nir_opt_idiv_const(nir[i], 8);
35257ec681f3Smrg
35267ec681f3Smrg         nir_lower_idiv(nir[i],
35277ec681f3Smrg                        &(nir_lower_idiv_options){
35287ec681f3Smrg                           .imprecise_32bit_lowering = false,
35297ec681f3Smrg                           .allow_fp16 = device->physical_device->rad_info.chip_class >= GFX9,
35307ec681f3Smrg                        });
35317ec681f3Smrg
35327ec681f3Smrg         nir_opt_sink(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies);
35337ec681f3Smrg         nir_opt_move(nir[i], nir_move_load_input | nir_move_const_undef | nir_move_copies);
35347ec681f3Smrg
35357ec681f3Smrg         /* Lower I/O intrinsics to memory instructions. */
35367ec681f3Smrg         bool io_to_mem = radv_lower_io_to_mem(device, nir[i], &infos[i], pipeline_key);
35377ec681f3Smrg         bool lowered_ngg = pipeline_has_ngg && i == pipeline->graphics.last_vgt_api_stage &&
35387ec681f3Smrg                            !radv_use_llvm_for_stage(device, i);
35397ec681f3Smrg         if (lowered_ngg)
35407ec681f3Smrg            radv_lower_ngg(device, nir[i], &infos[i], pipeline_key);
35417ec681f3Smrg
35427ec681f3Smrg         radv_optimize_nir_algebraic(nir[i], io_to_mem || lowered_ngg || i == MESA_SHADER_COMPUTE);
35437ec681f3Smrg
35447ec681f3Smrg         if (nir[i]->info.bit_sizes_int & (8 | 16)) {
35457ec681f3Smrg            if (device->physical_device->rad_info.chip_class >= GFX8) {
35467ec681f3Smrg               nir_convert_to_lcssa(nir[i], true, true);
35477ec681f3Smrg               nir_divergence_analysis(nir[i]);
35487ec681f3Smrg            }
35497ec681f3Smrg
35507ec681f3Smrg            if (nir_lower_bit_size(nir[i], lower_bit_size_callback, device)) {
35517ec681f3Smrg               NIR_PASS_V(nir[i], nir_opt_constant_folding);
35527ec681f3Smrg               NIR_PASS_V(nir[i], nir_opt_dce);
35537ec681f3Smrg            }
35547ec681f3Smrg
35557ec681f3Smrg            if (device->physical_device->rad_info.chip_class >= GFX8)
35567ec681f3Smrg               nir_opt_remove_phis(nir[i]); /* cleanup LCSSA phis */
35577ec681f3Smrg         }
35587ec681f3Smrg         if (((nir[i]->info.bit_sizes_int | nir[i]->info.bit_sizes_float) & 16) &&
35597ec681f3Smrg             device->physical_device->rad_info.chip_class >= GFX9)
35607ec681f3Smrg            NIR_PASS_V(nir[i], nir_opt_vectorize, opt_vectorize_callback, NULL);
35617ec681f3Smrg
35627ec681f3Smrg         /* cleanup passes */
35637ec681f3Smrg         nir_lower_load_const_to_scalar(nir[i]);
35647ec681f3Smrg         nir_move_options move_opts = nir_move_const_undef | nir_move_load_ubo |
35657ec681f3Smrg                                      nir_move_load_input | nir_move_comparisons | nir_move_copies;
35667ec681f3Smrg         nir_opt_sink(nir[i], move_opts | nir_move_load_ssbo);
35677ec681f3Smrg         nir_opt_move(nir[i], move_opts);
35687ec681f3Smrg
35697ec681f3Smrg         radv_stop_feedback(stage_feedbacks[i], false);
35707ec681f3Smrg      }
35717ec681f3Smrg   }
35727ec681f3Smrg
35737ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
35747ec681f3Smrg      if (radv_can_dump_shader(device, modules[i], false))
35757ec681f3Smrg         nir_print_shader(nir[i], stderr);
35767ec681f3Smrg   }
35777ec681f3Smrg
35787ec681f3Smrg   if (modules[MESA_SHADER_GEOMETRY]) {
35797ec681f3Smrg      struct radv_shader_binary *gs_copy_binary = NULL;
35807ec681f3Smrg      if (!pipeline_has_ngg) {
35817ec681f3Smrg         struct radv_shader_info info = {0};
35827ec681f3Smrg
35837ec681f3Smrg         if (infos[MESA_SHADER_GEOMETRY].vs.outinfo.export_clip_dists)
35847ec681f3Smrg            info.vs.outinfo.export_clip_dists = true;
35857ec681f3Smrg
35867ec681f3Smrg         radv_nir_shader_info_pass(device, nir[MESA_SHADER_GEOMETRY], pipeline_layout, pipeline_key,
35877ec681f3Smrg                                   &info);
35887ec681f3Smrg         info.wave_size = 64; /* Wave32 not supported. */
35897ec681f3Smrg         info.workgroup_size = 64; /* HW VS: separate waves, no workgroups */
35907ec681f3Smrg         info.ballot_bit_size = 64;
35917ec681f3Smrg
35927ec681f3Smrg         pipeline->gs_copy_shader = radv_create_gs_copy_shader(
35937ec681f3Smrg            device, nir[MESA_SHADER_GEOMETRY], &info, &gs_copy_binary, keep_executable_info,
35947ec681f3Smrg            keep_statistic_info, pipeline_key->has_multiview_view_index,
35957ec681f3Smrg            pipeline_key->optimisations_disabled);
35967ec681f3Smrg      }
35977ec681f3Smrg
35987ec681f3Smrg      if (!keep_executable_info && pipeline->gs_copy_shader) {
35997ec681f3Smrg         struct radv_shader_binary *gs_binaries[MESA_SHADER_STAGES] = {NULL};
36007ec681f3Smrg         struct radv_shader_variant *gs_variants[MESA_SHADER_STAGES] = {0};
36017ec681f3Smrg
36027ec681f3Smrg         gs_binaries[MESA_SHADER_GEOMETRY] = gs_copy_binary;
36037ec681f3Smrg         gs_variants[MESA_SHADER_GEOMETRY] = pipeline->gs_copy_shader;
36047ec681f3Smrg
36057ec681f3Smrg         radv_pipeline_cache_insert_shaders(device, cache, gs_copy_hash, gs_variants, gs_binaries,
36067ec681f3Smrg                                            NULL, 0);
36077ec681f3Smrg
36087ec681f3Smrg         pipeline->gs_copy_shader = gs_variants[MESA_SHADER_GEOMETRY];
36097ec681f3Smrg      }
36107ec681f3Smrg      free(gs_copy_binary);
36117ec681f3Smrg   }
36127ec681f3Smrg
36137ec681f3Smrg   if (nir[MESA_SHADER_FRAGMENT]) {
36147ec681f3Smrg      if (!pipeline->shaders[MESA_SHADER_FRAGMENT]) {
36157ec681f3Smrg         radv_start_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT]);
36167ec681f3Smrg
36177ec681f3Smrg         pipeline->shaders[MESA_SHADER_FRAGMENT] = radv_shader_variant_compile(
36187ec681f3Smrg            device, modules[MESA_SHADER_FRAGMENT], &nir[MESA_SHADER_FRAGMENT], 1, pipeline_layout,
36197ec681f3Smrg            pipeline_key, infos + MESA_SHADER_FRAGMENT, keep_executable_info,
36207ec681f3Smrg            keep_statistic_info, &binaries[MESA_SHADER_FRAGMENT]);
36217ec681f3Smrg
36227ec681f3Smrg         radv_stop_feedback(stage_feedbacks[MESA_SHADER_FRAGMENT], false);
36237ec681f3Smrg      }
36247ec681f3Smrg   }
36257ec681f3Smrg
36267ec681f3Smrg   if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_TESS_CTRL]) {
36277ec681f3Smrg      if (!pipeline->shaders[MESA_SHADER_TESS_CTRL]) {
36287ec681f3Smrg         struct nir_shader *combined_nir[] = {nir[MESA_SHADER_VERTEX], nir[MESA_SHADER_TESS_CTRL]};
36297ec681f3Smrg
36307ec681f3Smrg         radv_start_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL]);
36317ec681f3Smrg
36327ec681f3Smrg         pipeline->shaders[MESA_SHADER_TESS_CTRL] = radv_shader_variant_compile(
36337ec681f3Smrg            device, modules[MESA_SHADER_TESS_CTRL], combined_nir, 2, pipeline_layout, pipeline_key,
36347ec681f3Smrg            &infos[MESA_SHADER_TESS_CTRL], keep_executable_info, keep_statistic_info,
36357ec681f3Smrg            &binaries[MESA_SHADER_TESS_CTRL]);
36367ec681f3Smrg
36377ec681f3Smrg         radv_stop_feedback(stage_feedbacks[MESA_SHADER_TESS_CTRL], false);
36387ec681f3Smrg      }
36397ec681f3Smrg      modules[MESA_SHADER_VERTEX] = NULL;
36407ec681f3Smrg   }
36417ec681f3Smrg
36427ec681f3Smrg   if (device->physical_device->rad_info.chip_class >= GFX9 && modules[MESA_SHADER_GEOMETRY]) {
36437ec681f3Smrg      gl_shader_stage pre_stage =
36447ec681f3Smrg         modules[MESA_SHADER_TESS_EVAL] ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
36457ec681f3Smrg      if (!pipeline->shaders[MESA_SHADER_GEOMETRY]) {
36467ec681f3Smrg         struct nir_shader *combined_nir[] = {nir[pre_stage], nir[MESA_SHADER_GEOMETRY]};
36477ec681f3Smrg
36487ec681f3Smrg         radv_start_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY]);
36497ec681f3Smrg
36507ec681f3Smrg         pipeline->shaders[MESA_SHADER_GEOMETRY] = radv_shader_variant_compile(
36517ec681f3Smrg            device, modules[MESA_SHADER_GEOMETRY], combined_nir, 2, pipeline_layout, pipeline_key,
36527ec681f3Smrg            &infos[MESA_SHADER_GEOMETRY], keep_executable_info,
36537ec681f3Smrg            keep_statistic_info, &binaries[MESA_SHADER_GEOMETRY]);
36547ec681f3Smrg
36557ec681f3Smrg         radv_stop_feedback(stage_feedbacks[MESA_SHADER_GEOMETRY], false);
36567ec681f3Smrg      }
36577ec681f3Smrg      modules[pre_stage] = NULL;
36587ec681f3Smrg   }
36597ec681f3Smrg
36607ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
36617ec681f3Smrg      if (modules[i] && !pipeline->shaders[i]) {
36627ec681f3Smrg         radv_start_feedback(stage_feedbacks[i]);
36637ec681f3Smrg
36647ec681f3Smrg         pipeline->shaders[i] = radv_shader_variant_compile(
36657ec681f3Smrg            device, modules[i], &nir[i], 1, pipeline_layout, pipeline_key, infos + i,
36667ec681f3Smrg            keep_executable_info, keep_statistic_info, &binaries[i]);
36677ec681f3Smrg
36687ec681f3Smrg         radv_stop_feedback(stage_feedbacks[i], false);
36697ec681f3Smrg      }
36707ec681f3Smrg   }
36717ec681f3Smrg
36727ec681f3Smrg   if (!keep_executable_info) {
36737ec681f3Smrg      radv_pipeline_cache_insert_shaders(device, cache, hash, pipeline->shaders, binaries,
36747ec681f3Smrg                                         stack_sizes ? *stack_sizes : NULL,
36757ec681f3Smrg                                         num_stack_sizes ? *num_stack_sizes : 0);
36767ec681f3Smrg   }
36777ec681f3Smrg
36787ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
36797ec681f3Smrg      free(binaries[i]);
36807ec681f3Smrg      if (nir[i]) {
36817ec681f3Smrg         ralloc_free(nir[i]);
36827ec681f3Smrg
36837ec681f3Smrg         if (radv_can_dump_shader_stats(device, modules[i])) {
36847ec681f3Smrg            radv_dump_shader_stats(device, pipeline, i, stderr);
36857ec681f3Smrg         }
36867ec681f3Smrg      }
36877ec681f3Smrg   }
36887ec681f3Smrg
36897ec681f3Smrg   if (fs_m.nir)
36907ec681f3Smrg      ralloc_free(fs_m.nir);
36917ec681f3Smrg
36927ec681f3Smrg   radv_stop_feedback(pipeline_feedback, false);
36937ec681f3Smrg   return VK_SUCCESS;
369401e04c3fSmrg}
369501e04c3fSmrg
369601e04c3fSmrgstatic uint32_t
36977ec681f3Smrgradv_pipeline_stage_to_user_data_0(struct radv_pipeline *pipeline, gl_shader_stage stage,
36987ec681f3Smrg                                   enum chip_class chip_class)
36997ec681f3Smrg{
37007ec681f3Smrg   bool has_gs = radv_pipeline_has_gs(pipeline);
37017ec681f3Smrg   bool has_tess = radv_pipeline_has_tess(pipeline);
37027ec681f3Smrg   bool has_ngg = radv_pipeline_has_ngg(pipeline);
37037ec681f3Smrg
37047ec681f3Smrg   switch (stage) {
37057ec681f3Smrg   case MESA_SHADER_FRAGMENT:
37067ec681f3Smrg      return R_00B030_SPI_SHADER_USER_DATA_PS_0;
37077ec681f3Smrg   case MESA_SHADER_VERTEX:
37087ec681f3Smrg      if (has_tess) {
37097ec681f3Smrg         if (chip_class >= GFX10) {
37107ec681f3Smrg            return R_00B430_SPI_SHADER_USER_DATA_HS_0;
37117ec681f3Smrg         } else if (chip_class == GFX9) {
37127ec681f3Smrg            return R_00B430_SPI_SHADER_USER_DATA_LS_0;
37137ec681f3Smrg         } else {
37147ec681f3Smrg            return R_00B530_SPI_SHADER_USER_DATA_LS_0;
37157ec681f3Smrg         }
37167ec681f3Smrg      }
37177ec681f3Smrg
37187ec681f3Smrg      if (has_gs) {
37197ec681f3Smrg         if (chip_class >= GFX10) {
37207ec681f3Smrg            return R_00B230_SPI_SHADER_USER_DATA_GS_0;
37217ec681f3Smrg         } else {
37227ec681f3Smrg            return R_00B330_SPI_SHADER_USER_DATA_ES_0;
37237ec681f3Smrg         }
37247ec681f3Smrg      }
37257ec681f3Smrg
37267ec681f3Smrg      if (has_ngg)
37277ec681f3Smrg         return R_00B230_SPI_SHADER_USER_DATA_GS_0;
37287ec681f3Smrg
37297ec681f3Smrg      return R_00B130_SPI_SHADER_USER_DATA_VS_0;
37307ec681f3Smrg   case MESA_SHADER_GEOMETRY:
37317ec681f3Smrg      return chip_class == GFX9 ? R_00B330_SPI_SHADER_USER_DATA_ES_0
37327ec681f3Smrg                                : R_00B230_SPI_SHADER_USER_DATA_GS_0;
37337ec681f3Smrg   case MESA_SHADER_COMPUTE:
37347ec681f3Smrg      return R_00B900_COMPUTE_USER_DATA_0;
37357ec681f3Smrg   case MESA_SHADER_TESS_CTRL:
37367ec681f3Smrg      return chip_class == GFX9 ? R_00B430_SPI_SHADER_USER_DATA_LS_0
37377ec681f3Smrg                                : R_00B430_SPI_SHADER_USER_DATA_HS_0;
37387ec681f3Smrg   case MESA_SHADER_TESS_EVAL:
37397ec681f3Smrg      if (has_gs) {
37407ec681f3Smrg         return chip_class >= GFX10 ? R_00B230_SPI_SHADER_USER_DATA_GS_0
37417ec681f3Smrg                                    : R_00B330_SPI_SHADER_USER_DATA_ES_0;
37427ec681f3Smrg      } else if (has_ngg) {
37437ec681f3Smrg         return R_00B230_SPI_SHADER_USER_DATA_GS_0;
37447ec681f3Smrg      } else {
37457ec681f3Smrg         return R_00B130_SPI_SHADER_USER_DATA_VS_0;
37467ec681f3Smrg      }
37477ec681f3Smrg   default:
37487ec681f3Smrg      unreachable("unknown shader");
37497ec681f3Smrg   }
375001e04c3fSmrg}
375101e04c3fSmrg
/* One row of a binning lookup table: rows are scanned in order and a row
 * applies while the NEXT row's bpp threshold still exceeds the computed
 * bytes-per-pixel cost (see the `while (entry[1].bpp <= ...)` loops in
 * radv_gfx9_compute_bin_size()).
 */
struct radv_bin_size_entry {
   unsigned bpp;     /* bytes-per-pixel threshold at which this row stops applying */
   VkExtent2D extent; /* bin size to use; {0, 0} means binning is disabled */
};
375601e04c3fSmrg
/* Compute the primitive-binning bin size for GFX9 from hardware-tuned lookup
 * tables, indexed by [log2(RBs per SE)][log2(SE count)] and scanned by the
 * total color (resp. depth/stencil) bytes-per-pixel cost.  Returns {0, 0}
 * when the cost is high enough that binning should be disabled.
 * NOTE(review): the table values appear to be hand-tuned per RB/SE config —
 * do not edit individual entries without hardware data to back the change.
 */
static VkExtent2D
radv_gfx9_compute_bin_size(const struct radv_pipeline *pipeline,
                           const VkGraphicsPipelineCreateInfo *pCreateInfo)
{
   static const struct radv_bin_size_entry color_size_table[][3][9] = {
      {
         /* One RB / SE */
         {
            /* One shader engine */
            {0, {128, 128}},
            {1, {64, 128}},
            {2, {32, 128}},
            {3, {16, 128}},
            {17, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            /* Two shader engines */
            {0, {128, 128}},
            {2, {64, 128}},
            {3, {32, 128}},
            {5, {16, 128}},
            {17, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            /* Four shader engines */
            {0, {128, 128}},
            {3, {64, 128}},
            {5, {16, 128}},
            {17, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
      },
      {
         /* Two RB / SE */
         {
            /* One shader engine */
            {0, {128, 128}},
            {2, {64, 128}},
            {3, {32, 128}},
            {5, {16, 128}},
            {33, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            /* Two shader engines */
            {0, {128, 128}},
            {3, {64, 128}},
            {5, {32, 128}},
            {9, {16, 128}},
            {33, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            /* Four shader engines */
            {0, {256, 256}},
            {2, {128, 256}},
            {3, {128, 128}},
            {5, {64, 128}},
            {9, {16, 128}},
            {33, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
      },
      {
         /* Four RB / SE */
         {
            /* One shader engine */
            {0, {128, 256}},
            {2, {128, 128}},
            {3, {64, 128}},
            {5, {32, 128}},
            {9, {16, 128}},
            {33, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            /* Two shader engines */
            {0, {256, 256}},
            {2, {128, 256}},
            {3, {128, 128}},
            {5, {64, 128}},
            {9, {32, 128}},
            {17, {16, 128}},
            {33, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            /* Four shader engines */
            {0, {256, 512}},
            {2, {256, 256}},
            {3, {128, 256}},
            {5, {128, 128}},
            {9, {64, 128}},
            {17, {16, 128}},
            {33, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
      },
   };
   static const struct radv_bin_size_entry ds_size_table[][3][9] = {
      {
         // One RB / SE
         {
            // One shader engine
            {0, {128, 256}},
            {2, {128, 128}},
            {4, {64, 128}},
            {7, {32, 128}},
            {13, {16, 128}},
            {49, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            // Two shader engines
            {0, {256, 256}},
            {2, {128, 256}},
            {4, {128, 128}},
            {7, {64, 128}},
            {13, {32, 128}},
            {25, {16, 128}},
            {49, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            // Four shader engines
            {0, {256, 512}},
            {2, {256, 256}},
            {4, {128, 256}},
            {7, {128, 128}},
            {13, {64, 128}},
            {25, {16, 128}},
            {49, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
      },
      {
         // Two RB / SE
         {
            // One shader engine
            {0, {256, 256}},
            {2, {128, 256}},
            {4, {128, 128}},
            {7, {64, 128}},
            {13, {32, 128}},
            {25, {16, 128}},
            {97, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            // Two shader engines
            {0, {256, 512}},
            {2, {256, 256}},
            {4, {128, 256}},
            {7, {128, 128}},
            {13, {64, 128}},
            {25, {32, 128}},
            {49, {16, 128}},
            {97, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
         {
            // Four shader engines
            {0, {512, 512}},
            {2, {256, 512}},
            {4, {256, 256}},
            {7, {128, 256}},
            {13, {128, 128}},
            {25, {64, 128}},
            {49, {16, 128}},
            {97, {0, 0}},
            {UINT_MAX, {0, 0}},
         },
      },
      {
         // Four RB / SE
         {
            // One shader engine
            {0, {256, 512}},
            {2, {256, 256}},
            {4, {128, 256}},
            {7, {128, 128}},
            {13, {64, 128}},
            {25, {32, 128}},
            {49, {16, 128}},
            {UINT_MAX, {0, 0}},
         },
         {
            // Two shader engines
            {0, {512, 512}},
            {2, {256, 512}},
            {4, {256, 256}},
            {7, {128, 256}},
            {13, {128, 128}},
            {25, {64, 128}},
            {49, {32, 128}},
            {97, {16, 128}},
            {UINT_MAX, {0, 0}},
         },
         {
            // Four shader engines
            {0, {512, 512}},
            {4, {256, 512}},
            {7, {256, 256}},
            {13, {128, 256}},
            {25, {128, 128}},
            {49, {64, 128}},
            {97, {16, 128}},
            {UINT_MAX, {0, 0}},
         },
      },
   };

   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
   VkExtent2D extent = {512, 512};

   /* Table indices: log2 of render backends per shader engine, and log2 of
    * the shader-engine count.
    */
   unsigned log_num_rb_per_se =
      util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_render_backends /
                         pipeline->device->physical_device->rad_info.max_se);
   unsigned log_num_se = util_logbase2_ceil(pipeline->device->physical_device->rad_info.max_se);

   unsigned total_samples = 1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config);
   unsigned ps_iter_samples = 1u << G_028804_PS_ITER_SAMPLES(pipeline->graphics.ms.db_eqaa);
   unsigned effective_samples = total_samples;
   unsigned color_bytes_per_pixel = 0;

   const VkPipelineColorBlendStateCreateInfo *vkblend =
      radv_pipeline_get_color_blend_state(pCreateInfo);
   if (vkblend) {
      /* Sum the per-pixel cost over all written, bound color attachments. */
      for (unsigned i = 0; i < subpass->color_count; i++) {
         if (!vkblend->pAttachments[i].colorWriteMask)
            continue;

         if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
            continue;

         VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
         color_bytes_per_pixel += vk_format_get_blocksize(format);
      }

      /* MSAA images typically don't use all samples all the time. */
      if (effective_samples >= 2 && ps_iter_samples <= 1)
         effective_samples = 2;
      color_bytes_per_pixel *= effective_samples;
   }

   /* Advance to the row covering the computed color cost (see the
    * radv_bin_size_entry scan convention).
    */
   const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
   while (color_entry[1].bpp <= color_bytes_per_pixel)
      ++color_entry;

   extent = color_entry->extent;

   if (subpass->depth_stencil_attachment) {
      struct radv_render_pass_attachment *attachment =
         pass->attachments + subpass->depth_stencil_attachment->attachment;

      /* Coefficients taken from AMDVLK */
      unsigned depth_coeff = vk_format_has_depth(attachment->format) ? 5 : 0;
      unsigned stencil_coeff = vk_format_has_stencil(attachment->format) ? 1 : 0;
      unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;

      const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
      while (ds_entry[1].bpp <= ds_bytes_per_pixel)
         ++ds_entry;

      /* Use whichever of the color/DS bins is smaller by area. */
      if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
         extent = ds_entry->extent;
   }

   return extent;
}
40307ec681f3Smrg
40317ec681f3Smrgstatic VkExtent2D
40327ec681f3Smrgradv_gfx10_compute_bin_size(const struct radv_pipeline *pipeline,
40337ec681f3Smrg                            const VkGraphicsPipelineCreateInfo *pCreateInfo)
40347ec681f3Smrg{
40357ec681f3Smrg   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
40367ec681f3Smrg   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
40377ec681f3Smrg   VkExtent2D extent = {512, 512};
40387ec681f3Smrg
40397ec681f3Smrg   const unsigned db_tag_size = 64;
40407ec681f3Smrg   const unsigned db_tag_count = 312;
40417ec681f3Smrg   const unsigned color_tag_size = 1024;
40427ec681f3Smrg   const unsigned color_tag_count = 31;
40437ec681f3Smrg   const unsigned fmask_tag_size = 256;
40447ec681f3Smrg   const unsigned fmask_tag_count = 44;
40457ec681f3Smrg
40467ec681f3Smrg   const unsigned rb_count = pipeline->device->physical_device->rad_info.max_render_backends;
40477ec681f3Smrg   const unsigned pipe_count =
40487ec681f3Smrg      MAX2(rb_count, pipeline->device->physical_device->rad_info.num_tcc_blocks);
40497ec681f3Smrg
40507ec681f3Smrg   const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
40517ec681f3Smrg   const unsigned color_tag_part =
40527ec681f3Smrg      (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
40537ec681f3Smrg   const unsigned fmask_tag_part =
40547ec681f3Smrg      (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
40557ec681f3Smrg
40567ec681f3Smrg   const unsigned total_samples =
40577ec681f3Smrg      1u << G_028BE0_MSAA_NUM_SAMPLES(pipeline->graphics.ms.pa_sc_aa_config);
40587ec681f3Smrg   const unsigned samples_log = util_logbase2_ceil(total_samples);
40597ec681f3Smrg
40607ec681f3Smrg   unsigned color_bytes_per_pixel = 0;
40617ec681f3Smrg   unsigned fmask_bytes_per_pixel = 0;
40627ec681f3Smrg
40637ec681f3Smrg   const VkPipelineColorBlendStateCreateInfo *vkblend =
40647ec681f3Smrg      radv_pipeline_get_color_blend_state(pCreateInfo);
40657ec681f3Smrg   if (vkblend) {
40667ec681f3Smrg      for (unsigned i = 0; i < subpass->color_count; i++) {
40677ec681f3Smrg         if (!vkblend->pAttachments[i].colorWriteMask)
40687ec681f3Smrg            continue;
40697ec681f3Smrg
40707ec681f3Smrg         if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
40717ec681f3Smrg            continue;
40727ec681f3Smrg
40737ec681f3Smrg         VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
40747ec681f3Smrg         color_bytes_per_pixel += vk_format_get_blocksize(format);
40757ec681f3Smrg
40767ec681f3Smrg         if (total_samples > 1) {
40777ec681f3Smrg            assert(samples_log <= 3);
40787ec681f3Smrg            const unsigned fmask_array[] = {0, 1, 1, 4};
40797ec681f3Smrg            fmask_bytes_per_pixel += fmask_array[samples_log];
40807ec681f3Smrg         }
40817ec681f3Smrg      }
40827ec681f3Smrg
40837ec681f3Smrg      color_bytes_per_pixel *= total_samples;
40847ec681f3Smrg   }
40857ec681f3Smrg   color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
40867ec681f3Smrg
40877ec681f3Smrg   const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
40887ec681f3Smrg   extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
40897ec681f3Smrg   extent.height = 1ull << (color_pixel_count_log / 2);
40907ec681f3Smrg
40917ec681f3Smrg   if (fmask_bytes_per_pixel) {
40927ec681f3Smrg      const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
40937ec681f3Smrg
40947ec681f3Smrg      const VkExtent2D fmask_extent =
40957ec681f3Smrg         (VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
40967ec681f3Smrg                      .height = 1ull << (color_pixel_count_log / 2)};
40977ec681f3Smrg
40987ec681f3Smrg      if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
40997ec681f3Smrg         extent = fmask_extent;
41007ec681f3Smrg   }
41017ec681f3Smrg
41027ec681f3Smrg   if (subpass->depth_stencil_attachment) {
41037ec681f3Smrg      struct radv_render_pass_attachment *attachment =
41047ec681f3Smrg         pass->attachments + subpass->depth_stencil_attachment->attachment;
41057ec681f3Smrg
41067ec681f3Smrg      /* Coefficients taken from AMDVLK */
41077ec681f3Smrg      unsigned depth_coeff = vk_format_has_depth(attachment->format) ? 5 : 0;
41087ec681f3Smrg      unsigned stencil_coeff = vk_format_has_stencil(attachment->format) ? 1 : 0;
41097ec681f3Smrg      unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
41107ec681f3Smrg
41117ec681f3Smrg      const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
41127ec681f3Smrg
41137ec681f3Smrg      const VkExtent2D db_extent = (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2),
41147ec681f3Smrg                                                .height = 1ull << (color_pixel_count_log / 2)};
41157ec681f3Smrg
41167ec681f3Smrg      if (db_extent.width * db_extent.height < extent.width * extent.height)
41177ec681f3Smrg         extent = db_extent;
41187ec681f3Smrg   }
41197ec681f3Smrg
41207ec681f3Smrg   extent.width = MAX2(extent.width, 128);
41217ec681f3Smrg   extent.height = MAX2(extent.width, 64);
41227ec681f3Smrg
41237ec681f3Smrg   return extent;
412401e04c3fSmrg}
412501e04c3fSmrg
412601e04c3fSmrgstatic void
41277ec681f3Smrgradv_pipeline_init_disabled_binning_state(struct radv_pipeline *pipeline,
41287ec681f3Smrg                                          const VkGraphicsPipelineCreateInfo *pCreateInfo)
41297ec681f3Smrg{
41307ec681f3Smrg   uint32_t pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
41317ec681f3Smrg                                  S_028C44_DISABLE_START_OF_PRIM(1);
41327ec681f3Smrg
41337ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
41347ec681f3Smrg      RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
41357ec681f3Smrg      struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
41367ec681f3Smrg      const VkPipelineColorBlendStateCreateInfo *vkblend =
41377ec681f3Smrg         radv_pipeline_get_color_blend_state(pCreateInfo);
41387ec681f3Smrg      unsigned min_bytes_per_pixel = 0;
41397ec681f3Smrg
41407ec681f3Smrg      if (vkblend) {
41417ec681f3Smrg         for (unsigned i = 0; i < subpass->color_count; i++) {
41427ec681f3Smrg            if (!vkblend->pAttachments[i].colorWriteMask)
41437ec681f3Smrg               continue;
41447ec681f3Smrg
41457ec681f3Smrg            if (subpass->color_attachments[i].attachment == VK_ATTACHMENT_UNUSED)
41467ec681f3Smrg               continue;
41477ec681f3Smrg
41487ec681f3Smrg            VkFormat format = pass->attachments[subpass->color_attachments[i].attachment].format;
41497ec681f3Smrg            unsigned bytes = vk_format_get_blocksize(format);
41507ec681f3Smrg            if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
41517ec681f3Smrg               min_bytes_per_pixel = bytes;
41527ec681f3Smrg         }
41537ec681f3Smrg      }
41547ec681f3Smrg
41557ec681f3Smrg      pa_sc_binner_cntl_0 =
41567ec681f3Smrg         S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_NEW_SC) | S_028C44_BIN_SIZE_X(0) |
41577ec681f3Smrg         S_028C44_BIN_SIZE_Y(0) | S_028C44_BIN_SIZE_X_EXTEND(2) |       /* 128 */
41587ec681f3Smrg         S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
41597ec681f3Smrg         S_028C44_DISABLE_START_OF_PRIM(1);
41607ec681f3Smrg   }
41617ec681f3Smrg
41627ec681f3Smrg   pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
416301e04c3fSmrg}
416401e04c3fSmrg
41657ec681f3Smrgstruct radv_binning_settings
41667ec681f3Smrgradv_get_binning_settings(const struct radv_physical_device *pdev)
41677ec681f3Smrg{
41687ec681f3Smrg   struct radv_binning_settings settings;
41697ec681f3Smrg   if (pdev->rad_info.has_dedicated_vram) {
41707ec681f3Smrg      if (pdev->rad_info.max_render_backends > 4) {
41717ec681f3Smrg         settings.context_states_per_bin = 1;
41727ec681f3Smrg         settings.persistent_states_per_bin = 1;
41737ec681f3Smrg      } else {
41747ec681f3Smrg         settings.context_states_per_bin = 3;
41757ec681f3Smrg         settings.persistent_states_per_bin = 8;
41767ec681f3Smrg      }
41777ec681f3Smrg      settings.fpovs_per_batch = 63;
41787ec681f3Smrg   } else {
41797ec681f3Smrg      /* The context states are affected by the scissor bug. */
41807ec681f3Smrg      settings.context_states_per_bin = 6;
41817ec681f3Smrg      /* 32 causes hangs for RAVEN. */
41827ec681f3Smrg      settings.persistent_states_per_bin = 16;
41837ec681f3Smrg      settings.fpovs_per_batch = 63;
41847ec681f3Smrg   }
41857ec681f3Smrg
41867ec681f3Smrg   if (pdev->rad_info.has_gfx9_scissor_bug)
41877ec681f3Smrg      settings.context_states_per_bin = 1;
41887ec681f3Smrg
41897ec681f3Smrg   return settings;
41907ec681f3Smrg}
41917ec681f3Smrg
41927ec681f3Smrgstatic void
41937ec681f3Smrgradv_pipeline_init_binning_state(struct radv_pipeline *pipeline,
41947ec681f3Smrg                                 const VkGraphicsPipelineCreateInfo *pCreateInfo,
41957ec681f3Smrg                                 const struct radv_blend_state *blend)
41967ec681f3Smrg{
41977ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class < GFX9)
41987ec681f3Smrg      return;
41997ec681f3Smrg
42007ec681f3Smrg   VkExtent2D bin_size;
42017ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
42027ec681f3Smrg      bin_size = radv_gfx10_compute_bin_size(pipeline, pCreateInfo);
42037ec681f3Smrg   } else if (pipeline->device->physical_device->rad_info.chip_class == GFX9) {
42047ec681f3Smrg      bin_size = radv_gfx9_compute_bin_size(pipeline, pCreateInfo);
42057ec681f3Smrg   } else
42067ec681f3Smrg      unreachable("Unhandled generation for binning bin size calculation");
42077ec681f3Smrg
42087ec681f3Smrg   if (pipeline->device->pbb_allowed && bin_size.width && bin_size.height) {
42097ec681f3Smrg      struct radv_binning_settings settings =
42107ec681f3Smrg         radv_get_binning_settings(pipeline->device->physical_device);
42117ec681f3Smrg
42127ec681f3Smrg      const uint32_t pa_sc_binner_cntl_0 =
42137ec681f3Smrg         S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
42147ec681f3Smrg         S_028C44_BIN_SIZE_X(bin_size.width == 16) | S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
42157ec681f3Smrg         S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
42167ec681f3Smrg         S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
42177ec681f3Smrg         S_028C44_CONTEXT_STATES_PER_BIN(settings.context_states_per_bin - 1) |
42187ec681f3Smrg         S_028C44_PERSISTENT_STATES_PER_BIN(settings.persistent_states_per_bin - 1) |
42197ec681f3Smrg         S_028C44_DISABLE_START_OF_PRIM(1) |
42207ec681f3Smrg         S_028C44_FPOVS_PER_BATCH(settings.fpovs_per_batch) | S_028C44_OPTIMAL_BIN_SELECTION(1);
42217ec681f3Smrg
42227ec681f3Smrg      pipeline->graphics.binning.pa_sc_binner_cntl_0 = pa_sc_binner_cntl_0;
42237ec681f3Smrg   } else
42247ec681f3Smrg      radv_pipeline_init_disabled_binning_state(pipeline, pCreateInfo);
42257ec681f3Smrg}
422601e04c3fSmrg
422701e04c3fSmrgstatic void
4228ed98bd31Smayaradv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *ctx_cs,
42297ec681f3Smrg                                           const struct radv_pipeline *pipeline,
423001e04c3fSmrg                                           const VkGraphicsPipelineCreateInfo *pCreateInfo,
423101e04c3fSmrg                                           const struct radv_graphics_pipeline_create_info *extra)
423201e04c3fSmrg{
42337ec681f3Smrg   const VkPipelineDepthStencilStateCreateInfo *vkds =
42347ec681f3Smrg      radv_pipeline_get_depth_stencil_state(pCreateInfo);
42357ec681f3Smrg   RADV_FROM_HANDLE(radv_render_pass, pass, pCreateInfo->renderPass);
42367ec681f3Smrg   struct radv_subpass *subpass = pass->subpasses + pCreateInfo->subpass;
42377ec681f3Smrg   struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
42387ec681f3Smrg   struct radv_render_pass_attachment *attachment = NULL;
42397ec681f3Smrg   uint32_t db_render_control = 0, db_render_override2 = 0;
42407ec681f3Smrg   uint32_t db_render_override = 0;
42417ec681f3Smrg
42427ec681f3Smrg   if (subpass->depth_stencil_attachment)
42437ec681f3Smrg      attachment = pass->attachments + subpass->depth_stencil_attachment->attachment;
42447ec681f3Smrg
42457ec681f3Smrg   bool has_depth_attachment = attachment && vk_format_has_depth(attachment->format);
42467ec681f3Smrg
42477ec681f3Smrg   if (vkds && has_depth_attachment) {
42487ec681f3Smrg      /* from amdvlk: For 4xAA and 8xAA need to decompress on flush for better performance */
42497ec681f3Smrg      db_render_override2 |= S_028010_DECOMPRESS_Z_ON_FLUSH(attachment->samples > 2);
42507ec681f3Smrg
42517ec681f3Smrg      if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
42527ec681f3Smrg         db_render_override2 |= S_028010_CENTROID_COMPUTATION_MODE(1);
42537ec681f3Smrg   }
42547ec681f3Smrg
42557ec681f3Smrg   if (attachment && extra) {
42567ec681f3Smrg      db_render_control |= S_028000_DEPTH_CLEAR_ENABLE(extra->db_depth_clear);
42577ec681f3Smrg      db_render_control |= S_028000_STENCIL_CLEAR_ENABLE(extra->db_stencil_clear);
42587ec681f3Smrg
42597ec681f3Smrg      db_render_control |= S_028000_RESUMMARIZE_ENABLE(extra->resummarize_enable);
42607ec681f3Smrg      db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(extra->depth_compress_disable);
42617ec681f3Smrg      db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(extra->stencil_compress_disable);
42627ec681f3Smrg   }
42637ec681f3Smrg
42647ec681f3Smrg   db_render_override |= S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
42657ec681f3Smrg                         S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
42667ec681f3Smrg
42677ec681f3Smrg   if (!pCreateInfo->pRasterizationState->depthClampEnable && ps->info.ps.writes_z) {
42687ec681f3Smrg      /* From VK_EXT_depth_range_unrestricted spec:
42697ec681f3Smrg       *
42707ec681f3Smrg       * "The behavior described in Primitive Clipping still applies.
42717ec681f3Smrg       *  If depth clamping is disabled the depth values are still
42727ec681f3Smrg       *  clipped to 0 ≤ zc ≤ wc before the viewport transform. If
42737ec681f3Smrg       *  depth clamping is enabled the above equation is ignored and
42747ec681f3Smrg       *  the depth values are instead clamped to the VkViewport
42757ec681f3Smrg       *  minDepth and maxDepth values, which in the case of this
42767ec681f3Smrg       *  extension can be outside of the 0.0 to 1.0 range."
42777ec681f3Smrg       */
42787ec681f3Smrg      db_render_override |= S_02800C_DISABLE_VIEWPORT_CLAMP(1);
42797ec681f3Smrg   }
42807ec681f3Smrg
42817ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028000_DB_RENDER_CONTROL, db_render_control);
42827ec681f3Smrg
42837ec681f3Smrg   radeon_set_context_reg_seq(ctx_cs, R_02800C_DB_RENDER_OVERRIDE, 2);
42847ec681f3Smrg   radeon_emit(ctx_cs, db_render_override);
42857ec681f3Smrg   radeon_emit(ctx_cs, db_render_override2);
428601e04c3fSmrg}
428701e04c3fSmrg
428801e04c3fSmrgstatic void
4289ed98bd31Smayaradv_pipeline_generate_blend_state(struct radeon_cmdbuf *ctx_cs,
42907ec681f3Smrg                                   const struct radv_pipeline *pipeline,
429101e04c3fSmrg                                   const struct radv_blend_state *blend)
429201e04c3fSmrg{
42937ec681f3Smrg   radeon_set_context_reg_seq(ctx_cs, R_028780_CB_BLEND0_CONTROL, 8);
42947ec681f3Smrg   radeon_emit_array(ctx_cs, blend->cb_blend_control, 8);
42957ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask);
429601e04c3fSmrg
42977ec681f3Smrg   if (pipeline->device->physical_device->rad_info.has_rbplus) {
429801e04c3fSmrg
42997ec681f3Smrg      radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8);
43007ec681f3Smrg      radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8);
43017ec681f3Smrg   }
430201e04c3fSmrg
43037ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format);
430401e04c3fSmrg
43057ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
430601e04c3fSmrg}
430701e04c3fSmrg
430801e04c3fSmrgstatic void
4309ed98bd31Smayaradv_pipeline_generate_raster_state(struct radeon_cmdbuf *ctx_cs,
43107ec681f3Smrg                                    const struct radv_pipeline *pipeline,
431101e04c3fSmrg                                    const VkGraphicsPipelineCreateInfo *pCreateInfo)
431201e04c3fSmrg{
43137ec681f3Smrg   const VkPipelineRasterizationStateCreateInfo *vkraster = pCreateInfo->pRasterizationState;
43147ec681f3Smrg   const VkConservativeRasterizationModeEXT mode = radv_get_conservative_raster_mode(vkraster);
43157ec681f3Smrg   uint32_t pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
43167ec681f3Smrg
43177ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
43187ec681f3Smrg      /* Conservative rasterization. */
43197ec681f3Smrg      if (mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
43207ec681f3Smrg         pa_sc_conservative_rast = S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) |
43217ec681f3Smrg                                   S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
43227ec681f3Smrg
43237ec681f3Smrg         if (mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT) {
43247ec681f3Smrg            pa_sc_conservative_rast |=
43257ec681f3Smrg               S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_OVER_RAST_SAMPLE_SELECT(0) |
43267ec681f3Smrg               S_028C4C_UNDER_RAST_ENABLE(0) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
43277ec681f3Smrg               S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
43287ec681f3Smrg         } else {
43297ec681f3Smrg            assert(mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT);
43307ec681f3Smrg            pa_sc_conservative_rast |=
43317ec681f3Smrg               S_028C4C_OVER_RAST_ENABLE(0) | S_028C4C_OVER_RAST_SAMPLE_SELECT(1) |
43327ec681f3Smrg               S_028C4C_UNDER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(0) |
43337ec681f3Smrg               S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(0);
43347ec681f3Smrg         }
43357ec681f3Smrg      }
43367ec681f3Smrg
43377ec681f3Smrg      radeon_set_context_reg(ctx_cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
43387ec681f3Smrg                             pa_sc_conservative_rast);
43397ec681f3Smrg   }
434001e04c3fSmrg}
434101e04c3fSmrg
434201e04c3fSmrgstatic void
4343ed98bd31Smayaradv_pipeline_generate_multisample_state(struct radeon_cmdbuf *ctx_cs,
43447ec681f3Smrg                                         const struct radv_pipeline *pipeline)
434501e04c3fSmrg{
43467ec681f3Smrg   const struct radv_multisample_state *ms = &pipeline->graphics.ms;
43477ec681f3Smrg
43487ec681f3Smrg   radeon_set_context_reg_seq(ctx_cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
43497ec681f3Smrg   radeon_emit(ctx_cs, ms->pa_sc_aa_mask[0]);
43507ec681f3Smrg   radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]);
43517ec681f3Smrg
43527ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa);
43537ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028BE0_PA_SC_AA_CONFIG, ms->pa_sc_aa_config);
43547ec681f3Smrg
43557ec681f3Smrg   radeon_set_context_reg_seq(ctx_cs, R_028A48_PA_SC_MODE_CNTL_0, 2);
43567ec681f3Smrg   radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_0);
43577ec681f3Smrg   radeon_emit(ctx_cs, ms->pa_sc_mode_cntl_1);
43587ec681f3Smrg
43597ec681f3Smrg   /* The exclusion bits can be set to improve rasterization efficiency
43607ec681f3Smrg    * if no sample lies on the pixel boundary (-8 sample offset). It's
43617ec681f3Smrg    * currently always TRUE because the driver doesn't support 16 samples.
43627ec681f3Smrg    */
43637ec681f3Smrg   bool exclusion = pipeline->device->physical_device->rad_info.chip_class >= GFX7;
43647ec681f3Smrg   radeon_set_context_reg(
43657ec681f3Smrg      ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL,
43667ec681f3Smrg      S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
436701e04c3fSmrg}
436801e04c3fSmrg
436901e04c3fSmrgstatic void
4370ed98bd31Smayaradv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
43717ec681f3Smrg                                   const struct radv_pipeline *pipeline)
437201e04c3fSmrg{
43737ec681f3Smrg   const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
43747ec681f3Smrg   const struct radv_shader_variant *vs = pipeline->shaders[MESA_SHADER_TESS_EVAL]
43757ec681f3Smrg                                             ? pipeline->shaders[MESA_SHADER_TESS_EVAL]
43767ec681f3Smrg                                             : pipeline->shaders[MESA_SHADER_VERTEX];
43777ec681f3Smrg   unsigned vgt_primitiveid_en = 0;
43787ec681f3Smrg   uint32_t vgt_gs_mode = 0;
43797ec681f3Smrg
43807ec681f3Smrg   if (radv_pipeline_has_ngg(pipeline))
43817ec681f3Smrg      return;
43827ec681f3Smrg
43837ec681f3Smrg   if (radv_pipeline_has_gs(pipeline)) {
43847ec681f3Smrg      const struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
43857ec681f3Smrg
43867ec681f3Smrg      vgt_gs_mode = ac_vgt_gs_mode(gs->info.gs.vertices_out,
43877ec681f3Smrg                                   pipeline->device->physical_device->rad_info.chip_class);
43887ec681f3Smrg   } else if (outinfo->export_prim_id || vs->info.uses_prim_id) {
43897ec681f3Smrg      vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
43907ec681f3Smrg      vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
43917ec681f3Smrg   }
43927ec681f3Smrg
43937ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
43947ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028A40_VGT_GS_MODE, vgt_gs_mode);
439501e04c3fSmrg}
439601e04c3fSmrg
439701e04c3fSmrgstatic void
43987ec681f3Smrgradv_pipeline_generate_hw_vs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
43997ec681f3Smrg                             const struct radv_pipeline *pipeline,
44007ec681f3Smrg                             const struct radv_shader_variant *shader)
44017ec681f3Smrg{
44027ec681f3Smrg   uint64_t va = radv_shader_variant_get_va(shader);
44037ec681f3Smrg
44047ec681f3Smrg   radeon_set_sh_reg_seq(cs, R_00B120_SPI_SHADER_PGM_LO_VS, 4);
44057ec681f3Smrg   radeon_emit(cs, va >> 8);
44067ec681f3Smrg   radeon_emit(cs, S_00B124_MEM_BASE(va >> 40));
44077ec681f3Smrg   radeon_emit(cs, shader->config.rsrc1);
44087ec681f3Smrg   radeon_emit(cs, shader->config.rsrc2);
44097ec681f3Smrg
44107ec681f3Smrg   const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
44117ec681f3Smrg   unsigned clip_dist_mask, cull_dist_mask, total_mask;
44127ec681f3Smrg   clip_dist_mask = outinfo->clip_dist_mask;
44137ec681f3Smrg   cull_dist_mask = outinfo->cull_dist_mask;
44147ec681f3Smrg   total_mask = clip_dist_mask | cull_dist_mask;
44157ec681f3Smrg
44167ec681f3Smrg   bool writes_primitive_shading_rate =
44177ec681f3Smrg      outinfo->writes_primitive_shading_rate || pipeline->device->force_vrs != RADV_FORCE_VRS_NONE;
44187ec681f3Smrg   bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
44197ec681f3Smrg                       outinfo->writes_viewport_index || writes_primitive_shading_rate;
44207ec681f3Smrg   unsigned spi_vs_out_config, nparams;
44217ec681f3Smrg
44227ec681f3Smrg   /* VS is required to export at least one param. */
44237ec681f3Smrg   nparams = MAX2(outinfo->param_exports, 1);
44247ec681f3Smrg   spi_vs_out_config = S_0286C4_VS_EXPORT_COUNT(nparams - 1);
44257ec681f3Smrg
44267ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
44277ec681f3Smrg      spi_vs_out_config |= S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0);
44287ec681f3Smrg   }
44297ec681f3Smrg
44307ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG, spi_vs_out_config);
44317ec681f3Smrg
44327ec681f3Smrg   radeon_set_context_reg(
44337ec681f3Smrg      ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
44347ec681f3Smrg      S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
44357ec681f3Smrg         S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
44367ec681f3Smrg                                                              : V_02870C_SPI_SHADER_NONE) |
44377ec681f3Smrg         S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
44387ec681f3Smrg                                                              : V_02870C_SPI_SHADER_NONE) |
44397ec681f3Smrg         S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
44407ec681f3Smrg                                                              : V_02870C_SPI_SHADER_NONE));
44417ec681f3Smrg
44427ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
44437ec681f3Smrg                          S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
44447ec681f3Smrg                             S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
44457ec681f3Smrg                             S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
44467ec681f3Smrg                             S_02881C_USE_VTX_VRS_RATE(writes_primitive_shading_rate) |
44477ec681f3Smrg                             S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
44487ec681f3Smrg                             S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
44497ec681f3Smrg                             S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
44507ec681f3Smrg                             S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
44517ec681f3Smrg                             total_mask << 8 | clip_dist_mask);
44527ec681f3Smrg
44537ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class <= GFX8)
44547ec681f3Smrg      radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF, outinfo->writes_viewport_index);
44557ec681f3Smrg
44567ec681f3Smrg   unsigned late_alloc_wave64, cu_mask;
44577ec681f3Smrg   ac_compute_late_alloc(&pipeline->device->physical_device->rad_info, false, false,
44587ec681f3Smrg                         shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
44597ec681f3Smrg
44607ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
44617ec681f3Smrg      radeon_set_sh_reg_idx(pipeline->device->physical_device, cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
44627ec681f3Smrg                            S_00B118_CU_EN(cu_mask) | S_00B118_WAVE_LIMIT(0x3F));
44637ec681f3Smrg      radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
44647ec681f3Smrg   }
44657ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
44667ec681f3Smrg      uint32_t oversub_pc_lines = late_alloc_wave64 ? pipeline->device->physical_device->rad_info.pc_lines / 4 : 0;
44677ec681f3Smrg      gfx10_emit_ge_pc_alloc(cs, pipeline->device->physical_device->rad_info.chip_class, oversub_pc_lines);
44687ec681f3Smrg   }
446901e04c3fSmrg}
447001e04c3fSmrg
447101e04c3fSmrgstatic void
44727ec681f3Smrgradv_pipeline_generate_hw_es(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline,
44737ec681f3Smrg                             const struct radv_shader_variant *shader)
447401e04c3fSmrg{
44757ec681f3Smrg   uint64_t va = radv_shader_variant_get_va(shader);
447601e04c3fSmrg
44777ec681f3Smrg   radeon_set_sh_reg_seq(cs, R_00B320_SPI_SHADER_PGM_LO_ES, 4);
44787ec681f3Smrg   radeon_emit(cs, va >> 8);
44797ec681f3Smrg   radeon_emit(cs, S_00B324_MEM_BASE(va >> 40));
44807ec681f3Smrg   radeon_emit(cs, shader->config.rsrc1);
44817ec681f3Smrg   radeon_emit(cs, shader->config.rsrc2);
448201e04c3fSmrg}
448301e04c3fSmrg
448401e04c3fSmrgstatic void
44857ec681f3Smrgradv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline,
44867ec681f3Smrg                             const struct radv_shader_variant *shader)
448701e04c3fSmrg{
44887ec681f3Smrg   unsigned num_lds_blocks = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.num_lds_blocks;
44897ec681f3Smrg   uint64_t va = radv_shader_variant_get_va(shader);
44907ec681f3Smrg   uint32_t rsrc2 = shader->config.rsrc2;
449101e04c3fSmrg
44927ec681f3Smrg   radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
449301e04c3fSmrg
44947ec681f3Smrg   rsrc2 |= S_00B52C_LDS_SIZE(num_lds_blocks);
44957ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class == GFX7 &&
44967ec681f3Smrg       pipeline->device->physical_device->rad_info.family != CHIP_HAWAII)
44977ec681f3Smrg      radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, rsrc2);
449801e04c3fSmrg
44997ec681f3Smrg   radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
45007ec681f3Smrg   radeon_emit(cs, shader->config.rsrc1);
45017ec681f3Smrg   radeon_emit(cs, rsrc2);
450201e04c3fSmrg}
450301e04c3fSmrg
450401e04c3fSmrgstatic void
45057ec681f3Smrgradv_pipeline_generate_hw_ngg(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
45067ec681f3Smrg                              const struct radv_pipeline *pipeline,
45077ec681f3Smrg                              const struct radv_shader_variant *shader)
45087ec681f3Smrg{
45097ec681f3Smrg   uint64_t va = radv_shader_variant_get_va(shader);
45107ec681f3Smrg   gl_shader_stage es_type =
45117ec681f3Smrg      radv_pipeline_has_tess(pipeline) ? MESA_SHADER_TESS_EVAL : MESA_SHADER_VERTEX;
45127ec681f3Smrg   struct radv_shader_variant *es = es_type == MESA_SHADER_TESS_EVAL
45137ec681f3Smrg                                       ? pipeline->shaders[MESA_SHADER_TESS_EVAL]
45147ec681f3Smrg                                       : pipeline->shaders[MESA_SHADER_VERTEX];
45157ec681f3Smrg   const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info;
45167ec681f3Smrg
45177ec681f3Smrg   radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
45187ec681f3Smrg
45197ec681f3Smrg   radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
45207ec681f3Smrg   radeon_emit(cs, shader->config.rsrc1);
45217ec681f3Smrg   radeon_emit(cs, shader->config.rsrc2);
45227ec681f3Smrg
45237ec681f3Smrg   const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
45247ec681f3Smrg   unsigned clip_dist_mask, cull_dist_mask, total_mask;
45257ec681f3Smrg   clip_dist_mask = outinfo->clip_dist_mask;
45267ec681f3Smrg   cull_dist_mask = outinfo->cull_dist_mask;
45277ec681f3Smrg   total_mask = clip_dist_mask | cull_dist_mask;
45287ec681f3Smrg
45297ec681f3Smrg   bool writes_primitive_shading_rate =
45307ec681f3Smrg      outinfo->writes_primitive_shading_rate || pipeline->device->force_vrs != RADV_FORCE_VRS_NONE;
45317ec681f3Smrg   bool misc_vec_ena = outinfo->writes_pointsize || outinfo->writes_layer ||
45327ec681f3Smrg                       outinfo->writes_viewport_index || writes_primitive_shading_rate;
45337ec681f3Smrg   bool es_enable_prim_id = outinfo->export_prim_id || (es && es->info.uses_prim_id);
45347ec681f3Smrg   bool break_wave_at_eoi = false;
45357ec681f3Smrg   unsigned ge_cntl;
45367ec681f3Smrg   unsigned nparams;
45377ec681f3Smrg
45387ec681f3Smrg   if (es_type == MESA_SHADER_TESS_EVAL) {
45397ec681f3Smrg      struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
45407ec681f3Smrg
45417ec681f3Smrg      if (es_enable_prim_id || (gs && gs->info.uses_prim_id))
45427ec681f3Smrg         break_wave_at_eoi = true;
45437ec681f3Smrg   }
45447ec681f3Smrg
45457ec681f3Smrg   nparams = MAX2(outinfo->param_exports, 1);
45467ec681f3Smrg   radeon_set_context_reg(
45477ec681f3Smrg      ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
45487ec681f3Smrg      S_0286C4_VS_EXPORT_COUNT(nparams - 1) | S_0286C4_NO_PC_EXPORT(outinfo->param_exports == 0));
45497ec681f3Smrg
45507ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028708_SPI_SHADER_IDX_FORMAT,
45517ec681f3Smrg                          S_028708_IDX0_EXPORT_FORMAT(V_028708_SPI_SHADER_1COMP));
45527ec681f3Smrg   radeon_set_context_reg(
45537ec681f3Smrg      ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
45547ec681f3Smrg      S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
45557ec681f3Smrg         S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ? V_02870C_SPI_SHADER_4COMP
45567ec681f3Smrg                                                              : V_02870C_SPI_SHADER_NONE) |
45577ec681f3Smrg         S_02870C_POS2_EXPORT_FORMAT(outinfo->pos_exports > 2 ? V_02870C_SPI_SHADER_4COMP
45587ec681f3Smrg                                                              : V_02870C_SPI_SHADER_NONE) |
45597ec681f3Smrg         S_02870C_POS3_EXPORT_FORMAT(outinfo->pos_exports > 3 ? V_02870C_SPI_SHADER_4COMP
45607ec681f3Smrg                                                              : V_02870C_SPI_SHADER_NONE));
45617ec681f3Smrg
45627ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
45637ec681f3Smrg                          S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
45647ec681f3Smrg                             S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
45657ec681f3Smrg                             S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
45667ec681f3Smrg                             S_02881C_USE_VTX_VRS_RATE(writes_primitive_shading_rate) |
45677ec681f3Smrg                             S_02881C_VS_OUT_MISC_VEC_ENA(misc_vec_ena) |
45687ec681f3Smrg                             S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(misc_vec_ena) |
45697ec681f3Smrg                             S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0f) != 0) |
45707ec681f3Smrg                             S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xf0) != 0) |
45717ec681f3Smrg                             total_mask << 8 | clip_dist_mask);
45727ec681f3Smrg
45737ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN,
45747ec681f3Smrg                          S_028A84_PRIMITIVEID_EN(es_enable_prim_id) |
45757ec681f3Smrg                             S_028A84_NGG_DISABLE_PROVOK_REUSE(outinfo->export_prim_id));
45767ec681f3Smrg
45777ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
45787ec681f3Smrg                          ngg_state->vgt_esgs_ring_itemsize);
45797ec681f3Smrg
45807ec681f3Smrg   /* NGG specific registers. */
45817ec681f3Smrg   struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
45827ec681f3Smrg   uint32_t gs_num_invocations = gs ? gs->info.gs.invocations : 1;
45837ec681f3Smrg
45847ec681f3Smrg   radeon_set_context_reg(
45857ec681f3Smrg      ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
45867ec681f3Smrg      S_028A44_ES_VERTS_PER_SUBGRP(ngg_state->hw_max_esverts) |
45877ec681f3Smrg         S_028A44_GS_PRIMS_PER_SUBGRP(ngg_state->max_gsprims) |
45887ec681f3Smrg         S_028A44_GS_INST_PRIMS_IN_SUBGRP(ngg_state->max_gsprims * gs_num_invocations));
45897ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP,
45907ec681f3Smrg                          S_0287FC_MAX_VERTS_PER_SUBGROUP(ngg_state->max_out_verts));
45917ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028B4C_GE_NGG_SUBGRP_CNTL,
45927ec681f3Smrg                          S_028B4C_PRIM_AMP_FACTOR(ngg_state->prim_amp_factor) |
45937ec681f3Smrg                             S_028B4C_THDS_PER_SUBGRP(0)); /* for fast launch */
45947ec681f3Smrg   radeon_set_context_reg(
45957ec681f3Smrg      ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
45967ec681f3Smrg      S_028B90_CNT(gs_num_invocations) | S_028B90_ENABLE(gs_num_invocations > 1) |
45977ec681f3Smrg         S_028B90_EN_MAX_VERT_OUT_PER_GS_INSTANCE(ngg_state->max_vert_out_per_gs_instance));
45987ec681f3Smrg
45997ec681f3Smrg   ge_cntl = S_03096C_PRIM_GRP_SIZE(ngg_state->max_gsprims) |
46007ec681f3Smrg             S_03096C_VERT_GRP_SIZE(ngg_state->enable_vertex_grouping ? ngg_state->hw_max_esverts : 256) | /* 256 = disable vertex grouping */
46017ec681f3Smrg             S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
46027ec681f3Smrg
46037ec681f3Smrg   /* Bug workaround for a possible hang with non-tessellation cases.
46047ec681f3Smrg    * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
46057ec681f3Smrg    *
46067ec681f3Smrg    * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
46077ec681f3Smrg    */
46087ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class == GFX10 &&
46097ec681f3Smrg       !radv_pipeline_has_tess(pipeline) && ngg_state->hw_max_esverts != 256) {
46107ec681f3Smrg      ge_cntl &= C_03096C_VERT_GRP_SIZE;
46117ec681f3Smrg
46127ec681f3Smrg      if (ngg_state->hw_max_esverts > 5) {
46137ec681f3Smrg         ge_cntl |= S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts - 5);
46147ec681f3Smrg      }
46157ec681f3Smrg   }
46167ec681f3Smrg
46177ec681f3Smrg   radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL, ge_cntl);
46187ec681f3Smrg
46197ec681f3Smrg   unsigned late_alloc_wave64, cu_mask;
46207ec681f3Smrg   ac_compute_late_alloc(&pipeline->device->physical_device->rad_info, true, shader->info.has_ngg_culling,
46217ec681f3Smrg                         shader->config.scratch_bytes_per_wave > 0, &late_alloc_wave64, &cu_mask);
46227ec681f3Smrg
46237ec681f3Smrg   radeon_set_sh_reg_idx(
46247ec681f3Smrg      pipeline->device->physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
46257ec681f3Smrg      S_00B21C_CU_EN(cu_mask) | S_00B21C_WAVE_LIMIT(0x3F));
46267ec681f3Smrg   radeon_set_sh_reg_idx(
46277ec681f3Smrg      pipeline->device->physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
46287ec681f3Smrg      S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));
46297ec681f3Smrg
46307ec681f3Smrg   uint32_t oversub_pc_lines = late_alloc_wave64 ? pipeline->device->physical_device->rad_info.pc_lines / 4 : 0;
46317ec681f3Smrg   if (shader->info.has_ngg_culling) {
46327ec681f3Smrg      unsigned oversub_factor = 2;
46337ec681f3Smrg
46347ec681f3Smrg      if (outinfo->param_exports > 4)
46357ec681f3Smrg         oversub_factor = 4;
46367ec681f3Smrg      else if (outinfo->param_exports > 2)
46377ec681f3Smrg         oversub_factor = 3;
46387ec681f3Smrg
46397ec681f3Smrg      oversub_pc_lines *= oversub_factor;
46407ec681f3Smrg   }
46417ec681f3Smrg
46427ec681f3Smrg   gfx10_emit_ge_pc_alloc(cs, pipeline->device->physical_device->rad_info.chip_class, oversub_pc_lines);
464301e04c3fSmrg}
464401e04c3fSmrg
464501e04c3fSmrgstatic void
46467ec681f3Smrgradv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline,
46477ec681f3Smrg                             const struct radv_shader_variant *shader)
464801e04c3fSmrg{
46497ec681f3Smrg   uint64_t va = radv_shader_variant_get_va(shader);
46507ec681f3Smrg
46517ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
46527ec681f3Smrg      if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
46537ec681f3Smrg         radeon_set_sh_reg(cs, R_00B520_SPI_SHADER_PGM_LO_LS, va >> 8);
46547ec681f3Smrg      } else {
46557ec681f3Smrg         radeon_set_sh_reg(cs, R_00B410_SPI_SHADER_PGM_LO_LS, va >> 8);
46567ec681f3Smrg      }
46577ec681f3Smrg
46587ec681f3Smrg      radeon_set_sh_reg_seq(cs, R_00B428_SPI_SHADER_PGM_RSRC1_HS, 2);
46597ec681f3Smrg      radeon_emit(cs, shader->config.rsrc1);
46607ec681f3Smrg      radeon_emit(cs, shader->config.rsrc2);
46617ec681f3Smrg   } else {
46627ec681f3Smrg      radeon_set_sh_reg_seq(cs, R_00B420_SPI_SHADER_PGM_LO_HS, 4);
46637ec681f3Smrg      radeon_emit(cs, va >> 8);
46647ec681f3Smrg      radeon_emit(cs, S_00B424_MEM_BASE(va >> 40));
46657ec681f3Smrg      radeon_emit(cs, shader->config.rsrc1);
46667ec681f3Smrg      radeon_emit(cs, shader->config.rsrc2);
46677ec681f3Smrg   }
46687ec681f3Smrg}
466901e04c3fSmrg
46707ec681f3Smrgstatic void
46717ec681f3Smrgradv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
46727ec681f3Smrg                                     const struct radv_pipeline *pipeline)
46737ec681f3Smrg{
46747ec681f3Smrg   struct radv_shader_variant *vs;
46757ec681f3Smrg
46767ec681f3Smrg   /* Skip shaders merged into HS/GS */
46777ec681f3Smrg   vs = pipeline->shaders[MESA_SHADER_VERTEX];
46787ec681f3Smrg   if (!vs)
46797ec681f3Smrg      return;
46807ec681f3Smrg
46817ec681f3Smrg   if (vs->info.vs.as_ls)
46827ec681f3Smrg      radv_pipeline_generate_hw_ls(cs, pipeline, vs);
46837ec681f3Smrg   else if (vs->info.vs.as_es)
46847ec681f3Smrg      radv_pipeline_generate_hw_es(cs, pipeline, vs);
46857ec681f3Smrg   else if (vs->info.is_ngg)
46867ec681f3Smrg      radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, vs);
46877ec681f3Smrg   else
46887ec681f3Smrg      radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
468901e04c3fSmrg}
469001e04c3fSmrg
469101e04c3fSmrgstatic void
46927ec681f3Smrgradv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
46937ec681f3Smrg                                    const struct radv_pipeline *pipeline)
469401e04c3fSmrg{
46957ec681f3Smrg   struct radv_shader_variant *tes, *tcs;
46967ec681f3Smrg
46977ec681f3Smrg   tcs = pipeline->shaders[MESA_SHADER_TESS_CTRL];
46987ec681f3Smrg   tes = pipeline->shaders[MESA_SHADER_TESS_EVAL];
46997ec681f3Smrg
47007ec681f3Smrg   if (tes) {
47017ec681f3Smrg      if (tes->info.is_ngg) {
47027ec681f3Smrg         radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, tes);
47037ec681f3Smrg      } else if (tes->info.tes.as_es)
47047ec681f3Smrg         radv_pipeline_generate_hw_es(cs, pipeline, tes);
47057ec681f3Smrg      else
47067ec681f3Smrg         radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, tes);
47077ec681f3Smrg   }
47087ec681f3Smrg
47097ec681f3Smrg   radv_pipeline_generate_hw_hs(cs, pipeline, tcs);
47107ec681f3Smrg
47117ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
47127ec681f3Smrg       !radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) {
47137ec681f3Smrg      radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL,
47147ec681f3Smrg                             S_028A44_ES_VERTS_PER_SUBGRP(250) | S_028A44_GS_PRIMS_PER_SUBGRP(126) |
47157ec681f3Smrg                                S_028A44_GS_INST_PRIMS_IN_SUBGRP(126));
47167ec681f3Smrg   }
47177ec681f3Smrg}
471801e04c3fSmrg
47197ec681f3Smrgstatic void
47207ec681f3Smrgradv_pipeline_generate_tess_state(struct radeon_cmdbuf *ctx_cs,
47217ec681f3Smrg                                  const struct radv_pipeline *pipeline,
47227ec681f3Smrg                                  const VkGraphicsPipelineCreateInfo *pCreateInfo)
47237ec681f3Smrg{
47247ec681f3Smrg   struct radv_shader_variant *tes = radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL);
47257ec681f3Smrg   unsigned type = 0, partitioning = 0, topology = 0, distribution_mode = 0;
47267ec681f3Smrg   unsigned num_tcs_input_cp, num_tcs_output_cp, num_patches;
47277ec681f3Smrg   unsigned ls_hs_config;
47287ec681f3Smrg
47297ec681f3Smrg   num_tcs_input_cp = pCreateInfo->pTessellationState->patchControlPoints;
47307ec681f3Smrg   num_tcs_output_cp =
47317ec681f3Smrg      pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.tcs.tcs_vertices_out; // TCS VERTICES OUT
47327ec681f3Smrg   num_patches = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
47337ec681f3Smrg
47347ec681f3Smrg   ls_hs_config = S_028B58_NUM_PATCHES(num_patches) | S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
47357ec681f3Smrg                  S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
47367ec681f3Smrg
47377ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
47387ec681f3Smrg      radeon_set_context_reg_idx(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
47397ec681f3Smrg   } else {
47407ec681f3Smrg      radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
47417ec681f3Smrg   }
47427ec681f3Smrg
47437ec681f3Smrg   switch (tes->info.tes.primitive_mode) {
47447ec681f3Smrg   case GL_TRIANGLES:
47457ec681f3Smrg      type = V_028B6C_TESS_TRIANGLE;
47467ec681f3Smrg      break;
47477ec681f3Smrg   case GL_QUADS:
47487ec681f3Smrg      type = V_028B6C_TESS_QUAD;
47497ec681f3Smrg      break;
47507ec681f3Smrg   case GL_ISOLINES:
47517ec681f3Smrg      type = V_028B6C_TESS_ISOLINE;
47527ec681f3Smrg      break;
47537ec681f3Smrg   }
47547ec681f3Smrg
47557ec681f3Smrg   switch (tes->info.tes.spacing) {
47567ec681f3Smrg   case TESS_SPACING_EQUAL:
47577ec681f3Smrg      partitioning = V_028B6C_PART_INTEGER;
47587ec681f3Smrg      break;
47597ec681f3Smrg   case TESS_SPACING_FRACTIONAL_ODD:
47607ec681f3Smrg      partitioning = V_028B6C_PART_FRAC_ODD;
47617ec681f3Smrg      break;
47627ec681f3Smrg   case TESS_SPACING_FRACTIONAL_EVEN:
47637ec681f3Smrg      partitioning = V_028B6C_PART_FRAC_EVEN;
47647ec681f3Smrg      break;
47657ec681f3Smrg   default:
47667ec681f3Smrg      break;
47677ec681f3Smrg   }
47687ec681f3Smrg
47697ec681f3Smrg   bool ccw = tes->info.tes.ccw;
47707ec681f3Smrg   const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
47717ec681f3Smrg      vk_find_struct_const(pCreateInfo->pTessellationState,
47727ec681f3Smrg                           PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
47737ec681f3Smrg
47747ec681f3Smrg   if (domain_origin_state &&
47757ec681f3Smrg       domain_origin_state->domainOrigin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT)
47767ec681f3Smrg      ccw = !ccw;
47777ec681f3Smrg
47787ec681f3Smrg   if (tes->info.tes.point_mode)
47797ec681f3Smrg      topology = V_028B6C_OUTPUT_POINT;
47807ec681f3Smrg   else if (tes->info.tes.primitive_mode == GL_ISOLINES)
47817ec681f3Smrg      topology = V_028B6C_OUTPUT_LINE;
47827ec681f3Smrg   else if (ccw)
47837ec681f3Smrg      topology = V_028B6C_OUTPUT_TRIANGLE_CCW;
47847ec681f3Smrg   else
47857ec681f3Smrg      topology = V_028B6C_OUTPUT_TRIANGLE_CW;
47867ec681f3Smrg
47877ec681f3Smrg   if (pipeline->device->physical_device->rad_info.has_distributed_tess) {
47887ec681f3Smrg      if (pipeline->device->physical_device->rad_info.family == CHIP_FIJI ||
47897ec681f3Smrg          pipeline->device->physical_device->rad_info.family >= CHIP_POLARIS10)
47907ec681f3Smrg         distribution_mode = V_028B6C_TRAPEZOIDS;
47917ec681f3Smrg      else
47927ec681f3Smrg         distribution_mode = V_028B6C_DONUTS;
47937ec681f3Smrg   } else
47947ec681f3Smrg      distribution_mode = V_028B6C_NO_DIST;
47957ec681f3Smrg
47967ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028B6C_VGT_TF_PARAM,
47977ec681f3Smrg                          S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) |
47987ec681f3Smrg                             S_028B6C_TOPOLOGY(topology) |
47997ec681f3Smrg                             S_028B6C_DISTRIBUTION_MODE(distribution_mode));
48007ec681f3Smrg}
480101e04c3fSmrg
48027ec681f3Smrgstatic void
48037ec681f3Smrgradv_pipeline_generate_hw_gs(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
48047ec681f3Smrg                             const struct radv_pipeline *pipeline,
48057ec681f3Smrg                             const struct radv_shader_variant *gs)
48067ec681f3Smrg{
48077ec681f3Smrg   const struct gfx9_gs_info *gs_state = &gs->info.gs_ring_info;
48087ec681f3Smrg   unsigned gs_max_out_vertices;
48097ec681f3Smrg   const uint8_t *num_components;
48107ec681f3Smrg   uint8_t max_stream;
48117ec681f3Smrg   unsigned offset;
48127ec681f3Smrg   uint64_t va;
48137ec681f3Smrg
48147ec681f3Smrg   gs_max_out_vertices = gs->info.gs.vertices_out;
48157ec681f3Smrg   max_stream = gs->info.gs.max_stream;
48167ec681f3Smrg   num_components = gs->info.gs.num_stream_output_components;
48177ec681f3Smrg
48187ec681f3Smrg   offset = num_components[0] * gs_max_out_vertices;
48197ec681f3Smrg
48207ec681f3Smrg   radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
48217ec681f3Smrg   radeon_emit(ctx_cs, offset);
48227ec681f3Smrg   if (max_stream >= 1)
48237ec681f3Smrg      offset += num_components[1] * gs_max_out_vertices;
48247ec681f3Smrg   radeon_emit(ctx_cs, offset);
48257ec681f3Smrg   if (max_stream >= 2)
48267ec681f3Smrg      offset += num_components[2] * gs_max_out_vertices;
48277ec681f3Smrg   radeon_emit(ctx_cs, offset);
48287ec681f3Smrg   if (max_stream >= 3)
48297ec681f3Smrg      offset += num_components[3] * gs_max_out_vertices;
48307ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
48317ec681f3Smrg
48327ec681f3Smrg   radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
48337ec681f3Smrg   radeon_emit(ctx_cs, num_components[0]);
48347ec681f3Smrg   radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0);
48357ec681f3Smrg   radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0);
48367ec681f3Smrg   radeon_emit(ctx_cs, (max_stream >= 3) ? num_components[3] : 0);
48377ec681f3Smrg
48387ec681f3Smrg   uint32_t gs_num_invocations = gs->info.gs.invocations;
48397ec681f3Smrg   radeon_set_context_reg(
48407ec681f3Smrg      ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
48417ec681f3Smrg      S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0));
48427ec681f3Smrg
48437ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
48447ec681f3Smrg                          gs_state->vgt_esgs_ring_itemsize);
48457ec681f3Smrg
48467ec681f3Smrg   va = radv_shader_variant_get_va(gs);
48477ec681f3Smrg
48487ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX9) {
48497ec681f3Smrg      if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
48507ec681f3Smrg         radeon_set_sh_reg(cs, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
48517ec681f3Smrg      } else {
48527ec681f3Smrg         radeon_set_sh_reg(cs, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8);
48537ec681f3Smrg      }
48547ec681f3Smrg
48557ec681f3Smrg      radeon_set_sh_reg_seq(cs, R_00B228_SPI_SHADER_PGM_RSRC1_GS, 2);
48567ec681f3Smrg      radeon_emit(cs, gs->config.rsrc1);
48577ec681f3Smrg      radeon_emit(cs, gs->config.rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size));
48587ec681f3Smrg
48597ec681f3Smrg      radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, gs_state->vgt_gs_onchip_cntl);
48607ec681f3Smrg      radeon_set_context_reg(ctx_cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
48617ec681f3Smrg                             gs_state->vgt_gs_max_prims_per_subgroup);
48627ec681f3Smrg   } else {
48637ec681f3Smrg      radeon_set_sh_reg_seq(cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
48647ec681f3Smrg      radeon_emit(cs, va >> 8);
48657ec681f3Smrg      radeon_emit(cs, S_00B224_MEM_BASE(va >> 40));
48667ec681f3Smrg      radeon_emit(cs, gs->config.rsrc1);
48677ec681f3Smrg      radeon_emit(cs, gs->config.rsrc2);
48687ec681f3Smrg   }
48697ec681f3Smrg
48707ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX7) {
48717ec681f3Smrg      radeon_set_sh_reg_idx(
48727ec681f3Smrg         pipeline->device->physical_device, cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
48737ec681f3Smrg         S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
48747ec681f3Smrg
48757ec681f3Smrg      if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
48767ec681f3Smrg         radeon_set_sh_reg_idx(
48777ec681f3Smrg            pipeline->device->physical_device, cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
48787ec681f3Smrg            S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(0));
48797ec681f3Smrg      }
48807ec681f3Smrg   }
48817ec681f3Smrg
48827ec681f3Smrg   radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader);
48837ec681f3Smrg}
488401e04c3fSmrg
48857ec681f3Smrgstatic void
48867ec681f3Smrgradv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
48877ec681f3Smrg                                       const struct radv_pipeline *pipeline)
48887ec681f3Smrg{
48897ec681f3Smrg   struct radv_shader_variant *gs;
489001e04c3fSmrg
48917ec681f3Smrg   gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
48927ec681f3Smrg   if (!gs)
48937ec681f3Smrg      return;
489401e04c3fSmrg
48957ec681f3Smrg   if (gs->info.is_ngg)
48967ec681f3Smrg      radv_pipeline_generate_hw_ngg(ctx_cs, cs, pipeline, gs);
48977ec681f3Smrg   else
48987ec681f3Smrg      radv_pipeline_generate_hw_gs(ctx_cs, cs, pipeline, gs);
489901e04c3fSmrg
49007ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out);
490101e04c3fSmrg}
490201e04c3fSmrg
49037ec681f3Smrgstatic uint32_t
49047ec681f3Smrgoffset_to_ps_input(uint32_t offset, bool flat_shade, bool explicit, bool float16)
49057ec681f3Smrg{
49067ec681f3Smrg   uint32_t ps_input_cntl;
49077ec681f3Smrg   if (offset <= AC_EXP_PARAM_OFFSET_31) {
49087ec681f3Smrg      ps_input_cntl = S_028644_OFFSET(offset);
49097ec681f3Smrg      if (flat_shade || explicit)
49107ec681f3Smrg         ps_input_cntl |= S_028644_FLAT_SHADE(1);
49117ec681f3Smrg      if (explicit) {
49127ec681f3Smrg         /* Force parameter cache to be read in passthrough
49137ec681f3Smrg          * mode.
49147ec681f3Smrg          */
49157ec681f3Smrg         ps_input_cntl |= S_028644_OFFSET(1 << 5);
49167ec681f3Smrg      }
49177ec681f3Smrg      if (float16) {
49187ec681f3Smrg         ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1);
49197ec681f3Smrg      }
49207ec681f3Smrg   } else {
49217ec681f3Smrg      /* The input is a DEFAULT_VAL constant. */
49227ec681f3Smrg      assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
49237ec681f3Smrg      offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
49247ec681f3Smrg      ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
49257ec681f3Smrg   }
49267ec681f3Smrg   return ps_input_cntl;
492701e04c3fSmrg}
492801e04c3fSmrg
492901e04c3fSmrgstatic void
49307ec681f3Smrgradv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, const struct radv_pipeline *pipeline)
49317ec681f3Smrg{
49327ec681f3Smrg   struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
49337ec681f3Smrg   const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
49347ec681f3Smrg   uint32_t ps_input_cntl[32];
49357ec681f3Smrg
49367ec681f3Smrg   unsigned ps_offset = 0;
49377ec681f3Smrg
49387ec681f3Smrg   if (ps->info.ps.prim_id_input) {
49397ec681f3Smrg      unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID];
49407ec681f3Smrg      if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
49417ec681f3Smrg         ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
49427ec681f3Smrg         ++ps_offset;
49437ec681f3Smrg      }
49447ec681f3Smrg   }
49457ec681f3Smrg
49467ec681f3Smrg   if (ps->info.ps.layer_input) {
49477ec681f3Smrg      unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER];
49487ec681f3Smrg      if (vs_offset != AC_EXP_PARAM_UNDEFINED)
49497ec681f3Smrg         ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
49507ec681f3Smrg      else
49517ec681f3Smrg         ps_input_cntl[ps_offset] =
49527ec681f3Smrg            offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false);
49537ec681f3Smrg      ++ps_offset;
49547ec681f3Smrg   }
49557ec681f3Smrg
49567ec681f3Smrg   if (ps->info.ps.viewport_index_input) {
49577ec681f3Smrg      unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VIEWPORT];
49587ec681f3Smrg      if (vs_offset != AC_EXP_PARAM_UNDEFINED)
49597ec681f3Smrg         ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false, false);
49607ec681f3Smrg      else
49617ec681f3Smrg         ps_input_cntl[ps_offset] =
49627ec681f3Smrg            offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false, false);
49637ec681f3Smrg      ++ps_offset;
49647ec681f3Smrg   }
49657ec681f3Smrg
49667ec681f3Smrg   if (ps->info.ps.has_pcoord) {
49677ec681f3Smrg      unsigned val;
49687ec681f3Smrg      val = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20);
49697ec681f3Smrg      ps_input_cntl[ps_offset] = val;
49707ec681f3Smrg      ps_offset++;
49717ec681f3Smrg   }
49727ec681f3Smrg
49737ec681f3Smrg   if (ps->info.ps.num_input_clips_culls) {
49747ec681f3Smrg      unsigned vs_offset;
49757ec681f3Smrg
49767ec681f3Smrg      vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0];
49777ec681f3Smrg      if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
49787ec681f3Smrg         ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false);
49797ec681f3Smrg         ++ps_offset;
49807ec681f3Smrg      }
49817ec681f3Smrg
49827ec681f3Smrg      vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1];
49837ec681f3Smrg      if (vs_offset != AC_EXP_PARAM_UNDEFINED && ps->info.ps.num_input_clips_culls > 4) {
49847ec681f3Smrg         ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false, false);
49857ec681f3Smrg         ++ps_offset;
49867ec681f3Smrg      }
49877ec681f3Smrg   }
49887ec681f3Smrg
49897ec681f3Smrg   for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.ps.input_mask; ++i) {
49907ec681f3Smrg      unsigned vs_offset;
49917ec681f3Smrg      bool flat_shade;
49927ec681f3Smrg      bool explicit;
49937ec681f3Smrg      bool float16;
49947ec681f3Smrg      if (!(ps->info.ps.input_mask & (1u << i)))
49957ec681f3Smrg         continue;
49967ec681f3Smrg
49977ec681f3Smrg      vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i];
49987ec681f3Smrg      if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
49997ec681f3Smrg         ps_input_cntl[ps_offset] = S_028644_OFFSET(0x20);
50007ec681f3Smrg         ++ps_offset;
50017ec681f3Smrg         continue;
50027ec681f3Smrg      }
50037ec681f3Smrg
50047ec681f3Smrg      flat_shade = !!(ps->info.ps.flat_shaded_mask & (1u << ps_offset));
50057ec681f3Smrg      explicit = !!(ps->info.ps.explicit_shaded_mask & (1u << ps_offset));
50067ec681f3Smrg      float16 = !!(ps->info.ps.float16_shaded_mask & (1u << ps_offset));
50077ec681f3Smrg
50087ec681f3Smrg      ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, explicit, float16);
50097ec681f3Smrg      ++ps_offset;
50107ec681f3Smrg   }
50117ec681f3Smrg
50127ec681f3Smrg   if (ps_offset) {
50137ec681f3Smrg      radeon_set_context_reg_seq(ctx_cs, R_028644_SPI_PS_INPUT_CNTL_0, ps_offset);
50147ec681f3Smrg      for (unsigned i = 0; i < ps_offset; i++) {
50157ec681f3Smrg         radeon_emit(ctx_cs, ps_input_cntl[i]);
50167ec681f3Smrg      }
50177ec681f3Smrg   }
501801e04c3fSmrg}
501901e04c3fSmrg
502001e04c3fSmrgstatic uint32_t
502101e04c3fSmrgradv_compute_db_shader_control(const struct radv_device *device,
50227ec681f3Smrg                               const struct radv_pipeline *pipeline,
502301e04c3fSmrg                               const struct radv_shader_variant *ps)
502401e04c3fSmrg{
50257ec681f3Smrg   unsigned conservative_z_export = V_02880C_EXPORT_ANY_Z;
50267ec681f3Smrg   unsigned z_order;
50277ec681f3Smrg   if (ps->info.ps.early_fragment_test || !ps->info.ps.writes_memory)
50287ec681f3Smrg      z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
50297ec681f3Smrg   else
50307ec681f3Smrg      z_order = V_02880C_LATE_Z;
50317ec681f3Smrg
50327ec681f3Smrg   if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_GREATER)
50337ec681f3Smrg      conservative_z_export = V_02880C_EXPORT_GREATER_THAN_Z;
50347ec681f3Smrg   else if (ps->info.ps.depth_layout == FRAG_DEPTH_LAYOUT_LESS)
50357ec681f3Smrg      conservative_z_export = V_02880C_EXPORT_LESS_THAN_Z;
50367ec681f3Smrg
50377ec681f3Smrg   bool disable_rbplus = device->physical_device->rad_info.has_rbplus &&
50387ec681f3Smrg                         !device->physical_device->rad_info.rbplus_allowed;
50397ec681f3Smrg
50407ec681f3Smrg   /* It shouldn't be needed to export gl_SampleMask when MSAA is disabled
50417ec681f3Smrg    * but this appears to break Project Cars (DXVK). See
50427ec681f3Smrg    * https://bugs.freedesktop.org/show_bug.cgi?id=109401
50437ec681f3Smrg    */
50447ec681f3Smrg   bool mask_export_enable = ps->info.ps.writes_sample_mask;
50457ec681f3Smrg
50467ec681f3Smrg   return S_02880C_Z_EXPORT_ENABLE(ps->info.ps.writes_z) |
50477ec681f3Smrg          S_02880C_STENCIL_TEST_VAL_EXPORT_ENABLE(ps->info.ps.writes_stencil) |
50487ec681f3Smrg          S_02880C_KILL_ENABLE(!!ps->info.ps.can_discard) |
50497ec681f3Smrg          S_02880C_MASK_EXPORT_ENABLE(mask_export_enable) |
50507ec681f3Smrg          S_02880C_CONSERVATIVE_Z_EXPORT(conservative_z_export) | S_02880C_Z_ORDER(z_order) |
50517ec681f3Smrg          S_02880C_DEPTH_BEFORE_SHADER(ps->info.ps.early_fragment_test) |
50527ec681f3Smrg          S_02880C_PRE_SHADER_DEPTH_COVERAGE_ENABLE(ps->info.ps.post_depth_coverage) |
50537ec681f3Smrg          S_02880C_EXEC_ON_HIER_FAIL(ps->info.ps.writes_memory) |
50547ec681f3Smrg          S_02880C_EXEC_ON_NOOP(ps->info.ps.writes_memory) |
50557ec681f3Smrg          S_02880C_DUAL_QUAD_DISABLE(disable_rbplus);
505601e04c3fSmrg}
505701e04c3fSmrg
505801e04c3fSmrgstatic void
50597ec681f3Smrgradv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *ctx_cs, struct radeon_cmdbuf *cs,
50607ec681f3Smrg                                       struct radv_pipeline *pipeline)
506101e04c3fSmrg{
50627ec681f3Smrg   struct radv_shader_variant *ps;
50637ec681f3Smrg   uint64_t va;
50647ec681f3Smrg   assert(pipeline->shaders[MESA_SHADER_FRAGMENT]);
506501e04c3fSmrg
50667ec681f3Smrg   ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
50677ec681f3Smrg   va = radv_shader_variant_get_va(ps);
506801e04c3fSmrg
50697ec681f3Smrg   radeon_set_sh_reg_seq(cs, R_00B020_SPI_SHADER_PGM_LO_PS, 4);
50707ec681f3Smrg   radeon_emit(cs, va >> 8);
50717ec681f3Smrg   radeon_emit(cs, S_00B024_MEM_BASE(va >> 40));
50727ec681f3Smrg   radeon_emit(cs, ps->config.rsrc1);
50737ec681f3Smrg   radeon_emit(cs, ps->config.rsrc2);
507401e04c3fSmrg
50757ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_02880C_DB_SHADER_CONTROL,
50767ec681f3Smrg                          radv_compute_db_shader_control(pipeline->device, pipeline, ps));
507701e04c3fSmrg
50787ec681f3Smrg   radeon_set_context_reg_seq(ctx_cs, R_0286CC_SPI_PS_INPUT_ENA, 2);
50797ec681f3Smrg   radeon_emit(ctx_cs, ps->config.spi_ps_input_ena);
50807ec681f3Smrg   radeon_emit(ctx_cs, ps->config.spi_ps_input_addr);
508101e04c3fSmrg
50827ec681f3Smrg   radeon_set_context_reg(
50837ec681f3Smrg      ctx_cs, R_0286D8_SPI_PS_IN_CONTROL,
50847ec681f3Smrg      S_0286D8_NUM_INTERP(ps->info.ps.num_interp) | S_0286D8_PS_W32_EN(ps->info.wave_size == 32));
508501e04c3fSmrg
50867ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, pipeline->graphics.spi_baryc_cntl);
508701e04c3fSmrg
50887ec681f3Smrg   radeon_set_context_reg(
50897ec681f3Smrg      ctx_cs, R_028710_SPI_SHADER_Z_FORMAT,
50907ec681f3Smrg      ac_get_spi_shader_z_format(ps->info.ps.writes_z, ps->info.ps.writes_stencil,
50917ec681f3Smrg                                 ps->info.ps.writes_sample_mask));
50927ec681f3Smrg}
509301e04c3fSmrg
50947ec681f3Smrgstatic void
50957ec681f3Smrgradv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *ctx_cs,
50967ec681f3Smrg                                        const struct radv_pipeline *pipeline)
50977ec681f3Smrg{
50987ec681f3Smrg   if (pipeline->device->physical_device->rad_info.family < CHIP_POLARIS10 ||
50997ec681f3Smrg       pipeline->device->physical_device->rad_info.chip_class >= GFX10)
51007ec681f3Smrg      return;
51017ec681f3Smrg
51027ec681f3Smrg   unsigned vtx_reuse_depth = 30;
51037ec681f3Smrg   if (radv_pipeline_has_tess(pipeline) &&
51047ec681f3Smrg       radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.tes.spacing ==
51057ec681f3Smrg          TESS_SPACING_FRACTIONAL_ODD) {
51067ec681f3Smrg      vtx_reuse_depth = 14;
51077ec681f3Smrg   }
51087ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
51097ec681f3Smrg                          S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth));
51107ec681f3Smrg}
511101e04c3fSmrg
51127ec681f3Smrgstatic void
51137ec681f3Smrgradv_pipeline_generate_vgt_shader_config(struct radeon_cmdbuf *ctx_cs,
51147ec681f3Smrg                                         const struct radv_pipeline *pipeline)
51157ec681f3Smrg{
51167ec681f3Smrg   uint32_t stages = 0;
51177ec681f3Smrg   if (radv_pipeline_has_tess(pipeline)) {
51187ec681f3Smrg      stages |= S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(1);
51197ec681f3Smrg
51207ec681f3Smrg      if (radv_pipeline_has_gs(pipeline))
51217ec681f3Smrg         stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1);
51227ec681f3Smrg      else if (radv_pipeline_has_ngg(pipeline))
51237ec681f3Smrg         stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
51247ec681f3Smrg      else
51257ec681f3Smrg         stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
51267ec681f3Smrg   } else if (radv_pipeline_has_gs(pipeline)) {
51277ec681f3Smrg      stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1);
51287ec681f3Smrg   } else if (radv_pipeline_has_ngg(pipeline)) {
51297ec681f3Smrg      stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
51307ec681f3Smrg   }
51317ec681f3Smrg
51327ec681f3Smrg   if (radv_pipeline_has_ngg(pipeline)) {
51337ec681f3Smrg      stages |= S_028B54_PRIMGEN_EN(1);
51347ec681f3Smrg      if (pipeline->streamout_shader)
51357ec681f3Smrg         stages |= S_028B54_NGG_WAVE_ID_EN(1);
51367ec681f3Smrg      if (radv_pipeline_has_ngg_passthrough(pipeline))
51377ec681f3Smrg         stages |= S_028B54_PRIMGEN_PASSTHRU_EN(1);
51387ec681f3Smrg   } else if (radv_pipeline_has_gs(pipeline)) {
51397ec681f3Smrg      stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
51407ec681f3Smrg   }
51417ec681f3Smrg
51427ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
51437ec681f3Smrg      stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
51447ec681f3Smrg
51457ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10) {
51467ec681f3Smrg      uint8_t hs_size = 64, gs_size = 64, vs_size = 64;
51477ec681f3Smrg
51487ec681f3Smrg      if (radv_pipeline_has_tess(pipeline))
51497ec681f3Smrg         hs_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.wave_size;
51507ec681f3Smrg
51517ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_GEOMETRY]) {
51527ec681f3Smrg         vs_size = gs_size = pipeline->shaders[MESA_SHADER_GEOMETRY]->info.wave_size;
51537ec681f3Smrg         if (radv_pipeline_has_gs_copy_shader(pipeline))
51547ec681f3Smrg            vs_size = pipeline->gs_copy_shader->info.wave_size;
51557ec681f3Smrg      } else if (pipeline->shaders[MESA_SHADER_TESS_EVAL])
51567ec681f3Smrg         vs_size = pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.wave_size;
51577ec681f3Smrg      else if (pipeline->shaders[MESA_SHADER_VERTEX])
51587ec681f3Smrg         vs_size = pipeline->shaders[MESA_SHADER_VERTEX]->info.wave_size;
51597ec681f3Smrg
51607ec681f3Smrg      if (radv_pipeline_has_ngg(pipeline)) {
51617ec681f3Smrg         assert(!radv_pipeline_has_gs_copy_shader(pipeline));
51627ec681f3Smrg         gs_size = vs_size;
51637ec681f3Smrg      }
51647ec681f3Smrg
51657ec681f3Smrg      /* legacy GS only supports Wave64 */
51667ec681f3Smrg      stages |= S_028B54_HS_W32_EN(hs_size == 32 ? 1 : 0) |
51677ec681f3Smrg                S_028B54_GS_W32_EN(gs_size == 32 ? 1 : 0) |
51687ec681f3Smrg                S_028B54_VS_W32_EN(vs_size == 32 ? 1 : 0);
51697ec681f3Smrg   }
51707ec681f3Smrg
51717ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, stages);
517201e04c3fSmrg}
517301e04c3fSmrg
517401e04c3fSmrgstatic void
51757ec681f3Smrgradv_pipeline_generate_cliprect_rule(struct radeon_cmdbuf *ctx_cs,
51767ec681f3Smrg                                     const VkGraphicsPipelineCreateInfo *pCreateInfo)
517701e04c3fSmrg{
51787ec681f3Smrg   const VkPipelineDiscardRectangleStateCreateInfoEXT *discard_rectangle_info =
51797ec681f3Smrg      vk_find_struct_const(pCreateInfo->pNext, PIPELINE_DISCARD_RECTANGLE_STATE_CREATE_INFO_EXT);
51807ec681f3Smrg   uint32_t cliprect_rule = 0;
51817ec681f3Smrg
51827ec681f3Smrg   if (!discard_rectangle_info) {
51837ec681f3Smrg      cliprect_rule = 0xffff;
51847ec681f3Smrg   } else {
51857ec681f3Smrg      for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
51867ec681f3Smrg         /* Interpret i as a bitmask, and then set the bit in
51877ec681f3Smrg          * the mask if that combination of rectangles in which
51887ec681f3Smrg          * the pixel is contained should pass the cliprect
51897ec681f3Smrg          * test.
51907ec681f3Smrg          */
51917ec681f3Smrg         unsigned relevant_subset = i & ((1u << discard_rectangle_info->discardRectangleCount) - 1);
51927ec681f3Smrg
51937ec681f3Smrg         if (discard_rectangle_info->discardRectangleMode ==
51947ec681f3Smrg                VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT &&
51957ec681f3Smrg             !relevant_subset)
51967ec681f3Smrg            continue;
51977ec681f3Smrg
51987ec681f3Smrg         if (discard_rectangle_info->discardRectangleMode ==
51997ec681f3Smrg                VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT &&
52007ec681f3Smrg             relevant_subset)
52017ec681f3Smrg            continue;
52027ec681f3Smrg
52037ec681f3Smrg         cliprect_rule |= 1u << i;
52047ec681f3Smrg      }
52057ec681f3Smrg   }
52067ec681f3Smrg
52077ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
52087ec681f3Smrg}
520901e04c3fSmrg
52107ec681f3Smrgstatic void
52117ec681f3Smrggfx10_pipeline_generate_ge_cntl(struct radeon_cmdbuf *ctx_cs, struct radv_pipeline *pipeline)
52127ec681f3Smrg{
52137ec681f3Smrg   bool break_wave_at_eoi = false;
52147ec681f3Smrg   unsigned primgroup_size;
52157ec681f3Smrg   unsigned vertgroup_size = 256; /* 256 = disable vertex grouping */
52167ec681f3Smrg
52177ec681f3Smrg   if (radv_pipeline_has_tess(pipeline)) {
52187ec681f3Smrg      primgroup_size = pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.num_tess_patches;
52197ec681f3Smrg   } else if (radv_pipeline_has_gs(pipeline)) {
52207ec681f3Smrg      const struct gfx9_gs_info *gs_state =
52217ec681f3Smrg         &pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
52227ec681f3Smrg      unsigned vgt_gs_onchip_cntl = gs_state->vgt_gs_onchip_cntl;
52237ec681f3Smrg      primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
52247ec681f3Smrg   } else {
52257ec681f3Smrg      primgroup_size = 128; /* recommended without a GS and tess */
52267ec681f3Smrg   }
52277ec681f3Smrg
52287ec681f3Smrg   if (radv_pipeline_has_tess(pipeline)) {
52297ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id ||
52307ec681f3Smrg          radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.uses_prim_id)
52317ec681f3Smrg         break_wave_at_eoi = true;
52327ec681f3Smrg   }
52337ec681f3Smrg
52347ec681f3Smrg   radeon_set_uconfig_reg(ctx_cs, R_03096C_GE_CNTL,
52357ec681f3Smrg                          S_03096C_PRIM_GRP_SIZE(primgroup_size) |
52367ec681f3Smrg                             S_03096C_VERT_GRP_SIZE(vertgroup_size) |
52377ec681f3Smrg                             S_03096C_PACKET_TO_ONE_PA(0) /* line stipple */ |
52387ec681f3Smrg                             S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi));
523901e04c3fSmrg}
524001e04c3fSmrg
52417ec681f3Smrgstatic void
52427ec681f3Smrgradv_pipeline_generate_vgt_gs_out(struct radeon_cmdbuf *ctx_cs,
52437ec681f3Smrg                                  const struct radv_pipeline *pipeline,
52447ec681f3Smrg                                  const VkGraphicsPipelineCreateInfo *pCreateInfo,
52457ec681f3Smrg                                  const struct radv_graphics_pipeline_create_info *extra)
524601e04c3fSmrg{
52477ec681f3Smrg   uint32_t gs_out;
52487ec681f3Smrg
52497ec681f3Smrg   if (radv_pipeline_has_gs(pipeline)) {
52507ec681f3Smrg      gs_out =
52517ec681f3Smrg         si_conv_gl_prim_to_gs_out(pipeline->shaders[MESA_SHADER_GEOMETRY]->info.gs.output_prim);
52527ec681f3Smrg   } else if (radv_pipeline_has_tess(pipeline)) {
52537ec681f3Smrg      if (pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.point_mode) {
52547ec681f3Smrg         gs_out = V_028A6C_POINTLIST;
52557ec681f3Smrg      } else {
52567ec681f3Smrg         gs_out = si_conv_gl_prim_to_gs_out(
52577ec681f3Smrg            pipeline->shaders[MESA_SHADER_TESS_EVAL]->info.tes.primitive_mode);
52587ec681f3Smrg      }
52597ec681f3Smrg   } else {
52607ec681f3Smrg      gs_out = si_conv_prim_to_gs_out(pCreateInfo->pInputAssemblyState->topology);
52617ec681f3Smrg   }
52627ec681f3Smrg
52637ec681f3Smrg   if (extra && extra->use_rectlist) {
52647ec681f3Smrg      gs_out = V_028A6C_TRISTRIP;
52657ec681f3Smrg      if (radv_pipeline_has_ngg(pipeline))
52667ec681f3Smrg         gs_out = V_028A6C_RECTLIST;
52677ec681f3Smrg   }
52687ec681f3Smrg
52697ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out);
52707ec681f3Smrg}
527101e04c3fSmrg
52727ec681f3Smrgstatic bool
52737ec681f3Smrggfx103_pipeline_vrs_coarse_shading(const struct radv_pipeline *pipeline)
52747ec681f3Smrg{
52757ec681f3Smrg   struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
52767ec681f3Smrg   struct radv_device *device = pipeline->device;
527701e04c3fSmrg
52787ec681f3Smrg   if (device->instance->debug_flags & RADV_DEBUG_NO_VRS_FLAT_SHADING)
52797ec681f3Smrg      return false;
528001e04c3fSmrg
52817ec681f3Smrg   if (!ps->info.ps.allow_flat_shading)
52827ec681f3Smrg      return false;
528301e04c3fSmrg
52847ec681f3Smrg   return true;
528501e04c3fSmrg}
528601e04c3fSmrg
52877ec681f3Smrgstatic void
52887ec681f3Smrggfx103_pipeline_generate_vrs_state(struct radeon_cmdbuf *ctx_cs,
52897ec681f3Smrg                                   const struct radv_pipeline *pipeline,
52907ec681f3Smrg                                   const VkGraphicsPipelineCreateInfo *pCreateInfo)
529101e04c3fSmrg{
52927ec681f3Smrg   uint32_t mode = V_028064_VRS_COMB_MODE_PASSTHRU;
52937ec681f3Smrg   uint8_t rate_x = 0, rate_y = 0;
52947ec681f3Smrg   bool enable_vrs = false;
52957ec681f3Smrg
52967ec681f3Smrg   if (vk_find_struct_const(pCreateInfo->pNext,
52977ec681f3Smrg                            PIPELINE_FRAGMENT_SHADING_RATE_STATE_CREATE_INFO_KHR) ||
52987ec681f3Smrg       radv_is_state_dynamic(pCreateInfo, VK_DYNAMIC_STATE_FRAGMENT_SHADING_RATE_KHR)) {
52997ec681f3Smrg      /* Enable draw call VRS because it's explicitly requested.  */
53007ec681f3Smrg      enable_vrs = true;
53017ec681f3Smrg   } else if (gfx103_pipeline_vrs_coarse_shading(pipeline)) {
53027ec681f3Smrg      /* Enable VRS coarse shading 2x2 if the driver determined that
53037ec681f3Smrg       * it's safe to enable.
53047ec681f3Smrg       */
53057ec681f3Smrg      mode = V_028064_VRS_COMB_MODE_OVERRIDE;
53067ec681f3Smrg      rate_x = rate_y = 1;
53077ec681f3Smrg   } else if (pipeline->device->force_vrs != RADV_FORCE_VRS_NONE) {
53087ec681f3Smrg      /* Force enable vertex VRS if requested by the user. */
53097ec681f3Smrg      radeon_set_context_reg(
53107ec681f3Smrg         ctx_cs, R_028848_PA_CL_VRS_CNTL,
53117ec681f3Smrg         S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE) |
53127ec681f3Smrg            S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_VRS_COMB_MODE_OVERRIDE));
53137ec681f3Smrg
53147ec681f3Smrg      /* If the shader is using discard, turn off coarse shading
53157ec681f3Smrg       * because discard at 2x2 pixel granularity degrades quality
53167ec681f3Smrg       * too much. MIN allows sample shading but not coarse shading.
53177ec681f3Smrg       */
53187ec681f3Smrg      struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
53197ec681f3Smrg
53207ec681f3Smrg      mode = ps->info.ps.can_discard ? V_028064_VRS_COMB_MODE_MIN : V_028064_VRS_COMB_MODE_PASSTHRU;
53217ec681f3Smrg   }
53227ec681f3Smrg
53237ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028A98_VGT_DRAW_PAYLOAD_CNTL, S_028A98_EN_VRS_RATE(enable_vrs));
53247ec681f3Smrg
53257ec681f3Smrg   radeon_set_context_reg(ctx_cs, R_028064_DB_VRS_OVERRIDE_CNTL,
53267ec681f3Smrg                          S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) |
53277ec681f3Smrg                             S_028064_VRS_OVERRIDE_RATE_X(rate_x) |
53287ec681f3Smrg                             S_028064_VRS_OVERRIDE_RATE_Y(rate_y));
53297ec681f3Smrg}
533001e04c3fSmrg
53317ec681f3Smrgstatic void
53327ec681f3Smrgradv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
53337ec681f3Smrg                           const VkGraphicsPipelineCreateInfo *pCreateInfo,
53347ec681f3Smrg                           const struct radv_graphics_pipeline_create_info *extra,
53357ec681f3Smrg                           const struct radv_blend_state *blend)
53367ec681f3Smrg{
53377ec681f3Smrg   struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
53387ec681f3Smrg   struct radeon_cmdbuf *cs = &pipeline->cs;
53397ec681f3Smrg
53407ec681f3Smrg   cs->max_dw = 64;
53417ec681f3Smrg   ctx_cs->max_dw = 256;
53427ec681f3Smrg   cs->buf = malloc(4 * (cs->max_dw + ctx_cs->max_dw));
53437ec681f3Smrg   ctx_cs->buf = cs->buf + cs->max_dw;
53447ec681f3Smrg
53457ec681f3Smrg   radv_pipeline_generate_depth_stencil_state(ctx_cs, pipeline, pCreateInfo, extra);
53467ec681f3Smrg   radv_pipeline_generate_blend_state(ctx_cs, pipeline, blend);
53477ec681f3Smrg   radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
53487ec681f3Smrg   radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
53497ec681f3Smrg   radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
53507ec681f3Smrg   radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline);
53517ec681f3Smrg
53527ec681f3Smrg   if (radv_pipeline_has_tess(pipeline)) {
53537ec681f3Smrg      radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline);
53547ec681f3Smrg      radv_pipeline_generate_tess_state(ctx_cs, pipeline, pCreateInfo);
53557ec681f3Smrg   }
53567ec681f3Smrg
53577ec681f3Smrg   radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline);
53587ec681f3Smrg   radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
53597ec681f3Smrg   radv_pipeline_generate_ps_inputs(ctx_cs, pipeline);
53607ec681f3Smrg   radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
53617ec681f3Smrg   radv_pipeline_generate_vgt_shader_config(ctx_cs, pipeline);
53627ec681f3Smrg   radv_pipeline_generate_cliprect_rule(ctx_cs, pCreateInfo);
53637ec681f3Smrg   radv_pipeline_generate_vgt_gs_out(ctx_cs, pipeline, pCreateInfo, extra);
53647ec681f3Smrg
53657ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
53667ec681f3Smrg       !radv_pipeline_has_ngg(pipeline))
53677ec681f3Smrg      gfx10_pipeline_generate_ge_cntl(ctx_cs, pipeline);
53687ec681f3Smrg
53697ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
53707ec681f3Smrg      gfx103_pipeline_generate_vrs_state(ctx_cs, pipeline, pCreateInfo);
53717ec681f3Smrg
53727ec681f3Smrg   pipeline->ctx_cs_hash = _mesa_hash_data(ctx_cs->buf, ctx_cs->cdw * 4);
53737ec681f3Smrg
53747ec681f3Smrg   assert(ctx_cs->cdw <= ctx_cs->max_dw);
53757ec681f3Smrg   assert(cs->cdw <= cs->max_dw);
53767ec681f3Smrg}
537701e04c3fSmrg
53787ec681f3Smrgstatic void
53797ec681f3Smrgradv_pipeline_init_vertex_input_state(struct radv_pipeline *pipeline,
53807ec681f3Smrg                                      const VkGraphicsPipelineCreateInfo *pCreateInfo,
53817ec681f3Smrg                                      const struct radv_pipeline_key *key)
53827ec681f3Smrg{
53837ec681f3Smrg   const struct radv_shader_info *info = &radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info;
53847ec681f3Smrg   if (!key->vs.dynamic_input_state) {
53857ec681f3Smrg      const VkPipelineVertexInputStateCreateInfo *vi_info = pCreateInfo->pVertexInputState;
53867ec681f3Smrg
53877ec681f3Smrg      for (uint32_t i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
53887ec681f3Smrg         const VkVertexInputBindingDescription *desc = &vi_info->pVertexBindingDescriptions[i];
53897ec681f3Smrg
53907ec681f3Smrg         pipeline->binding_stride[desc->binding] = desc->stride;
53917ec681f3Smrg      }
53927ec681f3Smrg
53937ec681f3Smrg      for (uint32_t i = 0; i < vi_info->vertexAttributeDescriptionCount; i++) {
53947ec681f3Smrg         const VkVertexInputAttributeDescription *desc = &vi_info->pVertexAttributeDescriptions[i];
53957ec681f3Smrg
53967ec681f3Smrg         uint32_t end = desc->offset + vk_format_get_blocksize(desc->format);
53977ec681f3Smrg         pipeline->attrib_ends[desc->location] = end;
53987ec681f3Smrg         if (pipeline->binding_stride[desc->binding])
53997ec681f3Smrg            pipeline->attrib_index_offset[desc->location] =
54007ec681f3Smrg               desc->offset / pipeline->binding_stride[desc->binding];
54017ec681f3Smrg         pipeline->attrib_bindings[desc->location] = desc->binding;
54027ec681f3Smrg      }
54037ec681f3Smrg   }
54047ec681f3Smrg
54057ec681f3Smrg   pipeline->use_per_attribute_vb_descs = info->vs.use_per_attribute_vb_descs;
54067ec681f3Smrg   pipeline->last_vertex_attrib_bit = util_last_bit(info->vs.vb_desc_usage_mask);
54077ec681f3Smrg   if (pipeline->shaders[MESA_SHADER_VERTEX])
54087ec681f3Smrg      pipeline->next_vertex_stage = MESA_SHADER_VERTEX;
54097ec681f3Smrg   else if (pipeline->shaders[MESA_SHADER_TESS_CTRL])
54107ec681f3Smrg      pipeline->next_vertex_stage = MESA_SHADER_TESS_CTRL;
54117ec681f3Smrg   else
54127ec681f3Smrg      pipeline->next_vertex_stage = MESA_SHADER_GEOMETRY;
54137ec681f3Smrg   if (pipeline->next_vertex_stage == MESA_SHADER_VERTEX) {
54147ec681f3Smrg      const struct radv_shader_variant *vs_shader = pipeline->shaders[MESA_SHADER_VERTEX];
54157ec681f3Smrg      pipeline->can_use_simple_input = vs_shader->info.is_ngg == pipeline->device->physical_device->use_ngg &&
54167ec681f3Smrg                                       vs_shader->info.wave_size == pipeline->device->physical_device->ge_wave_size;
54177ec681f3Smrg   } else {
54187ec681f3Smrg      pipeline->can_use_simple_input = false;
54197ec681f3Smrg   }
54207ec681f3Smrg   if (info->vs.dynamic_inputs)
54217ec681f3Smrg      pipeline->vb_desc_usage_mask = BITFIELD_MASK(pipeline->last_vertex_attrib_bit);
54227ec681f3Smrg   else
54237ec681f3Smrg      pipeline->vb_desc_usage_mask = info->vs.vb_desc_usage_mask;
54247ec681f3Smrg   pipeline->vb_desc_alloc_size = util_bitcount(pipeline->vb_desc_usage_mask) * 16;
54257ec681f3Smrg}
542601e04c3fSmrg
54277ec681f3Smrgstatic struct radv_shader_variant *
54287ec681f3Smrgradv_pipeline_get_streamout_shader(struct radv_pipeline *pipeline)
54297ec681f3Smrg{
54307ec681f3Smrg   int i;
543101e04c3fSmrg
54327ec681f3Smrg   for (i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) {
54337ec681f3Smrg      struct radv_shader_variant *shader = radv_get_shader(pipeline, i);
543401e04c3fSmrg
54357ec681f3Smrg      if (shader && shader->info.so.num_outputs > 0)
54367ec681f3Smrg         return shader;
54377ec681f3Smrg   }
543801e04c3fSmrg
54397ec681f3Smrg   return NULL;
54407ec681f3Smrg}
544101e04c3fSmrg
54427ec681f3Smrgstatic bool
54437ec681f3Smrgradv_shader_need_indirect_descriptor_sets(struct radv_pipeline *pipeline, gl_shader_stage stage)
54447ec681f3Smrg{
54457ec681f3Smrg   struct radv_userdata_info *loc =
54467ec681f3Smrg      radv_lookup_user_sgpr(pipeline, stage, AC_UD_INDIRECT_DESCRIPTOR_SETS);
54477ec681f3Smrg   return loc->sgpr_idx != -1;
544801e04c3fSmrg}
544901e04c3fSmrg
545001e04c3fSmrgstatic void
54517ec681f3Smrgradv_pipeline_init_shader_stages_state(struct radv_pipeline *pipeline)
545201e04c3fSmrg{
54537ec681f3Smrg   struct radv_device *device = pipeline->device;
54547ec681f3Smrg
54557ec681f3Smrg   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
54567ec681f3Smrg      pipeline->user_data_0[i] = radv_pipeline_stage_to_user_data_0(
54577ec681f3Smrg         pipeline, i, device->physical_device->rad_info.chip_class);
54587ec681f3Smrg
54597ec681f3Smrg      if (pipeline->shaders[i]) {
54607ec681f3Smrg         pipeline->need_indirect_descriptor_sets |=
54617ec681f3Smrg            radv_shader_need_indirect_descriptor_sets(pipeline, i);
54627ec681f3Smrg      }
54637ec681f3Smrg   }
54647ec681f3Smrg
54657ec681f3Smrg   struct radv_userdata_info *loc =
54667ec681f3Smrg      radv_lookup_user_sgpr(pipeline, MESA_SHADER_VERTEX, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
54677ec681f3Smrg   if (loc->sgpr_idx != -1) {
54687ec681f3Smrg      pipeline->graphics.vtx_base_sgpr = pipeline->user_data_0[MESA_SHADER_VERTEX];
54697ec681f3Smrg      pipeline->graphics.vtx_base_sgpr += loc->sgpr_idx * 4;
54707ec681f3Smrg      pipeline->graphics.vtx_emit_num = loc->num_sgprs;
54717ec681f3Smrg      pipeline->graphics.uses_drawid =
54727ec681f3Smrg         radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_draw_id;
54737ec681f3Smrg      pipeline->graphics.uses_baseinstance =
54747ec681f3Smrg         radv_get_shader(pipeline, MESA_SHADER_VERTEX)->info.vs.needs_base_instance;
54757ec681f3Smrg   }
54767ec681f3Smrg}
5477ed98bd31Smaya
54787ec681f3Smrgstatic VkResult
54797ec681f3Smrgradv_pipeline_init(struct radv_pipeline *pipeline, struct radv_device *device,
54807ec681f3Smrg                   struct radv_pipeline_cache *cache,
54817ec681f3Smrg                   const VkGraphicsPipelineCreateInfo *pCreateInfo,
54827ec681f3Smrg                   const struct radv_graphics_pipeline_create_info *extra)
54837ec681f3Smrg{
54847ec681f3Smrg   RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
54857ec681f3Smrg   VkResult result;
54867ec681f3Smrg
54877ec681f3Smrg   pipeline->device = device;
54887ec681f3Smrg   pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE;
54897ec681f3Smrg
54907ec681f3Smrg   struct radv_blend_state blend = radv_pipeline_init_blend_state(pipeline, pCreateInfo, extra);
54917ec681f3Smrg
54927ec681f3Smrg   const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback =
54937ec681f3Smrg      vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
54947ec681f3Smrg   radv_init_feedback(creation_feedback);
54957ec681f3Smrg
54967ec681f3Smrg   VkPipelineCreationFeedbackEXT *pipeline_feedback =
54977ec681f3Smrg      creation_feedback ? creation_feedback->pPipelineCreationFeedback : NULL;
54987ec681f3Smrg
54997ec681f3Smrg   const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = {
55007ec681f3Smrg      0,
55017ec681f3Smrg   };
55027ec681f3Smrg   VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = {0};
55037ec681f3Smrg   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
55047ec681f3Smrg      gl_shader_stage stage = ffs(pCreateInfo->pStages[i].stage) - 1;
55057ec681f3Smrg      pStages[stage] = &pCreateInfo->pStages[i];
55067ec681f3Smrg      if (creation_feedback)
55077ec681f3Smrg         stage_feedbacks[stage] = &creation_feedback->pPipelineStageCreationFeedbacks[i];
55087ec681f3Smrg   }
55097ec681f3Smrg
55107ec681f3Smrg   struct radv_pipeline_key key =
55117ec681f3Smrg      radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend);
55127ec681f3Smrg
55137ec681f3Smrg   result = radv_create_shaders(pipeline, pipeline_layout, device, cache, &key, pStages,
55147ec681f3Smrg                                pCreateInfo->flags, NULL, pipeline_feedback, stage_feedbacks);
55157ec681f3Smrg   if (result != VK_SUCCESS)
55167ec681f3Smrg      return result;
55177ec681f3Smrg
55187ec681f3Smrg   pipeline->graphics.spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
55197ec681f3Smrg   radv_pipeline_init_multisample_state(pipeline, &blend, pCreateInfo);
55207ec681f3Smrg   radv_pipeline_init_input_assembly_state(pipeline, pCreateInfo, extra);
55217ec681f3Smrg   radv_pipeline_init_dynamic_state(pipeline, pCreateInfo, extra);
55227ec681f3Smrg   radv_pipeline_init_raster_state(pipeline, pCreateInfo);
55237ec681f3Smrg   radv_pipeline_init_depth_stencil_state(pipeline, pCreateInfo);
55247ec681f3Smrg
55257ec681f3Smrg   if (pipeline->device->physical_device->rad_info.chip_class >= GFX10_3)
55267ec681f3Smrg      gfx103_pipeline_init_vrs_state(pipeline, pCreateInfo);
55277ec681f3Smrg
55287ec681f3Smrg   /* Ensure that some export memory is always allocated, for two reasons:
55297ec681f3Smrg    *
55307ec681f3Smrg    * 1) Correctness: The hardware ignores the EXEC mask if no export
55317ec681f3Smrg    *    memory is allocated, so KILL and alpha test do not work correctly
55327ec681f3Smrg    *    without this.
55337ec681f3Smrg    * 2) Performance: Every shader needs at least a NULL export, even when
55347ec681f3Smrg    *    it writes no color/depth output. The NULL export instruction
55357ec681f3Smrg    *    stalls without this setting.
55367ec681f3Smrg    *
55377ec681f3Smrg    * Don't add this to CB_SHADER_MASK.
55387ec681f3Smrg    *
55397ec681f3Smrg    * GFX10 supports pixel shaders without exports by setting both the
55407ec681f3Smrg    * color and Z formats to SPI_SHADER_ZERO. The hw will skip export
55417ec681f3Smrg    * instructions if any are present.
55427ec681f3Smrg    */
55437ec681f3Smrg   struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
55447ec681f3Smrg   if ((pipeline->device->physical_device->rad_info.chip_class <= GFX9 ||
55457ec681f3Smrg        ps->info.ps.can_discard) &&
55467ec681f3Smrg       !blend.spi_shader_col_format) {
55477ec681f3Smrg      if (!ps->info.ps.writes_z && !ps->info.ps.writes_stencil && !ps->info.ps.writes_sample_mask)
55487ec681f3Smrg         blend.spi_shader_col_format = V_028714_SPI_SHADER_32_R;
55497ec681f3Smrg   }
55507ec681f3Smrg
55517ec681f3Smrg   if (extra && (extra->custom_blend_mode == V_028808_CB_ELIMINATE_FAST_CLEAR ||
55527ec681f3Smrg                 extra->custom_blend_mode == V_028808_CB_FMASK_DECOMPRESS ||
55537ec681f3Smrg                 extra->custom_blend_mode == V_028808_CB_DCC_DECOMPRESS ||
55547ec681f3Smrg                 extra->custom_blend_mode == V_028808_CB_RESOLVE)) {
55557ec681f3Smrg      /* According to the CB spec states, CB_SHADER_MASK should be
55567ec681f3Smrg       * set to enable writes to all four channels of MRT0.
55577ec681f3Smrg       */
55587ec681f3Smrg      blend.cb_shader_mask = 0xf;
55597ec681f3Smrg   }
55607ec681f3Smrg
55617ec681f3Smrg   pipeline->graphics.col_format = blend.spi_shader_col_format;
55627ec681f3Smrg   pipeline->graphics.cb_target_mask = blend.cb_target_mask;
55637ec681f3Smrg
55647ec681f3Smrg   if (radv_pipeline_has_gs(pipeline) && !radv_pipeline_has_ngg(pipeline)) {
55657ec681f3Smrg      struct radv_shader_variant *gs = pipeline->shaders[MESA_SHADER_GEOMETRY];
55667ec681f3Smrg
55677ec681f3Smrg      radv_pipeline_init_gs_ring_state(pipeline, &gs->info.gs_ring_info);
55687ec681f3Smrg   }
55697ec681f3Smrg
55707ec681f3Smrg   if (radv_pipeline_has_tess(pipeline)) {
55717ec681f3Smrg      pipeline->graphics.tess_patch_control_points =
55727ec681f3Smrg         pCreateInfo->pTessellationState->patchControlPoints;
55737ec681f3Smrg   }
55747ec681f3Smrg
55757ec681f3Smrg   radv_pipeline_init_vertex_input_state(pipeline, pCreateInfo, &key);
55767ec681f3Smrg   radv_pipeline_init_binning_state(pipeline, pCreateInfo, &blend);
55777ec681f3Smrg   radv_pipeline_init_shader_stages_state(pipeline);
55787ec681f3Smrg   radv_pipeline_init_scratch(device, pipeline);
55797ec681f3Smrg
55807ec681f3Smrg   /* Find the last vertex shader stage that eventually uses streamout. */
55817ec681f3Smrg   pipeline->streamout_shader = radv_pipeline_get_streamout_shader(pipeline);
55827ec681f3Smrg
55837ec681f3Smrg   pipeline->graphics.is_ngg = radv_pipeline_has_ngg(pipeline);
55847ec681f3Smrg   pipeline->graphics.has_ngg_culling =
55857ec681f3Smrg      pipeline->graphics.is_ngg &&
55867ec681f3Smrg      pipeline->shaders[pipeline->graphics.last_vgt_api_stage]->info.has_ngg_culling;
55877ec681f3Smrg
55887ec681f3Smrg   pipeline->push_constant_size = pipeline_layout->push_constant_size;
55897ec681f3Smrg   pipeline->dynamic_offset_count = pipeline_layout->dynamic_offset_count;
55907ec681f3Smrg
55917ec681f3Smrg   radv_pipeline_generate_pm4(pipeline, pCreateInfo, extra, &blend);
55927ec681f3Smrg
55937ec681f3Smrg   return result;
55947ec681f3Smrg}
5595ed98bd31Smaya
55967ec681f3SmrgVkResult
55977ec681f3Smrgradv_graphics_pipeline_create(VkDevice _device, VkPipelineCache _cache,
55987ec681f3Smrg                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
55997ec681f3Smrg                              const struct radv_graphics_pipeline_create_info *extra,
56007ec681f3Smrg                              const VkAllocationCallbacks *pAllocator, VkPipeline *pPipeline)
56017ec681f3Smrg{
56027ec681f3Smrg   RADV_FROM_HANDLE(radv_device, device, _device);
56037ec681f3Smrg   RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
56047ec681f3Smrg   struct radv_pipeline *pipeline;
56057ec681f3Smrg   VkResult result;
5606ed98bd31Smaya
56077ec681f3Smrg   pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
56087ec681f3Smrg                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
56097ec681f3Smrg   if (pipeline == NULL)
56107ec681f3Smrg      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
561101e04c3fSmrg
56127ec681f3Smrg   vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
56137ec681f3Smrg   pipeline->type = RADV_PIPELINE_GRAPHICS;
561401e04c3fSmrg
56157ec681f3Smrg   result = radv_pipeline_init(pipeline, device, cache, pCreateInfo, extra);
56167ec681f3Smrg   if (result != VK_SUCCESS) {
56177ec681f3Smrg      radv_pipeline_destroy(device, pipeline, pAllocator);
56187ec681f3Smrg      return result;
56197ec681f3Smrg   }
562001e04c3fSmrg
56217ec681f3Smrg   *pPipeline = radv_pipeline_to_handle(pipeline);
562201e04c3fSmrg
56237ec681f3Smrg   return VK_SUCCESS;
56247ec681f3Smrg}
5625ed98bd31Smaya
56267ec681f3SmrgVkResult
56277ec681f3Smrgradv_CreateGraphicsPipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
56287ec681f3Smrg                             const VkGraphicsPipelineCreateInfo *pCreateInfos,
56297ec681f3Smrg                             const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
56307ec681f3Smrg{
56317ec681f3Smrg   VkResult result = VK_SUCCESS;
56327ec681f3Smrg   unsigned i = 0;
56337ec681f3Smrg
56347ec681f3Smrg   for (; i < count; i++) {
56357ec681f3Smrg      VkResult r;
56367ec681f3Smrg      r = radv_graphics_pipeline_create(_device, pipelineCache, &pCreateInfos[i], NULL, pAllocator,
56377ec681f3Smrg                                        &pPipelines[i]);
56387ec681f3Smrg      if (r != VK_SUCCESS) {
56397ec681f3Smrg         result = r;
56407ec681f3Smrg         pPipelines[i] = VK_NULL_HANDLE;
56417ec681f3Smrg
56427ec681f3Smrg         if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
56437ec681f3Smrg            break;
56447ec681f3Smrg      }
56457ec681f3Smrg   }
56467ec681f3Smrg
56477ec681f3Smrg   for (; i < count; ++i)
56487ec681f3Smrg      pPipelines[i] = VK_NULL_HANDLE;
56497ec681f3Smrg
56507ec681f3Smrg   return result;
565101e04c3fSmrg}
565201e04c3fSmrg
56537ec681f3Smrgstatic void
56547ec681f3Smrgradv_pipeline_generate_hw_cs(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline)
56557ec681f3Smrg{
56567ec681f3Smrg   struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE];
56577ec681f3Smrg   uint64_t va = radv_shader_variant_get_va(shader);
56587ec681f3Smrg   struct radv_device *device = pipeline->device;
56597ec681f3Smrg
56607ec681f3Smrg   radeon_set_sh_reg(cs, R_00B830_COMPUTE_PGM_LO, va >> 8);
56617ec681f3Smrg
56627ec681f3Smrg   radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
56637ec681f3Smrg   radeon_emit(cs, shader->config.rsrc1);
56647ec681f3Smrg   radeon_emit(cs, shader->config.rsrc2);
56657ec681f3Smrg   if (device->physical_device->rad_info.chip_class >= GFX10) {
56667ec681f3Smrg      radeon_set_sh_reg(cs, R_00B8A0_COMPUTE_PGM_RSRC3, shader->config.rsrc3);
56677ec681f3Smrg   }
566801e04c3fSmrg}
566901e04c3fSmrg
56707ec681f3Smrgstatic void
56717ec681f3Smrgradv_pipeline_generate_compute_state(struct radeon_cmdbuf *cs, const struct radv_pipeline *pipeline)
56727ec681f3Smrg{
56737ec681f3Smrg   struct radv_shader_variant *shader = pipeline->shaders[MESA_SHADER_COMPUTE];
56747ec681f3Smrg   struct radv_device *device = pipeline->device;
56757ec681f3Smrg   unsigned threads_per_threadgroup;
56767ec681f3Smrg   unsigned threadgroups_per_cu = 1;
56777ec681f3Smrg   unsigned waves_per_threadgroup;
56787ec681f3Smrg   unsigned max_waves_per_sh = 0;
56797ec681f3Smrg
56807ec681f3Smrg   /* Calculate best compute resource limits. */
56817ec681f3Smrg   threads_per_threadgroup =
56827ec681f3Smrg      shader->info.cs.block_size[0] * shader->info.cs.block_size[1] * shader->info.cs.block_size[2];
56837ec681f3Smrg   waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, shader->info.wave_size);
56847ec681f3Smrg
56857ec681f3Smrg   if (device->physical_device->rad_info.chip_class >= GFX10 && waves_per_threadgroup == 1)
56867ec681f3Smrg      threadgroups_per_cu = 2;
56877ec681f3Smrg
56887ec681f3Smrg   radeon_set_sh_reg(
56897ec681f3Smrg      cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
56907ec681f3Smrg      ac_get_compute_resource_limits(&device->physical_device->rad_info, waves_per_threadgroup,
56917ec681f3Smrg                                     max_waves_per_sh, threadgroups_per_cu));
56927ec681f3Smrg
56937ec681f3Smrg   radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
56947ec681f3Smrg   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[0]));
56957ec681f3Smrg   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[1]));
56967ec681f3Smrg   radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(shader->info.cs.block_size[2]));
56977ec681f3Smrg}
569801e04c3fSmrg
569901e04c3fSmrgstatic void
57007ec681f3Smrgradv_compute_generate_pm4(struct radv_pipeline *pipeline)
570101e04c3fSmrg{
57027ec681f3Smrg   struct radv_device *device = pipeline->device;
57037ec681f3Smrg   struct radeon_cmdbuf *cs = &pipeline->cs;
570401e04c3fSmrg
57057ec681f3Smrg   cs->max_dw = device->physical_device->rad_info.chip_class >= GFX10 ? 19 : 16;
57067ec681f3Smrg   cs->buf = malloc(cs->max_dw * 4);
570701e04c3fSmrg
57087ec681f3Smrg   radv_pipeline_generate_hw_cs(cs, pipeline);
57097ec681f3Smrg   radv_pipeline_generate_compute_state(cs, pipeline);
571001e04c3fSmrg
57117ec681f3Smrg   assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
57127ec681f3Smrg}
571301e04c3fSmrg
57147ec681f3Smrgstatic struct radv_pipeline_key
57157ec681f3Smrgradv_generate_compute_pipeline_key(struct radv_pipeline *pipeline,
57167ec681f3Smrg                                   const VkComputePipelineCreateInfo *pCreateInfo)
57177ec681f3Smrg{
57187ec681f3Smrg   const VkPipelineShaderStageCreateInfo *stage = &pCreateInfo->stage;
57197ec681f3Smrg   struct radv_pipeline_key key;
57207ec681f3Smrg   memset(&key, 0, sizeof(key));
57217ec681f3Smrg
57227ec681f3Smrg   if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
57237ec681f3Smrg      key.optimisations_disabled = 1;
57247ec681f3Smrg
57257ec681f3Smrg   const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *subgroup_size =
57267ec681f3Smrg      vk_find_struct_const(stage->pNext,
57277ec681f3Smrg                           PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT);
57287ec681f3Smrg
57297ec681f3Smrg   if (subgroup_size) {
57307ec681f3Smrg      assert(subgroup_size->requiredSubgroupSize == 32 ||
57317ec681f3Smrg             subgroup_size->requiredSubgroupSize == 64);
57327ec681f3Smrg      key.cs.compute_subgroup_size = subgroup_size->requiredSubgroupSize;
57337ec681f3Smrg   } else if (stage->flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) {
57347ec681f3Smrg      key.cs.require_full_subgroups = true;
57357ec681f3Smrg   }
57367ec681f3Smrg
57377ec681f3Smrg   return key;
57387ec681f3Smrg}
573901e04c3fSmrg
/* Create a compute pipeline object (also used as the backend for internal
 * and ray-tracing pipeline creation).
 *
 * custom_hash: optional override for the shader-cache hash (NULL for plain
 *    compute pipelines).
 * rt_stack_sizes / rt_group_count: ray-tracing stack-size table; ownership
 *    of rt_stack_sizes transfers to this function — it is freed here on
 *    early OOM, otherwise stored in the pipeline.
 *
 * Returns VK_SUCCESS and writes *pPipeline, or an error code (the partially
 * constructed pipeline is destroyed on failure).
 */
VkResult
radv_compute_pipeline_create(VkDevice _device, VkPipelineCache _cache,
                             const VkComputePipelineCreateInfo *pCreateInfo,
                             const VkAllocationCallbacks *pAllocator, const uint8_t *custom_hash,
                             struct radv_pipeline_shader_stack_size *rt_stack_sizes,
                             uint32_t rt_group_count, VkPipeline *pPipeline)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline_cache, cache, _cache);
   RADV_FROM_HANDLE(radv_pipeline_layout, pipeline_layout, pCreateInfo->layout);
   const VkPipelineShaderStageCreateInfo *pStages[MESA_SHADER_STAGES] = {
      0,
   };
   VkPipelineCreationFeedbackEXT *stage_feedbacks[MESA_SHADER_STAGES] = {0};
   struct radv_pipeline *pipeline;
   VkResult result;

   pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL) {
      /* We took ownership of rt_stack_sizes; no pipeline exists to hold it. */
      free(rt_stack_sizes);
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   }

   vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
   pipeline->type = RADV_PIPELINE_COMPUTE;

   pipeline->device = device;
   /* Compute pipelines have no geometry (VGT) stages. */
   pipeline->graphics.last_vgt_api_stage = MESA_SHADER_NONE;
   pipeline->compute.rt_stack_sizes = rt_stack_sizes;
   pipeline->compute.group_count = rt_group_count;

   /* Optional VK_EXT_pipeline_creation_feedback bookkeeping. */
   const VkPipelineCreationFeedbackCreateInfoEXT *creation_feedback =
      vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO_EXT);
   radv_init_feedback(creation_feedback);

   VkPipelineCreationFeedbackEXT *pipeline_feedback =
      creation_feedback ? creation_feedback->pPipelineCreationFeedback : NULL;
   if (creation_feedback)
      stage_feedbacks[MESA_SHADER_COMPUTE] = &creation_feedback->pPipelineStageCreationFeedbacks[0];

   pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage;

   struct radv_pipeline_key key = radv_generate_compute_pipeline_key(pipeline, pCreateInfo);

   /* Compile the compute shader (or fetch it from the pipeline cache). */
   result = radv_create_shaders(pipeline, pipeline_layout, device, cache, &key, pStages,
                                pCreateInfo->flags, custom_hash, pipeline_feedback, stage_feedbacks);
   if (result != VK_SUCCESS) {
      /* Destroys the pipeline and, presumably, the rt_stack_sizes it now
       * owns — confirm radv_pipeline_destroy frees compute.rt_stack_sizes. */
      radv_pipeline_destroy(device, pipeline, pAllocator);
      return result;
   }

   pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(
      pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class);
   pipeline->need_indirect_descriptor_sets |=
      radv_shader_need_indirect_descriptor_sets(pipeline, MESA_SHADER_COMPUTE);
   radv_pipeline_init_scratch(device, pipeline);

   pipeline->push_constant_size = pipeline_layout->push_constant_size;
   pipeline->dynamic_offset_count = pipeline_layout->dynamic_offset_count;

   /* Pre-record the PM4 packets that bind this pipeline. */
   radv_compute_generate_pm4(pipeline);

   *pPipeline = radv_pipeline_to_handle(pipeline);

   return VK_SUCCESS;
}
580701e04c3fSmrg
58087ec681f3SmrgVkResult
58097ec681f3Smrgradv_CreateComputePipelines(VkDevice _device, VkPipelineCache pipelineCache, uint32_t count,
58107ec681f3Smrg                            const VkComputePipelineCreateInfo *pCreateInfos,
58117ec681f3Smrg                            const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines)
581201e04c3fSmrg{
58137ec681f3Smrg   VkResult result = VK_SUCCESS;
58147ec681f3Smrg
58157ec681f3Smrg   unsigned i = 0;
58167ec681f3Smrg   for (; i < count; i++) {
58177ec681f3Smrg      VkResult r;
58187ec681f3Smrg      r = radv_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i], pAllocator, NULL,
58197ec681f3Smrg                                       NULL, 0, &pPipelines[i]);
58207ec681f3Smrg      if (r != VK_SUCCESS) {
58217ec681f3Smrg         result = r;
58227ec681f3Smrg         pPipelines[i] = VK_NULL_HANDLE;
58237ec681f3Smrg
58247ec681f3Smrg         if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
58257ec681f3Smrg            break;
58267ec681f3Smrg      }
58277ec681f3Smrg   }
58287ec681f3Smrg
58297ec681f3Smrg   for (; i < count; ++i)
58307ec681f3Smrg      pPipelines[i] = VK_NULL_HANDLE;
58317ec681f3Smrg
58327ec681f3Smrg   return result;
58337ec681f3Smrg}
583401e04c3fSmrg
58357ec681f3Smrgstatic uint32_t
58367ec681f3Smrgradv_get_executable_count(const struct radv_pipeline *pipeline)
58377ec681f3Smrg{
58387ec681f3Smrg   uint32_t ret = 0;
58397ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
58407ec681f3Smrg      if (!pipeline->shaders[i])
58417ec681f3Smrg         continue;
58427ec681f3Smrg
58437ec681f3Smrg      if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) {
58447ec681f3Smrg         ret += 2u;
58457ec681f3Smrg      } else {
58467ec681f3Smrg         ret += 1u;
58477ec681f3Smrg      }
58487ec681f3Smrg   }
58497ec681f3Smrg   return ret;
58507ec681f3Smrg}
585101e04c3fSmrg
58527ec681f3Smrgstatic struct radv_shader_variant *
58537ec681f3Smrgradv_get_shader_from_executable_index(const struct radv_pipeline *pipeline, int index,
58547ec681f3Smrg                                      gl_shader_stage *stage)
58557ec681f3Smrg{
58567ec681f3Smrg   for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
58577ec681f3Smrg      if (!pipeline->shaders[i])
58587ec681f3Smrg         continue;
58597ec681f3Smrg      if (!index) {
58607ec681f3Smrg         *stage = i;
58617ec681f3Smrg         return pipeline->shaders[i];
58627ec681f3Smrg      }
58637ec681f3Smrg
58647ec681f3Smrg      --index;
58657ec681f3Smrg
58667ec681f3Smrg      if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) {
58677ec681f3Smrg         if (!index) {
58687ec681f3Smrg            *stage = i;
58697ec681f3Smrg            return pipeline->gs_copy_shader;
58707ec681f3Smrg         }
58717ec681f3Smrg         --index;
58727ec681f3Smrg      }
58737ec681f3Smrg   }
58747ec681f3Smrg
58757ec681f3Smrg   *stage = -1;
58767ec681f3Smrg   return NULL;
58777ec681f3Smrg}
587801e04c3fSmrg
58797ec681f3Smrg/* Basically strlcpy (which does not exist on linux) specialized for
58807ec681f3Smrg * descriptions. */
58817ec681f3Smrgstatic void
58827ec681f3Smrgdesc_copy(char *desc, const char *src)
58837ec681f3Smrg{
58847ec681f3Smrg   int len = strlen(src);
58857ec681f3Smrg   assert(len < VK_MAX_DESCRIPTION_SIZE);
58867ec681f3Smrg   memcpy(desc, src, len);
58877ec681f3Smrg   memset(desc + len, 0, VK_MAX_DESCRIPTION_SIZE - len);
588801e04c3fSmrg}
588901e04c3fSmrg
/* vkGetPipelineExecutablePropertiesKHR: enumerate the pipeline's executables
 * (one per hardware shader, two for a non-NGG GS because of the copy shader).
 *
 * Standard Vulkan two-call idiom: with pProperties == NULL only the count is
 * returned; otherwise up to *pExecutableCount entries are filled and
 * VK_INCOMPLETE is returned if that was fewer than the total.
 */
VkResult
radv_GetPipelineExecutablePropertiesKHR(VkDevice _device, const VkPipelineInfoKHR *pPipelineInfo,
                                        uint32_t *pExecutableCount,
                                        VkPipelineExecutablePropertiesKHR *pProperties)
{
   RADV_FROM_HANDLE(radv_pipeline, pipeline, pPipelineInfo->pipeline);
   const uint32_t total_count = radv_get_executable_count(pipeline);

   /* Count-query call. */
   if (!pProperties) {
      *pExecutableCount = total_count;
      return VK_SUCCESS;
   }

   const uint32_t count = MIN2(total_count, *pExecutableCount);
   for (unsigned i = 0, executable_idx = 0; i < MESA_SHADER_STAGES && executable_idx < count; ++i) {
      if (!pipeline->shaders[i])
         continue;
      pProperties[executable_idx].stages = mesa_to_vk_shader_stage(i);
      const char *name = NULL;
      const char *description = NULL;
      /* Pick a human-readable name; merged hardware stages (e.g. VS+TCS on
       * chips that run them as one shader) report the combined API stages. */
      switch (i) {
      case MESA_SHADER_VERTEX:
         name = "Vertex Shader";
         description = "Vulkan Vertex Shader";
         break;
      case MESA_SHADER_TESS_CTRL:
         if (!pipeline->shaders[MESA_SHADER_VERTEX]) {
            /* VS was merged into the TCS binary. */
            pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
            name = "Vertex + Tessellation Control Shaders";
            description = "Combined Vulkan Vertex and Tessellation Control Shaders";
         } else {
            name = "Tessellation Control Shader";
            description = "Vulkan Tessellation Control Shader";
         }
         break;
      case MESA_SHADER_TESS_EVAL:
         name = "Tessellation Evaluation Shader";
         description = "Vulkan Tessellation Evaluation Shader";
         break;
      case MESA_SHADER_GEOMETRY:
         if (radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_TESS_EVAL]) {
            /* TES was merged into the GS binary. */
            pProperties[executable_idx].stages |= VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
            name = "Tessellation Evaluation + Geometry Shaders";
            description = "Combined Vulkan Tessellation Evaluation and Geometry Shaders";
         } else if (!radv_pipeline_has_tess(pipeline) && !pipeline->shaders[MESA_SHADER_VERTEX]) {
            /* VS was merged into the GS binary. */
            pProperties[executable_idx].stages |= VK_SHADER_STAGE_VERTEX_BIT;
            name = "Vertex + Geometry Shader";
            description = "Combined Vulkan Vertex and Geometry Shaders";
         } else {
            name = "Geometry Shader";
            description = "Vulkan Geometry Shader";
         }
         break;
      case MESA_SHADER_FRAGMENT:
         name = "Fragment Shader";
         description = "Vulkan Fragment Shader";
         break;
      case MESA_SHADER_COMPUTE:
         name = "Compute Shader";
         description = "Vulkan Compute Shader";
         break;
      }

      pProperties[executable_idx].subgroupSize = pipeline->shaders[i]->info.wave_size;
      desc_copy(pProperties[executable_idx].name, name);
      desc_copy(pProperties[executable_idx].description, description);

      ++executable_idx;
      /* A legacy (non-NGG) GS exposes the copy shader as a second
       * executable, immediately after the GS itself. */
      if (i == MESA_SHADER_GEOMETRY && !radv_pipeline_has_ngg(pipeline)) {
         assert(pipeline->gs_copy_shader);
         if (executable_idx >= count)
            break;

         pProperties[executable_idx].stages = VK_SHADER_STAGE_GEOMETRY_BIT;
         pProperties[executable_idx].subgroupSize = 64;
         desc_copy(pProperties[executable_idx].name, "GS Copy Shader");
         desc_copy(pProperties[executable_idx].description,
                   "Extra shader stage that loads the GS output ringbuffer into the rasterizer");

         ++executable_idx;
      }
   }

   VkResult result = *pExecutableCount < total_count ? VK_INCOMPLETE : VK_SUCCESS;
   *pExecutableCount = count;
   return result;
}
597701e04c3fSmrg
/* vkGetPipelineExecutableStatisticsKHR: report per-executable compiler
 * statistics (register counts, spills, code/LDS/scratch size, occupancy,
 * plus any backend (ACO) statistics attached to the shader).
 *
 * Pattern note: `s` is advanced past `end` on purpose — entries beyond the
 * caller's buffer are skipped but still counted, so `s - pStatistics` is the
 * total number of available statistics at the end.
 */
VkResult
radv_GetPipelineExecutableStatisticsKHR(VkDevice _device,
                                        const VkPipelineExecutableInfoKHR *pExecutableInfo,
                                        uint32_t *pStatisticCount,
                                        VkPipelineExecutableStatisticKHR *pStatistics)
{
   RADV_FROM_HANDLE(radv_device, device, _device);
   RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
   gl_shader_stage stage;
   struct radv_shader_variant *shader =
      radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);

   enum chip_class chip_class = device->physical_device->rad_info.chip_class;
   /* LDS allocation granularity: lds_size is in units of 512 bytes on GFX7+,
    * 256 bytes before. */
   unsigned lds_increment = chip_class >= GFX7 ? 512 : 256;
   unsigned max_waves = radv_get_max_waves(device, shader, stage);

   VkPipelineExecutableStatisticKHR *s = pStatistics;
   /* end == s when the caller is only querying the count. */
   VkPipelineExecutableStatisticKHR *end = s + (pStatistics ? *pStatisticCount : 0);
   VkResult result = VK_SUCCESS;

   if (s < end) {
      desc_copy(s->name, "SGPRs");
      desc_copy(s->description, "Number of SGPR registers allocated per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.num_sgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "VGPRs");
      desc_copy(s->description, "Number of VGPR registers allocated per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.num_vgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Spilled SGPRs");
      desc_copy(s->description, "Number of SGPR registers spilled per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.spilled_sgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Spilled VGPRs");
      desc_copy(s->description, "Number of VGPR registers spilled per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.spilled_vgprs;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Code size");
      desc_copy(s->description, "Code size in bytes");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->exec_size;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "LDS size");
      desc_copy(s->description, "LDS size in bytes per workgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.lds_size * lds_increment;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Scratch size");
      desc_copy(s->description, "Private memory in bytes per subgroup");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = shader->config.scratch_bytes_per_wave;
   }
   ++s;

   if (s < end) {
      desc_copy(s->name, "Subgroups per SIMD");
      desc_copy(s->description, "The maximum number of subgroups in flight on a SIMD unit");
      s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      s->value.u64 = max_waves;
   }
   ++s;

   /* Backend (ACO) statistics, when the shader was compiled with them. */
   if (shader->statistics) {
      for (unsigned i = 0; i < aco_num_statistics; i++) {
         const struct aco_compiler_statistic_info *info = &aco_statistic_infos[i];
         if (s < end) {
            desc_copy(s->name, info->name);
            desc_copy(s->description, info->desc);
            s->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
            s->value.u64 = shader->statistics[i];
         }
         ++s;
      }
   }

   /* Two-call idiom: report the total, the written count, or VK_INCOMPLETE
    * when the caller's buffer was too small. */
   if (!pStatistics)
      *pStatisticCount = s - pStatistics;
   else if (s > end) {
      *pStatisticCount = end - pStatistics;
      result = VK_INCOMPLETE;
   } else {
      *pStatisticCount = s - pStatistics;
   }

   return result;
}
608601e04c3fSmrg
60877ec681f3Smrgstatic VkResult
60887ec681f3Smrgradv_copy_representation(void *data, size_t *data_size, const char *src)
60897ec681f3Smrg{
60907ec681f3Smrg   size_t total_size = strlen(src) + 1;
609101e04c3fSmrg
60927ec681f3Smrg   if (!data) {
60937ec681f3Smrg      *data_size = total_size;
60947ec681f3Smrg      return VK_SUCCESS;
60957ec681f3Smrg   }
60967ec681f3Smrg
60977ec681f3Smrg   size_t size = MIN2(total_size, *data_size);
60987ec681f3Smrg
60997ec681f3Smrg   memcpy(data, src, size);
61007ec681f3Smrg   if (size)
61017ec681f3Smrg      *((char *)data + size - 1) = 0;
61027ec681f3Smrg   return size < total_size ? VK_INCOMPLETE : VK_SUCCESS;
61037ec681f3Smrg}
61047ec681f3Smrg
/* vkGetPipelineExecutableInternalRepresentationsKHR: expose the compiler's
 * intermediate representations for one executable — optimized NIR, backend
 * IR (ACO or LLVM), and the final disassembly when available.
 *
 * Pattern note: `p` is advanced past `end` on purpose — entries beyond the
 * caller's buffer are skipped but still counted, so `p -
 * pInternalRepresentations` is the total available count at the end.
 */
VkResult
radv_GetPipelineExecutableInternalRepresentationsKHR(
   VkDevice device, const VkPipelineExecutableInfoKHR *pExecutableInfo,
   uint32_t *pInternalRepresentationCount,
   VkPipelineExecutableInternalRepresentationKHR *pInternalRepresentations)
{
   RADV_FROM_HANDLE(radv_pipeline, pipeline, pExecutableInfo->pipeline);
   gl_shader_stage stage;
   struct radv_shader_variant *shader =
      radv_get_shader_from_executable_index(pipeline, pExecutableInfo->executableIndex, &stage);

   VkPipelineExecutableInternalRepresentationKHR *p = pInternalRepresentations;
   /* end == p when the caller is only querying the count. */
   VkPipelineExecutableInternalRepresentationKHR *end =
      p + (pInternalRepresentations ? *pInternalRepresentationCount : 0);
   VkResult result = VK_SUCCESS;
   /* optimized NIR */
   if (p < end) {
      p->isText = true;
      desc_copy(p->name, "NIR Shader(s)");
      desc_copy(p->description, "The optimized NIR shader(s)");
      if (radv_copy_representation(p->pData, &p->dataSize, shader->nir_string) != VK_SUCCESS)
         result = VK_INCOMPLETE;
   }
   ++p;

   /* backend IR (depends on which compiler was used for this stage) */
   if (p < end) {
      p->isText = true;
      if (radv_use_llvm_for_stage(pipeline->device, stage)) {
         desc_copy(p->name, "LLVM IR");
         desc_copy(p->description, "The LLVM IR after some optimizations");
      } else {
         desc_copy(p->name, "ACO IR");
         desc_copy(p->description, "The ACO IR after some optimizations");
      }
      if (radv_copy_representation(p->pData, &p->dataSize, shader->ir_string) != VK_SUCCESS)
         result = VK_INCOMPLETE;
   }
   ++p;

   /* Disassembler output; only present when the shader kept its disasm. */
   if (p < end && shader->disasm_string) {
      p->isText = true;
      desc_copy(p->name, "Assembly");
      desc_copy(p->description, "Final Assembly");
      if (radv_copy_representation(p->pData, &p->dataSize, shader->disasm_string) != VK_SUCCESS)
         result = VK_INCOMPLETE;
   }
   ++p;

   /* Two-call idiom: report the total, the written count, or VK_INCOMPLETE
    * when the caller's array was too small. */
   if (!pInternalRepresentations)
      *pInternalRepresentationCount = p - pInternalRepresentations;
   else if (p > end) {
      result = VK_INCOMPLETE;
      *pInternalRepresentationCount = end - pInternalRepresentations;
   } else {
      *pInternalRepresentationCount = p - pInternalRepresentations;
   }

   return result;
}
6166