1af69d88dSmrg/*
2af69d88dSmrg * Copyright 2012 Advanced Micro Devices, Inc.
301e04c3fSmrg * All Rights Reserved.
4af69d88dSmrg *
5af69d88dSmrg * Permission is hereby granted, free of charge, to any person obtaining a
6af69d88dSmrg * copy of this software and associated documentation files (the "Software"),
7af69d88dSmrg * to deal in the Software without restriction, including without limitation
8af69d88dSmrg * on the rights to use, copy, modify, merge, publish, distribute, sub
9af69d88dSmrg * license, and/or sell copies of the Software, and to permit persons to whom
10af69d88dSmrg * the Software is furnished to do so, subject to the following conditions:
11af69d88dSmrg *
12af69d88dSmrg * The above copyright notice and this permission notice (including the next
13af69d88dSmrg * paragraph) shall be included in all copies or substantial portions of the
14af69d88dSmrg * Software.
15af69d88dSmrg *
16af69d88dSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17af69d88dSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18af69d88dSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19af69d88dSmrg * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20af69d88dSmrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21af69d88dSmrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22af69d88dSmrg * USE OR OTHER DEALINGS IN THE SOFTWARE.
23af69d88dSmrg */
24af69d88dSmrg
257ec681f3Smrg#include "ac_exp_param.h"
267ec681f3Smrg#include "ac_rtld.h"
277ec681f3Smrg#include "compiler/nir/nir.h"
287ec681f3Smrg#include "compiler/nir/nir_serialize.h"
297ec681f3Smrg#include "si_pipe.h"
307ec681f3Smrg#include "si_shader_internal.h"
317ec681f3Smrg#include "sid.h"
327ec681f3Smrg#include "tgsi/tgsi_from_mesa.h"
337ec681f3Smrg#include "tgsi/tgsi_strings.h"
347ec681f3Smrg#include "util/u_memory.h"
3501e04c3fSmrg
367ec681f3Smrgstatic const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0";
3701e04c3fSmrg
387ec681f3Smrgstatic const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1";
3901e04c3fSmrg
407ec681f3Smrgstatic void si_dump_shader_key(const struct si_shader *shader, FILE *f);
4101e04c3fSmrg
427ec681f3Smrg/** Whether the shader runs as a combination of multiple API shaders */
437ec681f3Smrgbool si_is_multi_part_shader(struct si_shader *shader)
4401e04c3fSmrg{
457ec681f3Smrg   if (shader->selector->screen->info.chip_class <= GFX8)
467ec681f3Smrg      return false;
4701e04c3fSmrg
487ec681f3Smrg   return shader->key.as_ls || shader->key.as_es ||
497ec681f3Smrg          shader->selector->info.stage == MESA_SHADER_TESS_CTRL ||
507ec681f3Smrg          shader->selector->info.stage == MESA_SHADER_GEOMETRY;
5101e04c3fSmrg}
5201e04c3fSmrg
537ec681f3Smrg/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */
547ec681f3Smrgbool si_is_merged_shader(struct si_shader *shader)
5501e04c3fSmrg{
567ec681f3Smrg   return shader->key.as_ngg || si_is_multi_part_shader(shader);
5701e04c3fSmrg}
5801e04c3fSmrg
5901e04c3fSmrg/**
607ec681f3Smrg * Returns a unique index for a per-patch semantic name and index. The index
617ec681f3Smrg * must be less than 32, so that a 32-bit bitmask of used inputs or outputs
627ec681f3Smrg * can be calculated.
6301e04c3fSmrg */
647ec681f3Smrgunsigned si_shader_io_get_unique_index_patch(unsigned semantic)
6501e04c3fSmrg{
667ec681f3Smrg   switch (semantic) {
677ec681f3Smrg   case VARYING_SLOT_TESS_LEVEL_OUTER:
687ec681f3Smrg      return 0;
697ec681f3Smrg   case VARYING_SLOT_TESS_LEVEL_INNER:
707ec681f3Smrg      return 1;
717ec681f3Smrg   default:
727ec681f3Smrg      if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30)
737ec681f3Smrg         return 2 + (semantic - VARYING_SLOT_PATCH0);
7401e04c3fSmrg
757ec681f3Smrg      assert(!"invalid semantic");
767ec681f3Smrg      return 0;
777ec681f3Smrg   }
7801e04c3fSmrg}
7901e04c3fSmrg
807ec681f3Smrg/**
817ec681f3Smrg * Returns a unique index for a semantic name and index. The index must be
827ec681f3Smrg * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
837ec681f3Smrg * calculated.
847ec681f3Smrg */
857ec681f3Smrgunsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying)
867ec681f3Smrg{
877ec681f3Smrg   switch (semantic) {
887ec681f3Smrg   case VARYING_SLOT_POS:
897ec681f3Smrg      return 0;
907ec681f3Smrg   default:
917ec681f3Smrg      /* Since some shader stages use the highest used IO index
927ec681f3Smrg       * to determine the size to allocate for inputs/outputs
937ec681f3Smrg       * (in LDS, tess and GS rings). GENERIC should be placed right
947ec681f3Smrg       * after POSITION to make that size as small as possible.
957ec681f3Smrg       */
967ec681f3Smrg      if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31)
977ec681f3Smrg         return 1 + (semantic - VARYING_SLOT_VAR0); /* 1..32 */
987ec681f3Smrg
997ec681f3Smrg      /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as
1007ec681f3Smrg       * legacy desktop GL varyings because they are mutually exclusive.
1017ec681f3Smrg       */
1027ec681f3Smrg      if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT)
1037ec681f3Smrg         return 33 + (semantic - VARYING_SLOT_VAR0_16BIT); /* 33..48 */
1047ec681f3Smrg
1057ec681f3Smrg      assert(!"invalid generic index");
1067ec681f3Smrg      return 0;
1077ec681f3Smrg
1087ec681f3Smrg   /* Legacy desktop GL varyings. */
1097ec681f3Smrg   case VARYING_SLOT_FOGC:
1107ec681f3Smrg      return 33;
1117ec681f3Smrg   case VARYING_SLOT_COL0:
1127ec681f3Smrg      return 34;
1137ec681f3Smrg   case VARYING_SLOT_COL1:
1147ec681f3Smrg      return 35;
1157ec681f3Smrg   case VARYING_SLOT_BFC0:
1167ec681f3Smrg      /* If it's a varying, COLOR and BCOLOR alias. */
1177ec681f3Smrg      if (is_varying)
1187ec681f3Smrg         return 34;
1197ec681f3Smrg      else
1207ec681f3Smrg         return 36;
1217ec681f3Smrg   case VARYING_SLOT_BFC1:
1227ec681f3Smrg      if (is_varying)
1237ec681f3Smrg         return 35;
1247ec681f3Smrg      else
1257ec681f3Smrg         return 37;
1267ec681f3Smrg   case VARYING_SLOT_TEX0:
1277ec681f3Smrg   case VARYING_SLOT_TEX1:
1287ec681f3Smrg   case VARYING_SLOT_TEX2:
1297ec681f3Smrg   case VARYING_SLOT_TEX3:
1307ec681f3Smrg   case VARYING_SLOT_TEX4:
1317ec681f3Smrg   case VARYING_SLOT_TEX5:
1327ec681f3Smrg   case VARYING_SLOT_TEX6:
1337ec681f3Smrg   case VARYING_SLOT_TEX7:
1347ec681f3Smrg      return 38 + (semantic - VARYING_SLOT_TEX0);
1357ec681f3Smrg   case VARYING_SLOT_CLIP_VERTEX:
1367ec681f3Smrg      return 46;
1377ec681f3Smrg
1387ec681f3Smrg   /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */
1397ec681f3Smrg   case VARYING_SLOT_CLIP_DIST0:
1407ec681f3Smrg      return 49;
1417ec681f3Smrg   case VARYING_SLOT_CLIP_DIST1:
1427ec681f3Smrg      return 50;
1437ec681f3Smrg   case VARYING_SLOT_PSIZ:
1447ec681f3Smrg      return 51;
1457ec681f3Smrg
1467ec681f3Smrg   /* These can't be written by LS, HS, and ES. */
1477ec681f3Smrg   case VARYING_SLOT_LAYER:
1487ec681f3Smrg      return 52;
1497ec681f3Smrg   case VARYING_SLOT_VIEWPORT:
1507ec681f3Smrg      return 53;
1517ec681f3Smrg   case VARYING_SLOT_PRIMITIVE_ID:
1527ec681f3Smrg      return 54;
1537ec681f3Smrg   }
15401e04c3fSmrg}
15501e04c3fSmrg
1567ec681f3Smrgstatic void si_dump_streamout(struct pipe_stream_output_info *so)
15701e04c3fSmrg{
1587ec681f3Smrg   unsigned i;
15901e04c3fSmrg
1607ec681f3Smrg   if (so->num_outputs)
1617ec681f3Smrg      fprintf(stderr, "STREAMOUT\n");
16201e04c3fSmrg
1637ec681f3Smrg   for (i = 0; i < so->num_outputs; i++) {
1647ec681f3Smrg      unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
1657ec681f3Smrg      fprintf(stderr, "  %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer,
1667ec681f3Smrg              so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
1677ec681f3Smrg              so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "",
1687ec681f3Smrg              mask & 4 ? "z" : "", mask & 8 ? "w" : "");
1697ec681f3Smrg   }
17001e04c3fSmrg}
17101e04c3fSmrg
1727ec681f3Smrgstatic void declare_streamout_params(struct si_shader_context *ctx,
1737ec681f3Smrg                                     struct pipe_stream_output_info *so)
17401e04c3fSmrg{
1757ec681f3Smrg   if (ctx->screen->use_ngg_streamout) {
1767ec681f3Smrg      if (ctx->stage == MESA_SHADER_TESS_EVAL)
1777ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1787ec681f3Smrg      return;
1797ec681f3Smrg   }
18001e04c3fSmrg
1817ec681f3Smrg   /* Streamout SGPRs. */
1827ec681f3Smrg   if (so->num_outputs) {
1837ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_config);
1847ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_write_index);
1857ec681f3Smrg   } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
1867ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
1877ec681f3Smrg   }
18801e04c3fSmrg
1897ec681f3Smrg   /* A streamout buffer offset is loaded if the stride is non-zero. */
1907ec681f3Smrg   for (int i = 0; i < 4; i++) {
1917ec681f3Smrg      if (!so->stride[i])
1927ec681f3Smrg         continue;
19301e04c3fSmrg
1947ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_offset[i]);
1957ec681f3Smrg   }
19601e04c3fSmrg}
19701e04c3fSmrg
1987ec681f3Smrgunsigned si_get_max_workgroup_size(const struct si_shader *shader)
19901e04c3fSmrg{
2007ec681f3Smrg   switch (shader->selector->info.stage) {
2017ec681f3Smrg   case MESA_SHADER_VERTEX:
2027ec681f3Smrg   case MESA_SHADER_TESS_EVAL:
2037ec681f3Smrg      return shader->key.as_ngg ? 128 : 0;
20401e04c3fSmrg
2057ec681f3Smrg   case MESA_SHADER_TESS_CTRL:
2067ec681f3Smrg      /* Return this so that LLVM doesn't remove s_barrier
2077ec681f3Smrg       * instructions on chips where we use s_barrier. */
2087ec681f3Smrg      return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0;
20901e04c3fSmrg
2107ec681f3Smrg   case MESA_SHADER_GEOMETRY:
2117ec681f3Smrg      return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0;
21201e04c3fSmrg
2137ec681f3Smrg   case MESA_SHADER_COMPUTE:
2147ec681f3Smrg      break; /* see below */
21501e04c3fSmrg
2167ec681f3Smrg   default:
2177ec681f3Smrg      return 0;
2187ec681f3Smrg   }
21901e04c3fSmrg
2207ec681f3Smrg   /* Compile a variable block size using the maximum variable size. */
2217ec681f3Smrg   if (shader->selector->info.base.workgroup_size_variable)
2227ec681f3Smrg      return SI_MAX_VARIABLE_THREADS_PER_BLOCK;
22301e04c3fSmrg
2247ec681f3Smrg   uint16_t *local_size = shader->selector->info.base.workgroup_size;
2257ec681f3Smrg   unsigned max_work_group_size = (uint32_t)local_size[0] *
2267ec681f3Smrg                                  (uint32_t)local_size[1] *
2277ec681f3Smrg                                  (uint32_t)local_size[2];
2287ec681f3Smrg   assert(max_work_group_size);
2297ec681f3Smrg   return max_work_group_size;
23001e04c3fSmrg}
23101e04c3fSmrg
2327ec681f3Smrgstatic void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params)
23301e04c3fSmrg{
2347ec681f3Smrg   enum ac_arg_type const_shader_buf_type;
23501e04c3fSmrg
2367ec681f3Smrg   if (ctx->shader->selector->info.base.num_ubos == 1 &&
2377ec681f3Smrg       ctx->shader->selector->info.base.num_ssbos == 0)
2387ec681f3Smrg      const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR;
2397ec681f3Smrg   else
2407ec681f3Smrg      const_shader_buf_type = AC_ARG_CONST_DESC_PTR;
24101e04c3fSmrg
2427ec681f3Smrg   ac_add_arg(
2437ec681f3Smrg      &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type,
2447ec681f3Smrg      assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers);
24501e04c3fSmrg}
24601e04c3fSmrg
2477ec681f3Smrgstatic void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params)
24801e04c3fSmrg{
2497ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
2507ec681f3Smrg              assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images);
25101e04c3fSmrg}
25201e04c3fSmrg
2537ec681f3Smrgstatic void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params)
25401e04c3fSmrg{
2557ec681f3Smrg   declare_const_and_shader_buffers(ctx, assign_params);
2567ec681f3Smrg   declare_samplers_and_images(ctx, assign_params);
25701e04c3fSmrg}
25801e04c3fSmrg
2597ec681f3Smrgstatic void declare_global_desc_pointers(struct si_shader_context *ctx)
26001e04c3fSmrg{
2617ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->internal_bindings);
2627ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR,
2637ec681f3Smrg              &ctx->bindless_samplers_and_images);
26401e04c3fSmrg}
26501e04c3fSmrg
2667ec681f3Smrgstatic void declare_vs_specific_input_sgprs(struct si_shader_context *ctx)
26701e04c3fSmrg{
2687ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
2697ec681f3Smrg   if (!ctx->shader->is_gs_copy_shader) {
2707ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex);
2717ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id);
2727ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance);
2737ec681f3Smrg   }
27401e04c3fSmrg}
27501e04c3fSmrg
2767ec681f3Smrgstatic void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx)
27701e04c3fSmrg{
2787ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->args.vertex_buffers);
27901e04c3fSmrg
2807ec681f3Smrg   unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs;
2817ec681f3Smrg   if (num_vbos_in_user_sgprs) {
2827ec681f3Smrg      unsigned user_sgprs = ctx->args.num_sgprs_used;
28301e04c3fSmrg
2847ec681f3Smrg      if (si_is_merged_shader(ctx->shader))
2857ec681f3Smrg         user_sgprs -= 8;
2867ec681f3Smrg      assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
28701e04c3fSmrg
2887ec681f3Smrg      /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */
2897ec681f3Smrg      for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++)
2907ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
29101e04c3fSmrg
2927ec681f3Smrg      assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors));
2937ec681f3Smrg      for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++)
2947ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]);
2957ec681f3Smrg   }
29601e04c3fSmrg}
29701e04c3fSmrg
2987ec681f3Smrgstatic void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs)
29901e04c3fSmrg{
3007ec681f3Smrg   struct si_shader *shader = ctx->shader;
30101e04c3fSmrg
3027ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id);
3037ec681f3Smrg   if (shader->key.as_ls) {
3047ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id);
3057ec681f3Smrg      if (ctx->screen->info.chip_class >= GFX10) {
3067ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
3077ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
3087ec681f3Smrg      } else {
3097ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
3107ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
3117ec681f3Smrg      }
3127ec681f3Smrg   } else if (ctx->screen->info.chip_class >= GFX10) {
3137ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */
3147ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT,
3157ec681f3Smrg                 &ctx->args.vs_prim_id); /* user vgpr or PrimID (legacy) */
3167ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
3177ec681f3Smrg   } else {
3187ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id);
3197ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_prim_id);
3207ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */
3217ec681f3Smrg   }
32201e04c3fSmrg
3237ec681f3Smrg   if (!shader->is_gs_copy_shader) {
3247ec681f3Smrg      /* Vertex load indices. */
3257ec681f3Smrg      if (shader->selector->info.num_inputs) {
3267ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
3277ec681f3Smrg         for (unsigned i = 1; i < shader->selector->info.num_inputs; i++)
3287ec681f3Smrg            ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL);
3297ec681f3Smrg      }
3307ec681f3Smrg      *num_prolog_vgprs += shader->selector->info.num_inputs;
3317ec681f3Smrg   }
33201e04c3fSmrg}
33301e04c3fSmrg
3347ec681f3Smrgstatic void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property)
33501e04c3fSmrg{
3367ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */
3377ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);                 /* i16 x1, y1 */
3387ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL);               /* depth */
33901e04c3fSmrg
3407ec681f3Smrg   if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) {
3417ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */
3427ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */
3437ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */
3447ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */
3457ec681f3Smrg   } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) {
3467ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */
3477ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */
3487ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */
3497ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */
3507ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */
3517ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */
3527ec681f3Smrg   }
3539f464c52Smaya}
3549f464c52Smaya
3557ec681f3Smrgstatic void declare_tes_input_vgprs(struct si_shader_context *ctx)
35601e04c3fSmrg{
3577ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_u);
3587ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_v);
3597ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_rel_patch_id);
3607ec681f3Smrg   ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
3617ec681f3Smrg}
3627ec681f3Smrg
3637ec681f3Smrgenum
3647ec681f3Smrg{
3657ec681f3Smrg   /* Convenient merged shader definitions. */
3667ec681f3Smrg   SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES,
3677ec681f3Smrg   SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY,
36801e04c3fSmrg};
36901e04c3fSmrg
3707ec681f3Smrgvoid si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
3717ec681f3Smrg                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx)
3727ec681f3Smrg{
3737ec681f3Smrg   assert(args->arg_count == idx);
3747ec681f3Smrg   ac_add_arg(args, file, registers, type, arg);
3757ec681f3Smrg}
3767ec681f3Smrg
3777ec681f3Smrgvoid si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader)
3787ec681f3Smrg{
3797ec681f3Smrg   struct si_shader *shader = ctx->shader;
3807ec681f3Smrg   unsigned i, num_returns, num_return_sgprs;
3817ec681f3Smrg   unsigned num_prolog_vgprs = 0;
3827ec681f3Smrg   unsigned stage = ctx->stage;
3837ec681f3Smrg
3847ec681f3Smrg   memset(&ctx->args, 0, sizeof(ctx->args));
3857ec681f3Smrg
3867ec681f3Smrg   /* Set MERGED shaders. */
3877ec681f3Smrg   if (ctx->screen->info.chip_class >= GFX9) {
3887ec681f3Smrg      if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL)
3897ec681f3Smrg         stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */
3907ec681f3Smrg      else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY)
3917ec681f3Smrg         stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY;
3927ec681f3Smrg   }
3937ec681f3Smrg
3947ec681f3Smrg   switch (stage) {
3957ec681f3Smrg   case MESA_SHADER_VERTEX:
3967ec681f3Smrg      declare_global_desc_pointers(ctx);
3977ec681f3Smrg
3987ec681f3Smrg      if (shader->selector->info.base.vs.blit_sgprs_amd) {
3997ec681f3Smrg         declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
4007ec681f3Smrg
4017ec681f3Smrg         /* VGPRs */
4027ec681f3Smrg         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
4037ec681f3Smrg         break;
4047ec681f3Smrg      }
4057ec681f3Smrg
4067ec681f3Smrg      declare_per_stage_desc_pointers(ctx, true);
4077ec681f3Smrg      declare_vs_specific_input_sgprs(ctx);
4087ec681f3Smrg      if (!shader->is_gs_copy_shader)
4097ec681f3Smrg         declare_vb_descriptor_input_sgprs(ctx);
4107ec681f3Smrg
4117ec681f3Smrg      if (shader->key.as_es) {
4127ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);
4137ec681f3Smrg      } else if (shader->key.as_ls) {
4147ec681f3Smrg         /* no extra parameters */
4157ec681f3Smrg      } else {
4167ec681f3Smrg         /* The locations of the other parameters are assigned dynamically. */
4177ec681f3Smrg         declare_streamout_params(ctx, &shader->selector->so);
4187ec681f3Smrg      }
4197ec681f3Smrg
4207ec681f3Smrg      /* VGPRs */
4217ec681f3Smrg      declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
4227ec681f3Smrg      break;
4237ec681f3Smrg
4247ec681f3Smrg   case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */
4257ec681f3Smrg      declare_global_desc_pointers(ctx);
4267ec681f3Smrg      declare_per_stage_desc_pointers(ctx, true);
4277ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
4287ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
4297ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
4307ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
4317ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
4327ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
4337ec681f3Smrg
4347ec681f3Smrg      /* VGPRs */
4357ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
4367ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
4377ec681f3Smrg
4387ec681f3Smrg      /* param_tcs_offchip_offset and param_tcs_factor_offset are
4397ec681f3Smrg       * placed after the user SGPRs.
4407ec681f3Smrg       */
4417ec681f3Smrg      for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
4427ec681f3Smrg         ac_add_return(&ctx->args, AC_ARG_SGPR);
4437ec681f3Smrg      for (i = 0; i < 11; i++)
4447ec681f3Smrg         ac_add_return(&ctx->args, AC_ARG_VGPR);
4457ec681f3Smrg      break;
4467ec681f3Smrg
4477ec681f3Smrg   case SI_SHADER_MERGED_VERTEX_TESSCTRL:
4487ec681f3Smrg      /* Merged stages have 8 system SGPRs at the beginning. */
4497ec681f3Smrg      /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */
4507ec681f3Smrg      declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL);
4517ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
4527ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);
4537ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset);
4547ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
4557ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
4567ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */
4577ec681f3Smrg
4587ec681f3Smrg      declare_global_desc_pointers(ctx);
4597ec681f3Smrg      declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX);
4607ec681f3Smrg      declare_vs_specific_input_sgprs(ctx);
4617ec681f3Smrg
4627ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
4637ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets);
4647ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout);
4657ec681f3Smrg      if (ctx->stage == MESA_SHADER_VERTEX)
4667ec681f3Smrg         declare_vb_descriptor_input_sgprs(ctx);
4677ec681f3Smrg
4687ec681f3Smrg      /* VGPRs (first TCS, then VS) */
4697ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id);
4707ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
4717ec681f3Smrg
4727ec681f3Smrg      if (ctx->stage == MESA_SHADER_VERTEX) {
4737ec681f3Smrg         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
4747ec681f3Smrg
4757ec681f3Smrg         /* LS return values are inputs to the TCS main shader part. */
4767ec681f3Smrg         for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
4777ec681f3Smrg            ac_add_return(&ctx->args, AC_ARG_SGPR);
4787ec681f3Smrg         for (i = 0; i < 2; i++)
4797ec681f3Smrg            ac_add_return(&ctx->args, AC_ARG_VGPR);
4807ec681f3Smrg
4817ec681f3Smrg         /* VS outputs passed via VGPRs to TCS. */
4827ec681f3Smrg         if (shader->key.opt.same_patch_vertices) {
4837ec681f3Smrg            unsigned num_outputs = util_last_bit64(shader->selector->outputs_written);
4847ec681f3Smrg            for (i = 0; i < num_outputs * 4; i++)
4857ec681f3Smrg               ac_add_return(&ctx->args, AC_ARG_VGPR);
4867ec681f3Smrg         }
4877ec681f3Smrg      } else {
4887ec681f3Smrg         /* TCS inputs are passed via VGPRs from VS. */
4897ec681f3Smrg         if (shader->key.opt.same_patch_vertices) {
4907ec681f3Smrg            unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written);
4917ec681f3Smrg            for (i = 0; i < num_inputs * 4; i++)
4927ec681f3Smrg               ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
4937ec681f3Smrg         }
4947ec681f3Smrg
4957ec681f3Smrg         /* TCS return values are inputs to the TCS epilog.
4967ec681f3Smrg          *
4977ec681f3Smrg          * param_tcs_offchip_offset, param_tcs_factor_offset,
4987ec681f3Smrg          * param_tcs_offchip_layout, and internal_bindings
4997ec681f3Smrg          * should be passed to the epilog.
5007ec681f3Smrg          */
5017ec681f3Smrg         for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++)
5027ec681f3Smrg            ac_add_return(&ctx->args, AC_ARG_SGPR);
5037ec681f3Smrg         for (i = 0; i < 11; i++)
5047ec681f3Smrg            ac_add_return(&ctx->args, AC_ARG_VGPR);
5057ec681f3Smrg      }
5067ec681f3Smrg      break;
5077ec681f3Smrg
5087ec681f3Smrg   case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY:
5097ec681f3Smrg      /* Merged stages have 8 system SGPRs at the beginning. */
5107ec681f3Smrg      /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */
5117ec681f3Smrg      declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY);
5127ec681f3Smrg
5137ec681f3Smrg      if (ctx->shader->key.as_ngg)
5147ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_tg_info);
5157ec681f3Smrg      else
5167ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);
5177ec681f3Smrg
5187ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info);
5197ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
5207ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset);
5217ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR,
5227ec681f3Smrg                 &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */
5237ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT,
5247ec681f3Smrg                 NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */
5257ec681f3Smrg
5267ec681f3Smrg      declare_global_desc_pointers(ctx);
5277ec681f3Smrg      if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) {
5287ec681f3Smrg         declare_per_stage_desc_pointers(
5297ec681f3Smrg            ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL));
5307ec681f3Smrg      }
5317ec681f3Smrg
5327ec681f3Smrg      if (ctx->stage == MESA_SHADER_VERTEX) {
5337ec681f3Smrg         if (shader->selector->info.base.vs.blit_sgprs_amd)
5347ec681f3Smrg            declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
5357ec681f3Smrg         else
5367ec681f3Smrg            declare_vs_specific_input_sgprs(ctx);
5377ec681f3Smrg      } else {
5387ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
5397ec681f3Smrg
5407ec681f3Smrg         if (ctx->stage == MESA_SHADER_TESS_EVAL) {
5417ec681f3Smrg            ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
5427ec681f3Smrg            ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
5437ec681f3Smrg         }
5447ec681f3Smrg      }
5457ec681f3Smrg
5467ec681f3Smrg      if (ctx->stage == MESA_SHADER_VERTEX)
5477ec681f3Smrg         declare_vb_descriptor_input_sgprs(ctx);
5487ec681f3Smrg
5497ec681f3Smrg      /* VGPRs (first GS, then VS/TES) */
5507ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);
5517ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);
5527ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
5537ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
5547ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);
5557ec681f3Smrg
5567ec681f3Smrg      if (ctx->stage == MESA_SHADER_VERTEX) {
5577ec681f3Smrg         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
5587ec681f3Smrg      } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
5597ec681f3Smrg         declare_tes_input_vgprs(ctx);
5607ec681f3Smrg      }
5617ec681f3Smrg
5627ec681f3Smrg      if ((ctx->shader->key.as_es || ngg_cull_shader) &&
5637ec681f3Smrg          (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) {
5647ec681f3Smrg         unsigned num_user_sgprs, num_vgprs;
5657ec681f3Smrg
5667ec681f3Smrg         if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) {
5677ec681f3Smrg            /* For the NGG cull shader, add 1 SGPR to hold
5687ec681f3Smrg             * the vertex buffer pointer.
5697ec681f3Smrg             */
5707ec681f3Smrg            num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + 1;
5717ec681f3Smrg
5727ec681f3Smrg            if (shader->selector->num_vbos_in_user_sgprs) {
5737ec681f3Smrg               assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST);
5747ec681f3Smrg               num_user_sgprs =
5757ec681f3Smrg                  SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4;
5767ec681f3Smrg            }
5777ec681f3Smrg         } else if (ctx->stage == MESA_SHADER_TESS_EVAL && ngg_cull_shader) {
5787ec681f3Smrg            num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
5797ec681f3Smrg         } else {
5807ec681f3Smrg            num_user_sgprs = SI_NUM_VS_STATE_RESOURCE_SGPRS;
5817ec681f3Smrg         }
5827ec681f3Smrg
5837ec681f3Smrg         /* The NGG cull shader has to return all 9 VGPRs.
5847ec681f3Smrg          *
5857ec681f3Smrg          * The normal merged ESGS shader only has to return the 5 VGPRs
5867ec681f3Smrg          * for the GS stage.
5877ec681f3Smrg          */
5887ec681f3Smrg         num_vgprs = ngg_cull_shader ? 9 : 5;
5897ec681f3Smrg
5907ec681f3Smrg         /* ES return values are inputs to GS. */
5917ec681f3Smrg         for (i = 0; i < 8 + num_user_sgprs; i++)
5927ec681f3Smrg            ac_add_return(&ctx->args, AC_ARG_SGPR);
5937ec681f3Smrg         for (i = 0; i < num_vgprs; i++)
5947ec681f3Smrg            ac_add_return(&ctx->args, AC_ARG_VGPR);
5957ec681f3Smrg      }
5967ec681f3Smrg      break;
5977ec681f3Smrg
5987ec681f3Smrg   case MESA_SHADER_TESS_EVAL:
5997ec681f3Smrg      declare_global_desc_pointers(ctx);
6007ec681f3Smrg      declare_per_stage_desc_pointers(ctx, true);
6017ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits);
6027ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout);
6037ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr);
6047ec681f3Smrg
6057ec681f3Smrg      if (shader->key.as_es) {
6067ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
6077ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
6087ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset);
6097ec681f3Smrg      } else {
6107ec681f3Smrg         declare_streamout_params(ctx, &shader->selector->so);
6117ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset);
6127ec681f3Smrg      }
6137ec681f3Smrg
6147ec681f3Smrg      /* VGPRs */
6157ec681f3Smrg      declare_tes_input_vgprs(ctx);
6167ec681f3Smrg      break;
6177ec681f3Smrg
6187ec681f3Smrg   case MESA_SHADER_GEOMETRY:
6197ec681f3Smrg      declare_global_desc_pointers(ctx);
6207ec681f3Smrg      declare_per_stage_desc_pointers(ctx, true);
6217ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset);
6227ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_wave_id);
6237ec681f3Smrg
6247ec681f3Smrg      /* VGPRs */
6257ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]);
6267ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]);
6277ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id);
6287ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]);
6297ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[3]);
6307ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[4]);
6317ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[5]);
6327ec681f3Smrg      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id);
6337ec681f3Smrg      break;
6347ec681f3Smrg
6357ec681f3Smrg   case MESA_SHADER_FRAGMENT:
6367ec681f3Smrg      declare_global_desc_pointers(ctx);
6377ec681f3Smrg      declare_per_stage_desc_pointers(ctx, true);
6387ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF);
6397ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask,
6407ec681f3Smrg                         SI_PARAM_PRIM_MASK);
6417ec681f3Smrg
6427ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample,
6437ec681f3Smrg                         SI_PARAM_PERSP_SAMPLE);
6447ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center,
6457ec681f3Smrg                         SI_PARAM_PERSP_CENTER);
6467ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid,
6477ec681f3Smrg                         SI_PARAM_PERSP_CENTROID);
6487ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL);
6497ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample,
6507ec681f3Smrg                         SI_PARAM_LINEAR_SAMPLE);
6517ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center,
6527ec681f3Smrg                         SI_PARAM_LINEAR_CENTER);
6537ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid,
6547ec681f3Smrg                         SI_PARAM_LINEAR_CENTROID);
6557ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX);
6567ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0],
6577ec681f3Smrg                         SI_PARAM_POS_X_FLOAT);
6587ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1],
6597ec681f3Smrg                         SI_PARAM_POS_Y_FLOAT);
6607ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2],
6617ec681f3Smrg                         SI_PARAM_POS_Z_FLOAT);
6627ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3],
6637ec681f3Smrg                         SI_PARAM_POS_W_FLOAT);
6647ec681f3Smrg      shader->info.face_vgpr_index = ctx->args.num_vgprs_used;
6657ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face,
6667ec681f3Smrg                         SI_PARAM_FRONT_FACE);
6677ec681f3Smrg      shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used;
6687ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary,
6697ec681f3Smrg                         SI_PARAM_ANCILLARY);
6707ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage,
6717ec681f3Smrg                         SI_PARAM_SAMPLE_COVERAGE);
6727ec681f3Smrg      si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt,
6737ec681f3Smrg                         SI_PARAM_POS_FIXED_PT);
6747ec681f3Smrg
6757ec681f3Smrg      /* Color inputs from the prolog. */
6767ec681f3Smrg      if (shader->selector->info.colors_read) {
6777ec681f3Smrg         unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read);
6787ec681f3Smrg
6797ec681f3Smrg         for (i = 0; i < num_color_elements; i++)
6807ec681f3Smrg            ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL);
6817ec681f3Smrg
6827ec681f3Smrg         num_prolog_vgprs += num_color_elements;
6837ec681f3Smrg      }
6847ec681f3Smrg
6857ec681f3Smrg      /* Outputs for the epilog. */
6867ec681f3Smrg      num_return_sgprs = SI_SGPR_ALPHA_REF + 1;
6877ec681f3Smrg      num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 +
6887ec681f3Smrg                    shader->selector->info.writes_z + shader->selector->info.writes_stencil +
6897ec681f3Smrg                    shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */;
6907ec681f3Smrg
6917ec681f3Smrg      num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);
6927ec681f3Smrg
6937ec681f3Smrg      for (i = 0; i < num_return_sgprs; i++)
6947ec681f3Smrg         ac_add_return(&ctx->args, AC_ARG_SGPR);
6957ec681f3Smrg      for (; i < num_returns; i++)
6967ec681f3Smrg         ac_add_return(&ctx->args, AC_ARG_VGPR);
6977ec681f3Smrg      break;
6987ec681f3Smrg
6997ec681f3Smrg   case MESA_SHADER_COMPUTE:
7007ec681f3Smrg      declare_global_desc_pointers(ctx);
7017ec681f3Smrg      declare_per_stage_desc_pointers(ctx, true);
7027ec681f3Smrg      if (shader->selector->info.uses_grid_size)
7037ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups);
7047ec681f3Smrg      if (shader->selector->info.uses_variable_block_size)
7057ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size);
7067ec681f3Smrg
7077ec681f3Smrg      unsigned cs_user_data_dwords =
7087ec681f3Smrg         shader->selector->info.base.cs.user_data_components_amd;
7097ec681f3Smrg      if (cs_user_data_dwords) {
7107ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data);
7117ec681f3Smrg      }
7127ec681f3Smrg
7137ec681f3Smrg      /* Some descriptors can be in user SGPRs. */
7147ec681f3Smrg      /* Shader buffers in user SGPRs. */
7157ec681f3Smrg      for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) {
7167ec681f3Smrg         while (ctx->args.num_sgprs_used % 4 != 0)
7177ec681f3Smrg            ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
7187ec681f3Smrg
7197ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]);
7207ec681f3Smrg      }
7217ec681f3Smrg      /* Images in user SGPRs. */
7227ec681f3Smrg      for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) {
7237ec681f3Smrg         unsigned num_sgprs = shader->selector->info.base.image_buffers & (1 << i) ? 4 : 8;
7247ec681f3Smrg
7257ec681f3Smrg         while (ctx->args.num_sgprs_used % num_sgprs != 0)
7267ec681f3Smrg            ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL);
7277ec681f3Smrg
7287ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]);
7297ec681f3Smrg      }
7307ec681f3Smrg
7317ec681f3Smrg      /* Hardware SGPRs. */
7327ec681f3Smrg      for (i = 0; i < 3; i++) {
7337ec681f3Smrg         if (shader->selector->info.uses_block_id[i]) {
7347ec681f3Smrg            ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]);
7357ec681f3Smrg         }
7367ec681f3Smrg      }
7377ec681f3Smrg      if (shader->selector->info.uses_subgroup_info)
7387ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size);
7397ec681f3Smrg
7407ec681f3Smrg      /* Hardware VGPRs. */
7417ec681f3Smrg      if (!ctx->screen->info.has_graphics && ctx->screen->info.family >= CHIP_ALDEBARAN)
7427ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.local_invocation_ids);
7437ec681f3Smrg      else
7447ec681f3Smrg         ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids);
7457ec681f3Smrg      break;
7467ec681f3Smrg   default:
7477ec681f3Smrg      assert(0 && "unimplemented shader");
7487ec681f3Smrg      return;
7497ec681f3Smrg   }
7507ec681f3Smrg
7517ec681f3Smrg   shader->info.num_input_sgprs = ctx->args.num_sgprs_used;
7527ec681f3Smrg   shader->info.num_input_vgprs = ctx->args.num_vgprs_used;
7537ec681f3Smrg
7547ec681f3Smrg   assert(shader->info.num_input_vgprs >= num_prolog_vgprs);
7557ec681f3Smrg   shader->info.num_input_vgprs -= num_prolog_vgprs;
75601e04c3fSmrg}
75701e04c3fSmrg
75801e04c3fSmrg/* For the UMR disassembler. */
7597ec681f3Smrg#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
7607ec681f3Smrg#define DEBUGGER_NUM_MARKERS        5
7617ec681f3Smrg
7627ec681f3Smrgstatic bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader,
7637ec681f3Smrg                                  struct ac_rtld_binary *rtld)
7647ec681f3Smrg{
7657ec681f3Smrg   const struct si_shader_selector *sel = shader->selector;
7667ec681f3Smrg   const char *part_elfs[5];
7677ec681f3Smrg   size_t part_sizes[5];
7687ec681f3Smrg   unsigned num_parts = 0;
7697ec681f3Smrg
7707ec681f3Smrg#define add_part(shader_or_part)                                                                   \
7717ec681f3Smrg   if (shader_or_part) {                                                                           \
7727ec681f3Smrg      part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer;                                  \
7737ec681f3Smrg      part_sizes[num_parts] = (shader_or_part)->binary.elf_size;                                   \
7747ec681f3Smrg      num_parts++;                                                                                 \
7757ec681f3Smrg   }
7767ec681f3Smrg
7777ec681f3Smrg   add_part(shader->prolog);
7787ec681f3Smrg   add_part(shader->previous_stage);
7797ec681f3Smrg   add_part(shader->prolog2);
7807ec681f3Smrg   add_part(shader);
7817ec681f3Smrg   add_part(shader->epilog);
7827ec681f3Smrg
7837ec681f3Smrg#undef add_part
7847ec681f3Smrg
7857ec681f3Smrg   struct ac_rtld_symbol lds_symbols[2];
7867ec681f3Smrg   unsigned num_lds_symbols = 0;
7877ec681f3Smrg
7887ec681f3Smrg   if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader &&
7897ec681f3Smrg       (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) {
7907ec681f3Smrg      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
7917ec681f3Smrg      sym->name = "esgs_ring";
7927ec681f3Smrg      sym->size = shader->gs_info.esgs_ring_size * 4;
7937ec681f3Smrg      sym->align = 64 * 1024;
7947ec681f3Smrg   }
7957ec681f3Smrg
7967ec681f3Smrg   if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) {
7977ec681f3Smrg      struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++];
7987ec681f3Smrg      sym->name = "ngg_emit";
7997ec681f3Smrg      sym->size = shader->ngg.ngg_emit_size * 4;
8007ec681f3Smrg      sym->align = 4;
8017ec681f3Smrg   }
8027ec681f3Smrg
8037ec681f3Smrg   bool ok = ac_rtld_open(
8047ec681f3Smrg      rtld, (struct ac_rtld_open_info){.info = &screen->info,
8057ec681f3Smrg                                       .options =
8067ec681f3Smrg                                          {
8077ec681f3Smrg                                             .halt_at_entry = screen->options.halt_shaders,
8087ec681f3Smrg                                          },
8097ec681f3Smrg                                       .shader_type = sel->info.stage,
8107ec681f3Smrg                                       .wave_size = si_get_shader_wave_size(shader),
8117ec681f3Smrg                                       .num_parts = num_parts,
8127ec681f3Smrg                                       .elf_ptrs = part_elfs,
8137ec681f3Smrg                                       .elf_sizes = part_sizes,
8147ec681f3Smrg                                       .num_shared_lds_symbols = num_lds_symbols,
8157ec681f3Smrg                                       .shared_lds_symbols = lds_symbols});
8167ec681f3Smrg
8177ec681f3Smrg   if (rtld->lds_size > 0) {
8187ec681f3Smrg      unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256;
8197ec681f3Smrg      shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity;
8207ec681f3Smrg   }
8217ec681f3Smrg
8227ec681f3Smrg   return ok;
8237ec681f3Smrg}
8247ec681f3Smrg
8257ec681f3Smrgstatic unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader)
8267ec681f3Smrg{
8277ec681f3Smrg   struct ac_rtld_binary rtld;
8287ec681f3Smrg   si_shader_binary_open(screen, shader, &rtld);
8297ec681f3Smrg   uint64_t size = rtld.exec_size;
8307ec681f3Smrg   ac_rtld_close(&rtld);
8317ec681f3Smrg   return size;
8327ec681f3Smrg}
8337ec681f3Smrg
8347ec681f3Smrgstatic bool si_get_external_symbol(void *data, const char *name, uint64_t *value)
8357ec681f3Smrg{
8367ec681f3Smrg   uint64_t *scratch_va = data;
8377ec681f3Smrg
8387ec681f3Smrg   if (!strcmp(scratch_rsrc_dword0_symbol, name)) {
8397ec681f3Smrg      *value = (uint32_t)*scratch_va;
8407ec681f3Smrg      return true;
8417ec681f3Smrg   }
8427ec681f3Smrg   if (!strcmp(scratch_rsrc_dword1_symbol, name)) {
8437ec681f3Smrg      /* Enable scratch coalescing. */
8447ec681f3Smrg      *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1);
8457ec681f3Smrg      return true;
8467ec681f3Smrg   }
8477ec681f3Smrg
8487ec681f3Smrg   return false;
8497ec681f3Smrg}
8507ec681f3Smrg
8517ec681f3Smrgbool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader,
8527ec681f3Smrg                             uint64_t scratch_va)
8537ec681f3Smrg{
8547ec681f3Smrg   struct ac_rtld_binary binary;
8557ec681f3Smrg   if (!si_shader_binary_open(sscreen, shader, &binary))
8567ec681f3Smrg      return false;
8577ec681f3Smrg
8587ec681f3Smrg   si_resource_reference(&shader->bo, NULL);
8597ec681f3Smrg   shader->bo = si_aligned_buffer_create(
8607ec681f3Smrg      &sscreen->b,
8617ec681f3Smrg      (sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) |
8627ec681f3Smrg      SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT,
8637ec681f3Smrg      PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256);
8647ec681f3Smrg   if (!shader->bo)
8657ec681f3Smrg      return false;
8667ec681f3Smrg
8677ec681f3Smrg   /* Upload. */
8687ec681f3Smrg   struct ac_rtld_upload_info u = {};
8697ec681f3Smrg   u.binary = &binary;
8707ec681f3Smrg   u.get_external_symbol = si_get_external_symbol;
8717ec681f3Smrg   u.cb_data = &scratch_va;
8727ec681f3Smrg   u.rx_va = shader->bo->gpu_address;
8737ec681f3Smrg   u.rx_ptr = sscreen->ws->buffer_map(sscreen->ws,
8747ec681f3Smrg      shader->bo->buf, NULL,
8757ec681f3Smrg      PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY);
8767ec681f3Smrg   if (!u.rx_ptr)
8777ec681f3Smrg      return false;
8787ec681f3Smrg
8797ec681f3Smrg   int size = ac_rtld_upload(&u);
8807ec681f3Smrg
8817ec681f3Smrg   if (sscreen->debug_flags & DBG(SQTT)) {
8827ec681f3Smrg      /* Remember the uploaded code */
8837ec681f3Smrg      shader->binary.uploaded_code_size = size;
8847ec681f3Smrg      shader->binary.uploaded_code = malloc(size);
8857ec681f3Smrg      memcpy(shader->binary.uploaded_code, u.rx_ptr, size);
8867ec681f3Smrg   }
8877ec681f3Smrg
8887ec681f3Smrg   sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf);
8897ec681f3Smrg   ac_rtld_close(&binary);
8907ec681f3Smrg
8917ec681f3Smrg   return size >= 0;
8927ec681f3Smrg}
8937ec681f3Smrg
8947ec681f3Smrgstatic void si_shader_dump_disassembly(struct si_screen *screen,
8957ec681f3Smrg                                       const struct si_shader_binary *binary,
8967ec681f3Smrg                                       gl_shader_stage stage, unsigned wave_size,
8977ec681f3Smrg                                       struct pipe_debug_callback *debug, const char *name,
8987ec681f3Smrg                                       FILE *file)
8997ec681f3Smrg{
9007ec681f3Smrg   struct ac_rtld_binary rtld_binary;
9017ec681f3Smrg
9027ec681f3Smrg   if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){
9037ec681f3Smrg                                      .info = &screen->info,
9047ec681f3Smrg                                      .shader_type = stage,
9057ec681f3Smrg                                      .wave_size = wave_size,
9067ec681f3Smrg                                      .num_parts = 1,
9077ec681f3Smrg                                      .elf_ptrs = &binary->elf_buffer,
9087ec681f3Smrg                                      .elf_sizes = &binary->elf_size}))
9097ec681f3Smrg      return;
9107ec681f3Smrg
9117ec681f3Smrg   const char *disasm;
9127ec681f3Smrg   size_t nbytes;
9137ec681f3Smrg
9147ec681f3Smrg   if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes))
9157ec681f3Smrg      goto out;
91601e04c3fSmrg
9177ec681f3Smrg   if (nbytes > INT_MAX)
9187ec681f3Smrg      goto out;
91901e04c3fSmrg
9207ec681f3Smrg   if (debug && debug->debug_message) {
9217ec681f3Smrg      /* Very long debug messages are cut off, so send the
9227ec681f3Smrg       * disassembly one line at a time. This causes more
9237ec681f3Smrg       * overhead, but on the plus side it simplifies
9247ec681f3Smrg       * parsing of resulting logs.
9257ec681f3Smrg       */
9267ec681f3Smrg      pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin");
9277ec681f3Smrg
9287ec681f3Smrg      uint64_t line = 0;
9297ec681f3Smrg      while (line < nbytes) {
9307ec681f3Smrg         int count = nbytes - line;
9317ec681f3Smrg         const char *nl = memchr(disasm + line, '\n', nbytes - line);
9327ec681f3Smrg         if (nl)
9337ec681f3Smrg            count = nl - (disasm + line);
9347ec681f3Smrg
9357ec681f3Smrg         if (count) {
9367ec681f3Smrg            pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line);
9377ec681f3Smrg         }
9387ec681f3Smrg
9397ec681f3Smrg         line += count + 1;
9407ec681f3Smrg      }
9417ec681f3Smrg
9427ec681f3Smrg      pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End");
9437ec681f3Smrg   }
9447ec681f3Smrg
9457ec681f3Smrg   if (file) {
9467ec681f3Smrg      fprintf(file, "Shader %s disassembly:\n", name);
9477ec681f3Smrg      fprintf(file, "%*s", (int)nbytes, disasm);
9487ec681f3Smrg   }
94901e04c3fSmrg
9507ec681f3Smrgout:
9517ec681f3Smrg   ac_rtld_close(&rtld_binary);
95201e04c3fSmrg}
95301e04c3fSmrg
95401e04c3fSmrgstatic void si_calculate_max_simd_waves(struct si_shader *shader)
95501e04c3fSmrg{
9567ec681f3Smrg   struct si_screen *sscreen = shader->selector->screen;
9577ec681f3Smrg   struct ac_shader_config *conf = &shader->config;
9587ec681f3Smrg   unsigned num_inputs = shader->selector->info.num_inputs;
9597ec681f3Smrg   unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256;
9607ec681f3Smrg   unsigned lds_per_wave = 0;
9617ec681f3Smrg   unsigned max_simd_waves;
9627ec681f3Smrg
9637ec681f3Smrg   max_simd_waves = sscreen->info.max_wave64_per_simd;
9647ec681f3Smrg
9657ec681f3Smrg   /* Compute LDS usage for PS. */
9667ec681f3Smrg   switch (shader->selector->info.stage) {
9677ec681f3Smrg   case MESA_SHADER_FRAGMENT:
9687ec681f3Smrg      /* The minimum usage per wave is (num_inputs * 48). The maximum
9697ec681f3Smrg       * usage is (num_inputs * 48 * 16).
9707ec681f3Smrg       * We can get anything in between and it varies between waves.
9717ec681f3Smrg       *
9727ec681f3Smrg       * The 48 bytes per input for a single primitive is equal to
9737ec681f3Smrg       * 4 bytes/component * 4 components/input * 3 points.
9747ec681f3Smrg       *
9757ec681f3Smrg       * Other stages don't know the size at compile time or don't
9767ec681f3Smrg       * allocate LDS per wave, but instead they do it per thread group.
9777ec681f3Smrg       */
9787ec681f3Smrg      lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment);
9797ec681f3Smrg      break;
9807ec681f3Smrg   case MESA_SHADER_COMPUTE: {
9817ec681f3Smrg         unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
9827ec681f3Smrg         lds_per_wave = (conf->lds_size * lds_increment) /
9837ec681f3Smrg                        DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size);
9847ec681f3Smrg      }
9857ec681f3Smrg      break;
9867ec681f3Smrg   default:;
9877ec681f3Smrg   }
9887ec681f3Smrg
9897ec681f3Smrg   /* Compute the per-SIMD wave counts. */
9907ec681f3Smrg   if (conf->num_sgprs) {
9917ec681f3Smrg      max_simd_waves =
9927ec681f3Smrg         MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs);
9937ec681f3Smrg   }
9947ec681f3Smrg
9957ec681f3Smrg   if (conf->num_vgprs) {
9967ec681f3Smrg      /* Always print wave limits as Wave64, so that we can compare
9977ec681f3Smrg       * Wave32 and Wave64 with shader-db fairly. */
9987ec681f3Smrg      unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd;
9997ec681f3Smrg      max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs);
10007ec681f3Smrg   }
10017ec681f3Smrg
10027ec681f3Smrg   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
10037ec681f3Smrg   if (lds_per_wave)
10047ec681f3Smrg      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);
10057ec681f3Smrg
10067ec681f3Smrg   shader->info.max_simd_waves = max_simd_waves;
10077ec681f3Smrg}
10087ec681f3Smrg
10097ec681f3Smrgvoid si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader,
10107ec681f3Smrg                                        struct pipe_debug_callback *debug)
10117ec681f3Smrg{
10127ec681f3Smrg   const struct ac_shader_config *conf = &shader->config;
10137ec681f3Smrg
10147ec681f3Smrg   if (screen->options.debug_disassembly)
10157ec681f3Smrg      si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage,
10167ec681f3Smrg                                 si_get_shader_wave_size(shader), debug, "main", NULL);
10177ec681f3Smrg
10187ec681f3Smrg   pipe_debug_message(debug, SHADER_INFO,
10197ec681f3Smrg                      "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d "
10207ec681f3Smrg                      "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d "
10217ec681f3Smrg                      "Spilled VGPRs: %d PrivMem VGPRs: %d",
10227ec681f3Smrg                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
10237ec681f3Smrg                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
10247ec681f3Smrg                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs);
10257ec681f3Smrg}
10267ec681f3Smrg
10277ec681f3Smrgstatic void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file,
10287ec681f3Smrg                                 bool check_debug_option)
10297ec681f3Smrg{
10307ec681f3Smrg   const struct ac_shader_config *conf = &shader->config;
10317ec681f3Smrg
10327ec681f3Smrg   if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) {
10337ec681f3Smrg      if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) {
10347ec681f3Smrg         fprintf(file,
10357ec681f3Smrg                 "*** SHADER CONFIG ***\n"
10367ec681f3Smrg                 "SPI_PS_INPUT_ADDR = 0x%04x\n"
10377ec681f3Smrg                 "SPI_PS_INPUT_ENA  = 0x%04x\n",
10387ec681f3Smrg                 conf->spi_ps_input_addr, conf->spi_ps_input_ena);
10397ec681f3Smrg      }
10407ec681f3Smrg
10417ec681f3Smrg      fprintf(file,
10427ec681f3Smrg              "*** SHADER STATS ***\n"
10437ec681f3Smrg              "SGPRS: %d\n"
10447ec681f3Smrg              "VGPRS: %d\n"
10457ec681f3Smrg              "Spilled SGPRs: %d\n"
10467ec681f3Smrg              "Spilled VGPRs: %d\n"
10477ec681f3Smrg              "Private memory VGPRs: %d\n"
10487ec681f3Smrg              "Code Size: %d bytes\n"
10497ec681f3Smrg              "LDS: %d blocks\n"
10507ec681f3Smrg              "Scratch: %d bytes per wave\n"
10517ec681f3Smrg              "Max Waves: %d\n"
10527ec681f3Smrg              "********************\n\n\n",
10537ec681f3Smrg              conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs,
10547ec681f3Smrg              shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader),
10557ec681f3Smrg              conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves);
10567ec681f3Smrg   }
10577ec681f3Smrg}
10587ec681f3Smrg
10597ec681f3Smrgconst char *si_get_shader_name(const struct si_shader *shader)
10607ec681f3Smrg{
10617ec681f3Smrg   switch (shader->selector->info.stage) {
10627ec681f3Smrg   case MESA_SHADER_VERTEX:
10637ec681f3Smrg      if (shader->key.as_es)
10647ec681f3Smrg         return "Vertex Shader as ES";
10657ec681f3Smrg      else if (shader->key.as_ls)
10667ec681f3Smrg         return "Vertex Shader as LS";
10677ec681f3Smrg      else if (shader->key.as_ngg)
10687ec681f3Smrg         return "Vertex Shader as ESGS";
10697ec681f3Smrg      else
10707ec681f3Smrg         return "Vertex Shader as VS";
10717ec681f3Smrg   case MESA_SHADER_TESS_CTRL:
10727ec681f3Smrg      return "Tessellation Control Shader";
10737ec681f3Smrg   case MESA_SHADER_TESS_EVAL:
10747ec681f3Smrg      if (shader->key.as_es)
10757ec681f3Smrg         return "Tessellation Evaluation Shader as ES";
10767ec681f3Smrg      else if (shader->key.as_ngg)
10777ec681f3Smrg         return "Tessellation Evaluation Shader as ESGS";
10787ec681f3Smrg      else
10797ec681f3Smrg         return "Tessellation Evaluation Shader as VS";
10807ec681f3Smrg   case MESA_SHADER_GEOMETRY:
10817ec681f3Smrg      if (shader->is_gs_copy_shader)
10827ec681f3Smrg         return "GS Copy Shader as VS";
10837ec681f3Smrg      else
10847ec681f3Smrg         return "Geometry Shader";
10857ec681f3Smrg   case MESA_SHADER_FRAGMENT:
10867ec681f3Smrg      return "Pixel Shader";
10877ec681f3Smrg   case MESA_SHADER_COMPUTE:
10887ec681f3Smrg      return "Compute Shader";
10897ec681f3Smrg   default:
10907ec681f3Smrg      return "Unknown Shader";
10917ec681f3Smrg   }
10927ec681f3Smrg}
10937ec681f3Smrg
10947ec681f3Smrgvoid si_shader_dump(struct si_screen *sscreen, struct si_shader *shader,
10957ec681f3Smrg                    struct pipe_debug_callback *debug, FILE *file, bool check_debug_option)
10967ec681f3Smrg{
10977ec681f3Smrg   gl_shader_stage stage = shader->selector->info.stage;
10987ec681f3Smrg
10997ec681f3Smrg   if (!check_debug_option || si_can_dump_shader(sscreen, stage))
11007ec681f3Smrg      si_dump_shader_key(shader, file);
11017ec681f3Smrg
11027ec681f3Smrg   if (!check_debug_option && shader->binary.llvm_ir_string) {
11037ec681f3Smrg      if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) {
11047ec681f3Smrg         fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader));
11057ec681f3Smrg         fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string);
11067ec681f3Smrg      }
11077ec681f3Smrg
11087ec681f3Smrg      fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader));
11097ec681f3Smrg      fprintf(file, "%s\n", shader->binary.llvm_ir_string);
11107ec681f3Smrg   }
11117ec681f3Smrg
11127ec681f3Smrg   if (!check_debug_option ||
11137ec681f3Smrg       (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) {
11147ec681f3Smrg      unsigned wave_size = si_get_shader_wave_size(shader);
11157ec681f3Smrg
11167ec681f3Smrg      fprintf(file, "\n%s:\n", si_get_shader_name(shader));
11177ec681f3Smrg
11187ec681f3Smrg      if (shader->prolog)
11197ec681f3Smrg         si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug,
11207ec681f3Smrg                                    "prolog", file);
11217ec681f3Smrg      if (shader->previous_stage)
11227ec681f3Smrg         si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage,
11237ec681f3Smrg                                    wave_size, debug, "previous stage", file);
11247ec681f3Smrg      if (shader->prolog2)
11257ec681f3Smrg         si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size,
11267ec681f3Smrg                                    debug, "prolog2", file);
11277ec681f3Smrg
11287ec681f3Smrg      si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main",
11297ec681f3Smrg                                 file);
11307ec681f3Smrg
11317ec681f3Smrg      if (shader->epilog)
11327ec681f3Smrg         si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug,
11337ec681f3Smrg                                    "epilog", file);
11347ec681f3Smrg      fprintf(file, "\n");
11357ec681f3Smrg   }
11367ec681f3Smrg
11377ec681f3Smrg   si_shader_dump_stats(sscreen, shader, file, check_debug_option);
113801e04c3fSmrg}
113901e04c3fSmrg
114001e04c3fSmrgstatic void si_dump_shader_key_vs(const struct si_shader_key *key,
11417ec681f3Smrg                                  const struct si_vs_prolog_bits *prolog, const char *prefix,
11427ec681f3Smrg                                  FILE *f)
11437ec681f3Smrg{
11447ec681f3Smrg   fprintf(f, "  %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one);
11457ec681f3Smrg   fprintf(f, "  %s.instance_divisor_is_fetched = %u\n", prefix,
11467ec681f3Smrg           prolog->instance_divisor_is_fetched);
11477ec681f3Smrg   fprintf(f, "  %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix);
11487ec681f3Smrg
11497ec681f3Smrg   fprintf(f, "  mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode);
11507ec681f3Smrg   fprintf(f, "  mono.vs.fix_fetch = {");
11517ec681f3Smrg   for (int i = 0; i < SI_MAX_ATTRIBS; i++) {
11527ec681f3Smrg      union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i];
11537ec681f3Smrg      if (i)
11547ec681f3Smrg         fprintf(f, ", ");
11557ec681f3Smrg      if (!fix.bits)
11567ec681f3Smrg         fprintf(f, "0");
11577ec681f3Smrg      else
11587ec681f3Smrg         fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1,
11597ec681f3Smrg                 fix.u.format);
11607ec681f3Smrg   }
11617ec681f3Smrg   fprintf(f, "}\n");
11627ec681f3Smrg}
11637ec681f3Smrg
11647ec681f3Smrgstatic void si_dump_shader_key(const struct si_shader *shader, FILE *f)
11657ec681f3Smrg{
11667ec681f3Smrg   const struct si_shader_key *key = &shader->key;
11677ec681f3Smrg   gl_shader_stage stage = shader->selector->info.stage;
11687ec681f3Smrg
11697ec681f3Smrg   fprintf(f, "SHADER KEY\n");
11707ec681f3Smrg
11717ec681f3Smrg   switch (stage) {
11727ec681f3Smrg   case MESA_SHADER_VERTEX:
11737ec681f3Smrg      si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f);
11747ec681f3Smrg      fprintf(f, "  as_es = %u\n", key->as_es);
11757ec681f3Smrg      fprintf(f, "  as_ls = %u\n", key->as_ls);
11767ec681f3Smrg      fprintf(f, "  as_ngg = %u\n", key->as_ngg);
11777ec681f3Smrg      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
11787ec681f3Smrg      break;
11797ec681f3Smrg
11807ec681f3Smrg   case MESA_SHADER_TESS_CTRL:
11817ec681f3Smrg      if (shader->selector->screen->info.chip_class >= GFX9) {
11827ec681f3Smrg         si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f);
11837ec681f3Smrg      }
11847ec681f3Smrg      fprintf(f, "  part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode);
11857ec681f3Smrg      fprintf(f, "  mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n",
11867ec681f3Smrg              key->mono.u.ff_tcs_inputs_to_copy);
11877ec681f3Smrg      fprintf(f, "  opt.prefer_mono = %u\n", key->opt.prefer_mono);
11887ec681f3Smrg      fprintf(f, "  opt.same_patch_vertices = %u\n", key->opt.same_patch_vertices);
11897ec681f3Smrg      break;
11907ec681f3Smrg
11917ec681f3Smrg   case MESA_SHADER_TESS_EVAL:
11927ec681f3Smrg      fprintf(f, "  as_es = %u\n", key->as_es);
11937ec681f3Smrg      fprintf(f, "  as_ngg = %u\n", key->as_ngg);
11947ec681f3Smrg      fprintf(f, "  mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id);
11957ec681f3Smrg      break;
11967ec681f3Smrg
11977ec681f3Smrg   case MESA_SHADER_GEOMETRY:
11987ec681f3Smrg      if (shader->is_gs_copy_shader)
11997ec681f3Smrg         break;
12007ec681f3Smrg
12017ec681f3Smrg      if (shader->selector->screen->info.chip_class >= GFX9 &&
12027ec681f3Smrg          key->part.gs.es->info.stage == MESA_SHADER_VERTEX) {
12037ec681f3Smrg         si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f);
12047ec681f3Smrg      }
12057ec681f3Smrg      fprintf(f, "  part.gs.prolog.tri_strip_adj_fix = %u\n",
12067ec681f3Smrg              key->part.gs.prolog.tri_strip_adj_fix);
12077ec681f3Smrg      fprintf(f, "  as_ngg = %u\n", key->as_ngg);
12087ec681f3Smrg      break;
12097ec681f3Smrg
12107ec681f3Smrg   case MESA_SHADER_COMPUTE:
12117ec681f3Smrg      break;
12127ec681f3Smrg
12137ec681f3Smrg   case MESA_SHADER_FRAGMENT:
12147ec681f3Smrg      fprintf(f, "  part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side);
12157ec681f3Smrg      fprintf(f, "  part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors);
12167ec681f3Smrg      fprintf(f, "  part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple);
12177ec681f3Smrg      fprintf(f, "  part.ps.prolog.force_persp_sample_interp = %u\n",
12187ec681f3Smrg              key->part.ps.prolog.force_persp_sample_interp);
12197ec681f3Smrg      fprintf(f, "  part.ps.prolog.force_linear_sample_interp = %u\n",
12207ec681f3Smrg              key->part.ps.prolog.force_linear_sample_interp);
12217ec681f3Smrg      fprintf(f, "  part.ps.prolog.force_persp_center_interp = %u\n",
12227ec681f3Smrg              key->part.ps.prolog.force_persp_center_interp);
12237ec681f3Smrg      fprintf(f, "  part.ps.prolog.force_linear_center_interp = %u\n",
12247ec681f3Smrg              key->part.ps.prolog.force_linear_center_interp);
12257ec681f3Smrg      fprintf(f, "  part.ps.prolog.bc_optimize_for_persp = %u\n",
12267ec681f3Smrg              key->part.ps.prolog.bc_optimize_for_persp);
12277ec681f3Smrg      fprintf(f, "  part.ps.prolog.bc_optimize_for_linear = %u\n",
12287ec681f3Smrg              key->part.ps.prolog.bc_optimize_for_linear);
12297ec681f3Smrg      fprintf(f, "  part.ps.prolog.samplemask_log_ps_iter = %u\n",
12307ec681f3Smrg              key->part.ps.prolog.samplemask_log_ps_iter);
12317ec681f3Smrg      fprintf(f, "  part.ps.epilog.spi_shader_col_format = 0x%x\n",
12327ec681f3Smrg              key->part.ps.epilog.spi_shader_col_format);
12337ec681f3Smrg      fprintf(f, "  part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8);
12347ec681f3Smrg      fprintf(f, "  part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10);
12357ec681f3Smrg      fprintf(f, "  part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf);
12367ec681f3Smrg      fprintf(f, "  part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func);
12377ec681f3Smrg      fprintf(f, "  part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one);
12387ec681f3Smrg      fprintf(f, "  part.ps.epilog.poly_line_smoothing = %u\n",
12397ec681f3Smrg              key->part.ps.epilog.poly_line_smoothing);
12407ec681f3Smrg      fprintf(f, "  part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color);
12417ec681f3Smrg      fprintf(f, "  mono.u.ps.interpolate_at_sample_force_center = %u\n",
12427ec681f3Smrg              key->mono.u.ps.interpolate_at_sample_force_center);
12437ec681f3Smrg      fprintf(f, "  mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa);
12447ec681f3Smrg      fprintf(f, "  mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D);
12457ec681f3Smrg      fprintf(f, "  mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered);
12467ec681f3Smrg      break;
12477ec681f3Smrg
12487ec681f3Smrg   default:
12497ec681f3Smrg      assert(0);
12507ec681f3Smrg   }
12517ec681f3Smrg
12527ec681f3Smrg   if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL ||
12537ec681f3Smrg        stage == MESA_SHADER_VERTEX) &&
12547ec681f3Smrg       !key->as_es && !key->as_ls) {
12557ec681f3Smrg      fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);
12567ec681f3Smrg      fprintf(f, "  opt.kill_pointsize = 0x%x\n", key->opt.kill_pointsize);
12577ec681f3Smrg      fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->opt.kill_clip_distances);
12587ec681f3Smrg      if (stage != MESA_SHADER_GEOMETRY)
12597ec681f3Smrg         fprintf(f, "  opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
12607ec681f3Smrg   }
12617ec681f3Smrg
12627ec681f3Smrg   fprintf(f, "  opt.prefer_mono = %u\n", key->opt.prefer_mono);
12637ec681f3Smrg   fprintf(f, "  opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n",
12647ec681f3Smrg           key->opt.inline_uniforms,
12657ec681f3Smrg           key->opt.inlined_uniform_values[0],
12667ec681f3Smrg           key->opt.inlined_uniform_values[1],
12677ec681f3Smrg           key->opt.inlined_uniform_values[2],
12687ec681f3Smrg           key->opt.inlined_uniform_values[3]);
12697ec681f3Smrg}
12707ec681f3Smrg
12717ec681f3Smrgbool si_vs_needs_prolog(const struct si_shader_selector *sel,
12727ec681f3Smrg                        const struct si_vs_prolog_bits *prolog_key,
12737ec681f3Smrg                        const struct si_shader_key *key, bool ngg_cull_shader)
12747ec681f3Smrg{
12757ec681f3Smrg   /* VGPR initialization fixup for Vega10 and Raven is always done in the
12767ec681f3Smrg    * VS prolog. */
12777ec681f3Smrg   return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix ||
12787ec681f3Smrg          /* The 2nd VS prolog loads input VGPRs from LDS */
12797ec681f3Smrg          (key->opt.ngg_culling && !ngg_cull_shader);
1280af69d88dSmrg}
1281af69d88dSmrg
128201e04c3fSmrg/**
128301e04c3fSmrg * Compute the VS prolog key, which contains all the information needed to
128401e04c3fSmrg * build the VS prolog function, and set shader->info bits where needed.
128501e04c3fSmrg *
128601e04c3fSmrg * \param info             Shader info of the vertex shader.
128701e04c3fSmrg * \param num_input_sgprs  Number of input SGPRs for the vertex shader.
12887ec681f3Smrg * \param has_old_  Whether the preceding shader part is the NGG cull shader.
128901e04c3fSmrg * \param prolog_key       Key of the VS prolog
129001e04c3fSmrg * \param shader_out       The vertex shader, or the next shader if merging LS+HS or ES+GS.
129101e04c3fSmrg * \param key              Output shader part key.
129201e04c3fSmrg */
12937ec681f3Smrgvoid si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs,
12947ec681f3Smrg                          bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key,
12957ec681f3Smrg                          struct si_shader *shader_out, union si_shader_part_key *key)
12967ec681f3Smrg{
12977ec681f3Smrg   memset(key, 0, sizeof(*key));
12987ec681f3Smrg   key->vs_prolog.states = *prolog_key;
12997ec681f3Smrg   key->vs_prolog.num_input_sgprs = num_input_sgprs;
13007ec681f3Smrg   key->vs_prolog.num_inputs = info->num_inputs;
13017ec681f3Smrg   key->vs_prolog.as_ls = shader_out->key.as_ls;
13027ec681f3Smrg   key->vs_prolog.as_es = shader_out->key.as_es;
13037ec681f3Smrg   key->vs_prolog.as_ngg = shader_out->key.as_ngg;
13047ec681f3Smrg
13057ec681f3Smrg   if (!ngg_cull_shader && shader_out->key.opt.ngg_culling)
13067ec681f3Smrg      key->vs_prolog.load_vgprs_after_culling = 1;
13077ec681f3Smrg
13087ec681f3Smrg   if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
13097ec681f3Smrg      key->vs_prolog.as_ls = 1;
13107ec681f3Smrg      key->vs_prolog.num_merged_next_stage_vgprs = 2;
13117ec681f3Smrg   } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) {
13127ec681f3Smrg      key->vs_prolog.as_es = 1;
13137ec681f3Smrg      key->vs_prolog.num_merged_next_stage_vgprs = 5;
13147ec681f3Smrg   } else if (shader_out->key.as_ngg) {
13157ec681f3Smrg      key->vs_prolog.num_merged_next_stage_vgprs = 5;
13167ec681f3Smrg   }
13177ec681f3Smrg
13187ec681f3Smrg   /* Only one of these combinations can be set. as_ngg can be set with as_es. */
13197ec681f3Smrg   assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg +
13207ec681f3Smrg          (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1);
13217ec681f3Smrg
13227ec681f3Smrg   /* Enable loading the InstanceID VGPR. */
13237ec681f3Smrg   uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
13247ec681f3Smrg
13257ec681f3Smrg   if ((key->vs_prolog.states.instance_divisor_is_one |
13267ec681f3Smrg        key->vs_prolog.states.instance_divisor_is_fetched) &
13277ec681f3Smrg       input_mask)
13287ec681f3Smrg      shader_out->info.uses_instanceid = true;
13297ec681f3Smrg}
13307ec681f3Smrg
13317ec681f3Smrgstruct nir_shader *si_get_nir_shader(struct si_shader_selector *sel,
13327ec681f3Smrg                                     const struct si_shader_key *key,
13337ec681f3Smrg                                     bool *free_nir)
13347ec681f3Smrg{
13357ec681f3Smrg   nir_shader *nir;
13367ec681f3Smrg   *free_nir = false;
13377ec681f3Smrg
13387ec681f3Smrg   if (sel->nir) {
13397ec681f3Smrg      nir = sel->nir;
13407ec681f3Smrg   } else if (sel->nir_binary) {
13417ec681f3Smrg      struct pipe_screen *screen = &sel->screen->b;
13427ec681f3Smrg      const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR,
13437ec681f3Smrg                                                         pipe_shader_type_from_mesa(sel->info.stage));
13447ec681f3Smrg
13457ec681f3Smrg      struct blob_reader blob_reader;
13467ec681f3Smrg      blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size);
13477ec681f3Smrg      *free_nir = true;
13487ec681f3Smrg      nir = nir_deserialize(NULL, options, &blob_reader);
13497ec681f3Smrg   } else {
13507ec681f3Smrg      return NULL;
13517ec681f3Smrg   }
13527ec681f3Smrg
13537ec681f3Smrg   if (key && key->opt.inline_uniforms) {
13547ec681f3Smrg      assert(*free_nir);
13557ec681f3Smrg
13567ec681f3Smrg      /* Most places use shader information from the default variant, not
13577ec681f3Smrg       * the optimized variant. These are the things that the driver looks at
13587ec681f3Smrg       * in optimized variants and the list of things that we need to do.
13597ec681f3Smrg       *
13607ec681f3Smrg       * The driver takes into account these things if they suddenly disappear
13617ec681f3Smrg       * from the shader code:
13627ec681f3Smrg       * - Register usage and code size decrease (obvious)
13637ec681f3Smrg       * - Eliminated PS system values are disabled by LLVM
13647ec681f3Smrg       *   (FragCoord, FrontFace, barycentrics)
13657ec681f3Smrg       * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.
13667ec681f3Smrg       *   (thanks to an LLVM pass in Mesa - TODO: move it to NIR)
13677ec681f3Smrg       *   The storage for eliminated outputs is also not allocated.
13687ec681f3Smrg       * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
13697ec681f3Smrg       * - TCS output stores are eliminated
13707ec681f3Smrg       *
13717ec681f3Smrg       * TODO: These are things the driver ignores in the final shader code
13727ec681f3Smrg       * and relies on the default shader info.
13737ec681f3Smrg       * - Other system values are not eliminated
13747ec681f3Smrg       * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs
13757ec681f3Smrg       *   to remove holes
13767ec681f3Smrg       * - uses_discard - if it changed to false
13777ec681f3Smrg       * - writes_memory - if it changed to false
13787ec681f3Smrg       * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
13797ec681f3Smrg       *   eliminated
13807ec681f3Smrg       * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
13817ec681f3Smrg       *   GS outputs are eliminated except for the temporary LDS.
13827ec681f3Smrg       *   Clip distances, gl_PointSize, and PS outputs are eliminated based
13837ec681f3Smrg       *   on current states, so we don't care about the shader code.
13847ec681f3Smrg       *
13857ec681f3Smrg       * TODO: Merged shaders don't inline uniforms for the first stage.
13867ec681f3Smrg       * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
13877ec681f3Smrg       * (key == NULL for the first stage here)
13887ec681f3Smrg       *
13897ec681f3Smrg       * TODO: Compute shaders don't support inlinable uniforms, because they
13907ec681f3Smrg       * don't have shader variants.
13917ec681f3Smrg       *
13927ec681f3Smrg       * TODO: The driver uses a linear search to find a shader variant. This
13937ec681f3Smrg       * can be really slow if we get too many variants due to uniform inlining.
13947ec681f3Smrg       */
13957ec681f3Smrg      NIR_PASS_V(nir, nir_inline_uniforms,
13967ec681f3Smrg                 nir->info.num_inlinable_uniforms,
13977ec681f3Smrg                 key->opt.inlined_uniform_values,
13987ec681f3Smrg                 nir->info.inlinable_uniform_dw_offsets);
13997ec681f3Smrg
14007ec681f3Smrg      si_nir_opts(sel->screen, nir, true);
14017ec681f3Smrg      si_nir_late_opts(nir);
14027ec681f3Smrg
14037ec681f3Smrg      /* This must be done again. */
14047ec681f3Smrg      NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in |
14057ec681f3Smrg                                                       nir_var_shader_out);
14067ec681f3Smrg   }
14077ec681f3Smrg
14087ec681f3Smrg   return nir;
14097ec681f3Smrg}
14107ec681f3Smrg
14117ec681f3Smrgbool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
14127ec681f3Smrg                       struct si_shader *shader, struct pipe_debug_callback *debug)
14137ec681f3Smrg{
14147ec681f3Smrg   struct si_shader_selector *sel = shader->selector;
14157ec681f3Smrg   bool free_nir;
14167ec681f3Smrg   struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir);
14177ec681f3Smrg
14187ec681f3Smrg   /* Dump NIR before doing NIR->LLVM conversion in case the
14197ec681f3Smrg    * conversion fails. */
14207ec681f3Smrg   if (si_can_dump_shader(sscreen, sel->info.stage) &&
14217ec681f3Smrg       !(sscreen->debug_flags & DBG(NO_NIR))) {
14227ec681f3Smrg      nir_print_shader(nir, stderr);
14237ec681f3Smrg      si_dump_streamout(&sel->so);
14247ec681f3Smrg   }
14257ec681f3Smrg
14267ec681f3Smrg   /* Initialize vs_output_ps_input_cntl to default. */
14277ec681f3Smrg   for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++)
14287ec681f3Smrg      shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED;
14297ec681f3Smrg   shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0;
14307ec681f3Smrg
14317ec681f3Smrg   shader->info.uses_instanceid = sel->info.uses_instanceid;
14327ec681f3Smrg
14337ec681f3Smrg   /* TODO: ACO could compile non-monolithic shaders here (starting
14347ec681f3Smrg    * with PS and NGG VS), but monolithic shaders should be compiled
14357ec681f3Smrg    * by LLVM due to more complicated compilation.
14367ec681f3Smrg    */
14377ec681f3Smrg   if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir))
14387ec681f3Smrg      return false;
14397ec681f3Smrg
14407ec681f3Smrg   /* Compute vs_output_ps_input_cntl. */
14417ec681f3Smrg   if ((sel->info.stage == MESA_SHADER_VERTEX ||
14427ec681f3Smrg        sel->info.stage == MESA_SHADER_TESS_EVAL ||
14437ec681f3Smrg        sel->info.stage == MESA_SHADER_GEOMETRY) &&
14447ec681f3Smrg       !shader->key.as_ls && !shader->key.as_es) {
14457ec681f3Smrg      ubyte *vs_output_param_offset = shader->info.vs_output_param_offset;
14467ec681f3Smrg
14477ec681f3Smrg      if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg)
14487ec681f3Smrg         vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset;
14497ec681f3Smrg
14507ec681f3Smrg      /* VS and TES should also set primitive ID output if it's used. */
14517ec681f3Smrg      unsigned num_outputs_with_prim_id = sel->info.num_outputs +
14527ec681f3Smrg                                          shader->key.mono.u.vs_export_prim_id;
14537ec681f3Smrg
14547ec681f3Smrg      for (unsigned i = 0; i < num_outputs_with_prim_id; i++) {
14557ec681f3Smrg         unsigned semantic = sel->info.output_semantic[i];
14567ec681f3Smrg         unsigned offset = vs_output_param_offset[i];
14577ec681f3Smrg         unsigned ps_input_cntl;
14587ec681f3Smrg
14597ec681f3Smrg         if (offset <= AC_EXP_PARAM_OFFSET_31) {
14607ec681f3Smrg            /* The input is loaded from parameter memory. */
14617ec681f3Smrg            ps_input_cntl = S_028644_OFFSET(offset);
14627ec681f3Smrg         } else {
14637ec681f3Smrg            /* The input is a DEFAULT_VAL constant. */
14647ec681f3Smrg            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
14657ec681f3Smrg                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
14667ec681f3Smrg            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
14677ec681f3Smrg
14687ec681f3Smrg            /* OFFSET=0x20 means that DEFAULT_VAL is used. */
14697ec681f3Smrg            ps_input_cntl = S_028644_OFFSET(0x20) |
14707ec681f3Smrg                            S_028644_DEFAULT_VAL(offset);
14717ec681f3Smrg         }
14727ec681f3Smrg
14737ec681f3Smrg         shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl;
14747ec681f3Smrg      }
14757ec681f3Smrg   }
14767ec681f3Smrg
14777ec681f3Smrg   /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */
14787ec681f3Smrg   if (sel->info.stage == MESA_SHADER_COMPUTE) {
14797ec681f3Smrg      unsigned wave_size = sscreen->compute_wave_size;
14807ec681f3Smrg      unsigned max_vgprs =
14817ec681f3Smrg         sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1);
14827ec681f3Smrg      unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd;
14837ec681f3Smrg      unsigned max_sgprs_per_wave = 128;
14847ec681f3Smrg      unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */
14857ec681f3Smrg      unsigned threads_per_tg = si_get_max_workgroup_size(shader);
14867ec681f3Smrg      unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size);
14877ec681f3Smrg      unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg);
14887ec681f3Smrg
14897ec681f3Smrg      max_vgprs = max_vgprs / waves_per_simd;
14907ec681f3Smrg      max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave);
14917ec681f3Smrg
14927ec681f3Smrg      if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) {
14937ec681f3Smrg         fprintf(stderr,
14947ec681f3Smrg                 "LLVM failed to compile a shader correctly: "
14957ec681f3Smrg                 "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
14967ec681f3Smrg                 shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs);
14977ec681f3Smrg
14987ec681f3Smrg         /* Just terminate the process, because dependent
14997ec681f3Smrg          * shaders can hang due to bad input data, but use
15007ec681f3Smrg          * the env var to allow shader-db to work.
15017ec681f3Smrg          */
15027ec681f3Smrg         if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
15037ec681f3Smrg            abort();
15047ec681f3Smrg      }
15057ec681f3Smrg   }
15067ec681f3Smrg
15077ec681f3Smrg   /* Add the scratch offset to input SGPRs. */
15087ec681f3Smrg   if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader))
15097ec681f3Smrg      shader->info.num_input_sgprs += 1; /* scratch byte offset */
15107ec681f3Smrg
15117ec681f3Smrg   /* Calculate the number of fragment input VGPRs. */
15127ec681f3Smrg   if (sel->info.stage == MESA_SHADER_FRAGMENT) {
15137ec681f3Smrg      shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt(
15147ec681f3Smrg         &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index);
15157ec681f3Smrg   }
15167ec681f3Smrg
15177ec681f3Smrg   si_calculate_max_simd_waves(shader);
15187ec681f3Smrg   si_shader_dump_stats_for_shader_db(sscreen, shader, debug);
15197ec681f3Smrg   return true;
152001e04c3fSmrg}
152101e04c3fSmrg
152201e04c3fSmrg/**
152301e04c3fSmrg * Create, compile and return a shader part (prolog or epilog).
152401e04c3fSmrg *
152501e04c3fSmrg * \param sscreen	screen
152601e04c3fSmrg * \param list		list of shader parts of the same category
152701e04c3fSmrg * \param type		shader type
152801e04c3fSmrg * \param key		shader part key
152901e04c3fSmrg * \param prolog	whether the part being requested is a prolog
153001e04c3fSmrg * \param tm		LLVM target machine
153101e04c3fSmrg * \param debug		debug callback
153201e04c3fSmrg * \param build		the callback responsible for building the main function
153301e04c3fSmrg * \return		non-NULL on success
153401e04c3fSmrg */
153501e04c3fSmrgstatic struct si_shader_part *
15367ec681f3Smrgsi_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list,
15377ec681f3Smrg                   gl_shader_stage stage, bool prolog, union si_shader_part_key *key,
15387ec681f3Smrg                   struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug,
15397ec681f3Smrg                   void (*build)(struct si_shader_context *, union si_shader_part_key *),
15407ec681f3Smrg                   const char *name)
15417ec681f3Smrg{
15427ec681f3Smrg   struct si_shader_part *result;
15437ec681f3Smrg
15447ec681f3Smrg   simple_mtx_lock(&sscreen->shader_parts_mutex);
15457ec681f3Smrg
15467ec681f3Smrg   /* Find existing. */
15477ec681f3Smrg   for (result = *list; result; result = result->next) {
15487ec681f3Smrg      if (memcmp(&result->key, key, sizeof(*key)) == 0) {
15497ec681f3Smrg         simple_mtx_unlock(&sscreen->shader_parts_mutex);
15507ec681f3Smrg         return result;
15517ec681f3Smrg      }
15527ec681f3Smrg   }
15537ec681f3Smrg
15547ec681f3Smrg   /* Compile a new one. */
15557ec681f3Smrg   result = CALLOC_STRUCT(si_shader_part);
15567ec681f3Smrg   result->key = *key;
15577ec681f3Smrg
15587ec681f3Smrg   struct si_shader_selector sel = {};
15597ec681f3Smrg   sel.screen = sscreen;
15607ec681f3Smrg
15617ec681f3Smrg   struct si_shader shader = {};
15627ec681f3Smrg   shader.selector = &sel;
15637ec681f3Smrg
15647ec681f3Smrg   switch (stage) {
15657ec681f3Smrg   case MESA_SHADER_VERTEX:
15667ec681f3Smrg      shader.key.as_ls = key->vs_prolog.as_ls;
15677ec681f3Smrg      shader.key.as_es = key->vs_prolog.as_es;
15687ec681f3Smrg      shader.key.as_ngg = key->vs_prolog.as_ngg;
15697ec681f3Smrg      break;
15707ec681f3Smrg   case MESA_SHADER_TESS_CTRL:
15717ec681f3Smrg      assert(!prolog);
15727ec681f3Smrg      shader.key.part.tcs.epilog = key->tcs_epilog.states;
15737ec681f3Smrg      break;
15747ec681f3Smrg   case MESA_SHADER_GEOMETRY:
15757ec681f3Smrg      assert(prolog);
15767ec681f3Smrg      shader.key.as_ngg = key->gs_prolog.as_ngg;
15777ec681f3Smrg      break;
15787ec681f3Smrg   case MESA_SHADER_FRAGMENT:
15797ec681f3Smrg      if (prolog)
15807ec681f3Smrg         shader.key.part.ps.prolog = key->ps_prolog.states;
15817ec681f3Smrg      else
15827ec681f3Smrg         shader.key.part.ps.epilog = key->ps_epilog.states;
15837ec681f3Smrg      break;
15847ec681f3Smrg   default:
15857ec681f3Smrg      unreachable("bad shader part");
15867ec681f3Smrg   }
15877ec681f3Smrg
15887ec681f3Smrg   struct si_shader_context ctx;
15897ec681f3Smrg   si_llvm_context_init(&ctx, sscreen, compiler,
15907ec681f3Smrg                        si_get_wave_size(sscreen, stage,
15917ec681f3Smrg                                         shader.key.as_ngg, shader.key.as_es));
15927ec681f3Smrg   ctx.shader = &shader;
15937ec681f3Smrg   ctx.stage = stage;
15947ec681f3Smrg
15957ec681f3Smrg   build(&ctx, key);
15967ec681f3Smrg
15977ec681f3Smrg   /* Compile. */
15987ec681f3Smrg   si_llvm_optimize_module(&ctx);
15997ec681f3Smrg
16007ec681f3Smrg   if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug,
16017ec681f3Smrg                        ctx.stage, name, false)) {
16027ec681f3Smrg      FREE(result);
16037ec681f3Smrg      result = NULL;
16047ec681f3Smrg      goto out;
16057ec681f3Smrg   }
16067ec681f3Smrg
16077ec681f3Smrg   result->next = *list;
16087ec681f3Smrg   *list = result;
160901e04c3fSmrg
161001e04c3fSmrgout:
16117ec681f3Smrg   si_llvm_dispose(&ctx);
16127ec681f3Smrg   simple_mtx_unlock(&sscreen->shader_parts_mutex);
16137ec681f3Smrg   return result;
1614af69d88dSmrg}
1615af69d88dSmrg
16167ec681f3Smrgstatic bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
16177ec681f3Smrg                             struct si_shader *shader, struct pipe_debug_callback *debug,
16187ec681f3Smrg                             struct si_shader *main_part, const struct si_vs_prolog_bits *key)
1619af69d88dSmrg{
16207ec681f3Smrg   struct si_shader_selector *vs = main_part->selector;
162101e04c3fSmrg
16227ec681f3Smrg   if (!si_vs_needs_prolog(vs, key, &shader->key, false))
16237ec681f3Smrg      return true;
1624af69d88dSmrg
16257ec681f3Smrg   /* Get the prolog. */
16267ec681f3Smrg   union si_shader_part_key prolog_key;
16277ec681f3Smrg   si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader,
16287ec681f3Smrg                        &prolog_key);
162901e04c3fSmrg
16307ec681f3Smrg   shader->prolog =
16317ec681f3Smrg      si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key,
16327ec681f3Smrg                         compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog");
16337ec681f3Smrg   return shader->prolog != NULL;
1634af69d88dSmrg}
1635af69d88dSmrg
163601e04c3fSmrg/**
163701e04c3fSmrg * Select and compile (or reuse) vertex shader parts (prolog & epilog).
163801e04c3fSmrg */
16397ec681f3Smrgstatic bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
16407ec681f3Smrg                                      struct si_shader *shader, struct pipe_debug_callback *debug)
1641af69d88dSmrg{
16427ec681f3Smrg   return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog);
164301e04c3fSmrg}
1644af69d88dSmrg
164501e04c3fSmrg/**
16467ec681f3Smrg * Select and compile (or reuse) TCS parts (epilog).
164701e04c3fSmrg */
16487ec681f3Smrgstatic bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
16497ec681f3Smrg                                       struct si_shader *shader, struct pipe_debug_callback *debug)
165001e04c3fSmrg{
16517ec681f3Smrg   if (sscreen->info.chip_class >= GFX9) {
16527ec681f3Smrg      struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls;
16537ec681f3Smrg
16547ec681f3Smrg      if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
16557ec681f3Smrg                            &shader->key.part.tcs.ls_prolog))
16567ec681f3Smrg         return false;
16577ec681f3Smrg
16587ec681f3Smrg      shader->previous_stage = ls_main_part;
16597ec681f3Smrg   }
16607ec681f3Smrg
16617ec681f3Smrg   /* Get the epilog. */
16627ec681f3Smrg   union si_shader_part_key epilog_key;
16637ec681f3Smrg   memset(&epilog_key, 0, sizeof(epilog_key));
16647ec681f3Smrg   epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
16657ec681f3Smrg
16667ec681f3Smrg   shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false,
16677ec681f3Smrg                                       &epilog_key, compiler, debug, si_llvm_build_tcs_epilog,
16687ec681f3Smrg                                       "Tessellation Control Shader Epilog");
16697ec681f3Smrg   return shader->epilog != NULL;
167001e04c3fSmrg}
1671af69d88dSmrg
167201e04c3fSmrg/**
16737ec681f3Smrg * Select and compile (or reuse) GS parts (prolog).
167401e04c3fSmrg */
16757ec681f3Smrgstatic bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
16767ec681f3Smrg                                      struct si_shader *shader, struct pipe_debug_callback *debug)
167701e04c3fSmrg{
16787ec681f3Smrg   if (sscreen->info.chip_class >= GFX9) {
16797ec681f3Smrg      struct si_shader *es_main_part;
16807ec681f3Smrg
16817ec681f3Smrg      if (shader->key.as_ngg)
16827ec681f3Smrg         es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es;
16837ec681f3Smrg      else
16847ec681f3Smrg         es_main_part = shader->key.part.gs.es->main_shader_part_es;
16857ec681f3Smrg
16867ec681f3Smrg      if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX &&
16877ec681f3Smrg          !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
16887ec681f3Smrg                            &shader->key.part.gs.vs_prolog))
16897ec681f3Smrg         return false;
16907ec681f3Smrg
16917ec681f3Smrg      shader->previous_stage = es_main_part;
16927ec681f3Smrg   }
16937ec681f3Smrg
16947ec681f3Smrg   if (!shader->key.part.gs.prolog.tri_strip_adj_fix)
16957ec681f3Smrg      return true;
16967ec681f3Smrg
16977ec681f3Smrg   union si_shader_part_key prolog_key;
16987ec681f3Smrg   memset(&prolog_key, 0, sizeof(prolog_key));
16997ec681f3Smrg   prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
17007ec681f3Smrg   prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
17017ec681f3Smrg
17027ec681f3Smrg   shader->prolog2 =
17037ec681f3Smrg      si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key,
17047ec681f3Smrg                         compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog");
17057ec681f3Smrg   return shader->prolog2 != NULL;
1706af69d88dSmrg}
1707af69d88dSmrg
170801e04c3fSmrg/**
17097ec681f3Smrg * Compute the PS prolog key, which contains all the information needed to
17107ec681f3Smrg * build the PS prolog function, and set related bits in shader->config.
171101e04c3fSmrg */
17127ec681f3Smrgvoid si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
17137ec681f3Smrg                          bool separate_prolog)
17147ec681f3Smrg{
17157ec681f3Smrg   struct si_shader_info *info = &shader->selector->info;
17167ec681f3Smrg
17177ec681f3Smrg   memset(key, 0, sizeof(*key));
17187ec681f3Smrg   key->ps_prolog.states = shader->key.part.ps.prolog;
17197ec681f3Smrg   key->ps_prolog.colors_read = info->colors_read;
17207ec681f3Smrg   key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs;
17217ec681f3Smrg   key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs;
17227ec681f3Smrg   key->ps_prolog.wqm =
17237ec681f3Smrg      info->base.fs.needs_quad_helper_invocations &&
17247ec681f3Smrg      (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
17257ec681f3Smrg       key->ps_prolog.states.force_linear_sample_interp ||
17267ec681f3Smrg       key->ps_prolog.states.force_persp_center_interp ||
17277ec681f3Smrg       key->ps_prolog.states.force_linear_center_interp ||
17287ec681f3Smrg       key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear);
17297ec681f3Smrg   key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index;
17307ec681f3Smrg
17317ec681f3Smrg   if (info->colors_read) {
17327ec681f3Smrg      ubyte *color = shader->selector->color_attr_index;
17337ec681f3Smrg
17347ec681f3Smrg      if (shader->key.part.ps.prolog.color_two_side) {
17357ec681f3Smrg         /* BCOLORs are stored after the last input. */
17367ec681f3Smrg         key->ps_prolog.num_interp_inputs = info->num_inputs;
17377ec681f3Smrg         key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index;
17387ec681f3Smrg         if (separate_prolog)
17397ec681f3Smrg            shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1);
17407ec681f3Smrg      }
17417ec681f3Smrg
17427ec681f3Smrg      for (unsigned i = 0; i < 2; i++) {
17437ec681f3Smrg         unsigned interp = info->color_interpolate[i];
17447ec681f3Smrg         unsigned location = info->color_interpolate_loc[i];
17457ec681f3Smrg
17467ec681f3Smrg         if (!(info->colors_read & (0xf << i * 4)))
17477ec681f3Smrg            continue;
17487ec681f3Smrg
17497ec681f3Smrg         key->ps_prolog.color_attr_index[i] = color[i];
17507ec681f3Smrg
17517ec681f3Smrg         if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR)
17527ec681f3Smrg            interp = INTERP_MODE_FLAT;
17537ec681f3Smrg
17547ec681f3Smrg         switch (interp) {
17557ec681f3Smrg         case INTERP_MODE_FLAT:
17567ec681f3Smrg            key->ps_prolog.color_interp_vgpr_index[i] = -1;
17577ec681f3Smrg            break;
17587ec681f3Smrg         case INTERP_MODE_SMOOTH:
17597ec681f3Smrg         case INTERP_MODE_COLOR:
17607ec681f3Smrg            /* Force the interpolation location for colors here. */
17617ec681f3Smrg            if (shader->key.part.ps.prolog.force_persp_sample_interp)
17627ec681f3Smrg               location = TGSI_INTERPOLATE_LOC_SAMPLE;
17637ec681f3Smrg            if (shader->key.part.ps.prolog.force_persp_center_interp)
17647ec681f3Smrg               location = TGSI_INTERPOLATE_LOC_CENTER;
17657ec681f3Smrg
17667ec681f3Smrg            switch (location) {
17677ec681f3Smrg            case TGSI_INTERPOLATE_LOC_SAMPLE:
17687ec681f3Smrg               key->ps_prolog.color_interp_vgpr_index[i] = 0;
17697ec681f3Smrg               if (separate_prolog) {
17707ec681f3Smrg                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
17717ec681f3Smrg               }
17727ec681f3Smrg               break;
17737ec681f3Smrg            case TGSI_INTERPOLATE_LOC_CENTER:
17747ec681f3Smrg               key->ps_prolog.color_interp_vgpr_index[i] = 2;
17757ec681f3Smrg               if (separate_prolog) {
17767ec681f3Smrg                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
17777ec681f3Smrg               }
17787ec681f3Smrg               break;
17797ec681f3Smrg            case TGSI_INTERPOLATE_LOC_CENTROID:
17807ec681f3Smrg               key->ps_prolog.color_interp_vgpr_index[i] = 4;
17817ec681f3Smrg               if (separate_prolog) {
17827ec681f3Smrg                  shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1);
17837ec681f3Smrg               }
17847ec681f3Smrg               break;
17857ec681f3Smrg            default:
17867ec681f3Smrg               assert(0);
17877ec681f3Smrg            }
17887ec681f3Smrg            break;
17897ec681f3Smrg         case INTERP_MODE_NOPERSPECTIVE:
17907ec681f3Smrg            /* Force the interpolation location for colors here. */
17917ec681f3Smrg            if (shader->key.part.ps.prolog.force_linear_sample_interp)
17927ec681f3Smrg               location = TGSI_INTERPOLATE_LOC_SAMPLE;
17937ec681f3Smrg            if (shader->key.part.ps.prolog.force_linear_center_interp)
17947ec681f3Smrg               location = TGSI_INTERPOLATE_LOC_CENTER;
17957ec681f3Smrg
17967ec681f3Smrg            /* The VGPR assignment for non-monolithic shaders
17977ec681f3Smrg             * works because InitialPSInputAddr is set on the
17987ec681f3Smrg             * main shader and PERSP_PULL_MODEL is never used.
17997ec681f3Smrg             */
18007ec681f3Smrg            switch (location) {
18017ec681f3Smrg            case TGSI_INTERPOLATE_LOC_SAMPLE:
18027ec681f3Smrg               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9;
18037ec681f3Smrg               if (separate_prolog) {
18047ec681f3Smrg                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
18057ec681f3Smrg               }
18067ec681f3Smrg               break;
18077ec681f3Smrg            case TGSI_INTERPOLATE_LOC_CENTER:
18087ec681f3Smrg               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11;
18097ec681f3Smrg               if (separate_prolog) {
18107ec681f3Smrg                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
18117ec681f3Smrg               }
18127ec681f3Smrg               break;
18137ec681f3Smrg            case TGSI_INTERPOLATE_LOC_CENTROID:
18147ec681f3Smrg               key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13;
18157ec681f3Smrg               if (separate_prolog) {
18167ec681f3Smrg                  shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1);
18177ec681f3Smrg               }
18187ec681f3Smrg               break;
18197ec681f3Smrg            default:
18207ec681f3Smrg               assert(0);
18217ec681f3Smrg            }
18227ec681f3Smrg            break;
18237ec681f3Smrg         default:
18247ec681f3Smrg            assert(0);
18257ec681f3Smrg         }
18267ec681f3Smrg      }
18277ec681f3Smrg   }
182801e04c3fSmrg}
1829af69d88dSmrg
183001e04c3fSmrg/**
18317ec681f3Smrg * Check whether a PS prolog is required based on the key.
183201e04c3fSmrg */
18337ec681f3Smrgbool si_need_ps_prolog(const union si_shader_part_key *key)
183401e04c3fSmrg{
18357ec681f3Smrg   return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp ||
18367ec681f3Smrg          key->ps_prolog.states.force_linear_sample_interp ||
18377ec681f3Smrg          key->ps_prolog.states.force_persp_center_interp ||
18387ec681f3Smrg          key->ps_prolog.states.force_linear_center_interp ||
18397ec681f3Smrg          key->ps_prolog.states.bc_optimize_for_persp ||
18407ec681f3Smrg          key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple ||
18417ec681f3Smrg          key->ps_prolog.states.samplemask_log_ps_iter;
184201e04c3fSmrg}
1843af69d88dSmrg
184401e04c3fSmrg/**
18457ec681f3Smrg * Compute the PS epilog key, which contains all the information needed to
18467ec681f3Smrg * build the PS epilog function.
184701e04c3fSmrg */
18487ec681f3Smrgvoid si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key)
184901e04c3fSmrg{
18507ec681f3Smrg   struct si_shader_info *info = &shader->selector->info;
18517ec681f3Smrg   memset(key, 0, sizeof(*key));
18527ec681f3Smrg   key->ps_epilog.colors_written = info->colors_written;
18537ec681f3Smrg   key->ps_epilog.color_types = info->output_color_types;
18547ec681f3Smrg   key->ps_epilog.writes_z = info->writes_z;
18557ec681f3Smrg   key->ps_epilog.writes_stencil = info->writes_stencil;
18567ec681f3Smrg   key->ps_epilog.writes_samplemask = info->writes_samplemask;
18577ec681f3Smrg   key->ps_epilog.states = shader->key.part.ps.epilog;
1858af69d88dSmrg}
1859af69d88dSmrg
186001e04c3fSmrg/**
186101e04c3fSmrg * Select and compile (or reuse) pixel shader parts (prolog & epilog).
186201e04c3fSmrg */
18637ec681f3Smrgstatic bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
18647ec681f3Smrg                                      struct si_shader *shader, struct pipe_debug_callback *debug)
18657ec681f3Smrg{
18667ec681f3Smrg   union si_shader_part_key prolog_key;
18677ec681f3Smrg   union si_shader_part_key epilog_key;
18687ec681f3Smrg
18697ec681f3Smrg   /* Get the prolog. */
18707ec681f3Smrg   si_get_ps_prolog_key(shader, &prolog_key, true);
18717ec681f3Smrg
18727ec681f3Smrg   /* The prolog is a no-op if these aren't set. */
18737ec681f3Smrg   if (si_need_ps_prolog(&prolog_key)) {
18747ec681f3Smrg      shader->prolog =
18757ec681f3Smrg         si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key,
18767ec681f3Smrg                            compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog");
18777ec681f3Smrg      if (!shader->prolog)
18787ec681f3Smrg         return false;
18797ec681f3Smrg   }
18807ec681f3Smrg
18817ec681f3Smrg   /* Get the epilog. */
18827ec681f3Smrg   si_get_ps_epilog_key(shader, &epilog_key);
18837ec681f3Smrg
18847ec681f3Smrg   shader->epilog =
18857ec681f3Smrg      si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key,
18867ec681f3Smrg                         compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog");
18877ec681f3Smrg   if (!shader->epilog)
18887ec681f3Smrg      return false;
18897ec681f3Smrg
18907ec681f3Smrg   /* Enable POS_FIXED_PT if polygon stippling is enabled. */
18917ec681f3Smrg   if (shader->key.part.ps.prolog.poly_stipple) {
18927ec681f3Smrg      shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1);
18937ec681f3Smrg      assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr));
18947ec681f3Smrg   }
18957ec681f3Smrg
18967ec681f3Smrg   /* Set up the enable bits for per-sample shading if needed. */
18977ec681f3Smrg   if (shader->key.part.ps.prolog.force_persp_sample_interp &&
18987ec681f3Smrg       (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) ||
18997ec681f3Smrg        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
19007ec681f3Smrg      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA;
19017ec681f3Smrg      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
19027ec681f3Smrg      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1);
19037ec681f3Smrg   }
19047ec681f3Smrg   if (shader->key.part.ps.prolog.force_linear_sample_interp &&
19057ec681f3Smrg       (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) ||
19067ec681f3Smrg        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
19077ec681f3Smrg      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA;
19087ec681f3Smrg      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
19097ec681f3Smrg      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1);
19107ec681f3Smrg   }
19117ec681f3Smrg   if (shader->key.part.ps.prolog.force_persp_center_interp &&
19127ec681f3Smrg       (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
19137ec681f3Smrg        G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
19147ec681f3Smrg      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA;
19157ec681f3Smrg      shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA;
19167ec681f3Smrg      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
19177ec681f3Smrg   }
19187ec681f3Smrg   if (shader->key.part.ps.prolog.force_linear_center_interp &&
19197ec681f3Smrg       (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) ||
19207ec681f3Smrg        G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) {
19217ec681f3Smrg      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA;
19227ec681f3Smrg      shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA;
19237ec681f3Smrg      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
19247ec681f3Smrg   }
19257ec681f3Smrg
19267ec681f3Smrg   /* POW_W_FLOAT requires that one of the perspective weights is enabled. */
19277ec681f3Smrg   if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) &&
19287ec681f3Smrg       !(shader->config.spi_ps_input_ena & 0xf)) {
19297ec681f3Smrg      shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1);
19307ec681f3Smrg      assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr));
19317ec681f3Smrg   }
19327ec681f3Smrg
19337ec681f3Smrg   /* At least one pair of interpolation weights must be enabled. */
19347ec681f3Smrg   if (!(shader->config.spi_ps_input_ena & 0x7f)) {
19357ec681f3Smrg      shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1);
19367ec681f3Smrg      assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr));
19377ec681f3Smrg   }
19387ec681f3Smrg
19397ec681f3Smrg   /* Samplemask fixup requires the sample ID. */
19407ec681f3Smrg   if (shader->key.part.ps.prolog.samplemask_log_ps_iter) {
19417ec681f3Smrg      shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1);
19427ec681f3Smrg      assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr));
19437ec681f3Smrg   }
19447ec681f3Smrg
19457ec681f3Smrg   /* The sample mask input is always enabled, because the API shader always
19467ec681f3Smrg    * passes it through to the epilog. Disable it here if it's unused.
19477ec681f3Smrg    */
19487ec681f3Smrg   if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask)
19497ec681f3Smrg      shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA;
19507ec681f3Smrg
19517ec681f3Smrg   return true;
19527ec681f3Smrg}
19537ec681f3Smrg
19547ec681f3Smrgvoid si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size)
19557ec681f3Smrg{
19567ec681f3Smrg   /* If tessellation is all offchip and on-chip GS isn't used, this
19577ec681f3Smrg    * workaround is not needed.
19587ec681f3Smrg    */
19597ec681f3Smrg   return;
19607ec681f3Smrg
19617ec681f3Smrg   /* SPI barrier management bug:
19627ec681f3Smrg    *   Make sure we have at least 4k of LDS in use to avoid the bug.
19637ec681f3Smrg    *   It applies to workgroup sizes of more than one wavefront.
19647ec681f3Smrg    */
19657ec681f3Smrg   if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI)
19667ec681f3Smrg      *lds_size = MAX2(*lds_size, 8);
19677ec681f3Smrg}
19687ec681f3Smrg
19697ec681f3Smrgvoid si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader)
19707ec681f3Smrg{
19717ec681f3Smrg   unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */
19727ec681f3Smrg
19737ec681f3Smrg   shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs);
19747ec681f3Smrg
19757ec681f3Smrg   if (shader->selector->info.stage == MESA_SHADER_COMPUTE &&
19767ec681f3Smrg       si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) {
19777ec681f3Smrg      si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size);
19787ec681f3Smrg   }
19797ec681f3Smrg}
19807ec681f3Smrg
19817ec681f3Smrgbool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
19827ec681f3Smrg                              struct si_shader *shader, struct pipe_debug_callback *debug)
19837ec681f3Smrg{
19847ec681f3Smrg   struct si_shader_selector *sel = shader->selector;
19857ec681f3Smrg   struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key);
19867ec681f3Smrg
19877ec681f3Smrg   /* LS, ES, VS are compiled on demand if the main part hasn't been
19887ec681f3Smrg    * compiled for that stage.
19897ec681f3Smrg    *
19907ec681f3Smrg    * GS are compiled on demand if the main part hasn't been compiled
19917ec681f3Smrg    * for the chosen NGG-ness.
19927ec681f3Smrg    *
19937ec681f3Smrg    * Vertex shaders are compiled on demand when a vertex fetch
19947ec681f3Smrg    * workaround must be applied.
19957ec681f3Smrg    */
19967ec681f3Smrg   if (shader->is_monolithic) {
19977ec681f3Smrg      /* Monolithic shader (compiled as a whole, has many variants,
19987ec681f3Smrg       * may take a long time to compile).
19997ec681f3Smrg       */
20007ec681f3Smrg      if (!si_compile_shader(sscreen, compiler, shader, debug))
20017ec681f3Smrg         return false;
20027ec681f3Smrg   } else {
20037ec681f3Smrg      /* The shader consists of several parts:
20047ec681f3Smrg       *
20057ec681f3Smrg       * - the middle part is the user shader, it has 1 variant only
20067ec681f3Smrg       *   and it was compiled during the creation of the shader
20077ec681f3Smrg       *   selector
20087ec681f3Smrg       * - the prolog part is inserted at the beginning
20097ec681f3Smrg       * - the epilog part is inserted at the end
20107ec681f3Smrg       *
20117ec681f3Smrg       * The prolog and epilog have many (but simple) variants.
20127ec681f3Smrg       *
20137ec681f3Smrg       * Starting with gfx9, geometry and tessellation control
20147ec681f3Smrg       * shaders also contain the prolog and user shader parts of
20157ec681f3Smrg       * the previous shader stage.
20167ec681f3Smrg       */
20177ec681f3Smrg
20187ec681f3Smrg      if (!mainp)
20197ec681f3Smrg         return false;
20207ec681f3Smrg
20217ec681f3Smrg      /* Copy the compiled shader data over. */
20227ec681f3Smrg      shader->is_binary_shared = true;
20237ec681f3Smrg      shader->binary = mainp->binary;
20247ec681f3Smrg      shader->config = mainp->config;
20257ec681f3Smrg      shader->info.num_input_sgprs = mainp->info.num_input_sgprs;
20267ec681f3Smrg      shader->info.num_input_vgprs = mainp->info.num_input_vgprs;
20277ec681f3Smrg      shader->info.face_vgpr_index = mainp->info.face_vgpr_index;
20287ec681f3Smrg      shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index;
20297ec681f3Smrg      memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl,
20307ec681f3Smrg             sizeof(mainp->info.vs_output_ps_input_cntl));
20317ec681f3Smrg      shader->info.uses_instanceid = mainp->info.uses_instanceid;
20327ec681f3Smrg      shader->info.nr_pos_exports = mainp->info.nr_pos_exports;
20337ec681f3Smrg      shader->info.nr_param_exports = mainp->info.nr_param_exports;
20347ec681f3Smrg
20357ec681f3Smrg      /* Select prologs and/or epilogs. */
20367ec681f3Smrg      switch (sel->info.stage) {
20377ec681f3Smrg      case MESA_SHADER_VERTEX:
20387ec681f3Smrg         if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
20397ec681f3Smrg            return false;
20407ec681f3Smrg         break;
20417ec681f3Smrg      case MESA_SHADER_TESS_CTRL:
20427ec681f3Smrg         if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
20437ec681f3Smrg            return false;
20447ec681f3Smrg         break;
20457ec681f3Smrg      case MESA_SHADER_TESS_EVAL:
20467ec681f3Smrg         break;
20477ec681f3Smrg      case MESA_SHADER_GEOMETRY:
20487ec681f3Smrg         if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
20497ec681f3Smrg            return false;
20507ec681f3Smrg         break;
20517ec681f3Smrg      case MESA_SHADER_FRAGMENT:
20527ec681f3Smrg         if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
20537ec681f3Smrg            return false;
20547ec681f3Smrg
20557ec681f3Smrg         /* Make sure we have at least as many VGPRs as there
20567ec681f3Smrg          * are allocated inputs.
20577ec681f3Smrg          */
20587ec681f3Smrg         shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs);
20597ec681f3Smrg         break;
20607ec681f3Smrg      default:;
20617ec681f3Smrg      }
20627ec681f3Smrg
20637ec681f3Smrg      /* Update SGPR and VGPR counts. */
20647ec681f3Smrg      if (shader->prolog) {
20657ec681f3Smrg         shader->config.num_sgprs =
20667ec681f3Smrg            MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs);
20677ec681f3Smrg         shader->config.num_vgprs =
20687ec681f3Smrg            MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs);
20697ec681f3Smrg      }
20707ec681f3Smrg      if (shader->previous_stage) {
20717ec681f3Smrg         shader->config.num_sgprs =
20727ec681f3Smrg            MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs);
20737ec681f3Smrg         shader->config.num_vgprs =
20747ec681f3Smrg            MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs);
20757ec681f3Smrg         shader->config.spilled_sgprs =
20767ec681f3Smrg            MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs);
20777ec681f3Smrg         shader->config.spilled_vgprs =
20787ec681f3Smrg            MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs);
20797ec681f3Smrg         shader->info.private_mem_vgprs =
20807ec681f3Smrg            MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs);
20817ec681f3Smrg         shader->config.scratch_bytes_per_wave =
20827ec681f3Smrg            MAX2(shader->config.scratch_bytes_per_wave,
20837ec681f3Smrg                 shader->previous_stage->config.scratch_bytes_per_wave);
20847ec681f3Smrg         shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid;
20857ec681f3Smrg      }
20867ec681f3Smrg      if (shader->prolog2) {
20877ec681f3Smrg         shader->config.num_sgprs =
20887ec681f3Smrg            MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs);
20897ec681f3Smrg         shader->config.num_vgprs =
20907ec681f3Smrg            MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs);
20917ec681f3Smrg      }
20927ec681f3Smrg      if (shader->epilog) {
20937ec681f3Smrg         shader->config.num_sgprs =
20947ec681f3Smrg            MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs);
20957ec681f3Smrg         shader->config.num_vgprs =
20967ec681f3Smrg            MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs);
20977ec681f3Smrg      }
20987ec681f3Smrg      si_calculate_max_simd_waves(shader);
20997ec681f3Smrg   }
21007ec681f3Smrg
21017ec681f3Smrg   if (shader->key.as_ngg) {
21027ec681f3Smrg      assert(!shader->key.as_es && !shader->key.as_ls);
21037ec681f3Smrg      if (!gfx10_ngg_calculate_subgroup_info(shader)) {
21047ec681f3Smrg         fprintf(stderr, "Failed to compute subgroup info\n");
21057ec681f3Smrg         return false;
21067ec681f3Smrg      }
21077ec681f3Smrg   } else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) {
21087ec681f3Smrg      gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info);
21097ec681f3Smrg   }
21107ec681f3Smrg
21117ec681f3Smrg   shader->uses_vs_state_provoking_vertex =
21127ec681f3Smrg      sscreen->use_ngg &&
21137ec681f3Smrg      /* Used to convert triangle strips from GS to triangles. */
21147ec681f3Smrg      ((sel->info.stage == MESA_SHADER_GEOMETRY &&
21157ec681f3Smrg        util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) ||
21167ec681f3Smrg       (sel->info.stage == MESA_SHADER_VERTEX &&
21177ec681f3Smrg        /* Used to export PrimitiveID from the correct vertex. */
21187ec681f3Smrg        shader->key.mono.u.vs_export_prim_id));
21197ec681f3Smrg
21207ec681f3Smrg   shader->uses_vs_state_outprim = sscreen->use_ngg &&
21217ec681f3Smrg                                   /* Only used by streamout in vertex shaders. */
21227ec681f3Smrg                                   sel->info.stage == MESA_SHADER_VERTEX &&
21237ec681f3Smrg                                   sel->so.num_outputs;
21247ec681f3Smrg
21257ec681f3Smrg   if (sel->info.stage == MESA_SHADER_VERTEX) {
21267ec681f3Smrg      shader->uses_base_instance = sel->info.uses_base_instance ||
21277ec681f3Smrg                                   shader->key.part.vs.prolog.instance_divisor_is_one ||
21287ec681f3Smrg                                   shader->key.part.vs.prolog.instance_divisor_is_fetched;
21297ec681f3Smrg   } else if (sel->info.stage == MESA_SHADER_TESS_CTRL) {
21307ec681f3Smrg      shader->uses_base_instance = shader->previous_stage_sel &&
21317ec681f3Smrg                                   (shader->previous_stage_sel->info.uses_base_instance ||
21327ec681f3Smrg                                    shader->key.part.tcs.ls_prolog.instance_divisor_is_one ||
21337ec681f3Smrg                                    shader->key.part.tcs.ls_prolog.instance_divisor_is_fetched);
21347ec681f3Smrg   } else if (sel->info.stage == MESA_SHADER_GEOMETRY) {
21357ec681f3Smrg      shader->uses_base_instance = shader->previous_stage_sel &&
21367ec681f3Smrg                                   (shader->previous_stage_sel->info.uses_base_instance ||
21377ec681f3Smrg                                    shader->key.part.gs.vs_prolog.instance_divisor_is_one ||
21387ec681f3Smrg                                    shader->key.part.gs.vs_prolog.instance_divisor_is_fetched);
21397ec681f3Smrg   }
21407ec681f3Smrg
21417ec681f3Smrg   si_fix_resource_usage(sscreen, shader);
21427ec681f3Smrg   si_shader_dump(sscreen, shader, debug, stderr, true);
21437ec681f3Smrg
21447ec681f3Smrg   /* Upload. */
21457ec681f3Smrg   if (!si_shader_binary_upload(sscreen, shader, 0)) {
21467ec681f3Smrg      fprintf(stderr, "LLVM failed to upload shader\n");
21477ec681f3Smrg      return false;
21487ec681f3Smrg   }
21497ec681f3Smrg
21507ec681f3Smrg   return true;
21517ec681f3Smrg}
21527ec681f3Smrg
21537ec681f3Smrgvoid si_shader_binary_clean(struct si_shader_binary *binary)
21547ec681f3Smrg{
21557ec681f3Smrg   free((void *)binary->elf_buffer);
21567ec681f3Smrg   binary->elf_buffer = NULL;
21577ec681f3Smrg
21587ec681f3Smrg   free(binary->llvm_ir_string);
21597ec681f3Smrg   binary->llvm_ir_string = NULL;
21607ec681f3Smrg
21617ec681f3Smrg   free(binary->uploaded_code);
21627ec681f3Smrg   binary->uploaded_code = NULL;
21637ec681f3Smrg   binary->uploaded_code_size = 0;
2164af69d88dSmrg}
2165af69d88dSmrg
216601e04c3fSmrgvoid si_shader_destroy(struct si_shader *shader)
2167af69d88dSmrg{
21687ec681f3Smrg   if (shader->scratch_bo)
21697ec681f3Smrg      si_resource_reference(&shader->scratch_bo, NULL);
2170af69d88dSmrg
21717ec681f3Smrg   si_resource_reference(&shader->bo, NULL);
217201e04c3fSmrg
21737ec681f3Smrg   if (!shader->is_binary_shared)
21747ec681f3Smrg      si_shader_binary_clean(&shader->binary);
217501e04c3fSmrg
21767ec681f3Smrg   free(shader->shader_log);
2177af69d88dSmrg}
2178