1af69d88dSmrg/* 2af69d88dSmrg * Copyright 2012 Advanced Micro Devices, Inc. 301e04c3fSmrg * All Rights Reserved. 4af69d88dSmrg * 5af69d88dSmrg * Permission is hereby granted, free of charge, to any person obtaining a 6af69d88dSmrg * copy of this software and associated documentation files (the "Software"), 7af69d88dSmrg * to deal in the Software without restriction, including without limitation 8af69d88dSmrg * on the rights to use, copy, modify, merge, publish, distribute, sub 9af69d88dSmrg * license, and/or sell copies of the Software, and to permit persons to whom 10af69d88dSmrg * the Software is furnished to do so, subject to the following conditions: 11af69d88dSmrg * 12af69d88dSmrg * The above copyright notice and this permission notice (including the next 13af69d88dSmrg * paragraph) shall be included in all copies or substantial portions of the 14af69d88dSmrg * Software. 15af69d88dSmrg * 16af69d88dSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17af69d88dSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18af69d88dSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19af69d88dSmrg * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20af69d88dSmrg * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21af69d88dSmrg * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22af69d88dSmrg * USE OR OTHER DEALINGS IN THE SOFTWARE. 23af69d88dSmrg */ 24af69d88dSmrg 257ec681f3Smrg#include "ac_exp_param.h" 267ec681f3Smrg#include "ac_rtld.h" 277ec681f3Smrg#include "compiler/nir/nir.h" 287ec681f3Smrg#include "compiler/nir/nir_serialize.h" 297ec681f3Smrg#include "si_pipe.h" 307ec681f3Smrg#include "si_shader_internal.h" 317ec681f3Smrg#include "sid.h" 327ec681f3Smrg#include "tgsi/tgsi_from_mesa.h" 337ec681f3Smrg#include "tgsi/tgsi_strings.h" 347ec681f3Smrg#include "util/u_memory.h" 3501e04c3fSmrg 367ec681f3Smrgstatic const char scratch_rsrc_dword0_symbol[] = "SCRATCH_RSRC_DWORD0"; 3701e04c3fSmrg 387ec681f3Smrgstatic const char scratch_rsrc_dword1_symbol[] = "SCRATCH_RSRC_DWORD1"; 3901e04c3fSmrg 407ec681f3Smrgstatic void si_dump_shader_key(const struct si_shader *shader, FILE *f); 4101e04c3fSmrg 427ec681f3Smrg/** Whether the shader runs as a combination of multiple API shaders */ 437ec681f3Smrgbool si_is_multi_part_shader(struct si_shader *shader) 4401e04c3fSmrg{ 457ec681f3Smrg if (shader->selector->screen->info.chip_class <= GFX8) 467ec681f3Smrg return false; 4701e04c3fSmrg 487ec681f3Smrg return shader->key.as_ls || shader->key.as_es || 497ec681f3Smrg shader->selector->info.stage == MESA_SHADER_TESS_CTRL || 507ec681f3Smrg shader->selector->info.stage == MESA_SHADER_GEOMETRY; 5101e04c3fSmrg} 5201e04c3fSmrg 537ec681f3Smrg/** Whether the shader runs on a merged HW stage (LSHS or ESGS) */ 547ec681f3Smrgbool si_is_merged_shader(struct si_shader *shader) 5501e04c3fSmrg{ 567ec681f3Smrg return shader->key.as_ngg || si_is_multi_part_shader(shader); 5701e04c3fSmrg} 5801e04c3fSmrg 5901e04c3fSmrg/** 607ec681f3Smrg * Returns a unique index for a per-patch semantic name and index. The index 617ec681f3Smrg * must be less than 32, so that a 32-bit bitmask of used inputs or outputs 627ec681f3Smrg * can be calculated. 6301e04c3fSmrg */ 647ec681f3Smrgunsigned si_shader_io_get_unique_index_patch(unsigned semantic) 6501e04c3fSmrg{ 667ec681f3Smrg switch (semantic) { 677ec681f3Smrg case VARYING_SLOT_TESS_LEVEL_OUTER: 687ec681f3Smrg return 0; 697ec681f3Smrg case VARYING_SLOT_TESS_LEVEL_INNER: 707ec681f3Smrg return 1; 717ec681f3Smrg default: 727ec681f3Smrg if (semantic >= VARYING_SLOT_PATCH0 && semantic < VARYING_SLOT_PATCH0 + 30) 737ec681f3Smrg return 2 + (semantic - VARYING_SLOT_PATCH0); 7401e04c3fSmrg 757ec681f3Smrg assert(!"invalid semantic"); 767ec681f3Smrg return 0; 777ec681f3Smrg } 7801e04c3fSmrg} 7901e04c3fSmrg 807ec681f3Smrg/** 817ec681f3Smrg * Returns a unique index for a semantic name and index. The index must be 827ec681f3Smrg * less than 64, so that a 64-bit bitmask of used inputs or outputs can be 837ec681f3Smrg * calculated. 847ec681f3Smrg */ 857ec681f3Smrgunsigned si_shader_io_get_unique_index(unsigned semantic, bool is_varying) 867ec681f3Smrg{ 877ec681f3Smrg switch (semantic) { 887ec681f3Smrg case VARYING_SLOT_POS: 897ec681f3Smrg return 0; 907ec681f3Smrg default: 917ec681f3Smrg /* Since some shader stages use the highest used IO index 927ec681f3Smrg * to determine the size to allocate for inputs/outputs 937ec681f3Smrg * (in LDS, tess and GS rings). GENERIC should be placed right 947ec681f3Smrg * after POSITION to make that size as small as possible. 957ec681f3Smrg */ 967ec681f3Smrg if (semantic >= VARYING_SLOT_VAR0 && semantic <= VARYING_SLOT_VAR31) 977ec681f3Smrg return 1 + (semantic - VARYING_SLOT_VAR0); /* 1..32 */ 987ec681f3Smrg 997ec681f3Smrg /* Put 16-bit GLES varyings after 32-bit varyings. They can use the same indices as 1007ec681f3Smrg * legacy desktop GL varyings because they are mutually exclusive. 1017ec681f3Smrg */ 1027ec681f3Smrg if (semantic >= VARYING_SLOT_VAR0_16BIT && semantic <= VARYING_SLOT_VAR15_16BIT) 1037ec681f3Smrg return 33 + (semantic - VARYING_SLOT_VAR0_16BIT); /* 33..48 */ 1047ec681f3Smrg 1057ec681f3Smrg assert(!"invalid generic index"); 1067ec681f3Smrg return 0; 1077ec681f3Smrg 1087ec681f3Smrg /* Legacy desktop GL varyings. */ 1097ec681f3Smrg case VARYING_SLOT_FOGC: 1107ec681f3Smrg return 33; 1117ec681f3Smrg case VARYING_SLOT_COL0: 1127ec681f3Smrg return 34; 1137ec681f3Smrg case VARYING_SLOT_COL1: 1147ec681f3Smrg return 35; 1157ec681f3Smrg case VARYING_SLOT_BFC0: 1167ec681f3Smrg /* If it's a varying, COLOR and BCOLOR alias. */ 1177ec681f3Smrg if (is_varying) 1187ec681f3Smrg return 34; 1197ec681f3Smrg else 1207ec681f3Smrg return 36; 1217ec681f3Smrg case VARYING_SLOT_BFC1: 1227ec681f3Smrg if (is_varying) 1237ec681f3Smrg return 35; 1247ec681f3Smrg else 1257ec681f3Smrg return 37; 1267ec681f3Smrg case VARYING_SLOT_TEX0: 1277ec681f3Smrg case VARYING_SLOT_TEX1: 1287ec681f3Smrg case VARYING_SLOT_TEX2: 1297ec681f3Smrg case VARYING_SLOT_TEX3: 1307ec681f3Smrg case VARYING_SLOT_TEX4: 1317ec681f3Smrg case VARYING_SLOT_TEX5: 1327ec681f3Smrg case VARYING_SLOT_TEX6: 1337ec681f3Smrg case VARYING_SLOT_TEX7: 1347ec681f3Smrg return 38 + (semantic - VARYING_SLOT_TEX0); 1357ec681f3Smrg case VARYING_SLOT_CLIP_VERTEX: 1367ec681f3Smrg return 46; 1377ec681f3Smrg 1387ec681f3Smrg /* Varyings present in both GLES and desktop GL must start at 49 after 16-bit varyings. */ 1397ec681f3Smrg case VARYING_SLOT_CLIP_DIST0: 1407ec681f3Smrg return 49; 1417ec681f3Smrg case VARYING_SLOT_CLIP_DIST1: 1427ec681f3Smrg return 50; 1437ec681f3Smrg case VARYING_SLOT_PSIZ: 1447ec681f3Smrg return 51; 1457ec681f3Smrg 1467ec681f3Smrg /* These can't be written by LS, HS, and ES. */ 1477ec681f3Smrg case VARYING_SLOT_LAYER: 1487ec681f3Smrg return 52; 1497ec681f3Smrg case VARYING_SLOT_VIEWPORT: 1507ec681f3Smrg return 53; 1517ec681f3Smrg case VARYING_SLOT_PRIMITIVE_ID: 1527ec681f3Smrg return 54; 1537ec681f3Smrg } 15401e04c3fSmrg} 15501e04c3fSmrg 1567ec681f3Smrgstatic void si_dump_streamout(struct pipe_stream_output_info *so) 15701e04c3fSmrg{ 1587ec681f3Smrg unsigned i; 15901e04c3fSmrg 1607ec681f3Smrg if (so->num_outputs) 1617ec681f3Smrg fprintf(stderr, "STREAMOUT\n"); 16201e04c3fSmrg 1637ec681f3Smrg for (i = 0; i < so->num_outputs; i++) { 1647ec681f3Smrg unsigned mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component; 1657ec681f3Smrg fprintf(stderr, " %i: BUF%i[%i..%i] <- OUT[%i].%s%s%s%s\n", i, so->output[i].output_buffer, 1667ec681f3Smrg so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1, 1677ec681f3Smrg so->output[i].register_index, mask & 1 ? "x" : "", mask & 2 ? "y" : "", 1687ec681f3Smrg mask & 4 ? "z" : "", mask & 8 ? "w" : ""); 1697ec681f3Smrg } 17001e04c3fSmrg} 17101e04c3fSmrg 1727ec681f3Smrgstatic void declare_streamout_params(struct si_shader_context *ctx, 1737ec681f3Smrg struct pipe_stream_output_info *so) 17401e04c3fSmrg{ 1757ec681f3Smrg if (ctx->screen->use_ngg_streamout) { 1767ec681f3Smrg if (ctx->stage == MESA_SHADER_TESS_EVAL) 1777ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 1787ec681f3Smrg return; 1797ec681f3Smrg } 18001e04c3fSmrg 1817ec681f3Smrg /* Streamout SGPRs. */ 1827ec681f3Smrg if (so->num_outputs) { 1837ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_config); 1847ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_write_index); 1857ec681f3Smrg } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { 1867ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 1877ec681f3Smrg } 18801e04c3fSmrg 1897ec681f3Smrg /* A streamout buffer offset is loaded if the stride is non-zero. */ 1907ec681f3Smrg for (int i = 0; i < 4; i++) { 1917ec681f3Smrg if (!so->stride[i]) 1927ec681f3Smrg continue; 19301e04c3fSmrg 1947ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.streamout_offset[i]); 1957ec681f3Smrg } 19601e04c3fSmrg} 19701e04c3fSmrg 1987ec681f3Smrgunsigned si_get_max_workgroup_size(const struct si_shader *shader) 19901e04c3fSmrg{ 2007ec681f3Smrg switch (shader->selector->info.stage) { 2017ec681f3Smrg case MESA_SHADER_VERTEX: 2027ec681f3Smrg case MESA_SHADER_TESS_EVAL: 2037ec681f3Smrg return shader->key.as_ngg ? 128 : 0; 20401e04c3fSmrg 2057ec681f3Smrg case MESA_SHADER_TESS_CTRL: 2067ec681f3Smrg /* Return this so that LLVM doesn't remove s_barrier 2077ec681f3Smrg * instructions on chips where we use s_barrier. */ 2087ec681f3Smrg return shader->selector->screen->info.chip_class >= GFX7 ? 128 : 0; 20901e04c3fSmrg 2107ec681f3Smrg case MESA_SHADER_GEOMETRY: 2117ec681f3Smrg return shader->selector->screen->info.chip_class >= GFX9 ? 128 : 0; 21201e04c3fSmrg 2137ec681f3Smrg case MESA_SHADER_COMPUTE: 2147ec681f3Smrg break; /* see below */ 21501e04c3fSmrg 2167ec681f3Smrg default: 2177ec681f3Smrg return 0; 2187ec681f3Smrg } 21901e04c3fSmrg 2207ec681f3Smrg /* Compile a variable block size using the maximum variable size. */ 2217ec681f3Smrg if (shader->selector->info.base.workgroup_size_variable) 2227ec681f3Smrg return SI_MAX_VARIABLE_THREADS_PER_BLOCK; 22301e04c3fSmrg 2247ec681f3Smrg uint16_t *local_size = shader->selector->info.base.workgroup_size; 2257ec681f3Smrg unsigned max_work_group_size = (uint32_t)local_size[0] * 2267ec681f3Smrg (uint32_t)local_size[1] * 2277ec681f3Smrg (uint32_t)local_size[2]; 2287ec681f3Smrg assert(max_work_group_size); 2297ec681f3Smrg return max_work_group_size; 23001e04c3fSmrg} 23101e04c3fSmrg 2327ec681f3Smrgstatic void declare_const_and_shader_buffers(struct si_shader_context *ctx, bool assign_params) 23301e04c3fSmrg{ 2347ec681f3Smrg enum ac_arg_type const_shader_buf_type; 23501e04c3fSmrg 2367ec681f3Smrg if (ctx->shader->selector->info.base.num_ubos == 1 && 2377ec681f3Smrg ctx->shader->selector->info.base.num_ssbos == 0) 2387ec681f3Smrg const_shader_buf_type = AC_ARG_CONST_FLOAT_PTR; 2397ec681f3Smrg else 2407ec681f3Smrg const_shader_buf_type = AC_ARG_CONST_DESC_PTR; 24101e04c3fSmrg 2427ec681f3Smrg ac_add_arg( 2437ec681f3Smrg &ctx->args, AC_ARG_SGPR, 1, const_shader_buf_type, 2447ec681f3Smrg assign_params ? &ctx->const_and_shader_buffers : &ctx->other_const_and_shader_buffers); 24501e04c3fSmrg} 24601e04c3fSmrg 2477ec681f3Smrgstatic void declare_samplers_and_images(struct si_shader_context *ctx, bool assign_params) 24801e04c3fSmrg{ 2497ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, 2507ec681f3Smrg assign_params ? &ctx->samplers_and_images : &ctx->other_samplers_and_images); 25101e04c3fSmrg} 25201e04c3fSmrg 2537ec681f3Smrgstatic void declare_per_stage_desc_pointers(struct si_shader_context *ctx, bool assign_params) 25401e04c3fSmrg{ 2557ec681f3Smrg declare_const_and_shader_buffers(ctx, assign_params); 2567ec681f3Smrg declare_samplers_and_images(ctx, assign_params); 25701e04c3fSmrg} 25801e04c3fSmrg 2597ec681f3Smrgstatic void declare_global_desc_pointers(struct si_shader_context *ctx) 26001e04c3fSmrg{ 2617ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->internal_bindings); 2627ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, 2637ec681f3Smrg &ctx->bindless_samplers_and_images); 26401e04c3fSmrg} 26501e04c3fSmrg 2667ec681f3Smrgstatic void declare_vs_specific_input_sgprs(struct si_shader_context *ctx) 26701e04c3fSmrg{ 2687ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); 2697ec681f3Smrg if (!ctx->shader->is_gs_copy_shader) { 2707ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.base_vertex); 2717ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.draw_id); 2727ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.start_instance); 2737ec681f3Smrg } 27401e04c3fSmrg} 27501e04c3fSmrg 2767ec681f3Smrgstatic void declare_vb_descriptor_input_sgprs(struct si_shader_context *ctx) 27701e04c3fSmrg{ 2787ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, &ctx->args.vertex_buffers); 27901e04c3fSmrg 2807ec681f3Smrg unsigned num_vbos_in_user_sgprs = ctx->shader->selector->num_vbos_in_user_sgprs; 2817ec681f3Smrg if (num_vbos_in_user_sgprs) { 2827ec681f3Smrg unsigned user_sgprs = ctx->args.num_sgprs_used; 28301e04c3fSmrg 2847ec681f3Smrg if (si_is_merged_shader(ctx->shader)) 2857ec681f3Smrg user_sgprs -= 8; 2867ec681f3Smrg assert(user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); 28701e04c3fSmrg 2887ec681f3Smrg /* Declare unused SGPRs to align VB descriptors to 4 SGPRs (hw requirement). */ 2897ec681f3Smrg for (unsigned i = user_sgprs; i < SI_SGPR_VS_VB_DESCRIPTOR_FIRST; i++) 2907ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ 29101e04c3fSmrg 2927ec681f3Smrg assert(num_vbos_in_user_sgprs <= ARRAY_SIZE(ctx->vb_descriptors)); 2937ec681f3Smrg for (unsigned i = 0; i < num_vbos_in_user_sgprs; i++) 2947ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->vb_descriptors[i]); 2957ec681f3Smrg } 29601e04c3fSmrg} 29701e04c3fSmrg 2987ec681f3Smrgstatic void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs) 29901e04c3fSmrg{ 3007ec681f3Smrg struct si_shader *shader = ctx->shader; 30101e04c3fSmrg 3027ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vertex_id); 3037ec681f3Smrg if (shader->key.as_ls) { 3047ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_rel_patch_id); 3057ec681f3Smrg if (ctx->screen->info.chip_class >= GFX10) { 3067ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ 3077ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); 3087ec681f3Smrg } else { 3097ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); 3107ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ 3117ec681f3Smrg } 3127ec681f3Smrg } else if (ctx->screen->info.chip_class >= GFX10) { 3137ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* user VGPR */ 3147ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, 3157ec681f3Smrg &ctx->args.vs_prim_id); /* user vgpr or PrimID (legacy) */ 3167ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); 3177ec681f3Smrg } else { 3187ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.instance_id); 3197ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.vs_prim_id); 3207ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); /* unused */ 3217ec681f3Smrg } 32201e04c3fSmrg 3237ec681f3Smrg if (!shader->is_gs_copy_shader) { 3247ec681f3Smrg /* Vertex load indices. */ 3257ec681f3Smrg if (shader->selector->info.num_inputs) { 3267ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0); 3277ec681f3Smrg for (unsigned i = 1; i < shader->selector->info.num_inputs; i++) 3287ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, NULL); 3297ec681f3Smrg } 3307ec681f3Smrg *num_prolog_vgprs += shader->selector->info.num_inputs; 3317ec681f3Smrg } 33201e04c3fSmrg} 33301e04c3fSmrg 3347ec681f3Smrgstatic void declare_vs_blit_inputs(struct si_shader_context *ctx, unsigned vs_blit_property) 33501e04c3fSmrg{ 3367ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_blit_inputs); /* i16 x1, y1 */ 3377ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* i16 x1, y1 */ 3387ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* depth */ 33901e04c3fSmrg 3407ec681f3Smrg if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_COLOR) { 3417ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color0 */ 3427ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color1 */ 3437ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color2 */ 3447ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* color3 */ 3457ec681f3Smrg } else if (vs_blit_property == SI_VS_BLIT_SGPRS_POS_TEXCOORD) { 3467ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x1 */ 3477ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y1 */ 3487ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.x2 */ 3497ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.y2 */ 3507ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.z */ 3517ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, NULL); /* texcoord.w */ 3527ec681f3Smrg } 3539f464c52Smaya} 3549f464c52Smaya 3557ec681f3Smrgstatic void declare_tes_input_vgprs(struct si_shader_context *ctx) 35601e04c3fSmrg{ 3577ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_u); 3587ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.tes_v); 3597ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_rel_patch_id); 3607ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id); 3617ec681f3Smrg} 3627ec681f3Smrg 3637ec681f3Smrgenum 3647ec681f3Smrg{ 3657ec681f3Smrg /* Convenient merged shader definitions. */ 3667ec681f3Smrg SI_SHADER_MERGED_VERTEX_TESSCTRL = MESA_ALL_SHADER_STAGES, 3677ec681f3Smrg SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY, 36801e04c3fSmrg}; 36901e04c3fSmrg 3707ec681f3Smrgvoid si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers, 3717ec681f3Smrg enum ac_arg_type type, struct ac_arg *arg, unsigned idx) 3727ec681f3Smrg{ 3737ec681f3Smrg assert(args->arg_count == idx); 3747ec681f3Smrg ac_add_arg(args, file, registers, type, arg); 3757ec681f3Smrg} 3767ec681f3Smrg 3777ec681f3Smrgvoid si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) 3787ec681f3Smrg{ 3797ec681f3Smrg struct si_shader *shader = ctx->shader; 3807ec681f3Smrg unsigned i, num_returns, num_return_sgprs; 3817ec681f3Smrg unsigned num_prolog_vgprs = 0; 3827ec681f3Smrg unsigned stage = ctx->stage; 3837ec681f3Smrg 3847ec681f3Smrg memset(&ctx->args, 0, sizeof(ctx->args)); 3857ec681f3Smrg 3867ec681f3Smrg /* Set MERGED shaders. */ 3877ec681f3Smrg if (ctx->screen->info.chip_class >= GFX9) { 3887ec681f3Smrg if (shader->key.as_ls || stage == MESA_SHADER_TESS_CTRL) 3897ec681f3Smrg stage = SI_SHADER_MERGED_VERTEX_TESSCTRL; /* LS or HS */ 3907ec681f3Smrg else if (shader->key.as_es || shader->key.as_ngg || stage == MESA_SHADER_GEOMETRY) 3917ec681f3Smrg stage = SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY; 3927ec681f3Smrg } 3937ec681f3Smrg 3947ec681f3Smrg switch (stage) { 3957ec681f3Smrg case MESA_SHADER_VERTEX: 3967ec681f3Smrg declare_global_desc_pointers(ctx); 3977ec681f3Smrg 3987ec681f3Smrg if (shader->selector->info.base.vs.blit_sgprs_amd) { 3997ec681f3Smrg declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd); 4007ec681f3Smrg 4017ec681f3Smrg /* VGPRs */ 4027ec681f3Smrg declare_vs_input_vgprs(ctx, &num_prolog_vgprs); 4037ec681f3Smrg break; 4047ec681f3Smrg } 4057ec681f3Smrg 4067ec681f3Smrg declare_per_stage_desc_pointers(ctx, true); 4077ec681f3Smrg declare_vs_specific_input_sgprs(ctx); 4087ec681f3Smrg if (!shader->is_gs_copy_shader) 4097ec681f3Smrg declare_vb_descriptor_input_sgprs(ctx); 4107ec681f3Smrg 4117ec681f3Smrg if (shader->key.as_es) { 4127ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset); 4137ec681f3Smrg } else if (shader->key.as_ls) { 4147ec681f3Smrg /* no extra parameters */ 4157ec681f3Smrg } else { 4167ec681f3Smrg /* The locations of the other parameters are assigned dynamically. */ 4177ec681f3Smrg declare_streamout_params(ctx, &shader->selector->so); 4187ec681f3Smrg } 4197ec681f3Smrg 4207ec681f3Smrg /* VGPRs */ 4217ec681f3Smrg declare_vs_input_vgprs(ctx, &num_prolog_vgprs); 4227ec681f3Smrg break; 4237ec681f3Smrg 4247ec681f3Smrg case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */ 4257ec681f3Smrg declare_global_desc_pointers(ctx); 4267ec681f3Smrg declare_per_stage_desc_pointers(ctx, true); 4277ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); 4287ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); 4297ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); 4307ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); 4317ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 4327ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset); 4337ec681f3Smrg 4347ec681f3Smrg /* VGPRs */ 4357ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); 4367ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); 4377ec681f3Smrg 4387ec681f3Smrg /* param_tcs_offchip_offset and param_tcs_factor_offset are 4397ec681f3Smrg * placed after the user SGPRs. 4407ec681f3Smrg */ 4417ec681f3Smrg for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) 4427ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_SGPR); 4437ec681f3Smrg for (i = 0; i < 11; i++) 4447ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_VGPR); 4457ec681f3Smrg break; 4467ec681f3Smrg 4477ec681f3Smrg case SI_SHADER_MERGED_VERTEX_TESSCTRL: 4487ec681f3Smrg /* Merged stages have 8 system SGPRs at the beginning. */ 4497ec681f3Smrg /* SPI_SHADER_USER_DATA_ADDR_LO/HI_HS */ 4507ec681f3Smrg declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_TESS_CTRL); 4517ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 4527ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info); 4537ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tcs_factor_offset); 4547ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset); 4557ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ 4567ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); /* unused */ 4577ec681f3Smrg 4587ec681f3Smrg declare_global_desc_pointers(ctx); 4597ec681f3Smrg declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_VERTEX); 4607ec681f3Smrg declare_vs_specific_input_sgprs(ctx); 4617ec681f3Smrg 4627ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); 4637ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_offsets); 4647ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_out_lds_layout); 4657ec681f3Smrg if (ctx->stage == MESA_SHADER_VERTEX) 4667ec681f3Smrg declare_vb_descriptor_input_sgprs(ctx); 4677ec681f3Smrg 4687ec681f3Smrg /* VGPRs (first TCS, then VS) */ 4697ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_patch_id); 4707ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids); 4717ec681f3Smrg 4727ec681f3Smrg if (ctx->stage == MESA_SHADER_VERTEX) { 4737ec681f3Smrg declare_vs_input_vgprs(ctx, &num_prolog_vgprs); 4747ec681f3Smrg 4757ec681f3Smrg /* LS return values are inputs to the TCS main shader part. */ 4767ec681f3Smrg for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++) 4777ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_SGPR); 4787ec681f3Smrg for (i = 0; i < 2; i++) 4797ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_VGPR); 4807ec681f3Smrg 4817ec681f3Smrg /* VS outputs passed via VGPRs to TCS. */ 4827ec681f3Smrg if (shader->key.opt.same_patch_vertices) { 4837ec681f3Smrg unsigned num_outputs = util_last_bit64(shader->selector->outputs_written); 4847ec681f3Smrg for (i = 0; i < num_outputs * 4; i++) 4857ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_VGPR); 4867ec681f3Smrg } 4877ec681f3Smrg } else { 4887ec681f3Smrg /* TCS inputs are passed via VGPRs from VS. */ 4897ec681f3Smrg if (shader->key.opt.same_patch_vertices) { 4907ec681f3Smrg unsigned num_inputs = util_last_bit64(shader->previous_stage_sel->outputs_written); 4917ec681f3Smrg for (i = 0; i < num_inputs * 4; i++) 4927ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); 4937ec681f3Smrg } 4947ec681f3Smrg 4957ec681f3Smrg /* TCS return values are inputs to the TCS epilog. 4967ec681f3Smrg * 4977ec681f3Smrg * param_tcs_offchip_offset, param_tcs_factor_offset, 4987ec681f3Smrg * param_tcs_offchip_layout, and internal_bindings 4997ec681f3Smrg * should be passed to the epilog. 5007ec681f3Smrg */ 5017ec681f3Smrg for (i = 0; i <= 8 + GFX9_SGPR_TCS_OUT_LAYOUT; i++) 5027ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_SGPR); 5037ec681f3Smrg for (i = 0; i < 11; i++) 5047ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_VGPR); 5057ec681f3Smrg } 5067ec681f3Smrg break; 5077ec681f3Smrg 5087ec681f3Smrg case SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY: 5097ec681f3Smrg /* Merged stages have 8 system SGPRs at the beginning. */ 5107ec681f3Smrg /* SPI_SHADER_USER_DATA_ADDR_LO/HI_GS */ 5117ec681f3Smrg declare_per_stage_desc_pointers(ctx, ctx->stage == MESA_SHADER_GEOMETRY); 5127ec681f3Smrg 5137ec681f3Smrg if (ctx->shader->key.as_ngg) 5147ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_tg_info); 5157ec681f3Smrg else 5167ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset); 5177ec681f3Smrg 5187ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.merged_wave_info); 5197ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 5207ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.scratch_offset); 5217ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, 5227ec681f3Smrg &ctx->small_prim_cull_info); /* SPI_SHADER_PGM_LO_GS << 8 */ 5237ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, 5247ec681f3Smrg NULL); /* unused (SPI_SHADER_PGM_LO/HI_GS >> 24) */ 5257ec681f3Smrg 5267ec681f3Smrg declare_global_desc_pointers(ctx); 5277ec681f3Smrg if (ctx->stage != MESA_SHADER_VERTEX || !shader->selector->info.base.vs.blit_sgprs_amd) { 5287ec681f3Smrg declare_per_stage_desc_pointers( 5297ec681f3Smrg ctx, (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)); 5307ec681f3Smrg } 5317ec681f3Smrg 5327ec681f3Smrg if (ctx->stage == MESA_SHADER_VERTEX) { 5337ec681f3Smrg if (shader->selector->info.base.vs.blit_sgprs_amd) 5347ec681f3Smrg declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd); 5357ec681f3Smrg else 5367ec681f3Smrg declare_vs_specific_input_sgprs(ctx); 5377ec681f3Smrg } else { 5387ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); 5397ec681f3Smrg 5407ec681f3Smrg if (ctx->stage == MESA_SHADER_TESS_EVAL) { 5417ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); 5427ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); 5437ec681f3Smrg } 5447ec681f3Smrg } 5457ec681f3Smrg 5467ec681f3Smrg if (ctx->stage == MESA_SHADER_VERTEX) 5477ec681f3Smrg declare_vb_descriptor_input_sgprs(ctx); 5487ec681f3Smrg 5497ec681f3Smrg /* VGPRs (first GS, then VS/TES) */ 5507ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]); 5517ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]); 5527ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); 5537ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); 5547ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]); 5557ec681f3Smrg 5567ec681f3Smrg if (ctx->stage == MESA_SHADER_VERTEX) { 5577ec681f3Smrg declare_vs_input_vgprs(ctx, &num_prolog_vgprs); 5587ec681f3Smrg } else if (ctx->stage == MESA_SHADER_TESS_EVAL) { 5597ec681f3Smrg declare_tes_input_vgprs(ctx); 5607ec681f3Smrg } 5617ec681f3Smrg 5627ec681f3Smrg if ((ctx->shader->key.as_es || ngg_cull_shader) && 5637ec681f3Smrg (ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL)) { 5647ec681f3Smrg unsigned num_user_sgprs, num_vgprs; 5657ec681f3Smrg 5667ec681f3Smrg if (ctx->stage == MESA_SHADER_VERTEX && ngg_cull_shader) { 5677ec681f3Smrg /* For the NGG cull shader, add 1 SGPR to hold 5687ec681f3Smrg * the vertex buffer pointer. 5697ec681f3Smrg */ 5707ec681f3Smrg num_user_sgprs = GFX9_VSGS_NUM_USER_SGPR + 1; 5717ec681f3Smrg 5727ec681f3Smrg if (shader->selector->num_vbos_in_user_sgprs) { 5737ec681f3Smrg assert(num_user_sgprs <= SI_SGPR_VS_VB_DESCRIPTOR_FIRST); 5747ec681f3Smrg num_user_sgprs = 5757ec681f3Smrg SI_SGPR_VS_VB_DESCRIPTOR_FIRST + shader->selector->num_vbos_in_user_sgprs * 4; 5767ec681f3Smrg } 5777ec681f3Smrg } else if (ctx->stage == MESA_SHADER_TESS_EVAL && ngg_cull_shader) { 5787ec681f3Smrg num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; 5797ec681f3Smrg } else { 5807ec681f3Smrg num_user_sgprs = SI_NUM_VS_STATE_RESOURCE_SGPRS; 5817ec681f3Smrg } 5827ec681f3Smrg 5837ec681f3Smrg /* The NGG cull shader has to return all 9 VGPRs. 5847ec681f3Smrg * 5857ec681f3Smrg * The normal merged ESGS shader only has to return the 5 VGPRs 5867ec681f3Smrg * for the GS stage. 5877ec681f3Smrg */ 5887ec681f3Smrg num_vgprs = ngg_cull_shader ? 9 : 5; 5897ec681f3Smrg 5907ec681f3Smrg /* ES return values are inputs to GS. */ 5917ec681f3Smrg for (i = 0; i < 8 + num_user_sgprs; i++) 5927ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_SGPR); 5937ec681f3Smrg for (i = 0; i < num_vgprs; i++) 5947ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_VGPR); 5957ec681f3Smrg } 5967ec681f3Smrg break; 5977ec681f3Smrg 5987ec681f3Smrg case MESA_SHADER_TESS_EVAL: 5997ec681f3Smrg declare_global_desc_pointers(ctx); 6007ec681f3Smrg declare_per_stage_desc_pointers(ctx, true); 6017ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->vs_state_bits); 6027ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tcs_offchip_layout); 6037ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->tes_offchip_addr); 6047ec681f3Smrg 6057ec681f3Smrg if (shader->key.as_es) { 6067ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 6077ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 6087ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.es2gs_offset); 6097ec681f3Smrg } else { 6107ec681f3Smrg declare_streamout_params(ctx, &shader->selector->so); 6117ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tess_offchip_offset); 6127ec681f3Smrg } 6137ec681f3Smrg 6147ec681f3Smrg /* VGPRs */ 6157ec681f3Smrg declare_tes_input_vgprs(ctx); 6167ec681f3Smrg break; 6177ec681f3Smrg 6187ec681f3Smrg case MESA_SHADER_GEOMETRY: 6197ec681f3Smrg declare_global_desc_pointers(ctx); 6207ec681f3Smrg declare_per_stage_desc_pointers(ctx, true); 6217ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs2vs_offset); 6227ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.gs_wave_id); 6237ec681f3Smrg 6247ec681f3Smrg /* VGPRs */ 6257ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[0]); 6267ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[1]); 6277ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_prim_id); 6287ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[2]); 6297ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[3]); 6307ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[4]); 6317ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_vtx_offset[5]); 6327ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.gs_invocation_id); 6337ec681f3Smrg break; 6347ec681f3Smrg 6357ec681f3Smrg case MESA_SHADER_FRAGMENT: 6367ec681f3Smrg declare_global_desc_pointers(ctx); 6377ec681f3Smrg declare_per_stage_desc_pointers(ctx, true); 6387ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL, SI_PARAM_ALPHA_REF); 6397ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.prim_mask, 6407ec681f3Smrg SI_PARAM_PRIM_MASK); 6417ec681f3Smrg 6427ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_sample, 6437ec681f3Smrg SI_PARAM_PERSP_SAMPLE); 6447ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_center, 6457ec681f3Smrg SI_PARAM_PERSP_CENTER); 6467ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.persp_centroid, 6477ec681f3Smrg SI_PARAM_PERSP_CENTROID); 6487ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, NULL, SI_PARAM_PERSP_PULL_MODEL); 6497ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_sample, 6507ec681f3Smrg SI_PARAM_LINEAR_SAMPLE); 6517ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_center, 6527ec681f3Smrg SI_PARAM_LINEAR_CENTER); 6537ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 2, AC_ARG_INT, &ctx->args.linear_centroid, 6547ec681f3Smrg SI_PARAM_LINEAR_CENTROID); 6557ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL, SI_PARAM_LINE_STIPPLE_TEX); 6567ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[0], 6577ec681f3Smrg SI_PARAM_POS_X_FLOAT); 6587ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[1], 6597ec681f3Smrg SI_PARAM_POS_Y_FLOAT); 6607ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[2], 6617ec681f3Smrg SI_PARAM_POS_Z_FLOAT); 6627ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.frag_pos[3], 6637ec681f3Smrg SI_PARAM_POS_W_FLOAT); 6647ec681f3Smrg shader->info.face_vgpr_index = ctx->args.num_vgprs_used; 6657ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.front_face, 6667ec681f3Smrg SI_PARAM_FRONT_FACE); 6677ec681f3Smrg shader->info.ancillary_vgpr_index = ctx->args.num_vgprs_used; 6687ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.ancillary, 6697ec681f3Smrg SI_PARAM_ANCILLARY); 6707ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->args.sample_coverage, 6717ec681f3Smrg SI_PARAM_SAMPLE_COVERAGE); 6727ec681f3Smrg si_add_arg_checked(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->pos_fixed_pt, 6737ec681f3Smrg SI_PARAM_POS_FIXED_PT); 6747ec681f3Smrg 6757ec681f3Smrg /* Color inputs from the prolog. */ 6767ec681f3Smrg if (shader->selector->info.colors_read) { 6777ec681f3Smrg unsigned num_color_elements = util_bitcount(shader->selector->info.colors_read); 6787ec681f3Smrg 6797ec681f3Smrg for (i = 0; i < num_color_elements; i++) 6807ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, NULL); 6817ec681f3Smrg 6827ec681f3Smrg num_prolog_vgprs += num_color_elements; 6837ec681f3Smrg } 6847ec681f3Smrg 6857ec681f3Smrg /* Outputs for the epilog. */ 6867ec681f3Smrg num_return_sgprs = SI_SGPR_ALPHA_REF + 1; 6877ec681f3Smrg num_returns = num_return_sgprs + util_bitcount(shader->selector->info.colors_written) * 4 + 6887ec681f3Smrg shader->selector->info.writes_z + shader->selector->info.writes_stencil + 6897ec681f3Smrg shader->selector->info.writes_samplemask + 1 /* SampleMaskIn */; 6907ec681f3Smrg 6917ec681f3Smrg num_returns = MAX2(num_returns, num_return_sgprs + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1); 6927ec681f3Smrg 6937ec681f3Smrg for (i = 0; i < num_return_sgprs; i++) 6947ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_SGPR); 6957ec681f3Smrg for (; i < num_returns; i++) 6967ec681f3Smrg ac_add_return(&ctx->args, AC_ARG_VGPR); 6977ec681f3Smrg break; 6987ec681f3Smrg 6997ec681f3Smrg case MESA_SHADER_COMPUTE: 7007ec681f3Smrg declare_global_desc_pointers(ctx); 7017ec681f3Smrg declare_per_stage_desc_pointers(ctx, true); 7027ec681f3Smrg if (shader->selector->info.uses_grid_size) 7037ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 3, AC_ARG_INT, &ctx->args.num_work_groups); 7047ec681f3Smrg if (shader->selector->info.uses_variable_block_size) 7057ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->block_size); 7067ec681f3Smrg 7077ec681f3Smrg unsigned cs_user_data_dwords = 7087ec681f3Smrg shader->selector->info.base.cs.user_data_components_amd; 7097ec681f3Smrg if (cs_user_data_dwords) { 7107ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data); 7117ec681f3Smrg } 7127ec681f3Smrg 7137ec681f3Smrg /* Some descriptors can be in user SGPRs. */ 7147ec681f3Smrg /* Shader buffers in user SGPRs. */ 7157ec681f3Smrg for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) { 7167ec681f3Smrg while (ctx->args.num_sgprs_used % 4 != 0) 7177ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 7187ec681f3Smrg 7197ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]); 7207ec681f3Smrg } 7217ec681f3Smrg /* Images in user SGPRs. */ 7227ec681f3Smrg for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) { 7237ec681f3Smrg unsigned num_sgprs = shader->selector->info.base.image_buffers & (1 << i) ? 4 : 8; 7247ec681f3Smrg 7257ec681f3Smrg while (ctx->args.num_sgprs_used % num_sgprs != 0) 7267ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); 7277ec681f3Smrg 7287ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]); 7297ec681f3Smrg } 7307ec681f3Smrg 7317ec681f3Smrg /* Hardware SGPRs. */ 7327ec681f3Smrg for (i = 0; i < 3; i++) { 7337ec681f3Smrg if (shader->selector->info.uses_block_id[i]) { 7347ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.workgroup_ids[i]); 7357ec681f3Smrg } 7367ec681f3Smrg } 7377ec681f3Smrg if (shader->selector->info.uses_subgroup_info) 7387ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, &ctx->args.tg_size); 7397ec681f3Smrg 7407ec681f3Smrg /* Hardware VGPRs. */ 7417ec681f3Smrg if (!ctx->screen->info.has_graphics && ctx->screen->info.family >= CHIP_ALDEBARAN) 7427ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.local_invocation_ids); 7437ec681f3Smrg else 7447ec681f3Smrg ac_add_arg(&ctx->args, AC_ARG_VGPR, 3, AC_ARG_INT, &ctx->args.local_invocation_ids); 7457ec681f3Smrg break; 7467ec681f3Smrg default: 7477ec681f3Smrg assert(0 && "unimplemented shader"); 7487ec681f3Smrg return; 7497ec681f3Smrg } 7507ec681f3Smrg 7517ec681f3Smrg shader->info.num_input_sgprs = ctx->args.num_sgprs_used; 7527ec681f3Smrg shader->info.num_input_vgprs = ctx->args.num_vgprs_used; 7537ec681f3Smrg 7547ec681f3Smrg assert(shader->info.num_input_vgprs >= num_prolog_vgprs); 7557ec681f3Smrg shader->info.num_input_vgprs -= num_prolog_vgprs; 75601e04c3fSmrg} 75701e04c3fSmrg 75801e04c3fSmrg/* For the UMR disassembler. */ 7597ec681f3Smrg#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ 7607ec681f3Smrg#define DEBUGGER_NUM_MARKERS 5 7617ec681f3Smrg 7627ec681f3Smrgstatic bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, 7637ec681f3Smrg struct ac_rtld_binary *rtld) 7647ec681f3Smrg{ 7657ec681f3Smrg const struct si_shader_selector *sel = shader->selector; 7667ec681f3Smrg const char *part_elfs[5]; 7677ec681f3Smrg size_t part_sizes[5]; 7687ec681f3Smrg unsigned num_parts = 0; 7697ec681f3Smrg 7707ec681f3Smrg#define add_part(shader_or_part) \ 7717ec681f3Smrg if (shader_or_part) { \ 7727ec681f3Smrg part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ 7737ec681f3Smrg part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ 7747ec681f3Smrg num_parts++; \ 7757ec681f3Smrg } 7767ec681f3Smrg 7777ec681f3Smrg add_part(shader->prolog); 7787ec681f3Smrg add_part(shader->previous_stage); 7797ec681f3Smrg add_part(shader->prolog2); 7807ec681f3Smrg add_part(shader); 7817ec681f3Smrg add_part(shader->epilog); 7827ec681f3Smrg 7837ec681f3Smrg#undef add_part 7847ec681f3Smrg 7857ec681f3Smrg struct ac_rtld_symbol lds_symbols[2]; 7867ec681f3Smrg unsigned num_lds_symbols = 0; 7877ec681f3Smrg 7887ec681f3Smrg if (sel && screen->info.chip_class >= GFX9 && !shader->is_gs_copy_shader && 7897ec681f3Smrg (sel->info.stage == MESA_SHADER_GEOMETRY || shader->key.as_ngg)) { 7907ec681f3Smrg struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; 7917ec681f3Smrg sym->name = "esgs_ring"; 7927ec681f3Smrg sym->size = shader->gs_info.esgs_ring_size * 4; 7937ec681f3Smrg sym->align = 64 * 1024; 7947ec681f3Smrg } 7957ec681f3Smrg 7967ec681f3Smrg if (shader->key.as_ngg && sel->info.stage == MESA_SHADER_GEOMETRY) { 7977ec681f3Smrg struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; 7987ec681f3Smrg sym->name = "ngg_emit"; 7997ec681f3Smrg sym->size = shader->ngg.ngg_emit_size * 4; 8007ec681f3Smrg sym->align = 4; 8017ec681f3Smrg } 8027ec681f3Smrg 8037ec681f3Smrg bool ok = ac_rtld_open( 8047ec681f3Smrg rtld, (struct ac_rtld_open_info){.info = &screen->info, 8057ec681f3Smrg .options = 8067ec681f3Smrg { 8077ec681f3Smrg .halt_at_entry = screen->options.halt_shaders, 8087ec681f3Smrg }, 8097ec681f3Smrg .shader_type = sel->info.stage, 8107ec681f3Smrg .wave_size = si_get_shader_wave_size(shader), 8117ec681f3Smrg .num_parts = num_parts, 8127ec681f3Smrg .elf_ptrs = part_elfs, 8137ec681f3Smrg .elf_sizes = part_sizes, 8147ec681f3Smrg .num_shared_lds_symbols = num_lds_symbols, 8157ec681f3Smrg .shared_lds_symbols = lds_symbols}); 8167ec681f3Smrg 8177ec681f3Smrg if (rtld->lds_size > 0) { 8187ec681f3Smrg unsigned alloc_granularity = screen->info.chip_class >= GFX7 ? 512 : 256; 8197ec681f3Smrg shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity; 8207ec681f3Smrg } 8217ec681f3Smrg 8227ec681f3Smrg return ok; 8237ec681f3Smrg} 8247ec681f3Smrg 8257ec681f3Smrgstatic unsigned si_get_shader_binary_size(struct si_screen *screen, struct si_shader *shader) 8267ec681f3Smrg{ 8277ec681f3Smrg struct ac_rtld_binary rtld; 8287ec681f3Smrg si_shader_binary_open(screen, shader, &rtld); 8297ec681f3Smrg uint64_t size = rtld.exec_size; 8307ec681f3Smrg ac_rtld_close(&rtld); 8317ec681f3Smrg return size; 8327ec681f3Smrg} 8337ec681f3Smrg 8347ec681f3Smrgstatic bool si_get_external_symbol(void *data, const char *name, uint64_t *value) 8357ec681f3Smrg{ 8367ec681f3Smrg uint64_t *scratch_va = data; 8377ec681f3Smrg 8387ec681f3Smrg if (!strcmp(scratch_rsrc_dword0_symbol, name)) { 8397ec681f3Smrg *value = (uint32_t)*scratch_va; 8407ec681f3Smrg return true; 8417ec681f3Smrg } 8427ec681f3Smrg if (!strcmp(scratch_rsrc_dword1_symbol, name)) { 8437ec681f3Smrg /* Enable scratch coalescing. */ 8447ec681f3Smrg *value = S_008F04_BASE_ADDRESS_HI(*scratch_va >> 32) | S_008F04_SWIZZLE_ENABLE(1); 8457ec681f3Smrg return true; 8467ec681f3Smrg } 8477ec681f3Smrg 8487ec681f3Smrg return false; 8497ec681f3Smrg} 8507ec681f3Smrg 8517ec681f3Smrgbool si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader, 8527ec681f3Smrg uint64_t scratch_va) 8537ec681f3Smrg{ 8547ec681f3Smrg struct ac_rtld_binary binary; 8557ec681f3Smrg if (!si_shader_binary_open(sscreen, shader, &binary)) 8567ec681f3Smrg return false; 8577ec681f3Smrg 8587ec681f3Smrg si_resource_reference(&shader->bo, NULL); 8597ec681f3Smrg shader->bo = si_aligned_buffer_create( 8607ec681f3Smrg &sscreen->b, 8617ec681f3Smrg (sscreen->info.cpdma_prefetch_writes_memory ? 0 : SI_RESOURCE_FLAG_READ_ONLY) | 8627ec681f3Smrg SI_RESOURCE_FLAG_DRIVER_INTERNAL | SI_RESOURCE_FLAG_32BIT, 8637ec681f3Smrg PIPE_USAGE_IMMUTABLE, align(binary.rx_size, SI_CPDMA_ALIGNMENT), 256); 8647ec681f3Smrg if (!shader->bo) 8657ec681f3Smrg return false; 8667ec681f3Smrg 8677ec681f3Smrg /* Upload. */ 8687ec681f3Smrg struct ac_rtld_upload_info u = {}; 8697ec681f3Smrg u.binary = &binary; 8707ec681f3Smrg u.get_external_symbol = si_get_external_symbol; 8717ec681f3Smrg u.cb_data = &scratch_va; 8727ec681f3Smrg u.rx_va = shader->bo->gpu_address; 8737ec681f3Smrg u.rx_ptr = sscreen->ws->buffer_map(sscreen->ws, 8747ec681f3Smrg shader->bo->buf, NULL, 8757ec681f3Smrg PIPE_MAP_READ_WRITE | PIPE_MAP_UNSYNCHRONIZED | RADEON_MAP_TEMPORARY); 8767ec681f3Smrg if (!u.rx_ptr) 8777ec681f3Smrg return false; 8787ec681f3Smrg 8797ec681f3Smrg int size = ac_rtld_upload(&u); 8807ec681f3Smrg 8817ec681f3Smrg if (sscreen->debug_flags & DBG(SQTT)) { 8827ec681f3Smrg /* Remember the uploaded code */ 8837ec681f3Smrg shader->binary.uploaded_code_size = size; 8847ec681f3Smrg shader->binary.uploaded_code = malloc(size); 8857ec681f3Smrg memcpy(shader->binary.uploaded_code, u.rx_ptr, size); 8867ec681f3Smrg } 8877ec681f3Smrg 8887ec681f3Smrg sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf); 8897ec681f3Smrg ac_rtld_close(&binary); 8907ec681f3Smrg 8917ec681f3Smrg return size >= 0; 8927ec681f3Smrg} 8937ec681f3Smrg 8947ec681f3Smrgstatic void si_shader_dump_disassembly(struct si_screen *screen, 8957ec681f3Smrg const struct si_shader_binary *binary, 8967ec681f3Smrg gl_shader_stage stage, unsigned wave_size, 8977ec681f3Smrg struct pipe_debug_callback *debug, const char *name, 8987ec681f3Smrg FILE *file) 8997ec681f3Smrg{ 9007ec681f3Smrg struct ac_rtld_binary rtld_binary; 9017ec681f3Smrg 9027ec681f3Smrg if (!ac_rtld_open(&rtld_binary, (struct ac_rtld_open_info){ 9037ec681f3Smrg .info = &screen->info, 9047ec681f3Smrg .shader_type = stage, 9057ec681f3Smrg .wave_size = wave_size, 9067ec681f3Smrg .num_parts = 1, 9077ec681f3Smrg .elf_ptrs = &binary->elf_buffer, 9087ec681f3Smrg .elf_sizes = &binary->elf_size})) 9097ec681f3Smrg return; 9107ec681f3Smrg 9117ec681f3Smrg const char *disasm; 9127ec681f3Smrg size_t nbytes; 9137ec681f3Smrg 9147ec681f3Smrg if (!ac_rtld_get_section_by_name(&rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) 9157ec681f3Smrg goto out; 91601e04c3fSmrg 9177ec681f3Smrg if (nbytes > INT_MAX) 9187ec681f3Smrg goto out; 91901e04c3fSmrg 9207ec681f3Smrg if (debug && debug->debug_message) { 9217ec681f3Smrg /* Very long debug messages are cut off, so send the 9227ec681f3Smrg * disassembly one line at a time. This causes more 9237ec681f3Smrg * overhead, but on the plus side it simplifies 9247ec681f3Smrg * parsing of resulting logs. 9257ec681f3Smrg */ 9267ec681f3Smrg pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly Begin"); 9277ec681f3Smrg 9287ec681f3Smrg uint64_t line = 0; 9297ec681f3Smrg while (line < nbytes) { 9307ec681f3Smrg int count = nbytes - line; 9317ec681f3Smrg const char *nl = memchr(disasm + line, '\n', nbytes - line); 9327ec681f3Smrg if (nl) 9337ec681f3Smrg count = nl - (disasm + line); 9347ec681f3Smrg 9357ec681f3Smrg if (count) { 9367ec681f3Smrg pipe_debug_message(debug, SHADER_INFO, "%.*s", count, disasm + line); 9377ec681f3Smrg } 9387ec681f3Smrg 9397ec681f3Smrg line += count + 1; 9407ec681f3Smrg } 9417ec681f3Smrg 9427ec681f3Smrg pipe_debug_message(debug, SHADER_INFO, "Shader Disassembly End"); 9437ec681f3Smrg } 9447ec681f3Smrg 9457ec681f3Smrg if (file) { 9467ec681f3Smrg fprintf(file, "Shader %s disassembly:\n", name); 9477ec681f3Smrg fprintf(file, "%*s", (int)nbytes, disasm); 9487ec681f3Smrg } 94901e04c3fSmrg 9507ec681f3Smrgout: 9517ec681f3Smrg ac_rtld_close(&rtld_binary); 95201e04c3fSmrg} 95301e04c3fSmrg 95401e04c3fSmrgstatic void si_calculate_max_simd_waves(struct si_shader *shader) 95501e04c3fSmrg{ 9567ec681f3Smrg struct si_screen *sscreen = shader->selector->screen; 9577ec681f3Smrg struct ac_shader_config *conf = &shader->config; 9587ec681f3Smrg unsigned num_inputs = shader->selector->info.num_inputs; 9597ec681f3Smrg unsigned lds_increment = sscreen->info.chip_class >= GFX7 ? 512 : 256; 9607ec681f3Smrg unsigned lds_per_wave = 0; 9617ec681f3Smrg unsigned max_simd_waves; 9627ec681f3Smrg 9637ec681f3Smrg max_simd_waves = sscreen->info.max_wave64_per_simd; 9647ec681f3Smrg 9657ec681f3Smrg /* Compute LDS usage for PS. */ 9667ec681f3Smrg switch (shader->selector->info.stage) { 9677ec681f3Smrg case MESA_SHADER_FRAGMENT: 9687ec681f3Smrg /* The minimum usage per wave is (num_inputs * 48). The maximum 9697ec681f3Smrg * usage is (num_inputs * 48 * 16). 9707ec681f3Smrg * We can get anything in between and it varies between waves. 9717ec681f3Smrg * 9727ec681f3Smrg * The 48 bytes per input for a single primitive is equal to 9737ec681f3Smrg * 4 bytes/component * 4 components/input * 3 points. 9747ec681f3Smrg * 9757ec681f3Smrg * Other stages don't know the size at compile time or don't 9767ec681f3Smrg * allocate LDS per wave, but instead they do it per thread group. 9777ec681f3Smrg */ 9787ec681f3Smrg lds_per_wave = conf->lds_size * lds_increment + align(num_inputs * 48, lds_increment); 9797ec681f3Smrg break; 9807ec681f3Smrg case MESA_SHADER_COMPUTE: { 9817ec681f3Smrg unsigned max_workgroup_size = si_get_max_workgroup_size(shader); 9827ec681f3Smrg lds_per_wave = (conf->lds_size * lds_increment) / 9837ec681f3Smrg DIV_ROUND_UP(max_workgroup_size, sscreen->compute_wave_size); 9847ec681f3Smrg } 9857ec681f3Smrg break; 9867ec681f3Smrg default:; 9877ec681f3Smrg } 9887ec681f3Smrg 9897ec681f3Smrg /* Compute the per-SIMD wave counts. */ 9907ec681f3Smrg if (conf->num_sgprs) { 9917ec681f3Smrg max_simd_waves = 9927ec681f3Smrg MIN2(max_simd_waves, sscreen->info.num_physical_sgprs_per_simd / conf->num_sgprs); 9937ec681f3Smrg } 9947ec681f3Smrg 9957ec681f3Smrg if (conf->num_vgprs) { 9967ec681f3Smrg /* Always print wave limits as Wave64, so that we can compare 9977ec681f3Smrg * Wave32 and Wave64 with shader-db fairly. */ 9987ec681f3Smrg unsigned max_vgprs = sscreen->info.num_physical_wave64_vgprs_per_simd; 9997ec681f3Smrg max_simd_waves = MIN2(max_simd_waves, max_vgprs / conf->num_vgprs); 10007ec681f3Smrg } 10017ec681f3Smrg 10027ec681f3Smrg unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4; 10037ec681f3Smrg if (lds_per_wave) 10047ec681f3Smrg max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave); 10057ec681f3Smrg 10067ec681f3Smrg shader->info.max_simd_waves = max_simd_waves; 10077ec681f3Smrg} 10087ec681f3Smrg 10097ec681f3Smrgvoid si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shader *shader, 10107ec681f3Smrg struct pipe_debug_callback *debug) 10117ec681f3Smrg{ 10127ec681f3Smrg const struct ac_shader_config *conf = &shader->config; 10137ec681f3Smrg 10147ec681f3Smrg if (screen->options.debug_disassembly) 10157ec681f3Smrg si_shader_dump_disassembly(screen, &shader->binary, shader->selector->info.stage, 10167ec681f3Smrg si_get_shader_wave_size(shader), debug, "main", NULL); 10177ec681f3Smrg 10187ec681f3Smrg pipe_debug_message(debug, SHADER_INFO, 10197ec681f3Smrg "Shader Stats: SGPRS: %d VGPRS: %d Code Size: %d " 10207ec681f3Smrg "LDS: %d Scratch: %d Max Waves: %d Spilled SGPRs: %d " 10217ec681f3Smrg "Spilled VGPRs: %d PrivMem VGPRs: %d", 10227ec681f3Smrg conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader), 10237ec681f3Smrg conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves, 10247ec681f3Smrg conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs); 10257ec681f3Smrg} 10267ec681f3Smrg 10277ec681f3Smrgstatic void si_shader_dump_stats(struct si_screen *sscreen, struct si_shader *shader, FILE *file, 10287ec681f3Smrg bool check_debug_option) 10297ec681f3Smrg{ 10307ec681f3Smrg const struct ac_shader_config *conf = &shader->config; 10317ec681f3Smrg 10327ec681f3Smrg if (!check_debug_option || si_can_dump_shader(sscreen, shader->selector->info.stage)) { 10337ec681f3Smrg if (shader->selector->info.stage == MESA_SHADER_FRAGMENT) { 10347ec681f3Smrg fprintf(file, 10357ec681f3Smrg "*** SHADER CONFIG ***\n" 10367ec681f3Smrg "SPI_PS_INPUT_ADDR = 0x%04x\n" 10377ec681f3Smrg "SPI_PS_INPUT_ENA = 0x%04x\n", 10387ec681f3Smrg conf->spi_ps_input_addr, conf->spi_ps_input_ena); 10397ec681f3Smrg } 10407ec681f3Smrg 10417ec681f3Smrg fprintf(file, 10427ec681f3Smrg "*** SHADER STATS ***\n" 10437ec681f3Smrg "SGPRS: %d\n" 10447ec681f3Smrg "VGPRS: %d\n" 10457ec681f3Smrg "Spilled SGPRs: %d\n" 10467ec681f3Smrg "Spilled VGPRs: %d\n" 10477ec681f3Smrg "Private memory VGPRs: %d\n" 10487ec681f3Smrg "Code Size: %d bytes\n" 10497ec681f3Smrg "LDS: %d blocks\n" 10507ec681f3Smrg "Scratch: %d bytes per wave\n" 10517ec681f3Smrg "Max Waves: %d\n" 10527ec681f3Smrg "********************\n\n\n", 10537ec681f3Smrg conf->num_sgprs, conf->num_vgprs, conf->spilled_sgprs, conf->spilled_vgprs, 10547ec681f3Smrg shader->info.private_mem_vgprs, si_get_shader_binary_size(sscreen, shader), 10557ec681f3Smrg conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves); 10567ec681f3Smrg } 10577ec681f3Smrg} 10587ec681f3Smrg 10597ec681f3Smrgconst char *si_get_shader_name(const struct si_shader *shader) 10607ec681f3Smrg{ 10617ec681f3Smrg switch (shader->selector->info.stage) { 10627ec681f3Smrg case MESA_SHADER_VERTEX: 10637ec681f3Smrg if (shader->key.as_es) 10647ec681f3Smrg return "Vertex Shader as ES"; 10657ec681f3Smrg else if (shader->key.as_ls) 10667ec681f3Smrg return "Vertex Shader as LS"; 10677ec681f3Smrg else if (shader->key.as_ngg) 10687ec681f3Smrg return "Vertex Shader as ESGS"; 10697ec681f3Smrg else 10707ec681f3Smrg return "Vertex Shader as VS"; 10717ec681f3Smrg case MESA_SHADER_TESS_CTRL: 10727ec681f3Smrg return "Tessellation Control Shader"; 10737ec681f3Smrg case MESA_SHADER_TESS_EVAL: 10747ec681f3Smrg if (shader->key.as_es) 10757ec681f3Smrg return "Tessellation Evaluation Shader as ES"; 10767ec681f3Smrg else if (shader->key.as_ngg) 10777ec681f3Smrg return "Tessellation Evaluation Shader as ESGS"; 10787ec681f3Smrg else 10797ec681f3Smrg return "Tessellation Evaluation Shader as VS"; 10807ec681f3Smrg case MESA_SHADER_GEOMETRY: 10817ec681f3Smrg if (shader->is_gs_copy_shader) 10827ec681f3Smrg return "GS Copy Shader as VS"; 10837ec681f3Smrg else 10847ec681f3Smrg return "Geometry Shader"; 10857ec681f3Smrg case MESA_SHADER_FRAGMENT: 10867ec681f3Smrg return "Pixel Shader"; 10877ec681f3Smrg case MESA_SHADER_COMPUTE: 10887ec681f3Smrg return "Compute Shader"; 10897ec681f3Smrg default: 10907ec681f3Smrg return "Unknown Shader"; 10917ec681f3Smrg } 10927ec681f3Smrg} 10937ec681f3Smrg 10947ec681f3Smrgvoid si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, 10957ec681f3Smrg struct pipe_debug_callback *debug, FILE *file, bool check_debug_option) 10967ec681f3Smrg{ 10977ec681f3Smrg gl_shader_stage stage = shader->selector->info.stage; 10987ec681f3Smrg 10997ec681f3Smrg if (!check_debug_option || si_can_dump_shader(sscreen, stage)) 11007ec681f3Smrg si_dump_shader_key(shader, file); 11017ec681f3Smrg 11027ec681f3Smrg if (!check_debug_option && shader->binary.llvm_ir_string) { 11037ec681f3Smrg if (shader->previous_stage && shader->previous_stage->binary.llvm_ir_string) { 11047ec681f3Smrg fprintf(file, "\n%s - previous stage - LLVM IR:\n\n", si_get_shader_name(shader)); 11057ec681f3Smrg fprintf(file, "%s\n", shader->previous_stage->binary.llvm_ir_string); 11067ec681f3Smrg } 11077ec681f3Smrg 11087ec681f3Smrg fprintf(file, "\n%s - main shader part - LLVM IR:\n\n", si_get_shader_name(shader)); 11097ec681f3Smrg fprintf(file, "%s\n", shader->binary.llvm_ir_string); 11107ec681f3Smrg } 11117ec681f3Smrg 11127ec681f3Smrg if (!check_debug_option || 11137ec681f3Smrg (si_can_dump_shader(sscreen, stage) && !(sscreen->debug_flags & DBG(NO_ASM)))) { 11147ec681f3Smrg unsigned wave_size = si_get_shader_wave_size(shader); 11157ec681f3Smrg 11167ec681f3Smrg fprintf(file, "\n%s:\n", si_get_shader_name(shader)); 11177ec681f3Smrg 11187ec681f3Smrg if (shader->prolog) 11197ec681f3Smrg si_shader_dump_disassembly(sscreen, &shader->prolog->binary, stage, wave_size, debug, 11207ec681f3Smrg "prolog", file); 11217ec681f3Smrg if (shader->previous_stage) 11227ec681f3Smrg si_shader_dump_disassembly(sscreen, &shader->previous_stage->binary, stage, 11237ec681f3Smrg wave_size, debug, "previous stage", file); 11247ec681f3Smrg if (shader->prolog2) 11257ec681f3Smrg si_shader_dump_disassembly(sscreen, &shader->prolog2->binary, stage, wave_size, 11267ec681f3Smrg debug, "prolog2", file); 11277ec681f3Smrg 11287ec681f3Smrg si_shader_dump_disassembly(sscreen, &shader->binary, stage, wave_size, debug, "main", 11297ec681f3Smrg file); 11307ec681f3Smrg 11317ec681f3Smrg if (shader->epilog) 11327ec681f3Smrg si_shader_dump_disassembly(sscreen, &shader->epilog->binary, stage, wave_size, debug, 11337ec681f3Smrg "epilog", file); 11347ec681f3Smrg fprintf(file, "\n"); 11357ec681f3Smrg } 11367ec681f3Smrg 11377ec681f3Smrg si_shader_dump_stats(sscreen, shader, file, check_debug_option); 113801e04c3fSmrg} 113901e04c3fSmrg 114001e04c3fSmrgstatic void si_dump_shader_key_vs(const struct si_shader_key *key, 11417ec681f3Smrg const struct si_vs_prolog_bits *prolog, const char *prefix, 11427ec681f3Smrg FILE *f) 11437ec681f3Smrg{ 11447ec681f3Smrg fprintf(f, " %s.instance_divisor_is_one = %u\n", prefix, prolog->instance_divisor_is_one); 11457ec681f3Smrg fprintf(f, " %s.instance_divisor_is_fetched = %u\n", prefix, 11467ec681f3Smrg prolog->instance_divisor_is_fetched); 11477ec681f3Smrg fprintf(f, " %s.ls_vgpr_fix = %u\n", prefix, prolog->ls_vgpr_fix); 11487ec681f3Smrg 11497ec681f3Smrg fprintf(f, " mono.vs.fetch_opencode = %x\n", key->mono.vs_fetch_opencode); 11507ec681f3Smrg fprintf(f, " mono.vs.fix_fetch = {"); 11517ec681f3Smrg for (int i = 0; i < SI_MAX_ATTRIBS; i++) { 11527ec681f3Smrg union si_vs_fix_fetch fix = key->mono.vs_fix_fetch[i]; 11537ec681f3Smrg if (i) 11547ec681f3Smrg fprintf(f, ", "); 11557ec681f3Smrg if (!fix.bits) 11567ec681f3Smrg fprintf(f, "0"); 11577ec681f3Smrg else 11587ec681f3Smrg fprintf(f, "%u.%u.%u.%u", fix.u.reverse, fix.u.log_size, fix.u.num_channels_m1, 11597ec681f3Smrg fix.u.format); 11607ec681f3Smrg } 11617ec681f3Smrg fprintf(f, "}\n"); 11627ec681f3Smrg} 11637ec681f3Smrg 11647ec681f3Smrgstatic void si_dump_shader_key(const struct si_shader *shader, FILE *f) 11657ec681f3Smrg{ 11667ec681f3Smrg const struct si_shader_key *key = &shader->key; 11677ec681f3Smrg gl_shader_stage stage = shader->selector->info.stage; 11687ec681f3Smrg 11697ec681f3Smrg fprintf(f, "SHADER KEY\n"); 11707ec681f3Smrg 11717ec681f3Smrg switch (stage) { 11727ec681f3Smrg case MESA_SHADER_VERTEX: 11737ec681f3Smrg si_dump_shader_key_vs(key, &key->part.vs.prolog, "part.vs.prolog", f); 11747ec681f3Smrg fprintf(f, " as_es = %u\n", key->as_es); 11757ec681f3Smrg fprintf(f, " as_ls = %u\n", key->as_ls); 11767ec681f3Smrg fprintf(f, " as_ngg = %u\n", key->as_ngg); 11777ec681f3Smrg fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); 11787ec681f3Smrg break; 11797ec681f3Smrg 11807ec681f3Smrg case MESA_SHADER_TESS_CTRL: 11817ec681f3Smrg if (shader->selector->screen->info.chip_class >= GFX9) { 11827ec681f3Smrg si_dump_shader_key_vs(key, &key->part.tcs.ls_prolog, "part.tcs.ls_prolog", f); 11837ec681f3Smrg } 11847ec681f3Smrg fprintf(f, " part.tcs.epilog.prim_mode = %u\n", key->part.tcs.epilog.prim_mode); 11857ec681f3Smrg fprintf(f, " mono.u.ff_tcs_inputs_to_copy = 0x%" PRIx64 "\n", 11867ec681f3Smrg key->mono.u.ff_tcs_inputs_to_copy); 11877ec681f3Smrg fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono); 11887ec681f3Smrg fprintf(f, " opt.same_patch_vertices = %u\n", key->opt.same_patch_vertices); 11897ec681f3Smrg break; 11907ec681f3Smrg 11917ec681f3Smrg case MESA_SHADER_TESS_EVAL: 11927ec681f3Smrg fprintf(f, " as_es = %u\n", key->as_es); 11937ec681f3Smrg fprintf(f, " as_ngg = %u\n", key->as_ngg); 11947ec681f3Smrg fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); 11957ec681f3Smrg break; 11967ec681f3Smrg 11977ec681f3Smrg case MESA_SHADER_GEOMETRY: 11987ec681f3Smrg if (shader->is_gs_copy_shader) 11997ec681f3Smrg break; 12007ec681f3Smrg 12017ec681f3Smrg if (shader->selector->screen->info.chip_class >= GFX9 && 12027ec681f3Smrg key->part.gs.es->info.stage == MESA_SHADER_VERTEX) { 12037ec681f3Smrg si_dump_shader_key_vs(key, &key->part.gs.vs_prolog, "part.gs.vs_prolog", f); 12047ec681f3Smrg } 12057ec681f3Smrg fprintf(f, " part.gs.prolog.tri_strip_adj_fix = %u\n", 12067ec681f3Smrg key->part.gs.prolog.tri_strip_adj_fix); 12077ec681f3Smrg fprintf(f, " as_ngg = %u\n", key->as_ngg); 12087ec681f3Smrg break; 12097ec681f3Smrg 12107ec681f3Smrg case MESA_SHADER_COMPUTE: 12117ec681f3Smrg break; 12127ec681f3Smrg 12137ec681f3Smrg case MESA_SHADER_FRAGMENT: 12147ec681f3Smrg fprintf(f, " part.ps.prolog.color_two_side = %u\n", key->part.ps.prolog.color_two_side); 12157ec681f3Smrg fprintf(f, " part.ps.prolog.flatshade_colors = %u\n", key->part.ps.prolog.flatshade_colors); 12167ec681f3Smrg fprintf(f, " part.ps.prolog.poly_stipple = %u\n", key->part.ps.prolog.poly_stipple); 12177ec681f3Smrg fprintf(f, " part.ps.prolog.force_persp_sample_interp = %u\n", 12187ec681f3Smrg key->part.ps.prolog.force_persp_sample_interp); 12197ec681f3Smrg fprintf(f, " part.ps.prolog.force_linear_sample_interp = %u\n", 12207ec681f3Smrg key->part.ps.prolog.force_linear_sample_interp); 12217ec681f3Smrg fprintf(f, " part.ps.prolog.force_persp_center_interp = %u\n", 12227ec681f3Smrg key->part.ps.prolog.force_persp_center_interp); 12237ec681f3Smrg fprintf(f, " part.ps.prolog.force_linear_center_interp = %u\n", 12247ec681f3Smrg key->part.ps.prolog.force_linear_center_interp); 12257ec681f3Smrg fprintf(f, " part.ps.prolog.bc_optimize_for_persp = %u\n", 12267ec681f3Smrg key->part.ps.prolog.bc_optimize_for_persp); 12277ec681f3Smrg fprintf(f, " part.ps.prolog.bc_optimize_for_linear = %u\n", 12287ec681f3Smrg key->part.ps.prolog.bc_optimize_for_linear); 12297ec681f3Smrg fprintf(f, " part.ps.prolog.samplemask_log_ps_iter = %u\n", 12307ec681f3Smrg key->part.ps.prolog.samplemask_log_ps_iter); 12317ec681f3Smrg fprintf(f, " part.ps.epilog.spi_shader_col_format = 0x%x\n", 12327ec681f3Smrg key->part.ps.epilog.spi_shader_col_format); 12337ec681f3Smrg fprintf(f, " part.ps.epilog.color_is_int8 = 0x%X\n", key->part.ps.epilog.color_is_int8); 12347ec681f3Smrg fprintf(f, " part.ps.epilog.color_is_int10 = 0x%X\n", key->part.ps.epilog.color_is_int10); 12357ec681f3Smrg fprintf(f, " part.ps.epilog.last_cbuf = %u\n", key->part.ps.epilog.last_cbuf); 12367ec681f3Smrg fprintf(f, " part.ps.epilog.alpha_func = %u\n", key->part.ps.epilog.alpha_func); 12377ec681f3Smrg fprintf(f, " part.ps.epilog.alpha_to_one = %u\n", key->part.ps.epilog.alpha_to_one); 12387ec681f3Smrg fprintf(f, " part.ps.epilog.poly_line_smoothing = %u\n", 12397ec681f3Smrg key->part.ps.epilog.poly_line_smoothing); 12407ec681f3Smrg fprintf(f, " part.ps.epilog.clamp_color = %u\n", key->part.ps.epilog.clamp_color); 12417ec681f3Smrg fprintf(f, " mono.u.ps.interpolate_at_sample_force_center = %u\n", 12427ec681f3Smrg key->mono.u.ps.interpolate_at_sample_force_center); 12437ec681f3Smrg fprintf(f, " mono.u.ps.fbfetch_msaa = %u\n", key->mono.u.ps.fbfetch_msaa); 12447ec681f3Smrg fprintf(f, " mono.u.ps.fbfetch_is_1D = %u\n", key->mono.u.ps.fbfetch_is_1D); 12457ec681f3Smrg fprintf(f, " mono.u.ps.fbfetch_layered = %u\n", key->mono.u.ps.fbfetch_layered); 12467ec681f3Smrg break; 12477ec681f3Smrg 12487ec681f3Smrg default: 12497ec681f3Smrg assert(0); 12507ec681f3Smrg } 12517ec681f3Smrg 12527ec681f3Smrg if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL || 12537ec681f3Smrg stage == MESA_SHADER_VERTEX) && 12547ec681f3Smrg !key->as_es && !key->as_ls) { 12557ec681f3Smrg fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs); 12567ec681f3Smrg fprintf(f, " opt.kill_pointsize = 0x%x\n", key->opt.kill_pointsize); 12577ec681f3Smrg fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->opt.kill_clip_distances); 12587ec681f3Smrg if (stage != MESA_SHADER_GEOMETRY) 12597ec681f3Smrg fprintf(f, " opt.ngg_culling = 0x%x\n", key->opt.ngg_culling); 12607ec681f3Smrg } 12617ec681f3Smrg 12627ec681f3Smrg fprintf(f, " opt.prefer_mono = %u\n", key->opt.prefer_mono); 12637ec681f3Smrg fprintf(f, " opt.inline_uniforms = %u (0x%x, 0x%x, 0x%x, 0x%x)\n", 12647ec681f3Smrg key->opt.inline_uniforms, 12657ec681f3Smrg key->opt.inlined_uniform_values[0], 12667ec681f3Smrg key->opt.inlined_uniform_values[1], 12677ec681f3Smrg key->opt.inlined_uniform_values[2], 12687ec681f3Smrg key->opt.inlined_uniform_values[3]); 12697ec681f3Smrg} 12707ec681f3Smrg 12717ec681f3Smrgbool si_vs_needs_prolog(const struct si_shader_selector *sel, 12727ec681f3Smrg const struct si_vs_prolog_bits *prolog_key, 12737ec681f3Smrg const struct si_shader_key *key, bool ngg_cull_shader) 12747ec681f3Smrg{ 12757ec681f3Smrg /* VGPR initialization fixup for Vega10 and Raven is always done in the 12767ec681f3Smrg * VS prolog. */ 12777ec681f3Smrg return sel->vs_needs_prolog || prolog_key->ls_vgpr_fix || 12787ec681f3Smrg /* The 2nd VS prolog loads input VGPRs from LDS */ 12797ec681f3Smrg (key->opt.ngg_culling && !ngg_cull_shader); 1280af69d88dSmrg} 1281af69d88dSmrg 128201e04c3fSmrg/** 128301e04c3fSmrg * Compute the VS prolog key, which contains all the information needed to 128401e04c3fSmrg * build the VS prolog function, and set shader->info bits where needed. 128501e04c3fSmrg * 128601e04c3fSmrg * \param info Shader info of the vertex shader. 128701e04c3fSmrg * \param num_input_sgprs Number of input SGPRs for the vertex shader. 12887ec681f3Smrg * \param has_old_ Whether the preceding shader part is the NGG cull shader. 128901e04c3fSmrg * \param prolog_key Key of the VS prolog 129001e04c3fSmrg * \param shader_out The vertex shader, or the next shader if merging LS+HS or ES+GS. 129101e04c3fSmrg * \param key Output shader part key. 129201e04c3fSmrg */ 12937ec681f3Smrgvoid si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_sgprs, 12947ec681f3Smrg bool ngg_cull_shader, const struct si_vs_prolog_bits *prolog_key, 12957ec681f3Smrg struct si_shader *shader_out, union si_shader_part_key *key) 12967ec681f3Smrg{ 12977ec681f3Smrg memset(key, 0, sizeof(*key)); 12987ec681f3Smrg key->vs_prolog.states = *prolog_key; 12997ec681f3Smrg key->vs_prolog.num_input_sgprs = num_input_sgprs; 13007ec681f3Smrg key->vs_prolog.num_inputs = info->num_inputs; 13017ec681f3Smrg key->vs_prolog.as_ls = shader_out->key.as_ls; 13027ec681f3Smrg key->vs_prolog.as_es = shader_out->key.as_es; 13037ec681f3Smrg key->vs_prolog.as_ngg = shader_out->key.as_ngg; 13047ec681f3Smrg 13057ec681f3Smrg if (!ngg_cull_shader && shader_out->key.opt.ngg_culling) 13067ec681f3Smrg key->vs_prolog.load_vgprs_after_culling = 1; 13077ec681f3Smrg 13087ec681f3Smrg if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) { 13097ec681f3Smrg key->vs_prolog.as_ls = 1; 13107ec681f3Smrg key->vs_prolog.num_merged_next_stage_vgprs = 2; 13117ec681f3Smrg } else if (shader_out->selector->info.stage == MESA_SHADER_GEOMETRY) { 13127ec681f3Smrg key->vs_prolog.as_es = 1; 13137ec681f3Smrg key->vs_prolog.num_merged_next_stage_vgprs = 5; 13147ec681f3Smrg } else if (shader_out->key.as_ngg) { 13157ec681f3Smrg key->vs_prolog.num_merged_next_stage_vgprs = 5; 13167ec681f3Smrg } 13177ec681f3Smrg 13187ec681f3Smrg /* Only one of these combinations can be set. as_ngg can be set with as_es. */ 13197ec681f3Smrg assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg + 13207ec681f3Smrg (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1); 13217ec681f3Smrg 13227ec681f3Smrg /* Enable loading the InstanceID VGPR. */ 13237ec681f3Smrg uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); 13247ec681f3Smrg 13257ec681f3Smrg if ((key->vs_prolog.states.instance_divisor_is_one | 13267ec681f3Smrg key->vs_prolog.states.instance_divisor_is_fetched) & 13277ec681f3Smrg input_mask) 13287ec681f3Smrg shader_out->info.uses_instanceid = true; 13297ec681f3Smrg} 13307ec681f3Smrg 13317ec681f3Smrgstruct nir_shader *si_get_nir_shader(struct si_shader_selector *sel, 13327ec681f3Smrg const struct si_shader_key *key, 13337ec681f3Smrg bool *free_nir) 13347ec681f3Smrg{ 13357ec681f3Smrg nir_shader *nir; 13367ec681f3Smrg *free_nir = false; 13377ec681f3Smrg 13387ec681f3Smrg if (sel->nir) { 13397ec681f3Smrg nir = sel->nir; 13407ec681f3Smrg } else if (sel->nir_binary) { 13417ec681f3Smrg struct pipe_screen *screen = &sel->screen->b; 13427ec681f3Smrg const void *options = screen->get_compiler_options(screen, PIPE_SHADER_IR_NIR, 13437ec681f3Smrg pipe_shader_type_from_mesa(sel->info.stage)); 13447ec681f3Smrg 13457ec681f3Smrg struct blob_reader blob_reader; 13467ec681f3Smrg blob_reader_init(&blob_reader, sel->nir_binary, sel->nir_size); 13477ec681f3Smrg *free_nir = true; 13487ec681f3Smrg nir = nir_deserialize(NULL, options, &blob_reader); 13497ec681f3Smrg } else { 13507ec681f3Smrg return NULL; 13517ec681f3Smrg } 13527ec681f3Smrg 13537ec681f3Smrg if (key && key->opt.inline_uniforms) { 13547ec681f3Smrg assert(*free_nir); 13557ec681f3Smrg 13567ec681f3Smrg /* Most places use shader information from the default variant, not 13577ec681f3Smrg * the optimized variant. These are the things that the driver looks at 13587ec681f3Smrg * in optimized variants and the list of things that we need to do. 13597ec681f3Smrg * 13607ec681f3Smrg * The driver takes into account these things if they suddenly disappear 13617ec681f3Smrg * from the shader code: 13627ec681f3Smrg * - Register usage and code size decrease (obvious) 13637ec681f3Smrg * - Eliminated PS system values are disabled by LLVM 13647ec681f3Smrg * (FragCoord, FrontFace, barycentrics) 13657ec681f3Smrg * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef. 13667ec681f3Smrg * (thanks to an LLVM pass in Mesa - TODO: move it to NIR) 13677ec681f3Smrg * The storage for eliminated outputs is also not allocated. 13687ec681f3Smrg * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM) 13697ec681f3Smrg * - TCS output stores are eliminated 13707ec681f3Smrg * 13717ec681f3Smrg * TODO: These are things the driver ignores in the final shader code 13727ec681f3Smrg * and relies on the default shader info. 13737ec681f3Smrg * - Other system values are not eliminated 13747ec681f3Smrg * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs 13757ec681f3Smrg * to remove holes 13767ec681f3Smrg * - uses_discard - if it changed to false 13777ec681f3Smrg * - writes_memory - if it changed to false 13787ec681f3Smrg * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not 13797ec681f3Smrg * eliminated 13807ec681f3Smrg * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS) 13817ec681f3Smrg * GS outputs are eliminated except for the temporary LDS. 13827ec681f3Smrg * Clip distances, gl_PointSize, and PS outputs are eliminated based 13837ec681f3Smrg * on current states, so we don't care about the shader code. 13847ec681f3Smrg * 13857ec681f3Smrg * TODO: Merged shaders don't inline uniforms for the first stage. 13867ec681f3Smrg * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS. 13877ec681f3Smrg * (key == NULL for the first stage here) 13887ec681f3Smrg * 13897ec681f3Smrg * TODO: Compute shaders don't support inlinable uniforms, because they 13907ec681f3Smrg * don't have shader variants. 13917ec681f3Smrg * 13927ec681f3Smrg * TODO: The driver uses a linear search to find a shader variant. This 13937ec681f3Smrg * can be really slow if we get too many variants due to uniform inlining. 13947ec681f3Smrg */ 13957ec681f3Smrg NIR_PASS_V(nir, nir_inline_uniforms, 13967ec681f3Smrg nir->info.num_inlinable_uniforms, 13977ec681f3Smrg key->opt.inlined_uniform_values, 13987ec681f3Smrg nir->info.inlinable_uniform_dw_offsets); 13997ec681f3Smrg 14007ec681f3Smrg si_nir_opts(sel->screen, nir, true); 14017ec681f3Smrg si_nir_late_opts(nir); 14027ec681f3Smrg 14037ec681f3Smrg /* This must be done again. */ 14047ec681f3Smrg NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in | 14057ec681f3Smrg nir_var_shader_out); 14067ec681f3Smrg } 14077ec681f3Smrg 14087ec681f3Smrg return nir; 14097ec681f3Smrg} 14107ec681f3Smrg 14117ec681f3Smrgbool si_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 14127ec681f3Smrg struct si_shader *shader, struct pipe_debug_callback *debug) 14137ec681f3Smrg{ 14147ec681f3Smrg struct si_shader_selector *sel = shader->selector; 14157ec681f3Smrg bool free_nir; 14167ec681f3Smrg struct nir_shader *nir = si_get_nir_shader(sel, &shader->key, &free_nir); 14177ec681f3Smrg 14187ec681f3Smrg /* Dump NIR before doing NIR->LLVM conversion in case the 14197ec681f3Smrg * conversion fails. */ 14207ec681f3Smrg if (si_can_dump_shader(sscreen, sel->info.stage) && 14217ec681f3Smrg !(sscreen->debug_flags & DBG(NO_NIR))) { 14227ec681f3Smrg nir_print_shader(nir, stderr); 14237ec681f3Smrg si_dump_streamout(&sel->so); 14247ec681f3Smrg } 14257ec681f3Smrg 14267ec681f3Smrg /* Initialize vs_output_ps_input_cntl to default. */ 14277ec681f3Smrg for (unsigned i = 0; i < ARRAY_SIZE(shader->info.vs_output_ps_input_cntl); i++) 14287ec681f3Smrg shader->info.vs_output_ps_input_cntl[i] = SI_PS_INPUT_CNTL_UNUSED; 14297ec681f3Smrg shader->info.vs_output_ps_input_cntl[VARYING_SLOT_COL0] = SI_PS_INPUT_CNTL_UNUSED_COLOR0; 14307ec681f3Smrg 14317ec681f3Smrg shader->info.uses_instanceid = sel->info.uses_instanceid; 14327ec681f3Smrg 14337ec681f3Smrg /* TODO: ACO could compile non-monolithic shaders here (starting 14347ec681f3Smrg * with PS and NGG VS), but monolithic shaders should be compiled 14357ec681f3Smrg * by LLVM due to more complicated compilation. 14367ec681f3Smrg */ 14377ec681f3Smrg if (!si_llvm_compile_shader(sscreen, compiler, shader, debug, nir, free_nir)) 14387ec681f3Smrg return false; 14397ec681f3Smrg 14407ec681f3Smrg /* Compute vs_output_ps_input_cntl. */ 14417ec681f3Smrg if ((sel->info.stage == MESA_SHADER_VERTEX || 14427ec681f3Smrg sel->info.stage == MESA_SHADER_TESS_EVAL || 14437ec681f3Smrg sel->info.stage == MESA_SHADER_GEOMETRY) && 14447ec681f3Smrg !shader->key.as_ls && !shader->key.as_es) { 14457ec681f3Smrg ubyte *vs_output_param_offset = shader->info.vs_output_param_offset; 14467ec681f3Smrg 14477ec681f3Smrg if (sel->info.stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) 14487ec681f3Smrg vs_output_param_offset = sel->gs_copy_shader->info.vs_output_param_offset; 14497ec681f3Smrg 14507ec681f3Smrg /* VS and TES should also set primitive ID output if it's used. */ 14517ec681f3Smrg unsigned num_outputs_with_prim_id = sel->info.num_outputs + 14527ec681f3Smrg shader->key.mono.u.vs_export_prim_id; 14537ec681f3Smrg 14547ec681f3Smrg for (unsigned i = 0; i < num_outputs_with_prim_id; i++) { 14557ec681f3Smrg unsigned semantic = sel->info.output_semantic[i]; 14567ec681f3Smrg unsigned offset = vs_output_param_offset[i]; 14577ec681f3Smrg unsigned ps_input_cntl; 14587ec681f3Smrg 14597ec681f3Smrg if (offset <= AC_EXP_PARAM_OFFSET_31) { 14607ec681f3Smrg /* The input is loaded from parameter memory. */ 14617ec681f3Smrg ps_input_cntl = S_028644_OFFSET(offset); 14627ec681f3Smrg } else { 14637ec681f3Smrg /* The input is a DEFAULT_VAL constant. */ 14647ec681f3Smrg assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && 14657ec681f3Smrg offset <= AC_EXP_PARAM_DEFAULT_VAL_1111); 14667ec681f3Smrg offset -= AC_EXP_PARAM_DEFAULT_VAL_0000; 14677ec681f3Smrg 14687ec681f3Smrg /* OFFSET=0x20 means that DEFAULT_VAL is used. */ 14697ec681f3Smrg ps_input_cntl = S_028644_OFFSET(0x20) | 14707ec681f3Smrg S_028644_DEFAULT_VAL(offset); 14717ec681f3Smrg } 14727ec681f3Smrg 14737ec681f3Smrg shader->info.vs_output_ps_input_cntl[semantic] = ps_input_cntl; 14747ec681f3Smrg } 14757ec681f3Smrg } 14767ec681f3Smrg 14777ec681f3Smrg /* Validate SGPR and VGPR usage for compute to detect compiler bugs. */ 14787ec681f3Smrg if (sel->info.stage == MESA_SHADER_COMPUTE) { 14797ec681f3Smrg unsigned wave_size = sscreen->compute_wave_size; 14807ec681f3Smrg unsigned max_vgprs = 14817ec681f3Smrg sscreen->info.num_physical_wave64_vgprs_per_simd * (wave_size == 32 ? 2 : 1); 14827ec681f3Smrg unsigned max_sgprs = sscreen->info.num_physical_sgprs_per_simd; 14837ec681f3Smrg unsigned max_sgprs_per_wave = 128; 14847ec681f3Smrg unsigned simds_per_tg = 4; /* assuming WGP mode on gfx10 */ 14857ec681f3Smrg unsigned threads_per_tg = si_get_max_workgroup_size(shader); 14867ec681f3Smrg unsigned waves_per_tg = DIV_ROUND_UP(threads_per_tg, wave_size); 14877ec681f3Smrg unsigned waves_per_simd = DIV_ROUND_UP(waves_per_tg, simds_per_tg); 14887ec681f3Smrg 14897ec681f3Smrg max_vgprs = max_vgprs / waves_per_simd; 14907ec681f3Smrg max_sgprs = MIN2(max_sgprs / waves_per_simd, max_sgprs_per_wave); 14917ec681f3Smrg 14927ec681f3Smrg if (shader->config.num_sgprs > max_sgprs || shader->config.num_vgprs > max_vgprs) { 14937ec681f3Smrg fprintf(stderr, 14947ec681f3Smrg "LLVM failed to compile a shader correctly: " 14957ec681f3Smrg "SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n", 14967ec681f3Smrg shader->config.num_sgprs, shader->config.num_vgprs, max_sgprs, max_vgprs); 14977ec681f3Smrg 14987ec681f3Smrg /* Just terminate the process, because dependent 14997ec681f3Smrg * shaders can hang due to bad input data, but use 15007ec681f3Smrg * the env var to allow shader-db to work. 15017ec681f3Smrg */ 15027ec681f3Smrg if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false)) 15037ec681f3Smrg abort(); 15047ec681f3Smrg } 15057ec681f3Smrg } 15067ec681f3Smrg 15077ec681f3Smrg /* Add the scratch offset to input SGPRs. */ 15087ec681f3Smrg if (shader->config.scratch_bytes_per_wave && !si_is_merged_shader(shader)) 15097ec681f3Smrg shader->info.num_input_sgprs += 1; /* scratch byte offset */ 15107ec681f3Smrg 15117ec681f3Smrg /* Calculate the number of fragment input VGPRs. */ 15127ec681f3Smrg if (sel->info.stage == MESA_SHADER_FRAGMENT) { 15137ec681f3Smrg shader->info.num_input_vgprs = ac_get_fs_input_vgpr_cnt( 15147ec681f3Smrg &shader->config, &shader->info.face_vgpr_index, &shader->info.ancillary_vgpr_index); 15157ec681f3Smrg } 15167ec681f3Smrg 15177ec681f3Smrg si_calculate_max_simd_waves(shader); 15187ec681f3Smrg si_shader_dump_stats_for_shader_db(sscreen, shader, debug); 15197ec681f3Smrg return true; 152001e04c3fSmrg} 152101e04c3fSmrg 152201e04c3fSmrg/** 152301e04c3fSmrg * Create, compile and return a shader part (prolog or epilog). 152401e04c3fSmrg * 152501e04c3fSmrg * \param sscreen screen 152601e04c3fSmrg * \param list list of shader parts of the same category 152701e04c3fSmrg * \param type shader type 152801e04c3fSmrg * \param key shader part key 152901e04c3fSmrg * \param prolog whether the part being requested is a prolog 153001e04c3fSmrg * \param tm LLVM target machine 153101e04c3fSmrg * \param debug debug callback 153201e04c3fSmrg * \param build the callback responsible for building the main function 153301e04c3fSmrg * \return non-NULL on success 153401e04c3fSmrg */ 153501e04c3fSmrgstatic struct si_shader_part * 15367ec681f3Smrgsi_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, 15377ec681f3Smrg gl_shader_stage stage, bool prolog, union si_shader_part_key *key, 15387ec681f3Smrg struct ac_llvm_compiler *compiler, struct pipe_debug_callback *debug, 15397ec681f3Smrg void (*build)(struct si_shader_context *, union si_shader_part_key *), 15407ec681f3Smrg const char *name) 15417ec681f3Smrg{ 15427ec681f3Smrg struct si_shader_part *result; 15437ec681f3Smrg 15447ec681f3Smrg simple_mtx_lock(&sscreen->shader_parts_mutex); 15457ec681f3Smrg 15467ec681f3Smrg /* Find existing. */ 15477ec681f3Smrg for (result = *list; result; result = result->next) { 15487ec681f3Smrg if (memcmp(&result->key, key, sizeof(*key)) == 0) { 15497ec681f3Smrg simple_mtx_unlock(&sscreen->shader_parts_mutex); 15507ec681f3Smrg return result; 15517ec681f3Smrg } 15527ec681f3Smrg } 15537ec681f3Smrg 15547ec681f3Smrg /* Compile a new one. */ 15557ec681f3Smrg result = CALLOC_STRUCT(si_shader_part); 15567ec681f3Smrg result->key = *key; 15577ec681f3Smrg 15587ec681f3Smrg struct si_shader_selector sel = {}; 15597ec681f3Smrg sel.screen = sscreen; 15607ec681f3Smrg 15617ec681f3Smrg struct si_shader shader = {}; 15627ec681f3Smrg shader.selector = &sel; 15637ec681f3Smrg 15647ec681f3Smrg switch (stage) { 15657ec681f3Smrg case MESA_SHADER_VERTEX: 15667ec681f3Smrg shader.key.as_ls = key->vs_prolog.as_ls; 15677ec681f3Smrg shader.key.as_es = key->vs_prolog.as_es; 15687ec681f3Smrg shader.key.as_ngg = key->vs_prolog.as_ngg; 15697ec681f3Smrg break; 15707ec681f3Smrg case MESA_SHADER_TESS_CTRL: 15717ec681f3Smrg assert(!prolog); 15727ec681f3Smrg shader.key.part.tcs.epilog = key->tcs_epilog.states; 15737ec681f3Smrg break; 15747ec681f3Smrg case MESA_SHADER_GEOMETRY: 15757ec681f3Smrg assert(prolog); 15767ec681f3Smrg shader.key.as_ngg = key->gs_prolog.as_ngg; 15777ec681f3Smrg break; 15787ec681f3Smrg case MESA_SHADER_FRAGMENT: 15797ec681f3Smrg if (prolog) 15807ec681f3Smrg shader.key.part.ps.prolog = key->ps_prolog.states; 15817ec681f3Smrg else 15827ec681f3Smrg shader.key.part.ps.epilog = key->ps_epilog.states; 15837ec681f3Smrg break; 15847ec681f3Smrg default: 15857ec681f3Smrg unreachable("bad shader part"); 15867ec681f3Smrg } 15877ec681f3Smrg 15887ec681f3Smrg struct si_shader_context ctx; 15897ec681f3Smrg si_llvm_context_init(&ctx, sscreen, compiler, 15907ec681f3Smrg si_get_wave_size(sscreen, stage, 15917ec681f3Smrg shader.key.as_ngg, shader.key.as_es)); 15927ec681f3Smrg ctx.shader = &shader; 15937ec681f3Smrg ctx.stage = stage; 15947ec681f3Smrg 15957ec681f3Smrg build(&ctx, key); 15967ec681f3Smrg 15977ec681f3Smrg /* Compile. */ 15987ec681f3Smrg si_llvm_optimize_module(&ctx); 15997ec681f3Smrg 16007ec681f3Smrg if (!si_compile_llvm(sscreen, &result->binary, &result->config, compiler, &ctx.ac, debug, 16017ec681f3Smrg ctx.stage, name, false)) { 16027ec681f3Smrg FREE(result); 16037ec681f3Smrg result = NULL; 16047ec681f3Smrg goto out; 16057ec681f3Smrg } 16067ec681f3Smrg 16077ec681f3Smrg result->next = *list; 16087ec681f3Smrg *list = result; 160901e04c3fSmrg 161001e04c3fSmrgout: 16117ec681f3Smrg si_llvm_dispose(&ctx); 16127ec681f3Smrg simple_mtx_unlock(&sscreen->shader_parts_mutex); 16137ec681f3Smrg return result; 1614af69d88dSmrg} 1615af69d88dSmrg 16167ec681f3Smrgstatic bool si_get_vs_prolog(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 16177ec681f3Smrg struct si_shader *shader, struct pipe_debug_callback *debug, 16187ec681f3Smrg struct si_shader *main_part, const struct si_vs_prolog_bits *key) 1619af69d88dSmrg{ 16207ec681f3Smrg struct si_shader_selector *vs = main_part->selector; 162101e04c3fSmrg 16227ec681f3Smrg if (!si_vs_needs_prolog(vs, key, &shader->key, false)) 16237ec681f3Smrg return true; 1624af69d88dSmrg 16257ec681f3Smrg /* Get the prolog. */ 16267ec681f3Smrg union si_shader_part_key prolog_key; 16277ec681f3Smrg si_get_vs_prolog_key(&vs->info, main_part->info.num_input_sgprs, false, key, shader, 16287ec681f3Smrg &prolog_key); 162901e04c3fSmrg 16307ec681f3Smrg shader->prolog = 16317ec681f3Smrg si_get_shader_part(sscreen, &sscreen->vs_prologs, MESA_SHADER_VERTEX, true, &prolog_key, 16327ec681f3Smrg compiler, debug, si_llvm_build_vs_prolog, "Vertex Shader Prolog"); 16337ec681f3Smrg return shader->prolog != NULL; 1634af69d88dSmrg} 1635af69d88dSmrg 163601e04c3fSmrg/** 163701e04c3fSmrg * Select and compile (or reuse) vertex shader parts (prolog & epilog). 163801e04c3fSmrg */ 16397ec681f3Smrgstatic bool si_shader_select_vs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 16407ec681f3Smrg struct si_shader *shader, struct pipe_debug_callback *debug) 1641af69d88dSmrg{ 16427ec681f3Smrg return si_get_vs_prolog(sscreen, compiler, shader, debug, shader, &shader->key.part.vs.prolog); 164301e04c3fSmrg} 1644af69d88dSmrg 164501e04c3fSmrg/** 16467ec681f3Smrg * Select and compile (or reuse) TCS parts (epilog). 164701e04c3fSmrg */ 16487ec681f3Smrgstatic bool si_shader_select_tcs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 16497ec681f3Smrg struct si_shader *shader, struct pipe_debug_callback *debug) 165001e04c3fSmrg{ 16517ec681f3Smrg if (sscreen->info.chip_class >= GFX9) { 16527ec681f3Smrg struct si_shader *ls_main_part = shader->key.part.tcs.ls->main_shader_part_ls; 16537ec681f3Smrg 16547ec681f3Smrg if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part, 16557ec681f3Smrg &shader->key.part.tcs.ls_prolog)) 16567ec681f3Smrg return false; 16577ec681f3Smrg 16587ec681f3Smrg shader->previous_stage = ls_main_part; 16597ec681f3Smrg } 16607ec681f3Smrg 16617ec681f3Smrg /* Get the epilog. */ 16627ec681f3Smrg union si_shader_part_key epilog_key; 16637ec681f3Smrg memset(&epilog_key, 0, sizeof(epilog_key)); 16647ec681f3Smrg epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog; 16657ec681f3Smrg 16667ec681f3Smrg shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs, MESA_SHADER_TESS_CTRL, false, 16677ec681f3Smrg &epilog_key, compiler, debug, si_llvm_build_tcs_epilog, 16687ec681f3Smrg "Tessellation Control Shader Epilog"); 16697ec681f3Smrg return shader->epilog != NULL; 167001e04c3fSmrg} 1671af69d88dSmrg 167201e04c3fSmrg/** 16737ec681f3Smrg * Select and compile (or reuse) GS parts (prolog). 167401e04c3fSmrg */ 16757ec681f3Smrgstatic bool si_shader_select_gs_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 16767ec681f3Smrg struct si_shader *shader, struct pipe_debug_callback *debug) 167701e04c3fSmrg{ 16787ec681f3Smrg if (sscreen->info.chip_class >= GFX9) { 16797ec681f3Smrg struct si_shader *es_main_part; 16807ec681f3Smrg 16817ec681f3Smrg if (shader->key.as_ngg) 16827ec681f3Smrg es_main_part = shader->key.part.gs.es->main_shader_part_ngg_es; 16837ec681f3Smrg else 16847ec681f3Smrg es_main_part = shader->key.part.gs.es->main_shader_part_es; 16857ec681f3Smrg 16867ec681f3Smrg if (shader->key.part.gs.es->info.stage == MESA_SHADER_VERTEX && 16877ec681f3Smrg !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part, 16887ec681f3Smrg &shader->key.part.gs.vs_prolog)) 16897ec681f3Smrg return false; 16907ec681f3Smrg 16917ec681f3Smrg shader->previous_stage = es_main_part; 16927ec681f3Smrg } 16937ec681f3Smrg 16947ec681f3Smrg if (!shader->key.part.gs.prolog.tri_strip_adj_fix) 16957ec681f3Smrg return true; 16967ec681f3Smrg 16977ec681f3Smrg union si_shader_part_key prolog_key; 16987ec681f3Smrg memset(&prolog_key, 0, sizeof(prolog_key)); 16997ec681f3Smrg prolog_key.gs_prolog.states = shader->key.part.gs.prolog; 17007ec681f3Smrg prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; 17017ec681f3Smrg 17027ec681f3Smrg shader->prolog2 = 17037ec681f3Smrg si_get_shader_part(sscreen, &sscreen->gs_prologs, MESA_SHADER_GEOMETRY, true, &prolog_key, 17047ec681f3Smrg compiler, debug, si_llvm_build_gs_prolog, "Geometry Shader Prolog"); 17057ec681f3Smrg return shader->prolog2 != NULL; 1706af69d88dSmrg} 1707af69d88dSmrg 170801e04c3fSmrg/** 17097ec681f3Smrg * Compute the PS prolog key, which contains all the information needed to 17107ec681f3Smrg * build the PS prolog function, and set related bits in shader->config. 171101e04c3fSmrg */ 17127ec681f3Smrgvoid si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key, 17137ec681f3Smrg bool separate_prolog) 17147ec681f3Smrg{ 17157ec681f3Smrg struct si_shader_info *info = &shader->selector->info; 17167ec681f3Smrg 17177ec681f3Smrg memset(key, 0, sizeof(*key)); 17187ec681f3Smrg key->ps_prolog.states = shader->key.part.ps.prolog; 17197ec681f3Smrg key->ps_prolog.colors_read = info->colors_read; 17207ec681f3Smrg key->ps_prolog.num_input_sgprs = shader->info.num_input_sgprs; 17217ec681f3Smrg key->ps_prolog.num_input_vgprs = shader->info.num_input_vgprs; 17227ec681f3Smrg key->ps_prolog.wqm = 17237ec681f3Smrg info->base.fs.needs_quad_helper_invocations && 17247ec681f3Smrg (key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp || 17257ec681f3Smrg key->ps_prolog.states.force_linear_sample_interp || 17267ec681f3Smrg key->ps_prolog.states.force_persp_center_interp || 17277ec681f3Smrg key->ps_prolog.states.force_linear_center_interp || 17287ec681f3Smrg key->ps_prolog.states.bc_optimize_for_persp || key->ps_prolog.states.bc_optimize_for_linear); 17297ec681f3Smrg key->ps_prolog.ancillary_vgpr_index = shader->info.ancillary_vgpr_index; 17307ec681f3Smrg 17317ec681f3Smrg if (info->colors_read) { 17327ec681f3Smrg ubyte *color = shader->selector->color_attr_index; 17337ec681f3Smrg 17347ec681f3Smrg if (shader->key.part.ps.prolog.color_two_side) { 17357ec681f3Smrg /* BCOLORs are stored after the last input. */ 17367ec681f3Smrg key->ps_prolog.num_interp_inputs = info->num_inputs; 17377ec681f3Smrg key->ps_prolog.face_vgpr_index = shader->info.face_vgpr_index; 17387ec681f3Smrg if (separate_prolog) 17397ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_FRONT_FACE_ENA(1); 17407ec681f3Smrg } 17417ec681f3Smrg 17427ec681f3Smrg for (unsigned i = 0; i < 2; i++) { 17437ec681f3Smrg unsigned interp = info->color_interpolate[i]; 17447ec681f3Smrg unsigned location = info->color_interpolate_loc[i]; 17457ec681f3Smrg 17467ec681f3Smrg if (!(info->colors_read & (0xf << i * 4))) 17477ec681f3Smrg continue; 17487ec681f3Smrg 17497ec681f3Smrg key->ps_prolog.color_attr_index[i] = color[i]; 17507ec681f3Smrg 17517ec681f3Smrg if (shader->key.part.ps.prolog.flatshade_colors && interp == INTERP_MODE_COLOR) 17527ec681f3Smrg interp = INTERP_MODE_FLAT; 17537ec681f3Smrg 17547ec681f3Smrg switch (interp) { 17557ec681f3Smrg case INTERP_MODE_FLAT: 17567ec681f3Smrg key->ps_prolog.color_interp_vgpr_index[i] = -1; 17577ec681f3Smrg break; 17587ec681f3Smrg case INTERP_MODE_SMOOTH: 17597ec681f3Smrg case INTERP_MODE_COLOR: 17607ec681f3Smrg /* Force the interpolation location for colors here. */ 17617ec681f3Smrg if (shader->key.part.ps.prolog.force_persp_sample_interp) 17627ec681f3Smrg location = TGSI_INTERPOLATE_LOC_SAMPLE; 17637ec681f3Smrg if (shader->key.part.ps.prolog.force_persp_center_interp) 17647ec681f3Smrg location = TGSI_INTERPOLATE_LOC_CENTER; 17657ec681f3Smrg 17667ec681f3Smrg switch (location) { 17677ec681f3Smrg case TGSI_INTERPOLATE_LOC_SAMPLE: 17687ec681f3Smrg key->ps_prolog.color_interp_vgpr_index[i] = 0; 17697ec681f3Smrg if (separate_prolog) { 17707ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); 17717ec681f3Smrg } 17727ec681f3Smrg break; 17737ec681f3Smrg case TGSI_INTERPOLATE_LOC_CENTER: 17747ec681f3Smrg key->ps_prolog.color_interp_vgpr_index[i] = 2; 17757ec681f3Smrg if (separate_prolog) { 17767ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 17777ec681f3Smrg } 17787ec681f3Smrg break; 17797ec681f3Smrg case TGSI_INTERPOLATE_LOC_CENTROID: 17807ec681f3Smrg key->ps_prolog.color_interp_vgpr_index[i] = 4; 17817ec681f3Smrg if (separate_prolog) { 17827ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTROID_ENA(1); 17837ec681f3Smrg } 17847ec681f3Smrg break; 17857ec681f3Smrg default: 17867ec681f3Smrg assert(0); 17877ec681f3Smrg } 17887ec681f3Smrg break; 17897ec681f3Smrg case INTERP_MODE_NOPERSPECTIVE: 17907ec681f3Smrg /* Force the interpolation location for colors here. */ 17917ec681f3Smrg if (shader->key.part.ps.prolog.force_linear_sample_interp) 17927ec681f3Smrg location = TGSI_INTERPOLATE_LOC_SAMPLE; 17937ec681f3Smrg if (shader->key.part.ps.prolog.force_linear_center_interp) 17947ec681f3Smrg location = TGSI_INTERPOLATE_LOC_CENTER; 17957ec681f3Smrg 17967ec681f3Smrg /* The VGPR assignment for non-monolithic shaders 17977ec681f3Smrg * works because InitialPSInputAddr is set on the 17987ec681f3Smrg * main shader and PERSP_PULL_MODEL is never used. 17997ec681f3Smrg */ 18007ec681f3Smrg switch (location) { 18017ec681f3Smrg case TGSI_INTERPOLATE_LOC_SAMPLE: 18027ec681f3Smrg key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 6 : 9; 18037ec681f3Smrg if (separate_prolog) { 18047ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); 18057ec681f3Smrg } 18067ec681f3Smrg break; 18077ec681f3Smrg case TGSI_INTERPOLATE_LOC_CENTER: 18087ec681f3Smrg key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 8 : 11; 18097ec681f3Smrg if (separate_prolog) { 18107ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 18117ec681f3Smrg } 18127ec681f3Smrg break; 18137ec681f3Smrg case TGSI_INTERPOLATE_LOC_CENTROID: 18147ec681f3Smrg key->ps_prolog.color_interp_vgpr_index[i] = separate_prolog ? 10 : 13; 18157ec681f3Smrg if (separate_prolog) { 18167ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTROID_ENA(1); 18177ec681f3Smrg } 18187ec681f3Smrg break; 18197ec681f3Smrg default: 18207ec681f3Smrg assert(0); 18217ec681f3Smrg } 18227ec681f3Smrg break; 18237ec681f3Smrg default: 18247ec681f3Smrg assert(0); 18257ec681f3Smrg } 18267ec681f3Smrg } 18277ec681f3Smrg } 182801e04c3fSmrg} 1829af69d88dSmrg 183001e04c3fSmrg/** 18317ec681f3Smrg * Check whether a PS prolog is required based on the key. 183201e04c3fSmrg */ 18337ec681f3Smrgbool si_need_ps_prolog(const union si_shader_part_key *key) 183401e04c3fSmrg{ 18357ec681f3Smrg return key->ps_prolog.colors_read || key->ps_prolog.states.force_persp_sample_interp || 18367ec681f3Smrg key->ps_prolog.states.force_linear_sample_interp || 18377ec681f3Smrg key->ps_prolog.states.force_persp_center_interp || 18387ec681f3Smrg key->ps_prolog.states.force_linear_center_interp || 18397ec681f3Smrg key->ps_prolog.states.bc_optimize_for_persp || 18407ec681f3Smrg key->ps_prolog.states.bc_optimize_for_linear || key->ps_prolog.states.poly_stipple || 18417ec681f3Smrg key->ps_prolog.states.samplemask_log_ps_iter; 184201e04c3fSmrg} 1843af69d88dSmrg 184401e04c3fSmrg/** 18457ec681f3Smrg * Compute the PS epilog key, which contains all the information needed to 18467ec681f3Smrg * build the PS epilog function. 184701e04c3fSmrg */ 18487ec681f3Smrgvoid si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key) 184901e04c3fSmrg{ 18507ec681f3Smrg struct si_shader_info *info = &shader->selector->info; 18517ec681f3Smrg memset(key, 0, sizeof(*key)); 18527ec681f3Smrg key->ps_epilog.colors_written = info->colors_written; 18537ec681f3Smrg key->ps_epilog.color_types = info->output_color_types; 18547ec681f3Smrg key->ps_epilog.writes_z = info->writes_z; 18557ec681f3Smrg key->ps_epilog.writes_stencil = info->writes_stencil; 18567ec681f3Smrg key->ps_epilog.writes_samplemask = info->writes_samplemask; 18577ec681f3Smrg key->ps_epilog.states = shader->key.part.ps.epilog; 1858af69d88dSmrg} 1859af69d88dSmrg 186001e04c3fSmrg/** 186101e04c3fSmrg * Select and compile (or reuse) pixel shader parts (prolog & epilog). 186201e04c3fSmrg */ 18637ec681f3Smrgstatic bool si_shader_select_ps_parts(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 18647ec681f3Smrg struct si_shader *shader, struct pipe_debug_callback *debug) 18657ec681f3Smrg{ 18667ec681f3Smrg union si_shader_part_key prolog_key; 18677ec681f3Smrg union si_shader_part_key epilog_key; 18687ec681f3Smrg 18697ec681f3Smrg /* Get the prolog. */ 18707ec681f3Smrg si_get_ps_prolog_key(shader, &prolog_key, true); 18717ec681f3Smrg 18727ec681f3Smrg /* The prolog is a no-op if these aren't set. */ 18737ec681f3Smrg if (si_need_ps_prolog(&prolog_key)) { 18747ec681f3Smrg shader->prolog = 18757ec681f3Smrg si_get_shader_part(sscreen, &sscreen->ps_prologs, MESA_SHADER_FRAGMENT, true, &prolog_key, 18767ec681f3Smrg compiler, debug, si_llvm_build_ps_prolog, "Fragment Shader Prolog"); 18777ec681f3Smrg if (!shader->prolog) 18787ec681f3Smrg return false; 18797ec681f3Smrg } 18807ec681f3Smrg 18817ec681f3Smrg /* Get the epilog. */ 18827ec681f3Smrg si_get_ps_epilog_key(shader, &epilog_key); 18837ec681f3Smrg 18847ec681f3Smrg shader->epilog = 18857ec681f3Smrg si_get_shader_part(sscreen, &sscreen->ps_epilogs, MESA_SHADER_FRAGMENT, false, &epilog_key, 18867ec681f3Smrg compiler, debug, si_llvm_build_ps_epilog, "Fragment Shader Epilog"); 18877ec681f3Smrg if (!shader->epilog) 18887ec681f3Smrg return false; 18897ec681f3Smrg 18907ec681f3Smrg /* Enable POS_FIXED_PT if polygon stippling is enabled. */ 18917ec681f3Smrg if (shader->key.part.ps.prolog.poly_stipple) { 18927ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_POS_FIXED_PT_ENA(1); 18937ec681f3Smrg assert(G_0286CC_POS_FIXED_PT_ENA(shader->config.spi_ps_input_addr)); 18947ec681f3Smrg } 18957ec681f3Smrg 18967ec681f3Smrg /* Set up the enable bits for per-sample shading if needed. */ 18977ec681f3Smrg if (shader->key.part.ps.prolog.force_persp_sample_interp && 18987ec681f3Smrg (G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_ena) || 18997ec681f3Smrg G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 19007ec681f3Smrg shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTER_ENA; 19017ec681f3Smrg shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 19027ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_PERSP_SAMPLE_ENA(1); 19037ec681f3Smrg } 19047ec681f3Smrg if (shader->key.part.ps.prolog.force_linear_sample_interp && 19057ec681f3Smrg (G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_ena) || 19067ec681f3Smrg G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 19077ec681f3Smrg shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTER_ENA; 19087ec681f3Smrg shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 19097ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_SAMPLE_ENA(1); 19107ec681f3Smrg } 19117ec681f3Smrg if (shader->key.part.ps.prolog.force_persp_center_interp && 19127ec681f3Smrg (G_0286CC_PERSP_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 19137ec681f3Smrg G_0286CC_PERSP_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 19147ec681f3Smrg shader->config.spi_ps_input_ena &= C_0286CC_PERSP_SAMPLE_ENA; 19157ec681f3Smrg shader->config.spi_ps_input_ena &= C_0286CC_PERSP_CENTROID_ENA; 19167ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 19177ec681f3Smrg } 19187ec681f3Smrg if (shader->key.part.ps.prolog.force_linear_center_interp && 19197ec681f3Smrg (G_0286CC_LINEAR_SAMPLE_ENA(shader->config.spi_ps_input_ena) || 19207ec681f3Smrg G_0286CC_LINEAR_CENTROID_ENA(shader->config.spi_ps_input_ena))) { 19217ec681f3Smrg shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_SAMPLE_ENA; 19227ec681f3Smrg shader->config.spi_ps_input_ena &= C_0286CC_LINEAR_CENTROID_ENA; 19237ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 19247ec681f3Smrg } 19257ec681f3Smrg 19267ec681f3Smrg /* POW_W_FLOAT requires that one of the perspective weights is enabled. */ 19277ec681f3Smrg if (G_0286CC_POS_W_FLOAT_ENA(shader->config.spi_ps_input_ena) && 19287ec681f3Smrg !(shader->config.spi_ps_input_ena & 0xf)) { 19297ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_PERSP_CENTER_ENA(1); 19307ec681f3Smrg assert(G_0286CC_PERSP_CENTER_ENA(shader->config.spi_ps_input_addr)); 19317ec681f3Smrg } 19327ec681f3Smrg 19337ec681f3Smrg /* At least one pair of interpolation weights must be enabled. */ 19347ec681f3Smrg if (!(shader->config.spi_ps_input_ena & 0x7f)) { 19357ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_LINEAR_CENTER_ENA(1); 19367ec681f3Smrg assert(G_0286CC_LINEAR_CENTER_ENA(shader->config.spi_ps_input_addr)); 19377ec681f3Smrg } 19387ec681f3Smrg 19397ec681f3Smrg /* Samplemask fixup requires the sample ID. */ 19407ec681f3Smrg if (shader->key.part.ps.prolog.samplemask_log_ps_iter) { 19417ec681f3Smrg shader->config.spi_ps_input_ena |= S_0286CC_ANCILLARY_ENA(1); 19427ec681f3Smrg assert(G_0286CC_ANCILLARY_ENA(shader->config.spi_ps_input_addr)); 19437ec681f3Smrg } 19447ec681f3Smrg 19457ec681f3Smrg /* The sample mask input is always enabled, because the API shader always 19467ec681f3Smrg * passes it through to the epilog. Disable it here if it's unused. 19477ec681f3Smrg */ 19487ec681f3Smrg if (!shader->key.part.ps.epilog.poly_line_smoothing && !shader->selector->info.reads_samplemask) 19497ec681f3Smrg shader->config.spi_ps_input_ena &= C_0286CC_SAMPLE_COVERAGE_ENA; 19507ec681f3Smrg 19517ec681f3Smrg return true; 19527ec681f3Smrg} 19537ec681f3Smrg 19547ec681f3Smrgvoid si_multiwave_lds_size_workaround(struct si_screen *sscreen, unsigned *lds_size) 19557ec681f3Smrg{ 19567ec681f3Smrg /* If tessellation is all offchip and on-chip GS isn't used, this 19577ec681f3Smrg * workaround is not needed. 19587ec681f3Smrg */ 19597ec681f3Smrg return; 19607ec681f3Smrg 19617ec681f3Smrg /* SPI barrier management bug: 19627ec681f3Smrg * Make sure we have at least 4k of LDS in use to avoid the bug. 19637ec681f3Smrg * It applies to workgroup sizes of more than one wavefront. 19647ec681f3Smrg */ 19657ec681f3Smrg if (sscreen->info.family == CHIP_BONAIRE || sscreen->info.family == CHIP_KABINI) 19667ec681f3Smrg *lds_size = MAX2(*lds_size, 8); 19677ec681f3Smrg} 19687ec681f3Smrg 19697ec681f3Smrgvoid si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader) 19707ec681f3Smrg{ 19717ec681f3Smrg unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ 19727ec681f3Smrg 19737ec681f3Smrg shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); 19747ec681f3Smrg 19757ec681f3Smrg if (shader->selector->info.stage == MESA_SHADER_COMPUTE && 19767ec681f3Smrg si_get_max_workgroup_size(shader) > sscreen->compute_wave_size) { 19777ec681f3Smrg si_multiwave_lds_size_workaround(sscreen, &shader->config.lds_size); 19787ec681f3Smrg } 19797ec681f3Smrg} 19807ec681f3Smrg 19817ec681f3Smrgbool si_create_shader_variant(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, 19827ec681f3Smrg struct si_shader *shader, struct pipe_debug_callback *debug) 19837ec681f3Smrg{ 19847ec681f3Smrg struct si_shader_selector *sel = shader->selector; 19857ec681f3Smrg struct si_shader *mainp = *si_get_main_shader_part(sel, &shader->key); 19867ec681f3Smrg 19877ec681f3Smrg /* LS, ES, VS are compiled on demand if the main part hasn't been 19887ec681f3Smrg * compiled for that stage. 19897ec681f3Smrg * 19907ec681f3Smrg * GS are compiled on demand if the main part hasn't been compiled 19917ec681f3Smrg * for the chosen NGG-ness. 19927ec681f3Smrg * 19937ec681f3Smrg * Vertex shaders are compiled on demand when a vertex fetch 19947ec681f3Smrg * workaround must be applied. 19957ec681f3Smrg */ 19967ec681f3Smrg if (shader->is_monolithic) { 19977ec681f3Smrg /* Monolithic shader (compiled as a whole, has many variants, 19987ec681f3Smrg * may take a long time to compile). 19997ec681f3Smrg */ 20007ec681f3Smrg if (!si_compile_shader(sscreen, compiler, shader, debug)) 20017ec681f3Smrg return false; 20027ec681f3Smrg } else { 20037ec681f3Smrg /* The shader consists of several parts: 20047ec681f3Smrg * 20057ec681f3Smrg * - the middle part is the user shader, it has 1 variant only 20067ec681f3Smrg * and it was compiled during the creation of the shader 20077ec681f3Smrg * selector 20087ec681f3Smrg * - the prolog part is inserted at the beginning 20097ec681f3Smrg * - the epilog part is inserted at the end 20107ec681f3Smrg * 20117ec681f3Smrg * The prolog and epilog have many (but simple) variants. 20127ec681f3Smrg * 20137ec681f3Smrg * Starting with gfx9, geometry and tessellation control 20147ec681f3Smrg * shaders also contain the prolog and user shader parts of 20157ec681f3Smrg * the previous shader stage. 20167ec681f3Smrg */ 20177ec681f3Smrg 20187ec681f3Smrg if (!mainp) 20197ec681f3Smrg return false; 20207ec681f3Smrg 20217ec681f3Smrg /* Copy the compiled shader data over. */ 20227ec681f3Smrg shader->is_binary_shared = true; 20237ec681f3Smrg shader->binary = mainp->binary; 20247ec681f3Smrg shader->config = mainp->config; 20257ec681f3Smrg shader->info.num_input_sgprs = mainp->info.num_input_sgprs; 20267ec681f3Smrg shader->info.num_input_vgprs = mainp->info.num_input_vgprs; 20277ec681f3Smrg shader->info.face_vgpr_index = mainp->info.face_vgpr_index; 20287ec681f3Smrg shader->info.ancillary_vgpr_index = mainp->info.ancillary_vgpr_index; 20297ec681f3Smrg memcpy(shader->info.vs_output_ps_input_cntl, mainp->info.vs_output_ps_input_cntl, 20307ec681f3Smrg sizeof(mainp->info.vs_output_ps_input_cntl)); 20317ec681f3Smrg shader->info.uses_instanceid = mainp->info.uses_instanceid; 20327ec681f3Smrg shader->info.nr_pos_exports = mainp->info.nr_pos_exports; 20337ec681f3Smrg shader->info.nr_param_exports = mainp->info.nr_param_exports; 20347ec681f3Smrg 20357ec681f3Smrg /* Select prologs and/or epilogs. */ 20367ec681f3Smrg switch (sel->info.stage) { 20377ec681f3Smrg case MESA_SHADER_VERTEX: 20387ec681f3Smrg if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug)) 20397ec681f3Smrg return false; 20407ec681f3Smrg break; 20417ec681f3Smrg case MESA_SHADER_TESS_CTRL: 20427ec681f3Smrg if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug)) 20437ec681f3Smrg return false; 20447ec681f3Smrg break; 20457ec681f3Smrg case MESA_SHADER_TESS_EVAL: 20467ec681f3Smrg break; 20477ec681f3Smrg case MESA_SHADER_GEOMETRY: 20487ec681f3Smrg if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug)) 20497ec681f3Smrg return false; 20507ec681f3Smrg break; 20517ec681f3Smrg case MESA_SHADER_FRAGMENT: 20527ec681f3Smrg if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug)) 20537ec681f3Smrg return false; 20547ec681f3Smrg 20557ec681f3Smrg /* Make sure we have at least as many VGPRs as there 20567ec681f3Smrg * are allocated inputs. 20577ec681f3Smrg */ 20587ec681f3Smrg shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->info.num_input_vgprs); 20597ec681f3Smrg break; 20607ec681f3Smrg default:; 20617ec681f3Smrg } 20627ec681f3Smrg 20637ec681f3Smrg /* Update SGPR and VGPR counts. */ 20647ec681f3Smrg if (shader->prolog) { 20657ec681f3Smrg shader->config.num_sgprs = 20667ec681f3Smrg MAX2(shader->config.num_sgprs, shader->prolog->config.num_sgprs); 20677ec681f3Smrg shader->config.num_vgprs = 20687ec681f3Smrg MAX2(shader->config.num_vgprs, shader->prolog->config.num_vgprs); 20697ec681f3Smrg } 20707ec681f3Smrg if (shader->previous_stage) { 20717ec681f3Smrg shader->config.num_sgprs = 20727ec681f3Smrg MAX2(shader->config.num_sgprs, shader->previous_stage->config.num_sgprs); 20737ec681f3Smrg shader->config.num_vgprs = 20747ec681f3Smrg MAX2(shader->config.num_vgprs, shader->previous_stage->config.num_vgprs); 20757ec681f3Smrg shader->config.spilled_sgprs = 20767ec681f3Smrg MAX2(shader->config.spilled_sgprs, shader->previous_stage->config.spilled_sgprs); 20777ec681f3Smrg shader->config.spilled_vgprs = 20787ec681f3Smrg MAX2(shader->config.spilled_vgprs, shader->previous_stage->config.spilled_vgprs); 20797ec681f3Smrg shader->info.private_mem_vgprs = 20807ec681f3Smrg MAX2(shader->info.private_mem_vgprs, shader->previous_stage->info.private_mem_vgprs); 20817ec681f3Smrg shader->config.scratch_bytes_per_wave = 20827ec681f3Smrg MAX2(shader->config.scratch_bytes_per_wave, 20837ec681f3Smrg shader->previous_stage->config.scratch_bytes_per_wave); 20847ec681f3Smrg shader->info.uses_instanceid |= shader->previous_stage->info.uses_instanceid; 20857ec681f3Smrg } 20867ec681f3Smrg if (shader->prolog2) { 20877ec681f3Smrg shader->config.num_sgprs = 20887ec681f3Smrg MAX2(shader->config.num_sgprs, shader->prolog2->config.num_sgprs); 20897ec681f3Smrg shader->config.num_vgprs = 20907ec681f3Smrg MAX2(shader->config.num_vgprs, shader->prolog2->config.num_vgprs); 20917ec681f3Smrg } 20927ec681f3Smrg if (shader->epilog) { 20937ec681f3Smrg shader->config.num_sgprs = 20947ec681f3Smrg MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs); 20957ec681f3Smrg shader->config.num_vgprs = 20967ec681f3Smrg MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs); 20977ec681f3Smrg } 20987ec681f3Smrg si_calculate_max_simd_waves(shader); 20997ec681f3Smrg } 21007ec681f3Smrg 21017ec681f3Smrg if (shader->key.as_ngg) { 21027ec681f3Smrg assert(!shader->key.as_es && !shader->key.as_ls); 21037ec681f3Smrg if (!gfx10_ngg_calculate_subgroup_info(shader)) { 21047ec681f3Smrg fprintf(stderr, "Failed to compute subgroup info\n"); 21057ec681f3Smrg return false; 21067ec681f3Smrg } 21077ec681f3Smrg } else if (sscreen->info.chip_class >= GFX9 && sel->info.stage == MESA_SHADER_GEOMETRY) { 21087ec681f3Smrg gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info); 21097ec681f3Smrg } 21107ec681f3Smrg 21117ec681f3Smrg shader->uses_vs_state_provoking_vertex = 21127ec681f3Smrg sscreen->use_ngg && 21137ec681f3Smrg /* Used to convert triangle strips from GS to triangles. */ 21147ec681f3Smrg ((sel->info.stage == MESA_SHADER_GEOMETRY && 21157ec681f3Smrg util_rast_prim_is_triangles(sel->info.base.gs.output_primitive)) || 21167ec681f3Smrg (sel->info.stage == MESA_SHADER_VERTEX && 21177ec681f3Smrg /* Used to export PrimitiveID from the correct vertex. */ 21187ec681f3Smrg shader->key.mono.u.vs_export_prim_id)); 21197ec681f3Smrg 21207ec681f3Smrg shader->uses_vs_state_outprim = sscreen->use_ngg && 21217ec681f3Smrg /* Only used by streamout in vertex shaders. */ 21227ec681f3Smrg sel->info.stage == MESA_SHADER_VERTEX && 21237ec681f3Smrg sel->so.num_outputs; 21247ec681f3Smrg 21257ec681f3Smrg if (sel->info.stage == MESA_SHADER_VERTEX) { 21267ec681f3Smrg shader->uses_base_instance = sel->info.uses_base_instance || 21277ec681f3Smrg shader->key.part.vs.prolog.instance_divisor_is_one || 21287ec681f3Smrg shader->key.part.vs.prolog.instance_divisor_is_fetched; 21297ec681f3Smrg } else if (sel->info.stage == MESA_SHADER_TESS_CTRL) { 21307ec681f3Smrg shader->uses_base_instance = shader->previous_stage_sel && 21317ec681f3Smrg (shader->previous_stage_sel->info.uses_base_instance || 21327ec681f3Smrg shader->key.part.tcs.ls_prolog.instance_divisor_is_one || 21337ec681f3Smrg shader->key.part.tcs.ls_prolog.instance_divisor_is_fetched); 21347ec681f3Smrg } else if (sel->info.stage == MESA_SHADER_GEOMETRY) { 21357ec681f3Smrg shader->uses_base_instance = shader->previous_stage_sel && 21367ec681f3Smrg (shader->previous_stage_sel->info.uses_base_instance || 21377ec681f3Smrg shader->key.part.gs.vs_prolog.instance_divisor_is_one || 21387ec681f3Smrg shader->key.part.gs.vs_prolog.instance_divisor_is_fetched); 21397ec681f3Smrg } 21407ec681f3Smrg 21417ec681f3Smrg si_fix_resource_usage(sscreen, shader); 21427ec681f3Smrg si_shader_dump(sscreen, shader, debug, stderr, true); 21437ec681f3Smrg 21447ec681f3Smrg /* Upload. */ 21457ec681f3Smrg if (!si_shader_binary_upload(sscreen, shader, 0)) { 21467ec681f3Smrg fprintf(stderr, "LLVM failed to upload shader\n"); 21477ec681f3Smrg return false; 21487ec681f3Smrg } 21497ec681f3Smrg 21507ec681f3Smrg return true; 21517ec681f3Smrg} 21527ec681f3Smrg 21537ec681f3Smrgvoid si_shader_binary_clean(struct si_shader_binary *binary) 21547ec681f3Smrg{ 21557ec681f3Smrg free((void *)binary->elf_buffer); 21567ec681f3Smrg binary->elf_buffer = NULL; 21577ec681f3Smrg 21587ec681f3Smrg free(binary->llvm_ir_string); 21597ec681f3Smrg binary->llvm_ir_string = NULL; 21607ec681f3Smrg 21617ec681f3Smrg free(binary->uploaded_code); 21627ec681f3Smrg binary->uploaded_code = NULL; 21637ec681f3Smrg binary->uploaded_code_size = 0; 2164af69d88dSmrg} 2165af69d88dSmrg 216601e04c3fSmrgvoid si_shader_destroy(struct si_shader *shader) 2167af69d88dSmrg{ 21687ec681f3Smrg if (shader->scratch_bo) 21697ec681f3Smrg si_resource_reference(&shader->scratch_bo, NULL); 2170af69d88dSmrg 21717ec681f3Smrg si_resource_reference(&shader->bo, NULL); 217201e04c3fSmrg 21737ec681f3Smrg if (!shader->is_binary_shared) 21747ec681f3Smrg si_shader_binary_clean(&shader->binary); 217501e04c3fSmrg 21767ec681f3Smrg free(shader->shader_log); 2177af69d88dSmrg} 2178