101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2016 Red Hat. 301e04c3fSmrg * Copyright © 2016 Bas Nieuwenhuizen 401e04c3fSmrg * 501e04c3fSmrg * based in part on anv driver which is: 601e04c3fSmrg * Copyright © 2015 Intel Corporation 701e04c3fSmrg * 801e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 901e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 1001e04c3fSmrg * to deal in the Software without restriction, including without limitation 1101e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 1201e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 1301e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1401e04c3fSmrg * 1501e04c3fSmrg * The above copyright notice and this permission notice (including the next 1601e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1701e04c3fSmrg * Software. 1801e04c3fSmrg * 1901e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 2001e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 2101e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 2201e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2301e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2401e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2501e04c3fSmrg * IN THE SOFTWARE. 2601e04c3fSmrg */ 2701e04c3fSmrg 2801e04c3fSmrg#ifndef RADV_SHADER_H 2901e04c3fSmrg#define RADV_SHADER_H 3001e04c3fSmrg 317ec681f3Smrg#include "ac_binary.h" 327ec681f3Smrg#include "ac_shader_util.h" 337ec681f3Smrg 347ec681f3Smrg#include "amd_family.h" 357ec681f3Smrg#include "radv_constants.h" 3601e04c3fSmrg 3701e04c3fSmrg#include "nir/nir.h" 387ec681f3Smrg#include "vulkan/util/vk_object.h" 397ec681f3Smrg#include "vulkan/util/vk_shader_module.h" 407ec681f3Smrg#include "vulkan/vulkan.h" 417ec681f3Smrg 427ec681f3Smrg#define RADV_VERT_ATTRIB_MAX MAX2(VERT_ATTRIB_MAX, VERT_ATTRIB_GENERIC0 + MAX_VERTEX_ATTRIBS) 437ec681f3Smrg 447ec681f3Smrgstruct radv_physical_device; 457ec681f3Smrgstruct radv_device; 467ec681f3Smrgstruct radv_pipeline; 477ec681f3Smrgstruct radv_pipeline_cache; 487ec681f3Smrgstruct radv_pipeline_key; 497ec681f3Smrgstruct radv_vs_input_state; 507ec681f3Smrg 517ec681f3Smrgenum radv_vs_input_alpha_adjust { 527ec681f3Smrg ALPHA_ADJUST_NONE = 0, 537ec681f3Smrg ALPHA_ADJUST_SNORM = 1, 547ec681f3Smrg ALPHA_ADJUST_SSCALED = 2, 557ec681f3Smrg ALPHA_ADJUST_SINT = 3, 567ec681f3Smrg}; 5701e04c3fSmrg 587ec681f3Smrgstruct radv_pipeline_key { 597ec681f3Smrg uint32_t has_multiview_view_index : 1; 607ec681f3Smrg uint32_t optimisations_disabled : 1; 617ec681f3Smrg uint32_t invariant_geom : 1; 627ec681f3Smrg uint32_t use_ngg : 1; 637ec681f3Smrg 647ec681f3Smrg struct { 657ec681f3Smrg uint32_t instance_rate_inputs; 667ec681f3Smrg uint32_t instance_rate_divisors[MAX_VERTEX_ATTRIBS]; 677ec681f3Smrg uint8_t vertex_attribute_formats[MAX_VERTEX_ATTRIBS]; 687ec681f3Smrg uint32_t vertex_attribute_bindings[MAX_VERTEX_ATTRIBS]; 697ec681f3Smrg uint32_t vertex_attribute_offsets[MAX_VERTEX_ATTRIBS]; 707ec681f3Smrg uint32_t vertex_attribute_strides[MAX_VERTEX_ATTRIBS]; 717ec681f3Smrg uint8_t vertex_binding_align[MAX_VBS]; 727ec681f3Smrg enum radv_vs_input_alpha_adjust vertex_alpha_adjust[MAX_VERTEX_ATTRIBS]; 737ec681f3Smrg uint32_t vertex_post_shuffle; 747ec681f3Smrg uint32_t provoking_vtx_last : 1; 757ec681f3Smrg uint32_t dynamic_input_state : 1; 767ec681f3Smrg uint8_t topology; 777ec681f3Smrg } vs; 787ec681f3Smrg 797ec681f3Smrg struct { 807ec681f3Smrg unsigned tess_input_vertices; 817ec681f3Smrg } tcs; 827ec681f3Smrg 837ec681f3Smrg struct { 847ec681f3Smrg uint32_t col_format; 857ec681f3Smrg uint32_t is_int8; 867ec681f3Smrg uint32_t is_int10; 877ec681f3Smrg uint8_t log2_ps_iter_samples; 887ec681f3Smrg uint8_t num_samples; 897ec681f3Smrg 907ec681f3Smrg bool lower_discard_to_demote; 917ec681f3Smrg bool enable_mrt_output_nan_fixup; 927ec681f3Smrg uint8_t force_vrs; 937ec681f3Smrg } ps; 947ec681f3Smrg 957ec681f3Smrg struct { 967ec681f3Smrg /* Non-zero if a required subgroup size is specified via 977ec681f3Smrg * VK_EXT_subgroup_size_control. 987ec681f3Smrg */ 997ec681f3Smrg uint8_t compute_subgroup_size; 1007ec681f3Smrg bool require_full_subgroups; 1017ec681f3Smrg } cs; 10201e04c3fSmrg}; 10301e04c3fSmrg 1047ec681f3Smrgenum radv_compiler_debug_level { 1057ec681f3Smrg RADV_COMPILER_DEBUG_LEVEL_PERFWARN, 1067ec681f3Smrg RADV_COMPILER_DEBUG_LEVEL_ERROR, 10701e04c3fSmrg}; 10801e04c3fSmrg 1097ec681f3Smrgstruct radv_nir_compiler_options { 1107ec681f3Smrg struct radv_pipeline_layout *layout; 1117ec681f3Smrg struct radv_pipeline_key key; 1127ec681f3Smrg bool explicit_scratch_args; 1137ec681f3Smrg bool robust_buffer_access; 1147ec681f3Smrg bool adjust_frag_coord_z; 1157ec681f3Smrg bool dump_shader; 1167ec681f3Smrg bool dump_preoptir; 1177ec681f3Smrg bool record_ir; 1187ec681f3Smrg bool record_stats; 1197ec681f3Smrg bool check_ir; 1207ec681f3Smrg bool has_ls_vgpr_init_bug; 1217ec681f3Smrg bool has_image_load_dcc_bug; 1227ec681f3Smrg bool enable_mrt_output_nan_fixup; 1237ec681f3Smrg bool wgp_mode; 1247ec681f3Smrg bool remap_spi_ps_input; 1257ec681f3Smrg enum radeon_family family; 1267ec681f3Smrg enum chip_class chip_class; 1277ec681f3Smrg const struct radeon_info *info; 1287ec681f3Smrg uint32_t address32_hi; 1297ec681f3Smrg uint8_t force_vrs_rates; 1307ec681f3Smrg 1317ec681f3Smrg struct { 1327ec681f3Smrg void (*func)(void *private_data, enum radv_compiler_debug_level level, const char *message); 1337ec681f3Smrg void *private_data; 1347ec681f3Smrg } debug; 13501e04c3fSmrg}; 13601e04c3fSmrg 1377ec681f3Smrgenum radv_ud_index { 1387ec681f3Smrg AC_UD_SCRATCH_RING_OFFSETS = 0, 1397ec681f3Smrg AC_UD_PUSH_CONSTANTS = 1, 1407ec681f3Smrg AC_UD_INLINE_PUSH_CONSTANTS = 2, 1417ec681f3Smrg AC_UD_INDIRECT_DESCRIPTOR_SETS = 3, 1427ec681f3Smrg AC_UD_VIEW_INDEX = 4, 1437ec681f3Smrg AC_UD_STREAMOUT_BUFFERS = 5, 1447ec681f3Smrg AC_UD_NGG_GS_STATE = 6, 1457ec681f3Smrg AC_UD_NGG_CULLING_SETTINGS = 7, 1467ec681f3Smrg AC_UD_NGG_VIEWPORT = 8, 1477ec681f3Smrg AC_UD_SHADER_START = 9, 1487ec681f3Smrg AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, 1497ec681f3Smrg AC_UD_VS_BASE_VERTEX_START_INSTANCE, 1507ec681f3Smrg AC_UD_VS_PROLOG_INPUTS, 1517ec681f3Smrg AC_UD_VS_MAX_UD, 1527ec681f3Smrg AC_UD_PS_MAX_UD, 1537ec681f3Smrg AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START, 1547ec681f3Smrg AC_UD_CS_SBT_DESCRIPTORS, 1557ec681f3Smrg AC_UD_CS_RAY_LAUNCH_SIZE, 1567ec681f3Smrg AC_UD_CS_MAX_UD, 1577ec681f3Smrg AC_UD_GS_MAX_UD, 1587ec681f3Smrg AC_UD_TCS_MAX_UD, 1597ec681f3Smrg AC_UD_TES_MAX_UD, 1607ec681f3Smrg AC_UD_MAX_UD = AC_UD_TCS_MAX_UD, 16101e04c3fSmrg}; 16201e04c3fSmrg 1637ec681f3Smrgstruct radv_stream_output { 1647ec681f3Smrg uint8_t location; 1657ec681f3Smrg uint8_t buffer; 1667ec681f3Smrg uint16_t offset; 1677ec681f3Smrg uint8_t component_mask; 1687ec681f3Smrg uint8_t stream; 16901e04c3fSmrg}; 17001e04c3fSmrg 1717ec681f3Smrgstruct radv_streamout_info { 1727ec681f3Smrg uint16_t num_outputs; 1737ec681f3Smrg struct radv_stream_output outputs[MAX_SO_OUTPUTS]; 1747ec681f3Smrg uint16_t strides[MAX_SO_BUFFERS]; 1757ec681f3Smrg uint32_t enabled_stream_buffers_mask; 17601e04c3fSmrg}; 17701e04c3fSmrg 1787ec681f3Smrgstruct radv_userdata_info { 1797ec681f3Smrg int8_t sgpr_idx; 1807ec681f3Smrg uint8_t num_sgprs; 18101e04c3fSmrg}; 18201e04c3fSmrg 1837ec681f3Smrgstruct radv_userdata_locations { 1847ec681f3Smrg struct radv_userdata_info descriptor_sets[MAX_SETS]; 1857ec681f3Smrg struct radv_userdata_info shader_data[AC_UD_MAX_UD]; 1867ec681f3Smrg uint32_t descriptor_sets_enabled; 18701e04c3fSmrg}; 18801e04c3fSmrg 1897ec681f3Smrgstruct radv_vs_output_info { 1907ec681f3Smrg uint8_t vs_output_param_offset[VARYING_SLOT_MAX]; 1917ec681f3Smrg uint8_t clip_dist_mask; 1927ec681f3Smrg uint8_t cull_dist_mask; 1937ec681f3Smrg uint8_t param_exports; 1947ec681f3Smrg bool writes_pointsize; 1957ec681f3Smrg bool writes_layer; 1967ec681f3Smrg bool writes_viewport_index; 1977ec681f3Smrg bool writes_primitive_shading_rate; 1987ec681f3Smrg bool export_prim_id; 1997ec681f3Smrg bool export_clip_dists; 2007ec681f3Smrg unsigned pos_exports; 20101e04c3fSmrg}; 20201e04c3fSmrg 2037ec681f3Smrgstruct radv_es_output_info { 2047ec681f3Smrg uint32_t esgs_itemsize; 20501e04c3fSmrg}; 20601e04c3fSmrg 2077ec681f3Smrgstruct gfx9_gs_info { 2087ec681f3Smrg uint32_t vgt_gs_onchip_cntl; 2097ec681f3Smrg uint32_t vgt_gs_max_prims_per_subgroup; 2107ec681f3Smrg uint32_t vgt_esgs_ring_itemsize; 2117ec681f3Smrg uint32_t lds_size; 2127ec681f3Smrg}; 2137ec681f3Smrg 2147ec681f3Smrgstruct gfx10_ngg_info { 2157ec681f3Smrg uint16_t ngg_emit_size; /* in dwords */ 2167ec681f3Smrg uint32_t hw_max_esverts; 2177ec681f3Smrg uint32_t max_gsprims; 2187ec681f3Smrg uint32_t max_out_verts; 2197ec681f3Smrg uint32_t prim_amp_factor; 2207ec681f3Smrg uint32_t vgt_esgs_ring_itemsize; 2217ec681f3Smrg uint32_t esgs_ring_size; 2227ec681f3Smrg bool max_vert_out_per_gs_instance; 2237ec681f3Smrg bool enable_vertex_grouping; 22401e04c3fSmrg}; 22501e04c3fSmrg 22601e04c3fSmrgstruct radv_shader_info { 2277ec681f3Smrg bool loads_push_constants; 2287ec681f3Smrg bool loads_dynamic_offsets; 2297ec681f3Smrg uint8_t min_push_constant_used; 2307ec681f3Smrg uint8_t max_push_constant_used; 2317ec681f3Smrg bool has_only_32bit_push_constants; 2327ec681f3Smrg bool has_indirect_push_constants; 2337ec681f3Smrg uint32_t desc_set_used_mask; 2347ec681f3Smrg bool uses_view_index; 2357ec681f3Smrg bool uses_invocation_id; 2367ec681f3Smrg bool uses_prim_id; 2377ec681f3Smrg uint8_t wave_size; 2387ec681f3Smrg uint8_t ballot_bit_size; 2397ec681f3Smrg struct radv_userdata_locations user_sgprs_locs; 2407ec681f3Smrg unsigned num_user_sgprs; 2417ec681f3Smrg unsigned num_input_sgprs; 2427ec681f3Smrg unsigned num_input_vgprs; 2437ec681f3Smrg bool is_ngg; 2447ec681f3Smrg bool is_ngg_passthrough; 2457ec681f3Smrg bool has_ngg_culling; 2467ec681f3Smrg bool has_ngg_early_prim_export; 2477ec681f3Smrg uint32_t num_lds_blocks_when_not_culling; 2487ec681f3Smrg uint32_t num_tess_patches; 2497ec681f3Smrg unsigned workgroup_size; 2507ec681f3Smrg struct { 2517ec681f3Smrg uint8_t input_usage_mask[RADV_VERT_ATTRIB_MAX]; 2527ec681f3Smrg uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; 2537ec681f3Smrg bool needs_draw_id; 2547ec681f3Smrg bool needs_instance_id; 2557ec681f3Smrg struct radv_vs_output_info outinfo; 2567ec681f3Smrg struct radv_es_output_info es_info; 2577ec681f3Smrg bool as_es; 2587ec681f3Smrg bool as_ls; 2597ec681f3Smrg bool tcs_in_out_eq; 2607ec681f3Smrg uint64_t tcs_temp_only_input_mask; 2617ec681f3Smrg uint8_t num_linked_outputs; 2627ec681f3Smrg bool needs_base_instance; 2637ec681f3Smrg bool use_per_attribute_vb_descs; 2647ec681f3Smrg uint32_t vb_desc_usage_mask; 2657ec681f3Smrg bool has_prolog; 2667ec681f3Smrg bool dynamic_inputs; 2677ec681f3Smrg } vs; 2687ec681f3Smrg struct { 2697ec681f3Smrg uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; 2707ec681f3Smrg uint8_t num_stream_output_components[4]; 2717ec681f3Smrg uint8_t output_streams[VARYING_SLOT_VAR31 + 1]; 2727ec681f3Smrg uint8_t max_stream; 2737ec681f3Smrg unsigned gsvs_vertex_size; 2747ec681f3Smrg unsigned max_gsvs_emit_size; 2757ec681f3Smrg unsigned vertices_in; 2767ec681f3Smrg unsigned vertices_out; 2777ec681f3Smrg unsigned output_prim; 2787ec681f3Smrg unsigned invocations; 2797ec681f3Smrg unsigned es_type; /* GFX9: VS or TES */ 2807ec681f3Smrg uint8_t num_linked_inputs; 2817ec681f3Smrg } gs; 2827ec681f3Smrg struct { 2837ec681f3Smrg uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1]; 2847ec681f3Smrg struct radv_vs_output_info outinfo; 2857ec681f3Smrg struct radv_es_output_info es_info; 2867ec681f3Smrg bool as_es; 2877ec681f3Smrg unsigned primitive_mode; 2887ec681f3Smrg enum gl_tess_spacing spacing; 2897ec681f3Smrg bool ccw; 2907ec681f3Smrg bool point_mode; 2917ec681f3Smrg uint8_t num_linked_inputs; 2927ec681f3Smrg uint8_t num_linked_patch_inputs; 2937ec681f3Smrg uint8_t num_linked_outputs; 2947ec681f3Smrg } tes; 2957ec681f3Smrg struct { 2967ec681f3Smrg bool uses_sample_shading; 2977ec681f3Smrg bool needs_sample_positions; 2987ec681f3Smrg bool writes_memory; 2997ec681f3Smrg bool writes_z; 3007ec681f3Smrg bool writes_stencil; 3017ec681f3Smrg bool writes_sample_mask; 3027ec681f3Smrg bool has_pcoord; 3037ec681f3Smrg bool prim_id_input; 3047ec681f3Smrg bool layer_input; 3057ec681f3Smrg bool viewport_index_input; 3067ec681f3Smrg uint8_t num_input_clips_culls; 3077ec681f3Smrg uint32_t input_mask; 3087ec681f3Smrg uint32_t flat_shaded_mask; 3097ec681f3Smrg uint32_t explicit_shaded_mask; 3107ec681f3Smrg uint32_t float16_shaded_mask; 3117ec681f3Smrg uint32_t num_interp; 3127ec681f3Smrg bool can_discard; 3137ec681f3Smrg bool early_fragment_test; 3147ec681f3Smrg bool post_depth_coverage; 3157ec681f3Smrg bool reads_sample_mask_in; 3167ec681f3Smrg bool reads_front_face; 3177ec681f3Smrg bool reads_sample_id; 3187ec681f3Smrg bool reads_frag_shading_rate; 3197ec681f3Smrg bool reads_barycentric_model; 3207ec681f3Smrg bool reads_persp_sample; 3217ec681f3Smrg bool reads_persp_center; 3227ec681f3Smrg bool reads_persp_centroid; 3237ec681f3Smrg bool reads_linear_sample; 3247ec681f3Smrg bool reads_linear_center; 3257ec681f3Smrg bool reads_linear_centroid; 3267ec681f3Smrg uint8_t reads_frag_coord_mask; 3277ec681f3Smrg uint8_t reads_sample_pos_mask; 3287ec681f3Smrg uint8_t depth_layout; 3297ec681f3Smrg bool allow_flat_shading; 3307ec681f3Smrg unsigned spi_ps_input; 3317ec681f3Smrg } ps; 3327ec681f3Smrg struct { 3337ec681f3Smrg bool uses_grid_size; 3347ec681f3Smrg bool uses_block_id[3]; 3357ec681f3Smrg bool uses_thread_id[3]; 3367ec681f3Smrg bool uses_local_invocation_idx; 3377ec681f3Smrg unsigned block_size[3]; 3387ec681f3Smrg 3397ec681f3Smrg uint8_t subgroup_size; 3407ec681f3Smrg 3417ec681f3Smrg bool uses_sbt; 3427ec681f3Smrg bool uses_ray_launch_size; 3437ec681f3Smrg } cs; 3447ec681f3Smrg struct { 3457ec681f3Smrg uint64_t tes_inputs_read; 3467ec681f3Smrg uint64_t tes_patch_inputs_read; 3477ec681f3Smrg unsigned tcs_vertices_out; 3487ec681f3Smrg uint32_t num_lds_blocks; 3497ec681f3Smrg uint8_t num_linked_inputs; 3507ec681f3Smrg uint8_t num_linked_outputs; 3517ec681f3Smrg uint8_t num_linked_patch_outputs; 3527ec681f3Smrg bool tes_reads_tess_factors : 1; 3537ec681f3Smrg } tcs; 3547ec681f3Smrg 3557ec681f3Smrg struct radv_streamout_info so; 3567ec681f3Smrg 3577ec681f3Smrg struct gfx9_gs_info gs_ring_info; 3587ec681f3Smrg struct gfx10_ngg_info ngg_info; 35901e04c3fSmrg}; 36001e04c3fSmrg 3617ec681f3Smrgstruct radv_vs_input_state { 3627ec681f3Smrg uint32_t attribute_mask; 3637ec681f3Smrg uint32_t misaligned_mask; 3647ec681f3Smrg uint32_t possibly_misaligned_mask; 3657ec681f3Smrg 3667ec681f3Smrg uint32_t instance_rate_inputs; 3677ec681f3Smrg uint32_t nontrivial_divisors; 3687ec681f3Smrg uint32_t post_shuffle; 3697ec681f3Smrg /* Having two separate fields instead of a single uint64_t makes it easier to remove attributes 3707ec681f3Smrg * using bitwise arithmetic. 3717ec681f3Smrg */ 3727ec681f3Smrg uint32_t alpha_adjust_lo; 3737ec681f3Smrg uint32_t alpha_adjust_hi; 3747ec681f3Smrg 3757ec681f3Smrg uint8_t bindings[MAX_VERTEX_ATTRIBS]; 3767ec681f3Smrg uint32_t divisors[MAX_VERTEX_ATTRIBS]; 3777ec681f3Smrg uint32_t offsets[MAX_VERTEX_ATTRIBS]; 3787ec681f3Smrg uint8_t formats[MAX_VERTEX_ATTRIBS]; 3797ec681f3Smrg uint8_t format_align_req_minus_1[MAX_VERTEX_ATTRIBS]; 3807ec681f3Smrg uint8_t format_sizes[MAX_VERTEX_ATTRIBS]; 38101e04c3fSmrg}; 38201e04c3fSmrg 3837ec681f3Smrgstruct radv_vs_prolog_key { 3847ec681f3Smrg const struct radv_vs_input_state *state; 3857ec681f3Smrg unsigned num_attributes; 3867ec681f3Smrg uint32_t misaligned_mask; 3877ec681f3Smrg bool as_ls; 3887ec681f3Smrg bool is_ngg; 3897ec681f3Smrg bool wave32; 3907ec681f3Smrg gl_shader_stage next_stage; 39101e04c3fSmrg}; 39201e04c3fSmrg 3937ec681f3Smrgenum radv_shader_binary_type { RADV_BINARY_TYPE_LEGACY, RADV_BINARY_TYPE_RTLD }; 3947ec681f3Smrg 3957ec681f3Smrgstruct radv_shader_binary { 3967ec681f3Smrg enum radv_shader_binary_type type; 3977ec681f3Smrg gl_shader_stage stage; 3987ec681f3Smrg bool is_gs_copy_shader; 3997ec681f3Smrg 4007ec681f3Smrg struct ac_shader_config config; 4017ec681f3Smrg struct radv_shader_info info; 4027ec681f3Smrg 4037ec681f3Smrg /* Self-referential size so we avoid consistency issues. */ 4047ec681f3Smrg uint32_t total_size; 40501e04c3fSmrg}; 40601e04c3fSmrg 4077ec681f3Smrgstruct radv_shader_binary_legacy { 4087ec681f3Smrg struct radv_shader_binary base; 4097ec681f3Smrg unsigned code_size; 4107ec681f3Smrg unsigned exec_size; 4117ec681f3Smrg unsigned ir_size; 4127ec681f3Smrg unsigned disasm_size; 4137ec681f3Smrg unsigned stats_size; 4147ec681f3Smrg 4157ec681f3Smrg /* data has size of stats_size + code_size + ir_size + disasm_size + 2, 4167ec681f3Smrg * where the +2 is for 0 of the ir strings. */ 4177ec681f3Smrg uint8_t data[0]; 41801e04c3fSmrg}; 41901e04c3fSmrg 4207ec681f3Smrgstruct radv_shader_binary_rtld { 4217ec681f3Smrg struct radv_shader_binary base; 4227ec681f3Smrg unsigned elf_size; 4237ec681f3Smrg unsigned llvm_ir_size; 4247ec681f3Smrg uint8_t data[0]; 42501e04c3fSmrg}; 42601e04c3fSmrg 4277ec681f3Smrgstruct radv_prolog_binary { 4287ec681f3Smrg uint8_t num_sgprs; 4297ec681f3Smrg uint8_t num_vgprs; 4307ec681f3Smrg uint8_t num_preserved_sgprs; 4317ec681f3Smrg unsigned code_size; 4327ec681f3Smrg uint8_t data[0]; 43301e04c3fSmrg}; 43401e04c3fSmrg 4357ec681f3Smrgstruct radv_shader_arena { 4367ec681f3Smrg struct list_head list; 4377ec681f3Smrg struct list_head entries; 4387ec681f3Smrg struct radeon_winsys_bo *bo; 4397ec681f3Smrg char *ptr; 44001e04c3fSmrg}; 44101e04c3fSmrg 4427ec681f3Smrgunion radv_shader_arena_block { 4437ec681f3Smrg struct list_head pool; 4447ec681f3Smrg struct { 4457ec681f3Smrg /* List of blocks in the arena, sorted by address. */ 4467ec681f3Smrg struct list_head list; 4477ec681f3Smrg /* For holes, a list_head for the free-list. For allocations, freelist.prev=NULL and 4487ec681f3Smrg * freelist.next is a pointer associated with the allocation. 4497ec681f3Smrg */ 4507ec681f3Smrg struct list_head freelist; 4517ec681f3Smrg struct radv_shader_arena *arena; 4527ec681f3Smrg uint32_t offset; 4537ec681f3Smrg uint32_t size; 4547ec681f3Smrg }; 4557ec681f3Smrg}; 45601e04c3fSmrg 4577ec681f3Smrgstruct radv_shader_variant { 4587ec681f3Smrg uint32_t ref_count; 4597ec681f3Smrg 4607ec681f3Smrg struct radeon_winsys_bo *bo; 4617ec681f3Smrg union radv_shader_arena_block *alloc; 4627ec681f3Smrg struct ac_shader_config config; 4637ec681f3Smrg uint8_t *code_ptr; 4647ec681f3Smrg uint32_t code_size; 4657ec681f3Smrg uint32_t exec_size; 4667ec681f3Smrg struct radv_shader_info info; 4677ec681f3Smrg 4687ec681f3Smrg /* debug only */ 4697ec681f3Smrg char *spirv; 4707ec681f3Smrg uint32_t spirv_size; 4717ec681f3Smrg char *nir_string; 4727ec681f3Smrg char *disasm_string; 4737ec681f3Smrg char *ir_string; 4747ec681f3Smrg uint32_t *statistics; 4757ec681f3Smrg}; 4767ec681f3Smrg 4777ec681f3Smrgstruct radv_shader_prolog { 4787ec681f3Smrg struct radeon_winsys_bo *bo; 4797ec681f3Smrg union radv_shader_arena_block *alloc; 4807ec681f3Smrg uint32_t rsrc1; 4817ec681f3Smrg uint8_t num_preserved_sgprs; 4827ec681f3Smrg bool nontrivial_divisors; 4837ec681f3Smrg}; 4847ec681f3Smrg 4857ec681f3Smrgvoid radv_optimize_nir(const struct radv_device *device, struct nir_shader *shader, 4867ec681f3Smrg bool optimize_conservatively, bool allow_copies); 4877ec681f3Smrgvoid radv_optimize_nir_algebraic(nir_shader *shader, bool opt_offsets); 4887ec681f3Smrgbool radv_nir_lower_ycbcr_textures(nir_shader *shader, const struct radv_pipeline_layout *layout); 4897ec681f3Smrg 4907ec681f3Smrgnir_shader *radv_shader_compile_to_nir(struct radv_device *device, struct vk_shader_module *module, 4917ec681f3Smrg const char *entrypoint_name, gl_shader_stage stage, 4927ec681f3Smrg const VkSpecializationInfo *spec_info, 4937ec681f3Smrg const struct radv_pipeline_layout *layout, 4947ec681f3Smrg const struct radv_pipeline_key *key); 4957ec681f3Smrg 4967ec681f3Smrgvoid radv_init_shader_arenas(struct radv_device *device); 4977ec681f3Smrgvoid radv_destroy_shader_arenas(struct radv_device *device); 4987ec681f3Smrg 4997ec681f3SmrgVkResult radv_create_shaders(struct radv_pipeline *pipeline, 5007ec681f3Smrg struct radv_pipeline_layout *pipeline_layout, 5017ec681f3Smrg struct radv_device *device, struct radv_pipeline_cache *cache, 5027ec681f3Smrg const struct radv_pipeline_key *key, 5037ec681f3Smrg const VkPipelineShaderStageCreateInfo **pStages, 5047ec681f3Smrg const VkPipelineCreateFlags flags, const uint8_t *custom_hash, 5057ec681f3Smrg VkPipelineCreationFeedbackEXT *pipeline_feedback, 5067ec681f3Smrg VkPipelineCreationFeedbackEXT **stage_feedbacks); 5077ec681f3Smrg 5087ec681f3Smrgstruct radv_shader_variant *radv_shader_variant_create(struct radv_device *device, 5097ec681f3Smrg const struct radv_shader_binary *binary, 5107ec681f3Smrg bool keep_shader_info, bool from_cache); 5117ec681f3Smrgstruct radv_shader_variant *radv_shader_variant_compile( 5127ec681f3Smrg struct radv_device *device, struct vk_shader_module *module, struct nir_shader *const *shaders, 5137ec681f3Smrg int shader_count, struct radv_pipeline_layout *layout, const struct radv_pipeline_key *key, 5147ec681f3Smrg struct radv_shader_info *info, bool keep_shader_info, bool keep_statistic_info, 5157ec681f3Smrg struct radv_shader_binary **binary_out); 51601e04c3fSmrg 51701e04c3fSmrgstruct radv_shader_variant * 51801e04c3fSmrgradv_create_gs_copy_shader(struct radv_device *device, struct nir_shader *nir, 5197ec681f3Smrg struct radv_shader_info *info, struct radv_shader_binary **binary_out, 5207ec681f3Smrg bool multiview, bool keep_shader_info, bool keep_statistic_info, 5217ec681f3Smrg bool disable_optimizations); 52201e04c3fSmrg 5237ec681f3Smrgstruct radv_shader_variant *radv_create_trap_handler_shader(struct radv_device *device); 5247ec681f3Smrg 5257ec681f3Smrgstruct radv_shader_prolog *radv_create_vs_prolog(struct radv_device *device, 5267ec681f3Smrg const struct radv_vs_prolog_key *key); 5277ec681f3Smrg 5287ec681f3Smrgvoid radv_shader_variant_destroy(struct radv_device *device, struct radv_shader_variant *variant); 5297ec681f3Smrg 5307ec681f3Smrgvoid radv_prolog_destroy(struct radv_device *device, struct radv_shader_prolog *prolog); 5317ec681f3Smrg 5327ec681f3Smrguint64_t radv_shader_variant_get_va(const struct radv_shader_variant *variant); 5337ec681f3Smrgstruct radv_shader_variant *radv_find_shader_variant(struct radv_device *device, uint64_t pc); 5347ec681f3Smrg 5357ec681f3Smrgunsigned radv_get_max_waves(const struct radv_device *device, struct radv_shader_variant *variant, 5367ec681f3Smrg gl_shader_stage stage); 5377ec681f3Smrg 5387ec681f3Smrgconst char *radv_get_shader_name(struct radv_shader_info *info, gl_shader_stage stage); 5397ec681f3Smrg 5407ec681f3Smrgunsigned radv_compute_spi_ps_input(const struct radv_device *device, 5417ec681f3Smrg const struct radv_shader_info *info); 5427ec681f3Smrg 5437ec681f3Smrgbool radv_can_dump_shader(struct radv_device *device, struct vk_shader_module *module, 5447ec681f3Smrg bool meta_shader); 54501e04c3fSmrg 5467ec681f3Smrgbool radv_can_dump_shader_stats(struct radv_device *device, struct vk_shader_module *module); 5477ec681f3Smrg 5487ec681f3SmrgVkResult radv_dump_shader_stats(struct radv_device *device, struct radv_pipeline *pipeline, 5497ec681f3Smrg gl_shader_stage stage, FILE *output); 5507ec681f3Smrg 5517ec681f3Smrgstatic inline unsigned 5527ec681f3Smrgcalculate_tess_lds_size(enum chip_class chip_class, unsigned tcs_num_input_vertices, 5537ec681f3Smrg unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, 5547ec681f3Smrg unsigned tcs_num_patches, unsigned tcs_num_outputs, 5557ec681f3Smrg unsigned tcs_num_patch_outputs) 55601e04c3fSmrg{ 5577ec681f3Smrg unsigned input_vertex_size = tcs_num_inputs * 16; 5587ec681f3Smrg unsigned output_vertex_size = tcs_num_outputs * 16; 5597ec681f3Smrg 5607ec681f3Smrg unsigned input_patch_size = tcs_num_input_vertices * input_vertex_size; 5617ec681f3Smrg 5627ec681f3Smrg unsigned pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size; 5637ec681f3Smrg unsigned output_patch_size = pervertex_output_patch_size + tcs_num_patch_outputs * 16; 5647ec681f3Smrg 5657ec681f3Smrg unsigned output_patch0_offset = input_patch_size * tcs_num_patches; 5667ec681f3Smrg 5677ec681f3Smrg unsigned lds_size = output_patch0_offset + output_patch_size * tcs_num_patches; 5687ec681f3Smrg 5697ec681f3Smrg if (chip_class >= GFX7) { 5707ec681f3Smrg assert(lds_size <= 65536); 5717ec681f3Smrg lds_size = align(lds_size, 512) / 512; 5727ec681f3Smrg } else { 5737ec681f3Smrg assert(lds_size <= 32768); 5747ec681f3Smrg lds_size = align(lds_size, 256) / 256; 5757ec681f3Smrg } 5767ec681f3Smrg 5777ec681f3Smrg return lds_size; 57801e04c3fSmrg} 57901e04c3fSmrg 5807ec681f3Smrgstatic inline unsigned 5817ec681f3Smrgget_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, 5827ec681f3Smrg unsigned tcs_num_inputs, unsigned tcs_num_outputs, 5837ec681f3Smrg unsigned tcs_num_patch_outputs, unsigned tess_offchip_block_dw_size, 5847ec681f3Smrg enum chip_class chip_class, enum radeon_family family) 58501e04c3fSmrg{ 5867ec681f3Smrg uint32_t input_vertex_size = tcs_num_inputs * 16; 5877ec681f3Smrg uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size; 5887ec681f3Smrg uint32_t output_vertex_size = tcs_num_outputs * 16; 5897ec681f3Smrg uint32_t pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size; 5907ec681f3Smrg uint32_t output_patch_size = pervertex_output_patch_size + tcs_num_patch_outputs * 16; 5917ec681f3Smrg 5927ec681f3Smrg /* Ensure that we only need one wave per SIMD so we don't need to check 5937ec681f3Smrg * resource usage. Also ensures that the number of tcs in and out 5947ec681f3Smrg * vertices per threadgroup are at most 256. 5957ec681f3Smrg */ 5967ec681f3Smrg unsigned num_patches = 64 / MAX2(tcs_num_input_vertices, tcs_num_output_vertices) * 4; 5977ec681f3Smrg /* Make sure that the data fits in LDS. This assumes the shaders only 5987ec681f3Smrg * use LDS for the inputs and outputs. 5997ec681f3Smrg */ 6007ec681f3Smrg unsigned hardware_lds_size = 32768; 6017ec681f3Smrg 6027ec681f3Smrg /* Looks like STONEY hangs if we use more than 32 KiB LDS in a single 6037ec681f3Smrg * threadgroup, even though there is more than 32 KiB LDS. 6047ec681f3Smrg * 6057ec681f3Smrg * Test: dEQP-VK.tessellation.shader_input_output.barrier 6067ec681f3Smrg */ 6077ec681f3Smrg if (chip_class >= GFX7 && family != CHIP_STONEY) 6087ec681f3Smrg hardware_lds_size = 65536; 6097ec681f3Smrg 6107ec681f3Smrg if (input_patch_size + output_patch_size) 6117ec681f3Smrg num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); 6127ec681f3Smrg /* Make sure the output data fits in the offchip buffer */ 6137ec681f3Smrg if (output_patch_size) 6147ec681f3Smrg num_patches = MIN2(num_patches, (tess_offchip_block_dw_size * 4) / output_patch_size); 6157ec681f3Smrg /* Not necessary for correctness, but improves performance. The 6167ec681f3Smrg * specific value is taken from the proprietary driver. 6177ec681f3Smrg */ 6187ec681f3Smrg num_patches = MIN2(num_patches, 40); 6197ec681f3Smrg 6207ec681f3Smrg /* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */ 6217ec681f3Smrg if (chip_class == GFX6) { 6227ec681f3Smrg unsigned one_wave = 64 / MAX2(tcs_num_input_vertices, tcs_num_output_vertices); 6237ec681f3Smrg num_patches = MIN2(num_patches, one_wave); 6247ec681f3Smrg } 6257ec681f3Smrg return num_patches; 62601e04c3fSmrg} 62701e04c3fSmrg 6287ec681f3Smrgvoid radv_lower_io(struct radv_device *device, nir_shader *nir); 6297ec681f3Smrg 6307ec681f3Smrgbool radv_lower_io_to_mem(struct radv_device *device, struct nir_shader *nir, 6317ec681f3Smrg const struct radv_shader_info *info, const struct radv_pipeline_key *pl_key); 6327ec681f3Smrg 6337ec681f3Smrgvoid radv_lower_ngg(struct radv_device *device, struct nir_shader *nir, 6347ec681f3Smrg const struct radv_shader_info *info, 6357ec681f3Smrg const struct radv_pipeline_key *pl_key); 6367ec681f3Smrg 6377ec681f3Smrgbool radv_consider_culling(struct radv_device *device, struct nir_shader *nir, 6387ec681f3Smrg uint64_t ps_inputs_read, unsigned num_vertices_per_primitive, 6397ec681f3Smrg const struct radv_shader_info *info); 6407ec681f3Smrg 6417ec681f3Smrgvoid radv_get_nir_options(struct radv_physical_device *device); 6427ec681f3Smrg 64301e04c3fSmrg#endif 644