/*
 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
227e102996Smaya * 237e102996Smaya * Authors: 247e102996Smaya * Rob Clark <robclark@freedesktop.org> 257e102996Smaya */ 267e102996Smaya 277e102996Smaya#include "util/debug.h" 287ec681f3Smrg#include "util/u_math.h" 297e102996Smaya 307e102996Smaya#include "ir3_compiler.h" 317ec681f3Smrg#include "ir3_nir.h" 327e102996Smaya#include "ir3_shader.h" 337e102996Smaya 347e102996Smayastatic const nir_shader_compiler_options options = { 357ec681f3Smrg .lower_fpow = true, 367ec681f3Smrg .lower_scmp = true, 377ec681f3Smrg .lower_flrp16 = true, 387ec681f3Smrg .lower_flrp32 = true, 397ec681f3Smrg .lower_flrp64 = true, 407ec681f3Smrg .lower_ffract = true, 417ec681f3Smrg .lower_fmod = true, 427ec681f3Smrg .lower_fdiv = true, 437ec681f3Smrg .lower_isign = true, 447ec681f3Smrg .lower_ldexp = true, 457ec681f3Smrg .lower_uadd_carry = true, 467ec681f3Smrg .lower_usub_borrow = true, 477ec681f3Smrg .lower_mul_high = true, 487ec681f3Smrg .lower_mul_2x32_64 = true, 497ec681f3Smrg .fuse_ffma16 = true, 507ec681f3Smrg .fuse_ffma32 = true, 517ec681f3Smrg .fuse_ffma64 = true, 527ec681f3Smrg .vertex_id_zero_based = true, 537ec681f3Smrg .lower_extract_byte = true, 547ec681f3Smrg .lower_extract_word = true, 557ec681f3Smrg .lower_insert_byte = true, 567ec681f3Smrg .lower_insert_word = true, 577ec681f3Smrg .lower_helper_invocation = true, 587ec681f3Smrg .lower_bitfield_insert_to_shifts = true, 597ec681f3Smrg .lower_bitfield_extract_to_shifts = true, 607ec681f3Smrg .lower_pack_half_2x16 = true, 617ec681f3Smrg .lower_pack_snorm_4x8 = true, 627ec681f3Smrg .lower_pack_snorm_2x16 = true, 637ec681f3Smrg .lower_pack_unorm_4x8 = true, 647ec681f3Smrg .lower_pack_unorm_2x16 = true, 657ec681f3Smrg .lower_unpack_half_2x16 = true, 667ec681f3Smrg .lower_unpack_snorm_4x8 = true, 677ec681f3Smrg .lower_unpack_snorm_2x16 = true, 687ec681f3Smrg .lower_unpack_unorm_4x8 = true, 697ec681f3Smrg .lower_unpack_unorm_2x16 = true, 707ec681f3Smrg .lower_pack_split = true, 717ec681f3Smrg .use_interpolated_input_intrinsics = true, 
727ec681f3Smrg .lower_rotate = true, 737ec681f3Smrg .lower_to_scalar = true, 747ec681f3Smrg .has_imul24 = true, 757ec681f3Smrg .has_fsub = true, 767ec681f3Smrg .has_isub = true, 777ec681f3Smrg .lower_wpos_pntc = true, 787ec681f3Smrg .lower_cs_local_index_from_id = true, 797ec681f3Smrg 807ec681f3Smrg /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c 817ec681f3Smrg * but that should be harmless for GL since 64b is not 827ec681f3Smrg * supported there. 837ec681f3Smrg */ 847ec681f3Smrg .lower_int64_options = (nir_lower_int64_options)~0, 857ec681f3Smrg .lower_uniforms_to_ubo = true, 867ec681f3Smrg .use_scoped_barrier = true, 877e102996Smaya}; 887e102996Smaya 897e102996Smaya/* we don't want to lower vertex_id to _zero_based on newer gpus: */ 907e102996Smayastatic const nir_shader_compiler_options options_a6xx = { 917ec681f3Smrg .lower_fpow = true, 927ec681f3Smrg .lower_scmp = true, 937ec681f3Smrg .lower_flrp16 = true, 947ec681f3Smrg .lower_flrp32 = true, 957ec681f3Smrg .lower_flrp64 = true, 967ec681f3Smrg .lower_ffract = true, 977ec681f3Smrg .lower_fmod = true, 987ec681f3Smrg .lower_fdiv = true, 997ec681f3Smrg .lower_isign = true, 1007ec681f3Smrg .lower_ldexp = true, 1017ec681f3Smrg .lower_uadd_carry = true, 1027ec681f3Smrg .lower_usub_borrow = true, 1037ec681f3Smrg .lower_mul_high = true, 1047ec681f3Smrg .lower_mul_2x32_64 = true, 1057ec681f3Smrg .fuse_ffma16 = true, 1067ec681f3Smrg .fuse_ffma32 = true, 1077ec681f3Smrg .fuse_ffma64 = true, 1087ec681f3Smrg .vertex_id_zero_based = false, 1097ec681f3Smrg .lower_extract_byte = true, 1107ec681f3Smrg .lower_extract_word = true, 1117ec681f3Smrg .lower_insert_byte = true, 1127ec681f3Smrg .lower_insert_word = true, 1137ec681f3Smrg .lower_helper_invocation = true, 1147ec681f3Smrg .lower_bitfield_insert_to_shifts = true, 1157ec681f3Smrg .lower_bitfield_extract_to_shifts = true, 1167ec681f3Smrg .lower_pack_half_2x16 = true, 1177ec681f3Smrg .lower_pack_snorm_4x8 = true, 1187ec681f3Smrg .lower_pack_snorm_2x16 = true, 
1197ec681f3Smrg .lower_pack_unorm_4x8 = true, 1207ec681f3Smrg .lower_pack_unorm_2x16 = true, 1217ec681f3Smrg .lower_unpack_half_2x16 = true, 1227ec681f3Smrg .lower_unpack_snorm_4x8 = true, 1237ec681f3Smrg .lower_unpack_snorm_2x16 = true, 1247ec681f3Smrg .lower_unpack_unorm_4x8 = true, 1257ec681f3Smrg .lower_unpack_unorm_2x16 = true, 1267ec681f3Smrg .lower_pack_split = true, 1277ec681f3Smrg .use_interpolated_input_intrinsics = true, 1287ec681f3Smrg .lower_rotate = true, 1297ec681f3Smrg .vectorize_io = true, 1307ec681f3Smrg .lower_to_scalar = true, 1317ec681f3Smrg .has_imul24 = true, 1327ec681f3Smrg .has_fsub = true, 1337ec681f3Smrg .has_isub = true, 1347ec681f3Smrg .max_unroll_iterations = 32, 1357ec681f3Smrg .force_indirect_unrolling = nir_var_all, 1367ec681f3Smrg .lower_wpos_pntc = true, 1377ec681f3Smrg .lower_cs_local_index_from_id = true, 1387ec681f3Smrg 1397ec681f3Smrg /* Only needed for the spirv_to_nir() pass done in ir3_cmdline.c 1407ec681f3Smrg * but that should be harmless for GL since 64b is not 1417ec681f3Smrg * supported there. 
1427ec681f3Smrg */ 1437ec681f3Smrg .lower_int64_options = (nir_lower_int64_options)~0, 1447ec681f3Smrg .lower_uniforms_to_ubo = true, 1457ec681f3Smrg .lower_device_index_to_zero = true, 1467ec681f3Smrg .use_scoped_barrier = true, 1477e102996Smaya}; 1487e102996Smaya 1497e102996Smayaconst nir_shader_compiler_options * 1507e102996Smayair3_get_compiler_options(struct ir3_compiler *compiler) 1517e102996Smaya{ 1527ec681f3Smrg if (compiler->gen >= 6) 1537ec681f3Smrg return &options_a6xx; 1547ec681f3Smrg return &options; 1557e102996Smaya} 1567e102996Smaya 1577ec681f3Smrgstatic bool 1587ec681f3Smrgir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset, 1597ec681f3Smrg unsigned bit_size, unsigned num_components, 1607ec681f3Smrg nir_intrinsic_instr *low, 1617ec681f3Smrg nir_intrinsic_instr *high, void *data) 1627e102996Smaya{ 1637ec681f3Smrg assert(bit_size >= 8); 1647ec681f3Smrg if (bit_size != 32) 1657ec681f3Smrg return false; 1667ec681f3Smrg unsigned byte_size = bit_size / 8; 1677ec681f3Smrg 1687ec681f3Smrg int size = num_components * byte_size; 1697ec681f3Smrg 1707ec681f3Smrg /* Don't care about alignment past vec4. */ 1717ec681f3Smrg assert(util_is_power_of_two_nonzero(align_mul)); 1727ec681f3Smrg align_mul = MIN2(align_mul, 16); 1737ec681f3Smrg align_offset &= 15; 1747ec681f3Smrg 1757ec681f3Smrg /* Our offset alignment should aways be at least 4 bytes */ 1767ec681f3Smrg if (align_mul < 4) 1777ec681f3Smrg return false; 1787ec681f3Smrg 1797ec681f3Smrg unsigned worst_start_offset = 16 - align_mul + align_offset; 1807ec681f3Smrg if (worst_start_offset + size > 16) 1817ec681f3Smrg return false; 1827ec681f3Smrg 1837ec681f3Smrg return true; 1847e102996Smaya} 1857e102996Smaya 1867ec681f3Smrg#define OPT(nir, pass, ...) 
\ 1877ec681f3Smrg ({ \ 1887ec681f3Smrg bool this_progress = false; \ 1897ec681f3Smrg NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ 1907ec681f3Smrg this_progress; \ 1917ec681f3Smrg }) 1927e102996Smaya 1937e102996Smaya#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__) 1947e102996Smaya 1957ec681f3Smrgvoid 1967ec681f3Smrgir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s) 1977ec681f3Smrg{ 1987ec681f3Smrg bool progress; 1997ec681f3Smrg unsigned lower_flrp = (s->options->lower_flrp16 ? 16 : 0) | 2007ec681f3Smrg (s->options->lower_flrp32 ? 32 : 0) | 2017ec681f3Smrg (s->options->lower_flrp64 ? 64 : 0); 2027ec681f3Smrg 2037ec681f3Smrg do { 2047ec681f3Smrg progress = false; 2057ec681f3Smrg 2067ec681f3Smrg OPT_V(s, nir_lower_vars_to_ssa); 2077ec681f3Smrg progress |= OPT(s, nir_opt_copy_prop_vars); 2087ec681f3Smrg progress |= OPT(s, nir_opt_dead_write_vars); 2097ec681f3Smrg progress |= OPT(s, nir_lower_alu_to_scalar, NULL, NULL); 2107ec681f3Smrg progress |= OPT(s, nir_lower_phis_to_scalar, false); 2117ec681f3Smrg 2127ec681f3Smrg progress |= OPT(s, nir_copy_prop); 2137ec681f3Smrg progress |= OPT(s, nir_opt_deref); 2147ec681f3Smrg progress |= OPT(s, nir_opt_dce); 2157ec681f3Smrg progress |= OPT(s, nir_opt_cse); 2167ec681f3Smrg static int gcm = -1; 2177ec681f3Smrg if (gcm == -1) 2187ec681f3Smrg gcm = env_var_as_unsigned("GCM", 0); 2197ec681f3Smrg if (gcm == 1) 2207ec681f3Smrg progress |= OPT(s, nir_opt_gcm, true); 2217ec681f3Smrg else if (gcm == 2) 2227ec681f3Smrg progress |= OPT(s, nir_opt_gcm, false); 2237ec681f3Smrg progress |= OPT(s, nir_opt_peephole_select, 16, true, true); 2247ec681f3Smrg progress |= OPT(s, nir_opt_intrinsics); 2257ec681f3Smrg /* NOTE: GS lowering inserts an output var with varying slot that 2267ec681f3Smrg * is larger than VARYING_SLOT_MAX (ie. GS_VERTEX_FLAGS_IR3), 2277ec681f3Smrg * which triggers asserts in nir_shader_gather_info(). To work 2287ec681f3Smrg * around that skip lowering phi precision for GS. 
2297ec681f3Smrg * 2307ec681f3Smrg * Calling nir_shader_gather_info() late also seems to cause 2317ec681f3Smrg * problems for tess lowering, for now since we only enable 2327ec681f3Smrg * fp16/int16 for frag and compute, skip phi precision lowering 2337ec681f3Smrg * for other stages. 2347ec681f3Smrg */ 2357ec681f3Smrg if ((s->info.stage == MESA_SHADER_FRAGMENT) || 2367ec681f3Smrg (s->info.stage == MESA_SHADER_COMPUTE)) { 2377ec681f3Smrg progress |= OPT(s, nir_opt_phi_precision); 2387ec681f3Smrg } 2397ec681f3Smrg progress |= OPT(s, nir_opt_algebraic); 2407ec681f3Smrg progress |= OPT(s, nir_lower_alu); 2417ec681f3Smrg progress |= OPT(s, nir_lower_pack); 2427ec681f3Smrg progress |= OPT(s, nir_opt_constant_folding); 2437ec681f3Smrg 2447ec681f3Smrg nir_load_store_vectorize_options vectorize_opts = { 2457ec681f3Smrg .modes = nir_var_mem_ubo, 2467ec681f3Smrg .callback = ir3_nir_should_vectorize_mem, 2477ec681f3Smrg .robust_modes = compiler->robust_ubo_access ? nir_var_mem_ubo : 0, 2487ec681f3Smrg }; 2497ec681f3Smrg progress |= OPT(s, nir_opt_load_store_vectorize, &vectorize_opts); 2507ec681f3Smrg 2517ec681f3Smrg if (lower_flrp != 0) { 2527ec681f3Smrg if (OPT(s, nir_lower_flrp, lower_flrp, false /* always_precise */)) { 2537ec681f3Smrg OPT(s, nir_opt_constant_folding); 2547ec681f3Smrg progress = true; 2557ec681f3Smrg } 2567ec681f3Smrg 2577ec681f3Smrg /* Nothing should rematerialize any flrps, so we only 2587ec681f3Smrg * need to do this lowering once. 2597ec681f3Smrg */ 2607ec681f3Smrg lower_flrp = 0; 2617ec681f3Smrg } 2627ec681f3Smrg 2637ec681f3Smrg progress |= OPT(s, nir_opt_dead_cf); 2647ec681f3Smrg if (OPT(s, nir_opt_trivial_continues)) { 2657ec681f3Smrg progress |= true; 2667ec681f3Smrg /* If nir_opt_trivial_continues makes progress, then we need to clean 2677ec681f3Smrg * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll 2687ec681f3Smrg * to make progress. 
2697ec681f3Smrg */ 2707ec681f3Smrg OPT(s, nir_copy_prop); 2717ec681f3Smrg OPT(s, nir_opt_dce); 2727ec681f3Smrg } 2737ec681f3Smrg progress |= OPT(s, nir_opt_if, false); 2747ec681f3Smrg progress |= OPT(s, nir_opt_loop_unroll); 2757ec681f3Smrg progress |= OPT(s, nir_opt_remove_phis); 2767ec681f3Smrg progress |= OPT(s, nir_opt_undef); 2777ec681f3Smrg } while (progress); 2787ec681f3Smrg} 2797ec681f3Smrg 2807ec681f3Smrgstatic bool 2817ec681f3Smrgshould_split_wrmask(const nir_instr *instr, const void *data) 2827ec681f3Smrg{ 2837ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 2847ec681f3Smrg 2857ec681f3Smrg switch (intr->intrinsic) { 2867ec681f3Smrg case nir_intrinsic_store_ssbo: 2877ec681f3Smrg case nir_intrinsic_store_shared: 2887ec681f3Smrg case nir_intrinsic_store_global: 2897ec681f3Smrg case nir_intrinsic_store_scratch: 2907ec681f3Smrg return true; 2917ec681f3Smrg default: 2927ec681f3Smrg return false; 2937ec681f3Smrg } 2947ec681f3Smrg} 2957ec681f3Smrg 2967ec681f3Smrgstatic bool 2977ec681f3Smrgir3_nir_lower_ssbo_size_filter(const nir_instr *instr, const void *data) 2987ec681f3Smrg{ 2997ec681f3Smrg return instr->type == nir_instr_type_intrinsic && 3007ec681f3Smrg nir_instr_as_intrinsic(instr)->intrinsic == 3017ec681f3Smrg nir_intrinsic_get_ssbo_size; 3027ec681f3Smrg} 3037ec681f3Smrg 3047ec681f3Smrgstatic nir_ssa_def * 3057ec681f3Smrgir3_nir_lower_ssbo_size_instr(nir_builder *b, nir_instr *instr, void *data) 3067ec681f3Smrg{ 3077ec681f3Smrg uint8_t ssbo_size_to_bytes_shift = *(uint8_t *) data; 3087ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 3097ec681f3Smrg return nir_ishl(b, &intr->dest.ssa, nir_imm_int(b, ssbo_size_to_bytes_shift)); 3107ec681f3Smrg} 3117ec681f3Smrg 3127ec681f3Smrg/** 3137ec681f3Smrg * The resinfo opcode we have for getting the SSBO size on a6xx returns a byte 3147ec681f3Smrg * length divided by IBO_0_FMT, while the NIR intrinsic coming in is a number of 3157ec681f3Smrg * bytes. 
Switch things so the NIR intrinsic in our backend means dwords. 3167ec681f3Smrg */ 3177ec681f3Smrgstatic bool 3187ec681f3Smrgir3_nir_lower_ssbo_size(nir_shader *s, bool storage_16bit) 3197ec681f3Smrg{ 3207ec681f3Smrg uint8_t ssbo_size_to_bytes_shift = storage_16bit ? 1 : 2; 3217ec681f3Smrg return nir_shader_lower_instructions(s, ir3_nir_lower_ssbo_size_filter, 3227ec681f3Smrg ir3_nir_lower_ssbo_size_instr, 3237ec681f3Smrg &ssbo_size_to_bytes_shift); 3247ec681f3Smrg} 3257ec681f3Smrg 3267ec681f3Smrgvoid 3277ec681f3Smrgir3_nir_lower_io_to_temporaries(nir_shader *s) 3287ec681f3Smrg{ 3297ec681f3Smrg /* Outputs consumed by the VPC, VS inputs, and FS outputs are all handled 3307ec681f3Smrg * by the hardware pre-loading registers at the beginning and then reading 3317ec681f3Smrg * them at the end, so we can't access them indirectly except through 3327ec681f3Smrg * normal register-indirect accesses, and therefore ir3 doesn't support 3337ec681f3Smrg * indirect accesses on those. Other i/o is lowered in ir3_nir_lower_tess, 3347ec681f3Smrg * and indirects work just fine for those. GS outputs may be consumed by 3357ec681f3Smrg * VPC, but have their own lowering in ir3_nir_lower_gs() which does 3367ec681f3Smrg * something similar to nir_lower_io_to_temporaries so we shouldn't need 3377ec681f3Smrg * to lower them. 3387ec681f3Smrg * 3397ec681f3Smrg * Note: this might be a little inefficient for VS or TES outputs which are 3407ec681f3Smrg * when the next stage isn't an FS, but it probably don't make sense to 3417ec681f3Smrg * depend on the next stage before variant creation. 3427ec681f3Smrg * 3437ec681f3Smrg * TODO: for gallium, mesa/st also does some redundant lowering, including 3447ec681f3Smrg * running this pass for GS inputs/outputs which we don't want but not 3457ec681f3Smrg * including TES outputs or FS inputs which we do need. We should probably 3467ec681f3Smrg * stop doing that once we're sure all drivers are doing their own 3477ec681f3Smrg * indirect i/o lowering. 
3487ec681f3Smrg */ 3497ec681f3Smrg bool lower_input = s->info.stage == MESA_SHADER_VERTEX || 3507ec681f3Smrg s->info.stage == MESA_SHADER_FRAGMENT; 3517ec681f3Smrg bool lower_output = s->info.stage != MESA_SHADER_TESS_CTRL && 3527ec681f3Smrg s->info.stage != MESA_SHADER_GEOMETRY; 3537ec681f3Smrg if (lower_input || lower_output) { 3547ec681f3Smrg NIR_PASS_V(s, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(s), 3557ec681f3Smrg lower_output, lower_input); 3567ec681f3Smrg 3577ec681f3Smrg /* nir_lower_io_to_temporaries() creates global variables and copy 3587ec681f3Smrg * instructions which need to be cleaned up. 3597ec681f3Smrg */ 3607ec681f3Smrg NIR_PASS_V(s, nir_split_var_copies); 3617ec681f3Smrg NIR_PASS_V(s, nir_lower_var_copies); 3627ec681f3Smrg NIR_PASS_V(s, nir_lower_global_vars_to_local); 3637ec681f3Smrg } 3647ec681f3Smrg 3657ec681f3Smrg /* Regardless of the above, we need to lower indirect references to 3667ec681f3Smrg * compact variables such as clip/cull distances because due to how 3677ec681f3Smrg * TCS<->TES IO works we cannot handle indirect accesses that "straddle" 3687ec681f3Smrg * vec4 components. nir_lower_indirect_derefs has a special case for 3697ec681f3Smrg * compact variables, so it will actually lower them even though we pass 3707ec681f3Smrg * in 0 modes. 3717ec681f3Smrg * 3727ec681f3Smrg * Using temporaries would be slightly better but 3737ec681f3Smrg * nir_lower_io_to_temporaries currently doesn't support TCS i/o. 
3747ec681f3Smrg */ 3757ec681f3Smrg NIR_PASS_V(s, nir_lower_indirect_derefs, 0, UINT32_MAX); 3767ec681f3Smrg} 3777ec681f3Smrg 3787ec681f3Smrgvoid 3797ec681f3Smrgir3_finalize_nir(struct ir3_compiler *compiler, nir_shader *s) 3807ec681f3Smrg{ 3817ec681f3Smrg struct nir_lower_tex_options tex_options = { 3827ec681f3Smrg .lower_rect = 0, 3837ec681f3Smrg .lower_tg4_offsets = true, 3847ec681f3Smrg }; 3857ec681f3Smrg 3867ec681f3Smrg if (compiler->gen >= 4) { 3877ec681f3Smrg /* a4xx seems to have *no* sam.p */ 3887ec681f3Smrg tex_options.lower_txp = ~0; /* lower all txp */ 3897ec681f3Smrg } else { 3907ec681f3Smrg /* a3xx just needs to avoid sam.p for 3d tex */ 3917ec681f3Smrg tex_options.lower_txp = (1 << GLSL_SAMPLER_DIM_3D); 3927ec681f3Smrg } 3937ec681f3Smrg 3947ec681f3Smrg if (ir3_shader_debug & IR3_DBG_DISASM) { 3957ec681f3Smrg mesa_logi("----------------------"); 3967ec681f3Smrg nir_log_shaderi(s); 3977ec681f3Smrg mesa_logi("----------------------"); 3987ec681f3Smrg } 3997ec681f3Smrg 4007ec681f3Smrg if (s->info.stage == MESA_SHADER_GEOMETRY) 4017ec681f3Smrg NIR_PASS_V(s, ir3_nir_lower_gs); 4027ec681f3Smrg 4037ec681f3Smrg NIR_PASS_V(s, nir_lower_amul, ir3_glsl_type_size); 4047ec681f3Smrg 4057ec681f3Smrg OPT_V(s, nir_lower_regs_to_ssa); 4067ec681f3Smrg OPT_V(s, nir_lower_wrmasks, should_split_wrmask, s); 4077ec681f3Smrg 4087ec681f3Smrg OPT_V(s, nir_lower_tex, &tex_options); 4097ec681f3Smrg OPT_V(s, nir_lower_load_const_to_scalar); 4107ec681f3Smrg if (compiler->gen < 5) 4117ec681f3Smrg OPT_V(s, ir3_nir_lower_tg4_to_tex); 4127ec681f3Smrg 4137ec681f3Smrg ir3_optimize_loop(compiler, s); 4147ec681f3Smrg 4157ec681f3Smrg /* do idiv lowering after first opt loop to get a chance to propagate 4167ec681f3Smrg * constants for divide by immed power-of-two: 4177ec681f3Smrg */ 4187ec681f3Smrg nir_lower_idiv_options idiv_options = { 4197ec681f3Smrg .imprecise_32bit_lowering = true, 4207ec681f3Smrg .allow_fp16 = true, 4217ec681f3Smrg }; 4227ec681f3Smrg const bool idiv_progress = OPT(s, 
nir_lower_idiv, &idiv_options); 4237ec681f3Smrg 4247ec681f3Smrg if (idiv_progress) 4257ec681f3Smrg ir3_optimize_loop(compiler, s); 4267ec681f3Smrg 4277ec681f3Smrg OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL); 4287ec681f3Smrg 4297ec681f3Smrg if (ir3_shader_debug & IR3_DBG_DISASM) { 4307ec681f3Smrg mesa_logi("----------------------"); 4317ec681f3Smrg nir_log_shaderi(s); 4327ec681f3Smrg mesa_logi("----------------------"); 4337ec681f3Smrg } 4347ec681f3Smrg 4357ec681f3Smrg /* st_program.c's parameter list optimization requires that future nir 4367ec681f3Smrg * variants don't reallocate the uniform storage, so we have to remove 4377ec681f3Smrg * uniforms that occupy storage. But we don't want to remove samplers, 4387ec681f3Smrg * because they're needed for YUV variant lowering. 4397ec681f3Smrg */ 4407ec681f3Smrg nir_foreach_uniform_variable_safe (var, s) { 4417ec681f3Smrg if (var->data.mode == nir_var_uniform && 4427ec681f3Smrg (glsl_type_get_image_count(var->type) || 4437ec681f3Smrg glsl_type_get_sampler_count(var->type))) 4447ec681f3Smrg continue; 4457ec681f3Smrg 4467ec681f3Smrg exec_node_remove(&var->node); 4477ec681f3Smrg } 4487ec681f3Smrg nir_validate_shader(s, "after uniform var removal"); 4497ec681f3Smrg 4507ec681f3Smrg nir_sweep(s); 4517ec681f3Smrg} 4527ec681f3Smrg 4537ec681f3Smrgstatic bool 4547ec681f3Smrglower_subgroup_id_filter(const nir_instr *instr, const void *unused) 4557e102996Smaya{ 4567ec681f3Smrg (void)unused; 4577ec681f3Smrg 4587ec681f3Smrg if (instr->type != nir_instr_type_intrinsic) 4597ec681f3Smrg return false; 4607ec681f3Smrg 4617ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 4627ec681f3Smrg return intr->intrinsic == nir_intrinsic_load_subgroup_invocation || 4637ec681f3Smrg intr->intrinsic == nir_intrinsic_load_subgroup_id || 4647ec681f3Smrg intr->intrinsic == nir_intrinsic_load_num_subgroups; 4657e102996Smaya} 4667e102996Smaya 4677ec681f3Smrgstatic nir_ssa_def * 
4687ec681f3Smrglower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused) 4697e102996Smaya{ 4707ec681f3Smrg (void)unused; 4717ec681f3Smrg 4727ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 4737ec681f3Smrg if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) { 4747ec681f3Smrg return nir_iand( 4757ec681f3Smrg b, nir_load_local_invocation_index(b), 4767ec681f3Smrg nir_isub(b, nir_load_subgroup_size(b), nir_imm_int(b, 1))); 4777ec681f3Smrg } else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) { 4787ec681f3Smrg return nir_ishr(b, nir_load_local_invocation_index(b), 4797ec681f3Smrg nir_load_subgroup_id_shift_ir3(b)); 4807ec681f3Smrg } else { 4817ec681f3Smrg assert(intr->intrinsic == nir_intrinsic_load_num_subgroups); 4827ec681f3Smrg /* If the workgroup size is constant, 4837ec681f3Smrg * nir_lower_compute_system_values() will replace local_size with a 4847ec681f3Smrg * constant so this can mostly be constant folded away. 4857ec681f3Smrg */ 4867ec681f3Smrg nir_ssa_def *local_size = nir_load_workgroup_size(b); 4877ec681f3Smrg nir_ssa_def *size = 4887ec681f3Smrg nir_imul24(b, nir_channel(b, local_size, 0), 4897ec681f3Smrg nir_imul24(b, nir_channel(b, local_size, 1), 4907ec681f3Smrg nir_channel(b, local_size, 2))); 4917ec681f3Smrg nir_ssa_def *one = nir_imm_int(b, 1); 4927ec681f3Smrg return nir_iadd(b, one, 4937ec681f3Smrg nir_ishr(b, nir_isub(b, size, one), 4947ec681f3Smrg nir_load_subgroup_id_shift_ir3(b))); 4957ec681f3Smrg } 4967e102996Smaya} 4977e102996Smaya 4987ec681f3Smrgstatic bool 4997ec681f3Smrgir3_nir_lower_subgroup_id_cs(nir_shader *shader) 5007ec681f3Smrg{ 5017ec681f3Smrg return nir_shader_lower_instructions(shader, lower_subgroup_id_filter, 5027ec681f3Smrg lower_subgroup_id, NULL); 5037ec681f3Smrg} 5047ec681f3Smrg 5057ec681f3Smrgstatic const nir_lower_idiv_options idiv_options = { 5067ec681f3Smrg .imprecise_32bit_lowering = true, 5077ec681f3Smrg .allow_fp16 = true, 5087ec681f3Smrg}; 5097ec681f3Smrg 
5107ec681f3Smrg/** 5117ec681f3Smrg * Late passes that need to be done after pscreen->finalize_nir() 5127ec681f3Smrg */ 5137ec681f3Smrgvoid 5147ec681f3Smrgir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s) 5157ec681f3Smrg{ 5167ec681f3Smrg NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out, 5177ec681f3Smrg ir3_glsl_type_size, (nir_lower_io_options)0); 5187ec681f3Smrg 5197ec681f3Smrg if (s->info.stage == MESA_SHADER_FRAGMENT) { 5207ec681f3Smrg /* NOTE: lower load_barycentric_at_sample first, since it 5217ec681f3Smrg * produces load_barycentric_at_offset: 5227ec681f3Smrg */ 5237ec681f3Smrg NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_sample); 5247ec681f3Smrg NIR_PASS_V(s, ir3_nir_lower_load_barycentric_at_offset); 5257ec681f3Smrg NIR_PASS_V(s, ir3_nir_move_varying_inputs); 5267ec681f3Smrg NIR_PASS_V(s, nir_lower_fb_read); 5277ec681f3Smrg } 5287ec681f3Smrg 5297ec681f3Smrg if (compiler->gen >= 6 && s->info.stage == MESA_SHADER_FRAGMENT && 5307ec681f3Smrg !(ir3_shader_debug & IR3_DBG_NOFP16)) { 5317ec681f3Smrg NIR_PASS_V(s, nir_lower_mediump_io, nir_var_shader_out, 0, false); 5327ec681f3Smrg } 5337ec681f3Smrg 5347ec681f3Smrg if (s->info.stage == MESA_SHADER_COMPUTE) { 5357ec681f3Smrg bool progress = false; 5367ec681f3Smrg NIR_PASS(progress, s, nir_lower_subgroups, 5377ec681f3Smrg &(nir_lower_subgroups_options){ 5387ec681f3Smrg .subgroup_size = 128, 5397ec681f3Smrg .ballot_bit_size = 32, 5407ec681f3Smrg .ballot_components = 4, 5417ec681f3Smrg .lower_to_scalar = true, 5427ec681f3Smrg .lower_vote_eq = true, 5437ec681f3Smrg .lower_subgroup_masks = true, 5447ec681f3Smrg .lower_read_invocation_to_cond = true, 5457ec681f3Smrg }); 5467ec681f3Smrg 5477ec681f3Smrg progress = false; 5487ec681f3Smrg NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs); 5497ec681f3Smrg 5507ec681f3Smrg /* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which 5517ec681f3Smrg * we need to lower again. 
5527ec681f3Smrg */ 5537ec681f3Smrg if (progress) 5547ec681f3Smrg NIR_PASS_V(s, nir_lower_compute_system_values, NULL); 5557ec681f3Smrg } 5567ec681f3Smrg 5577ec681f3Smrg /* we cannot ensure that ir3_finalize_nir() is only called once, so 5587ec681f3Smrg * we also need to do any run-once workarounds here: 5597ec681f3Smrg */ 5607ec681f3Smrg OPT_V(s, ir3_nir_apply_trig_workarounds); 5617ec681f3Smrg 5627ec681f3Smrg nir_lower_image_options lower_image_opts = { 5637ec681f3Smrg .lower_cube_size = true, 5647ec681f3Smrg }; 5657ec681f3Smrg NIR_PASS_V(s, nir_lower_image, &lower_image_opts); 5667ec681f3Smrg NIR_PASS_V(s, nir_lower_idiv, &idiv_options); /* idiv generated by cube lowering */ 5677ec681f3Smrg 5687ec681f3Smrg if (compiler->gen >= 6) 5697ec681f3Smrg OPT_V(s, ir3_nir_lower_ssbo_size, compiler->storage_16bit); 5707ec681f3Smrg 5717ec681f3Smrg ir3_optimize_loop(compiler, s); 5727ec681f3Smrg} 5737ec681f3Smrg 5747ec681f3Smrgstatic bool 5757ec681f3Smrgir3_nir_lower_view_layer_id(nir_shader *nir, bool layer_zero, bool view_zero) 5767ec681f3Smrg{ 5777ec681f3Smrg unsigned layer_id_loc = ~0, view_id_loc = ~0; 5787ec681f3Smrg nir_foreach_shader_in_variable (var, nir) { 5797ec681f3Smrg if (var->data.location == VARYING_SLOT_LAYER) 5807ec681f3Smrg layer_id_loc = var->data.driver_location; 5817ec681f3Smrg if (var->data.location == VARYING_SLOT_VIEWPORT) 5827ec681f3Smrg view_id_loc = var->data.driver_location; 5837ec681f3Smrg } 5847ec681f3Smrg 5857ec681f3Smrg assert(!layer_zero || layer_id_loc != ~0); 5867ec681f3Smrg assert(!view_zero || view_id_loc != ~0); 5877ec681f3Smrg 5887ec681f3Smrg bool progress = false; 5897ec681f3Smrg nir_builder b; 5907ec681f3Smrg 5917ec681f3Smrg nir_foreach_function (func, nir) { 5927ec681f3Smrg nir_builder_init(&b, func->impl); 5937ec681f3Smrg 5947ec681f3Smrg nir_foreach_block (block, func->impl) { 5957ec681f3Smrg nir_foreach_instr_safe (instr, block) { 5967ec681f3Smrg if (instr->type != nir_instr_type_intrinsic) 5977ec681f3Smrg continue; 5987ec681f3Smrg 
5997ec681f3Smrg nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 6007ec681f3Smrg 6017ec681f3Smrg if (intrin->intrinsic != nir_intrinsic_load_input) 6027ec681f3Smrg continue; 6037ec681f3Smrg 6047ec681f3Smrg unsigned base = nir_intrinsic_base(intrin); 6057ec681f3Smrg if (base != layer_id_loc && base != view_id_loc) 6067ec681f3Smrg continue; 6077ec681f3Smrg 6087ec681f3Smrg b.cursor = nir_before_instr(&intrin->instr); 6097ec681f3Smrg nir_ssa_def *zero = nir_imm_int(&b, 0); 6107ec681f3Smrg nir_ssa_def_rewrite_uses(&intrin->dest.ssa, zero); 6117ec681f3Smrg nir_instr_remove(&intrin->instr); 6127ec681f3Smrg progress = true; 6137ec681f3Smrg } 6147ec681f3Smrg } 6157ec681f3Smrg 6167ec681f3Smrg if (progress) { 6177ec681f3Smrg nir_metadata_preserve( 6187ec681f3Smrg func->impl, nir_metadata_block_index | nir_metadata_dominance); 6197ec681f3Smrg } else { 6207ec681f3Smrg nir_metadata_preserve(func->impl, nir_metadata_all); 6217ec681f3Smrg } 6227ec681f3Smrg } 6237ec681f3Smrg 6247ec681f3Smrg return progress; 6257ec681f3Smrg} 6267ec681f3Smrg 6277ec681f3Smrgvoid 6287ec681f3Smrgir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s) 6297ec681f3Smrg{ 6307ec681f3Smrg if (ir3_shader_debug & IR3_DBG_DISASM) { 6317ec681f3Smrg mesa_logi("----------------------"); 6327ec681f3Smrg nir_log_shaderi(s); 6337ec681f3Smrg mesa_logi("----------------------"); 6347ec681f3Smrg } 6357ec681f3Smrg 6367ec681f3Smrg bool progress = false; 6377ec681f3Smrg 6387ec681f3Smrg if (so->key.has_gs || so->key.tessellation) { 6397ec681f3Smrg switch (so->shader->type) { 6407ec681f3Smrg case MESA_SHADER_VERTEX: 6417ec681f3Smrg NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so, 6427ec681f3Smrg so->key.tessellation); 6437ec681f3Smrg progress = true; 6447ec681f3Smrg break; 6457ec681f3Smrg case MESA_SHADER_TESS_CTRL: 6467ec681f3Smrg NIR_PASS_V(s, ir3_nir_lower_tess_ctrl, so, so->key.tessellation); 6477ec681f3Smrg NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so); 6487ec681f3Smrg progress = true; 
6497ec681f3Smrg break; 6507ec681f3Smrg case MESA_SHADER_TESS_EVAL: 6517ec681f3Smrg NIR_PASS_V(s, ir3_nir_lower_tess_eval, so, so->key.tessellation); 6527ec681f3Smrg if (so->key.has_gs) 6537ec681f3Smrg NIR_PASS_V(s, ir3_nir_lower_to_explicit_output, so, 6547ec681f3Smrg so->key.tessellation); 6557ec681f3Smrg progress = true; 6567ec681f3Smrg break; 6577ec681f3Smrg case MESA_SHADER_GEOMETRY: 6587ec681f3Smrg NIR_PASS_V(s, ir3_nir_lower_to_explicit_input, so); 6597ec681f3Smrg progress = true; 6607ec681f3Smrg break; 6617ec681f3Smrg default: 6627ec681f3Smrg break; 6637ec681f3Smrg } 6647ec681f3Smrg } 6657ec681f3Smrg 6667ec681f3Smrg if (s->info.stage == MESA_SHADER_VERTEX) { 6677ec681f3Smrg if (so->key.ucp_enables) 6687ec681f3Smrg progress |= 6697ec681f3Smrg OPT(s, nir_lower_clip_vs, so->key.ucp_enables, false, false, NULL); 6707ec681f3Smrg } else if (s->info.stage == MESA_SHADER_FRAGMENT) { 6717ec681f3Smrg bool layer_zero = 6727ec681f3Smrg so->key.layer_zero && (s->info.inputs_read & VARYING_BIT_LAYER); 6737ec681f3Smrg bool view_zero = 6747ec681f3Smrg so->key.view_zero && (s->info.inputs_read & VARYING_BIT_VIEWPORT); 6757ec681f3Smrg 6767ec681f3Smrg if (so->key.ucp_enables && !so->shader->compiler->has_clip_cull) 6777ec681f3Smrg progress |= OPT(s, nir_lower_clip_fs, so->key.ucp_enables, false); 6787ec681f3Smrg if (layer_zero || view_zero) 6797ec681f3Smrg progress |= OPT(s, ir3_nir_lower_view_layer_id, layer_zero, view_zero); 6807ec681f3Smrg } 6817ec681f3Smrg 6827ec681f3Smrg /* Move large constant variables to the constants attached to the NIR 6837ec681f3Smrg * shader, which we will upload in the immediates range. This generates 6847ec681f3Smrg * amuls, so we need to clean those up after. 6857ec681f3Smrg * 6867ec681f3Smrg * Passing no size_align, we would get packed values, which if we end up 6877ec681f3Smrg * having to load with LDC would result in extra reads to unpack from 6887ec681f3Smrg * straddling loads. 
Align everything to vec4 to avoid that, though we 6897ec681f3Smrg * could theoretically do better. 6907ec681f3Smrg */ 6917ec681f3Smrg OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes, 6927ec681f3Smrg 32 /* bytes */); 6937ec681f3Smrg OPT_V(s, ir3_nir_lower_load_constant, so); 6947ec681f3Smrg 6957ec681f3Smrg if (!so->binning_pass) 6967ec681f3Smrg OPT_V(s, ir3_nir_analyze_ubo_ranges, so); 6977ec681f3Smrg 6987ec681f3Smrg progress |= OPT(s, ir3_nir_lower_ubo_loads, so); 6997ec681f3Smrg 7007ec681f3Smrg /* Lower large temporaries to scratch, which in Qualcomm terms is private 7017ec681f3Smrg * memory, to avoid excess register pressure. This should happen after 7027ec681f3Smrg * nir_opt_large_constants, because loading from a UBO is much, much less 7037ec681f3Smrg * expensive. 7047ec681f3Smrg */ 7057ec681f3Smrg if (so->shader->compiler->has_pvtmem) { 7067ec681f3Smrg progress |= OPT(s, nir_lower_vars_to_scratch, nir_var_function_temp, 7077ec681f3Smrg 16 * 16 /* bytes */, glsl_get_natural_size_align_bytes); 7087ec681f3Smrg } 7097ec681f3Smrg 7107ec681f3Smrg /* Lower scratch writemasks */ 7117ec681f3Smrg progress |= OPT(s, nir_lower_wrmasks, should_split_wrmask, s); 7127ec681f3Smrg 7137ec681f3Smrg OPT_V(s, nir_lower_amul, ir3_glsl_type_size); 7147ec681f3Smrg 7157ec681f3Smrg /* UBO offset lowering has to come after we've decided what will 7167ec681f3Smrg * be left as load_ubo 7177ec681f3Smrg */ 7187ec681f3Smrg if (so->shader->compiler->gen >= 6) 7197ec681f3Smrg progress |= OPT(s, nir_lower_ubo_vec4); 7207ec681f3Smrg 7217ec681f3Smrg OPT_V(s, ir3_nir_lower_io_offsets); 7227ec681f3Smrg 7237ec681f3Smrg if (progress) 7247ec681f3Smrg ir3_optimize_loop(so->shader->compiler, s); 7257ec681f3Smrg 7267ec681f3Smrg /* Fixup indirect load_uniform's which end up with a const base offset 7277ec681f3Smrg * which is too large to encode. Do this late(ish) so we actually 7287ec681f3Smrg * can differentiate indirect vs non-indirect. 
7297ec681f3Smrg */ 7307ec681f3Smrg if (OPT(s, ir3_nir_fixup_load_uniform)) 7317ec681f3Smrg ir3_optimize_loop(so->shader->compiler, s); 7327ec681f3Smrg 7337ec681f3Smrg /* Do late algebraic optimization to turn add(a, neg(b)) back into 7347ec681f3Smrg * subs, then the mandatory cleanup after algebraic. Note that it may 7357ec681f3Smrg * produce fnegs, and if so then we need to keep running to squash 7367ec681f3Smrg * fneg(fneg(a)). 7377ec681f3Smrg */ 7387ec681f3Smrg bool more_late_algebraic = true; 7397ec681f3Smrg while (more_late_algebraic) { 7407ec681f3Smrg more_late_algebraic = OPT(s, nir_opt_algebraic_late); 7417ec681f3Smrg OPT_V(s, nir_opt_constant_folding); 7427ec681f3Smrg OPT_V(s, nir_copy_prop); 7437ec681f3Smrg OPT_V(s, nir_opt_dce); 7447ec681f3Smrg OPT_V(s, nir_opt_cse); 7457ec681f3Smrg } 7467ec681f3Smrg 7477ec681f3Smrg OPT_V(s, nir_opt_sink, nir_move_const_undef); 7487ec681f3Smrg 7497ec681f3Smrg if (ir3_shader_debug & IR3_DBG_DISASM) { 7507ec681f3Smrg mesa_logi("----------------------"); 7517ec681f3Smrg nir_log_shaderi(s); 7527ec681f3Smrg mesa_logi("----------------------"); 7537ec681f3Smrg } 7547ec681f3Smrg 7557ec681f3Smrg nir_sweep(s); 7567ec681f3Smrg 7577ec681f3Smrg /* Binning pass variants re-use the const_state of the corresponding 7587ec681f3Smrg * draw pass shader, so that same const emit can be re-used for both 7597ec681f3Smrg * passes: 7607ec681f3Smrg */ 7617ec681f3Smrg if (!so->binning_pass) 7627ec681f3Smrg ir3_setup_const_state(s, so, ir3_const_state(so)); 7637ec681f3Smrg} 7647ec681f3Smrg 7657ec681f3Smrgstatic void 7667ec681f3Smrgir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, struct ir3_const_state *layout) 7677ec681f3Smrg{ 7687ec681f3Smrg nir_foreach_function (function, shader) { 7697ec681f3Smrg if (!function->impl) 7707ec681f3Smrg continue; 7717ec681f3Smrg 7727ec681f3Smrg nir_foreach_block (block, function->impl) { 7737ec681f3Smrg nir_foreach_instr (instr, block) { 7747ec681f3Smrg if (instr->type != 
nir_instr_type_intrinsic) 7757ec681f3Smrg continue; 7767ec681f3Smrg 7777ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 7787ec681f3Smrg unsigned idx; 7797ec681f3Smrg 7807ec681f3Smrg switch (intr->intrinsic) { 7817ec681f3Smrg case nir_intrinsic_image_atomic_add: 7827ec681f3Smrg case nir_intrinsic_image_atomic_imin: 7837ec681f3Smrg case nir_intrinsic_image_atomic_umin: 7847ec681f3Smrg case nir_intrinsic_image_atomic_imax: 7857ec681f3Smrg case nir_intrinsic_image_atomic_umax: 7867ec681f3Smrg case nir_intrinsic_image_atomic_and: 7877ec681f3Smrg case nir_intrinsic_image_atomic_or: 7887ec681f3Smrg case nir_intrinsic_image_atomic_xor: 7897ec681f3Smrg case nir_intrinsic_image_atomic_exchange: 7907ec681f3Smrg case nir_intrinsic_image_atomic_comp_swap: 7917ec681f3Smrg case nir_intrinsic_image_load: 7927ec681f3Smrg case nir_intrinsic_image_store: 7937ec681f3Smrg case nir_intrinsic_image_size: 7947ec681f3Smrg if (compiler->gen < 6 && 7957ec681f3Smrg !(intr->intrinsic == nir_intrinsic_image_load && 7967ec681f3Smrg !(nir_intrinsic_access(intr) & ACCESS_COHERENT))) { 7977ec681f3Smrg idx = nir_src_as_uint(intr->src[0]); 7987ec681f3Smrg if (layout->image_dims.mask & (1 << idx)) 7997ec681f3Smrg break; 8007ec681f3Smrg layout->image_dims.mask |= (1 << idx); 8017ec681f3Smrg layout->image_dims.off[idx] = layout->image_dims.count; 8027ec681f3Smrg layout->image_dims.count += 3; /* three const per */ 8037ec681f3Smrg } 8047ec681f3Smrg break; 8057ec681f3Smrg case nir_intrinsic_load_base_vertex: 8067ec681f3Smrg case nir_intrinsic_load_first_vertex: 8077ec681f3Smrg layout->num_driver_params = 8087ec681f3Smrg MAX2(layout->num_driver_params, IR3_DP_VTXID_BASE + 1); 8097ec681f3Smrg break; 8107ec681f3Smrg case nir_intrinsic_load_base_instance: 8117ec681f3Smrg layout->num_driver_params = 8127ec681f3Smrg MAX2(layout->num_driver_params, IR3_DP_INSTID_BASE + 1); 8137ec681f3Smrg break; 8147ec681f3Smrg case nir_intrinsic_load_user_clip_plane: 8157ec681f3Smrg idx = 
nir_intrinsic_ucp_id(intr); 8167ec681f3Smrg layout->num_driver_params = MAX2(layout->num_driver_params, 8177ec681f3Smrg IR3_DP_UCP0_X + (idx + 1) * 4); 8187ec681f3Smrg break; 8197ec681f3Smrg case nir_intrinsic_load_num_workgroups: 8207ec681f3Smrg layout->num_driver_params = 8217ec681f3Smrg MAX2(layout->num_driver_params, IR3_DP_NUM_WORK_GROUPS_Z + 1); 8227ec681f3Smrg break; 8237ec681f3Smrg case nir_intrinsic_load_workgroup_size: 8247ec681f3Smrg layout->num_driver_params = MAX2(layout->num_driver_params, 8257ec681f3Smrg IR3_DP_LOCAL_GROUP_SIZE_Z + 1); 8267ec681f3Smrg break; 8277ec681f3Smrg case nir_intrinsic_load_base_workgroup_id: 8287ec681f3Smrg layout->num_driver_params = 8297ec681f3Smrg MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1); 8307ec681f3Smrg break; 8317ec681f3Smrg case nir_intrinsic_load_subgroup_size: 8327ec681f3Smrg layout->num_driver_params = 8337ec681f3Smrg MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_SIZE + 1); 8347ec681f3Smrg break; 8357ec681f3Smrg case nir_intrinsic_load_subgroup_id_shift_ir3: 8367ec681f3Smrg layout->num_driver_params = 8377ec681f3Smrg MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1); 8387ec681f3Smrg break; 8397ec681f3Smrg default: 8407ec681f3Smrg break; 8417ec681f3Smrg } 8427ec681f3Smrg } 8437ec681f3Smrg } 8447ec681f3Smrg } 8457ec681f3Smrg} 8467ec681f3Smrg 8477ec681f3Smrg/* Sets up the variant-dependent constant state for the ir3_shader. Note 8487ec681f3Smrg * that it is also used from ir3_nir_analyze_ubo_ranges() to figure out the 8497ec681f3Smrg * maximum number of driver params that would eventually be used, to leave 8507ec681f3Smrg * space for this function to allocate the driver params. 
8517ec681f3Smrg */ 8527e102996Smayavoid 8537ec681f3Smrgir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v, 8547ec681f3Smrg struct ir3_const_state *const_state) 8557e102996Smaya{ 8567ec681f3Smrg struct ir3_compiler *compiler = v->shader->compiler; 8577ec681f3Smrg 8587ec681f3Smrg memset(&const_state->offsets, ~0, sizeof(const_state->offsets)); 8597ec681f3Smrg 8607ec681f3Smrg ir3_nir_scan_driver_consts(compiler, nir, const_state); 8617ec681f3Smrg 8627ec681f3Smrg if ((compiler->gen < 5) && (v->shader->stream_output.num_outputs > 0)) { 8637ec681f3Smrg const_state->num_driver_params = 8647ec681f3Smrg MAX2(const_state->num_driver_params, IR3_DP_VTXCNT_MAX + 1); 8657ec681f3Smrg } 8667ec681f3Smrg 8677ec681f3Smrg const_state->num_ubos = nir->info.num_ubos; 8687ec681f3Smrg 8697ec681f3Smrg debug_assert((const_state->ubo_state.size % 16) == 0); 8707ec681f3Smrg unsigned constoff = const_state->ubo_state.size / 16; 8717ec681f3Smrg unsigned ptrsz = ir3_pointer_size(compiler); 8727ec681f3Smrg 8737ec681f3Smrg if (const_state->num_ubos > 0) { 8747ec681f3Smrg const_state->offsets.ubo = constoff; 8757ec681f3Smrg constoff += align(const_state->num_ubos * ptrsz, 4) / 4; 8767ec681f3Smrg } 8777ec681f3Smrg 8787ec681f3Smrg if (const_state->image_dims.count > 0) { 8797ec681f3Smrg unsigned cnt = const_state->image_dims.count; 8807ec681f3Smrg const_state->offsets.image_dims = constoff; 8817ec681f3Smrg constoff += align(cnt, 4) / 4; 8827ec681f3Smrg } 8837ec681f3Smrg 8847ec681f3Smrg if (const_state->num_driver_params > 0) { 8857ec681f3Smrg /* num_driver_params in dwords. we only need to align to vec4s for the 8867ec681f3Smrg * common case of immediate constant uploads, but for indirect dispatch 8877ec681f3Smrg * the constants may also be indirect and so we have to align the area in 8887ec681f3Smrg * const space to that requirement. 
8897ec681f3Smrg */ 8907ec681f3Smrg const_state->num_driver_params = align(const_state->num_driver_params, 4); 8917ec681f3Smrg unsigned upload_unit = 1; 8927ec681f3Smrg if (v->type == MESA_SHADER_COMPUTE || 8937ec681f3Smrg (const_state->num_driver_params >= IR3_DP_VTXID_BASE)) { 8947ec681f3Smrg upload_unit = compiler->const_upload_unit; 8957ec681f3Smrg } 8967ec681f3Smrg 8977ec681f3Smrg /* offset cannot be 0 for vs params loaded by CP_DRAW_INDIRECT_MULTI */ 8987ec681f3Smrg if (v->type == MESA_SHADER_VERTEX && compiler->gen >= 6) 8997ec681f3Smrg constoff = MAX2(constoff, 1); 9007ec681f3Smrg constoff = align(constoff, upload_unit); 9017ec681f3Smrg const_state->offsets.driver_param = constoff; 9027ec681f3Smrg 9037ec681f3Smrg constoff += align(const_state->num_driver_params / 4, upload_unit); 9047ec681f3Smrg } 9057ec681f3Smrg 9067ec681f3Smrg if ((v->type == MESA_SHADER_VERTEX) && (compiler->gen < 5) && 9077ec681f3Smrg v->shader->stream_output.num_outputs > 0) { 9087ec681f3Smrg const_state->offsets.tfbo = constoff; 9097ec681f3Smrg constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4; 9107ec681f3Smrg } 9117ec681f3Smrg 9127ec681f3Smrg switch (v->type) { 9137ec681f3Smrg case MESA_SHADER_VERTEX: 9147ec681f3Smrg const_state->offsets.primitive_param = constoff; 9157ec681f3Smrg constoff += 1; 9167ec681f3Smrg break; 9177ec681f3Smrg case MESA_SHADER_TESS_CTRL: 9187ec681f3Smrg case MESA_SHADER_TESS_EVAL: 9197ec681f3Smrg constoff = align(constoff - 1, 4) + 3; 9207ec681f3Smrg const_state->offsets.primitive_param = constoff; 9217ec681f3Smrg const_state->offsets.primitive_map = constoff + 5; 9227ec681f3Smrg constoff += 5 + DIV_ROUND_UP(v->input_size, 4); 9237ec681f3Smrg break; 9247ec681f3Smrg case MESA_SHADER_GEOMETRY: 9257ec681f3Smrg const_state->offsets.primitive_param = constoff; 9267ec681f3Smrg const_state->offsets.primitive_map = constoff + 1; 9277ec681f3Smrg constoff += 1 + DIV_ROUND_UP(v->input_size, 4); 9287ec681f3Smrg break; 9297ec681f3Smrg default: 9307ec681f3Smrg break; 
9317ec681f3Smrg } 9327ec681f3Smrg 9337ec681f3Smrg const_state->offsets.immediate = constoff; 9347ec681f3Smrg 9357ec681f3Smrg assert(constoff <= ir3_max_const(v)); 9367e102996Smaya} 937