17ec681f3Smrg/* 27ec681f3Smrg * Copyright © 2019 Google, Inc. 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217ec681f3Smrg * SOFTWARE. 227ec681f3Smrg */ 237ec681f3Smrg 247ec681f3Smrg#include "nir.h" 257ec681f3Smrg#include "nir_vla.h" 267ec681f3Smrg 277ec681f3Smrg/* Lowering for amul instructions, for drivers that support imul24. 287ec681f3Smrg * This pass will analyze indirect derefs, and convert corresponding 297ec681f3Smrg * amul instructions to either imul or imul24, depending on the 307ec681f3Smrg * required range. 317ec681f3Smrg * 327ec681f3Smrg * 1) Analyze the uniform variables and build a table of UBOs and SSBOs 337ec681f3Smrg * that are either too large, or might be too large (unknown size) 347ec681f3Smrg * for imul24 357ec681f3Smrg * 367ec681f3Smrg * 2) Loop thru looking at all the intrinsics, finding dereferences of 377ec681f3Smrg * large variables, and recursively replacing all amul instructions 387ec681f3Smrg * used with imul 397ec681f3Smrg * 407ec681f3Smrg * 3) Finally loop again thru all instructions replacing any remaining 417ec681f3Smrg * amul with imul24. At this point any remaining amul instructions 427ec681f3Smrg * are not involved in calculating an offset into a large variable, 437ec681f3Smrg * thanks to the 2nd step, so they can be safely replace with imul24. 447ec681f3Smrg * 457ec681f3Smrg * Using two passes over all the instructions lets us handle the case 467ec681f3Smrg * where, due to CSE, an amul is used to calculate an offset into both 477ec681f3Smrg * a large and small variable. 487ec681f3Smrg */ 497ec681f3Smrg 507ec681f3Smrgtypedef struct { 517ec681f3Smrg nir_shader *shader; 527ec681f3Smrg 537ec681f3Smrg int (*type_size)(const struct glsl_type *, bool); 547ec681f3Smrg 557ec681f3Smrg /* Tables of UBOs and SSBOs mapping driver_location/base whether 567ec681f3Smrg * they are too large to use imul24: 577ec681f3Smrg */ 587ec681f3Smrg bool *large_ubos; 597ec681f3Smrg bool *large_ssbos; 607ec681f3Smrg 617ec681f3Smrg /* for cases that we cannot determine UBO/SSBO index, track if *any* 627ec681f3Smrg * UBO/SSBO is too large for imul24: 637ec681f3Smrg */ 647ec681f3Smrg bool has_large_ubo; 657ec681f3Smrg bool has_large_ssbo; 667ec681f3Smrg 677ec681f3Smrg unsigned max_slot; 687ec681f3Smrg 697ec681f3Smrg bool progress; 707ec681f3Smrg} lower_state; 717ec681f3Smrg 727ec681f3Smrg/* Lower 'amul's in offset src of large variables to 'imul': */ 737ec681f3Smrgstatic bool 747ec681f3Smrglower_large_src(nir_src *src, void *s) 757ec681f3Smrg{ 767ec681f3Smrg lower_state *state = s; 777ec681f3Smrg 787ec681f3Smrg assert(src->is_ssa); 797ec681f3Smrg 807ec681f3Smrg nir_instr *parent = src->ssa->parent_instr; 817ec681f3Smrg 827ec681f3Smrg /* No need to visit instructions we've already visited.. this also 837ec681f3Smrg * avoids infinite recursion when phi's are involved: 847ec681f3Smrg */ 857ec681f3Smrg if (parent->pass_flags) 867ec681f3Smrg return false; 877ec681f3Smrg 887ec681f3Smrg nir_foreach_src(parent, lower_large_src, state); 897ec681f3Smrg 907ec681f3Smrg if (parent->type == nir_instr_type_alu) { 917ec681f3Smrg nir_alu_instr *alu = nir_instr_as_alu(parent); 927ec681f3Smrg if (alu->op == nir_op_amul) { 937ec681f3Smrg alu->op = nir_op_imul; 947ec681f3Smrg state->progress = true; 957ec681f3Smrg } 967ec681f3Smrg } 977ec681f3Smrg 987ec681f3Smrg parent->pass_flags = 1; 997ec681f3Smrg 1007ec681f3Smrg return true; 1017ec681f3Smrg} 1027ec681f3Smrg 1037ec681f3Smrgstatic bool 1047ec681f3Smrglarge_ubo(lower_state *state, nir_src src) 1057ec681f3Smrg{ 1067ec681f3Smrg if (!nir_src_is_const(src)) 1077ec681f3Smrg return state->has_large_ubo; 1087ec681f3Smrg unsigned idx = nir_src_as_uint(src); 1097ec681f3Smrg assert(idx < state->shader->info.num_ubos); 1107ec681f3Smrg return state->large_ubos[idx]; 1117ec681f3Smrg} 1127ec681f3Smrg 1137ec681f3Smrgstatic bool 1147ec681f3Smrglarge_ssbo(lower_state *state, nir_src src) 1157ec681f3Smrg{ 1167ec681f3Smrg if (!nir_src_is_const(src)) 1177ec681f3Smrg return state->has_large_ssbo; 1187ec681f3Smrg unsigned idx = nir_src_as_uint(src); 1197ec681f3Smrg assert(idx < state->shader->info.num_ssbos); 1207ec681f3Smrg return state->large_ssbos[idx]; 1217ec681f3Smrg} 1227ec681f3Smrg 1237ec681f3Smrgstatic void 1247ec681f3Smrglower_intrinsic(lower_state *state, nir_intrinsic_instr *intr) 1257ec681f3Smrg{ 1267ec681f3Smrg switch (intr->intrinsic) { 1277ec681f3Smrg case nir_intrinsic_load_ubo: 1287ec681f3Smrg //# src[] = { buffer_index, offset }. 1297ec681f3Smrg if (large_ubo(state, intr->src[0])) 1307ec681f3Smrg lower_large_src(&intr->src[1], state); 1317ec681f3Smrg return; 1327ec681f3Smrg 1337ec681f3Smrg case nir_intrinsic_load_ssbo: 1347ec681f3Smrg //# src[] = { buffer_index, offset }. 1357ec681f3Smrg if (large_ssbo(state, intr->src[0])) 1367ec681f3Smrg lower_large_src(&intr->src[1], state); 1377ec681f3Smrg return; 1387ec681f3Smrg 1397ec681f3Smrg case nir_intrinsic_store_ssbo: 1407ec681f3Smrg //# src[] = { value, block_index, offset } 1417ec681f3Smrg if (large_ssbo(state, intr->src[1])) 1427ec681f3Smrg lower_large_src(&intr->src[2], state); 1437ec681f3Smrg return; 1447ec681f3Smrg 1457ec681f3Smrg case nir_intrinsic_ssbo_atomic_add: 1467ec681f3Smrg case nir_intrinsic_ssbo_atomic_imin: 1477ec681f3Smrg case nir_intrinsic_ssbo_atomic_umin: 1487ec681f3Smrg case nir_intrinsic_ssbo_atomic_imax: 1497ec681f3Smrg case nir_intrinsic_ssbo_atomic_umax: 1507ec681f3Smrg case nir_intrinsic_ssbo_atomic_and: 1517ec681f3Smrg case nir_intrinsic_ssbo_atomic_or: 1527ec681f3Smrg case nir_intrinsic_ssbo_atomic_xor: 1537ec681f3Smrg case nir_intrinsic_ssbo_atomic_exchange: 1547ec681f3Smrg case nir_intrinsic_ssbo_atomic_comp_swap: 1557ec681f3Smrg case nir_intrinsic_ssbo_atomic_fadd: 1567ec681f3Smrg case nir_intrinsic_ssbo_atomic_fmin: 1577ec681f3Smrg case nir_intrinsic_ssbo_atomic_fmax: 1587ec681f3Smrg case nir_intrinsic_ssbo_atomic_fcomp_swap: 1597ec681f3Smrg /* 0: SSBO index 1607ec681f3Smrg * 1: offset 1617ec681f3Smrg */ 1627ec681f3Smrg if (large_ssbo(state, intr->src[0])) 1637ec681f3Smrg lower_large_src(&intr->src[1], state); 1647ec681f3Smrg return; 1657ec681f3Smrg 1667ec681f3Smrg case nir_intrinsic_global_atomic_add: 1677ec681f3Smrg case nir_intrinsic_global_atomic_imin: 1687ec681f3Smrg case nir_intrinsic_global_atomic_umin: 1697ec681f3Smrg case nir_intrinsic_global_atomic_imax: 1707ec681f3Smrg case nir_intrinsic_global_atomic_umax: 1717ec681f3Smrg case nir_intrinsic_global_atomic_and: 1727ec681f3Smrg case nir_intrinsic_global_atomic_or: 1737ec681f3Smrg case nir_intrinsic_global_atomic_xor: 1747ec681f3Smrg case nir_intrinsic_global_atomic_exchange: 1757ec681f3Smrg case nir_intrinsic_global_atomic_comp_swap: 1767ec681f3Smrg case nir_intrinsic_global_atomic_fadd: 1777ec681f3Smrg case nir_intrinsic_global_atomic_fmin: 1787ec681f3Smrg case nir_intrinsic_global_atomic_fmax: 1797ec681f3Smrg case nir_intrinsic_global_atomic_fcomp_swap: 1807ec681f3Smrg case nir_intrinsic_load_global_constant: 1817ec681f3Smrg case nir_intrinsic_load_global: 1827ec681f3Smrg /* just assume we that 24b is not sufficient: */ 1837ec681f3Smrg lower_large_src(&intr->src[0], state); 1847ec681f3Smrg return; 1857ec681f3Smrg 1867ec681f3Smrg case nir_intrinsic_store_global: 1877ec681f3Smrg /* just assume we that 24b is not sufficient: */ 1887ec681f3Smrg lower_large_src(&intr->src[1], state); 1897ec681f3Smrg return; 1907ec681f3Smrg 1917ec681f3Smrg /* These should all be small enough to unconditionally use imul24: */ 1927ec681f3Smrg case nir_intrinsic_shared_atomic_add: 1937ec681f3Smrg case nir_intrinsic_shared_atomic_imin: 1947ec681f3Smrg case nir_intrinsic_shared_atomic_umin: 1957ec681f3Smrg case nir_intrinsic_shared_atomic_imax: 1967ec681f3Smrg case nir_intrinsic_shared_atomic_umax: 1977ec681f3Smrg case nir_intrinsic_shared_atomic_and: 1987ec681f3Smrg case nir_intrinsic_shared_atomic_or: 1997ec681f3Smrg case nir_intrinsic_shared_atomic_xor: 2007ec681f3Smrg case nir_intrinsic_shared_atomic_exchange: 2017ec681f3Smrg case nir_intrinsic_shared_atomic_comp_swap: 2027ec681f3Smrg case nir_intrinsic_shared_atomic_fadd: 2037ec681f3Smrg case nir_intrinsic_shared_atomic_fmin: 2047ec681f3Smrg case nir_intrinsic_shared_atomic_fmax: 2057ec681f3Smrg case nir_intrinsic_shared_atomic_fcomp_swap: 2067ec681f3Smrg case nir_intrinsic_load_uniform: 2077ec681f3Smrg case nir_intrinsic_load_input: 2087ec681f3Smrg case nir_intrinsic_load_output: 2097ec681f3Smrg case nir_intrinsic_store_output: 2107ec681f3Smrg default: 2117ec681f3Smrg return; 2127ec681f3Smrg } 2137ec681f3Smrg} 2147ec681f3Smrg 2157ec681f3Smrgstatic void 2167ec681f3Smrglower_instr(lower_state *state, nir_instr *instr) 2177ec681f3Smrg{ 2187ec681f3Smrg if (instr->type == nir_instr_type_intrinsic) { 2197ec681f3Smrg lower_intrinsic(state, nir_instr_as_intrinsic(instr)); 2207ec681f3Smrg } 2217ec681f3Smrg} 2227ec681f3Smrg 2237ec681f3Smrgstatic bool 2247ec681f3Smrgis_large(lower_state *state, nir_variable *var) 2257ec681f3Smrg{ 2267ec681f3Smrg const struct glsl_type *type = glsl_without_array(var->type); 2277ec681f3Smrg unsigned size = state->type_size(type, false); 2287ec681f3Smrg 2297ec681f3Smrg /* if size is not known (ie. VLA) then assume the worst: */ 2307ec681f3Smrg if (!size) 2317ec681f3Smrg return true; 2327ec681f3Smrg 2337ec681f3Smrg return size >= (1 << 23); 2347ec681f3Smrg} 2357ec681f3Smrg 2367ec681f3Smrgbool 2377ec681f3Smrgnir_lower_amul(nir_shader *shader, 2387ec681f3Smrg int (*type_size)(const struct glsl_type *, bool)) 2397ec681f3Smrg{ 2407ec681f3Smrg assert(shader->options->has_imul24); 2417ec681f3Smrg assert(type_size); 2427ec681f3Smrg 2437ec681f3Smrg NIR_VLA_FILL(bool, large_ubos, shader->info.num_ubos, 0); 2447ec681f3Smrg NIR_VLA_FILL(bool, large_ssbos, shader->info.num_ssbos, 0); 2457ec681f3Smrg 2467ec681f3Smrg lower_state state = { 2477ec681f3Smrg .shader = shader, 2487ec681f3Smrg .type_size = type_size, 2497ec681f3Smrg .large_ubos = large_ubos, 2507ec681f3Smrg .large_ssbos = large_ssbos, 2517ec681f3Smrg }; 2527ec681f3Smrg 2537ec681f3Smrg /* Figure out which UBOs or SSBOs are large enough to be 2547ec681f3Smrg * disqualified from imul24: 2557ec681f3Smrg */ 2567ec681f3Smrg nir_foreach_variable_in_shader (var, shader) { 2577ec681f3Smrg if (var->data.mode == nir_var_mem_ubo) { 2587ec681f3Smrg if (is_large(&state, var)) { 2597ec681f3Smrg state.has_large_ubo = true; 2607ec681f3Smrg unsigned size = MAX2(1, glsl_array_size(var->type)); 2617ec681f3Smrg for (unsigned i = 0; i < size; i++) 2627ec681f3Smrg state.large_ubos[var->data.binding + i] = true; 2637ec681f3Smrg } 2647ec681f3Smrg } else if (var->data.mode == nir_var_mem_ssbo) { 2657ec681f3Smrg if (is_large(&state, var)) { 2667ec681f3Smrg state.has_large_ssbo = true; 2677ec681f3Smrg unsigned size = MAX2(1, glsl_array_size(var->type)); 2687ec681f3Smrg for (unsigned i = 0; i < size; i++) 2697ec681f3Smrg state.large_ssbos[var->data.binding + i] = true; 2707ec681f3Smrg } 2717ec681f3Smrg } 2727ec681f3Smrg } 2737ec681f3Smrg 2747ec681f3Smrg /* clear pass flags: */ 2757ec681f3Smrg nir_foreach_function(function, shader) { 2767ec681f3Smrg nir_function_impl *impl = function->impl; 2777ec681f3Smrg if (!impl) 2787ec681f3Smrg continue; 2797ec681f3Smrg 2807ec681f3Smrg nir_foreach_block(block, impl) { 2817ec681f3Smrg nir_foreach_instr(instr, block) { 2827ec681f3Smrg instr->pass_flags = 0; 2837ec681f3Smrg } 2847ec681f3Smrg } 2857ec681f3Smrg } 2867ec681f3Smrg 2877ec681f3Smrg nir_foreach_function(function, shader) { 2887ec681f3Smrg nir_function_impl *impl = function->impl; 2897ec681f3Smrg 2907ec681f3Smrg if (!impl) 2917ec681f3Smrg continue; 2927ec681f3Smrg 2937ec681f3Smrg nir_foreach_block(block, impl) { 2947ec681f3Smrg nir_foreach_instr(instr, block) { 2957ec681f3Smrg lower_instr(&state, instr); 2967ec681f3Smrg } 2977ec681f3Smrg } 2987ec681f3Smrg } 2997ec681f3Smrg 3007ec681f3Smrg /* At this point, all 'amul's used in calculating an offset into 3017ec681f3Smrg * a large variable have been replaced with 'imul'. So remaining 3027ec681f3Smrg * 'amul's can be replaced with 'imul24': 3037ec681f3Smrg */ 3047ec681f3Smrg nir_foreach_function(function, shader) { 3057ec681f3Smrg nir_function_impl *impl = function->impl; 3067ec681f3Smrg 3077ec681f3Smrg if (!impl) 3087ec681f3Smrg continue; 3097ec681f3Smrg 3107ec681f3Smrg nir_foreach_block(block, impl) { 3117ec681f3Smrg nir_foreach_instr(instr, block) { 3127ec681f3Smrg if (instr->type != nir_instr_type_alu) 3137ec681f3Smrg continue; 3147ec681f3Smrg 3157ec681f3Smrg nir_alu_instr *alu = nir_instr_as_alu(instr); 3167ec681f3Smrg if (alu->op != nir_op_amul) 3177ec681f3Smrg continue; 3187ec681f3Smrg 3197ec681f3Smrg alu->op = nir_op_imul24; 3207ec681f3Smrg state.progress |= true; 3217ec681f3Smrg } 3227ec681f3Smrg } 3237ec681f3Smrg 3247ec681f3Smrg nir_metadata_preserve(impl, nir_metadata_block_index | 3257ec681f3Smrg nir_metadata_dominance); 3267ec681f3Smrg 3277ec681f3Smrg } 3287ec681f3Smrg 3297ec681f3Smrg return state.progress; 3307ec681f3Smrg} 331