17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2019 Google, Inc.
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
217ec681f3Smrg * SOFTWARE.
227ec681f3Smrg */
237ec681f3Smrg
247ec681f3Smrg#include "nir.h"
257ec681f3Smrg#include "nir_vla.h"
267ec681f3Smrg
/* Lowering for amul instructions, for drivers that support imul24.
 * This pass will analyze indirect derefs, and convert corresponding
 * amul instructions to either imul or imul24, depending on the
 * required range.
 *
 * 1) Analyze the uniform variables and build a table of UBOs and SSBOs
 *    that are either too large, or might be too large (unknown size)
 *    for imul24
 *
 * 2) Loop thru looking at all the intrinsics, finding dereferences of
 *    large variables, and recursively replacing all amul instructions
 *    used with imul
 *
 * 3) Finally loop again thru all instructions replacing any remaining
 *    amul with imul24.  At this point any remaining amul instructions
 *    are not involved in calculating an offset into a large variable,
 *    thanks to the 2nd step, so they can be safely replaced with imul24.
 *
 * Using two passes over all the instructions lets us handle the case
 * where, due to CSE, an amul is used to calculate an offset into both
 * a large and small variable.
 */
497ec681f3Smrg
507ec681f3Smrgtypedef struct {
517ec681f3Smrg   nir_shader *shader;
527ec681f3Smrg
537ec681f3Smrg   int (*type_size)(const struct glsl_type *, bool);
547ec681f3Smrg
557ec681f3Smrg   /* Tables of UBOs and SSBOs mapping driver_location/base whether
567ec681f3Smrg    * they are too large to use imul24:
577ec681f3Smrg    */
587ec681f3Smrg   bool *large_ubos;
597ec681f3Smrg   bool *large_ssbos;
607ec681f3Smrg
617ec681f3Smrg   /* for cases that we cannot determine UBO/SSBO index, track if *any*
627ec681f3Smrg    * UBO/SSBO is too large for imul24:
637ec681f3Smrg    */
647ec681f3Smrg   bool has_large_ubo;
657ec681f3Smrg   bool has_large_ssbo;
667ec681f3Smrg
677ec681f3Smrg   unsigned max_slot;
687ec681f3Smrg
697ec681f3Smrg   bool progress;
707ec681f3Smrg} lower_state;
717ec681f3Smrg
727ec681f3Smrg/* Lower 'amul's in offset src of large variables to 'imul': */
737ec681f3Smrgstatic bool
747ec681f3Smrglower_large_src(nir_src *src, void *s)
757ec681f3Smrg{
767ec681f3Smrg   lower_state *state = s;
777ec681f3Smrg
787ec681f3Smrg   assert(src->is_ssa);
797ec681f3Smrg
807ec681f3Smrg   nir_instr *parent = src->ssa->parent_instr;
817ec681f3Smrg
827ec681f3Smrg   /* No need to visit instructions we've already visited.. this also
837ec681f3Smrg    * avoids infinite recursion when phi's are involved:
847ec681f3Smrg    */
857ec681f3Smrg   if (parent->pass_flags)
867ec681f3Smrg      return false;
877ec681f3Smrg
887ec681f3Smrg   nir_foreach_src(parent, lower_large_src, state);
897ec681f3Smrg
907ec681f3Smrg   if (parent->type == nir_instr_type_alu) {
917ec681f3Smrg      nir_alu_instr *alu = nir_instr_as_alu(parent);
927ec681f3Smrg      if (alu->op == nir_op_amul) {
937ec681f3Smrg         alu->op = nir_op_imul;
947ec681f3Smrg         state->progress = true;
957ec681f3Smrg      }
967ec681f3Smrg   }
977ec681f3Smrg
987ec681f3Smrg   parent->pass_flags = 1;
997ec681f3Smrg
1007ec681f3Smrg   return true;
1017ec681f3Smrg}
1027ec681f3Smrg
1037ec681f3Smrgstatic bool
1047ec681f3Smrglarge_ubo(lower_state *state, nir_src src)
1057ec681f3Smrg{
1067ec681f3Smrg   if (!nir_src_is_const(src))
1077ec681f3Smrg      return state->has_large_ubo;
1087ec681f3Smrg   unsigned idx = nir_src_as_uint(src);
1097ec681f3Smrg   assert(idx < state->shader->info.num_ubos);
1107ec681f3Smrg   return state->large_ubos[idx];
1117ec681f3Smrg}
1127ec681f3Smrg
1137ec681f3Smrgstatic bool
1147ec681f3Smrglarge_ssbo(lower_state *state, nir_src src)
1157ec681f3Smrg{
1167ec681f3Smrg   if (!nir_src_is_const(src))
1177ec681f3Smrg      return state->has_large_ssbo;
1187ec681f3Smrg   unsigned idx = nir_src_as_uint(src);
1197ec681f3Smrg   assert(idx < state->shader->info.num_ssbos);
1207ec681f3Smrg   return state->large_ssbos[idx];
1217ec681f3Smrg}
1227ec681f3Smrg
1237ec681f3Smrgstatic void
1247ec681f3Smrglower_intrinsic(lower_state *state, nir_intrinsic_instr *intr)
1257ec681f3Smrg{
1267ec681f3Smrg   switch (intr->intrinsic) {
1277ec681f3Smrg   case nir_intrinsic_load_ubo:
1287ec681f3Smrg      //# src[] = { buffer_index, offset }.
1297ec681f3Smrg      if (large_ubo(state, intr->src[0]))
1307ec681f3Smrg         lower_large_src(&intr->src[1], state);
1317ec681f3Smrg      return;
1327ec681f3Smrg
1337ec681f3Smrg   case nir_intrinsic_load_ssbo:
1347ec681f3Smrg      //# src[] = { buffer_index, offset }.
1357ec681f3Smrg      if (large_ssbo(state, intr->src[0]))
1367ec681f3Smrg         lower_large_src(&intr->src[1], state);
1377ec681f3Smrg      return;
1387ec681f3Smrg
1397ec681f3Smrg   case nir_intrinsic_store_ssbo:
1407ec681f3Smrg      //# src[] = { value, block_index, offset }
1417ec681f3Smrg      if (large_ssbo(state, intr->src[1]))
1427ec681f3Smrg         lower_large_src(&intr->src[2], state);
1437ec681f3Smrg      return;
1447ec681f3Smrg
1457ec681f3Smrg   case nir_intrinsic_ssbo_atomic_add:
1467ec681f3Smrg   case nir_intrinsic_ssbo_atomic_imin:
1477ec681f3Smrg   case nir_intrinsic_ssbo_atomic_umin:
1487ec681f3Smrg   case nir_intrinsic_ssbo_atomic_imax:
1497ec681f3Smrg   case nir_intrinsic_ssbo_atomic_umax:
1507ec681f3Smrg   case nir_intrinsic_ssbo_atomic_and:
1517ec681f3Smrg   case nir_intrinsic_ssbo_atomic_or:
1527ec681f3Smrg   case nir_intrinsic_ssbo_atomic_xor:
1537ec681f3Smrg   case nir_intrinsic_ssbo_atomic_exchange:
1547ec681f3Smrg   case nir_intrinsic_ssbo_atomic_comp_swap:
1557ec681f3Smrg   case nir_intrinsic_ssbo_atomic_fadd:
1567ec681f3Smrg   case nir_intrinsic_ssbo_atomic_fmin:
1577ec681f3Smrg   case nir_intrinsic_ssbo_atomic_fmax:
1587ec681f3Smrg   case nir_intrinsic_ssbo_atomic_fcomp_swap:
1597ec681f3Smrg      /* 0: SSBO index
1607ec681f3Smrg       * 1: offset
1617ec681f3Smrg       */
1627ec681f3Smrg      if (large_ssbo(state, intr->src[0]))
1637ec681f3Smrg         lower_large_src(&intr->src[1], state);
1647ec681f3Smrg      return;
1657ec681f3Smrg
1667ec681f3Smrg   case nir_intrinsic_global_atomic_add:
1677ec681f3Smrg   case nir_intrinsic_global_atomic_imin:
1687ec681f3Smrg   case nir_intrinsic_global_atomic_umin:
1697ec681f3Smrg   case nir_intrinsic_global_atomic_imax:
1707ec681f3Smrg   case nir_intrinsic_global_atomic_umax:
1717ec681f3Smrg   case nir_intrinsic_global_atomic_and:
1727ec681f3Smrg   case nir_intrinsic_global_atomic_or:
1737ec681f3Smrg   case nir_intrinsic_global_atomic_xor:
1747ec681f3Smrg   case nir_intrinsic_global_atomic_exchange:
1757ec681f3Smrg   case nir_intrinsic_global_atomic_comp_swap:
1767ec681f3Smrg   case nir_intrinsic_global_atomic_fadd:
1777ec681f3Smrg   case nir_intrinsic_global_atomic_fmin:
1787ec681f3Smrg   case nir_intrinsic_global_atomic_fmax:
1797ec681f3Smrg   case nir_intrinsic_global_atomic_fcomp_swap:
1807ec681f3Smrg   case nir_intrinsic_load_global_constant:
1817ec681f3Smrg   case nir_intrinsic_load_global:
1827ec681f3Smrg      /* just assume we that 24b is not sufficient: */
1837ec681f3Smrg      lower_large_src(&intr->src[0], state);
1847ec681f3Smrg      return;
1857ec681f3Smrg
1867ec681f3Smrg   case nir_intrinsic_store_global:
1877ec681f3Smrg      /* just assume we that 24b is not sufficient: */
1887ec681f3Smrg      lower_large_src(&intr->src[1], state);
1897ec681f3Smrg      return;
1907ec681f3Smrg
1917ec681f3Smrg   /* These should all be small enough to unconditionally use imul24: */
1927ec681f3Smrg   case nir_intrinsic_shared_atomic_add:
1937ec681f3Smrg   case nir_intrinsic_shared_atomic_imin:
1947ec681f3Smrg   case nir_intrinsic_shared_atomic_umin:
1957ec681f3Smrg   case nir_intrinsic_shared_atomic_imax:
1967ec681f3Smrg   case nir_intrinsic_shared_atomic_umax:
1977ec681f3Smrg   case nir_intrinsic_shared_atomic_and:
1987ec681f3Smrg   case nir_intrinsic_shared_atomic_or:
1997ec681f3Smrg   case nir_intrinsic_shared_atomic_xor:
2007ec681f3Smrg   case nir_intrinsic_shared_atomic_exchange:
2017ec681f3Smrg   case nir_intrinsic_shared_atomic_comp_swap:
2027ec681f3Smrg   case nir_intrinsic_shared_atomic_fadd:
2037ec681f3Smrg   case nir_intrinsic_shared_atomic_fmin:
2047ec681f3Smrg   case nir_intrinsic_shared_atomic_fmax:
2057ec681f3Smrg   case nir_intrinsic_shared_atomic_fcomp_swap:
2067ec681f3Smrg   case nir_intrinsic_load_uniform:
2077ec681f3Smrg   case nir_intrinsic_load_input:
2087ec681f3Smrg   case nir_intrinsic_load_output:
2097ec681f3Smrg   case nir_intrinsic_store_output:
2107ec681f3Smrg   default:
2117ec681f3Smrg      return;
2127ec681f3Smrg   }
2137ec681f3Smrg}
2147ec681f3Smrg
2157ec681f3Smrgstatic void
2167ec681f3Smrglower_instr(lower_state *state, nir_instr *instr)
2177ec681f3Smrg{
2187ec681f3Smrg   if (instr->type == nir_instr_type_intrinsic) {
2197ec681f3Smrg      lower_intrinsic(state, nir_instr_as_intrinsic(instr));
2207ec681f3Smrg   }
2217ec681f3Smrg}
2227ec681f3Smrg
2237ec681f3Smrgstatic bool
2247ec681f3Smrgis_large(lower_state *state, nir_variable *var)
2257ec681f3Smrg{
2267ec681f3Smrg   const struct glsl_type *type = glsl_without_array(var->type);
2277ec681f3Smrg   unsigned size = state->type_size(type, false);
2287ec681f3Smrg
2297ec681f3Smrg   /* if size is not known (ie. VLA) then assume the worst: */
2307ec681f3Smrg   if (!size)
2317ec681f3Smrg      return true;
2327ec681f3Smrg
2337ec681f3Smrg   return size >= (1 << 23);
2347ec681f3Smrg}
2357ec681f3Smrg
2367ec681f3Smrgbool
2377ec681f3Smrgnir_lower_amul(nir_shader *shader,
2387ec681f3Smrg               int (*type_size)(const struct glsl_type *, bool))
2397ec681f3Smrg{
2407ec681f3Smrg   assert(shader->options->has_imul24);
2417ec681f3Smrg   assert(type_size);
2427ec681f3Smrg
2437ec681f3Smrg   NIR_VLA_FILL(bool, large_ubos, shader->info.num_ubos, 0);
2447ec681f3Smrg   NIR_VLA_FILL(bool, large_ssbos, shader->info.num_ssbos, 0);
2457ec681f3Smrg
2467ec681f3Smrg   lower_state state = {
2477ec681f3Smrg      .shader = shader,
2487ec681f3Smrg      .type_size = type_size,
2497ec681f3Smrg      .large_ubos = large_ubos,
2507ec681f3Smrg      .large_ssbos = large_ssbos,
2517ec681f3Smrg   };
2527ec681f3Smrg
2537ec681f3Smrg   /* Figure out which UBOs or SSBOs are large enough to be
2547ec681f3Smrg    * disqualified from imul24:
2557ec681f3Smrg    */
2567ec681f3Smrg   nir_foreach_variable_in_shader (var, shader) {
2577ec681f3Smrg      if (var->data.mode == nir_var_mem_ubo) {
2587ec681f3Smrg         if (is_large(&state, var)) {
2597ec681f3Smrg            state.has_large_ubo = true;
2607ec681f3Smrg            unsigned size = MAX2(1, glsl_array_size(var->type));
2617ec681f3Smrg            for (unsigned i = 0; i < size; i++)
2627ec681f3Smrg               state.large_ubos[var->data.binding + i] = true;
2637ec681f3Smrg         }
2647ec681f3Smrg      } else if (var->data.mode == nir_var_mem_ssbo) {
2657ec681f3Smrg         if (is_large(&state, var)) {
2667ec681f3Smrg            state.has_large_ssbo = true;
2677ec681f3Smrg            unsigned size = MAX2(1, glsl_array_size(var->type));
2687ec681f3Smrg            for (unsigned i = 0; i < size; i++)
2697ec681f3Smrg               state.large_ssbos[var->data.binding + i] = true;
2707ec681f3Smrg         }
2717ec681f3Smrg      }
2727ec681f3Smrg   }
2737ec681f3Smrg
2747ec681f3Smrg   /* clear pass flags: */
2757ec681f3Smrg   nir_foreach_function(function, shader) {
2767ec681f3Smrg      nir_function_impl *impl = function->impl;
2777ec681f3Smrg      if (!impl)
2787ec681f3Smrg         continue;
2797ec681f3Smrg
2807ec681f3Smrg      nir_foreach_block(block, impl) {
2817ec681f3Smrg         nir_foreach_instr(instr, block) {
2827ec681f3Smrg            instr->pass_flags = 0;
2837ec681f3Smrg         }
2847ec681f3Smrg      }
2857ec681f3Smrg   }
2867ec681f3Smrg
2877ec681f3Smrg   nir_foreach_function(function, shader) {
2887ec681f3Smrg      nir_function_impl *impl = function->impl;
2897ec681f3Smrg
2907ec681f3Smrg      if (!impl)
2917ec681f3Smrg         continue;
2927ec681f3Smrg
2937ec681f3Smrg      nir_foreach_block(block, impl) {
2947ec681f3Smrg         nir_foreach_instr(instr, block) {
2957ec681f3Smrg            lower_instr(&state, instr);
2967ec681f3Smrg         }
2977ec681f3Smrg      }
2987ec681f3Smrg   }
2997ec681f3Smrg
3007ec681f3Smrg   /* At this point, all 'amul's used in calculating an offset into
3017ec681f3Smrg    * a large variable have been replaced with 'imul'.  So remaining
3027ec681f3Smrg    * 'amul's can be replaced with 'imul24':
3037ec681f3Smrg    */
3047ec681f3Smrg   nir_foreach_function(function, shader) {
3057ec681f3Smrg      nir_function_impl *impl = function->impl;
3067ec681f3Smrg
3077ec681f3Smrg      if (!impl)
3087ec681f3Smrg         continue;
3097ec681f3Smrg
3107ec681f3Smrg      nir_foreach_block(block, impl) {
3117ec681f3Smrg         nir_foreach_instr(instr, block) {
3127ec681f3Smrg            if (instr->type != nir_instr_type_alu)
3137ec681f3Smrg               continue;
3147ec681f3Smrg
3157ec681f3Smrg            nir_alu_instr *alu = nir_instr_as_alu(instr);
3167ec681f3Smrg            if (alu->op != nir_op_amul)
3177ec681f3Smrg               continue;
3187ec681f3Smrg
3197ec681f3Smrg            alu->op = nir_op_imul24;
3207ec681f3Smrg            state.progress |= true;
3217ec681f3Smrg         }
3227ec681f3Smrg      }
3237ec681f3Smrg
3247ec681f3Smrg      nir_metadata_preserve(impl, nir_metadata_block_index |
3257ec681f3Smrg                                  nir_metadata_dominance);
3267ec681f3Smrg
3277ec681f3Smrg   }
3287ec681f3Smrg
3297ec681f3Smrg   return state.progress;
3307ec681f3Smrg}
331