17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2019 Broadcom
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
217ec681f3Smrg * IN THE SOFTWARE.
227ec681f3Smrg */
237ec681f3Smrg
247ec681f3Smrg#include "nir_schedule.h"
257ec681f3Smrg#include "util/dag.h"
267ec681f3Smrg#include "util/u_dynarray.h"
277ec681f3Smrg
287ec681f3Smrg/** @file
297ec681f3Smrg *
307ec681f3Smrg * Implements basic-block-level prepass instruction scheduling in NIR to
317ec681f3Smrg * manage register pressure.
327ec681f3Smrg *
337ec681f3Smrg * This is based on the Goodman/Hsu paper (1988, cached copy at
347ec681f3Smrg * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf).  We
357ec681f3Smrg * make up the DDG for NIR (which can be mostly done using the NIR def/use
367ec681f3Smrg * chains for SSA instructions, plus some edges for ordering register writes
377ec681f3Smrg * vs reads, and some more for ordering intrinsics).  Then we pick heads off
387ec681f3Smrg * of the DDG using their heuristic to emit the NIR instructions back into the
397ec681f3Smrg * block in their new order.
407ec681f3Smrg *
417ec681f3Smrg * The hard case for prepass scheduling on GPUs seems to always be consuming
427ec681f3Smrg * texture/ubo results.  The register pressure heuristic doesn't want to pick
437ec681f3Smrg * an instr that starts consuming texture results because it usually won't be
447ec681f3Smrg * the only usage, so that instruction will increase pressure.
457ec681f3Smrg *
467ec681f3Smrg * If you try to force consumption of tex results always, then in a case where
477ec681f3Smrg * single sample is used for many outputs, you'll end up picking every other
487ec681f3Smrg * user and expanding register pressure.  The partially_evaluated_path flag
497ec681f3Smrg * helps tremendously, in that if you happen for whatever reason to pick a
507ec681f3Smrg * texture sample's output, then you'll try to finish off that sample.  Future
517ec681f3Smrg * work may include doing some local search before locking in a choice, to try
527ec681f3Smrg * to more reliably find the case where just a few choices going against the
537ec681f3Smrg * heuristic can manage to free the whole vector.
547ec681f3Smrg */
557ec681f3Smrg
/* When true, scheduling decisions are dumped to stderr by the
 * choose_instruction functions below.
 */
static bool debug;
577ec681f3Smrg
/**
 * Represents a node in the DDG (data dependency graph) for a NIR
 * instruction.  One is created per instruction being scheduled.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   /* The NIR instruction this scheduler node wraps. */
   nir_instr *instr;
   /* Flag used by the CSR heuristic: once a result on this path has been
    * partially consumed, prefer finishing off the path (see the file
    * comment above for the motivation).
    */
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;
847ec681f3Smrg
/* Global state for scheduling one shader: the dependency DAG, liveness and
 * pressure tracking, and the backend-supplied options.
 */
typedef struct {
   /* Dependency DAG whose heads are the candidates for scheduling. */
   struct dag *dag;

   /* The shader being scheduled. */
   nir_shader *shader;

   /* Mapping from nir_register * or nir_ssa_def * to a struct set of
    * instructions remaining to be scheduled using the register.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_register * or nir_ssa_def * that have had any instruction
    * scheduled on them.
    */
   struct set *live_values;

   /* An abstract approximation of the number of nir_scheduler_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;
1167ec681f3Smrg
/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

/* Singly-linked list entry tracking the last scheduler node seen for one
 * backend-defined dependency class (see the intrinsic_cb option).
 */
struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};
1277ec681f3Smrg
/* Per-pass state used while building the dependency graph for one block,
 * in either the forward (F) or reverse (R) direction.
 */
typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from nir_register to nir_schedule_node * (the last writer seen in
    * the current walk direction).
    */
   struct hash_table *reg_map;

   /* Scheduler nodes for last instruction involved in some class of dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   /* Lazily-created list of backend-defined dependency classes seen so far. */
   struct nir_schedule_class_dep *class_deps;

   /* Which direction the current walk is going (swaps edge direction in
    * add_dep()).
    */
   enum direction dir;
} nir_deps_state;
1467ec681f3Smrg
1477ec681f3Smrgstatic void *
1487ec681f3Smrg_mesa_hash_table_search_data(struct hash_table *ht, void *key)
1497ec681f3Smrg{
1507ec681f3Smrg   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
1517ec681f3Smrg   if (!entry)
1527ec681f3Smrg      return NULL;
1537ec681f3Smrg   return entry->data;
1547ec681f3Smrg}
1557ec681f3Smrg
1567ec681f3Smrgstatic nir_schedule_node *
1577ec681f3Smrgnir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
1587ec681f3Smrg{
1597ec681f3Smrg   return _mesa_hash_table_search_data(instr_map, instr);
1607ec681f3Smrg}
1617ec681f3Smrg
1627ec681f3Smrgstatic struct set *
1637ec681f3Smrgnir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
1647ec681f3Smrg{
1657ec681f3Smrg   if (src->is_ssa) {
1667ec681f3Smrg      return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
1677ec681f3Smrg   } else {
1687ec681f3Smrg      return _mesa_hash_table_search_data(scoreboard->remaining_uses,
1697ec681f3Smrg                                          src->reg.reg);
1707ec681f3Smrg   }
1717ec681f3Smrg}
1727ec681f3Smrg
1737ec681f3Smrgstatic int
1747ec681f3Smrgnir_schedule_def_pressure(nir_ssa_def *def)
1757ec681f3Smrg{
1767ec681f3Smrg   return def->num_components;
1777ec681f3Smrg}
1787ec681f3Smrg
1797ec681f3Smrgstatic int
1807ec681f3Smrgnir_schedule_src_pressure(nir_src *src)
1817ec681f3Smrg{
1827ec681f3Smrg   if (src->is_ssa)
1837ec681f3Smrg      return nir_schedule_def_pressure(src->ssa);
1847ec681f3Smrg   else
1857ec681f3Smrg      return src->reg.reg->num_components;
1867ec681f3Smrg}
1877ec681f3Smrg
1887ec681f3Smrgstatic int
1897ec681f3Smrgnir_schedule_dest_pressure(nir_dest *dest)
1907ec681f3Smrg{
1917ec681f3Smrg   if (dest->is_ssa)
1927ec681f3Smrg      return nir_schedule_def_pressure(&dest->ssa);
1937ec681f3Smrg   else
1947ec681f3Smrg      return dest->reg.reg->num_components;
1957ec681f3Smrg}
1967ec681f3Smrg
1977ec681f3Smrg/**
1987ec681f3Smrg * Adds a dependency such that @after must appear in the final program after
1997ec681f3Smrg * @before.
2007ec681f3Smrg *
2017ec681f3Smrg * We add @before as a child of @after, so that DAG heads are the outputs of
2027ec681f3Smrg * the program and we make our scheduling decisions bottom to top.
2037ec681f3Smrg */
2047ec681f3Smrgstatic void
2057ec681f3Smrgadd_dep(nir_deps_state *state,
2067ec681f3Smrg        nir_schedule_node *before,
2077ec681f3Smrg        nir_schedule_node *after)
2087ec681f3Smrg{
2097ec681f3Smrg   if (!before || !after)
2107ec681f3Smrg      return;
2117ec681f3Smrg
2127ec681f3Smrg   assert(before != after);
2137ec681f3Smrg
2147ec681f3Smrg   if (state->dir == F)
2157ec681f3Smrg      dag_add_edge(&before->dag, &after->dag, NULL);
2167ec681f3Smrg   else
2177ec681f3Smrg      dag_add_edge(&after->dag, &before->dag, NULL);
2187ec681f3Smrg}
2197ec681f3Smrg
2207ec681f3Smrg
2217ec681f3Smrgstatic void
2227ec681f3Smrgadd_read_dep(nir_deps_state *state,
2237ec681f3Smrg             nir_schedule_node *before,
2247ec681f3Smrg             nir_schedule_node *after)
2257ec681f3Smrg{
2267ec681f3Smrg   add_dep(state, before, after);
2277ec681f3Smrg}
2287ec681f3Smrg
2297ec681f3Smrgstatic void
2307ec681f3Smrgadd_write_dep(nir_deps_state *state,
2317ec681f3Smrg              nir_schedule_node **before,
2327ec681f3Smrg              nir_schedule_node *after)
2337ec681f3Smrg{
2347ec681f3Smrg   add_dep(state, *before, after);
2357ec681f3Smrg   *before = after;
2367ec681f3Smrg}
2377ec681f3Smrg
2387ec681f3Smrgstatic bool
2397ec681f3Smrgnir_schedule_reg_src_deps(nir_src *src, void *in_state)
2407ec681f3Smrg{
2417ec681f3Smrg   nir_deps_state *state = in_state;
2427ec681f3Smrg
2437ec681f3Smrg   if (src->is_ssa)
2447ec681f3Smrg      return true;
2457ec681f3Smrg
2467ec681f3Smrg   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
2477ec681f3Smrg                                                      src->reg.reg);
2487ec681f3Smrg   if (!entry)
2497ec681f3Smrg      return true;
2507ec681f3Smrg   nir_schedule_node *dst_n = entry->data;
2517ec681f3Smrg
2527ec681f3Smrg   nir_schedule_node *src_n =
2537ec681f3Smrg      nir_schedule_get_node(state->scoreboard->instr_map,
2547ec681f3Smrg                            src->parent_instr);
2557ec681f3Smrg
2567ec681f3Smrg   add_dep(state, dst_n, src_n);
2577ec681f3Smrg
2587ec681f3Smrg   return true;
2597ec681f3Smrg}
2607ec681f3Smrg
2617ec681f3Smrgstatic bool
2627ec681f3Smrgnir_schedule_reg_dest_deps(nir_dest *dest, void *in_state)
2637ec681f3Smrg{
2647ec681f3Smrg   nir_deps_state *state = in_state;
2657ec681f3Smrg
2667ec681f3Smrg   if (dest->is_ssa)
2677ec681f3Smrg      return true;
2687ec681f3Smrg
2697ec681f3Smrg   nir_schedule_node *dest_n =
2707ec681f3Smrg      nir_schedule_get_node(state->scoreboard->instr_map,
2717ec681f3Smrg                            dest->reg.parent_instr);
2727ec681f3Smrg
2737ec681f3Smrg   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map,
2747ec681f3Smrg                                                      dest->reg.reg);
2757ec681f3Smrg   if (!entry) {
2767ec681f3Smrg      _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n);
2777ec681f3Smrg      return true;
2787ec681f3Smrg   }
2797ec681f3Smrg   nir_schedule_node **before = (nir_schedule_node **)&entry->data;
2807ec681f3Smrg
2817ec681f3Smrg   add_write_dep(state, before, dest_n);
2827ec681f3Smrg
2837ec681f3Smrg   return true;
2847ec681f3Smrg}
2857ec681f3Smrg
2867ec681f3Smrgstatic bool
2877ec681f3Smrgnir_schedule_ssa_deps(nir_ssa_def *def, void *in_state)
2887ec681f3Smrg{
2897ec681f3Smrg   nir_deps_state *state = in_state;
2907ec681f3Smrg   struct hash_table *instr_map = state->scoreboard->instr_map;
2917ec681f3Smrg   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);
2927ec681f3Smrg
2937ec681f3Smrg   nir_foreach_use(src, def) {
2947ec681f3Smrg      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
2957ec681f3Smrg                                                       src->parent_instr);
2967ec681f3Smrg
2977ec681f3Smrg      add_read_dep(state, def_n, use_n);
2987ec681f3Smrg   }
2997ec681f3Smrg
3007ec681f3Smrg   return true;
3017ec681f3Smrg}
3027ec681f3Smrg
3037ec681f3Smrgstatic struct nir_schedule_class_dep *
3047ec681f3Smrgnir_schedule_get_class_dep(nir_deps_state *state,
3057ec681f3Smrg                           int klass)
3067ec681f3Smrg{
3077ec681f3Smrg   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
3087ec681f3Smrg        class_dep != NULL;
3097ec681f3Smrg        class_dep = class_dep->next) {
3107ec681f3Smrg      if (class_dep->klass == klass)
3117ec681f3Smrg         return class_dep;
3127ec681f3Smrg   }
3137ec681f3Smrg
3147ec681f3Smrg   struct nir_schedule_class_dep *class_dep =
3157ec681f3Smrg      ralloc(state->reg_map, struct nir_schedule_class_dep);
3167ec681f3Smrg
3177ec681f3Smrg   class_dep->klass = klass;
3187ec681f3Smrg   class_dep->node = NULL;
3197ec681f3Smrg   class_dep->next = state->class_deps;
3207ec681f3Smrg
3217ec681f3Smrg   state->class_deps = class_dep;
3227ec681f3Smrg
3237ec681f3Smrg   return class_dep;
3247ec681f3Smrg}
3257ec681f3Smrg
/* Adds the ordering dependencies required for one intrinsic instruction:
 * backend-defined class dependencies first (via the intrinsic_cb hook),
 * then the built-in categories (discards, input/output, shared memory,
 * barriers, and a catch-all for everything unclassified).
 */
static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   /* Let the backend classify this intrinsic into one of its own
    * dependency classes before applying the generic rules below.
    */
   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      /* Side-effect-free loads need no extra ordering. */
      break;

   case nir_intrinsic_discard:
   case nir_intrinsic_discard_if:
   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while handling
       *   nir_instr_type_tex
       *
       * * Include it on the unknown intrinsic set, as we want discard to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input:
      /* Keep input loads after any store_output that aliases them (see the
       * store_output case above).
       */
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
      /* Don't move load_shared beyond a following store_shared, as it could
       * change their value
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_store_shared:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_control_barrier:
   case nir_intrinsic_memory_barrier_shared:
      add_write_dep(state, &state->store_shared, n);

      /* Serialize against ssbos/atomics/etc. */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}
4217ec681f3Smrg
/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state);

   /* For NIR regs, track the last writer in the scheduler state so that we
    * can keep the writes in order and let reads get reordered only between
    * each write.
    */
   nir_foreach_src(instr, nir_schedule_reg_src_deps, state);

   nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_ssa_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
      /* No ordering constraints beyond their data dependencies. */
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}
4897ec681f3Smrg
4907ec681f3Smrgstatic void
4917ec681f3Smrgcalculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
4927ec681f3Smrg{
4937ec681f3Smrg   nir_deps_state state = {
4947ec681f3Smrg      .scoreboard = scoreboard,
4957ec681f3Smrg      .dir = F,
4967ec681f3Smrg      .reg_map = _mesa_pointer_hash_table_create(NULL),
4977ec681f3Smrg   };
4987ec681f3Smrg
4997ec681f3Smrg   nir_foreach_instr(instr, block) {
5007ec681f3Smrg      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
5017ec681f3Smrg                                                      instr);
5027ec681f3Smrg      nir_schedule_calculate_deps(&state, node);
5037ec681f3Smrg   }
5047ec681f3Smrg
5057ec681f3Smrg   ralloc_free(state.reg_map);
5067ec681f3Smrg}
5077ec681f3Smrg
5087ec681f3Smrgstatic void
5097ec681f3Smrgcalculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
5107ec681f3Smrg{
5117ec681f3Smrg   nir_deps_state state = {
5127ec681f3Smrg      .scoreboard = scoreboard,
5137ec681f3Smrg      .dir = R,
5147ec681f3Smrg      .reg_map = _mesa_pointer_hash_table_create(NULL),
5157ec681f3Smrg   };
5167ec681f3Smrg
5177ec681f3Smrg   nir_foreach_instr_reverse(instr, block) {
5187ec681f3Smrg      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
5197ec681f3Smrg                                                      instr);
5207ec681f3Smrg      nir_schedule_calculate_deps(&state, node);
5217ec681f3Smrg   }
5227ec681f3Smrg
5237ec681f3Smrg   ralloc_free(state.reg_map);
5247ec681f3Smrg}
5257ec681f3Smrg
/* State for the nir_schedule_regs_freed() callbacks: accumulates the net
 * number of register channels freed (positive) or consumed (negative) by
 * scheduling one instruction.
 */
typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;
5307ec681f3Smrg
5317ec681f3Smrgstatic bool
5327ec681f3Smrgnir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
5337ec681f3Smrg{
5347ec681f3Smrg   nir_schedule_regs_freed_state *state = in_state;
5357ec681f3Smrg   nir_schedule_scoreboard *scoreboard = state->scoreboard;
5367ec681f3Smrg   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);
5377ec681f3Smrg
5387ec681f3Smrg   if (remaining_uses->entries == 1 &&
5397ec681f3Smrg       _mesa_set_search(remaining_uses, src->parent_instr)) {
5407ec681f3Smrg      state->regs_freed += nir_schedule_src_pressure(src);
5417ec681f3Smrg   }
5427ec681f3Smrg
5437ec681f3Smrg   return true;
5447ec681f3Smrg}
5457ec681f3Smrg
5467ec681f3Smrgstatic bool
5477ec681f3Smrgnir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state)
5487ec681f3Smrg{
5497ec681f3Smrg   nir_schedule_regs_freed_state *state = in_state;
5507ec681f3Smrg
5517ec681f3Smrg   state->regs_freed -= nir_schedule_def_pressure(def);
5527ec681f3Smrg
5537ec681f3Smrg   return true;
5547ec681f3Smrg}
5557ec681f3Smrg
5567ec681f3Smrgstatic bool
5577ec681f3Smrgnir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state)
5587ec681f3Smrg{
5597ec681f3Smrg   nir_schedule_regs_freed_state *state = in_state;
5607ec681f3Smrg   nir_schedule_scoreboard *scoreboard = state->scoreboard;
5617ec681f3Smrg
5627ec681f3Smrg   if (dest->is_ssa)
5637ec681f3Smrg      return true;
5647ec681f3Smrg
5657ec681f3Smrg   nir_register *reg = dest->reg.reg;
5667ec681f3Smrg
5677ec681f3Smrg   /* Only the first def of a reg counts against register pressure. */
5687ec681f3Smrg   if (!_mesa_set_search(scoreboard->live_values, reg))
5697ec681f3Smrg      state->regs_freed -= nir_schedule_dest_pressure(dest);
5707ec681f3Smrg
5717ec681f3Smrg   return true;
5727ec681f3Smrg}
5737ec681f3Smrg
5747ec681f3Smrgstatic int
5757ec681f3Smrgnir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
5767ec681f3Smrg{
5777ec681f3Smrg   nir_schedule_regs_freed_state state = {
5787ec681f3Smrg      .scoreboard = scoreboard,
5797ec681f3Smrg   };
5807ec681f3Smrg
5817ec681f3Smrg   nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
5827ec681f3Smrg
5837ec681f3Smrg   nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
5847ec681f3Smrg
5857ec681f3Smrg   nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state);
5867ec681f3Smrg
5877ec681f3Smrg   return state.regs_freed;
5887ec681f3Smrg}
5897ec681f3Smrg
5907ec681f3Smrg/**
5917ec681f3Smrg * Chooses an instruction that will minimise the register pressure as much as
5927ec681f3Smrg * possible. This should only be used as a fallback when the regular scheduling
5937ec681f3Smrg * generates a shader whose register allocation fails.
5947ec681f3Smrg */
5957ec681f3Smrgstatic nir_schedule_node *
5967ec681f3Smrgnir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
5977ec681f3Smrg{
5987ec681f3Smrg   nir_schedule_node *chosen = NULL;
5997ec681f3Smrg
6007ec681f3Smrg   /* Find the leader in the ready (shouldn't-stall) set with the mininum
6017ec681f3Smrg    * cost.
6027ec681f3Smrg    */
6037ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
6047ec681f3Smrg      if (scoreboard->time < n->ready_time)
6057ec681f3Smrg         continue;
6067ec681f3Smrg
6077ec681f3Smrg      if (!chosen || chosen->max_delay > n->max_delay)
6087ec681f3Smrg         chosen = n;
6097ec681f3Smrg   }
6107ec681f3Smrg   if (chosen) {
6117ec681f3Smrg      if (debug) {
6127ec681f3Smrg         fprintf(stderr, "chose (ready fallback):          ");
6137ec681f3Smrg         nir_print_instr(chosen->instr, stderr);
6147ec681f3Smrg         fprintf(stderr, "\n");
6157ec681f3Smrg      }
6167ec681f3Smrg
6177ec681f3Smrg      return chosen;
6187ec681f3Smrg   }
6197ec681f3Smrg
6207ec681f3Smrg   /* Otherwise, choose the leader with the minimum cost. */
6217ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
6227ec681f3Smrg      if (!chosen || chosen->max_delay > n->max_delay)
6237ec681f3Smrg         chosen = n;
6247ec681f3Smrg   }
6257ec681f3Smrg   if (debug) {
6267ec681f3Smrg      fprintf(stderr, "chose (leader fallback):         ");
6277ec681f3Smrg      nir_print_instr(chosen->instr, stderr);
6287ec681f3Smrg      fprintf(stderr, "\n");
6297ec681f3Smrg   }
6307ec681f3Smrg
6317ec681f3Smrg   return chosen;
6327ec681f3Smrg}
6337ec681f3Smrg
6347ec681f3Smrg/**
6357ec681f3Smrg * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
6367ec681f3Smrg * Scheduling for Parallelism) heuristic.
6377ec681f3Smrg *
6387ec681f3Smrg * Picks an instruction on the critical that's ready to execute without
6397ec681f3Smrg * stalls, if possible, otherwise picks the instruction on the critical path.
6407ec681f3Smrg */
6417ec681f3Smrgstatic nir_schedule_node *
6427ec681f3Smrgnir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
6437ec681f3Smrg{
6447ec681f3Smrg   nir_schedule_node *chosen = NULL;
6457ec681f3Smrg
6467ec681f3Smrg   /* Find the leader in the ready (shouldn't-stall) set with the maximum
6477ec681f3Smrg    * cost.
6487ec681f3Smrg    */
6497ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
6507ec681f3Smrg      if (scoreboard->time < n->ready_time)
6517ec681f3Smrg         continue;
6527ec681f3Smrg
6537ec681f3Smrg      if (!chosen || chosen->max_delay < n->max_delay)
6547ec681f3Smrg         chosen = n;
6557ec681f3Smrg   }
6567ec681f3Smrg   if (chosen) {
6577ec681f3Smrg      if (debug) {
6587ec681f3Smrg         fprintf(stderr, "chose (ready):          ");
6597ec681f3Smrg         nir_print_instr(chosen->instr, stderr);
6607ec681f3Smrg         fprintf(stderr, "\n");
6617ec681f3Smrg      }
6627ec681f3Smrg
6637ec681f3Smrg      return chosen;
6647ec681f3Smrg   }
6657ec681f3Smrg
6667ec681f3Smrg   /* Otherwise, choose the leader with the maximum cost. */
6677ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
6687ec681f3Smrg      if (!chosen || chosen->max_delay < n->max_delay)
6697ec681f3Smrg         chosen = n;
6707ec681f3Smrg   }
6717ec681f3Smrg   if (debug) {
6727ec681f3Smrg      fprintf(stderr, "chose (leader):         ");
6737ec681f3Smrg      nir_print_instr(chosen->instr, stderr);
6747ec681f3Smrg      fprintf(stderr, "\n");
6757ec681f3Smrg   }
6767ec681f3Smrg
6777ec681f3Smrg   return chosen;
6787ec681f3Smrg}
6797ec681f3Smrg
6807ec681f3Smrg/**
6817ec681f3Smrg * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
6827ec681f3Smrg * Scheduling for Register pressure) heuristic.
6837ec681f3Smrg */
6847ec681f3Smrgstatic nir_schedule_node *
6857ec681f3Smrgnir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
6867ec681f3Smrg{
6877ec681f3Smrg   nir_schedule_node *chosen = NULL;
6887ec681f3Smrg
6897ec681f3Smrg   /* Find a ready inst with regs freed and pick the one with max cost. */
6907ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
6917ec681f3Smrg      if (n->ready_time > scoreboard->time)
6927ec681f3Smrg         continue;
6937ec681f3Smrg
6947ec681f3Smrg      int regs_freed = nir_schedule_regs_freed(scoreboard, n);
6957ec681f3Smrg
6967ec681f3Smrg      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
6977ec681f3Smrg         chosen = n;
6987ec681f3Smrg      }
6997ec681f3Smrg   }
7007ec681f3Smrg   if (chosen) {
7017ec681f3Smrg      if (debug) {
7027ec681f3Smrg         fprintf(stderr, "chose (freed+ready):    ");
7037ec681f3Smrg         nir_print_instr(chosen->instr, stderr);
7047ec681f3Smrg         fprintf(stderr, "\n");
7057ec681f3Smrg      }
7067ec681f3Smrg
7077ec681f3Smrg      return chosen;
7087ec681f3Smrg   }
7097ec681f3Smrg
7107ec681f3Smrg   /* Find a leader with regs freed and pick the one with max cost. */
7117ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
7127ec681f3Smrg      int regs_freed = nir_schedule_regs_freed(scoreboard, n);
7137ec681f3Smrg
7147ec681f3Smrg      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
7157ec681f3Smrg         chosen = n;
7167ec681f3Smrg      }
7177ec681f3Smrg   }
7187ec681f3Smrg   if (chosen) {
7197ec681f3Smrg      if (debug) {
7207ec681f3Smrg         fprintf(stderr, "chose (regs freed):     ");
7217ec681f3Smrg         nir_print_instr(chosen->instr, stderr);
7227ec681f3Smrg         fprintf(stderr, "\n");
7237ec681f3Smrg      }
7247ec681f3Smrg
7257ec681f3Smrg      return chosen;
7267ec681f3Smrg   }
7277ec681f3Smrg
7287ec681f3Smrg   /* Find a partially evaluated path and try to finish it off */
7297ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
7307ec681f3Smrg      if (n->partially_evaluated_path &&
7317ec681f3Smrg          (!chosen || chosen->max_delay < n->max_delay)) {
7327ec681f3Smrg         chosen = n;
7337ec681f3Smrg      }
7347ec681f3Smrg   }
7357ec681f3Smrg   if (chosen) {
7367ec681f3Smrg      if (debug) {
7377ec681f3Smrg         fprintf(stderr, "chose (partial path):   ");
7387ec681f3Smrg         nir_print_instr(chosen->instr, stderr);
7397ec681f3Smrg         fprintf(stderr, "\n");
7407ec681f3Smrg      }
7417ec681f3Smrg
7427ec681f3Smrg      return chosen;
7437ec681f3Smrg   }
7447ec681f3Smrg
7457ec681f3Smrg   /* Contra the paper, pick a leader with no effect on used regs.  This may
7467ec681f3Smrg    * open up new opportunities, as otherwise a single-operand instr consuming
7477ec681f3Smrg    * a value will tend to block finding freeing that value.  This had a
7487ec681f3Smrg    * massive effect on reducing spilling on V3D.
7497ec681f3Smrg    *
7507ec681f3Smrg    * XXX: Should this prioritize ready?
7517ec681f3Smrg    */
7527ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
7537ec681f3Smrg      if (nir_schedule_regs_freed(scoreboard, n) != 0)
7547ec681f3Smrg         continue;
7557ec681f3Smrg
7567ec681f3Smrg      if (!chosen || chosen->max_delay < n->max_delay)
7577ec681f3Smrg         chosen = n;
7587ec681f3Smrg   }
7597ec681f3Smrg   if (chosen) {
7607ec681f3Smrg      if (debug) {
7617ec681f3Smrg         fprintf(stderr, "chose (regs no-op):     ");
7627ec681f3Smrg         nir_print_instr(chosen->instr, stderr);
7637ec681f3Smrg         fprintf(stderr, "\n");
7647ec681f3Smrg      }
7657ec681f3Smrg
7667ec681f3Smrg      return chosen;
7677ec681f3Smrg   }
7687ec681f3Smrg
7697ec681f3Smrg   /* Pick the max delay of the remaining ready set. */
7707ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
7717ec681f3Smrg      if (n->ready_time > scoreboard->time)
7727ec681f3Smrg         continue;
7737ec681f3Smrg      if (!chosen || chosen->max_delay < n->max_delay)
7747ec681f3Smrg         chosen = n;
7757ec681f3Smrg   }
7767ec681f3Smrg   if (chosen) {
7777ec681f3Smrg      if (debug) {
7787ec681f3Smrg         fprintf(stderr, "chose (ready max delay):   ");
7797ec681f3Smrg         nir_print_instr(chosen->instr, stderr);
7807ec681f3Smrg         fprintf(stderr, "\n");
7817ec681f3Smrg      }
7827ec681f3Smrg      return chosen;
7837ec681f3Smrg   }
7847ec681f3Smrg
7857ec681f3Smrg   /* Pick the max delay of the remaining leaders. */
7867ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
7877ec681f3Smrg      if (!chosen || chosen->max_delay < n->max_delay)
7887ec681f3Smrg         chosen = n;
7897ec681f3Smrg   }
7907ec681f3Smrg
7917ec681f3Smrg   if (debug) {
7927ec681f3Smrg      fprintf(stderr, "chose (max delay):         ");
7937ec681f3Smrg      nir_print_instr(chosen->instr, stderr);
7947ec681f3Smrg      fprintf(stderr, "\n");
7957ec681f3Smrg   }
7967ec681f3Smrg
7977ec681f3Smrg   return chosen;
7987ec681f3Smrg}
7997ec681f3Smrg
8007ec681f3Smrgstatic void
8017ec681f3Smrgdump_state(nir_schedule_scoreboard *scoreboard)
8027ec681f3Smrg{
8037ec681f3Smrg   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
8047ec681f3Smrg      fprintf(stderr, "maxdel %5d ", n->max_delay);
8057ec681f3Smrg      nir_print_instr(n->instr, stderr);
8067ec681f3Smrg      fprintf(stderr, "\n");
8077ec681f3Smrg
8087ec681f3Smrg      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
8097ec681f3Smrg         nir_schedule_node *child = (nir_schedule_node *)edge->child;
8107ec681f3Smrg
8117ec681f3Smrg         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
8127ec681f3Smrg         nir_print_instr(child->instr, stderr);
8137ec681f3Smrg         fprintf(stderr, "\n");
8147ec681f3Smrg      }
8157ec681f3Smrg   }
8167ec681f3Smrg}
8177ec681f3Smrg
8187ec681f3Smrgstatic void
8197ec681f3Smrgnir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
8207ec681f3Smrg                      void *reg_or_def,
8217ec681f3Smrg                      nir_instr *reg_or_def_parent,
8227ec681f3Smrg                      int pressure)
8237ec681f3Smrg{
8247ec681f3Smrg   /* Make the value live if it's the first time it's been used. */
8257ec681f3Smrg   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
8267ec681f3Smrg      _mesa_set_add(scoreboard->live_values, reg_or_def);
8277ec681f3Smrg      scoreboard->pressure += pressure;
8287ec681f3Smrg   }
8297ec681f3Smrg
8307ec681f3Smrg   /* Make the value dead if it's the last remaining use.  Be careful when one
8317ec681f3Smrg    * instruction uses a value twice to not decrement pressure twice.
8327ec681f3Smrg    */
8337ec681f3Smrg   struct set *remaining_uses =
8347ec681f3Smrg      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
8357ec681f3Smrg   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
8367ec681f3Smrg   if (entry) {
8377ec681f3Smrg      _mesa_set_remove(remaining_uses, entry);
8387ec681f3Smrg
8397ec681f3Smrg      if (remaining_uses->entries == 0)
8407ec681f3Smrg         scoreboard->pressure -= pressure;
8417ec681f3Smrg   }
8427ec681f3Smrg}
8437ec681f3Smrg
/**
 * nir_foreach_src callback invoked when an instruction is scheduled: updates
 * pressure tracking for this source, and flags the other not-yet-scheduled
 * users of the same SSA value as "partially evaluated" so the CSR heuristic
 * prefers finishing that value off.
 */
static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   /* Only act the first time this (value, instruction) use is consumed. */
   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              src->parent_instr);
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear if that would help or not.  Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->is_ssa &&
          src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (other_src->parent_instr == src->parent_instr)
               continue;

            /* NULL when the other user lives in a different block (and thus
             * has no node in this block's instr_map).
             */
            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     other_src->parent_instr);

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, "  New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   /* Update liveness/pressure for the underlying SSA def or register. */
   nir_schedule_mark_use(scoreboard,
                         src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg,
                         src->parent_instr,
                         nir_schedule_src_pressure(src));

   return true;
}
8917ec681f3Smrg
8927ec681f3Smrgstatic bool
8937ec681f3Smrgnir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state)
8947ec681f3Smrg{
8957ec681f3Smrg   nir_schedule_scoreboard *scoreboard = state;
8967ec681f3Smrg
8977ec681f3Smrg   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
8987ec681f3Smrg                         nir_schedule_def_pressure(def));
8997ec681f3Smrg
9007ec681f3Smrg   return true;
9017ec681f3Smrg}
9027ec681f3Smrg
9037ec681f3Smrgstatic bool
9047ec681f3Smrgnir_schedule_mark_dest_scheduled(nir_dest *dest, void *state)
9057ec681f3Smrg{
9067ec681f3Smrg   nir_schedule_scoreboard *scoreboard = state;
9077ec681f3Smrg
9087ec681f3Smrg   /* SSA defs were handled in nir_schedule_mark_def_scheduled()
9097ec681f3Smrg    */
9107ec681f3Smrg   if (dest->is_ssa)
9117ec681f3Smrg      return true;
9127ec681f3Smrg
9137ec681f3Smrg   /* XXX: This is not actually accurate for regs -- the last use of a reg may
9147ec681f3Smrg    * have a live interval that extends across control flow.  We should
9157ec681f3Smrg    * calculate the live ranges of regs, and have scheduler nodes for the CF
9167ec681f3Smrg    * nodes that also "use" the reg.
9177ec681f3Smrg    */
9187ec681f3Smrg   nir_schedule_mark_use(scoreboard, dest->reg.reg,
9197ec681f3Smrg                         dest->reg.parent_instr,
9207ec681f3Smrg                         nir_schedule_dest_pressure(dest));
9217ec681f3Smrg
9227ec681f3Smrg   return true;
9237ec681f3Smrg}
9247ec681f3Smrg
/**
 * Accounts for scheduling node \p n: updates liveness/pressure for its
 * sources and destinations, propagates ready times to its DAG children,
 * removes it from the DAG, and advances the scoreboard clock.
 */
static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   /* Pressure bookkeeping must happen before the node is pruned below. */
   nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
   nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard);

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      /* A child can't issue until this instruction's latency has elapsed. */
      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         /* n was this child's last unscheduled parent, so the prune below
          * promotes the child to a DAG head (schedulable).
          */
         if (debug) {
            fprintf(stderr, "  New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   /* Advance the clock to at least this node's issue time, plus one cycle
    * for the instruction itself.
    */
   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}
9537ec681f3Smrg
9547ec681f3Smrgstatic void
9557ec681f3Smrgnir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
9567ec681f3Smrg{
9577ec681f3Smrg   while (!list_is_empty(&scoreboard->dag->heads)) {
9587ec681f3Smrg      if (debug) {
9597ec681f3Smrg         fprintf(stderr, "current list:\n");
9607ec681f3Smrg         dump_state(scoreboard);
9617ec681f3Smrg      }
9627ec681f3Smrg
9637ec681f3Smrg      nir_schedule_node *chosen;
9647ec681f3Smrg      if (scoreboard->options->fallback)
9657ec681f3Smrg         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
9667ec681f3Smrg      else if (scoreboard->pressure < scoreboard->options->threshold)
9677ec681f3Smrg         chosen = nir_schedule_choose_instruction_csp(scoreboard);
9687ec681f3Smrg      else
9697ec681f3Smrg         chosen = nir_schedule_choose_instruction_csr(scoreboard);
9707ec681f3Smrg
9717ec681f3Smrg      /* Now that we've scheduled a new instruction, some of its children may
9727ec681f3Smrg       * be promoted to the list of instructions ready to be scheduled.
9737ec681f3Smrg       */
9747ec681f3Smrg      nir_schedule_mark_node_scheduled(scoreboard, chosen);
9757ec681f3Smrg
9767ec681f3Smrg      /* Move the instruction to the end (so our first chosen instructions are
9777ec681f3Smrg       * the start of the program).
9787ec681f3Smrg       */
9797ec681f3Smrg      exec_node_remove(&chosen->instr->node);
9807ec681f3Smrg      exec_list_push_tail(&block->instr_list, &chosen->instr->node);
9817ec681f3Smrg
9827ec681f3Smrg      if (debug)
9837ec681f3Smrg         fprintf(stderr, "\n");
9847ec681f3Smrg   }
9857ec681f3Smrg}
9867ec681f3Smrg
9877ec681f3Smrgstatic uint32_t
9887ec681f3Smrgnir_schedule_get_delay(nir_instr *instr)
9897ec681f3Smrg{
9907ec681f3Smrg   switch (instr->type) {
9917ec681f3Smrg   case nir_instr_type_ssa_undef:
9927ec681f3Smrg   case nir_instr_type_load_const:
9937ec681f3Smrg   case nir_instr_type_alu:
9947ec681f3Smrg   case nir_instr_type_deref:
9957ec681f3Smrg   case nir_instr_type_jump:
9967ec681f3Smrg   case nir_instr_type_parallel_copy:
9977ec681f3Smrg   case nir_instr_type_call:
9987ec681f3Smrg   case nir_instr_type_phi:
9997ec681f3Smrg      return 1;
10007ec681f3Smrg
10017ec681f3Smrg   case nir_instr_type_intrinsic:
10027ec681f3Smrg      /* XXX: Pick a large number for UBO/SSBO/image/shared loads */
10037ec681f3Smrg      return 1;
10047ec681f3Smrg
10057ec681f3Smrg   case nir_instr_type_tex:
10067ec681f3Smrg      /* Pick some large number to try to fetch textures early and sample them
10077ec681f3Smrg       * late.
10087ec681f3Smrg       */
10097ec681f3Smrg      return 100;
10107ec681f3Smrg   }
10117ec681f3Smrg
10127ec681f3Smrg   return 0;
10137ec681f3Smrg}
10147ec681f3Smrg
10157ec681f3Smrgstatic void
10167ec681f3Smrgnir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
10177ec681f3Smrg{
10187ec681f3Smrg   nir_schedule_node *n = (nir_schedule_node *)node;
10197ec681f3Smrg   uint32_t max_delay = 0;
10207ec681f3Smrg
10217ec681f3Smrg   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
10227ec681f3Smrg      nir_schedule_node *child = (nir_schedule_node *)edge->child;
10237ec681f3Smrg      max_delay = MAX2(child->max_delay, max_delay);
10247ec681f3Smrg   }
10257ec681f3Smrg
10267ec681f3Smrg   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
10277ec681f3Smrg }
10287ec681f3Smrg
10297ec681f3Smrgstatic void
10307ec681f3Smrgnir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
10317ec681f3Smrg{
10327ec681f3Smrg   void *mem_ctx = ralloc_context(NULL);
10337ec681f3Smrg   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);
10347ec681f3Smrg
10357ec681f3Smrg   scoreboard->dag = dag_create(mem_ctx);
10367ec681f3Smrg
10377ec681f3Smrg   nir_foreach_instr(instr, block) {
10387ec681f3Smrg      nir_schedule_node *n =
10397ec681f3Smrg         rzalloc(mem_ctx, nir_schedule_node);
10407ec681f3Smrg
10417ec681f3Smrg      n->instr = instr;
10427ec681f3Smrg      n->delay = nir_schedule_get_delay(instr);
10437ec681f3Smrg      dag_init_node(scoreboard->dag, &n->dag);
10447ec681f3Smrg
10457ec681f3Smrg      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
10467ec681f3Smrg   }
10477ec681f3Smrg
10487ec681f3Smrg   calculate_forward_deps(scoreboard, block);
10497ec681f3Smrg   calculate_reverse_deps(scoreboard, block);
10507ec681f3Smrg
10517ec681f3Smrg   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);
10527ec681f3Smrg
10537ec681f3Smrg   nir_schedule_instructions(scoreboard, block);
10547ec681f3Smrg
10557ec681f3Smrg   ralloc_free(mem_ctx);
10567ec681f3Smrg   scoreboard->instr_map = NULL;
10577ec681f3Smrg}
10587ec681f3Smrg
10597ec681f3Smrgstatic bool
10607ec681f3Smrgnir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state)
10617ec681f3Smrg{
10627ec681f3Smrg   nir_schedule_scoreboard *scoreboard = state;
10637ec681f3Smrg   struct set *def_uses = _mesa_pointer_set_create(scoreboard);
10647ec681f3Smrg
10657ec681f3Smrg   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);
10667ec681f3Smrg
10677ec681f3Smrg   _mesa_set_add(def_uses, def->parent_instr);
10687ec681f3Smrg
10697ec681f3Smrg   nir_foreach_use(src, def) {
10707ec681f3Smrg      _mesa_set_add(def_uses, src->parent_instr);
10717ec681f3Smrg   }
10727ec681f3Smrg
10737ec681f3Smrg   /* XXX: Handle if uses */
10747ec681f3Smrg
10757ec681f3Smrg   return true;
10767ec681f3Smrg}
10777ec681f3Smrg
10787ec681f3Smrgstatic nir_schedule_scoreboard *
10797ec681f3Smrgnir_schedule_get_scoreboard(nir_shader *shader,
10807ec681f3Smrg                            const nir_schedule_options *options)
10817ec681f3Smrg{
10827ec681f3Smrg   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);
10837ec681f3Smrg
10847ec681f3Smrg   scoreboard->shader = shader;
10857ec681f3Smrg   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
10867ec681f3Smrg   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
10877ec681f3Smrg   scoreboard->options = options;
10887ec681f3Smrg   scoreboard->pressure = 0;
10897ec681f3Smrg
10907ec681f3Smrg   nir_foreach_function(function, shader) {
10917ec681f3Smrg      nir_foreach_register(reg, &function->impl->registers) {
10927ec681f3Smrg         struct set *register_uses =
10937ec681f3Smrg            _mesa_pointer_set_create(scoreboard);
10947ec681f3Smrg
10957ec681f3Smrg         _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses);
10967ec681f3Smrg
10977ec681f3Smrg         nir_foreach_use(src, reg) {
10987ec681f3Smrg            _mesa_set_add(register_uses, src->parent_instr);
10997ec681f3Smrg         }
11007ec681f3Smrg
11017ec681f3Smrg         /* XXX: Handle if uses */
11027ec681f3Smrg
11037ec681f3Smrg         nir_foreach_def(dest, reg) {
11047ec681f3Smrg            _mesa_set_add(register_uses, dest->reg.parent_instr);
11057ec681f3Smrg         }
11067ec681f3Smrg      }
11077ec681f3Smrg
11087ec681f3Smrg      nir_foreach_block(block, function->impl) {
11097ec681f3Smrg         nir_foreach_instr(instr, block) {
11107ec681f3Smrg            nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard,
11117ec681f3Smrg                                scoreboard);
11127ec681f3Smrg         }
11137ec681f3Smrg
11147ec681f3Smrg         /* XXX: We're ignoring if uses, which may prioritize scheduling other
11157ec681f3Smrg          * uses of the if src even when it doesn't help.  That's not many
11167ec681f3Smrg          * values, though, so meh.
11177ec681f3Smrg          */
11187ec681f3Smrg      }
11197ec681f3Smrg   }
11207ec681f3Smrg
11217ec681f3Smrg   return scoreboard;
11227ec681f3Smrg}
11237ec681f3Smrg
11247ec681f3Smrgstatic void
11257ec681f3Smrgnir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
11267ec681f3Smrg{
11277ec681f3Smrg#ifdef NDEBUG
11287ec681f3Smrg   return;
11297ec681f3Smrg#endif
11307ec681f3Smrg
11317ec681f3Smrg   bool any_uses = false;
11327ec681f3Smrg
11337ec681f3Smrg   hash_table_foreach(scoreboard->remaining_uses, entry) {
11347ec681f3Smrg      struct set *remaining_uses = entry->data;
11357ec681f3Smrg
11367ec681f3Smrg      set_foreach(remaining_uses, instr_entry) {
11377ec681f3Smrg         if (!any_uses) {
11387ec681f3Smrg            fprintf(stderr, "Tracked uses remain after scheduling.  "
11397ec681f3Smrg                    "Affected instructions: \n");
11407ec681f3Smrg            any_uses = true;
11417ec681f3Smrg         }
11427ec681f3Smrg         nir_print_instr(instr_entry->key, stderr);
11437ec681f3Smrg         fprintf(stderr, "\n");
11447ec681f3Smrg      }
11457ec681f3Smrg   }
11467ec681f3Smrg
11477ec681f3Smrg   assert(!any_uses);
11487ec681f3Smrg}
11497ec681f3Smrg
11507ec681f3Smrg/**
11517ec681f3Smrg * Schedules the NIR instructions to try to decrease stalls (for example,
11527ec681f3Smrg * delaying texture reads) while managing register pressure.
11537ec681f3Smrg *
11547ec681f3Smrg * The threshold represents "number of NIR register/SSA def channels live
11557ec681f3Smrg * before switching the scheduling heuristic to reduce register pressure",
11567ec681f3Smrg * since most of our GPU architectures are scalar (extending to vector with a
11577ec681f3Smrg * flag wouldn't be hard).  This number should be a bit below the number of
11587ec681f3Smrg * registers available (counting any that may be occupied by system value
11597ec681f3Smrg * payload values, for example), since the heuristic may not always be able to
11607ec681f3Smrg * free a register immediately.  The amount below the limit is up to you to
11617ec681f3Smrg * tune.
11627ec681f3Smrg */
11637ec681f3Smrgvoid
11647ec681f3Smrgnir_schedule(nir_shader *shader,
11657ec681f3Smrg             const nir_schedule_options *options)
11667ec681f3Smrg{
11677ec681f3Smrg   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
11687ec681f3Smrg                                                                     options);
11697ec681f3Smrg
11707ec681f3Smrg   if (debug) {
11717ec681f3Smrg      fprintf(stderr, "NIR shader before scheduling:\n");
11727ec681f3Smrg      nir_print_shader(shader, stderr);
11737ec681f3Smrg   }
11747ec681f3Smrg
11757ec681f3Smrg   nir_foreach_function(function, shader) {
11767ec681f3Smrg      if (!function->impl)
11777ec681f3Smrg         continue;
11787ec681f3Smrg
11797ec681f3Smrg      nir_foreach_block(block, function->impl) {
11807ec681f3Smrg         nir_schedule_block(scoreboard, block);
11817ec681f3Smrg      }
11827ec681f3Smrg   }
11837ec681f3Smrg
11847ec681f3Smrg   nir_schedule_validate_uses(scoreboard);
11857ec681f3Smrg
11867ec681f3Smrg   ralloc_free(scoreboard);
11877ec681f3Smrg}
1188