17ec681f3Smrg/* 27ec681f3Smrg * Copyright © 2019 Broadcom 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 217ec681f3Smrg * IN THE SOFTWARE. 227ec681f3Smrg */ 237ec681f3Smrg 247ec681f3Smrg#include "nir_schedule.h" 257ec681f3Smrg#include "util/dag.h" 267ec681f3Smrg#include "util/u_dynarray.h" 277ec681f3Smrg 287ec681f3Smrg/** @file 297ec681f3Smrg * 307ec681f3Smrg * Implements basic-block-level prepass instruction scheduling in NIR to 317ec681f3Smrg * manage register pressure. 327ec681f3Smrg * 337ec681f3Smrg * This is based on the Goodman/Hsu paper (1988, cached copy at 347ec681f3Smrg * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We 357ec681f3Smrg * make up the DDG for NIR (which can be mostly done using the NIR def/use 367ec681f3Smrg * chains for SSA instructions, plus some edges for ordering register writes 377ec681f3Smrg * vs reads, and some more for ordering intrinsics). Then we pick heads off 387ec681f3Smrg * of the DDG using their heuristic to emit the NIR instructions back into the 397ec681f3Smrg * block in their new order. 407ec681f3Smrg * 417ec681f3Smrg * The hard case for prepass scheduling on GPUs seems to always be consuming 427ec681f3Smrg * texture/ubo results. The register pressure heuristic doesn't want to pick 437ec681f3Smrg * an instr that starts consuming texture results because it usually won't be 447ec681f3Smrg * the only usage, so that instruction will increase pressure. 457ec681f3Smrg * 467ec681f3Smrg * If you try to force consumption of tex results always, then in a case where 477ec681f3Smrg * single sample is used for many outputs, you'll end up picking every other 487ec681f3Smrg * user and expanding register pressure. The partially_evaluated_path flag 497ec681f3Smrg * helps tremendously, in that if you happen for whatever reason to pick a 507ec681f3Smrg * texture sample's output, then you'll try to finish off that sample. Future 517ec681f3Smrg * work may include doing some local search before locking in a choice, to try 527ec681f3Smrg * to more reliably find the case where just a few choices going against the 537ec681f3Smrg * heuristic can manage to free the whole vector. 547ec681f3Smrg */ 557ec681f3Smrg 567ec681f3Smrgstatic bool debug; 577ec681f3Smrg 587ec681f3Smrg/** 597ec681f3Smrg * Represents a node in the DDG for a NIR instruction. 607ec681f3Smrg */ 617ec681f3Smrgtypedef struct { 627ec681f3Smrg struct dag_node dag; /* must be first for our u_dynarray_foreach */ 637ec681f3Smrg nir_instr *instr; 647ec681f3Smrg bool partially_evaluated_path; 657ec681f3Smrg 667ec681f3Smrg /* Approximate estimate of the delay between starting this instruction and 677ec681f3Smrg * its results being available. 687ec681f3Smrg * 697ec681f3Smrg * Accuracy is not too important, given that we're prepass scheduling here 707ec681f3Smrg * and just trying to reduce excess dependencies introduced by a register 717ec681f3Smrg * allocator by stretching out the live intervals of expensive 727ec681f3Smrg * instructions. 737ec681f3Smrg */ 747ec681f3Smrg uint32_t delay; 757ec681f3Smrg 767ec681f3Smrg /* Cost of the maximum-delay path from this node to the leaves. */ 777ec681f3Smrg uint32_t max_delay; 787ec681f3Smrg 797ec681f3Smrg /* scoreboard->time value when this instruction can be scheduled without 807ec681f3Smrg * any stalls expected. 817ec681f3Smrg */ 827ec681f3Smrg uint32_t ready_time; 837ec681f3Smrg} nir_schedule_node; 847ec681f3Smrg 857ec681f3Smrgtypedef struct { 867ec681f3Smrg struct dag *dag; 877ec681f3Smrg 887ec681f3Smrg nir_shader *shader; 897ec681f3Smrg 907ec681f3Smrg /* Mapping from nir_register * or nir_ssa_def * to a struct set of 917ec681f3Smrg * instructions remaining to be scheduled using the register. 927ec681f3Smrg */ 937ec681f3Smrg struct hash_table *remaining_uses; 947ec681f3Smrg 957ec681f3Smrg /* Map from nir_instr to nir_schedule_node * */ 967ec681f3Smrg struct hash_table *instr_map; 977ec681f3Smrg 987ec681f3Smrg /* Set of nir_register * or nir_ssa_def * that have had any instruction 997ec681f3Smrg * scheduled on them. 1007ec681f3Smrg */ 1017ec681f3Smrg struct set *live_values; 1027ec681f3Smrg 1037ec681f3Smrg /* An abstract approximation of the number of nir_scheduler_node->delay 1047ec681f3Smrg * units since the start of the shader. 1057ec681f3Smrg */ 1067ec681f3Smrg uint32_t time; 1077ec681f3Smrg 1087ec681f3Smrg /* Number of channels currently used by the NIR instructions that have been 1097ec681f3Smrg * scheduled. 1107ec681f3Smrg */ 1117ec681f3Smrg int pressure; 1127ec681f3Smrg 1137ec681f3Smrg /* Options specified by the backend */ 1147ec681f3Smrg const nir_schedule_options *options; 1157ec681f3Smrg} nir_schedule_scoreboard; 1167ec681f3Smrg 1177ec681f3Smrg/* When walking the instructions in reverse, we use this flag to swap 1187ec681f3Smrg * before/after in add_dep(). 1197ec681f3Smrg */ 1207ec681f3Smrgenum direction { F, R }; 1217ec681f3Smrg 1227ec681f3Smrgstruct nir_schedule_class_dep { 1237ec681f3Smrg int klass; 1247ec681f3Smrg nir_schedule_node *node; 1257ec681f3Smrg struct nir_schedule_class_dep *next; 1267ec681f3Smrg}; 1277ec681f3Smrg 1287ec681f3Smrgtypedef struct { 1297ec681f3Smrg nir_schedule_scoreboard *scoreboard; 1307ec681f3Smrg 1317ec681f3Smrg /* Map from nir_register to nir_schedule_node * */ 1327ec681f3Smrg struct hash_table *reg_map; 1337ec681f3Smrg 1347ec681f3Smrg /* Scheduler nodes for last instruction involved in some class of dependency. 1357ec681f3Smrg */ 1367ec681f3Smrg nir_schedule_node *load_input; 1377ec681f3Smrg nir_schedule_node *store_shared; 1387ec681f3Smrg nir_schedule_node *unknown_intrinsic; 1397ec681f3Smrg nir_schedule_node *discard; 1407ec681f3Smrg nir_schedule_node *jump; 1417ec681f3Smrg 1427ec681f3Smrg struct nir_schedule_class_dep *class_deps; 1437ec681f3Smrg 1447ec681f3Smrg enum direction dir; 1457ec681f3Smrg} nir_deps_state; 1467ec681f3Smrg 1477ec681f3Smrgstatic void * 1487ec681f3Smrg_mesa_hash_table_search_data(struct hash_table *ht, void *key) 1497ec681f3Smrg{ 1507ec681f3Smrg struct hash_entry *entry = _mesa_hash_table_search(ht, key); 1517ec681f3Smrg if (!entry) 1527ec681f3Smrg return NULL; 1537ec681f3Smrg return entry->data; 1547ec681f3Smrg} 1557ec681f3Smrg 1567ec681f3Smrgstatic nir_schedule_node * 1577ec681f3Smrgnir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr) 1587ec681f3Smrg{ 1597ec681f3Smrg return _mesa_hash_table_search_data(instr_map, instr); 1607ec681f3Smrg} 1617ec681f3Smrg 1627ec681f3Smrgstatic struct set * 1637ec681f3Smrgnir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src) 1647ec681f3Smrg{ 1657ec681f3Smrg if (src->is_ssa) { 1667ec681f3Smrg return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa); 1677ec681f3Smrg } else { 1687ec681f3Smrg return _mesa_hash_table_search_data(scoreboard->remaining_uses, 1697ec681f3Smrg src->reg.reg); 1707ec681f3Smrg } 1717ec681f3Smrg} 1727ec681f3Smrg 1737ec681f3Smrgstatic int 1747ec681f3Smrgnir_schedule_def_pressure(nir_ssa_def *def) 1757ec681f3Smrg{ 1767ec681f3Smrg return def->num_components; 1777ec681f3Smrg} 1787ec681f3Smrg 1797ec681f3Smrgstatic int 1807ec681f3Smrgnir_schedule_src_pressure(nir_src *src) 1817ec681f3Smrg{ 1827ec681f3Smrg if (src->is_ssa) 1837ec681f3Smrg return nir_schedule_def_pressure(src->ssa); 1847ec681f3Smrg else 1857ec681f3Smrg return src->reg.reg->num_components; 1867ec681f3Smrg} 1877ec681f3Smrg 1887ec681f3Smrgstatic int 1897ec681f3Smrgnir_schedule_dest_pressure(nir_dest *dest) 1907ec681f3Smrg{ 1917ec681f3Smrg if (dest->is_ssa) 1927ec681f3Smrg return nir_schedule_def_pressure(&dest->ssa); 1937ec681f3Smrg else 1947ec681f3Smrg return dest->reg.reg->num_components; 1957ec681f3Smrg} 1967ec681f3Smrg 1977ec681f3Smrg/** 1987ec681f3Smrg * Adds a dependency such that @after must appear in the final program after 1997ec681f3Smrg * @before. 2007ec681f3Smrg * 2017ec681f3Smrg * We add @before as a child of @after, so that DAG heads are the outputs of 2027ec681f3Smrg * the program and we make our scheduling decisions bottom to top. 2037ec681f3Smrg */ 2047ec681f3Smrgstatic void 2057ec681f3Smrgadd_dep(nir_deps_state *state, 2067ec681f3Smrg nir_schedule_node *before, 2077ec681f3Smrg nir_schedule_node *after) 2087ec681f3Smrg{ 2097ec681f3Smrg if (!before || !after) 2107ec681f3Smrg return; 2117ec681f3Smrg 2127ec681f3Smrg assert(before != after); 2137ec681f3Smrg 2147ec681f3Smrg if (state->dir == F) 2157ec681f3Smrg dag_add_edge(&before->dag, &after->dag, NULL); 2167ec681f3Smrg else 2177ec681f3Smrg dag_add_edge(&after->dag, &before->dag, NULL); 2187ec681f3Smrg} 2197ec681f3Smrg 2207ec681f3Smrg 2217ec681f3Smrgstatic void 2227ec681f3Smrgadd_read_dep(nir_deps_state *state, 2237ec681f3Smrg nir_schedule_node *before, 2247ec681f3Smrg nir_schedule_node *after) 2257ec681f3Smrg{ 2267ec681f3Smrg add_dep(state, before, after); 2277ec681f3Smrg} 2287ec681f3Smrg 2297ec681f3Smrgstatic void 2307ec681f3Smrgadd_write_dep(nir_deps_state *state, 2317ec681f3Smrg nir_schedule_node **before, 2327ec681f3Smrg nir_schedule_node *after) 2337ec681f3Smrg{ 2347ec681f3Smrg add_dep(state, *before, after); 2357ec681f3Smrg *before = after; 2367ec681f3Smrg} 2377ec681f3Smrg 2387ec681f3Smrgstatic bool 2397ec681f3Smrgnir_schedule_reg_src_deps(nir_src *src, void *in_state) 2407ec681f3Smrg{ 2417ec681f3Smrg nir_deps_state *state = in_state; 2427ec681f3Smrg 2437ec681f3Smrg if (src->is_ssa) 2447ec681f3Smrg return true; 2457ec681f3Smrg 2467ec681f3Smrg struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, 2477ec681f3Smrg src->reg.reg); 2487ec681f3Smrg if (!entry) 2497ec681f3Smrg return true; 2507ec681f3Smrg nir_schedule_node *dst_n = entry->data; 2517ec681f3Smrg 2527ec681f3Smrg nir_schedule_node *src_n = 2537ec681f3Smrg nir_schedule_get_node(state->scoreboard->instr_map, 2547ec681f3Smrg src->parent_instr); 2557ec681f3Smrg 2567ec681f3Smrg add_dep(state, dst_n, src_n); 2577ec681f3Smrg 2587ec681f3Smrg return true; 2597ec681f3Smrg} 2607ec681f3Smrg 2617ec681f3Smrgstatic bool 2627ec681f3Smrgnir_schedule_reg_dest_deps(nir_dest *dest, void *in_state) 2637ec681f3Smrg{ 2647ec681f3Smrg nir_deps_state *state = in_state; 2657ec681f3Smrg 2667ec681f3Smrg if (dest->is_ssa) 2677ec681f3Smrg return true; 2687ec681f3Smrg 2697ec681f3Smrg nir_schedule_node *dest_n = 2707ec681f3Smrg nir_schedule_get_node(state->scoreboard->instr_map, 2717ec681f3Smrg dest->reg.parent_instr); 2727ec681f3Smrg 2737ec681f3Smrg struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, 2747ec681f3Smrg dest->reg.reg); 2757ec681f3Smrg if (!entry) { 2767ec681f3Smrg _mesa_hash_table_insert(state->reg_map, dest->reg.reg, dest_n); 2777ec681f3Smrg return true; 2787ec681f3Smrg } 2797ec681f3Smrg nir_schedule_node **before = (nir_schedule_node **)&entry->data; 2807ec681f3Smrg 2817ec681f3Smrg add_write_dep(state, before, dest_n); 2827ec681f3Smrg 2837ec681f3Smrg return true; 2847ec681f3Smrg} 2857ec681f3Smrg 2867ec681f3Smrgstatic bool 2877ec681f3Smrgnir_schedule_ssa_deps(nir_ssa_def *def, void *in_state) 2887ec681f3Smrg{ 2897ec681f3Smrg nir_deps_state *state = in_state; 2907ec681f3Smrg struct hash_table *instr_map = state->scoreboard->instr_map; 2917ec681f3Smrg nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr); 2927ec681f3Smrg 2937ec681f3Smrg nir_foreach_use(src, def) { 2947ec681f3Smrg nir_schedule_node *use_n = nir_schedule_get_node(instr_map, 2957ec681f3Smrg src->parent_instr); 2967ec681f3Smrg 2977ec681f3Smrg add_read_dep(state, def_n, use_n); 2987ec681f3Smrg } 2997ec681f3Smrg 3007ec681f3Smrg return true; 3017ec681f3Smrg} 3027ec681f3Smrg 3037ec681f3Smrgstatic struct nir_schedule_class_dep * 3047ec681f3Smrgnir_schedule_get_class_dep(nir_deps_state *state, 3057ec681f3Smrg int klass) 3067ec681f3Smrg{ 3077ec681f3Smrg for (struct nir_schedule_class_dep *class_dep = state->class_deps; 3087ec681f3Smrg class_dep != NULL; 3097ec681f3Smrg class_dep = class_dep->next) { 3107ec681f3Smrg if (class_dep->klass == klass) 3117ec681f3Smrg return class_dep; 3127ec681f3Smrg } 3137ec681f3Smrg 3147ec681f3Smrg struct nir_schedule_class_dep *class_dep = 3157ec681f3Smrg ralloc(state->reg_map, struct nir_schedule_class_dep); 3167ec681f3Smrg 3177ec681f3Smrg class_dep->klass = klass; 3187ec681f3Smrg class_dep->node = NULL; 3197ec681f3Smrg class_dep->next = state->class_deps; 3207ec681f3Smrg 3217ec681f3Smrg state->class_deps = class_dep; 3227ec681f3Smrg 3237ec681f3Smrg return class_dep; 3247ec681f3Smrg} 3257ec681f3Smrg 3267ec681f3Smrgstatic void 3277ec681f3Smrgnir_schedule_intrinsic_deps(nir_deps_state *state, 3287ec681f3Smrg nir_intrinsic_instr *instr) 3297ec681f3Smrg{ 3307ec681f3Smrg nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map, 3317ec681f3Smrg &instr->instr); 3327ec681f3Smrg const nir_schedule_options *options = state->scoreboard->options; 3337ec681f3Smrg nir_schedule_dependency dep; 3347ec681f3Smrg 3357ec681f3Smrg if (options->intrinsic_cb && 3367ec681f3Smrg options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) { 3377ec681f3Smrg struct nir_schedule_class_dep *class_dep = 3387ec681f3Smrg nir_schedule_get_class_dep(state, dep.klass); 3397ec681f3Smrg 3407ec681f3Smrg switch (dep.type) { 3417ec681f3Smrg case NIR_SCHEDULE_READ_DEPENDENCY: 3427ec681f3Smrg add_read_dep(state, class_dep->node, n); 3437ec681f3Smrg break; 3447ec681f3Smrg case NIR_SCHEDULE_WRITE_DEPENDENCY: 3457ec681f3Smrg add_write_dep(state, &class_dep->node, n); 3467ec681f3Smrg break; 3477ec681f3Smrg } 3487ec681f3Smrg } 3497ec681f3Smrg 3507ec681f3Smrg switch (instr->intrinsic) { 3517ec681f3Smrg case nir_intrinsic_load_uniform: 3527ec681f3Smrg case nir_intrinsic_load_ubo: 3537ec681f3Smrg case nir_intrinsic_load_front_face: 3547ec681f3Smrg break; 3557ec681f3Smrg 3567ec681f3Smrg case nir_intrinsic_discard: 3577ec681f3Smrg case nir_intrinsic_discard_if: 3587ec681f3Smrg case nir_intrinsic_demote: 3597ec681f3Smrg case nir_intrinsic_demote_if: 3607ec681f3Smrg case nir_intrinsic_terminate: 3617ec681f3Smrg case nir_intrinsic_terminate_if: 3627ec681f3Smrg /* We are adding two dependencies: 3637ec681f3Smrg * 3647ec681f3Smrg * * A individual one that we could use to add a read_dep while handling 3657ec681f3Smrg * nir_instr_type_tex 3667ec681f3Smrg * 3677ec681f3Smrg * * Include it on the unknown intrinsic set, as we want discard to be 3687ec681f3Smrg * serialized in in the same order relative to intervening stores or 3697ec681f3Smrg * atomic accesses to SSBOs and images 3707ec681f3Smrg */ 3717ec681f3Smrg add_write_dep(state, &state->discard, n); 3727ec681f3Smrg add_write_dep(state, &state->unknown_intrinsic, n); 3737ec681f3Smrg break; 3747ec681f3Smrg 3757ec681f3Smrg case nir_intrinsic_store_output: 3767ec681f3Smrg /* For some hardware and stages, output stores affect the same shared 3777ec681f3Smrg * memory as input loads. 3787ec681f3Smrg */ 3797ec681f3Smrg if ((state->scoreboard->options->stages_with_shared_io_memory & 3807ec681f3Smrg (1 << state->scoreboard->shader->info.stage))) 3817ec681f3Smrg add_write_dep(state, &state->load_input, n); 3827ec681f3Smrg 3837ec681f3Smrg /* Make sure that preceding discards stay before the store_output */ 3847ec681f3Smrg add_read_dep(state, state->discard, n); 3857ec681f3Smrg 3867ec681f3Smrg break; 3877ec681f3Smrg 3887ec681f3Smrg case nir_intrinsic_load_input: 3897ec681f3Smrg case nir_intrinsic_load_per_vertex_input: 3907ec681f3Smrg add_read_dep(state, state->load_input, n); 3917ec681f3Smrg break; 3927ec681f3Smrg 3937ec681f3Smrg case nir_intrinsic_load_shared: 3947ec681f3Smrg /* Don't move load_shared beyond a following store_shared, as it could 3957ec681f3Smrg * change their value 3967ec681f3Smrg */ 3977ec681f3Smrg add_read_dep(state, state->store_shared, n); 3987ec681f3Smrg break; 3997ec681f3Smrg 4007ec681f3Smrg case nir_intrinsic_store_shared: 4017ec681f3Smrg add_write_dep(state, &state->store_shared, n); 4027ec681f3Smrg break; 4037ec681f3Smrg 4047ec681f3Smrg case nir_intrinsic_control_barrier: 4057ec681f3Smrg case nir_intrinsic_memory_barrier_shared: 4067ec681f3Smrg add_write_dep(state, &state->store_shared, n); 4077ec681f3Smrg 4087ec681f3Smrg /* Serialize against ssbos/atomics/etc. */ 4097ec681f3Smrg add_write_dep(state, &state->unknown_intrinsic, n); 4107ec681f3Smrg break; 4117ec681f3Smrg 4127ec681f3Smrg default: 4137ec681f3Smrg /* Attempt to handle other intrinsics that we haven't individually 4147ec681f3Smrg * categorized by serializing them in the same order relative to each 4157ec681f3Smrg * other. 4167ec681f3Smrg */ 4177ec681f3Smrg add_write_dep(state, &state->unknown_intrinsic, n); 4187ec681f3Smrg break; 4197ec681f3Smrg } 4207ec681f3Smrg} 4217ec681f3Smrg 4227ec681f3Smrg/** 4237ec681f3Smrg * Common code for dependencies that need to be tracked both forward and 4247ec681f3Smrg * backward. 4257ec681f3Smrg * 4267ec681f3Smrg * This is for things like "all reads of r4 have to happen between the r4 4277ec681f3Smrg * writes that surround them". 4287ec681f3Smrg */ 4297ec681f3Smrgstatic void 4307ec681f3Smrgnir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n) 4317ec681f3Smrg{ 4327ec681f3Smrg nir_instr *instr = n->instr; 4337ec681f3Smrg 4347ec681f3Smrg /* For NIR SSA defs, we only need to do a single pass of making the uses 4357ec681f3Smrg * depend on the def. 4367ec681f3Smrg */ 4377ec681f3Smrg if (state->dir == F) 4387ec681f3Smrg nir_foreach_ssa_def(instr, nir_schedule_ssa_deps, state); 4397ec681f3Smrg 4407ec681f3Smrg /* For NIR regs, track the last writer in the scheduler state so that we 4417ec681f3Smrg * can keep the writes in order and let reads get reordered only between 4427ec681f3Smrg * each write. 4437ec681f3Smrg */ 4447ec681f3Smrg nir_foreach_src(instr, nir_schedule_reg_src_deps, state); 4457ec681f3Smrg 4467ec681f3Smrg nir_foreach_dest(instr, nir_schedule_reg_dest_deps, state); 4477ec681f3Smrg 4487ec681f3Smrg /* Make sure any other instructions keep their positions relative to 4497ec681f3Smrg * jumps. 4507ec681f3Smrg */ 4517ec681f3Smrg if (instr->type != nir_instr_type_jump) 4527ec681f3Smrg add_read_dep(state, state->jump, n); 4537ec681f3Smrg 4547ec681f3Smrg switch (instr->type) { 4557ec681f3Smrg case nir_instr_type_ssa_undef: 4567ec681f3Smrg case nir_instr_type_load_const: 4577ec681f3Smrg case nir_instr_type_alu: 4587ec681f3Smrg case nir_instr_type_deref: 4597ec681f3Smrg break; 4607ec681f3Smrg 4617ec681f3Smrg case nir_instr_type_tex: 4627ec681f3Smrg /* Don't move texture ops before a discard, as that could increase 4637ec681f3Smrg * memory bandwidth for reading the discarded samples. 4647ec681f3Smrg */ 4657ec681f3Smrg add_read_dep(state, state->discard, n); 4667ec681f3Smrg break; 4677ec681f3Smrg 4687ec681f3Smrg case nir_instr_type_jump: 4697ec681f3Smrg add_write_dep(state, &state->jump, n); 4707ec681f3Smrg break; 4717ec681f3Smrg 4727ec681f3Smrg case nir_instr_type_call: 4737ec681f3Smrg unreachable("Calls should have been lowered"); 4747ec681f3Smrg break; 4757ec681f3Smrg 4767ec681f3Smrg case nir_instr_type_parallel_copy: 4777ec681f3Smrg unreachable("Parallel copies should have been lowered"); 4787ec681f3Smrg break; 4797ec681f3Smrg 4807ec681f3Smrg case nir_instr_type_phi: 4817ec681f3Smrg unreachable("nir_schedule() should be called after lowering from SSA"); 4827ec681f3Smrg break; 4837ec681f3Smrg 4847ec681f3Smrg case nir_instr_type_intrinsic: 4857ec681f3Smrg nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr)); 4867ec681f3Smrg break; 4877ec681f3Smrg } 4887ec681f3Smrg} 4897ec681f3Smrg 4907ec681f3Smrgstatic void 4917ec681f3Smrgcalculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) 4927ec681f3Smrg{ 4937ec681f3Smrg nir_deps_state state = { 4947ec681f3Smrg .scoreboard = scoreboard, 4957ec681f3Smrg .dir = F, 4967ec681f3Smrg .reg_map = _mesa_pointer_hash_table_create(NULL), 4977ec681f3Smrg }; 4987ec681f3Smrg 4997ec681f3Smrg nir_foreach_instr(instr, block) { 5007ec681f3Smrg nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, 5017ec681f3Smrg instr); 5027ec681f3Smrg nir_schedule_calculate_deps(&state, node); 5037ec681f3Smrg } 5047ec681f3Smrg 5057ec681f3Smrg ralloc_free(state.reg_map); 5067ec681f3Smrg} 5077ec681f3Smrg 5087ec681f3Smrgstatic void 5097ec681f3Smrgcalculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block) 5107ec681f3Smrg{ 5117ec681f3Smrg nir_deps_state state = { 5127ec681f3Smrg .scoreboard = scoreboard, 5137ec681f3Smrg .dir = R, 5147ec681f3Smrg .reg_map = _mesa_pointer_hash_table_create(NULL), 5157ec681f3Smrg }; 5167ec681f3Smrg 5177ec681f3Smrg nir_foreach_instr_reverse(instr, block) { 5187ec681f3Smrg nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map, 5197ec681f3Smrg instr); 5207ec681f3Smrg nir_schedule_calculate_deps(&state, node); 5217ec681f3Smrg } 5227ec681f3Smrg 5237ec681f3Smrg ralloc_free(state.reg_map); 5247ec681f3Smrg} 5257ec681f3Smrg 5267ec681f3Smrgtypedef struct { 5277ec681f3Smrg nir_schedule_scoreboard *scoreboard; 5287ec681f3Smrg int regs_freed; 5297ec681f3Smrg} nir_schedule_regs_freed_state; 5307ec681f3Smrg 5317ec681f3Smrgstatic bool 5327ec681f3Smrgnir_schedule_regs_freed_src_cb(nir_src *src, void *in_state) 5337ec681f3Smrg{ 5347ec681f3Smrg nir_schedule_regs_freed_state *state = in_state; 5357ec681f3Smrg nir_schedule_scoreboard *scoreboard = state->scoreboard; 5367ec681f3Smrg struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); 5377ec681f3Smrg 5387ec681f3Smrg if (remaining_uses->entries == 1 && 5397ec681f3Smrg _mesa_set_search(remaining_uses, src->parent_instr)) { 5407ec681f3Smrg state->regs_freed += nir_schedule_src_pressure(src); 5417ec681f3Smrg } 5427ec681f3Smrg 5437ec681f3Smrg return true; 5447ec681f3Smrg} 5457ec681f3Smrg 5467ec681f3Smrgstatic bool 5477ec681f3Smrgnir_schedule_regs_freed_def_cb(nir_ssa_def *def, void *in_state) 5487ec681f3Smrg{ 5497ec681f3Smrg nir_schedule_regs_freed_state *state = in_state; 5507ec681f3Smrg 5517ec681f3Smrg state->regs_freed -= nir_schedule_def_pressure(def); 5527ec681f3Smrg 5537ec681f3Smrg return true; 5547ec681f3Smrg} 5557ec681f3Smrg 5567ec681f3Smrgstatic bool 5577ec681f3Smrgnir_schedule_regs_freed_dest_cb(nir_dest *dest, void *in_state) 5587ec681f3Smrg{ 5597ec681f3Smrg nir_schedule_regs_freed_state *state = in_state; 5607ec681f3Smrg nir_schedule_scoreboard *scoreboard = state->scoreboard; 5617ec681f3Smrg 5627ec681f3Smrg if (dest->is_ssa) 5637ec681f3Smrg return true; 5647ec681f3Smrg 5657ec681f3Smrg nir_register *reg = dest->reg.reg; 5667ec681f3Smrg 5677ec681f3Smrg /* Only the first def of a reg counts against register pressure. */ 5687ec681f3Smrg if (!_mesa_set_search(scoreboard->live_values, reg)) 5697ec681f3Smrg state->regs_freed -= nir_schedule_dest_pressure(dest); 5707ec681f3Smrg 5717ec681f3Smrg return true; 5727ec681f3Smrg} 5737ec681f3Smrg 5747ec681f3Smrgstatic int 5757ec681f3Smrgnir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n) 5767ec681f3Smrg{ 5777ec681f3Smrg nir_schedule_regs_freed_state state = { 5787ec681f3Smrg .scoreboard = scoreboard, 5797ec681f3Smrg }; 5807ec681f3Smrg 5817ec681f3Smrg nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state); 5827ec681f3Smrg 5837ec681f3Smrg nir_foreach_ssa_def(n->instr, nir_schedule_regs_freed_def_cb, &state); 5847ec681f3Smrg 5857ec681f3Smrg nir_foreach_dest(n->instr, nir_schedule_regs_freed_dest_cb, &state); 5867ec681f3Smrg 5877ec681f3Smrg return state.regs_freed; 5887ec681f3Smrg} 5897ec681f3Smrg 5907ec681f3Smrg/** 5917ec681f3Smrg * Chooses an instruction that will minimise the register pressure as much as 5927ec681f3Smrg * possible. This should only be used as a fallback when the regular scheduling 5937ec681f3Smrg * generates a shader whose register allocation fails. 5947ec681f3Smrg */ 5957ec681f3Smrgstatic nir_schedule_node * 5967ec681f3Smrgnir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard) 5977ec681f3Smrg{ 5987ec681f3Smrg nir_schedule_node *chosen = NULL; 5997ec681f3Smrg 6007ec681f3Smrg /* Find the leader in the ready (shouldn't-stall) set with the mininum 6017ec681f3Smrg * cost. 6027ec681f3Smrg */ 6037ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 6047ec681f3Smrg if (scoreboard->time < n->ready_time) 6057ec681f3Smrg continue; 6067ec681f3Smrg 6077ec681f3Smrg if (!chosen || chosen->max_delay > n->max_delay) 6087ec681f3Smrg chosen = n; 6097ec681f3Smrg } 6107ec681f3Smrg if (chosen) { 6117ec681f3Smrg if (debug) { 6127ec681f3Smrg fprintf(stderr, "chose (ready fallback): "); 6137ec681f3Smrg nir_print_instr(chosen->instr, stderr); 6147ec681f3Smrg fprintf(stderr, "\n"); 6157ec681f3Smrg } 6167ec681f3Smrg 6177ec681f3Smrg return chosen; 6187ec681f3Smrg } 6197ec681f3Smrg 6207ec681f3Smrg /* Otherwise, choose the leader with the minimum cost. */ 6217ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 6227ec681f3Smrg if (!chosen || chosen->max_delay > n->max_delay) 6237ec681f3Smrg chosen = n; 6247ec681f3Smrg } 6257ec681f3Smrg if (debug) { 6267ec681f3Smrg fprintf(stderr, "chose (leader fallback): "); 6277ec681f3Smrg nir_print_instr(chosen->instr, stderr); 6287ec681f3Smrg fprintf(stderr, "\n"); 6297ec681f3Smrg } 6307ec681f3Smrg 6317ec681f3Smrg return chosen; 6327ec681f3Smrg} 6337ec681f3Smrg 6347ec681f3Smrg/** 6357ec681f3Smrg * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code 6367ec681f3Smrg * Scheduling for Parallelism) heuristic. 6377ec681f3Smrg * 6387ec681f3Smrg * Picks an instruction on the critical that's ready to execute without 6397ec681f3Smrg * stalls, if possible, otherwise picks the instruction on the critical path. 6407ec681f3Smrg */ 6417ec681f3Smrgstatic nir_schedule_node * 6427ec681f3Smrgnir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard) 6437ec681f3Smrg{ 6447ec681f3Smrg nir_schedule_node *chosen = NULL; 6457ec681f3Smrg 6467ec681f3Smrg /* Find the leader in the ready (shouldn't-stall) set with the maximum 6477ec681f3Smrg * cost. 6487ec681f3Smrg */ 6497ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 6507ec681f3Smrg if (scoreboard->time < n->ready_time) 6517ec681f3Smrg continue; 6527ec681f3Smrg 6537ec681f3Smrg if (!chosen || chosen->max_delay < n->max_delay) 6547ec681f3Smrg chosen = n; 6557ec681f3Smrg } 6567ec681f3Smrg if (chosen) { 6577ec681f3Smrg if (debug) { 6587ec681f3Smrg fprintf(stderr, "chose (ready): "); 6597ec681f3Smrg nir_print_instr(chosen->instr, stderr); 6607ec681f3Smrg fprintf(stderr, "\n"); 6617ec681f3Smrg } 6627ec681f3Smrg 6637ec681f3Smrg return chosen; 6647ec681f3Smrg } 6657ec681f3Smrg 6667ec681f3Smrg /* Otherwise, choose the leader with the maximum cost. */ 6677ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 6687ec681f3Smrg if (!chosen || chosen->max_delay < n->max_delay) 6697ec681f3Smrg chosen = n; 6707ec681f3Smrg } 6717ec681f3Smrg if (debug) { 6727ec681f3Smrg fprintf(stderr, "chose (leader): "); 6737ec681f3Smrg nir_print_instr(chosen->instr, stderr); 6747ec681f3Smrg fprintf(stderr, "\n"); 6757ec681f3Smrg } 6767ec681f3Smrg 6777ec681f3Smrg return chosen; 6787ec681f3Smrg} 6797ec681f3Smrg 6807ec681f3Smrg/** 6817ec681f3Smrg * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code 6827ec681f3Smrg * Scheduling for Register pressure) heuristic. 6837ec681f3Smrg */ 6847ec681f3Smrgstatic nir_schedule_node * 6857ec681f3Smrgnir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard) 6867ec681f3Smrg{ 6877ec681f3Smrg nir_schedule_node *chosen = NULL; 6887ec681f3Smrg 6897ec681f3Smrg /* Find a ready inst with regs freed and pick the one with max cost. */ 6907ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 6917ec681f3Smrg if (n->ready_time > scoreboard->time) 6927ec681f3Smrg continue; 6937ec681f3Smrg 6947ec681f3Smrg int regs_freed = nir_schedule_regs_freed(scoreboard, n); 6957ec681f3Smrg 6967ec681f3Smrg if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { 6977ec681f3Smrg chosen = n; 6987ec681f3Smrg } 6997ec681f3Smrg } 7007ec681f3Smrg if (chosen) { 7017ec681f3Smrg if (debug) { 7027ec681f3Smrg fprintf(stderr, "chose (freed+ready): "); 7037ec681f3Smrg nir_print_instr(chosen->instr, stderr); 7047ec681f3Smrg fprintf(stderr, "\n"); 7057ec681f3Smrg } 7067ec681f3Smrg 7077ec681f3Smrg return chosen; 7087ec681f3Smrg } 7097ec681f3Smrg 7107ec681f3Smrg /* Find a leader with regs freed and pick the one with max cost. */ 7117ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 7127ec681f3Smrg int regs_freed = nir_schedule_regs_freed(scoreboard, n); 7137ec681f3Smrg 7147ec681f3Smrg if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) { 7157ec681f3Smrg chosen = n; 7167ec681f3Smrg } 7177ec681f3Smrg } 7187ec681f3Smrg if (chosen) { 7197ec681f3Smrg if (debug) { 7207ec681f3Smrg fprintf(stderr, "chose (regs freed): "); 7217ec681f3Smrg nir_print_instr(chosen->instr, stderr); 7227ec681f3Smrg fprintf(stderr, "\n"); 7237ec681f3Smrg } 7247ec681f3Smrg 7257ec681f3Smrg return chosen; 7267ec681f3Smrg } 7277ec681f3Smrg 7287ec681f3Smrg /* Find a partially evaluated path and try to finish it off */ 7297ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 7307ec681f3Smrg if (n->partially_evaluated_path && 7317ec681f3Smrg (!chosen || chosen->max_delay < n->max_delay)) { 7327ec681f3Smrg chosen = n; 7337ec681f3Smrg } 7347ec681f3Smrg } 7357ec681f3Smrg if (chosen) { 7367ec681f3Smrg if (debug) { 7377ec681f3Smrg fprintf(stderr, "chose (partial path): "); 7387ec681f3Smrg nir_print_instr(chosen->instr, stderr); 7397ec681f3Smrg fprintf(stderr, "\n"); 7407ec681f3Smrg } 7417ec681f3Smrg 7427ec681f3Smrg return chosen; 7437ec681f3Smrg } 7447ec681f3Smrg 7457ec681f3Smrg /* Contra the paper, pick a leader with no effect on used regs. This may 7467ec681f3Smrg * open up new opportunities, as otherwise a single-operand instr consuming 7477ec681f3Smrg * a value will tend to block finding freeing that value. This had a 7487ec681f3Smrg * massive effect on reducing spilling on V3D. 7497ec681f3Smrg * 7507ec681f3Smrg * XXX: Should this prioritize ready? 7517ec681f3Smrg */ 7527ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 7537ec681f3Smrg if (nir_schedule_regs_freed(scoreboard, n) != 0) 7547ec681f3Smrg continue; 7557ec681f3Smrg 7567ec681f3Smrg if (!chosen || chosen->max_delay < n->max_delay) 7577ec681f3Smrg chosen = n; 7587ec681f3Smrg } 7597ec681f3Smrg if (chosen) { 7607ec681f3Smrg if (debug) { 7617ec681f3Smrg fprintf(stderr, "chose (regs no-op): "); 7627ec681f3Smrg nir_print_instr(chosen->instr, stderr); 7637ec681f3Smrg fprintf(stderr, "\n"); 7647ec681f3Smrg } 7657ec681f3Smrg 7667ec681f3Smrg return chosen; 7677ec681f3Smrg } 7687ec681f3Smrg 7697ec681f3Smrg /* Pick the max delay of the remaining ready set. */ 7707ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 7717ec681f3Smrg if (n->ready_time > scoreboard->time) 7727ec681f3Smrg continue; 7737ec681f3Smrg if (!chosen || chosen->max_delay < n->max_delay) 7747ec681f3Smrg chosen = n; 7757ec681f3Smrg } 7767ec681f3Smrg if (chosen) { 7777ec681f3Smrg if (debug) { 7787ec681f3Smrg fprintf(stderr, "chose (ready max delay): "); 7797ec681f3Smrg nir_print_instr(chosen->instr, stderr); 7807ec681f3Smrg fprintf(stderr, "\n"); 7817ec681f3Smrg } 7827ec681f3Smrg return chosen; 7837ec681f3Smrg } 7847ec681f3Smrg 7857ec681f3Smrg /* Pick the max delay of the remaining leaders. */ 7867ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 7877ec681f3Smrg if (!chosen || chosen->max_delay < n->max_delay) 7887ec681f3Smrg chosen = n; 7897ec681f3Smrg } 7907ec681f3Smrg 7917ec681f3Smrg if (debug) { 7927ec681f3Smrg fprintf(stderr, "chose (max delay): "); 7937ec681f3Smrg nir_print_instr(chosen->instr, stderr); 7947ec681f3Smrg fprintf(stderr, "\n"); 7957ec681f3Smrg } 7967ec681f3Smrg 7977ec681f3Smrg return chosen; 7987ec681f3Smrg} 7997ec681f3Smrg 8007ec681f3Smrgstatic void 8017ec681f3Smrgdump_state(nir_schedule_scoreboard *scoreboard) 8027ec681f3Smrg{ 8037ec681f3Smrg list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) { 8047ec681f3Smrg fprintf(stderr, "maxdel %5d ", n->max_delay); 8057ec681f3Smrg nir_print_instr(n->instr, stderr); 8067ec681f3Smrg fprintf(stderr, "\n"); 8077ec681f3Smrg 8087ec681f3Smrg util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 8097ec681f3Smrg nir_schedule_node *child = (nir_schedule_node *)edge->child; 8107ec681f3Smrg 8117ec681f3Smrg fprintf(stderr, " -> (%d parents) ", child->dag.parent_count); 8127ec681f3Smrg nir_print_instr(child->instr, stderr); 8137ec681f3Smrg fprintf(stderr, "\n"); 8147ec681f3Smrg } 8157ec681f3Smrg } 8167ec681f3Smrg} 8177ec681f3Smrg 8187ec681f3Smrgstatic void 8197ec681f3Smrgnir_schedule_mark_use(nir_schedule_scoreboard *scoreboard, 8207ec681f3Smrg void *reg_or_def, 8217ec681f3Smrg nir_instr *reg_or_def_parent, 8227ec681f3Smrg int pressure) 8237ec681f3Smrg{ 8247ec681f3Smrg /* Make the value live if it's the first time it's been used. */ 8257ec681f3Smrg if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) { 8267ec681f3Smrg _mesa_set_add(scoreboard->live_values, reg_or_def); 8277ec681f3Smrg scoreboard->pressure += pressure; 8287ec681f3Smrg } 8297ec681f3Smrg 8307ec681f3Smrg /* Make the value dead if it's the last remaining use. Be careful when one 8317ec681f3Smrg * instruction uses a value twice to not decrement pressure twice. 8327ec681f3Smrg */ 8337ec681f3Smrg struct set *remaining_uses = 8347ec681f3Smrg _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def); 8357ec681f3Smrg struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent); 8367ec681f3Smrg if (entry) { 8377ec681f3Smrg _mesa_set_remove(remaining_uses, entry); 8387ec681f3Smrg 8397ec681f3Smrg if (remaining_uses->entries == 0) 8407ec681f3Smrg scoreboard->pressure -= pressure; 8417ec681f3Smrg } 8427ec681f3Smrg} 8437ec681f3Smrg 8447ec681f3Smrgstatic bool 8457ec681f3Smrgnir_schedule_mark_src_scheduled(nir_src *src, void *state) 8467ec681f3Smrg{ 8477ec681f3Smrg nir_schedule_scoreboard *scoreboard = state; 8487ec681f3Smrg struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src); 8497ec681f3Smrg 8507ec681f3Smrg struct set_entry *entry = _mesa_set_search(remaining_uses, 8517ec681f3Smrg src->parent_instr); 8527ec681f3Smrg if (entry) { 8537ec681f3Smrg /* Once we've used an SSA value in one instruction, bump the priority of 8547ec681f3Smrg * the other uses so the SSA value can get fully consumed. 8557ec681f3Smrg * 8567ec681f3Smrg * We don't do this for registers, and it's would be a hassle and it's 8577ec681f3Smrg * unclear if that would help or not. Also, skip it for constants, as 8587ec681f3Smrg * they're often folded as immediates into backend instructions and have 8597ec681f3Smrg * many unrelated instructions all referencing the same value (0). 8607ec681f3Smrg */ 8617ec681f3Smrg if (src->is_ssa && 8627ec681f3Smrg src->ssa->parent_instr->type != nir_instr_type_load_const) { 8637ec681f3Smrg nir_foreach_use(other_src, src->ssa) { 8647ec681f3Smrg if (other_src->parent_instr == src->parent_instr) 8657ec681f3Smrg continue; 8667ec681f3Smrg 8677ec681f3Smrg nir_schedule_node *n = 8687ec681f3Smrg nir_schedule_get_node(scoreboard->instr_map, 8697ec681f3Smrg other_src->parent_instr); 8707ec681f3Smrg 8717ec681f3Smrg if (n && !n->partially_evaluated_path) { 8727ec681f3Smrg if (debug) { 8737ec681f3Smrg fprintf(stderr, " New partially evaluated path: "); 8747ec681f3Smrg nir_print_instr(n->instr, stderr); 8757ec681f3Smrg fprintf(stderr, "\n"); 8767ec681f3Smrg } 8777ec681f3Smrg 8787ec681f3Smrg n->partially_evaluated_path = true; 8797ec681f3Smrg } 8807ec681f3Smrg } 8817ec681f3Smrg } 8827ec681f3Smrg } 8837ec681f3Smrg 8847ec681f3Smrg nir_schedule_mark_use(scoreboard, 8857ec681f3Smrg src->is_ssa ? (void *)src->ssa : (void *)src->reg.reg, 8867ec681f3Smrg src->parent_instr, 8877ec681f3Smrg nir_schedule_src_pressure(src)); 8887ec681f3Smrg 8897ec681f3Smrg return true; 8907ec681f3Smrg} 8917ec681f3Smrg 8927ec681f3Smrgstatic bool 8937ec681f3Smrgnir_schedule_mark_def_scheduled(nir_ssa_def *def, void *state) 8947ec681f3Smrg{ 8957ec681f3Smrg nir_schedule_scoreboard *scoreboard = state; 8967ec681f3Smrg 8977ec681f3Smrg nir_schedule_mark_use(scoreboard, def, def->parent_instr, 8987ec681f3Smrg nir_schedule_def_pressure(def)); 8997ec681f3Smrg 9007ec681f3Smrg return true; 9017ec681f3Smrg} 9027ec681f3Smrg 9037ec681f3Smrgstatic bool 9047ec681f3Smrgnir_schedule_mark_dest_scheduled(nir_dest *dest, void *state) 9057ec681f3Smrg{ 9067ec681f3Smrg nir_schedule_scoreboard *scoreboard = state; 9077ec681f3Smrg 9087ec681f3Smrg /* SSA defs were handled in nir_schedule_mark_def_scheduled() 9097ec681f3Smrg */ 9107ec681f3Smrg if (dest->is_ssa) 9117ec681f3Smrg return true; 9127ec681f3Smrg 9137ec681f3Smrg /* XXX: This is not actually accurate for regs -- the last use of a reg may 9147ec681f3Smrg * have a live interval that extends across control flow. We should 9157ec681f3Smrg * calculate the live ranges of regs, and have scheduler nodes for the CF 9167ec681f3Smrg * nodes that also "use" the reg. 9177ec681f3Smrg */ 9187ec681f3Smrg nir_schedule_mark_use(scoreboard, dest->reg.reg, 9197ec681f3Smrg dest->reg.parent_instr, 9207ec681f3Smrg nir_schedule_dest_pressure(dest)); 9217ec681f3Smrg 9227ec681f3Smrg return true; 9237ec681f3Smrg} 9247ec681f3Smrg 9257ec681f3Smrgstatic void 9267ec681f3Smrgnir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard, 9277ec681f3Smrg nir_schedule_node *n) 9287ec681f3Smrg{ 9297ec681f3Smrg nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard); 9307ec681f3Smrg nir_foreach_ssa_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard); 9317ec681f3Smrg nir_foreach_dest(n->instr, nir_schedule_mark_dest_scheduled, scoreboard); 9327ec681f3Smrg 9337ec681f3Smrg util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 9347ec681f3Smrg nir_schedule_node *child = (nir_schedule_node *)edge->child; 9357ec681f3Smrg 9367ec681f3Smrg child->ready_time = MAX2(child->ready_time, 9377ec681f3Smrg scoreboard->time + n->delay); 9387ec681f3Smrg 9397ec681f3Smrg if (child->dag.parent_count == 1) { 9407ec681f3Smrg if (debug) { 9417ec681f3Smrg fprintf(stderr, " New DAG head: "); 9427ec681f3Smrg nir_print_instr(child->instr, stderr); 9437ec681f3Smrg fprintf(stderr, "\n"); 9447ec681f3Smrg } 9457ec681f3Smrg } 9467ec681f3Smrg } 9477ec681f3Smrg 9487ec681f3Smrg dag_prune_head(scoreboard->dag, &n->dag); 9497ec681f3Smrg 9507ec681f3Smrg scoreboard->time = MAX2(n->ready_time, scoreboard->time); 9517ec681f3Smrg scoreboard->time++; 9527ec681f3Smrg} 9537ec681f3Smrg 9547ec681f3Smrgstatic void 9557ec681f3Smrgnir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block) 9567ec681f3Smrg{ 9577ec681f3Smrg while (!list_is_empty(&scoreboard->dag->heads)) { 9587ec681f3Smrg if (debug) { 9597ec681f3Smrg fprintf(stderr, "current list:\n"); 9607ec681f3Smrg dump_state(scoreboard); 9617ec681f3Smrg } 9627ec681f3Smrg 9637ec681f3Smrg nir_schedule_node *chosen; 9647ec681f3Smrg if (scoreboard->options->fallback) 9657ec681f3Smrg chosen = nir_schedule_choose_instruction_fallback(scoreboard); 9667ec681f3Smrg else if (scoreboard->pressure < scoreboard->options->threshold) 9677ec681f3Smrg chosen = nir_schedule_choose_instruction_csp(scoreboard); 9687ec681f3Smrg else 9697ec681f3Smrg chosen = nir_schedule_choose_instruction_csr(scoreboard); 9707ec681f3Smrg 9717ec681f3Smrg /* Now that we've scheduled a new instruction, some of its children may 9727ec681f3Smrg * be promoted to the list of instructions ready to be scheduled. 9737ec681f3Smrg */ 9747ec681f3Smrg nir_schedule_mark_node_scheduled(scoreboard, chosen); 9757ec681f3Smrg 9767ec681f3Smrg /* Move the instruction to the end (so our first chosen instructions are 9777ec681f3Smrg * the start of the program). 9787ec681f3Smrg */ 9797ec681f3Smrg exec_node_remove(&chosen->instr->node); 9807ec681f3Smrg exec_list_push_tail(&block->instr_list, &chosen->instr->node); 9817ec681f3Smrg 9827ec681f3Smrg if (debug) 9837ec681f3Smrg fprintf(stderr, "\n"); 9847ec681f3Smrg } 9857ec681f3Smrg} 9867ec681f3Smrg 9877ec681f3Smrgstatic uint32_t 9887ec681f3Smrgnir_schedule_get_delay(nir_instr *instr) 9897ec681f3Smrg{ 9907ec681f3Smrg switch (instr->type) { 9917ec681f3Smrg case nir_instr_type_ssa_undef: 9927ec681f3Smrg case nir_instr_type_load_const: 9937ec681f3Smrg case nir_instr_type_alu: 9947ec681f3Smrg case nir_instr_type_deref: 9957ec681f3Smrg case nir_instr_type_jump: 9967ec681f3Smrg case nir_instr_type_parallel_copy: 9977ec681f3Smrg case nir_instr_type_call: 9987ec681f3Smrg case nir_instr_type_phi: 9997ec681f3Smrg return 1; 10007ec681f3Smrg 10017ec681f3Smrg case nir_instr_type_intrinsic: 10027ec681f3Smrg /* XXX: Pick a large number for UBO/SSBO/image/shared loads */ 10037ec681f3Smrg return 1; 10047ec681f3Smrg 10057ec681f3Smrg case nir_instr_type_tex: 10067ec681f3Smrg /* Pick some large number to try to fetch textures early and sample them 10077ec681f3Smrg * late. 10087ec681f3Smrg */ 10097ec681f3Smrg return 100; 10107ec681f3Smrg } 10117ec681f3Smrg 10127ec681f3Smrg return 0; 10137ec681f3Smrg} 10147ec681f3Smrg 10157ec681f3Smrgstatic void 10167ec681f3Smrgnir_schedule_dag_max_delay_cb(struct dag_node *node, void *state) 10177ec681f3Smrg{ 10187ec681f3Smrg nir_schedule_node *n = (nir_schedule_node *)node; 10197ec681f3Smrg uint32_t max_delay = 0; 10207ec681f3Smrg 10217ec681f3Smrg util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) { 10227ec681f3Smrg nir_schedule_node *child = (nir_schedule_node *)edge->child; 10237ec681f3Smrg max_delay = MAX2(child->max_delay, max_delay); 10247ec681f3Smrg } 10257ec681f3Smrg 10267ec681f3Smrg n->max_delay = MAX2(n->max_delay, max_delay + n->delay); 10277ec681f3Smrg } 10287ec681f3Smrg 10297ec681f3Smrgstatic void 10307ec681f3Smrgnir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block) 10317ec681f3Smrg{ 10327ec681f3Smrg void *mem_ctx = ralloc_context(NULL); 10337ec681f3Smrg scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx); 10347ec681f3Smrg 10357ec681f3Smrg scoreboard->dag = dag_create(mem_ctx); 10367ec681f3Smrg 10377ec681f3Smrg nir_foreach_instr(instr, block) { 10387ec681f3Smrg nir_schedule_node *n = 10397ec681f3Smrg rzalloc(mem_ctx, nir_schedule_node); 10407ec681f3Smrg 10417ec681f3Smrg n->instr = instr; 10427ec681f3Smrg n->delay = nir_schedule_get_delay(instr); 10437ec681f3Smrg dag_init_node(scoreboard->dag, &n->dag); 10447ec681f3Smrg 10457ec681f3Smrg _mesa_hash_table_insert(scoreboard->instr_map, instr, n); 10467ec681f3Smrg } 10477ec681f3Smrg 10487ec681f3Smrg calculate_forward_deps(scoreboard, block); 10497ec681f3Smrg calculate_reverse_deps(scoreboard, block); 10507ec681f3Smrg 10517ec681f3Smrg dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL); 10527ec681f3Smrg 10537ec681f3Smrg nir_schedule_instructions(scoreboard, block); 10547ec681f3Smrg 10557ec681f3Smrg ralloc_free(mem_ctx); 10567ec681f3Smrg scoreboard->instr_map = NULL; 10577ec681f3Smrg} 10587ec681f3Smrg 10597ec681f3Smrgstatic bool 10607ec681f3Smrgnir_schedule_ssa_def_init_scoreboard(nir_ssa_def *def, void *state) 10617ec681f3Smrg{ 10627ec681f3Smrg nir_schedule_scoreboard *scoreboard = state; 10637ec681f3Smrg struct set *def_uses = _mesa_pointer_set_create(scoreboard); 10647ec681f3Smrg 10657ec681f3Smrg _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses); 10667ec681f3Smrg 10677ec681f3Smrg _mesa_set_add(def_uses, def->parent_instr); 10687ec681f3Smrg 10697ec681f3Smrg nir_foreach_use(src, def) { 10707ec681f3Smrg _mesa_set_add(def_uses, src->parent_instr); 10717ec681f3Smrg } 10727ec681f3Smrg 10737ec681f3Smrg /* XXX: Handle if uses */ 10747ec681f3Smrg 10757ec681f3Smrg return true; 10767ec681f3Smrg} 10777ec681f3Smrg 10787ec681f3Smrgstatic nir_schedule_scoreboard * 10797ec681f3Smrgnir_schedule_get_scoreboard(nir_shader *shader, 10807ec681f3Smrg const nir_schedule_options *options) 10817ec681f3Smrg{ 10827ec681f3Smrg nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard); 10837ec681f3Smrg 10847ec681f3Smrg scoreboard->shader = shader; 10857ec681f3Smrg scoreboard->live_values = _mesa_pointer_set_create(scoreboard); 10867ec681f3Smrg scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard); 10877ec681f3Smrg scoreboard->options = options; 10887ec681f3Smrg scoreboard->pressure = 0; 10897ec681f3Smrg 10907ec681f3Smrg nir_foreach_function(function, shader) { 10917ec681f3Smrg nir_foreach_register(reg, &function->impl->registers) { 10927ec681f3Smrg struct set *register_uses = 10937ec681f3Smrg _mesa_pointer_set_create(scoreboard); 10947ec681f3Smrg 10957ec681f3Smrg _mesa_hash_table_insert(scoreboard->remaining_uses, reg, register_uses); 10967ec681f3Smrg 10977ec681f3Smrg nir_foreach_use(src, reg) { 10987ec681f3Smrg _mesa_set_add(register_uses, src->parent_instr); 10997ec681f3Smrg } 11007ec681f3Smrg 11017ec681f3Smrg /* XXX: Handle if uses */ 11027ec681f3Smrg 11037ec681f3Smrg nir_foreach_def(dest, reg) { 11047ec681f3Smrg _mesa_set_add(register_uses, dest->reg.parent_instr); 11057ec681f3Smrg } 11067ec681f3Smrg } 11077ec681f3Smrg 11087ec681f3Smrg nir_foreach_block(block, function->impl) { 11097ec681f3Smrg nir_foreach_instr(instr, block) { 11107ec681f3Smrg nir_foreach_ssa_def(instr, nir_schedule_ssa_def_init_scoreboard, 11117ec681f3Smrg scoreboard); 11127ec681f3Smrg } 11137ec681f3Smrg 11147ec681f3Smrg /* XXX: We're ignoring if uses, which may prioritize scheduling other 11157ec681f3Smrg * uses of the if src even when it doesn't help. That's not many 11167ec681f3Smrg * values, though, so meh. 11177ec681f3Smrg */ 11187ec681f3Smrg } 11197ec681f3Smrg } 11207ec681f3Smrg 11217ec681f3Smrg return scoreboard; 11227ec681f3Smrg} 11237ec681f3Smrg 11247ec681f3Smrgstatic void 11257ec681f3Smrgnir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard) 11267ec681f3Smrg{ 11277ec681f3Smrg#ifdef NDEBUG 11287ec681f3Smrg return; 11297ec681f3Smrg#endif 11307ec681f3Smrg 11317ec681f3Smrg bool any_uses = false; 11327ec681f3Smrg 11337ec681f3Smrg hash_table_foreach(scoreboard->remaining_uses, entry) { 11347ec681f3Smrg struct set *remaining_uses = entry->data; 11357ec681f3Smrg 11367ec681f3Smrg set_foreach(remaining_uses, instr_entry) { 11377ec681f3Smrg if (!any_uses) { 11387ec681f3Smrg fprintf(stderr, "Tracked uses remain after scheduling. " 11397ec681f3Smrg "Affected instructions: \n"); 11407ec681f3Smrg any_uses = true; 11417ec681f3Smrg } 11427ec681f3Smrg nir_print_instr(instr_entry->key, stderr); 11437ec681f3Smrg fprintf(stderr, "\n"); 11447ec681f3Smrg } 11457ec681f3Smrg } 11467ec681f3Smrg 11477ec681f3Smrg assert(!any_uses); 11487ec681f3Smrg} 11497ec681f3Smrg 11507ec681f3Smrg/** 11517ec681f3Smrg * Schedules the NIR instructions to try to decrease stalls (for example, 11527ec681f3Smrg * delaying texture reads) while managing register pressure. 11537ec681f3Smrg * 11547ec681f3Smrg * The threshold represents "number of NIR register/SSA def channels live 11557ec681f3Smrg * before switching the scheduling heuristic to reduce register pressure", 11567ec681f3Smrg * since most of our GPU architectures are scalar (extending to vector with a 11577ec681f3Smrg * flag wouldn't be hard). This number should be a bit below the number of 11587ec681f3Smrg * registers available (counting any that may be occupied by system value 11597ec681f3Smrg * payload values, for example), since the heuristic may not always be able to 11607ec681f3Smrg * free a register immediately. The amount below the limit is up to you to 11617ec681f3Smrg * tune. 11627ec681f3Smrg */ 11637ec681f3Smrgvoid 11647ec681f3Smrgnir_schedule(nir_shader *shader, 11657ec681f3Smrg const nir_schedule_options *options) 11667ec681f3Smrg{ 11677ec681f3Smrg nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader, 11687ec681f3Smrg options); 11697ec681f3Smrg 11707ec681f3Smrg if (debug) { 11717ec681f3Smrg fprintf(stderr, "NIR shader before scheduling:\n"); 11727ec681f3Smrg nir_print_shader(shader, stderr); 11737ec681f3Smrg } 11747ec681f3Smrg 11757ec681f3Smrg nir_foreach_function(function, shader) { 11767ec681f3Smrg if (!function->impl) 11777ec681f3Smrg continue; 11787ec681f3Smrg 11797ec681f3Smrg nir_foreach_block(block, function->impl) { 11807ec681f3Smrg nir_schedule_block(scoreboard, block); 11817ec681f3Smrg } 11827ec681f3Smrg } 11837ec681f3Smrg 11847ec681f3Smrg nir_schedule_validate_uses(scoreboard); 11857ec681f3Smrg 11867ec681f3Smrg ralloc_free(scoreboard); 11877ec681f3Smrg} 1188