/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Jason Ekstrand (jason@jlekstrand.net)
 *
 */

#include "nir.h"
#include "nir_instr_set.h"

/*
 * Implements Global Code Motion.  A description of GCM can be found in
 * "Global Code Motion; Global Value Numbering" by Cliff Click.
 * Unfortunately, the algorithm presented in the paper is broken in a
 * number of ways.  The algorithm used here differs substantially from the
 * one in the paper but it is, in my opinion, much easier to read and to
 * verify for correctness.
 */
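/* As a rough sketch of what the pass achieves (the SSA names and shapes
 * here are made up for illustration, not taken from the paper or from any
 * real shader), GCM turns
 *
 *    loop {
 *       ssa_2 = fmul ssa_0, ssa_1    // loop-invariant
 *       ssa_3 = fadd ssa_2, ssa_4    // varies per iteration
 *       ...
 *    }
 *
 * into
 *
 *    ssa_2 = fmul ssa_0, ssa_1
 *    loop {
 *       ssa_3 = fadd ssa_2, ssa_4
 *       ...
 *    }
 *
 * by computing, for each movable instruction, the earliest block its
 * sources allow and the latest block its uses allow, then picking the
 * block in that range with the lowest loop depth.
 */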
/* This is used to stop GCM from moving instructions out of a loop if the
 * loop contains too many instructions and moving them would create excess
 * spilling.
 *
 * TODO: Figure out a better way to decide if we should remove instructions
 * from a loop.
 */
#define MAX_LOOP_INSTRUCTIONS 100

struct gcm_block_info {
   /* Number of loops this block is inside */
   unsigned loop_depth;

   unsigned loop_instr_count;

   /* The loop the block is nested inside or NULL */
   nir_loop *loop;

   /* The last instruction inserted into this block.  This is used as we
    * traverse the instructions and insert them back into the program to
    * put them in the right order.
    */
   nir_instr *last_instr;
};

struct gcm_instr_info {
   nir_block *early_block;
};

/* Flags used in the instr->pass_flags field for various instruction states */
enum {
   GCM_INSTR_PINNED =                (1 << 0),
   GCM_INSTR_SCHEDULE_EARLIER_ONLY = (1 << 1),
   GCM_INSTR_SCHEDULED_EARLY =       (1 << 2),
   GCM_INSTR_SCHEDULED_LATE =        (1 << 3),
   GCM_INSTR_PLACED =                (1 << 4),
};
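/* A note on how these flags flow through the pass (this just summarizes
 * the code below): gcm_pin_instructions() initializes pass_flags to 0,
 * GCM_INSTR_SCHEDULE_EARLIER_ONLY, GCM_INSTR_PINNED, or GCM_INSTR_PLACED.
 * A movable instruction then accumulates GCM_INSTR_SCHEDULED_EARLY during
 * the early pass, GCM_INSTR_SCHEDULED_LATE during the late pass, and
 * finally GCM_INSTR_PLACED once it has been inserted into its chosen
 * block.
 */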
struct gcm_state {
   nir_function_impl *impl;
   nir_instr *instr;

   bool progress;

   /* The list of non-pinned instructions.  As we do the late scheduling,
    * we pull non-pinned instructions out of their blocks and place them in
    * this list.  This saves us from having linked-list problems when we go
    * to put instructions back in their blocks.
    */
   struct exec_list instrs;

   struct gcm_block_info *blocks;

   unsigned num_instrs;
   struct gcm_instr_info *instr_infos;
};

static unsigned
get_loop_instr_count(struct exec_list *cf_list)
{
   unsigned loop_instr_count = 0;
   foreach_list_typed(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         nir_foreach_instr(instr, block) {
            loop_instr_count++;
         }
         break;
      }
      case nir_cf_node_if: {
         nir_if *if_stmt = nir_cf_node_as_if(node);
         loop_instr_count += get_loop_instr_count(&if_stmt->then_list);
         loop_instr_count += get_loop_instr_count(&if_stmt->else_list);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         loop_instr_count += get_loop_instr_count(&loop->body);
         break;
      }
      default:
         unreachable("Invalid CF node type");
      }
   }

   return loop_instr_count;
}

/* Recursively walks the CFG and builds the block_info structure */
static void
gcm_build_block_info(struct exec_list *cf_list, struct gcm_state *state,
                     nir_loop *loop, unsigned loop_depth,
                     unsigned loop_instr_count)
{
   foreach_list_typed(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         state->blocks[block->index].loop_depth = loop_depth;
         state->blocks[block->index].loop_instr_count = loop_instr_count;
         state->blocks[block->index].loop = loop;
         break;
      }
      case nir_cf_node_if: {
         nir_if *if_stmt = nir_cf_node_as_if(node);
         gcm_build_block_info(&if_stmt->then_list, state, loop, loop_depth, ~0u);
         gcm_build_block_info(&if_stmt->else_list, state, loop, loop_depth, ~0u);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         gcm_build_block_info(&loop->body, state, loop, loop_depth + 1,
                              get_loop_instr_count(&loop->body));
         break;
      }
      default:
         unreachable("Invalid CF node type");
      }
   }
}

static bool
is_src_scalarizable(nir_src *src)
{
   assert(src->is_ssa);

   nir_instr *src_instr = src->ssa->parent_instr;
   switch (src_instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *src_alu = nir_instr_as_alu(src_instr);

      /* ALU operations with output_size == 0 should be scalarized.  We
       * will also see a bunch of vecN operations from scalarizing ALU
       * operations and, since they can easily be copy-propagated, they
       * are ok too.
       */
      return nir_op_infos[src_alu->op].output_size == 0 ||
             src_alu->op == nir_op_vec2 ||
             src_alu->op == nir_op_vec3 ||
             src_alu->op == nir_op_vec4;
   }

   case nir_instr_type_load_const:
      /* These are trivially scalarizable */
      return true;

   case nir_instr_type_ssa_undef:
      return true;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *src_intrin = nir_instr_as_intrinsic(src_instr);

      switch (src_intrin->intrinsic) {
      case nir_intrinsic_load_deref: {
         /* Don't scalarize if we see a load of a local variable because it
          * might turn into one of the things we can't scalarize.
          */
         nir_deref_instr *deref = nir_src_as_deref(src_intrin->src[0]);
         return !nir_deref_mode_may_be(deref, (nir_var_function_temp |
                                               nir_var_shader_temp));
      }

      case nir_intrinsic_interp_deref_at_centroid:
      case nir_intrinsic_interp_deref_at_sample:
      case nir_intrinsic_interp_deref_at_offset:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_input:
         return true;
      default:
         break;
      }

      return false;
   }

   default:
      /* We can't scalarize this type of instruction */
      return false;
   }
}

static bool
is_binding_dynamically_uniform(nir_src src)
{
   nir_binding binding = nir_chase_binding(src);
   if (!binding.success)
      return false;

   for (unsigned i = 0; i < binding.num_indices; i++) {
      if (!nir_src_is_dynamically_uniform(binding.indices[i]))
         return false;
   }

   return true;
}
static void
pin_intrinsic(nir_intrinsic_instr *intrin)
{
   nir_instr *instr = &intrin->instr;

   if (!nir_intrinsic_can_reorder(intrin)) {
      instr->pass_flags = GCM_INSTR_PINNED;
      return;
   }

   instr->pass_flags = 0;

   /* If the intrinsic requires a uniform source, we can't safely move it
    * across non-uniform control flow if it's not uniform at the point it's
    * defined.  Stores and atomics can never be re-ordered, so we don't
    * have to consider them here.
    */
   bool non_uniform = nir_intrinsic_has_access(intrin) &&
                      (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM);
   if (!non_uniform &&
       (intrin->intrinsic == nir_intrinsic_load_ubo ||
        intrin->intrinsic == nir_intrinsic_load_ssbo ||
        intrin->intrinsic == nir_intrinsic_get_ubo_size ||
        intrin->intrinsic == nir_intrinsic_get_ssbo_size ||
        nir_intrinsic_has_image_dim(intrin) ||
        ((intrin->intrinsic == nir_intrinsic_load_deref ||
          intrin->intrinsic == nir_intrinsic_deref_buffer_array_length) &&
         nir_deref_mode_may_be(nir_src_as_deref(intrin->src[0]),
                               nir_var_mem_ubo | nir_var_mem_ssbo)))) {
      if (!is_binding_dynamically_uniform(intrin->src[0]))
         instr->pass_flags = GCM_INSTR_PINNED;
   } else if (intrin->intrinsic == nir_intrinsic_load_push_constant) {
      if (!nir_src_is_dynamically_uniform(intrin->src[0]))
         instr->pass_flags = GCM_INSTR_PINNED;
   } else if (intrin->intrinsic == nir_intrinsic_load_deref &&
              nir_deref_mode_is(nir_src_as_deref(intrin->src[0]),
                                nir_var_mem_push_const)) {
      nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
      while (deref->deref_type != nir_deref_type_var) {
         if ((deref->deref_type == nir_deref_type_array ||
              deref->deref_type == nir_deref_type_ptr_as_array) &&
             !nir_src_is_dynamically_uniform(deref->arr.index)) {
            instr->pass_flags = GCM_INSTR_PINNED;
            return;
         }
         deref = nir_deref_instr_parent(deref);
         if (!deref) {
            instr->pass_flags = GCM_INSTR_PINNED;
            return;
         }
      }
   }
}
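/* For example (a hypothetical shader fragment, made up to illustrate the
 * rule above): a load_ubo whose binding index comes from a load_const is
 * dynamically uniform, so it may be moved freely, whereas in
 *
 *    ssa_1 = load_input ...           // varies per invocation
 *    ssa_2 = load_ubo ssa_1, ssa_0    // non-uniform binding index
 *
 * the load_ubo gets GCM_INSTR_PINNED so it is never hoisted across
 * non-uniform control flow where the binding could diverge.
 */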
/* Walks the instruction list and marks immovable instructions as pinned or
 * placed.
 *
 * This function also serves to initialize the instr->pass_flags field.
 * After this is completed, all instructions' pass_flags fields will be set
 * to GCM_INSTR_PINNED, GCM_INSTR_PLACED, GCM_INSTR_SCHEDULE_EARLIER_ONLY
 * or 0.
 */
static void
gcm_pin_instructions(nir_function_impl *impl, struct gcm_state *state)
{
   state->num_instrs = 0;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         /* Index the instructions for use in gcm_state::instrs */
         instr->index = state->num_instrs++;

         switch (instr->type) {
         case nir_instr_type_alu:
            switch (nir_instr_as_alu(instr)->op) {
            case nir_op_fddx:
            case nir_op_fddy:
            case nir_op_fddx_fine:
            case nir_op_fddy_fine:
            case nir_op_fddx_coarse:
            case nir_op_fddy_coarse:
               /* These can only go in uniform control flow */
               instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;
               break;

            case nir_op_mov:
               if (!is_src_scalarizable(&(nir_instr_as_alu(instr)->src[0].src))) {
                  instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               }
               FALLTHROUGH;

            default:
               instr->pass_flags = 0;
               break;
            }
            break;

         case nir_instr_type_tex: {
            nir_tex_instr *tex = nir_instr_as_tex(instr);
            if (nir_tex_instr_has_implicit_derivative(tex))
               instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;

            for (unsigned i = 0; i < tex->num_srcs; i++) {
               nir_tex_src *src = &tex->src[i];
               switch (src->src_type) {
               case nir_tex_src_texture_deref:
                  if (!tex->texture_non_uniform && !is_binding_dynamically_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_sampler_deref:
                  if (!tex->sampler_non_uniform && !is_binding_dynamically_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_texture_offset:
               case nir_tex_src_texture_handle:
                  if (!tex->texture_non_uniform && !nir_src_is_dynamically_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_sampler_offset:
               case nir_tex_src_sampler_handle:
                  if (!tex->sampler_non_uniform && !nir_src_is_dynamically_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               default:
                  break;
               }
            }
            break;
         }

         case nir_instr_type_deref:
         case nir_instr_type_load_const:
            instr->pass_flags = 0;
            break;

         case nir_instr_type_intrinsic:
            pin_intrinsic(nir_instr_as_intrinsic(instr));
            break;

         case nir_instr_type_jump:
         case nir_instr_type_ssa_undef:
         case nir_instr_type_phi:
            instr->pass_flags = GCM_INSTR_PLACED;
            break;

         default:
            unreachable("Invalid instruction type in GCM");
         }
         if (!(instr->pass_flags & GCM_INSTR_PLACED)) {
            /* If this is an unplaced instruction, go ahead and pull it out
             * of the program and put it on the instrs list.  This has a
             * couple of benefits.  First, it makes the scheduling
             * algorithm more efficient because we can avoid walking over
             * basic blocks.  Second, it keeps us from causing linked list
             * confusion when we're trying to put everything in its proper
             * place at the end of the pass.
             *
             * Note that we don't use nir_instr_remove here because that
             * also cleans up uses and defs and we want to keep that
             * information.
             */
            exec_node_remove(&instr->node);
            exec_list_push_tail(&state->instrs, &instr->node);
         }
      }
   }
}

static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state);

/** Update an instruction's schedule for the given source
 *
 * This function is called iteratively as we walk the sources of an
 * instruction.  It ensures that the given source instruction has been
 * scheduled and then updates this instruction's block if the source
 * instruction is lower down the tree.
 */
static bool
gcm_schedule_early_src(nir_src *src, void *void_state)
{
   struct gcm_state *state = void_state;
   nir_instr *instr = state->instr;

   assert(src->is_ssa);

   gcm_schedule_early_instr(src->ssa->parent_instr, void_state);

   /* While the index isn't a proper dominance depth, it does have the
    * property that if A dominates B then A->index <= B->index.  Since we
    * know that this instruction must have been dominated by all of its
    * sources at some point (even if it's gone through value-numbering),
    * all of the sources must lie on the same branch of the dominance
    * tree.  Therefore, we can just go ahead and compare indices.
    */
   struct gcm_instr_info *src_info =
      &state->instr_infos[src->ssa->parent_instr->index];
   struct gcm_instr_info *info = &state->instr_infos[instr->index];
   if (info->early_block->index < src_info->early_block->index)
      info->early_block = src_info->early_block;

   /* We need to restore the state instruction because it may have been
    * changed through the gcm_schedule_early_instr call above.  We may
    * still be iterating through sources, and future calls to
    * gcm_schedule_early_src for the same instruction will still need it.
    */
   state->instr = instr;

   return true;
}
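/* A concrete illustration of the index comparison above (the block
 * indices are made up): if one source's early block is the start block
 * (index 0) and another's is a block inside a loop (say index 5), the two
 * blocks must lie on a single branch of the dominance tree, so the larger
 * index identifies the deeper block and the instruction's early block
 * becomes block 5.
 */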
/** Schedules an instruction early
 *
 * This function performs a recursive depth-first search starting at the
 * given instruction and proceeding through the sources to schedule
 * instructions as early as they can possibly go in the dominance tree.
 * The instructions are "scheduled" by updating the early_block field of
 * the corresponding gcm_instr_info entry.
 */
static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_EARLY)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_EARLY;

   /* Pinned/placed instructions always get scheduled in their original
    * block so we don't need to do anything.  Also, bailing here keeps us
    * from ever following the sources of phi nodes which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PINNED ||
       instr->pass_flags & GCM_INSTR_PLACED) {
      state->instr_infos[instr->index].early_block = instr->block;
      return;
   }

   /* Start with the instruction at the top.  As we iterate over the
    * sources, it will get moved down as needed.
    */
   state->instr_infos[instr->index].early_block = nir_start_block(state->impl);
   state->instr = instr;

   nir_foreach_src(instr, gcm_schedule_early_src, state);
}
static bool
set_block_for_loop_instr(struct gcm_state *state, nir_instr *instr,
                         nir_block *block)
{
   /* If the instruction wasn't in a loop to begin with we don't want to
    * push it down into one.
    */
   nir_loop *loop = state->blocks[instr->block->index].loop;
   if (loop == NULL)
      return true;

   if (nir_block_dominates(instr->block, block))
      return true;

   /* If the loop only executes a single time, i.e. it's wrapped in a
    *
    *    do { ... break; } while (true)
    *
    * don't move the instruction as it will not help anything.
    */
   if (loop->info->limiting_terminator == NULL && !loop->info->complex_loop &&
       nir_block_ends_in_break(nir_loop_last_block(loop)))
      return false;

   /* Being too aggressive with how we pull instructions out of loops can
    * result in extra register pressure and spilling.  For example, it's
    * fairly common for loops in compute shaders to calculate SSBO offsets
    * using the workgroup id, subgroup id and subgroup invocation; pulling
    * all of these calculations outside the loop causes register pressure.
    *
    * To work around these issues for now we only allow constant and
    * texture instructions to be moved outside their original loops, or
    * instructions where the total loop instruction count is less than
    * MAX_LOOP_INSTRUCTIONS.
    *
    * TODO: figure out some more heuristics to allow more to be moved out
    * of loops.
    */
   if (state->blocks[instr->block->index].loop_instr_count < MAX_LOOP_INSTRUCTIONS)
      return true;

   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_tex)
      return true;

   return false;
}

static nir_block *
gcm_choose_block_for_instr(nir_instr *instr, nir_block *early_block,
                           nir_block *late_block, struct gcm_state *state)
{
   assert(nir_block_dominates(early_block, late_block));

   nir_block *best = late_block;
   for (nir_block *block = late_block; block != NULL; block = block->imm_dom) {
      if (state->blocks[block->index].loop_depth <
          state->blocks[best->index].loop_depth &&
          set_block_for_loop_instr(state, instr, block))
         best = block;
      else if (block == instr->block)
         best = block;

      if (block == early_block)
         break;
   }

   return best;
}
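/* For instance (a made-up CFG to illustrate the walk above): with a
 * late_block at loop depth 2, an intermediate dominator at depth 1, and
 * an early_block at depth 0, the walk starts at the late block and
 * settles on the depth-0 block, unless set_block_for_loop_instr() vetoes
 * leaving the instruction's original loop, in which case the instruction
 * stays at the deepest acceptable block.
 */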
static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state);

/** Schedules the instruction associated with the given SSA def late
 *
 * This function works by first walking all of the uses of the given SSA
 * definition, ensuring that they are scheduled, and then computing the LCA
 * (least common ancestor) of its uses.  It then schedules this instruction
 * as close to the LCA as possible while trying to stay out of loops.
 */
static bool
gcm_schedule_late_def(nir_ssa_def *def, void *void_state)
{
   struct gcm_state *state = void_state;

   nir_block *lca = NULL;

   nir_foreach_use(use_src, def) {
      nir_instr *use_instr = use_src->parent_instr;

      gcm_schedule_late_instr(use_instr, state);

      /* Phi instructions are a bit special.  SSA definitions don't have to
       * dominate the sources of the phi nodes that use them; instead, they
       * have to dominate the predecessor block corresponding to the phi
       * source.  We handle this by looking through the sources, finding
       * any that are using this SSA def, and using those blocks instead
       * of the one the phi lives in.
       */
      if (use_instr->type == nir_instr_type_phi) {
         nir_phi_instr *phi = nir_instr_as_phi(use_instr);

         nir_foreach_phi_src(phi_src, phi) {
            if (phi_src->src.ssa == def)
               lca = nir_dominance_lca(lca, phi_src->pred);
         }
      } else {
         lca = nir_dominance_lca(lca, use_instr->block);
      }
   }

   nir_foreach_if_use(use_src, def) {
      nir_if *if_stmt = use_src->parent_if;

      /* For if statements, we consider the block to be the one immediately
       * preceding the if CF node.
       */
      nir_block *pred_block =
         nir_cf_node_as_block(nir_cf_node_prev(&if_stmt->cf_node));

      lca = nir_dominance_lca(lca, pred_block);
   }

   nir_block *early_block =
      state->instr_infos[def->parent_instr->index].early_block;

   /* Some instructions may never be used.  Flag them and the instruction
    * placement code will get rid of them for us.
    */
   if (lca == NULL) {
      def->parent_instr->block = NULL;
      return true;
   }

   if (def->parent_instr->pass_flags & GCM_INSTR_SCHEDULE_EARLIER_ONLY &&
       lca != def->parent_instr->block &&
       nir_block_dominates(def->parent_instr->block, lca)) {
      lca = def->parent_instr->block;
   }

   /* We now have the LCA of all of the uses.  If our invariants hold,
    * this is dominated by the block that we chose when scheduling early.
    * We now walk up the dominance tree and pick the lowest block that is
    * as far outside loops as we can get.
    */
   nir_block *best_block =
      gcm_choose_block_for_instr(def->parent_instr, early_block, lca, state);

   if (def->parent_instr->block != best_block)
      state->progress = true;

   def->parent_instr->block = best_block;

   return true;
}
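/* As a small example of the LCA rule above (a hypothetical CFG): if a def
 * is used only in the then-block and the else-block of an if, the
 * dominance-tree LCA of those two uses is the block immediately preceding
 * the if, so that is the latest block the def can be scheduled in.
 */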
/** Schedules an instruction late
 *
 * This function performs a depth-first search starting at the given
 * instruction and proceeding through its uses to schedule instructions as
 * late as they can reasonably go in the dominance tree.  The instructions
 * are "scheduled" by updating their instr->block field.
 *
 * The name of this function is actually a bit of a misnomer as it doesn't
 * schedule them "as late as possible" as the paper implies.  Instead, it
 * first finds the latest possible place it can schedule the instruction
 * and then possibly schedules it earlier than that.  The actual location
 * is as far down the tree as we can go while trying to stay out of loops.
 */
static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_LATE)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_LATE;

   /* Pinned/placed instructions are already scheduled so we don't need to
    * do anything.  Also, bailing here keeps us from ever following phi
    * nodes which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PLACED ||
       instr->pass_flags & GCM_INSTR_PINNED)
      return;

   nir_foreach_ssa_def(instr, gcm_schedule_late_def, state);
}

static bool
gcm_replace_def_with_undef(nir_ssa_def *def, void *void_state)
{
   struct gcm_state *state = void_state;

   if (nir_ssa_def_is_unused(def))
      return true;

   nir_ssa_undef_instr *undef =
      nir_ssa_undef_instr_create(state->impl->function->shader,
                                 def->num_components, def->bit_size);
   nir_instr_insert(nir_before_cf_list(&state->impl->body), &undef->instr);
   nir_ssa_def_rewrite_uses(def, &undef->def);

   return true;
}

/** Places an instruction back into the program
 *
 * The earlier passes of GCM simply choose blocks for each instruction and
 * otherwise leave them alone.  This pass actually places the instructions
 * into their chosen blocks.
 *
 * To do so, we simply insert instructions in the reverse order they were
 * extracted.  Instructions that are placed first end up at the end of
 * their new block, and each subsequently placed instruction is inserted
 * before the previously placed one, preserving the original program order
 * within each block.
 */
static void
gcm_place_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_PLACED)
      return;

   instr->pass_flags |= GCM_INSTR_PLACED;

   if (instr->block == NULL) {
      nir_foreach_ssa_def(instr, gcm_replace_def_with_undef, state);
      nir_instr_remove(instr);
      return;
   }

   struct gcm_block_info *block_info = &state->blocks[instr->block->index];
   exec_node_remove(&instr->node);

   if (block_info->last_instr) {
      exec_node_insert_node_before(&block_info->last_instr->node,
                                   &instr->node);
   } else {
      /* Schedule it at the end of the block */
      nir_instr *jump_instr = nir_block_last_instr(instr->block);
      if (jump_instr && jump_instr->type == nir_instr_type_jump) {
         exec_node_insert_node_before(&jump_instr->node, &instr->node);
      } else {
         exec_list_push_tail(&instr->block->instr_list, &instr->node);
      }
   }

   block_info->last_instr = instr;
}
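/* To see why the reverse walk works (sketching with made-up instruction
 * names): if A and B land in the same block and A was extracted before B,
 * then B is placed first and becomes last_instr at the end of the block,
 * and A is then inserted immediately before B, restoring the original
 * A-then-B order.
 */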
static bool
opt_gcm_impl(nir_shader *shader, nir_function_impl *impl, bool value_number)
{
   nir_metadata_require(impl, nir_metadata_block_index |
                              nir_metadata_dominance);
   nir_metadata_require(impl, nir_metadata_loop_analysis,
                        shader->options->force_indirect_unrolling);

   /* A previous pass may have left pass_flags dirty, so clear it all out. */
   nir_foreach_block(block, impl)
      nir_foreach_instr(instr, block)
         instr->pass_flags = 0;

   struct gcm_state state;

   state.impl = impl;
   state.instr = NULL;
   state.progress = false;
   exec_list_make_empty(&state.instrs);
   state.blocks = rzalloc_array(NULL, struct gcm_block_info, impl->num_blocks);

   gcm_build_block_info(&impl->body, &state, NULL, 0, ~0u);

   gcm_pin_instructions(impl, &state);

   state.instr_infos =
      rzalloc_array(NULL, struct gcm_instr_info, state.num_instrs);

   if (value_number) {
      struct set *gvn_set = nir_instr_set_create(NULL);
      foreach_list_typed_safe(nir_instr, instr, node, &state.instrs) {
         if (instr->pass_flags & GCM_INSTR_PINNED)
            continue;

         if (nir_instr_set_add_or_rewrite(gvn_set, instr, NULL))
            state.progress = true;
      }
      nir_instr_set_destroy(gvn_set);
   }

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_early_instr(instr, &state);

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_late_instr(instr, &state);

   while (!exec_list_is_empty(&state.instrs)) {
      nir_instr *instr = exec_node_data(nir_instr,
                                        state.instrs.tail_sentinel.prev, node);
      gcm_place_instr(instr, &state);
   }

   ralloc_free(state.blocks);
   ralloc_free(state.instr_infos);

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance |
                               nir_metadata_loop_analysis);

   return state.progress;
}

bool
nir_opt_gcm(nir_shader *shader, bool value_number)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= opt_gcm_impl(shader, function->impl, value_number);
   }

   return progress;
}
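/* Example usage (hypothetical driver code, not part of this file): GCM is
 * typically run inside a shader's optimization loop so that newly exposed
 * opportunities from other passes get picked up, e.g.
 *
 *    bool progress;
 *    do {
 *       progress = false;
 *       progress |= nir_opt_gcm(shader, true);
 *       progress |= nir_opt_dce(shader);
 *    } while (progress);
 *
 * Passing value_number = true also performs global value numbering on the
 * unpinned instructions before they are scheduled.
 */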