101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2014 Intel Corporation 301e04c3fSmrg * 401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 501e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 601e04c3fSmrg * to deal in the Software without restriction, including without limitation 701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 901e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1001e04c3fSmrg * 1101e04c3fSmrg * The above copyright notice and this permission notice (including the next 1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1301e04c3fSmrg * Software. 1401e04c3fSmrg * 1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2101e04c3fSmrg * IN THE SOFTWARE. 2201e04c3fSmrg * 2301e04c3fSmrg * Authors: 2401e04c3fSmrg * Jason Ekstrand (jason@jlekstrand.net) 2501e04c3fSmrg * 2601e04c3fSmrg */ 2701e04c3fSmrg 2801e04c3fSmrg#include "nir.h" 2901e04c3fSmrg#include "nir_builder.h" 3001e04c3fSmrg#include "nir_vla.h" 3101e04c3fSmrg 3201e04c3fSmrg/* 3301e04c3fSmrg * This file implements an out-of-SSA pass as described in "Revisiting 3401e04c3fSmrg * Out-of-SSA Translation for Correctness, Code Quality, and Efficiency" by 3501e04c3fSmrg * Boissinot et al. 
3601e04c3fSmrg */ 3701e04c3fSmrg 3801e04c3fSmrgstruct from_ssa_state { 3901e04c3fSmrg nir_builder builder; 4001e04c3fSmrg void *dead_ctx; 417ec681f3Smrg struct exec_list dead_instrs; 4201e04c3fSmrg bool phi_webs_only; 4301e04c3fSmrg struct hash_table *merge_node_table; 4401e04c3fSmrg nir_instr *instr; 4501e04c3fSmrg bool progress; 4601e04c3fSmrg}; 4701e04c3fSmrg 487ec681f3Smrg/* Returns if def @a comes after def @b. 497ec681f3Smrg * 507ec681f3Smrg * The core observation that makes the Boissinot algorithm efficient 517ec681f3Smrg * is that, given two properly sorted sets, we can check for 527ec681f3Smrg * interference in these sets via a linear walk. This is accomplished 537ec681f3Smrg * by doing single combined walk over union of the two sets in DFS 547ec681f3Smrg * order. It doesn't matter what DFS we do so long as we're 557ec681f3Smrg * consistent. Fortunately, the dominance algorithm we ran prior to 567ec681f3Smrg * this pass did such a walk and recorded the pre- and post-indices in 577ec681f3Smrg * the blocks. 587ec681f3Smrg * 597ec681f3Smrg * We treat SSA undefs as always coming before other instruction types. 607ec681f3Smrg */ 617ec681f3Smrgstatic bool 627ec681f3Smrgdef_after(nir_ssa_def *a, nir_ssa_def *b) 637ec681f3Smrg{ 647ec681f3Smrg if (a->parent_instr->type == nir_instr_type_ssa_undef) 657ec681f3Smrg return false; 667ec681f3Smrg 677ec681f3Smrg if (b->parent_instr->type == nir_instr_type_ssa_undef) 687ec681f3Smrg return true; 697ec681f3Smrg 707ec681f3Smrg /* If they're in the same block, we can rely on whichever instruction 717ec681f3Smrg * comes first in the block. 
727ec681f3Smrg */ 737ec681f3Smrg if (a->parent_instr->block == b->parent_instr->block) 747ec681f3Smrg return a->parent_instr->index > b->parent_instr->index; 757ec681f3Smrg 767ec681f3Smrg /* Otherwise, if blocks are distinct, we sort them in DFS pre-order */ 777ec681f3Smrg return a->parent_instr->block->dom_pre_index > 787ec681f3Smrg b->parent_instr->block->dom_pre_index; 797ec681f3Smrg} 807ec681f3Smrg 8101e04c3fSmrg/* Returns true if a dominates b */ 8201e04c3fSmrgstatic bool 8301e04c3fSmrgssa_def_dominates(nir_ssa_def *a, nir_ssa_def *b) 8401e04c3fSmrg{ 857ec681f3Smrg if (a->parent_instr->type == nir_instr_type_ssa_undef) { 8601e04c3fSmrg /* SSA undefs always dominate */ 8701e04c3fSmrg return true; 887ec681f3Smrg } if (def_after(a, b)) { 8901e04c3fSmrg return false; 9001e04c3fSmrg } else if (a->parent_instr->block == b->parent_instr->block) { 917ec681f3Smrg return def_after(b, a); 9201e04c3fSmrg } else { 9301e04c3fSmrg return nir_block_dominates(a->parent_instr->block, 9401e04c3fSmrg b->parent_instr->block); 9501e04c3fSmrg } 9601e04c3fSmrg} 9701e04c3fSmrg 9801e04c3fSmrg 9901e04c3fSmrg/* The following data structure, which I have named merge_set is a way of 10001e04c3fSmrg * representing a set registers of non-interfering registers. This is 10101e04c3fSmrg * based on the concept of a "dominance forest" presented in "Fast Copy 10201e04c3fSmrg * Coalescing and Live-Range Identification" by Budimlic et al. but the 10301e04c3fSmrg * implementation concept is taken from "Revisiting Out-of-SSA Translation 10401e04c3fSmrg * for Correctness, Code Quality, and Efficiency" by Boissinot et al. 10501e04c3fSmrg * 10601e04c3fSmrg * Each SSA definition is associated with a merge_node and the association 10701e04c3fSmrg * is represented by a combination of a hash table and the "def" parameter 10801e04c3fSmrg * in the merge_node structure. The merge_set stores a linked list of 1097ec681f3Smrg * merge_nodes, ordered by a pre-order DFS walk of the dominance tree. 
(Since 1107ec681f3Smrg * the liveness analysis pass indexes the SSA values in dominance order for 1117ec681f3Smrg * us, this is an easy thing to keep up.) It is assumed that no pair of the 11201e04c3fSmrg * nodes in a given set interfere. Merging two sets or checking for 11301e04c3fSmrg * interference can be done in a single linear-time merge-sort walk of the 11401e04c3fSmrg * two lists of nodes. 11501e04c3fSmrg */ 11601e04c3fSmrgstruct merge_set; 11701e04c3fSmrg 11801e04c3fSmrgtypedef struct { 11901e04c3fSmrg struct exec_node node; 12001e04c3fSmrg struct merge_set *set; 12101e04c3fSmrg nir_ssa_def *def; 12201e04c3fSmrg} merge_node; 12301e04c3fSmrg 12401e04c3fSmrgtypedef struct merge_set { 12501e04c3fSmrg struct exec_list nodes; 12601e04c3fSmrg unsigned size; 1277ec681f3Smrg bool divergent; 12801e04c3fSmrg nir_register *reg; 12901e04c3fSmrg} merge_set; 13001e04c3fSmrg 13101e04c3fSmrg#if 0 13201e04c3fSmrgstatic void 13301e04c3fSmrgmerge_set_dump(merge_set *set, FILE *fp) 13401e04c3fSmrg{ 13501e04c3fSmrg nir_ssa_def *dom[set->size]; 13601e04c3fSmrg int dom_idx = -1; 13701e04c3fSmrg 13801e04c3fSmrg foreach_list_typed(merge_node, node, node, &set->nodes) { 13901e04c3fSmrg while (dom_idx >= 0 && !ssa_def_dominates(dom[dom_idx], node->def)) 14001e04c3fSmrg dom_idx--; 14101e04c3fSmrg 14201e04c3fSmrg for (int i = 0; i <= dom_idx; i++) 14301e04c3fSmrg fprintf(fp, " "); 14401e04c3fSmrg 1457ec681f3Smrg fprintf(fp, "ssa_%d\n", node->def->index); 14601e04c3fSmrg 14701e04c3fSmrg dom[++dom_idx] = node->def; 14801e04c3fSmrg } 14901e04c3fSmrg} 15001e04c3fSmrg#endif 15101e04c3fSmrg 15201e04c3fSmrgstatic merge_node * 15301e04c3fSmrgget_merge_node(nir_ssa_def *def, struct from_ssa_state *state) 15401e04c3fSmrg{ 15501e04c3fSmrg struct hash_entry *entry = 15601e04c3fSmrg _mesa_hash_table_search(state->merge_node_table, def); 15701e04c3fSmrg if (entry) 15801e04c3fSmrg return entry->data; 15901e04c3fSmrg 16001e04c3fSmrg merge_set *set = ralloc(state->dead_ctx, merge_set); 16101e04c3fSmrg 
exec_list_make_empty(&set->nodes); 16201e04c3fSmrg set->size = 1; 1637ec681f3Smrg set->divergent = def->divergent; 16401e04c3fSmrg set->reg = NULL; 16501e04c3fSmrg 16601e04c3fSmrg merge_node *node = ralloc(state->dead_ctx, merge_node); 16701e04c3fSmrg node->set = set; 16801e04c3fSmrg node->def = def; 16901e04c3fSmrg exec_list_push_head(&set->nodes, &node->node); 17001e04c3fSmrg 17101e04c3fSmrg _mesa_hash_table_insert(state->merge_node_table, def, node); 17201e04c3fSmrg 17301e04c3fSmrg return node; 17401e04c3fSmrg} 17501e04c3fSmrg 17601e04c3fSmrgstatic bool 17701e04c3fSmrgmerge_nodes_interfere(merge_node *a, merge_node *b) 17801e04c3fSmrg{ 1797ec681f3Smrg /* There's no need to check for interference within the same set, 1807ec681f3Smrg * because we assume, that sets themselves are already 1817ec681f3Smrg * interference-free. 1827ec681f3Smrg */ 1837ec681f3Smrg if (a->set == b->set) 1847ec681f3Smrg return false; 1857ec681f3Smrg 18601e04c3fSmrg return nir_ssa_defs_interfere(a->def, b->def); 18701e04c3fSmrg} 18801e04c3fSmrg 1897ec681f3Smrg/* Merges b into a 1907ec681f3Smrg * 1917ec681f3Smrg * This algorithm uses def_after to ensure that the sets always stay in the 1927ec681f3Smrg * same order as the pre-order DFS done by the liveness algorithm. 
1937ec681f3Smrg */ 19401e04c3fSmrgstatic merge_set * 19501e04c3fSmrgmerge_merge_sets(merge_set *a, merge_set *b) 19601e04c3fSmrg{ 19701e04c3fSmrg struct exec_node *an = exec_list_get_head(&a->nodes); 19801e04c3fSmrg struct exec_node *bn = exec_list_get_head(&b->nodes); 19901e04c3fSmrg while (!exec_node_is_tail_sentinel(bn)) { 20001e04c3fSmrg merge_node *a_node = exec_node_data(merge_node, an, node); 20101e04c3fSmrg merge_node *b_node = exec_node_data(merge_node, bn, node); 20201e04c3fSmrg 20301e04c3fSmrg if (exec_node_is_tail_sentinel(an) || 2047ec681f3Smrg def_after(a_node->def, b_node->def)) { 20501e04c3fSmrg struct exec_node *next = bn->next; 20601e04c3fSmrg exec_node_remove(bn); 20701e04c3fSmrg exec_node_insert_node_before(an, bn); 20801e04c3fSmrg exec_node_data(merge_node, bn, node)->set = a; 20901e04c3fSmrg bn = next; 21001e04c3fSmrg } else { 21101e04c3fSmrg an = an->next; 21201e04c3fSmrg } 21301e04c3fSmrg } 21401e04c3fSmrg 21501e04c3fSmrg a->size += b->size; 21601e04c3fSmrg b->size = 0; 2177ec681f3Smrg a->divergent |= b->divergent; 21801e04c3fSmrg 21901e04c3fSmrg return a; 22001e04c3fSmrg} 22101e04c3fSmrg 22201e04c3fSmrg/* Checks for any interference between two merge sets 22301e04c3fSmrg * 22401e04c3fSmrg * This is an implementation of Algorithm 2 in "Revisiting Out-of-SSA 22501e04c3fSmrg * Translation for Correctness, Code Quality, and Efficiency" by 22601e04c3fSmrg * Boissinot et al. 22701e04c3fSmrg */ 22801e04c3fSmrgstatic bool 22901e04c3fSmrgmerge_sets_interfere(merge_set *a, merge_set *b) 23001e04c3fSmrg{ 2317ec681f3Smrg /* List of all the nodes which dominate the current node, in dominance 2327ec681f3Smrg * order. 
2337ec681f3Smrg */ 23401e04c3fSmrg NIR_VLA(merge_node *, dom, a->size + b->size); 23501e04c3fSmrg int dom_idx = -1; 23601e04c3fSmrg 23701e04c3fSmrg struct exec_node *an = exec_list_get_head(&a->nodes); 23801e04c3fSmrg struct exec_node *bn = exec_list_get_head(&b->nodes); 23901e04c3fSmrg while (!exec_node_is_tail_sentinel(an) || 24001e04c3fSmrg !exec_node_is_tail_sentinel(bn)) { 24101e04c3fSmrg 2427ec681f3Smrg /* We walk the union of the two sets in the same order as the pre-order 2437ec681f3Smrg * DFS done by liveness analysis. 2447ec681f3Smrg */ 24501e04c3fSmrg merge_node *current; 24601e04c3fSmrg if (exec_node_is_tail_sentinel(an)) { 24701e04c3fSmrg current = exec_node_data(merge_node, bn, node); 24801e04c3fSmrg bn = bn->next; 24901e04c3fSmrg } else if (exec_node_is_tail_sentinel(bn)) { 25001e04c3fSmrg current = exec_node_data(merge_node, an, node); 25101e04c3fSmrg an = an->next; 25201e04c3fSmrg } else { 25301e04c3fSmrg merge_node *a_node = exec_node_data(merge_node, an, node); 25401e04c3fSmrg merge_node *b_node = exec_node_data(merge_node, bn, node); 25501e04c3fSmrg 2567ec681f3Smrg if (def_after(b_node->def, a_node->def)) { 25701e04c3fSmrg current = a_node; 25801e04c3fSmrg an = an->next; 25901e04c3fSmrg } else { 26001e04c3fSmrg current = b_node; 26101e04c3fSmrg bn = bn->next; 26201e04c3fSmrg } 26301e04c3fSmrg } 26401e04c3fSmrg 2657ec681f3Smrg /* Because our walk is a pre-order DFS, we can maintain the list of 2667ec681f3Smrg * dominating nodes as a simple stack, pushing every node onto the list 2677ec681f3Smrg * after we visit it and popping any non-dominating nodes off before we 2687ec681f3Smrg * visit the current node. 2697ec681f3Smrg */ 27001e04c3fSmrg while (dom_idx >= 0 && 27101e04c3fSmrg !ssa_def_dominates(dom[dom_idx]->def, current->def)) 27201e04c3fSmrg dom_idx--; 27301e04c3fSmrg 2747ec681f3Smrg /* There are three invariants of this algorithm that are important here: 2757ec681f3Smrg * 2767ec681f3Smrg * 1. 
There is no interference within either set a or set b. 2777ec681f3Smrg * 2. None of the nodes processed up until this point interfere. 2787ec681f3Smrg * 3. All the dominators of `current` have been processed 2797ec681f3Smrg * 2807ec681f3Smrg * Because of these invariants, we only need to check the current node 2817ec681f3Smrg * against its minimal dominator. If any other node N in the union 2827ec681f3Smrg * interferes with current, then N must dominate current because we are 2837ec681f3Smrg * in SSA form. If N dominates current then it must also dominate our 2847ec681f3Smrg * minimal dominator dom[dom_idx]. Since N is live at current it must 2857ec681f3Smrg * also be live at the minimal dominator which means N interferes with 2867ec681f3Smrg * the minimal dominator dom[dom_idx] and, by invariants 2 and 3 above, 2877ec681f3Smrg * the algorithm would have already terminated. Therefore, if we got 2887ec681f3Smrg * here, the only node that can possibly interfere with current is the 2897ec681f3Smrg * minimal dominator dom[dom_idx]. 2907ec681f3Smrg * 2917ec681f3Smrg * This is what allows us to do a interference check of the union of the 2927ec681f3Smrg * two sets with a single linear-time walk. 
2937ec681f3Smrg */ 29401e04c3fSmrg if (dom_idx >= 0 && merge_nodes_interfere(current, dom[dom_idx])) 29501e04c3fSmrg return true; 29601e04c3fSmrg 29701e04c3fSmrg dom[++dom_idx] = current; 29801e04c3fSmrg } 29901e04c3fSmrg 30001e04c3fSmrg return false; 30101e04c3fSmrg} 30201e04c3fSmrg 30301e04c3fSmrgstatic bool 3047ec681f3Smrgadd_parallel_copy_to_end_of_block(nir_shader *shader, nir_block *block, void *dead_ctx) 30501e04c3fSmrg{ 30601e04c3fSmrg bool need_end_copy = false; 30701e04c3fSmrg if (block->successors[0]) { 30801e04c3fSmrg nir_instr *instr = nir_block_first_instr(block->successors[0]); 30901e04c3fSmrg if (instr && instr->type == nir_instr_type_phi) 31001e04c3fSmrg need_end_copy = true; 31101e04c3fSmrg } 31201e04c3fSmrg 31301e04c3fSmrg if (block->successors[1]) { 31401e04c3fSmrg nir_instr *instr = nir_block_first_instr(block->successors[1]); 31501e04c3fSmrg if (instr && instr->type == nir_instr_type_phi) 31601e04c3fSmrg need_end_copy = true; 31701e04c3fSmrg } 31801e04c3fSmrg 31901e04c3fSmrg if (need_end_copy) { 32001e04c3fSmrg /* If one of our successors has at least one phi node, we need to 32101e04c3fSmrg * create a parallel copy at the end of the block but before the jump 32201e04c3fSmrg * (if there is one). 32301e04c3fSmrg */ 32401e04c3fSmrg nir_parallel_copy_instr *pcopy = 3257ec681f3Smrg nir_parallel_copy_instr_create(shader); 32601e04c3fSmrg 32701e04c3fSmrg nir_instr_insert(nir_after_block_before_jump(block), &pcopy->instr); 32801e04c3fSmrg } 32901e04c3fSmrg 33001e04c3fSmrg return true; 33101e04c3fSmrg} 33201e04c3fSmrg 33301e04c3fSmrgstatic nir_parallel_copy_instr * 33401e04c3fSmrgget_parallel_copy_at_end_of_block(nir_block *block) 33501e04c3fSmrg{ 33601e04c3fSmrg nir_instr *last_instr = nir_block_last_instr(block); 33701e04c3fSmrg if (last_instr == NULL) 33801e04c3fSmrg return NULL; 33901e04c3fSmrg 34001e04c3fSmrg /* The last instruction may be a jump in which case the parallel copy is 34101e04c3fSmrg * right before it. 
34201e04c3fSmrg */ 34301e04c3fSmrg if (last_instr->type == nir_instr_type_jump) 34401e04c3fSmrg last_instr = nir_instr_prev(last_instr); 34501e04c3fSmrg 34601e04c3fSmrg if (last_instr && last_instr->type == nir_instr_type_parallel_copy) 34701e04c3fSmrg return nir_instr_as_parallel_copy(last_instr); 34801e04c3fSmrg else 34901e04c3fSmrg return NULL; 35001e04c3fSmrg} 35101e04c3fSmrg 35201e04c3fSmrg/** Isolate phi nodes with parallel copies 35301e04c3fSmrg * 35401e04c3fSmrg * In order to solve the dependency problems with the sources and 35501e04c3fSmrg * destinations of phi nodes, we first isolate them by adding parallel 35601e04c3fSmrg * copies to the beginnings and ends of basic blocks. For every block with 35701e04c3fSmrg * phi nodes, we add a parallel copy immediately following the last phi 35801e04c3fSmrg * node that copies the destinations of all of the phi nodes to new SSA 35901e04c3fSmrg * values. We also add a parallel copy to the end of every block that has 36001e04c3fSmrg * a successor with phi nodes that, for each phi node in each successor, 36101e04c3fSmrg * copies the corresponding sorce of the phi node and adjust the phi to 36201e04c3fSmrg * used the destination of the parallel copy. 36301e04c3fSmrg * 36401e04c3fSmrg * In SSA form, each value has exactly one definition. What this does is 36501e04c3fSmrg * ensure that each value used in a phi also has exactly one use. The 36601e04c3fSmrg * destinations of phis are only used by the parallel copy immediately 36701e04c3fSmrg * following the phi nodes and. Thanks to the parallel copy at the end of 36801e04c3fSmrg * the predecessor block, the sources of phi nodes are are the only use of 36901e04c3fSmrg * that value. This allows us to immediately assign all the sources and 37001e04c3fSmrg * destinations of any given phi node to the same register without worrying 37101e04c3fSmrg * about interference at all. We do coalescing to get rid of the parallel 37201e04c3fSmrg * copies where possible. 
37301e04c3fSmrg * 37401e04c3fSmrg * Before this pass can be run, we have to iterate over the blocks with 37501e04c3fSmrg * add_parallel_copy_to_end_of_block to ensure that the parallel copies at 37601e04c3fSmrg * the ends of blocks exist. We can create the ones at the beginnings as 37701e04c3fSmrg * we go, but the ones at the ends of blocks need to be created ahead of 37801e04c3fSmrg * time because of potential back-edges in the CFG. 37901e04c3fSmrg */ 38001e04c3fSmrgstatic bool 3817ec681f3Smrgisolate_phi_nodes_block(nir_shader *shader, nir_block *block, void *dead_ctx) 38201e04c3fSmrg{ 38301e04c3fSmrg nir_instr *last_phi_instr = NULL; 38401e04c3fSmrg nir_foreach_instr(instr, block) { 38501e04c3fSmrg /* Phi nodes only ever come at the start of a block */ 38601e04c3fSmrg if (instr->type != nir_instr_type_phi) 38701e04c3fSmrg break; 38801e04c3fSmrg 38901e04c3fSmrg last_phi_instr = instr; 39001e04c3fSmrg } 39101e04c3fSmrg 39201e04c3fSmrg /* If we don't have any phis, then there's nothing for us to do. */ 39301e04c3fSmrg if (last_phi_instr == NULL) 39401e04c3fSmrg return true; 39501e04c3fSmrg 39601e04c3fSmrg /* If we have phi nodes, we need to create a parallel copy at the 39701e04c3fSmrg * start of this block but after the phi nodes. 
39801e04c3fSmrg */ 39901e04c3fSmrg nir_parallel_copy_instr *block_pcopy = 4007ec681f3Smrg nir_parallel_copy_instr_create(shader); 40101e04c3fSmrg nir_instr_insert_after(last_phi_instr, &block_pcopy->instr); 40201e04c3fSmrg 40301e04c3fSmrg nir_foreach_instr(instr, block) { 40401e04c3fSmrg /* Phi nodes only ever come at the start of a block */ 40501e04c3fSmrg if (instr->type != nir_instr_type_phi) 40601e04c3fSmrg break; 40701e04c3fSmrg 40801e04c3fSmrg nir_phi_instr *phi = nir_instr_as_phi(instr); 40901e04c3fSmrg assert(phi->dest.is_ssa); 41001e04c3fSmrg nir_foreach_phi_src(src, phi) { 41101e04c3fSmrg nir_parallel_copy_instr *pcopy = 41201e04c3fSmrg get_parallel_copy_at_end_of_block(src->pred); 41301e04c3fSmrg assert(pcopy); 41401e04c3fSmrg 41501e04c3fSmrg nir_parallel_copy_entry *entry = rzalloc(dead_ctx, 41601e04c3fSmrg nir_parallel_copy_entry); 41701e04c3fSmrg nir_ssa_dest_init(&pcopy->instr, &entry->dest, 41801e04c3fSmrg phi->dest.ssa.num_components, 4197ec681f3Smrg phi->dest.ssa.bit_size, NULL); 4207ec681f3Smrg entry->dest.ssa.divergent = nir_src_is_divergent(src->src); 42101e04c3fSmrg exec_list_push_tail(&pcopy->entries, &entry->node); 42201e04c3fSmrg 42301e04c3fSmrg assert(src->src.is_ssa); 42401e04c3fSmrg nir_instr_rewrite_src(&pcopy->instr, &entry->src, src->src); 42501e04c3fSmrg 42601e04c3fSmrg nir_instr_rewrite_src(&phi->instr, &src->src, 42701e04c3fSmrg nir_src_for_ssa(&entry->dest.ssa)); 42801e04c3fSmrg } 42901e04c3fSmrg 43001e04c3fSmrg nir_parallel_copy_entry *entry = rzalloc(dead_ctx, 43101e04c3fSmrg nir_parallel_copy_entry); 43201e04c3fSmrg nir_ssa_dest_init(&block_pcopy->instr, &entry->dest, 43301e04c3fSmrg phi->dest.ssa.num_components, phi->dest.ssa.bit_size, 4347ec681f3Smrg NULL); 4357ec681f3Smrg entry->dest.ssa.divergent = phi->dest.ssa.divergent; 43601e04c3fSmrg exec_list_push_tail(&block_pcopy->entries, &entry->node); 43701e04c3fSmrg 43801e04c3fSmrg nir_ssa_def_rewrite_uses(&phi->dest.ssa, 4397ec681f3Smrg &entry->dest.ssa); 44001e04c3fSmrg 
44101e04c3fSmrg nir_instr_rewrite_src(&block_pcopy->instr, &entry->src, 44201e04c3fSmrg nir_src_for_ssa(&phi->dest.ssa)); 44301e04c3fSmrg } 44401e04c3fSmrg 44501e04c3fSmrg return true; 44601e04c3fSmrg} 44701e04c3fSmrg 44801e04c3fSmrgstatic bool 44901e04c3fSmrgcoalesce_phi_nodes_block(nir_block *block, struct from_ssa_state *state) 45001e04c3fSmrg{ 45101e04c3fSmrg nir_foreach_instr(instr, block) { 45201e04c3fSmrg /* Phi nodes only ever come at the start of a block */ 45301e04c3fSmrg if (instr->type != nir_instr_type_phi) 45401e04c3fSmrg break; 45501e04c3fSmrg 45601e04c3fSmrg nir_phi_instr *phi = nir_instr_as_phi(instr); 45701e04c3fSmrg 45801e04c3fSmrg assert(phi->dest.is_ssa); 45901e04c3fSmrg merge_node *dest_node = get_merge_node(&phi->dest.ssa, state); 46001e04c3fSmrg 46101e04c3fSmrg nir_foreach_phi_src(src, phi) { 46201e04c3fSmrg assert(src->src.is_ssa); 46301e04c3fSmrg merge_node *src_node = get_merge_node(src->src.ssa, state); 46401e04c3fSmrg if (src_node->set != dest_node->set) 46501e04c3fSmrg merge_merge_sets(dest_node->set, src_node->set); 46601e04c3fSmrg } 46701e04c3fSmrg } 46801e04c3fSmrg 46901e04c3fSmrg return true; 47001e04c3fSmrg} 47101e04c3fSmrg 47201e04c3fSmrgstatic void 47301e04c3fSmrgaggressive_coalesce_parallel_copy(nir_parallel_copy_instr *pcopy, 47401e04c3fSmrg struct from_ssa_state *state) 47501e04c3fSmrg{ 47601e04c3fSmrg nir_foreach_parallel_copy_entry(entry, pcopy) { 47701e04c3fSmrg if (!entry->src.is_ssa) 47801e04c3fSmrg continue; 47901e04c3fSmrg 48001e04c3fSmrg /* Since load_const instructions are SSA only, we can't replace their 48101e04c3fSmrg * destinations with registers and, therefore, can't coalesce them. 
48201e04c3fSmrg */ 48301e04c3fSmrg if (entry->src.ssa->parent_instr->type == nir_instr_type_load_const) 48401e04c3fSmrg continue; 48501e04c3fSmrg 48601e04c3fSmrg /* Don't try and coalesce these */ 48701e04c3fSmrg if (entry->dest.ssa.num_components != entry->src.ssa->num_components) 48801e04c3fSmrg continue; 48901e04c3fSmrg 49001e04c3fSmrg merge_node *src_node = get_merge_node(entry->src.ssa, state); 49101e04c3fSmrg merge_node *dest_node = get_merge_node(&entry->dest.ssa, state); 49201e04c3fSmrg 49301e04c3fSmrg if (src_node->set == dest_node->set) 49401e04c3fSmrg continue; 49501e04c3fSmrg 4967ec681f3Smrg /* TODO: We can probably do better here but for now we should be safe if 4977ec681f3Smrg * we just don't coalesce things with different divergence. 4987ec681f3Smrg */ 4997ec681f3Smrg if (dest_node->set->divergent != src_node->set->divergent) 5007ec681f3Smrg continue; 5017ec681f3Smrg 50201e04c3fSmrg if (!merge_sets_interfere(src_node->set, dest_node->set)) 50301e04c3fSmrg merge_merge_sets(src_node->set, dest_node->set); 50401e04c3fSmrg } 50501e04c3fSmrg} 50601e04c3fSmrg 50701e04c3fSmrgstatic bool 50801e04c3fSmrgaggressive_coalesce_block(nir_block *block, struct from_ssa_state *state) 50901e04c3fSmrg{ 51001e04c3fSmrg nir_parallel_copy_instr *start_pcopy = NULL; 51101e04c3fSmrg nir_foreach_instr(instr, block) { 51201e04c3fSmrg /* Phi nodes only ever come at the start of a block */ 51301e04c3fSmrg if (instr->type != nir_instr_type_phi) { 51401e04c3fSmrg if (instr->type != nir_instr_type_parallel_copy) 51501e04c3fSmrg break; /* The parallel copy must be right after the phis */ 51601e04c3fSmrg 51701e04c3fSmrg start_pcopy = nir_instr_as_parallel_copy(instr); 51801e04c3fSmrg 51901e04c3fSmrg aggressive_coalesce_parallel_copy(start_pcopy, state); 52001e04c3fSmrg 52101e04c3fSmrg break; 52201e04c3fSmrg } 52301e04c3fSmrg } 52401e04c3fSmrg 52501e04c3fSmrg nir_parallel_copy_instr *end_pcopy = 52601e04c3fSmrg get_parallel_copy_at_end_of_block(block); 52701e04c3fSmrg 52801e04c3fSmrg 
if (end_pcopy && end_pcopy != start_pcopy) 52901e04c3fSmrg aggressive_coalesce_parallel_copy(end_pcopy, state); 53001e04c3fSmrg 53101e04c3fSmrg return true; 53201e04c3fSmrg} 53301e04c3fSmrg 53401e04c3fSmrgstatic nir_register * 53501e04c3fSmrgcreate_reg_for_ssa_def(nir_ssa_def *def, nir_function_impl *impl) 53601e04c3fSmrg{ 53701e04c3fSmrg nir_register *reg = nir_local_reg_create(impl); 53801e04c3fSmrg 53901e04c3fSmrg reg->num_components = def->num_components; 54001e04c3fSmrg reg->bit_size = def->bit_size; 54101e04c3fSmrg reg->num_array_elems = 0; 54201e04c3fSmrg 54301e04c3fSmrg return reg; 54401e04c3fSmrg} 54501e04c3fSmrg 54601e04c3fSmrgstatic bool 54701e04c3fSmrgrewrite_ssa_def(nir_ssa_def *def, void *void_state) 54801e04c3fSmrg{ 54901e04c3fSmrg struct from_ssa_state *state = void_state; 55001e04c3fSmrg nir_register *reg; 55101e04c3fSmrg 55201e04c3fSmrg struct hash_entry *entry = 55301e04c3fSmrg _mesa_hash_table_search(state->merge_node_table, def); 55401e04c3fSmrg if (entry) { 55501e04c3fSmrg /* In this case, we're part of a phi web. Use the web's register. */ 55601e04c3fSmrg merge_node *node = (merge_node *)entry->data; 55701e04c3fSmrg 55801e04c3fSmrg /* If it doesn't have a register yet, create one. Note that all of 55901e04c3fSmrg * the things in the merge set should be the same so it doesn't 56001e04c3fSmrg * matter which node's definition we use. 56101e04c3fSmrg */ 5627ec681f3Smrg if (node->set->reg == NULL) { 56301e04c3fSmrg node->set->reg = create_reg_for_ssa_def(def, state->builder.impl); 5647ec681f3Smrg node->set->reg->divergent = node->set->divergent; 5657ec681f3Smrg } 56601e04c3fSmrg 56701e04c3fSmrg reg = node->set->reg; 56801e04c3fSmrg } else { 56901e04c3fSmrg if (state->phi_webs_only) 57001e04c3fSmrg return true; 57101e04c3fSmrg 57201e04c3fSmrg /* We leave load_const SSA values alone. They act as immediates to 57301e04c3fSmrg * the backend. If it got coalesced into a phi, that's ok. 
57401e04c3fSmrg */ 57501e04c3fSmrg if (def->parent_instr->type == nir_instr_type_load_const) 57601e04c3fSmrg return true; 57701e04c3fSmrg 57801e04c3fSmrg reg = create_reg_for_ssa_def(def, state->builder.impl); 57901e04c3fSmrg } 58001e04c3fSmrg 5817ec681f3Smrg nir_ssa_def_rewrite_uses_src(def, nir_src_for_reg(reg)); 5827ec681f3Smrg assert(nir_ssa_def_is_unused(def)); 58301e04c3fSmrg 58401e04c3fSmrg if (def->parent_instr->type == nir_instr_type_ssa_undef) { 58501e04c3fSmrg /* If it's an ssa_undef instruction, remove it since we know we just got 58601e04c3fSmrg * rid of all its uses. 58701e04c3fSmrg */ 58801e04c3fSmrg nir_instr *parent_instr = def->parent_instr; 58901e04c3fSmrg nir_instr_remove(parent_instr); 5907ec681f3Smrg exec_list_push_tail(&state->dead_instrs, &parent_instr->node); 59101e04c3fSmrg state->progress = true; 59201e04c3fSmrg return true; 59301e04c3fSmrg } 59401e04c3fSmrg 59501e04c3fSmrg assert(def->parent_instr->type != nir_instr_type_load_const); 59601e04c3fSmrg 59701e04c3fSmrg /* At this point we know a priori that this SSA def is part of a 59801e04c3fSmrg * nir_dest. We can use exec_node_data to get the dest pointer. 59901e04c3fSmrg */ 60001e04c3fSmrg nir_dest *dest = exec_node_data(nir_dest, def, ssa); 60101e04c3fSmrg 60201e04c3fSmrg nir_instr_rewrite_dest(state->instr, dest, nir_dest_for_reg(reg)); 60301e04c3fSmrg state->progress = true; 60401e04c3fSmrg return true; 60501e04c3fSmrg} 60601e04c3fSmrg 60701e04c3fSmrg/* Resolves ssa definitions to registers. While we're at it, we also 60801e04c3fSmrg * remove phi nodes. 
60901e04c3fSmrg */ 61001e04c3fSmrgstatic void 61101e04c3fSmrgresolve_registers_block(nir_block *block, struct from_ssa_state *state) 61201e04c3fSmrg{ 61301e04c3fSmrg nir_foreach_instr_safe(instr, block) { 61401e04c3fSmrg state->instr = instr; 61501e04c3fSmrg nir_foreach_ssa_def(instr, rewrite_ssa_def, state); 61601e04c3fSmrg 61701e04c3fSmrg if (instr->type == nir_instr_type_phi) { 61801e04c3fSmrg nir_instr_remove(instr); 6197ec681f3Smrg exec_list_push_tail(&state->dead_instrs, &instr->node); 62001e04c3fSmrg state->progress = true; 62101e04c3fSmrg } 62201e04c3fSmrg } 62301e04c3fSmrg state->instr = NULL; 62401e04c3fSmrg} 62501e04c3fSmrg 62601e04c3fSmrgstatic void 62701e04c3fSmrgemit_copy(nir_builder *b, nir_src src, nir_src dest_src) 62801e04c3fSmrg{ 62901e04c3fSmrg assert(!dest_src.is_ssa && 63001e04c3fSmrg dest_src.reg.indirect == NULL && 63101e04c3fSmrg dest_src.reg.base_offset == 0); 63201e04c3fSmrg 6337ec681f3Smrg assert(!nir_src_is_divergent(src) || nir_src_is_divergent(dest_src)); 6347ec681f3Smrg 63501e04c3fSmrg if (src.is_ssa) 63601e04c3fSmrg assert(src.ssa->num_components >= dest_src.reg.reg->num_components); 63701e04c3fSmrg else 63801e04c3fSmrg assert(src.reg.reg->num_components >= dest_src.reg.reg->num_components); 63901e04c3fSmrg 6407ec681f3Smrg nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_mov); 6417ec681f3Smrg nir_src_copy(&mov->src[0].src, &src); 64201e04c3fSmrg mov->dest.dest = nir_dest_for_reg(dest_src.reg.reg); 64301e04c3fSmrg mov->dest.write_mask = (1 << dest_src.reg.reg->num_components) - 1; 64401e04c3fSmrg 64501e04c3fSmrg nir_builder_instr_insert(b, &mov->instr); 64601e04c3fSmrg} 64701e04c3fSmrg 64801e04c3fSmrg/* Resolves a single parallel copy operation into a sequence of movs 64901e04c3fSmrg * 65001e04c3fSmrg * This is based on Algorithm 1 from "Revisiting Out-of-SSA Translation for 65101e04c3fSmrg * Correctness, Code Quality, and Efficiency" by Boissinot et al. 
65201e04c3fSmrg * However, I never got the algorithm to work as written, so this version 65301e04c3fSmrg * is slightly modified. 65401e04c3fSmrg * 65501e04c3fSmrg * The algorithm works by playing this little shell game with the values. 65601e04c3fSmrg * We start by recording where every source value is and which source value 65701e04c3fSmrg * each destination value should receive. We then grab any copy whose 65801e04c3fSmrg * destination is "empty", i.e. not used as a source, and do the following: 65901e04c3fSmrg * - Find where its source value currently lives 66001e04c3fSmrg * - Emit the move instruction 66101e04c3fSmrg * - Set the location of the source value to the destination 66201e04c3fSmrg * - Mark the location containing the source value 66301e04c3fSmrg * - Mark the destination as no longer needing to be copied 66401e04c3fSmrg * 66501e04c3fSmrg * When we run out of "empty" destinations, we have a cycle and so we 66601e04c3fSmrg * create a temporary register, copy to that register, and mark the value 66701e04c3fSmrg * we copied as living in that temporary. Now, the cycle is broken, so we 66801e04c3fSmrg * can continue with the above steps. 66901e04c3fSmrg */ 67001e04c3fSmrgstatic void 67101e04c3fSmrgresolve_parallel_copy(nir_parallel_copy_instr *pcopy, 67201e04c3fSmrg struct from_ssa_state *state) 67301e04c3fSmrg{ 67401e04c3fSmrg unsigned num_copies = 0; 67501e04c3fSmrg nir_foreach_parallel_copy_entry(entry, pcopy) { 67601e04c3fSmrg /* Sources may be SSA */ 67701e04c3fSmrg if (!entry->src.is_ssa && entry->src.reg.reg == entry->dest.reg.reg) 67801e04c3fSmrg continue; 67901e04c3fSmrg 68001e04c3fSmrg num_copies++; 68101e04c3fSmrg } 68201e04c3fSmrg 68301e04c3fSmrg if (num_copies == 0) { 68401e04c3fSmrg /* Hooray, we don't need any copies! 
*/ 68501e04c3fSmrg nir_instr_remove(&pcopy->instr); 6867ec681f3Smrg exec_list_push_tail(&state->dead_instrs, &pcopy->instr.node); 68701e04c3fSmrg return; 68801e04c3fSmrg } 68901e04c3fSmrg 69001e04c3fSmrg /* The register/source corresponding to the given index */ 69101e04c3fSmrg NIR_VLA_ZERO(nir_src, values, num_copies * 2); 69201e04c3fSmrg 69301e04c3fSmrg /* The current location of a given piece of data. We will use -1 for "null" */ 69401e04c3fSmrg NIR_VLA_FILL(int, loc, num_copies * 2, -1); 69501e04c3fSmrg 69601e04c3fSmrg /* The piece of data that the given piece of data is to be copied from. We will use -1 for "null" */ 69701e04c3fSmrg NIR_VLA_FILL(int, pred, num_copies * 2, -1); 69801e04c3fSmrg 69901e04c3fSmrg /* The destinations we have yet to properly fill */ 70001e04c3fSmrg NIR_VLA(int, to_do, num_copies * 2); 70101e04c3fSmrg int to_do_idx = -1; 70201e04c3fSmrg 70301e04c3fSmrg state->builder.cursor = nir_before_instr(&pcopy->instr); 70401e04c3fSmrg 70501e04c3fSmrg /* Now we set everything up: 70601e04c3fSmrg * - All values get assigned a temporary index 70701e04c3fSmrg * - Current locations are set from sources 70801e04c3fSmrg * - Predicessors are recorded from sources and destinations 70901e04c3fSmrg */ 71001e04c3fSmrg int num_vals = 0; 71101e04c3fSmrg nir_foreach_parallel_copy_entry(entry, pcopy) { 71201e04c3fSmrg /* Sources may be SSA */ 71301e04c3fSmrg if (!entry->src.is_ssa && entry->src.reg.reg == entry->dest.reg.reg) 71401e04c3fSmrg continue; 71501e04c3fSmrg 71601e04c3fSmrg int src_idx = -1; 71701e04c3fSmrg for (int i = 0; i < num_vals; ++i) { 71801e04c3fSmrg if (nir_srcs_equal(values[i], entry->src)) 71901e04c3fSmrg src_idx = i; 72001e04c3fSmrg } 72101e04c3fSmrg if (src_idx < 0) { 72201e04c3fSmrg src_idx = num_vals++; 72301e04c3fSmrg values[src_idx] = entry->src; 72401e04c3fSmrg } 72501e04c3fSmrg 72601e04c3fSmrg nir_src dest_src = nir_src_for_reg(entry->dest.reg.reg); 72701e04c3fSmrg 72801e04c3fSmrg int dest_idx = -1; 72901e04c3fSmrg for (int i = 0; i 
< num_vals; ++i) { 73001e04c3fSmrg if (nir_srcs_equal(values[i], dest_src)) { 73101e04c3fSmrg /* Each destination of a parallel copy instruction should be 73201e04c3fSmrg * unique. A destination may get used as a source, so we still 73301e04c3fSmrg * have to walk the list. However, the predecessor should not, 73401e04c3fSmrg * at this point, be set yet, so we should have -1 here. 73501e04c3fSmrg */ 73601e04c3fSmrg assert(pred[i] == -1); 73701e04c3fSmrg dest_idx = i; 73801e04c3fSmrg } 73901e04c3fSmrg } 74001e04c3fSmrg if (dest_idx < 0) { 74101e04c3fSmrg dest_idx = num_vals++; 74201e04c3fSmrg values[dest_idx] = dest_src; 74301e04c3fSmrg } 74401e04c3fSmrg 74501e04c3fSmrg loc[src_idx] = src_idx; 74601e04c3fSmrg pred[dest_idx] = src_idx; 74701e04c3fSmrg 74801e04c3fSmrg to_do[++to_do_idx] = dest_idx; 74901e04c3fSmrg } 75001e04c3fSmrg 75101e04c3fSmrg /* Currently empty destinations we can go ahead and fill */ 75201e04c3fSmrg NIR_VLA(int, ready, num_copies * 2); 75301e04c3fSmrg int ready_idx = -1; 75401e04c3fSmrg 75501e04c3fSmrg /* Mark the ones that are ready for copying. We know an index is a 75601e04c3fSmrg * destination if it has a predecessor and it's ready for copying if 75701e04c3fSmrg * it's not marked as containing data. 75801e04c3fSmrg */ 75901e04c3fSmrg for (int i = 0; i < num_vals; i++) { 76001e04c3fSmrg if (pred[i] != -1 && loc[i] == -1) 76101e04c3fSmrg ready[++ready_idx] = i; 76201e04c3fSmrg } 76301e04c3fSmrg 76401e04c3fSmrg while (to_do_idx >= 0) { 76501e04c3fSmrg while (ready_idx >= 0) { 76601e04c3fSmrg int b = ready[ready_idx--]; 76701e04c3fSmrg int a = pred[b]; 76801e04c3fSmrg emit_copy(&state->builder, values[loc[a]], values[b]); 76901e04c3fSmrg 77001e04c3fSmrg /* b has been filled, mark it as not needing to be copied */ 77101e04c3fSmrg pred[b] = -1; 77201e04c3fSmrg 7737ec681f3Smrg /* The next bit only applies if the source and destination have the 7747ec681f3Smrg * same divergence. 
If they differ (it must be convergent -> 7757ec681f3Smrg * divergent), then we can't guarantee we won't need the convergent 7767ec681f3Smrg * version of again. 7777ec681f3Smrg */ 7787ec681f3Smrg if (nir_src_is_divergent(values[a]) == 7797ec681f3Smrg nir_src_is_divergent(values[b])) { 7807ec681f3Smrg /* If any other copies want a they can find it at b but only if the 7817ec681f3Smrg * two have the same divergence. 7827ec681f3Smrg */ 7837ec681f3Smrg loc[a] = b; 7847ec681f3Smrg 7857ec681f3Smrg /* If a needs to be filled... */ 7867ec681f3Smrg if (pred[a] != -1) { 7877ec681f3Smrg /* If any other copies want a they can find it at b */ 7887ec681f3Smrg loc[a] = b; 7897ec681f3Smrg 7907ec681f3Smrg /* It's ready for copying now */ 7917ec681f3Smrg ready[++ready_idx] = a; 7927ec681f3Smrg } 7937ec681f3Smrg } 79401e04c3fSmrg } 79501e04c3fSmrg int b = to_do[to_do_idx--]; 79601e04c3fSmrg if (pred[b] == -1) 79701e04c3fSmrg continue; 79801e04c3fSmrg 79901e04c3fSmrg /* If we got here, then we don't have any more trivial copies that we 80001e04c3fSmrg * can do. We have to break a cycle, so we create a new temporary 80101e04c3fSmrg * register for that purpose. Normally, if going out of SSA after 80201e04c3fSmrg * register allocation, you would want to avoid creating temporary 80301e04c3fSmrg * registers. However, we are going out of SSA before register 80401e04c3fSmrg * allocation, so we would rather not create extra register 80501e04c3fSmrg * dependencies for the backend to deal with. If it wants, the 80601e04c3fSmrg * backend can coalesce the (possibly multiple) temporaries. 
80701e04c3fSmrg */ 80801e04c3fSmrg assert(num_vals < num_copies * 2); 80901e04c3fSmrg nir_register *reg = nir_local_reg_create(state->builder.impl); 81001e04c3fSmrg reg->num_array_elems = 0; 8117e102996Smaya if (values[b].is_ssa) { 81201e04c3fSmrg reg->num_components = values[b].ssa->num_components; 8137e102996Smaya reg->bit_size = values[b].ssa->bit_size; 8147e102996Smaya } else { 81501e04c3fSmrg reg->num_components = values[b].reg.reg->num_components; 8167e102996Smaya reg->bit_size = values[b].reg.reg->bit_size; 8177e102996Smaya } 8187ec681f3Smrg reg->divergent = nir_src_is_divergent(values[b]); 81901e04c3fSmrg values[num_vals].is_ssa = false; 82001e04c3fSmrg values[num_vals].reg.reg = reg; 82101e04c3fSmrg 82201e04c3fSmrg emit_copy(&state->builder, values[b], values[num_vals]); 82301e04c3fSmrg loc[b] = num_vals; 82401e04c3fSmrg ready[++ready_idx] = b; 82501e04c3fSmrg num_vals++; 82601e04c3fSmrg } 82701e04c3fSmrg 82801e04c3fSmrg nir_instr_remove(&pcopy->instr); 8297ec681f3Smrg exec_list_push_tail(&state->dead_instrs, &pcopy->instr.node); 83001e04c3fSmrg} 83101e04c3fSmrg 83201e04c3fSmrg/* Resolves the parallel copies in a block. Each block can have at most 83301e04c3fSmrg * two: One at the beginning, right after all the phi noces, and one at 83401e04c3fSmrg * the end (or right before the final jump if it exists). 83501e04c3fSmrg */ 83601e04c3fSmrgstatic bool 83701e04c3fSmrgresolve_parallel_copies_block(nir_block *block, struct from_ssa_state *state) 83801e04c3fSmrg{ 83901e04c3fSmrg /* At this point, we have removed all of the phi nodes. If a parallel 84001e04c3fSmrg * copy existed right after the phi nodes in this block, it is now the 84101e04c3fSmrg * first instruction. 84201e04c3fSmrg */ 84301e04c3fSmrg nir_instr *first_instr = nir_block_first_instr(block); 84401e04c3fSmrg if (first_instr == NULL) 84501e04c3fSmrg return true; /* Empty, nothing to do. 
/* Runs the full out-of-SSA conversion on a single function implementation.
 *
 * The pass is a fixed pipeline; the phases below must run in exactly this
 * order, since each one consumes the data structures set up by the previous
 * one (parallel copies -> isolated phis -> liveness-driven coalescing ->
 * register resolution -> copy resolution).
 *
 * Returns true if any instruction was changed.
 */
static bool
nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only)
{
   nir_shader *shader = impl->function->shader;

   struct from_ssa_state state;

   nir_builder_init(&state.builder, impl);
   state.dead_ctx = ralloc_context(NULL);
   state.phi_webs_only = phi_webs_only;
   state.merge_node_table = _mesa_pointer_hash_table_create(NULL);
   state.progress = false;
   exec_list_make_empty(&state.dead_instrs);

   /* Phase 1: insert a parallel-copy instruction at the end of every block
    * so phi sources have a place to be copied into.
    */
   nir_foreach_block(block, impl) {
      add_parallel_copy_to_end_of_block(shader, block, state.dead_ctx);
   }

   /* Phase 2: isolate the phi nodes so their sources/destinations no longer
    * interfere with each other (classic Boissinot phi isolation).
    */
   nir_foreach_block(block, impl) {
      isolate_phi_nodes_block(shader, block, state.dead_ctx);
   }

   /* Mark metadata as dirty before we ask for liveness analysis */
   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);

   /* Liveness and dominance are required by the coalescing phases below. */
   nir_metadata_require(impl, nir_metadata_instr_index |
                              nir_metadata_live_ssa_defs |
                              nir_metadata_dominance);

   /* Phase 3: merge phi sources/destinations into merge sets (required for
    * correctness).
    */
   nir_foreach_block(block, impl) {
      coalesce_phi_nodes_block(block, &state);
   }

   /* Phase 4: opportunistically coalesce other copies (quality, not
    * correctness).
    */
   nir_foreach_block(block, impl) {
      aggressive_coalesce_block(block, &state);
   }

   /* Phase 5: replace SSA defs with registers according to the merge sets. */
   nir_foreach_block(block, impl) {
      resolve_registers_block(block, &state);
   }

   /* Phase 6: lower the remaining parallel copies to sequences of moves. */
   nir_foreach_block(block, impl) {
      resolve_parallel_copies_block(block, &state);
   }

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);

   /* Clean up dead instructions and the hash tables */
   nir_instr_free_list(&state.dead_instrs);
   _mesa_hash_table_destroy(state.merge_node_table, NULL);
   ralloc_free(state.dead_ctx);
   return state.progress;
}

/* Converts a whole shader out of SSA form.
 *
 * If @phi_webs_only is true, only values involved in phi webs are lowered to
 * registers (per-impl behavior; see from_ssa_state.phi_webs_only).
 *
 * Returns true if any function implementation was changed.
 */
bool
nir_convert_from_ssa(nir_shader *shader, bool phi_webs_only)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= nir_convert_from_ssa_impl(function->impl, phi_webs_only);
   }

   return progress;
}
(_mesa_set_search(visited_blocks, block) == NULL) { 94201e04c3fSmrg /* Try to go up the single-successor tree */ 94301e04c3fSmrg bool all_single_successors = true; 94401e04c3fSmrg set_foreach(block->predecessors, entry) { 94501e04c3fSmrg nir_block *pred = (nir_block *)entry->key; 94601e04c3fSmrg if (pred->successors[0] && pred->successors[1]) { 94701e04c3fSmrg all_single_successors = false; 94801e04c3fSmrg break; 94901e04c3fSmrg } 95001e04c3fSmrg } 95101e04c3fSmrg 9527ec681f3Smrg if (all_single_successors) { 95301e04c3fSmrg /* All predecessors of this block have exactly one successor and it 95401e04c3fSmrg * is this block so they must eventually lead here without 95501e04c3fSmrg * intersecting each other. Place the reads in the predecessors 95601e04c3fSmrg * instead of this block. 95701e04c3fSmrg */ 9587ec681f3Smrg _mesa_set_add(visited_blocks, block); 9597ec681f3Smrg 9607e102996Smaya set_foreach(block->predecessors, entry) { 9617ec681f3Smrg place_phi_read(b, reg, def, (nir_block *)entry->key, visited_blocks); 9627e102996Smaya } 96301e04c3fSmrg return; 96401e04c3fSmrg } 96501e04c3fSmrg } 96601e04c3fSmrg 9677ec681f3Smrg b->cursor = nir_after_block_before_jump(block); 9687ec681f3Smrg nir_store_reg(b, reg, def, ~0); 96901e04c3fSmrg} 97001e04c3fSmrg 9717ec681f3Smrg/** Lower all of the phi nodes in a block to movs to and from a register 97201e04c3fSmrg * 97301e04c3fSmrg * This provides a very quick-and-dirty out-of-SSA pass that you can run on a 9747ec681f3Smrg * single block to convert all of its phis to a register and some movs. 97501e04c3fSmrg * The code that is generated, while not optimal for actual codegen in a 97601e04c3fSmrg * back-end, is easy to generate, correct, and will turn into the same set of 97701e04c3fSmrg * phis after you call regs_to_ssa and do some copy propagation. 
97801e04c3fSmrg * 97901e04c3fSmrg * The one intelligent thing this pass does is that it places the moves from 98001e04c3fSmrg * the phi sources as high up the predecessor tree as possible instead of in 98101e04c3fSmrg * the exact predecessor. This means that, in particular, it will crawl into 98201e04c3fSmrg * the deepest nesting of any if-ladders. In order to ensure that doing so is 98301e04c3fSmrg * safe, it stops as soon as one of the predecessors has multiple successors. 98401e04c3fSmrg */ 98501e04c3fSmrgbool 98601e04c3fSmrgnir_lower_phis_to_regs_block(nir_block *block) 98701e04c3fSmrg{ 9887ec681f3Smrg nir_builder b; 9897ec681f3Smrg nir_builder_init(&b, nir_cf_node_get_function(&block->cf_node)); 9907ec681f3Smrg struct set *visited_blocks = _mesa_set_create(NULL, _mesa_hash_pointer, 9917ec681f3Smrg _mesa_key_pointer_equal); 99201e04c3fSmrg 99301e04c3fSmrg bool progress = false; 99401e04c3fSmrg nir_foreach_instr_safe(instr, block) { 99501e04c3fSmrg if (instr->type != nir_instr_type_phi) 99601e04c3fSmrg break; 99701e04c3fSmrg 99801e04c3fSmrg nir_phi_instr *phi = nir_instr_as_phi(instr); 99901e04c3fSmrg assert(phi->dest.is_ssa); 100001e04c3fSmrg 10017ec681f3Smrg nir_register *reg = create_reg_for_ssa_def(&phi->dest.ssa, b.impl); 100201e04c3fSmrg 10037ec681f3Smrg b.cursor = nir_after_instr(&phi->instr); 10047ec681f3Smrg nir_ssa_def *def = nir_load_reg(&b, reg); 100501e04c3fSmrg 10067ec681f3Smrg nir_ssa_def_rewrite_uses(&phi->dest.ssa, def); 100701e04c3fSmrg 100801e04c3fSmrg nir_foreach_phi_src(src, phi) { 100901e04c3fSmrg assert(src->src.is_ssa); 10107ec681f3Smrg _mesa_set_add(visited_blocks, src->src.ssa->parent_instr->block); 10117ec681f3Smrg place_phi_read(&b, reg, src->src.ssa, src->pred, visited_blocks); 10127ec681f3Smrg _mesa_set_clear(visited_blocks, NULL); 101301e04c3fSmrg } 101401e04c3fSmrg 101501e04c3fSmrg nir_instr_remove(&phi->instr); 101601e04c3fSmrg 101701e04c3fSmrg progress = true; 101801e04c3fSmrg } 101901e04c3fSmrg 10207ec681f3Smrg 
/* State threaded through dest_replace_ssa_with_reg via nir_foreach_dest. */
struct ssa_def_to_reg_state {
   nir_function_impl *impl;  /* impl in which new registers are created */
   bool progress;            /* set when at least one dest was rewritten */
};

/* nir_foreach_dest callback: replaces an SSA destination with a freshly
 * created register, rewriting all uses of the old SSA def to read the
 * register instead.  Non-SSA destinations are left untouched.
 * Always returns true so the iteration continues over every dest.
 */
static bool
dest_replace_ssa_with_reg(nir_dest *dest, void *void_state)
{
   struct ssa_def_to_reg_state *state = void_state;

   if (!dest->is_ssa)
      return true;

   nir_register *reg = create_reg_for_ssa_def(&dest->ssa, state->impl);

   /* Point every use (including if-uses) of the old SSA def at the reg. */
   nir_ssa_def_rewrite_uses_src(&dest->ssa, nir_src_for_reg(reg));

   /* Overwrite the dest in place; re-establish the parent-instr pointer and
    * link this dest onto the register's def list, which the plain
    * nir_dest_for_reg() initializer does not do.
    */
   nir_instr *instr = dest->ssa.parent_instr;
   *dest = nir_dest_for_reg(reg);
   dest->reg.parent_instr = instr;
   list_addtail(&dest->reg.def_link, &reg->defs);

   state->progress = true;

   return true;
}

/* nir_foreach_ssa_def callback: returns true if @def is used only by
 * non-phi instructions inside its own defining block and has no if-uses.
 * Such defs are safe to leave in SSA form when lowering a single block.
 */
static bool
ssa_def_is_local_to_block(nir_ssa_def *def, UNUSED void *state)
{
   nir_block *block = def->parent_instr->block;
   nir_foreach_use(use_src, def) {
      /* A use in another block, or by a phi (even in this block), escapes. */
      if (use_src->parent_instr->block != block ||
          use_src->parent_instr->type == nir_instr_type_phi) {
         return false;
      }
   }

   /* Uses as an if-condition also escape the block. */
   if (!list_is_empty(&def->if_uses))
      return false;

   return true;
}
/** Lower all of the SSA defs in a block to registers
 *
 * This performs the very simple operation of blindly replacing all of the SSA
 * defs in the given block with registers.  If not used carefully, this may
 * result in phi nodes with register sources which is technically invalid.
 * Fortunately, the register-based into-SSA pass handles them anyway.
 */
bool
nir_lower_ssa_defs_to_regs_block(nir_block *block)
{
   nir_function_impl *impl = nir_cf_node_get_function(&block->cf_node);
   nir_shader *shader = impl->function->shader;

   struct ssa_def_to_reg_state state = {
      .impl = impl,
      .progress = false,
   };

   /* Dispatch on instruction type; the order of these cases matters because
    * the final else-if intentionally skips block-local defs.
    */
   nir_foreach_instr(instr, block) {
      if (instr->type == nir_instr_type_ssa_undef) {
         /* Undefs are just a read of something never written.  No mov is
          * needed: reading the fresh (never-written) register has the same
          * undefined-value semantics.
          */
         nir_ssa_undef_instr *undef = nir_instr_as_ssa_undef(instr);
         nir_register *reg = create_reg_for_ssa_def(&undef->def, state.impl);
         nir_ssa_def_rewrite_uses_src(&undef->def, nir_src_for_reg(reg));
      } else if (instr->type == nir_instr_type_load_const) {
         /* Constant loads are SSA-only, we need to insert a move */
         nir_load_const_instr *load = nir_instr_as_load_const(instr);
         nir_register *reg = create_reg_for_ssa_def(&load->def, state.impl);
         nir_ssa_def_rewrite_uses_src(&load->def, nir_src_for_reg(reg));

         /* The load keeps its SSA def; a mov copies it into the register
          * right after the load so register readers see the constant.
          */
         nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_mov);
         mov->src[0].src = nir_src_for_ssa(&load->def);
         mov->dest.dest = nir_dest_for_reg(reg);
         mov->dest.write_mask = (1 << reg->num_components) - 1;
         nir_instr_insert(nir_after_instr(&load->instr), &mov->instr);
      } else if (nir_foreach_ssa_def(instr, ssa_def_is_local_to_block, NULL)) {
         /* If the SSA def produced by this instruction is only in the block
          * in which it is defined and is not used by ifs or phis, then we
          * don't have a reason to convert it to a register.
          */
      } else {
         nir_foreach_dest(instr, dest_replace_ssa_with_reg, &state);
      }
   }

   return state.progress;
}