17ec681f3Smrg/* 27ec681f3Smrg * Copyright (C) 2021 Valve Corporation 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217ec681f3Smrg * SOFTWARE. 227ec681f3Smrg */ 237ec681f3Smrg 247ec681f3Smrg#include "ir3_compiler.h" 257ec681f3Smrg#include "ir3_ra.h" 267ec681f3Smrg#include "ralloc.h" 277ec681f3Smrg 287ec681f3Smrg/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch 297ec681f3Smrg * of parallelcopy's to trivially turn the program into CSSA form. Then we 307ec681f3Smrg * try to "merge" SSA def's into "merge sets" which could be allocated to a 317ec681f3Smrg * single register in order to eliminate copies. First we merge phi nodes, 327ec681f3Smrg * which should always succeed because of the parallelcopy's we inserted, and 337ec681f3Smrg * then we try to coalesce the copies we introduced. 347ec681f3Smrg * 357ec681f3Smrg * The merged registers are used for three purposes: 367ec681f3Smrg * 377ec681f3Smrg * 1. We always use the same pvtmem slot for spilling all SSA defs in each 387ec681f3Smrg * merge set. This prevents us from having to insert memory-to-memory copies 397ec681f3Smrg * in the spiller and makes sure we don't insert unecessary copies. 407ec681f3Smrg * 2. When two values are live at the same time, part of the same merge 417ec681f3Smrg * set, and they overlap each other in the merge set, they always occupy 427ec681f3Smrg * overlapping physical registers in RA. This reduces register pressure and 437ec681f3Smrg * copies in several important scenarios: 447ec681f3Smrg * - When sources of a collect are used later by something else, we don't 457ec681f3Smrg * have to introduce copies. 467ec681f3Smrg * - We can handle sequences of extracts that "explode" a vector into its 477ec681f3Smrg * components without any additional copying. 487ec681f3Smrg * 3. We use the merge sets for affinities in register allocation: That is, we 497ec681f3Smrg * try to allocate all the definitions in the same merge set to the 507ec681f3Smrg * same/compatible registers. This helps us e.g. allocate sources of a collect 517ec681f3Smrg * to contiguous registers without too much special code in RA. 527ec681f3Smrg * 537ec681f3Smrg * In a "normal" register allocator, or when spilling, we'd just merge 547ec681f3Smrg * registers in the same merge set to the same register, but with SSA-based 557ec681f3Smrg * register allocation we may have to split the live interval. 567ec681f3Smrg * 577ec681f3Smrg * The implementation is based on "Revisiting Out-of-SSA Translation for 587ec681f3Smrg * Correctness, CodeQuality, and Efficiency," and is broadly similar to the 597ec681f3Smrg * implementation in nir_from_ssa, with the twist that we also try to coalesce 607ec681f3Smrg * META_SPLIT and META_COLLECT. This makes this pass more complicated but 617ec681f3Smrg * prevents us from needing to handle these specially in RA and the spiller, 627ec681f3Smrg * which are already complicated enough. This also forces us to implement that 637ec681f3Smrg * value-comparison optimization they explain, as without it we wouldn't be 647ec681f3Smrg * able to coalesce META_SPLIT even in the simplest of cases. 657ec681f3Smrg */ 667ec681f3Smrg 677ec681f3Smrg/* In order to dynamically reconstruct the dominance forest, we need the 687ec681f3Smrg * instructions ordered by a preorder traversal of the dominance tree: 697ec681f3Smrg */ 707ec681f3Smrg 717ec681f3Smrgstatic unsigned 727ec681f3Smrgindex_instrs(struct ir3_block *block, unsigned index) 737ec681f3Smrg{ 747ec681f3Smrg foreach_instr (instr, &block->instr_list) 757ec681f3Smrg instr->ip = index++; 767ec681f3Smrg 777ec681f3Smrg for (unsigned i = 0; i < block->dom_children_count; i++) 787ec681f3Smrg index = index_instrs(block->dom_children[i], index); 797ec681f3Smrg 807ec681f3Smrg return index; 817ec681f3Smrg} 827ec681f3Smrg 837ec681f3Smrg/* Definitions within a merge set are ordered by instr->ip as set above: */ 847ec681f3Smrg 857ec681f3Smrgstatic bool 867ec681f3Smrgdef_after(struct ir3_register *a, struct ir3_register *b) 877ec681f3Smrg{ 887ec681f3Smrg return a->instr->ip > b->instr->ip; 897ec681f3Smrg} 907ec681f3Smrg 917ec681f3Smrgstatic bool 927ec681f3Smrgdef_dominates(struct ir3_register *a, struct ir3_register *b) 937ec681f3Smrg{ 947ec681f3Smrg if (def_after(a, b)) { 957ec681f3Smrg return false; 967ec681f3Smrg } else if (a->instr->block == b->instr->block) { 977ec681f3Smrg return def_after(b, a); 987ec681f3Smrg } else { 997ec681f3Smrg return ir3_block_dominates(a->instr->block, b->instr->block); 1007ec681f3Smrg } 1017ec681f3Smrg} 1027ec681f3Smrg 1037ec681f3Smrg/* This represents a region inside a register. The offset is relative to the 1047ec681f3Smrg * start of the register, and offset + size <= size(reg). 1057ec681f3Smrg */ 1067ec681f3Smrgstruct def_value { 1077ec681f3Smrg struct ir3_register *reg; 1087ec681f3Smrg unsigned offset, size; 1097ec681f3Smrg}; 1107ec681f3Smrg 1117ec681f3Smrg/* Chase any copies to get the source of a region inside a register. This is 1127ec681f3Smrg * Value(a) in the paper. 1137ec681f3Smrg */ 1147ec681f3Smrgstatic struct def_value 1157ec681f3Smrgchase_copies(struct def_value value) 1167ec681f3Smrg{ 1177ec681f3Smrg while (true) { 1187ec681f3Smrg struct ir3_instruction *instr = value.reg->instr; 1197ec681f3Smrg if (instr->opc == OPC_META_SPLIT) { 1207ec681f3Smrg value.offset += instr->split.off * reg_elem_size(value.reg); 1217ec681f3Smrg value.reg = instr->srcs[0]->def; 1227ec681f3Smrg } else if (instr->opc == OPC_META_COLLECT) { 1237ec681f3Smrg if (value.offset % reg_elem_size(value.reg) != 0 || 1247ec681f3Smrg value.size > reg_elem_size(value.reg) || 1257ec681f3Smrg value.offset + value.size > reg_size(value.reg)) 1267ec681f3Smrg break; 1277ec681f3Smrg struct ir3_register *src = 1287ec681f3Smrg instr->srcs[value.offset / reg_elem_size(value.reg)]; 1297ec681f3Smrg if (!src->def) 1307ec681f3Smrg break; 1317ec681f3Smrg value.offset = 0; 1327ec681f3Smrg value.reg = src->def; 1337ec681f3Smrg } else { 1347ec681f3Smrg /* TODO: parallelcopy */ 1357ec681f3Smrg break; 1367ec681f3Smrg } 1377ec681f3Smrg } 1387ec681f3Smrg 1397ec681f3Smrg return value; 1407ec681f3Smrg} 1417ec681f3Smrg 1427ec681f3Smrg/* This represents an entry in the merge set, and consists of a register + 1437ec681f3Smrg * offset from the merge set base. 1447ec681f3Smrg */ 1457ec681f3Smrgstruct merge_def { 1467ec681f3Smrg struct ir3_register *reg; 1477ec681f3Smrg unsigned offset; 1487ec681f3Smrg}; 1497ec681f3Smrg 1507ec681f3Smrgstatic bool 1517ec681f3Smrgcan_skip_interference(const struct merge_def *a, const struct merge_def *b) 1527ec681f3Smrg{ 1537ec681f3Smrg unsigned a_start = a->offset; 1547ec681f3Smrg unsigned b_start = b->offset; 1557ec681f3Smrg unsigned a_end = a_start + reg_size(a->reg); 1567ec681f3Smrg unsigned b_end = b_start + reg_size(b->reg); 1577ec681f3Smrg 1587ec681f3Smrg /* Registers that don't overlap never interfere */ 1597ec681f3Smrg if (a_end <= b_start || b_end <= a_start) 1607ec681f3Smrg return true; 1617ec681f3Smrg 1627ec681f3Smrg /* Disallow skipping interference unless one definition contains the 1637ec681f3Smrg * other. This restriction is important for register allocation, because 1647ec681f3Smrg * it means that at any given point in the program, the live values in a 1657ec681f3Smrg * given merge set will form a tree. If they didn't, then one live value 1667ec681f3Smrg * would partially overlap another, and they would have overlapping live 1677ec681f3Smrg * ranges because they're live at the same point. This simplifies register 1687ec681f3Smrg * allocation and spilling. 1697ec681f3Smrg */ 1707ec681f3Smrg if (!((a_start <= b_start && a_end >= b_end) || 1717ec681f3Smrg (b_start <= a_start && b_end >= a_end))) 1727ec681f3Smrg return false; 1737ec681f3Smrg 1747ec681f3Smrg /* For each register, chase the intersection of a and b to find the 1757ec681f3Smrg * ultimate source. 1767ec681f3Smrg */ 1777ec681f3Smrg unsigned start = MAX2(a_start, b_start); 1787ec681f3Smrg unsigned end = MIN2(a_end, b_end); 1797ec681f3Smrg struct def_value a_value = chase_copies((struct def_value){ 1807ec681f3Smrg .reg = a->reg, 1817ec681f3Smrg .offset = start - a_start, 1827ec681f3Smrg .size = end - start, 1837ec681f3Smrg }); 1847ec681f3Smrg struct def_value b_value = chase_copies((struct def_value){ 1857ec681f3Smrg .reg = b->reg, 1867ec681f3Smrg .offset = start - b_start, 1877ec681f3Smrg .size = end - start, 1887ec681f3Smrg }); 1897ec681f3Smrg return a_value.reg == b_value.reg && a_value.offset == b_value.offset; 1907ec681f3Smrg} 1917ec681f3Smrg 1927ec681f3Smrgstatic struct ir3_merge_set * 1937ec681f3Smrgget_merge_set(struct ir3_register *def) 1947ec681f3Smrg{ 1957ec681f3Smrg if (def->merge_set) 1967ec681f3Smrg return def->merge_set; 1977ec681f3Smrg 1987ec681f3Smrg struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set); 1997ec681f3Smrg set->preferred_reg = ~0; 2007ec681f3Smrg set->interval_start = ~0; 2017ec681f3Smrg set->spill_slot = ~0; 2027ec681f3Smrg set->size = reg_size(def); 2037ec681f3Smrg set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2; 2047ec681f3Smrg set->regs_count = 1; 2057ec681f3Smrg set->regs = ralloc(set, struct ir3_register *); 2067ec681f3Smrg set->regs[0] = def; 2077ec681f3Smrg 2087ec681f3Smrg return set; 2097ec681f3Smrg} 2107ec681f3Smrg 2117ec681f3Smrg/* Merges b into a */ 2127ec681f3Smrgstatic struct ir3_merge_set * 2137ec681f3Smrgmerge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset) 2147ec681f3Smrg{ 2157ec681f3Smrg if (b_offset < 0) 2167ec681f3Smrg return merge_merge_sets(b, a, -b_offset); 2177ec681f3Smrg 2187ec681f3Smrg struct ir3_register **new_regs = 2197ec681f3Smrg rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count); 2207ec681f3Smrg 2217ec681f3Smrg unsigned a_index = 0, b_index = 0, new_index = 0; 2227ec681f3Smrg for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) { 2237ec681f3Smrg if (b_index < b->regs_count && 2247ec681f3Smrg (a_index == a->regs_count || 2257ec681f3Smrg def_after(a->regs[a_index], b->regs[b_index]))) { 2267ec681f3Smrg new_regs[new_index] = b->regs[b_index++]; 2277ec681f3Smrg new_regs[new_index]->merge_set_offset += b_offset; 2287ec681f3Smrg } else { 2297ec681f3Smrg new_regs[new_index] = a->regs[a_index++]; 2307ec681f3Smrg } 2317ec681f3Smrg new_regs[new_index]->merge_set = a; 2327ec681f3Smrg } 2337ec681f3Smrg 2347ec681f3Smrg assert(new_index == a->regs_count + b->regs_count); 2357ec681f3Smrg 2367ec681f3Smrg /* Technically this should be the lcm, but because alignment is only 1 or 2377ec681f3Smrg * 2 so far this should be ok. 2387ec681f3Smrg */ 2397ec681f3Smrg a->alignment = MAX2(a->alignment, b->alignment); 2407ec681f3Smrg a->regs_count += b->regs_count; 2417ec681f3Smrg ralloc_free(a->regs); 2427ec681f3Smrg a->regs = new_regs; 2437ec681f3Smrg a->size = MAX2(a->size, b->size + b_offset); 2447ec681f3Smrg 2457ec681f3Smrg return a; 2467ec681f3Smrg} 2477ec681f3Smrg 2487ec681f3Smrgstatic bool 2497ec681f3Smrgmerge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a, 2507ec681f3Smrg struct ir3_merge_set *b, int b_offset) 2517ec681f3Smrg{ 2527ec681f3Smrg if (b_offset < 0) 2537ec681f3Smrg return merge_sets_interfere(live, b, a, -b_offset); 2547ec681f3Smrg 2557ec681f3Smrg struct merge_def dom[a->regs_count + b->regs_count]; 2567ec681f3Smrg unsigned a_index = 0, b_index = 0; 2577ec681f3Smrg int dom_index = -1; 2587ec681f3Smrg 2597ec681f3Smrg /* Reject trying to merge the sets if the alignment doesn't work out */ 2607ec681f3Smrg if (b_offset % a->alignment != 0) 2617ec681f3Smrg return true; 2627ec681f3Smrg 2637ec681f3Smrg while (a_index < a->regs_count || b_index < b->regs_count) { 2647ec681f3Smrg struct merge_def current; 2657ec681f3Smrg if (a_index == a->regs_count) { 2667ec681f3Smrg current.reg = b->regs[b_index]; 2677ec681f3Smrg current.offset = current.reg->merge_set_offset + b_offset; 2687ec681f3Smrg b_index++; 2697ec681f3Smrg } else if (b_index == b->regs_count) { 2707ec681f3Smrg current.reg = a->regs[a_index]; 2717ec681f3Smrg current.offset = current.reg->merge_set_offset; 2727ec681f3Smrg a_index++; 2737ec681f3Smrg } else { 2747ec681f3Smrg if (def_after(b->regs[b_index], a->regs[a_index])) { 2757ec681f3Smrg current.reg = a->regs[a_index]; 2767ec681f3Smrg current.offset = current.reg->merge_set_offset; 2777ec681f3Smrg a_index++; 2787ec681f3Smrg } else { 2797ec681f3Smrg current.reg = b->regs[b_index]; 2807ec681f3Smrg current.offset = current.reg->merge_set_offset + b_offset; 2817ec681f3Smrg b_index++; 2827ec681f3Smrg } 2837ec681f3Smrg } 2847ec681f3Smrg 2857ec681f3Smrg while (dom_index >= 0 && 2867ec681f3Smrg !def_dominates(dom[dom_index].reg, current.reg)) { 2877ec681f3Smrg dom_index--; 2887ec681f3Smrg } 2897ec681f3Smrg 2907ec681f3Smrg /* TODO: in the original paper, just dom[dom_index] needs to be 2917ec681f3Smrg * checked for interference. We implement the value-chasing extension 2927ec681f3Smrg * as well as support for sub-registers, which complicates this 2937ec681f3Smrg * significantly because it's no longer the case that if a dominates b 2947ec681f3Smrg * dominates c and a and b don't interfere then we only need to check 2957ec681f3Smrg * interference between b and c to be sure a and c don't interfere -- 2967ec681f3Smrg * this means we may have to check for interference against values 2977ec681f3Smrg * higher in the stack then dom[dom_index]. In the paper there's a 2987ec681f3Smrg * description of a way to do less interference tests with the 2997ec681f3Smrg * value-chasing extension, but we'd have to come up with something 3007ec681f3Smrg * ourselves for handling the similar problems that come up with 3017ec681f3Smrg * allowing values to contain subregisters. For now we just test 3027ec681f3Smrg * everything in the stack. 3037ec681f3Smrg */ 3047ec681f3Smrg for (int i = 0; i <= dom_index; i++) { 3057ec681f3Smrg if (can_skip_interference(¤t, &dom[i])) 3067ec681f3Smrg continue; 3077ec681f3Smrg 3087ec681f3Smrg /* Ok, now we actually have to check interference. Since we know 3097ec681f3Smrg * that dom[i] dominates current, this boils down to checking 3107ec681f3Smrg * whether dom[i] is live after current. 3117ec681f3Smrg */ 3127ec681f3Smrg if (ir3_def_live_after(live, dom[i].reg, current.reg->instr)) 3137ec681f3Smrg return true; 3147ec681f3Smrg } 3157ec681f3Smrg 3167ec681f3Smrg dom[++dom_index] = current; 3177ec681f3Smrg } 3187ec681f3Smrg 3197ec681f3Smrg return false; 3207ec681f3Smrg} 3217ec681f3Smrg 3227ec681f3Smrgstatic void 3237ec681f3Smrgtry_merge_defs(struct ir3_liveness *live, struct ir3_register *a, 3247ec681f3Smrg struct ir3_register *b, unsigned b_offset) 3257ec681f3Smrg{ 3267ec681f3Smrg struct ir3_merge_set *a_set = get_merge_set(a); 3277ec681f3Smrg struct ir3_merge_set *b_set = get_merge_set(b); 3287ec681f3Smrg 3297ec681f3Smrg if (a_set == b_set) { 3307ec681f3Smrg /* Note: Even in this case we may not always successfully be able to 3317ec681f3Smrg * coalesce this copy, if the offsets don't line up. But in any 3327ec681f3Smrg * case, we can't do anything. 3337ec681f3Smrg */ 3347ec681f3Smrg return; 3357ec681f3Smrg } 3367ec681f3Smrg 3377ec681f3Smrg int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; 3387ec681f3Smrg 3397ec681f3Smrg if (!merge_sets_interfere(live, a_set, b_set, b_set_offset)) 3407ec681f3Smrg merge_merge_sets(a_set, b_set, b_set_offset); 3417ec681f3Smrg} 3427ec681f3Smrg 3437ec681f3Smrgvoid 3447ec681f3Smrgir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset) 3457ec681f3Smrg{ 3467ec681f3Smrg struct ir3_merge_set *a_set = get_merge_set(a); 3477ec681f3Smrg struct ir3_merge_set *b_set = get_merge_set(b); 3487ec681f3Smrg 3497ec681f3Smrg if (a_set == b_set) 3507ec681f3Smrg return; 3517ec681f3Smrg 3527ec681f3Smrg int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; 3537ec681f3Smrg merge_merge_sets(a_set, b_set, b_set_offset); 3547ec681f3Smrg} 3557ec681f3Smrg 3567ec681f3Smrgstatic void 3577ec681f3Smrgcoalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi) 3587ec681f3Smrg{ 3597ec681f3Smrg for (unsigned i = 0; i < phi->srcs_count; i++) { 3607ec681f3Smrg if (phi->srcs[i]->def) 3617ec681f3Smrg try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0); 3627ec681f3Smrg } 3637ec681f3Smrg} 3647ec681f3Smrg 3657ec681f3Smrgstatic void 3667ec681f3Smrgaggressive_coalesce_parallel_copy(struct ir3_liveness *live, 3677ec681f3Smrg struct ir3_instruction *pcopy) 3687ec681f3Smrg{ 3697ec681f3Smrg for (unsigned i = 0; i < pcopy->dsts_count; i++) { 3707ec681f3Smrg if (!(pcopy->srcs[i]->flags & IR3_REG_SSA)) 3717ec681f3Smrg continue; 3727ec681f3Smrg try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0); 3737ec681f3Smrg } 3747ec681f3Smrg} 3757ec681f3Smrg 3767ec681f3Smrgstatic void 3777ec681f3Smrgaggressive_coalesce_split(struct ir3_liveness *live, 3787ec681f3Smrg struct ir3_instruction *split) 3797ec681f3Smrg{ 3807ec681f3Smrg try_merge_defs(live, split->srcs[0]->def, split->dsts[0], 3817ec681f3Smrg split->split.off * reg_elem_size(split->dsts[0])); 3827ec681f3Smrg} 3837ec681f3Smrg 3847ec681f3Smrgstatic void 3857ec681f3Smrgaggressive_coalesce_collect(struct ir3_liveness *live, 3867ec681f3Smrg struct ir3_instruction *collect) 3877ec681f3Smrg{ 3887ec681f3Smrg for (unsigned i = 0, offset = 0; i < collect->srcs_count; 3897ec681f3Smrg offset += reg_elem_size(collect->srcs[i]), i++) { 3907ec681f3Smrg if (!(collect->srcs[i]->flags & IR3_REG_SSA)) 3917ec681f3Smrg continue; 3927ec681f3Smrg try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset); 3937ec681f3Smrg } 3947ec681f3Smrg} 3957ec681f3Smrg 3967ec681f3Smrgstatic void 3977ec681f3Smrgcreate_parallel_copy(struct ir3_block *block) 3987ec681f3Smrg{ 3997ec681f3Smrg for (unsigned i = 0; i < 2; i++) { 4007ec681f3Smrg if (!block->successors[i]) 4017ec681f3Smrg continue; 4027ec681f3Smrg 4037ec681f3Smrg struct ir3_block *succ = block->successors[i]; 4047ec681f3Smrg 4057ec681f3Smrg unsigned pred_idx = ir3_block_get_pred_index(succ, block); 4067ec681f3Smrg 4077ec681f3Smrg unsigned phi_count = 0; 4087ec681f3Smrg foreach_instr (phi, &succ->instr_list) { 4097ec681f3Smrg if (phi->opc != OPC_META_PHI) 4107ec681f3Smrg break; 4117ec681f3Smrg 4127ec681f3Smrg /* Avoid undef */ 4137ec681f3Smrg if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && 4147ec681f3Smrg !phi->srcs[pred_idx]->def) 4157ec681f3Smrg continue; 4167ec681f3Smrg 4177ec681f3Smrg /* We don't support critical edges. If we were to support them, 4187ec681f3Smrg * we'd need to insert parallel copies after the phi node to solve 4197ec681f3Smrg * the lost-copy problem. 4207ec681f3Smrg */ 4217ec681f3Smrg assert(i == 0 && !block->successors[1]); 4227ec681f3Smrg phi_count++; 4237ec681f3Smrg } 4247ec681f3Smrg 4257ec681f3Smrg if (phi_count == 0) 4267ec681f3Smrg continue; 4277ec681f3Smrg 4287ec681f3Smrg struct ir3_register *src[phi_count]; 4297ec681f3Smrg unsigned j = 0; 4307ec681f3Smrg foreach_instr (phi, &succ->instr_list) { 4317ec681f3Smrg if (phi->opc != OPC_META_PHI) 4327ec681f3Smrg break; 4337ec681f3Smrg if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && 4347ec681f3Smrg !phi->srcs[pred_idx]->def) 4357ec681f3Smrg continue; 4367ec681f3Smrg src[j++] = phi->srcs[pred_idx]; 4377ec681f3Smrg } 4387ec681f3Smrg assert(j == phi_count); 4397ec681f3Smrg 4407ec681f3Smrg struct ir3_instruction *pcopy = 4417ec681f3Smrg ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count); 4427ec681f3Smrg 4437ec681f3Smrg for (j = 0; j < phi_count; j++) { 4447ec681f3Smrg struct ir3_register *reg = __ssa_dst(pcopy); 4457ec681f3Smrg reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY); 4467ec681f3Smrg reg->size = src[j]->size; 4477ec681f3Smrg reg->wrmask = src[j]->wrmask; 4487ec681f3Smrg } 4497ec681f3Smrg 4507ec681f3Smrg for (j = 0; j < phi_count; j++) { 4517ec681f3Smrg pcopy->srcs[pcopy->srcs_count++] = 4527ec681f3Smrg ir3_reg_clone(block->shader, src[j]); 4537ec681f3Smrg } 4547ec681f3Smrg 4557ec681f3Smrg j = 0; 4567ec681f3Smrg foreach_instr (phi, &succ->instr_list) { 4577ec681f3Smrg if (phi->opc != OPC_META_PHI) 4587ec681f3Smrg break; 4597ec681f3Smrg if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) && 4607ec681f3Smrg !phi->srcs[pred_idx]->def) 4617ec681f3Smrg continue; 4627ec681f3Smrg phi->srcs[pred_idx]->def = pcopy->dsts[j]; 4637ec681f3Smrg phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags; 4647ec681f3Smrg j++; 4657ec681f3Smrg } 4667ec681f3Smrg assert(j == phi_count); 4677ec681f3Smrg } 4687ec681f3Smrg} 4697ec681f3Smrg 4707ec681f3Smrgvoid 4717ec681f3Smrgir3_create_parallel_copies(struct ir3 *ir) 4727ec681f3Smrg{ 4737ec681f3Smrg foreach_block (block, &ir->block_list) { 4747ec681f3Smrg create_parallel_copy(block); 4757ec681f3Smrg } 4767ec681f3Smrg} 4777ec681f3Smrg 4787ec681f3Smrgstatic void 4797ec681f3Smrgindex_merge_sets(struct ir3_liveness *live, struct ir3 *ir) 4807ec681f3Smrg{ 4817ec681f3Smrg unsigned offset = 0; 4827ec681f3Smrg foreach_block (block, &ir->block_list) { 4837ec681f3Smrg foreach_instr (instr, &block->instr_list) { 4847ec681f3Smrg for (unsigned i = 0; i < instr->dsts_count; i++) { 4857ec681f3Smrg struct ir3_register *dst = instr->dsts[i]; 4867ec681f3Smrg 4877ec681f3Smrg unsigned dst_offset; 4887ec681f3Smrg struct ir3_merge_set *merge_set = dst->merge_set; 4897ec681f3Smrg unsigned size = reg_size(dst); 4907ec681f3Smrg if (merge_set) { 4917ec681f3Smrg if (merge_set->interval_start == ~0) { 4927ec681f3Smrg merge_set->interval_start = offset; 4937ec681f3Smrg offset += merge_set->size; 4947ec681f3Smrg } 4957ec681f3Smrg dst_offset = merge_set->interval_start + dst->merge_set_offset; 4967ec681f3Smrg } else { 4977ec681f3Smrg dst_offset = offset; 4987ec681f3Smrg offset += size; 4997ec681f3Smrg } 5007ec681f3Smrg 5017ec681f3Smrg dst->interval_start = dst_offset; 5027ec681f3Smrg dst->interval_end = dst_offset + size; 5037ec681f3Smrg } 5047ec681f3Smrg } 5057ec681f3Smrg } 5067ec681f3Smrg 5077ec681f3Smrg live->interval_offset = offset; 5087ec681f3Smrg} 5097ec681f3Smrg 5107ec681f3Smrg#define RESET "\x1b[0m" 5117ec681f3Smrg#define BLUE "\x1b[0;34m" 5127ec681f3Smrg#define SYN_SSA(x) BLUE x RESET 5137ec681f3Smrg 5147ec681f3Smrgstatic void 5157ec681f3Smrgdump_merge_sets(struct ir3 *ir) 5167ec681f3Smrg{ 5177ec681f3Smrg d("merge sets:"); 5187ec681f3Smrg struct set *merge_sets = _mesa_pointer_set_create(NULL); 5197ec681f3Smrg foreach_block (block, &ir->block_list) { 5207ec681f3Smrg foreach_instr (instr, &block->instr_list) { 5217ec681f3Smrg for (unsigned i = 0; i < instr->dsts_count; i++) { 5227ec681f3Smrg struct ir3_register *dst = instr->dsts[i]; 5237ec681f3Smrg 5247ec681f3Smrg struct ir3_merge_set *merge_set = dst->merge_set; 5257ec681f3Smrg if (!merge_set || _mesa_set_search(merge_sets, merge_set)) 5267ec681f3Smrg continue; 5277ec681f3Smrg 5287ec681f3Smrg d("merge set, size %u, align %u:", merge_set->size, 5297ec681f3Smrg merge_set->alignment); 5307ec681f3Smrg for (unsigned j = 0; j < merge_set->regs_count; j++) { 5317ec681f3Smrg struct ir3_register *reg = merge_set->regs[j]; 5327ec681f3Smrg d("\t" SYN_SSA("ssa_%u") ":%u, offset %u", 5337ec681f3Smrg reg->instr->serialno, reg->name, reg->merge_set_offset); 5347ec681f3Smrg } 5357ec681f3Smrg 5367ec681f3Smrg _mesa_set_add(merge_sets, merge_set); 5377ec681f3Smrg } 5387ec681f3Smrg } 5397ec681f3Smrg } 5407ec681f3Smrg 5417ec681f3Smrg ralloc_free(merge_sets); 5427ec681f3Smrg} 5437ec681f3Smrg 5447ec681f3Smrgvoid 5457ec681f3Smrgir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir) 5467ec681f3Smrg{ 5477ec681f3Smrg index_instrs(ir3_start_block(ir), 0); 5487ec681f3Smrg 5497ec681f3Smrg /* First pass: coalesce phis, which must be together. */ 5507ec681f3Smrg foreach_block (block, &ir->block_list) { 5517ec681f3Smrg foreach_instr (instr, &block->instr_list) { 5527ec681f3Smrg if (instr->opc != OPC_META_PHI) 5537ec681f3Smrg break; 5547ec681f3Smrg 5557ec681f3Smrg coalesce_phi(live, instr); 5567ec681f3Smrg } 5577ec681f3Smrg } 5587ec681f3Smrg 5597ec681f3Smrg /* Second pass: aggressively coalesce parallelcopy, split, collect */ 5607ec681f3Smrg foreach_block (block, &ir->block_list) { 5617ec681f3Smrg foreach_instr (instr, &block->instr_list) { 5627ec681f3Smrg switch (instr->opc) { 5637ec681f3Smrg case OPC_META_SPLIT: 5647ec681f3Smrg aggressive_coalesce_split(live, instr); 5657ec681f3Smrg break; 5667ec681f3Smrg case OPC_META_COLLECT: 5677ec681f3Smrg aggressive_coalesce_collect(live, instr); 5687ec681f3Smrg break; 5697ec681f3Smrg case OPC_META_PARALLEL_COPY: 5707ec681f3Smrg aggressive_coalesce_parallel_copy(live, instr); 5717ec681f3Smrg break; 5727ec681f3Smrg default: 5737ec681f3Smrg break; 5747ec681f3Smrg } 5757ec681f3Smrg } 5767ec681f3Smrg } 5777ec681f3Smrg 5787ec681f3Smrg index_merge_sets(live, ir); 5797ec681f3Smrg 5807ec681f3Smrg if (ir3_shader_debug & IR3_DBG_RAMSGS) 5817ec681f3Smrg dump_merge_sets(ir); 5827ec681f3Smrg} 583