17ec681f3Smrg/* 27ec681f3Smrg * Copyright © 2018 Valve Corporation 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 217ec681f3Smrg * IN THE SOFTWARE. 227ec681f3Smrg * 237ec681f3Smrg */ 247ec681f3Smrg 257ec681f3Smrg#include "aco_ir.h" 267ec681f3Smrg 277ec681f3Smrg#include <algorithm> 287ec681f3Smrg#include <array> 297ec681f3Smrg#include <bitset> 307ec681f3Smrg#include <map> 317ec681f3Smrg#include <set> 327ec681f3Smrg#include <unordered_map> 337ec681f3Smrg#include <vector> 347ec681f3Smrg 357ec681f3Smrgnamespace aco { 367ec681f3Smrgnamespace { 377ec681f3Smrg 387ec681f3Smrgstruct ra_ctx; 397ec681f3Smrg 407ec681f3Smrgunsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, 417ec681f3Smrg unsigned idx, RegClass rc); 427ec681f3Smrgvoid add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, 437ec681f3Smrg RegClass rc); 447ec681f3Smrgstd::pair<unsigned, unsigned> 457ec681f3Smrgget_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc); 467ec681f3Smrgvoid add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg); 477ec681f3Smrg 487ec681f3Smrgstruct assignment { 497ec681f3Smrg PhysReg reg; 507ec681f3Smrg RegClass rc; 517ec681f3Smrg bool assigned = false; 527ec681f3Smrg uint32_t affinity = 0; 537ec681f3Smrg assignment() = default; 547ec681f3Smrg assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {} 557ec681f3Smrg void set(const Definition& def) 567ec681f3Smrg { 577ec681f3Smrg assigned = true; 587ec681f3Smrg reg = def.physReg(); 597ec681f3Smrg rc = def.regClass(); 607ec681f3Smrg } 617ec681f3Smrg}; 627ec681f3Smrg 637ec681f3Smrgstruct ra_ctx { 647ec681f3Smrg 657ec681f3Smrg Program* program; 667ec681f3Smrg Block* block = NULL; 677ec681f3Smrg std::vector<assignment> assignments; 687ec681f3Smrg std::vector<std::unordered_map<unsigned, Temp>> renames; 697ec681f3Smrg std::vector<uint32_t> loop_header; 707ec681f3Smrg std::unordered_map<unsigned, Temp> orig_names; 717ec681f3Smrg std::unordered_map<unsigned, Instruction*> vectors; 727ec681f3Smrg std::unordered_map<unsigned, Instruction*> split_vectors; 737ec681f3Smrg aco_ptr<Instruction> pseudo_dummy; 747ec681f3Smrg uint16_t max_used_sgpr = 0; 757ec681f3Smrg uint16_t max_used_vgpr = 0; 767ec681f3Smrg uint16_t sgpr_limit; 777ec681f3Smrg uint16_t vgpr_limit; 787ec681f3Smrg std::bitset<512> war_hint; 797ec681f3Smrg std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */ 807ec681f3Smrg 817ec681f3Smrg ra_test_policy policy; 827ec681f3Smrg 837ec681f3Smrg ra_ctx(Program* program_, ra_test_policy policy_) 847ec681f3Smrg : program(program_), assignments(program->peekAllocationId()), 857ec681f3Smrg renames(program->blocks.size()), policy(policy_) 867ec681f3Smrg { 877ec681f3Smrg pseudo_dummy.reset( 887ec681f3Smrg create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); 897ec681f3Smrg sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); 907ec681f3Smrg vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); 917ec681f3Smrg } 927ec681f3Smrg}; 937ec681f3Smrg 947ec681f3Smrg/* Iterator type for making PhysRegInterval compatible with range-based for */ 957ec681f3Smrgstruct PhysRegIterator { 967ec681f3Smrg using difference_type = int; 977ec681f3Smrg using value_type = unsigned; 987ec681f3Smrg using reference = const unsigned&; 997ec681f3Smrg using pointer = const unsigned*; 1007ec681f3Smrg using iterator_category = std::bidirectional_iterator_tag; 1017ec681f3Smrg 1027ec681f3Smrg PhysReg reg; 1037ec681f3Smrg 1047ec681f3Smrg PhysReg operator*() const { return reg; } 1057ec681f3Smrg 1067ec681f3Smrg PhysRegIterator& operator++() 1077ec681f3Smrg { 1087ec681f3Smrg reg.reg_b += 4; 1097ec681f3Smrg return *this; 1107ec681f3Smrg } 1117ec681f3Smrg 1127ec681f3Smrg PhysRegIterator& operator--() 1137ec681f3Smrg { 1147ec681f3Smrg reg.reg_b -= 4; 1157ec681f3Smrg return *this; 1167ec681f3Smrg } 1177ec681f3Smrg 1187ec681f3Smrg bool operator==(PhysRegIterator oth) const { return reg == oth.reg; } 1197ec681f3Smrg 1207ec681f3Smrg bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; } 1217ec681f3Smrg 1227ec681f3Smrg bool operator<(PhysRegIterator oth) const { return reg < oth.reg; } 1237ec681f3Smrg}; 1247ec681f3Smrg 1257ec681f3Smrg/* Half-open register interval used in "sliding window"-style for-loops */ 1267ec681f3Smrgstruct PhysRegInterval { 1277ec681f3Smrg PhysReg lo_; 1287ec681f3Smrg unsigned size; 1297ec681f3Smrg 1307ec681f3Smrg /* Inclusive lower bound */ 1317ec681f3Smrg PhysReg lo() const { return lo_; } 1327ec681f3Smrg 1337ec681f3Smrg /* Exclusive upper bound */ 1347ec681f3Smrg PhysReg hi() const { return PhysReg{lo() + size}; } 1357ec681f3Smrg 1367ec681f3Smrg PhysRegInterval& operator+=(uint32_t stride) 1377ec681f3Smrg { 1387ec681f3Smrg lo_ = PhysReg{lo_.reg() + stride}; 1397ec681f3Smrg return *this; 1407ec681f3Smrg } 1417ec681f3Smrg 1427ec681f3Smrg bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; } 1437ec681f3Smrg 1447ec681f3Smrg /* Construct a half-open interval, excluding the end register */ 1457ec681f3Smrg static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; } 1467ec681f3Smrg 1477ec681f3Smrg bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); } 1487ec681f3Smrg 1497ec681f3Smrg bool contains(const PhysRegInterval& needle) const 1507ec681f3Smrg { 1517ec681f3Smrg return needle.lo() >= lo() && needle.hi() <= hi(); 1527ec681f3Smrg } 1537ec681f3Smrg 1547ec681f3Smrg PhysRegIterator begin() const { return {lo_}; } 1557ec681f3Smrg 1567ec681f3Smrg PhysRegIterator end() const { return {PhysReg{lo_ + size}}; } 1577ec681f3Smrg}; 1587ec681f3Smrg 1597ec681f3Smrgbool 1607ec681f3Smrgintersects(const PhysRegInterval& a, const PhysRegInterval& b) 1617ec681f3Smrg{ 1627ec681f3Smrg return a.hi() > b.lo() && b.hi() > a.lo(); 1637ec681f3Smrg} 1647ec681f3Smrg 1657ec681f3Smrg/* Gets the stride for full (non-subdword) registers */ 1667ec681f3Smrguint32_t 1677ec681f3Smrgget_stride(RegClass rc) 1687ec681f3Smrg{ 1697ec681f3Smrg if (rc.type() == RegType::vgpr) { 1707ec681f3Smrg return 1; 1717ec681f3Smrg } else { 1727ec681f3Smrg uint32_t size = rc.size(); 1737ec681f3Smrg if (size == 2) { 1747ec681f3Smrg return 2; 1757ec681f3Smrg } else if (size >= 4) { 1767ec681f3Smrg return 4; 1777ec681f3Smrg } else { 1787ec681f3Smrg return 1; 1797ec681f3Smrg } 1807ec681f3Smrg } 1817ec681f3Smrg} 1827ec681f3Smrg 1837ec681f3SmrgPhysRegInterval 1847ec681f3Smrgget_reg_bounds(Program* program, RegType type) 1857ec681f3Smrg{ 1867ec681f3Smrg if (type == RegType::vgpr) { 1877ec681f3Smrg return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr}; 1887ec681f3Smrg } else { 1897ec681f3Smrg return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr}; 1907ec681f3Smrg } 1917ec681f3Smrg} 1927ec681f3Smrg 1937ec681f3Smrgstruct DefInfo { 1947ec681f3Smrg PhysRegInterval bounds; 1957ec681f3Smrg uint8_t size; 1967ec681f3Smrg uint8_t stride; 1977ec681f3Smrg RegClass rc; 1987ec681f3Smrg 1997ec681f3Smrg DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_) 2007ec681f3Smrg { 2017ec681f3Smrg size = rc.size(); 2027ec681f3Smrg stride = get_stride(rc); 2037ec681f3Smrg 2047ec681f3Smrg bounds = get_reg_bounds(ctx.program, rc.type()); 2057ec681f3Smrg 2067ec681f3Smrg if (rc.is_subdword() && operand >= 0) { 2077ec681f3Smrg /* stride in bytes */ 2087ec681f3Smrg stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc); 2097ec681f3Smrg } else if (rc.is_subdword()) { 2107ec681f3Smrg std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc); 2117ec681f3Smrg stride = info.first; 2127ec681f3Smrg if (info.second > rc.bytes()) { 2137ec681f3Smrg rc = RegClass::get(rc.type(), info.second); 2147ec681f3Smrg size = rc.size(); 2157ec681f3Smrg /* we might still be able to put the definition in the high half, 2167ec681f3Smrg * but that's only useful for affinities and this information isn't 2177ec681f3Smrg * used for them */ 2187ec681f3Smrg stride = align(stride, info.second); 2197ec681f3Smrg if (!rc.is_subdword()) 2207ec681f3Smrg stride = DIV_ROUND_UP(stride, 4); 2217ec681f3Smrg } 2227ec681f3Smrg assert(stride > 0); 2237ec681f3Smrg } 2247ec681f3Smrg } 2257ec681f3Smrg}; 2267ec681f3Smrg 2277ec681f3Smrgclass RegisterFile { 2287ec681f3Smrgpublic: 2297ec681f3Smrg RegisterFile() { regs.fill(0); } 2307ec681f3Smrg 2317ec681f3Smrg std::array<uint32_t, 512> regs; 2327ec681f3Smrg std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs; 2337ec681f3Smrg 2347ec681f3Smrg const uint32_t& operator[](PhysReg index) const { return regs[index]; } 2357ec681f3Smrg 2367ec681f3Smrg uint32_t& operator[](PhysReg index) { return regs[index]; } 2377ec681f3Smrg 2387ec681f3Smrg unsigned count_zero(PhysRegInterval reg_interval) 2397ec681f3Smrg { 2407ec681f3Smrg unsigned res = 0; 2417ec681f3Smrg for (PhysReg reg : reg_interval) 2427ec681f3Smrg res += !regs[reg]; 2437ec681f3Smrg return res; 2447ec681f3Smrg } 2457ec681f3Smrg 2467ec681f3Smrg /* Returns true if any of the bytes in the given range are allocated or blocked */ 2477ec681f3Smrg bool test(PhysReg start, unsigned num_bytes) 2487ec681f3Smrg { 2497ec681f3Smrg for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) { 2507ec681f3Smrg assert(i <= 511); 2517ec681f3Smrg if (regs[i] & 0x0FFFFFFF) 2527ec681f3Smrg return true; 2537ec681f3Smrg if (regs[i] == 0xF0000000) { 2547ec681f3Smrg assert(subdword_regs.find(i) != subdword_regs.end()); 2557ec681f3Smrg for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) { 2567ec681f3Smrg if (subdword_regs[i][j]) 2577ec681f3Smrg return true; 2587ec681f3Smrg } 2597ec681f3Smrg } 2607ec681f3Smrg } 2617ec681f3Smrg return false; 2627ec681f3Smrg } 2637ec681f3Smrg 2647ec681f3Smrg void block(PhysReg start, RegClass rc) 2657ec681f3Smrg { 2667ec681f3Smrg if (rc.is_subdword()) 2677ec681f3Smrg fill_subdword(start, rc.bytes(), 0xFFFFFFFF); 2687ec681f3Smrg else 2697ec681f3Smrg fill(start, rc.size(), 0xFFFFFFFF); 2707ec681f3Smrg } 2717ec681f3Smrg 2727ec681f3Smrg bool is_blocked(PhysReg start) 2737ec681f3Smrg { 2747ec681f3Smrg if (regs[start] == 0xFFFFFFFF) 2757ec681f3Smrg return true; 2767ec681f3Smrg if (regs[start] == 0xF0000000) { 2777ec681f3Smrg for (unsigned i = start.byte(); i < 4; i++) 2787ec681f3Smrg if (subdword_regs[start][i] == 0xFFFFFFFF) 2797ec681f3Smrg return true; 2807ec681f3Smrg } 2817ec681f3Smrg return false; 2827ec681f3Smrg } 2837ec681f3Smrg 2847ec681f3Smrg bool is_empty_or_blocked(PhysReg start) 2857ec681f3Smrg { 2867ec681f3Smrg /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the 2877ec681f3Smrg * incremented value to 1 */ 2887ec681f3Smrg if (regs[start] == 0xF0000000) { 2897ec681f3Smrg return subdword_regs[start][start.byte()] + 1 <= 1; 2907ec681f3Smrg } 2917ec681f3Smrg return regs[start] + 1 <= 1; 2927ec681f3Smrg } 2937ec681f3Smrg 2947ec681f3Smrg void clear(PhysReg start, RegClass rc) 2957ec681f3Smrg { 2967ec681f3Smrg if (rc.is_subdword()) 2977ec681f3Smrg fill_subdword(start, rc.bytes(), 0); 2987ec681f3Smrg else 2997ec681f3Smrg fill(start, rc.size(), 0); 3007ec681f3Smrg } 3017ec681f3Smrg 3027ec681f3Smrg void fill(Operand op) 3037ec681f3Smrg { 3047ec681f3Smrg if (op.regClass().is_subdword()) 3057ec681f3Smrg fill_subdword(op.physReg(), op.bytes(), op.tempId()); 3067ec681f3Smrg else 3077ec681f3Smrg fill(op.physReg(), op.size(), op.tempId()); 3087ec681f3Smrg } 3097ec681f3Smrg 3107ec681f3Smrg void clear(Operand op) { clear(op.physReg(), op.regClass()); } 3117ec681f3Smrg 3127ec681f3Smrg void fill(Definition def) 3137ec681f3Smrg { 3147ec681f3Smrg if (def.regClass().is_subdword()) 3157ec681f3Smrg fill_subdword(def.physReg(), def.bytes(), def.tempId()); 3167ec681f3Smrg else 3177ec681f3Smrg fill(def.physReg(), def.size(), def.tempId()); 3187ec681f3Smrg } 3197ec681f3Smrg 3207ec681f3Smrg void clear(Definition def) { clear(def.physReg(), def.regClass()); } 3217ec681f3Smrg 3227ec681f3Smrg unsigned get_id(PhysReg reg) 3237ec681f3Smrg { 3247ec681f3Smrg return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg]; 3257ec681f3Smrg } 3267ec681f3Smrg 3277ec681f3Smrgprivate: 3287ec681f3Smrg void fill(PhysReg start, unsigned size, uint32_t val) 3297ec681f3Smrg { 3307ec681f3Smrg for (unsigned i = 0; i < size; i++) 3317ec681f3Smrg regs[start + i] = val; 3327ec681f3Smrg } 3337ec681f3Smrg 3347ec681f3Smrg void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val) 3357ec681f3Smrg { 3367ec681f3Smrg fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000); 3377ec681f3Smrg for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) { 3387ec681f3Smrg /* emplace or get */ 3397ec681f3Smrg std::array<uint32_t, 4>& sub = 3407ec681f3Smrg subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second; 3417ec681f3Smrg for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) 3427ec681f3Smrg sub[j] = val; 3437ec681f3Smrg 3447ec681f3Smrg if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) { 3457ec681f3Smrg subdword_regs.erase(i); 3467ec681f3Smrg regs[i] = 0; 3477ec681f3Smrg } 3487ec681f3Smrg } 3497ec681f3Smrg } 3507ec681f3Smrg}; 3517ec681f3Smrg 3527ec681f3Smrgstd::set<std::pair<unsigned, unsigned>> find_vars(ra_ctx& ctx, RegisterFile& reg_file, 3537ec681f3Smrg const PhysRegInterval reg_interval); 3547ec681f3Smrg 3557ec681f3Smrg/* helper function for debugging */ 3567ec681f3SmrgUNUSED void 3577ec681f3Smrgprint_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable) 3587ec681f3Smrg{ 3597ec681f3Smrg if (reg_file[reg] == 0xFFFFFFFF) { 3607ec681f3Smrg printf("☐"); 3617ec681f3Smrg } else if (reg_file[reg]) { 3627ec681f3Smrg const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000); 3637ec681f3Smrg if (show_subdword_alloc) { 3647ec681f3Smrg const char* block_chars[] = { 3657ec681f3Smrg // clang-format off 3667ec681f3Smrg "?", "▘", "▝", "▀", 3677ec681f3Smrg "▖", "▌", "▞", "▛", 3687ec681f3Smrg "▗", "▚", "▐", "▜", 3697ec681f3Smrg "▄", "▙", "▟", "▉" 3707ec681f3Smrg // clang-format on 3717ec681f3Smrg }; 3727ec681f3Smrg unsigned index = 0; 3737ec681f3Smrg for (int i = 0; i < 4; ++i) { 3747ec681f3Smrg if (reg_file.subdword_regs.at(reg)[i]) { 3757ec681f3Smrg index |= 1 << i; 3767ec681f3Smrg } 3777ec681f3Smrg } 3787ec681f3Smrg printf("%s", block_chars[index]); 3797ec681f3Smrg } else { 3807ec681f3Smrg /* Indicate filled register slot */ 3817ec681f3Smrg if (!has_adjacent_variable) { 3827ec681f3Smrg printf("█"); 3837ec681f3Smrg } else { 3847ec681f3Smrg /* Use a slightly shorter box to leave a small gap between adjacent variables */ 3857ec681f3Smrg printf("▉"); 3867ec681f3Smrg } 3877ec681f3Smrg } 3887ec681f3Smrg } else { 3897ec681f3Smrg printf("·"); 3907ec681f3Smrg } 3917ec681f3Smrg} 3927ec681f3Smrg 3937ec681f3Smrg/* helper function for debugging */ 3947ec681f3SmrgUNUSED void 3957ec681f3Smrgprint_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) 3967ec681f3Smrg{ 3977ec681f3Smrg PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr); 3987ec681f3Smrg char reg_char = vgprs ? 'v' : 's'; 3997ec681f3Smrg const int max_regs_per_line = 64; 4007ec681f3Smrg 4017ec681f3Smrg /* print markers */ 4027ec681f3Smrg printf(" "); 4037ec681f3Smrg for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) { 4047ec681f3Smrg printf("%-3.2u ", i); 4057ec681f3Smrg } 4067ec681f3Smrg printf("\n"); 4077ec681f3Smrg 4087ec681f3Smrg /* print usage */ 4097ec681f3Smrg auto line_begin_it = regs.begin(); 4107ec681f3Smrg while (line_begin_it != regs.end()) { 4117ec681f3Smrg const int regs_in_line = 4127ec681f3Smrg std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end())); 4137ec681f3Smrg 4147ec681f3Smrg if (line_begin_it == regs.begin()) { 4157ec681f3Smrg printf("%cgprs: ", reg_char); 4167ec681f3Smrg } else { 4177ec681f3Smrg printf(" %+4d ", std::distance(regs.begin(), line_begin_it)); 4187ec681f3Smrg } 4197ec681f3Smrg const auto line_end_it = std::next(line_begin_it, regs_in_line); 4207ec681f3Smrg 4217ec681f3Smrg for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) { 4227ec681f3Smrg bool has_adjacent_variable = 4237ec681f3Smrg (std::next(reg_it) != line_end_it && 4247ec681f3Smrg reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]); 4257ec681f3Smrg print_reg(reg_file, *reg_it, has_adjacent_variable); 4267ec681f3Smrg } 4277ec681f3Smrg 4287ec681f3Smrg line_begin_it = line_end_it; 4297ec681f3Smrg printf("\n"); 4307ec681f3Smrg } 4317ec681f3Smrg 4327ec681f3Smrg const unsigned free_regs = 4337ec681f3Smrg std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; }); 4347ec681f3Smrg printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size); 4357ec681f3Smrg 4367ec681f3Smrg /* print assignments ordered by registers */ 4377ec681f3Smrg std::map<PhysReg, std::pair<unsigned, unsigned>> 4387ec681f3Smrg regs_to_vars; /* maps to byte size and temp id */ 4397ec681f3Smrg for (const auto& size_id : find_vars(ctx, reg_file, regs)) { 4407ec681f3Smrg auto reg = ctx.assignments[size_id.second].reg; 4417ec681f3Smrg ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id); 4427ec681f3Smrg assert(inserted.second); 4437ec681f3Smrg } 4447ec681f3Smrg 4457ec681f3Smrg for (const auto& reg_and_var : regs_to_vars) { 4467ec681f3Smrg const auto& first_reg = reg_and_var.first; 4477ec681f3Smrg const auto& size_id = reg_and_var.second; 4487ec681f3Smrg 4497ec681f3Smrg printf("%%%u ", size_id.second); 4507ec681f3Smrg if (ctx.orig_names.count(size_id.second) && 4517ec681f3Smrg ctx.orig_names[size_id.second].id() != size_id.second) { 4527ec681f3Smrg printf("(was %%%d) ", ctx.orig_names[size_id.second].id()); 4537ec681f3Smrg } 4547ec681f3Smrg printf("= %c[%d", reg_char, first_reg.reg() - regs.lo()); 4557ec681f3Smrg PhysReg last_reg = first_reg.advance(size_id.first - 1); 4567ec681f3Smrg if (first_reg.reg() != last_reg.reg()) { 4577ec681f3Smrg assert(first_reg.byte() == 0 && last_reg.byte() == 3); 4587ec681f3Smrg printf("-%d", last_reg.reg() - regs.lo()); 4597ec681f3Smrg } 4607ec681f3Smrg printf("]"); 4617ec681f3Smrg if (first_reg.byte() != 0 || last_reg.byte() != 3) { 4627ec681f3Smrg printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8); 4637ec681f3Smrg } 4647ec681f3Smrg printf("\n"); 4657ec681f3Smrg } 4667ec681f3Smrg} 4677ec681f3Smrg 4687ec681f3Smrgunsigned 4697ec681f3Smrgget_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx, 4707ec681f3Smrg RegClass rc) 4717ec681f3Smrg{ 4727ec681f3Smrg if (instr->isPseudo()) { 4737ec681f3Smrg /* v_readfirstlane_b32 cannot use SDWA */ 4747ec681f3Smrg if (instr->opcode == aco_opcode::p_as_uniform) 4757ec681f3Smrg return 4; 4767ec681f3Smrg else if (chip >= GFX8) 4777ec681f3Smrg return rc.bytes() % 2 == 0 ? 2 : 1; 4787ec681f3Smrg else 4797ec681f3Smrg return 4; 4807ec681f3Smrg } 4817ec681f3Smrg 4827ec681f3Smrg assert(rc.bytes() <= 2); 4837ec681f3Smrg if (instr->isVALU()) { 4847ec681f3Smrg if (can_use_SDWA(chip, instr, false)) 4857ec681f3Smrg return rc.bytes(); 4867ec681f3Smrg if (can_use_opsel(chip, instr->opcode, idx, true)) 4877ec681f3Smrg return 2; 4887ec681f3Smrg if (instr->format == Format::VOP3P) 4897ec681f3Smrg return 2; 4907ec681f3Smrg } 4917ec681f3Smrg 4927ec681f3Smrg switch (instr->opcode) { 4937ec681f3Smrg case aco_opcode::v_cvt_f32_ubyte0: return 1; 4947ec681f3Smrg case aco_opcode::ds_write_b8: 4957ec681f3Smrg case aco_opcode::ds_write_b16: return chip >= GFX9 ? 2 : 4; 4967ec681f3Smrg case aco_opcode::buffer_store_byte: 4977ec681f3Smrg case aco_opcode::buffer_store_short: 4987ec681f3Smrg case aco_opcode::flat_store_byte: 4997ec681f3Smrg case aco_opcode::flat_store_short: 5007ec681f3Smrg case aco_opcode::scratch_store_byte: 5017ec681f3Smrg case aco_opcode::scratch_store_short: 5027ec681f3Smrg case aco_opcode::global_store_byte: 5037ec681f3Smrg case aco_opcode::global_store_short: return chip >= GFX9 ? 2 : 4; 5047ec681f3Smrg default: return 4; 5057ec681f3Smrg } 5067ec681f3Smrg} 5077ec681f3Smrg 5087ec681f3Smrgvoid 5097ec681f3Smrgadd_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, 5107ec681f3Smrg RegClass rc) 5117ec681f3Smrg{ 5127ec681f3Smrg chip_class chip = ctx.program->chip_class; 5137ec681f3Smrg if (instr->isPseudo() || byte == 0) 5147ec681f3Smrg return; 5157ec681f3Smrg 5167ec681f3Smrg assert(rc.bytes() <= 2); 5177ec681f3Smrg if (instr->isVALU()) { 5187ec681f3Smrg /* check if we can use opsel */ 5197ec681f3Smrg if (instr->format == Format::VOP3) { 5207ec681f3Smrg assert(byte == 2); 5217ec681f3Smrg instr->vop3().opsel |= 1 << idx; 5227ec681f3Smrg return; 5237ec681f3Smrg } 5247ec681f3Smrg if (instr->isVOP3P()) { 5257ec681f3Smrg assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx))); 5267ec681f3Smrg instr->vop3p().opsel_lo |= 1 << idx; 5277ec681f3Smrg instr->vop3p().opsel_hi |= 1 << idx; 5287ec681f3Smrg return; 5297ec681f3Smrg } 5307ec681f3Smrg if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) { 5317ec681f3Smrg switch (byte) { 5327ec681f3Smrg case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break; 5337ec681f3Smrg case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break; 5347ec681f3Smrg case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break; 5357ec681f3Smrg case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break; 5367ec681f3Smrg } 5377ec681f3Smrg return; 5387ec681f3Smrg } 5397ec681f3Smrg 5407ec681f3Smrg /* use SDWA */ 5417ec681f3Smrg assert(can_use_SDWA(chip, instr, false)); 5427ec681f3Smrg convert_to_SDWA(chip, instr); 5437ec681f3Smrg return; 5447ec681f3Smrg } 5457ec681f3Smrg 5467ec681f3Smrg assert(byte == 2); 5477ec681f3Smrg if (instr->opcode == aco_opcode::ds_write_b8) 5487ec681f3Smrg instr->opcode = aco_opcode::ds_write_b8_d16_hi; 5497ec681f3Smrg else if (instr->opcode == aco_opcode::ds_write_b16) 5507ec681f3Smrg instr->opcode = aco_opcode::ds_write_b16_d16_hi; 5517ec681f3Smrg else if (instr->opcode == aco_opcode::buffer_store_byte) 5527ec681f3Smrg instr->opcode = aco_opcode::buffer_store_byte_d16_hi; 5537ec681f3Smrg else if (instr->opcode == aco_opcode::buffer_store_short) 5547ec681f3Smrg instr->opcode = aco_opcode::buffer_store_short_d16_hi; 5557ec681f3Smrg else if (instr->opcode == aco_opcode::flat_store_byte) 5567ec681f3Smrg instr->opcode = aco_opcode::flat_store_byte_d16_hi; 5577ec681f3Smrg else if (instr->opcode == aco_opcode::flat_store_short) 5587ec681f3Smrg instr->opcode = aco_opcode::flat_store_short_d16_hi; 5597ec681f3Smrg else if (instr->opcode == aco_opcode::scratch_store_byte) 5607ec681f3Smrg instr->opcode = aco_opcode::scratch_store_byte_d16_hi; 5617ec681f3Smrg else if (instr->opcode == aco_opcode::scratch_store_short) 5627ec681f3Smrg instr->opcode = aco_opcode::scratch_store_short_d16_hi; 5637ec681f3Smrg else if (instr->opcode == aco_opcode::global_store_byte) 5647ec681f3Smrg instr->opcode = aco_opcode::global_store_byte_d16_hi; 5657ec681f3Smrg else if (instr->opcode == aco_opcode::global_store_short) 5667ec681f3Smrg instr->opcode = aco_opcode::global_store_short_d16_hi; 5677ec681f3Smrg else 5687ec681f3Smrg unreachable("Something went wrong: Impossible register assignment."); 5697ec681f3Smrg return; 5707ec681f3Smrg} 5717ec681f3Smrg 5727ec681f3Smrg/* minimum_stride, bytes_written */ 5737ec681f3Smrgstd::pair<unsigned, unsigned> 5747ec681f3Smrgget_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc) 5757ec681f3Smrg{ 5767ec681f3Smrg chip_class chip = program->chip_class; 5777ec681f3Smrg 5787ec681f3Smrg if (instr->isPseudo()) { 5797ec681f3Smrg if (chip >= GFX8) 5807ec681f3Smrg return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes()); 5817ec681f3Smrg else 5827ec681f3Smrg return std::make_pair(4, rc.size() * 4u); 5837ec681f3Smrg } 5847ec681f3Smrg 5857ec681f3Smrg if (instr->isVALU() || instr->isVINTRP()) { 5867ec681f3Smrg assert(rc.bytes() <= 2); 5877ec681f3Smrg 5887ec681f3Smrg if (can_use_SDWA(chip, instr, false)) 5897ec681f3Smrg return std::make_pair(rc.bytes(), rc.bytes()); 5907ec681f3Smrg 5917ec681f3Smrg unsigned bytes_written = 4u; 5927ec681f3Smrg if (instr_is_16bit(chip, instr->opcode)) 5937ec681f3Smrg bytes_written = 2u; 5947ec681f3Smrg 5957ec681f3Smrg unsigned stride = 4u; 5967ec681f3Smrg if (instr->opcode == aco_opcode::v_fma_mixlo_f16 || 5977ec681f3Smrg can_use_opsel(chip, instr->opcode, -1, true)) 5987ec681f3Smrg stride = 2u; 5997ec681f3Smrg 6007ec681f3Smrg return std::make_pair(stride, bytes_written); 6017ec681f3Smrg } 6027ec681f3Smrg 6037ec681f3Smrg switch (instr->opcode) { 6047ec681f3Smrg case aco_opcode::ds_read_u8_d16: 6057ec681f3Smrg case aco_opcode::ds_read_i8_d16: 6067ec681f3Smrg case aco_opcode::ds_read_u16_d16: 6077ec681f3Smrg case aco_opcode::flat_load_ubyte_d16: 6087ec681f3Smrg case aco_opcode::flat_load_sbyte_d16: 6097ec681f3Smrg case aco_opcode::flat_load_short_d16: 6107ec681f3Smrg case aco_opcode::global_load_ubyte_d16: 6117ec681f3Smrg case aco_opcode::global_load_sbyte_d16: 6127ec681f3Smrg case aco_opcode::global_load_short_d16: 6137ec681f3Smrg case aco_opcode::scratch_load_ubyte_d16: 6147ec681f3Smrg case aco_opcode::scratch_load_sbyte_d16: 6157ec681f3Smrg case aco_opcode::scratch_load_short_d16: 6167ec681f3Smrg case aco_opcode::buffer_load_ubyte_d16: 6177ec681f3Smrg case aco_opcode::buffer_load_sbyte_d16: 6187ec681f3Smrg case aco_opcode::buffer_load_short_d16: { 6197ec681f3Smrg assert(chip >= GFX9); 6207ec681f3Smrg if (!program->dev.sram_ecc_enabled) 6217ec681f3Smrg return std::make_pair(2u, 2u); 6227ec681f3Smrg else 6237ec681f3Smrg return std::make_pair(2u, 4u); 6247ec681f3Smrg } 6257ec681f3Smrg 6267ec681f3Smrg default: return std::make_pair(4, rc.size() * 4u); 6277ec681f3Smrg } 6287ec681f3Smrg} 6297ec681f3Smrg 6307ec681f3Smrgvoid 6317ec681f3Smrgadd_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg) 6327ec681f3Smrg{ 6337ec681f3Smrg if (instr->isPseudo()) 6347ec681f3Smrg return; 6357ec681f3Smrg 6367ec681f3Smrg if (instr->isVALU()) { 6377ec681f3Smrg chip_class chip = program->chip_class; 6387ec681f3Smrg assert(instr->definitions[0].bytes() <= 2); 6397ec681f3Smrg 6407ec681f3Smrg if (reg.byte() == 0 && instr_is_16bit(chip, instr->opcode)) 6417ec681f3Smrg return; 6427ec681f3Smrg 6437ec681f3Smrg /* check if we can use opsel */ 6447ec681f3Smrg if (instr->format == Format::VOP3) { 6457ec681f3Smrg assert(reg.byte() == 2); 6467ec681f3Smrg assert(can_use_opsel(chip, instr->opcode, -1, true)); 6477ec681f3Smrg instr->vop3().opsel |= (1 << 3); /* dst in high half */ 6487ec681f3Smrg return; 6497ec681f3Smrg } 6507ec681f3Smrg 6517ec681f3Smrg if (instr->opcode == aco_opcode::v_fma_mixlo_f16) { 6527ec681f3Smrg instr->opcode = aco_opcode::v_fma_mixhi_f16; 6537ec681f3Smrg return; 6547ec681f3Smrg } 6557ec681f3Smrg 6567ec681f3Smrg /* use SDWA */ 6577ec681f3Smrg assert(can_use_SDWA(chip, instr, false)); 6587ec681f3Smrg convert_to_SDWA(chip, instr); 6597ec681f3Smrg return; 6607ec681f3Smrg } 6617ec681f3Smrg 6627ec681f3Smrg if (reg.byte() == 0) 6637ec681f3Smrg return; 6647ec681f3Smrg else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16) 6657ec681f3Smrg instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi; 6667ec681f3Smrg else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16) 6677ec681f3Smrg instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi; 6687ec681f3Smrg else if (instr->opcode == aco_opcode::buffer_load_short_d16) 6697ec681f3Smrg instr->opcode = aco_opcode::buffer_load_short_d16_hi; 6707ec681f3Smrg else if (instr->opcode == aco_opcode::flat_load_ubyte_d16) 6717ec681f3Smrg instr->opcode = aco_opcode::flat_load_ubyte_d16_hi; 6727ec681f3Smrg else if (instr->opcode == aco_opcode::flat_load_sbyte_d16) 6737ec681f3Smrg instr->opcode = aco_opcode::flat_load_sbyte_d16_hi; 6747ec681f3Smrg else if (instr->opcode == aco_opcode::flat_load_short_d16) 6757ec681f3Smrg instr->opcode = aco_opcode::flat_load_short_d16_hi; 6767ec681f3Smrg else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16) 6777ec681f3Smrg instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi; 6787ec681f3Smrg else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16) 6797ec681f3Smrg instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi; 6807ec681f3Smrg else if (instr->opcode == aco_opcode::scratch_load_short_d16) 6817ec681f3Smrg instr->opcode = aco_opcode::scratch_load_short_d16_hi; 6827ec681f3Smrg else if (instr->opcode == aco_opcode::global_load_ubyte_d16) 6837ec681f3Smrg instr->opcode = aco_opcode::global_load_ubyte_d16_hi; 6847ec681f3Smrg else if (instr->opcode == aco_opcode::global_load_sbyte_d16) 6857ec681f3Smrg instr->opcode = aco_opcode::global_load_sbyte_d16_hi; 6867ec681f3Smrg else if (instr->opcode == aco_opcode::global_load_short_d16) 6877ec681f3Smrg instr->opcode = aco_opcode::global_load_short_d16_hi; 6887ec681f3Smrg else if (instr->opcode == aco_opcode::ds_read_u8_d16) 6897ec681f3Smrg instr->opcode = aco_opcode::ds_read_u8_d16_hi; 6907ec681f3Smrg else if (instr->opcode == aco_opcode::ds_read_i8_d16) 6917ec681f3Smrg instr->opcode = aco_opcode::ds_read_i8_d16_hi; 6927ec681f3Smrg else if (instr->opcode == aco_opcode::ds_read_u16_d16) 6937ec681f3Smrg instr->opcode = aco_opcode::ds_read_u16_d16_hi; 6947ec681f3Smrg else 6957ec681f3Smrg unreachable("Something went wrong: Impossible register assignment."); 6967ec681f3Smrg} 6977ec681f3Smrg 6987ec681f3Smrgvoid 6997ec681f3Smrgadjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) 7007ec681f3Smrg{ 7017ec681f3Smrg uint16_t max_addressible_sgpr = ctx.sgpr_limit; 7027ec681f3Smrg unsigned size = rc.size(); 7037ec681f3Smrg if (rc.type() == RegType::vgpr) { 7047ec681f3Smrg assert(reg >= 256); 7057ec681f3Smrg uint16_t hi = reg - 256 + size - 1; 7067ec681f3Smrg ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi); 7077ec681f3Smrg } else if (reg + rc.size() <= max_addressible_sgpr) { 7087ec681f3Smrg uint16_t hi = reg + size - 1; 7097ec681f3Smrg ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr)); 7107ec681f3Smrg } 7117ec681f3Smrg} 7127ec681f3Smrg 7137ec681f3Smrgenum UpdateRenames { 7147ec681f3Smrg rename_not_killed_ops = 0x1, 7157ec681f3Smrg fill_killed_ops = 0x2, 7167ec681f3Smrg}; 7177ec681f3SmrgMESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames); 7187ec681f3Smrg 7197ec681f3Smrgvoid 7207ec681f3Smrgupdate_renames(ra_ctx& ctx, RegisterFile& reg_file, 7217ec681f3Smrg std::vector<std::pair<Operand, Definition>>& parallelcopies, 7227ec681f3Smrg aco_ptr<Instruction>& instr, UpdateRenames flags) 7237ec681f3Smrg{ 7247ec681f3Smrg /* clear operands */ 7257ec681f3Smrg for (std::pair<Operand, Definition>& copy : parallelcopies) { 7267ec681f3Smrg /* the definitions with id are not from this function and already handled */ 7277ec681f3Smrg if (copy.second.isTemp()) 7287ec681f3Smrg continue; 7297ec681f3Smrg reg_file.clear(copy.first); 7307ec681f3Smrg } 7317ec681f3Smrg 7327ec681f3Smrg /* allocate id's and rename operands: this is done transparently here */ 7337ec681f3Smrg auto it = parallelcopies.begin(); 7347ec681f3Smrg while (it != parallelcopies.end()) { 7357ec681f3Smrg if (it->second.isTemp()) { 7367ec681f3Smrg ++it; 7377ec681f3Smrg continue; 7387ec681f3Smrg } 7397ec681f3Smrg 7407ec681f3Smrg /* check if we moved a definition: change the register and remove copy */ 7417ec681f3Smrg bool is_def = false; 7427ec681f3Smrg for (Definition& def : instr->definitions) { 7437ec681f3Smrg if (def.isTemp() && def.getTemp() == it->first.getTemp()) { 7447ec681f3Smrg // FIXME: ensure that the definition can use this reg 7457ec681f3Smrg def.setFixed(it->second.physReg()); 7467ec681f3Smrg reg_file.fill(def); 7477ec681f3Smrg ctx.assignments[def.tempId()].reg = def.physReg(); 7487ec681f3Smrg it = parallelcopies.erase(it); 7497ec681f3Smrg is_def = true; 7507ec681f3Smrg break; 7517ec681f3Smrg } 7527ec681f3Smrg } 7537ec681f3Smrg if (is_def) 7547ec681f3Smrg continue; 7557ec681f3Smrg 7567ec681f3Smrg /* check if we moved another parallelcopy definition */ 7577ec681f3Smrg for (std::pair<Operand, Definition>& other : parallelcopies) { 7587ec681f3Smrg if (!other.second.isTemp()) 7597ec681f3Smrg continue; 7607ec681f3Smrg if (it->first.getTemp() == other.second.getTemp()) { 7617ec681f3Smrg other.second.setFixed(it->second.physReg()); 7627ec681f3Smrg ctx.assignments[other.second.tempId()].reg = other.second.physReg(); 7637ec681f3Smrg it = parallelcopies.erase(it); 7647ec681f3Smrg is_def = true; 7657ec681f3Smrg /* check if we moved an operand, again */ 7667ec681f3Smrg bool fill = true; 7677ec681f3Smrg for (Operand& op : instr->operands) { 7687ec681f3Smrg if (op.isTemp() && op.tempId() == other.second.tempId()) { 7697ec681f3Smrg // FIXME: ensure that the operand can use this reg 7707ec681f3Smrg op.setFixed(other.second.physReg()); 7717ec681f3Smrg fill = (flags & fill_killed_ops) || !op.isKillBeforeDef(); 7727ec681f3Smrg } 7737ec681f3Smrg } 7747ec681f3Smrg if (fill) 7757ec681f3Smrg reg_file.fill(other.second); 7767ec681f3Smrg break; 7777ec681f3Smrg } 7787ec681f3Smrg } 7797ec681f3Smrg if (is_def) 7807ec681f3Smrg continue; 7817ec681f3Smrg 7827ec681f3Smrg std::pair<Operand, Definition>& copy = *it; 7837ec681f3Smrg copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass())); 7847ec681f3Smrg ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass()); 7857ec681f3Smrg assert(ctx.assignments.size() == ctx.program->peekAllocationId()); 7867ec681f3Smrg 7877ec681f3Smrg /* check if we moved an operand */ 7887ec681f3Smrg bool first = true; 7897ec681f3Smrg bool fill = true; 7907ec681f3Smrg for (unsigned i = 0; i < instr->operands.size(); i++) { 7917ec681f3Smrg Operand& op = instr->operands[i]; 7927ec681f3Smrg if (!op.isTemp()) 7937ec681f3Smrg continue; 7947ec681f3Smrg if (op.tempId() == copy.first.tempId()) { 7957ec681f3Smrg bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef(); 7967ec681f3Smrg for (std::pair<Operand, Definition>& pc : parallelcopies) { 7977ec681f3Smrg PhysReg def_reg = pc.second.physReg(); 7987ec681f3Smrg omit_renaming &= def_reg > copy.first.physReg() 7997ec681f3Smrg ? (copy.first.physReg() + copy.first.size() <= def_reg.reg()) 8007ec681f3Smrg : (def_reg + pc.second.size() <= copy.first.physReg().reg()); 8017ec681f3Smrg } 8027ec681f3Smrg if (omit_renaming) { 8037ec681f3Smrg if (first) 8047ec681f3Smrg op.setFirstKill(true); 8057ec681f3Smrg else 8067ec681f3Smrg op.setKill(true); 8077ec681f3Smrg first = false; 8087ec681f3Smrg continue; 8097ec681f3Smrg } 8107ec681f3Smrg op.setTemp(copy.second.getTemp()); 8117ec681f3Smrg op.setFixed(copy.second.physReg()); 8127ec681f3Smrg 8137ec681f3Smrg fill = (flags & fill_killed_ops) || !op.isKillBeforeDef(); 8147ec681f3Smrg } 8157ec681f3Smrg } 8167ec681f3Smrg 8177ec681f3Smrg if (fill) 8187ec681f3Smrg reg_file.fill(copy.second); 8197ec681f3Smrg 8207ec681f3Smrg ++it; 8217ec681f3Smrg } 8227ec681f3Smrg} 8237ec681f3Smrg 8247ec681f3Smrgstd::pair<PhysReg, bool> 8257ec681f3Smrgget_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info) 8267ec681f3Smrg{ 8277ec681f3Smrg const PhysRegInterval& bounds = info.bounds; 8287ec681f3Smrg uint32_t size = info.size; 8297ec681f3Smrg uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride; 8307ec681f3Smrg RegClass rc = info.rc; 8317ec681f3Smrg 8327ec681f3Smrg DefInfo new_info = info; 8337ec681f3Smrg new_info.rc = RegClass(rc.type(), size); 8347ec681f3Smrg for (unsigned new_stride = 16; new_stride > stride; new_stride /= 2) { 8357ec681f3Smrg if (size % new_stride) 8367ec681f3Smrg continue; 8377ec681f3Smrg new_info.stride = new_stride; 8387ec681f3Smrg std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info); 8397ec681f3Smrg if (res.second) 8407ec681f3Smrg return res; 8417ec681f3Smrg } 8427ec681f3Smrg 8437ec681f3Smrg auto is_free = [&](PhysReg reg_index) 8447ec681f3Smrg { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; }; 8457ec681f3Smrg 8467ec681f3Smrg if (stride == 1) { 8477ec681f3Smrg /* best fit algorithm: find the smallest gap to fit in the variable */ 8487ec681f3Smrg PhysRegInterval best_gap{PhysReg{0}, UINT_MAX}; 8497ec681f3Smrg const unsigned max_gpr = 8507ec681f3Smrg (rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr; 8517ec681f3Smrg 8527ec681f3Smrg PhysRegIterator reg_it = bounds.begin(); 8537ec681f3Smrg const PhysRegIterator end_it = 8547ec681f3Smrg std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it)); 8557ec681f3Smrg while (reg_it != bounds.end()) { 8567ec681f3Smrg /* Find the next chunk of available register slots */ 8577ec681f3Smrg reg_it = std::find_if(reg_it, end_it, is_free); 8587ec681f3Smrg auto next_nonfree_it = std::find_if_not(reg_it, end_it, is_free); 8597ec681f3Smrg if (reg_it == bounds.end()) { 8607ec681f3Smrg break; 8617ec681f3Smrg } 8627ec681f3Smrg 8637ec681f3Smrg if (next_nonfree_it == end_it) { 8647ec681f3Smrg /* All registers past max_used_gpr are free */ 8657ec681f3Smrg next_nonfree_it = bounds.end(); 8667ec681f3Smrg } 8677ec681f3Smrg 8687ec681f3Smrg PhysRegInterval gap = PhysRegInterval::from_until(*reg_it, *next_nonfree_it); 8697ec681f3Smrg 8707ec681f3Smrg /* early return on exact matches */ 8717ec681f3Smrg if (size == gap.size) { 8727ec681f3Smrg adjust_max_used_regs(ctx, rc, gap.lo()); 8737ec681f3Smrg return {gap.lo(), true}; 8747ec681f3Smrg } 8757ec681f3Smrg 8767ec681f3Smrg /* check if it fits and the gap size is smaller */ 8777ec681f3Smrg if (size < gap.size && gap.size < best_gap.size) { 8787ec681f3Smrg best_gap = gap; 8797ec681f3Smrg } 8807ec681f3Smrg 8817ec681f3Smrg /* Move past the processed chunk */ 8827ec681f3Smrg reg_it = next_nonfree_it; 8837ec681f3Smrg } 8847ec681f3Smrg 8857ec681f3Smrg if (best_gap.size == UINT_MAX) 8867ec681f3Smrg return {{}, false}; 8877ec681f3Smrg 8887ec681f3Smrg /* find best position within gap by leaving a good stride for other variables*/ 8897ec681f3Smrg unsigned buffer = best_gap.size - size; 8907ec681f3Smrg if (buffer > 1) { 8917ec681f3Smrg if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) || 8927ec681f3Smrg ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) || 8937ec681f3Smrg ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0)) 8947ec681f3Smrg best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer}; 8957ec681f3Smrg } 8967ec681f3Smrg 8977ec681f3Smrg adjust_max_used_regs(ctx, rc, best_gap.lo()); 8987ec681f3Smrg return {best_gap.lo(), true}; 8997ec681f3Smrg } 9007ec681f3Smrg 9017ec681f3Smrg for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi(); 9027ec681f3Smrg reg_win += stride) { 9037ec681f3Smrg if (reg_file[reg_win.lo()] != 0) { 9047ec681f3Smrg continue; 9057ec681f3Smrg } 9067ec681f3Smrg 9077ec681f3Smrg bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free); 9087ec681f3Smrg if (is_valid) { 9097ec681f3Smrg adjust_max_used_regs(ctx, rc, reg_win.lo()); 9107ec681f3Smrg return {reg_win.lo(), true}; 9117ec681f3Smrg } 9127ec681f3Smrg } 9137ec681f3Smrg 9147ec681f3Smrg /* do this late because using the upper bytes of a register can require 9157ec681f3Smrg * larger instruction encodings or copies 9167ec681f3Smrg * TODO: don't do this in situations where it doesn't benefit */ 9177ec681f3Smrg if (rc.is_subdword()) { 9187ec681f3Smrg for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) { 9197ec681f3Smrg assert(reg_file[PhysReg{entry.first}] == 0xF0000000); 9207ec681f3Smrg if (!bounds.contains({PhysReg{entry.first}, rc.size()})) 9217ec681f3Smrg continue; 9227ec681f3Smrg 9237ec681f3Smrg for (unsigned i = 0; i < 4; i += info.stride) { 9247ec681f3Smrg /* check if there's a block of free bytes large enough to hold the register */ 9257ec681f3Smrg bool reg_found = 9267ec681f3Smrg std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())], 9277ec681f3Smrg [](unsigned v) { return v == 0; }); 9287ec681f3Smrg 9297ec681f3Smrg /* check if also the neighboring reg is free if needed */ 9307ec681f3Smrg if (reg_found && i + rc.bytes() > 4) 9317ec681f3Smrg reg_found = (reg_file[PhysReg{entry.first + 1}] == 0); 9327ec681f3Smrg 9337ec681f3Smrg if (reg_found) { 9347ec681f3Smrg PhysReg res{entry.first}; 9357ec681f3Smrg res.reg_b += i; 9367ec681f3Smrg adjust_max_used_regs(ctx, rc, entry.first); 9377ec681f3Smrg return {res, true}; 9387ec681f3Smrg } 9397ec681f3Smrg } 9407ec681f3Smrg } 9417ec681f3Smrg } 9427ec681f3Smrg 9437ec681f3Smrg return {{}, false}; 9447ec681f3Smrg} 9457ec681f3Smrg 9467ec681f3Smrg/* collect variables from a register area and clear reg_file */ 9477ec681f3Smrgstd::set<std::pair<unsigned, unsigned>> 9487ec681f3Smrgfind_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) 9497ec681f3Smrg{ 9507ec681f3Smrg std::set<std::pair<unsigned, unsigned>> vars; 9517ec681f3Smrg for (PhysReg j : reg_interval) { 9527ec681f3Smrg if (reg_file.is_blocked(j)) 9537ec681f3Smrg continue; 9547ec681f3Smrg if (reg_file[j] == 0xF0000000) { 9557ec681f3Smrg for (unsigned k = 0; k < 4; k++) { 9567ec681f3Smrg unsigned id = reg_file.subdword_regs[j][k]; 9577ec681f3Smrg if (id) { 9587ec681f3Smrg assignment& var = ctx.assignments[id]; 9597ec681f3Smrg vars.emplace(var.rc.bytes(), id); 9607ec681f3Smrg } 9617ec681f3Smrg } 9627ec681f3Smrg } else if (reg_file[j] != 0) { 9637ec681f3Smrg unsigned id = reg_file[j]; 9647ec681f3Smrg assignment& var = ctx.assignments[id]; 9657ec681f3Smrg vars.emplace(var.rc.bytes(), id); 9667ec681f3Smrg } 9677ec681f3Smrg } 9687ec681f3Smrg return vars; 9697ec681f3Smrg} 9707ec681f3Smrg 9717ec681f3Smrg/* collect variables from a register area and clear reg_file */ 9727ec681f3Smrgstd::set<std::pair<unsigned, unsigned>> 9737ec681f3Smrgcollect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval) 9747ec681f3Smrg{ 9757ec681f3Smrg std::set<std::pair<unsigned, unsigned>> vars = find_vars(ctx, reg_file, reg_interval); 9767ec681f3Smrg for (std::pair<unsigned, unsigned> size_id : vars) { 9777ec681f3Smrg assignment& var = ctx.assignments[size_id.second]; 9787ec681f3Smrg reg_file.clear(var.reg, var.rc); 9797ec681f3Smrg } 9807ec681f3Smrg return vars; 9817ec681f3Smrg} 9827ec681f3Smrg 9837ec681f3Smrgbool 9847ec681f3Smrgget_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, 9857ec681f3Smrg std::vector<std::pair<Operand, Definition>>& parallelcopies, 9867ec681f3Smrg const std::set<std::pair<unsigned, unsigned>>& vars, 9877ec681f3Smrg const PhysRegInterval bounds, aco_ptr<Instruction>& instr, 9887ec681f3Smrg const PhysRegInterval def_reg) 9897ec681f3Smrg{ 9907ec681f3Smrg /* variables are sorted from small sized to large */ 9917ec681f3Smrg /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders 9927ec681f3Smrg * slightly though. */ 9937ec681f3Smrg for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); 9947ec681f3Smrg it != vars.rend(); ++it) { 9957ec681f3Smrg unsigned id = it->second; 9967ec681f3Smrg assignment& var = ctx.assignments[id]; 9977ec681f3Smrg DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1); 9987ec681f3Smrg uint32_t size = info.size; 9997ec681f3Smrg 10007ec681f3Smrg /* check if this is a dead operand, then we can re-use the space from the definition 10017ec681f3Smrg * also use the correct stride for sub-dword operands */ 10027ec681f3Smrg bool is_dead_operand = false; 10037ec681f3Smrg for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { 10047ec681f3Smrg if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { 10057ec681f3Smrg if (instr->operands[i].isKillBeforeDef()) 10067ec681f3Smrg is_dead_operand = true; 10077ec681f3Smrg info = DefInfo(ctx, instr, var.rc, i); 10087ec681f3Smrg break; 10097ec681f3Smrg } 10107ec681f3Smrg } 10117ec681f3Smrg 10127ec681f3Smrg std::pair<PhysReg, bool> res; 10137ec681f3Smrg if (is_dead_operand) { 10147ec681f3Smrg if (instr->opcode == aco_opcode::p_create_vector) { 10157ec681f3Smrg PhysReg reg(def_reg.lo()); 10167ec681f3Smrg for (unsigned i = 0; i < instr->operands.size(); i++) { 10177ec681f3Smrg if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { 10187ec681f3Smrg res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && 10197ec681f3Smrg !reg_file.test(reg, var.rc.bytes())}; 10207ec681f3Smrg break; 10217ec681f3Smrg } 10227ec681f3Smrg reg.reg_b += instr->operands[i].bytes(); 10237ec681f3Smrg } 10247ec681f3Smrg if (!res.second) 10257ec681f3Smrg res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())}; 10267ec681f3Smrg } else { 10277ec681f3Smrg info.bounds = def_reg; 10287ec681f3Smrg res = get_reg_simple(ctx, reg_file, info); 10297ec681f3Smrg } 10307ec681f3Smrg } else { 10317ec681f3Smrg /* Try to find space within the bounds but outside of the definition */ 10327ec681f3Smrg info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi())); 10337ec681f3Smrg res = get_reg_simple(ctx, reg_file, info); 10347ec681f3Smrg if (!res.second && def_reg.hi() <= bounds.hi()) { 10357ec681f3Smrg unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1); 10367ec681f3Smrg info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi()); 10377ec681f3Smrg res = get_reg_simple(ctx, reg_file, info); 10387ec681f3Smrg } 10397ec681f3Smrg } 10407ec681f3Smrg 10417ec681f3Smrg if (res.second) { 10427ec681f3Smrg /* mark the area as blocked */ 10437ec681f3Smrg reg_file.block(res.first, var.rc); 10447ec681f3Smrg 10457ec681f3Smrg /* create parallelcopy pair (without definition id) */ 10467ec681f3Smrg Temp tmp = Temp(id, var.rc); 10477ec681f3Smrg Operand pc_op = Operand(tmp); 10487ec681f3Smrg pc_op.setFixed(var.reg); 10497ec681f3Smrg Definition pc_def = Definition(res.first, pc_op.regClass()); 10507ec681f3Smrg parallelcopies.emplace_back(pc_op, pc_def); 10517ec681f3Smrg continue; 10527ec681f3Smrg } 10537ec681f3Smrg 10547ec681f3Smrg PhysReg best_pos = bounds.lo(); 10557ec681f3Smrg unsigned num_moves = 0xFF; 10567ec681f3Smrg unsigned num_vars = 0; 10577ec681f3Smrg 10587ec681f3Smrg /* we use a sliding window to find potential positions */ 10597ec681f3Smrg unsigned stride = var.rc.is_subdword() ? 1 : info.stride; 10607ec681f3Smrg for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi(); 10617ec681f3Smrg reg_win += stride) { 10627ec681f3Smrg if (!is_dead_operand && intersects(reg_win, def_reg)) 10637ec681f3Smrg continue; 10647ec681f3Smrg 10657ec681f3Smrg /* second, check that we have at most k=num_moves elements in the window 10667ec681f3Smrg * and no element is larger than the currently processed one */ 10677ec681f3Smrg unsigned k = 0; 10687ec681f3Smrg unsigned n = 0; 10697ec681f3Smrg unsigned last_var = 0; 10707ec681f3Smrg bool found = true; 10717ec681f3Smrg for (PhysReg j : reg_win) { 10727ec681f3Smrg if (reg_file[j] == 0 || reg_file[j] == last_var) 10737ec681f3Smrg continue; 10747ec681f3Smrg 10757ec681f3Smrg if (reg_file.is_blocked(j) || k > num_moves) { 10767ec681f3Smrg found = false; 10777ec681f3Smrg break; 10787ec681f3Smrg } 10797ec681f3Smrg if (reg_file[j] == 0xF0000000) { 10807ec681f3Smrg k += 1; 10817ec681f3Smrg n++; 10827ec681f3Smrg continue; 10837ec681f3Smrg } 10847ec681f3Smrg /* we cannot split live ranges of linear vgprs inside control flow */ 10857ec681f3Smrg if (!(ctx.block->kind & block_kind_top_level) && 10867ec681f3Smrg ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) { 10877ec681f3Smrg found = false; 10887ec681f3Smrg break; 10897ec681f3Smrg } 10907ec681f3Smrg bool is_kill = false; 10917ec681f3Smrg for (const Operand& op : instr->operands) { 10927ec681f3Smrg if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) { 10937ec681f3Smrg is_kill = true; 10947ec681f3Smrg break; 10957ec681f3Smrg } 10967ec681f3Smrg } 10977ec681f3Smrg if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) { 10987ec681f3Smrg found = false; 10997ec681f3Smrg break; 11007ec681f3Smrg } 11017ec681f3Smrg 11027ec681f3Smrg k += ctx.assignments[reg_file[j]].rc.size(); 11037ec681f3Smrg last_var = reg_file[j]; 11047ec681f3Smrg n++; 11057ec681f3Smrg if (k > num_moves || (k == num_moves && n <= num_vars)) { 11067ec681f3Smrg found = false; 11077ec681f3Smrg break; 11087ec681f3Smrg } 11097ec681f3Smrg } 11107ec681f3Smrg 11117ec681f3Smrg if (found) { 11127ec681f3Smrg best_pos = reg_win.lo(); 11137ec681f3Smrg num_moves = k; 11147ec681f3Smrg num_vars = n; 11157ec681f3Smrg } 11167ec681f3Smrg } 11177ec681f3Smrg 11187ec681f3Smrg /* FIXME: we messed up and couldn't find space for the variables to be copied */ 11197ec681f3Smrg if (num_moves == 0xFF) 11207ec681f3Smrg return false; 11217ec681f3Smrg 11227ec681f3Smrg PhysRegInterval reg_win{best_pos, size}; 11237ec681f3Smrg 11247ec681f3Smrg /* collect variables and block reg file */ 11257ec681f3Smrg std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, reg_win); 11267ec681f3Smrg 11277ec681f3Smrg /* mark the area as blocked */ 11287ec681f3Smrg reg_file.block(reg_win.lo(), var.rc); 11297ec681f3Smrg adjust_max_used_regs(ctx, var.rc, reg_win.lo()); 11307ec681f3Smrg 11317ec681f3Smrg if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg)) 11327ec681f3Smrg return false; 11337ec681f3Smrg 11347ec681f3Smrg /* create parallelcopy pair (without definition id) */ 11357ec681f3Smrg Temp tmp = Temp(id, var.rc); 11367ec681f3Smrg Operand pc_op = Operand(tmp); 11377ec681f3Smrg pc_op.setFixed(var.reg); 11387ec681f3Smrg Definition pc_def = Definition(reg_win.lo(), pc_op.regClass()); 11397ec681f3Smrg parallelcopies.emplace_back(pc_op, pc_def); 11407ec681f3Smrg } 11417ec681f3Smrg 11427ec681f3Smrg return true; 11437ec681f3Smrg} 11447ec681f3Smrg 11457ec681f3Smrgstd::pair<PhysReg, bool> 11467ec681f3Smrgget_reg_impl(ra_ctx& ctx, RegisterFile& reg_file, 11477ec681f3Smrg std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info, 11487ec681f3Smrg aco_ptr<Instruction>& instr) 11497ec681f3Smrg{ 11507ec681f3Smrg const PhysRegInterval& bounds = info.bounds; 11517ec681f3Smrg uint32_t size = info.size; 11527ec681f3Smrg uint32_t stride = info.stride; 11537ec681f3Smrg RegClass rc = info.rc; 11547ec681f3Smrg 11557ec681f3Smrg /* check how many free regs we have */ 11567ec681f3Smrg unsigned regs_free = reg_file.count_zero(bounds); 11577ec681f3Smrg 11587ec681f3Smrg /* mark and count killed operands */ 11597ec681f3Smrg unsigned killed_ops = 0; 11607ec681f3Smrg std::bitset<256> is_killed_operand; /* per-register */ 11617ec681f3Smrg for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) { 11627ec681f3Smrg Operand& op = instr->operands[j]; 11637ec681f3Smrg if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) && 11647ec681f3Smrg !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) { 11657ec681f3Smrg assert(op.isFixed()); 11667ec681f3Smrg 11677ec681f3Smrg for (unsigned i = 0; i < op.size(); ++i) { 11687ec681f3Smrg is_killed_operand[(op.physReg() & 0xff) + i] = true; 11697ec681f3Smrg } 11707ec681f3Smrg 11717ec681f3Smrg killed_ops += op.getTemp().size(); 11727ec681f3Smrg } 11737ec681f3Smrg } 11747ec681f3Smrg 11757ec681f3Smrg assert(regs_free >= size); 11767ec681f3Smrg /* we might have to move dead operands to dst in order to make space */ 11777ec681f3Smrg unsigned op_moves = 0; 11787ec681f3Smrg 11797ec681f3Smrg if (size > (regs_free - killed_ops)) 11807ec681f3Smrg op_moves = size - (regs_free - killed_ops); 11817ec681f3Smrg 11827ec681f3Smrg /* find the best position to place the definition */ 11837ec681f3Smrg PhysRegInterval best_win = {bounds.lo(), size}; 11847ec681f3Smrg unsigned num_moves = 0xFF; 11857ec681f3Smrg unsigned num_vars = 0; 11867ec681f3Smrg 11877ec681f3Smrg /* we use a sliding window to check potential positions */ 11887ec681f3Smrg for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi(); 11897ec681f3Smrg reg_win += stride) { 11907ec681f3Smrg /* first check if the register window starts in the middle of an 11917ec681f3Smrg * allocated variable: this is what we have to fix to allow for 11927ec681f3Smrg * num_moves > size */ 11937ec681f3Smrg if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) && 11947ec681f3Smrg reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1))) 11957ec681f3Smrg continue; 11967ec681f3Smrg if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) && 11977ec681f3Smrg reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi())) 11987ec681f3Smrg continue; 11997ec681f3Smrg 12007ec681f3Smrg /* second, check that we have at most k=num_moves elements in the window 12017ec681f3Smrg * and no element is larger than the currently processed one */ 12027ec681f3Smrg unsigned k = op_moves; 12037ec681f3Smrg unsigned n = 0; 12047ec681f3Smrg unsigned remaining_op_moves = op_moves; 12057ec681f3Smrg unsigned last_var = 0; 12067ec681f3Smrg bool found = true; 12077ec681f3Smrg bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0; 12087ec681f3Smrg for (const PhysReg j : reg_win) { 12097ec681f3Smrg /* dead operands effectively reduce the number of estimated moves */ 12107ec681f3Smrg if (is_killed_operand[j & 0xFF]) { 12117ec681f3Smrg if (remaining_op_moves) { 12127ec681f3Smrg k--; 12137ec681f3Smrg remaining_op_moves--; 12147ec681f3Smrg } 12157ec681f3Smrg continue; 12167ec681f3Smrg } 12177ec681f3Smrg 12187ec681f3Smrg if (reg_file[j] == 0 || reg_file[j] == last_var) 12197ec681f3Smrg continue; 12207ec681f3Smrg 12217ec681f3Smrg if (reg_file[j] == 0xF0000000) { 12227ec681f3Smrg k += 1; 12237ec681f3Smrg n++; 12247ec681f3Smrg continue; 12257ec681f3Smrg } 12267ec681f3Smrg 12277ec681f3Smrg if (ctx.assignments[reg_file[j]].rc.size() >= size) { 12287ec681f3Smrg found = false; 12297ec681f3Smrg break; 12307ec681f3Smrg } 12317ec681f3Smrg 12327ec681f3Smrg /* we cannot split live ranges of linear vgprs inside control flow */ 12337ec681f3Smrg // TODO: ensure that live range splits inside control flow are never necessary 12347ec681f3Smrg if (!(ctx.block->kind & block_kind_top_level) && 12357ec681f3Smrg ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) { 12367ec681f3Smrg found = false; 12377ec681f3Smrg break; 12387ec681f3Smrg } 12397ec681f3Smrg 12407ec681f3Smrg k += ctx.assignments[reg_file[j]].rc.size(); 12417ec681f3Smrg n++; 12427ec681f3Smrg last_var = reg_file[j]; 12437ec681f3Smrg } 12447ec681f3Smrg 12457ec681f3Smrg if (!found || k > num_moves) 12467ec681f3Smrg continue; 12477ec681f3Smrg if (k == num_moves && n < num_vars) 12487ec681f3Smrg continue; 12497ec681f3Smrg if (!aligned && k == num_moves && n == num_vars) 12507ec681f3Smrg continue; 12517ec681f3Smrg 12527ec681f3Smrg if (found) { 12537ec681f3Smrg best_win = reg_win; 12547ec681f3Smrg num_moves = k; 12557ec681f3Smrg num_vars = n; 12567ec681f3Smrg } 12577ec681f3Smrg } 12587ec681f3Smrg 12597ec681f3Smrg if (num_moves == 0xFF) 12607ec681f3Smrg return {{}, false}; 12617ec681f3Smrg 12627ec681f3Smrg /* now, we figured the placement for our definition */ 12637ec681f3Smrg RegisterFile tmp_file(reg_file); 12647ec681f3Smrg std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win); 12657ec681f3Smrg 12667ec681f3Smrg if (instr->opcode == aco_opcode::p_create_vector) { 12677ec681f3Smrg /* move killed operands which aren't yet at the correct position (GFX9+) 12687ec681f3Smrg * or which are in the definition space */ 12697ec681f3Smrg PhysReg reg = best_win.lo(); 12707ec681f3Smrg for (Operand& op : instr->operands) { 12717ec681f3Smrg if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) { 12727ec681f3Smrg if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 || 12737ec681f3Smrg (op.physReg().advance(op.bytes()) > best_win.lo() && 12747ec681f3Smrg op.physReg() < best_win.hi()))) { 12757ec681f3Smrg vars.emplace(op.bytes(), op.tempId()); 12767ec681f3Smrg tmp_file.clear(op); 12777ec681f3Smrg } else { 12787ec681f3Smrg tmp_file.fill(op); 12797ec681f3Smrg } 12807ec681f3Smrg } 12817ec681f3Smrg reg.reg_b += op.bytes(); 12827ec681f3Smrg } 12837ec681f3Smrg } else if (!is_phi(instr)) { 12847ec681f3Smrg /* re-enable killed operands */ 12857ec681f3Smrg for (Operand& op : instr->operands) { 12867ec681f3Smrg if (op.isTemp() && op.isFirstKillBeforeDef()) 12877ec681f3Smrg tmp_file.fill(op); 12887ec681f3Smrg } 12897ec681f3Smrg } 12907ec681f3Smrg 12917ec681f3Smrg std::vector<std::pair<Operand, Definition>> pc; 12927ec681f3Smrg if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win)) 12937ec681f3Smrg return {{}, false}; 12947ec681f3Smrg 12957ec681f3Smrg parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); 12967ec681f3Smrg 12977ec681f3Smrg adjust_max_used_regs(ctx, rc, best_win.lo()); 12987ec681f3Smrg return {best_win.lo(), true}; 12997ec681f3Smrg} 13007ec681f3Smrg 13017ec681f3Smrgbool 13027ec681f3Smrgget_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr, 13037ec681f3Smrg PhysReg reg) 13047ec681f3Smrg{ 13057ec681f3Smrg /* catch out-of-range registers */ 13067ec681f3Smrg if (reg >= PhysReg{512}) 13077ec681f3Smrg return false; 13087ec681f3Smrg 13097ec681f3Smrg std::pair<unsigned, unsigned> sdw_def_info; 13107ec681f3Smrg if (rc.is_subdword()) 13117ec681f3Smrg sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc); 13127ec681f3Smrg 13137ec681f3Smrg if (rc.is_subdword() && reg.byte() % sdw_def_info.first) 13147ec681f3Smrg return false; 13157ec681f3Smrg if (!rc.is_subdword() && reg.byte()) 13167ec681f3Smrg return false; 13177ec681f3Smrg 13187ec681f3Smrg if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0) 13197ec681f3Smrg return false; 13207ec681f3Smrg 13217ec681f3Smrg PhysRegInterval reg_win = {reg, rc.size()}; 13227ec681f3Smrg PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type()); 13237ec681f3Smrg PhysRegInterval vcc_win = {vcc, 2}; 13247ec681f3Smrg /* VCC is outside the bounds */ 13257ec681f3Smrg bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win); 13267ec681f3Smrg bool is_m0 = rc == s1 && reg == m0; 13277ec681f3Smrg if (!bounds.contains(reg_win) && !is_vcc && !is_m0) 13287ec681f3Smrg return false; 13297ec681f3Smrg 13307ec681f3Smrg if (rc.is_subdword()) { 13317ec681f3Smrg PhysReg test_reg; 13327ec681f3Smrg test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1); 13337ec681f3Smrg if (reg_file.test(test_reg, sdw_def_info.second)) 13347ec681f3Smrg return false; 13357ec681f3Smrg } else { 13367ec681f3Smrg if (reg_file.test(reg, rc.bytes())) 13377ec681f3Smrg return false; 13387ec681f3Smrg } 13397ec681f3Smrg 13407ec681f3Smrg adjust_max_used_regs(ctx, rc, reg_win.lo()); 13417ec681f3Smrg return true; 13427ec681f3Smrg} 13437ec681f3Smrg 13447ec681f3Smrgbool 13457ec681f3Smrgincrease_register_file(ra_ctx& ctx, RegType type) 13467ec681f3Smrg{ 13477ec681f3Smrg if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) { 13487ec681f3Smrg update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, 13497ec681f3Smrg ctx.program->max_reg_demand.sgpr)); 13507ec681f3Smrg } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) { 13517ec681f3Smrg update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, 13527ec681f3Smrg ctx.program->max_reg_demand.sgpr + 1)); 13537ec681f3Smrg } else { 13547ec681f3Smrg return false; 13557ec681f3Smrg } 13567ec681f3Smrg return true; 13577ec681f3Smrg} 13587ec681f3Smrg 13597ec681f3Smrgstruct IDAndRegClass { 13607ec681f3Smrg IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {} 13617ec681f3Smrg 13627ec681f3Smrg unsigned id; 13637ec681f3Smrg RegClass rc; 13647ec681f3Smrg}; 13657ec681f3Smrg 13667ec681f3Smrgstruct IDAndInfo { 13677ec681f3Smrg IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {} 13687ec681f3Smrg 13697ec681f3Smrg unsigned id; 13707ec681f3Smrg DefInfo info; 13717ec681f3Smrg}; 13727ec681f3Smrg 13737ec681f3Smrg/* Reallocates vars by sorting them and placing each variable after the previous 13747ec681f3Smrg * one. If one of the variables has 0xffffffff as an ID, the register assigned 13757ec681f3Smrg * for that variable will be returned. 13767ec681f3Smrg */ 13777ec681f3SmrgPhysReg 13787ec681f3Smrgcompact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars, 13797ec681f3Smrg std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start) 13807ec681f3Smrg{ 13817ec681f3Smrg /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword 13827ec681f3Smrg * temporary sizes to dwords. 13837ec681f3Smrg */ 13847ec681f3Smrg std::vector<IDAndInfo> sorted; 13857ec681f3Smrg for (IDAndRegClass var : vars) { 13867ec681f3Smrg DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1); 13877ec681f3Smrg sorted.emplace_back(var.id, info); 13887ec681f3Smrg } 13897ec681f3Smrg 13907ec681f3Smrg std::sort( 13917ec681f3Smrg sorted.begin(), sorted.end(), 13927ec681f3Smrg [&ctx](const IDAndInfo& a, const IDAndInfo& b) 13937ec681f3Smrg { 13947ec681f3Smrg unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4); 13957ec681f3Smrg unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4); 13967ec681f3Smrg if (a_stride > b_stride) 13977ec681f3Smrg return true; 13987ec681f3Smrg if (a_stride < b_stride) 13997ec681f3Smrg return false; 14007ec681f3Smrg if (a.id == 0xffffffff || b.id == 0xffffffff) 14017ec681f3Smrg return a.id == 14027ec681f3Smrg 0xffffffff; /* place 0xffffffff before others if possible, not for any reason */ 14037ec681f3Smrg return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg; 14047ec681f3Smrg }); 14057ec681f3Smrg 14067ec681f3Smrg PhysReg next_reg = start; 14077ec681f3Smrg PhysReg space_reg; 14087ec681f3Smrg for (IDAndInfo& var : sorted) { 14097ec681f3Smrg unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4; 14107ec681f3Smrg next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4)); 14117ec681f3Smrg 14127ec681f3Smrg /* 0xffffffff is a special variable ID used reserve a space for killed 14137ec681f3Smrg * operands and definitions. 14147ec681f3Smrg */ 14157ec681f3Smrg if (var.id != 0xffffffff) { 14167ec681f3Smrg if (next_reg != ctx.assignments[var.id].reg) { 14177ec681f3Smrg RegClass rc = ctx.assignments[var.id].rc; 14187ec681f3Smrg Temp tmp(var.id, rc); 14197ec681f3Smrg 14207ec681f3Smrg Operand pc_op(tmp); 14217ec681f3Smrg pc_op.setFixed(ctx.assignments[var.id].reg); 14227ec681f3Smrg Definition pc_def(next_reg, rc); 14237ec681f3Smrg parallelcopies.emplace_back(pc_op, pc_def); 14247ec681f3Smrg } 14257ec681f3Smrg } else { 14267ec681f3Smrg space_reg = next_reg; 14277ec681f3Smrg } 14287ec681f3Smrg 14297ec681f3Smrg adjust_max_used_regs(ctx, var.info.rc, next_reg); 14307ec681f3Smrg 14317ec681f3Smrg next_reg = next_reg.advance(var.info.rc.size() * 4); 14327ec681f3Smrg } 14337ec681f3Smrg 14347ec681f3Smrg return space_reg; 14357ec681f3Smrg} 14367ec681f3Smrg 14377ec681f3Smrgbool 14387ec681f3Smrgis_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr) 14397ec681f3Smrg{ 14407ec681f3Smrg PhysReg first{512}; 14417ec681f3Smrg for (unsigned i = 0; i < instr->operands.size() - 3u; i++) { 14427ec681f3Smrg Operand op = instr->operands[i + 3]; 14437ec681f3Smrg 14447ec681f3Smrg if (ctx.assignments[op.tempId()].assigned) { 14457ec681f3Smrg PhysReg reg = ctx.assignments[op.tempId()].reg; 14467ec681f3Smrg 14477ec681f3Smrg if (first.reg() == 512) { 14487ec681f3Smrg PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr); 14497ec681f3Smrg first = reg.advance(i * -4); 14507ec681f3Smrg PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u}; 14517ec681f3Smrg if (!bounds.contains(vec)) /* not enough space for other operands */ 14527ec681f3Smrg return false; 14537ec681f3Smrg } else { 14547ec681f3Smrg if (reg != first.advance(i * 4)) /* not at the best position */ 14557ec681f3Smrg return false; 14567ec681f3Smrg } 14577ec681f3Smrg } else { 14587ec681f3Smrg /* If there's an unexpected temporary, this operand is unlikely to be 14597ec681f3Smrg * placed in the best position. 14607ec681f3Smrg */ 14617ec681f3Smrg if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4)) 14627ec681f3Smrg return false; 14637ec681f3Smrg } 14647ec681f3Smrg } 14657ec681f3Smrg 14667ec681f3Smrg return true; 14677ec681f3Smrg} 14687ec681f3Smrg 14697ec681f3Smrgstd::pair<PhysReg, bool> 14707ec681f3Smrgget_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr) 14717ec681f3Smrg{ 14727ec681f3Smrg Instruction* vec = ctx.vectors[temp.id()]; 14737ec681f3Smrg unsigned first_operand = vec->format == Format::MIMG ? 3 : 0; 14747ec681f3Smrg unsigned our_offset = 0; 14757ec681f3Smrg for (unsigned i = first_operand; i < vec->operands.size(); i++) { 14767ec681f3Smrg Operand& op = vec->operands[i]; 14777ec681f3Smrg if (op.isTemp() && op.tempId() == temp.id()) 14787ec681f3Smrg break; 14797ec681f3Smrg else 14807ec681f3Smrg our_offset += op.bytes(); 14817ec681f3Smrg } 14827ec681f3Smrg 14837ec681f3Smrg if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) { 14847ec681f3Smrg unsigned their_offset = 0; 14857ec681f3Smrg /* check for every operand of the vector 14867ec681f3Smrg * - whether the operand is assigned and 14877ec681f3Smrg * - we can use the register relative to that operand 14887ec681f3Smrg */ 14897ec681f3Smrg for (unsigned i = first_operand; i < vec->operands.size(); i++) { 14907ec681f3Smrg Operand& op = vec->operands[i]; 14917ec681f3Smrg if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() && 14927ec681f3Smrg ctx.assignments[op.tempId()].assigned) { 14937ec681f3Smrg PhysReg reg = ctx.assignments[op.tempId()].reg; 14947ec681f3Smrg reg.reg_b += (our_offset - their_offset); 14957ec681f3Smrg if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) 14967ec681f3Smrg return {reg, true}; 14977ec681f3Smrg 14987ec681f3Smrg /* return if MIMG vaddr components don't remain vector-aligned */ 14997ec681f3Smrg if (vec->format == Format::MIMG) 15007ec681f3Smrg return {{}, false}; 15017ec681f3Smrg } 15027ec681f3Smrg their_offset += op.bytes(); 15037ec681f3Smrg } 15047ec681f3Smrg 15057ec681f3Smrg /* We didn't find a register relative to other vector operands. 15067ec681f3Smrg * Try to find new space which fits the whole vector. 15077ec681f3Smrg */ 15087ec681f3Smrg RegClass vec_rc = RegClass::get(temp.type(), their_offset); 15097ec681f3Smrg DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1); 15107ec681f3Smrg std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info); 15117ec681f3Smrg PhysReg reg = res.first; 15127ec681f3Smrg if (res.second) { 15137ec681f3Smrg reg.reg_b += our_offset; 15147ec681f3Smrg /* make sure to only use byte offset if the instruction supports it */ 15157ec681f3Smrg if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) 15167ec681f3Smrg return {reg, true}; 15177ec681f3Smrg } 15187ec681f3Smrg } 15197ec681f3Smrg return {{}, false}; 15207ec681f3Smrg} 15217ec681f3Smrg 15227ec681f3SmrgPhysReg 15237ec681f3Smrgget_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, 15247ec681f3Smrg std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr, 15257ec681f3Smrg int operand_index = -1) 15267ec681f3Smrg{ 15277ec681f3Smrg auto split_vec = ctx.split_vectors.find(temp.id()); 15287ec681f3Smrg if (split_vec != ctx.split_vectors.end()) { 15297ec681f3Smrg unsigned offset = 0; 15307ec681f3Smrg for (Definition def : split_vec->second->definitions) { 15317ec681f3Smrg if (ctx.assignments[def.tempId()].affinity) { 15327ec681f3Smrg assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity]; 15337ec681f3Smrg if (affinity.assigned) { 15347ec681f3Smrg PhysReg reg = affinity.reg; 15357ec681f3Smrg reg.reg_b -= offset; 15367ec681f3Smrg if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg)) 15377ec681f3Smrg return reg; 15387ec681f3Smrg } 15397ec681f3Smrg } 15407ec681f3Smrg offset += def.bytes(); 15417ec681f3Smrg } 15427ec681f3Smrg } 15437ec681f3Smrg 15447ec681f3Smrg if (ctx.assignments[temp.id()].affinity) { 15457ec681f3Smrg assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity]; 15467ec681f3Smrg if (affinity.assigned) { 15477ec681f3Smrg if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg)) 15487ec681f3Smrg return affinity.reg; 15497ec681f3Smrg } 15507ec681f3Smrg } 15517ec681f3Smrg 15527ec681f3Smrg std::pair<PhysReg, bool> res; 15537ec681f3Smrg 15547ec681f3Smrg if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) { 15557ec681f3Smrg res = get_reg_vector(ctx, reg_file, temp, instr); 15567ec681f3Smrg if (res.second) 15577ec681f3Smrg return res.first; 15587ec681f3Smrg } 15597ec681f3Smrg 15607ec681f3Smrg DefInfo info(ctx, instr, temp.regClass(), operand_index); 15617ec681f3Smrg 15627ec681f3Smrg if (!ctx.policy.skip_optimistic_path) { 15637ec681f3Smrg /* try to find space without live-range splits */ 15647ec681f3Smrg res = get_reg_simple(ctx, reg_file, info); 15657ec681f3Smrg 15667ec681f3Smrg if (res.second) 15677ec681f3Smrg return res.first; 15687ec681f3Smrg } 15697ec681f3Smrg 15707ec681f3Smrg /* try to find space with live-range splits */ 15717ec681f3Smrg res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr); 15727ec681f3Smrg 15737ec681f3Smrg if (res.second) 15747ec681f3Smrg return res.first; 15757ec681f3Smrg 15767ec681f3Smrg /* try using more registers */ 15777ec681f3Smrg 15787ec681f3Smrg /* We should only fail here because keeping under the limit would require 15797ec681f3Smrg * too many moves. */ 15807ec681f3Smrg assert(reg_file.count_zero(info.bounds) >= info.size); 15817ec681f3Smrg 15827ec681f3Smrg if (!increase_register_file(ctx, info.rc.type())) { 15837ec681f3Smrg /* fallback algorithm: reallocate all variables at once */ 15847ec681f3Smrg unsigned def_size = info.rc.size(); 15857ec681f3Smrg for (Definition def : instr->definitions) { 15867ec681f3Smrg if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type()) 15877ec681f3Smrg def_size += def.regClass().size(); 15887ec681f3Smrg } 15897ec681f3Smrg 15907ec681f3Smrg unsigned killed_op_size = 0; 15917ec681f3Smrg for (Operand op : instr->operands) { 15927ec681f3Smrg if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type()) 15937ec681f3Smrg killed_op_size += op.regClass().size(); 15947ec681f3Smrg } 15957ec681f3Smrg 15967ec681f3Smrg const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type()); 15977ec681f3Smrg 15987ec681f3Smrg /* reallocate passthrough variables and non-killed operands */ 15997ec681f3Smrg std::vector<IDAndRegClass> vars; 16007ec681f3Smrg for (const std::pair<unsigned, unsigned>& var : find_vars(ctx, reg_file, regs)) 16017ec681f3Smrg vars.emplace_back(var.second, ctx.assignments[var.second].rc); 16027ec681f3Smrg vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size))); 16037ec681f3Smrg 16047ec681f3Smrg PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo()); 16057ec681f3Smrg 16067ec681f3Smrg /* reallocate killed operands */ 16077ec681f3Smrg std::vector<IDAndRegClass> killed_op_vars; 16087ec681f3Smrg for (Operand op : instr->operands) { 16097ec681f3Smrg if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type()) 16107ec681f3Smrg killed_op_vars.emplace_back(op.tempId(), op.regClass()); 16117ec681f3Smrg } 16127ec681f3Smrg compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space); 16137ec681f3Smrg 16147ec681f3Smrg /* reallocate definitions */ 16157ec681f3Smrg std::vector<IDAndRegClass> def_vars; 16167ec681f3Smrg for (Definition def : instr->definitions) { 16177ec681f3Smrg if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type()) 16187ec681f3Smrg def_vars.emplace_back(def.tempId(), def.regClass()); 16197ec681f3Smrg } 16207ec681f3Smrg def_vars.emplace_back(0xffffffff, info.rc); 16217ec681f3Smrg return compact_relocate_vars(ctx, def_vars, parallelcopies, space); 16227ec681f3Smrg } 16237ec681f3Smrg 16247ec681f3Smrg return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index); 16257ec681f3Smrg} 16267ec681f3Smrg 16277ec681f3SmrgPhysReg 16287ec681f3Smrgget_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, 16297ec681f3Smrg std::vector<std::pair<Operand, Definition>>& parallelcopies, 16307ec681f3Smrg aco_ptr<Instruction>& instr) 16317ec681f3Smrg{ 16327ec681f3Smrg RegClass rc = temp.regClass(); 16337ec681f3Smrg /* create_vector instructions have different costs w.r.t. register coalescing */ 16347ec681f3Smrg uint32_t size = rc.size(); 16357ec681f3Smrg uint32_t bytes = rc.bytes(); 16367ec681f3Smrg uint32_t stride = get_stride(rc); 16377ec681f3Smrg PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type()); 16387ec681f3Smrg 16397ec681f3Smrg // TODO: improve p_create_vector for sub-dword vectors 16407ec681f3Smrg 16417ec681f3Smrg PhysReg best_pos{0xFFF}; 16427ec681f3Smrg unsigned num_moves = 0xFF; 16437ec681f3Smrg bool best_avoid = true; 16447ec681f3Smrg 16457ec681f3Smrg /* test for each operand which definition placement causes the least shuffle instructions */ 16467ec681f3Smrg for (unsigned i = 0, offset = 0; i < instr->operands.size(); 16477ec681f3Smrg offset += instr->operands[i].bytes(), i++) { 16487ec681f3Smrg // TODO: think about, if we can alias live operands on the same register 16497ec681f3Smrg if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() || 16507ec681f3Smrg instr->operands[i].getTemp().type() != rc.type()) 16517ec681f3Smrg continue; 16527ec681f3Smrg 16537ec681f3Smrg if (offset > instr->operands[i].physReg().reg_b) 16547ec681f3Smrg continue; 16557ec681f3Smrg 16567ec681f3Smrg unsigned reg_lower = instr->operands[i].physReg().reg_b - offset; 16577ec681f3Smrg if (reg_lower % 4) 16587ec681f3Smrg continue; 16597ec681f3Smrg PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size}; 16607ec681f3Smrg unsigned k = 0; 16617ec681f3Smrg 16627ec681f3Smrg /* no need to check multiple times */ 16637ec681f3Smrg if (reg_win.lo() == best_pos) 16647ec681f3Smrg continue; 16657ec681f3Smrg 16667ec681f3Smrg /* check borders */ 16677ec681f3Smrg // TODO: this can be improved */ 16687ec681f3Smrg if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0) 16697ec681f3Smrg continue; 16707ec681f3Smrg if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 && 16717ec681f3Smrg reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1))) 16727ec681f3Smrg continue; 16737ec681f3Smrg if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 && 16747ec681f3Smrg reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi())) 16757ec681f3Smrg continue; 16767ec681f3Smrg 16777ec681f3Smrg /* count variables to be moved and check "avoid" */ 16787ec681f3Smrg bool avoid = false; 16797ec681f3Smrg bool linear_vgpr = false; 16807ec681f3Smrg for (PhysReg j : reg_win) { 16817ec681f3Smrg if (reg_file[j] != 0) { 16827ec681f3Smrg if (reg_file[j] == 0xF0000000) { 16837ec681f3Smrg PhysReg reg; 16847ec681f3Smrg reg.reg_b = j * 4; 16857ec681f3Smrg unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4; 16867ec681f3Smrg for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++) 16877ec681f3Smrg k += reg_file.test(reg, 1); 16887ec681f3Smrg } else { 16897ec681f3Smrg k += 4; 16907ec681f3Smrg linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr(); 16917ec681f3Smrg } 16927ec681f3Smrg } 16937ec681f3Smrg avoid |= ctx.war_hint[j]; 16947ec681f3Smrg } 16957ec681f3Smrg 16967ec681f3Smrg if (linear_vgpr) { 16977ec681f3Smrg /* we cannot split live ranges of linear vgprs inside control flow */ 16987ec681f3Smrg if (ctx.block->kind & block_kind_top_level) 16997ec681f3Smrg avoid = true; 17007ec681f3Smrg else 17017ec681f3Smrg continue; 17027ec681f3Smrg } 17037ec681f3Smrg 17047ec681f3Smrg if (avoid && !best_avoid) 17057ec681f3Smrg continue; 17067ec681f3Smrg 17077ec681f3Smrg /* count operands in wrong positions */ 17087ec681f3Smrg for (unsigned j = 0, offset2 = 0; j < instr->operands.size(); 17097ec681f3Smrg offset2 += instr->operands[j].bytes(), j++) { 17107ec681f3Smrg if (j == i || !instr->operands[j].isTemp() || 17117ec681f3Smrg instr->operands[j].getTemp().type() != rc.type()) 17127ec681f3Smrg continue; 17137ec681f3Smrg if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2) 17147ec681f3Smrg k += instr->operands[j].bytes(); 17157ec681f3Smrg } 17167ec681f3Smrg bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0; 17177ec681f3Smrg if (k > num_moves || (!aligned && k == num_moves)) 17187ec681f3Smrg continue; 17197ec681f3Smrg 17207ec681f3Smrg best_pos = reg_win.lo(); 17217ec681f3Smrg num_moves = k; 17227ec681f3Smrg best_avoid = avoid; 17237ec681f3Smrg } 17247ec681f3Smrg 17257ec681f3Smrg if (num_moves >= bytes) 17267ec681f3Smrg return get_reg(ctx, reg_file, temp, parallelcopies, instr); 17277ec681f3Smrg 17287ec681f3Smrg /* re-enable killed operands which are in the wrong position */ 17297ec681f3Smrg RegisterFile tmp_file(reg_file); 17307ec681f3Smrg for (unsigned i = 0, offset = 0; i < instr->operands.size(); 17317ec681f3Smrg offset += instr->operands[i].bytes(), i++) { 17327ec681f3Smrg if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() && 17337ec681f3Smrg instr->operands[i].physReg().reg_b != best_pos.reg_b + offset) 17347ec681f3Smrg tmp_file.fill(instr->operands[i]); 17357ec681f3Smrg } 17367ec681f3Smrg 17377ec681f3Smrg /* collect variables to be moved */ 17387ec681f3Smrg std::set<std::pair<unsigned, unsigned>> vars = 17397ec681f3Smrg collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); 17407ec681f3Smrg 17417ec681f3Smrg for (unsigned i = 0, offset = 0; i < instr->operands.size(); 17427ec681f3Smrg offset += instr->operands[i].bytes(), i++) { 17437ec681f3Smrg if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() || 17447ec681f3Smrg instr->operands[i].getTemp().type() != rc.type()) 17457ec681f3Smrg continue; 17467ec681f3Smrg bool correct_pos = instr->operands[i].physReg().reg_b == best_pos.reg_b + offset; 17477ec681f3Smrg /* GFX9+: move killed operands which aren't yet at the correct position 17487ec681f3Smrg * Moving all killed operands generally leads to more register swaps. 17497ec681f3Smrg * This is only done on GFX9+ because of the cheap v_swap instruction. 17507ec681f3Smrg */ 17517ec681f3Smrg if (ctx.program->chip_class >= GFX9 && !correct_pos) { 17527ec681f3Smrg vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId()); 17537ec681f3Smrg tmp_file.clear(instr->operands[i]); 17547ec681f3Smrg /* fill operands which are in the correct position to avoid overwriting */ 17557ec681f3Smrg } else if (correct_pos) { 17567ec681f3Smrg tmp_file.fill(instr->operands[i]); 17577ec681f3Smrg } 17587ec681f3Smrg } 17597ec681f3Smrg bool success = false; 17607ec681f3Smrg std::vector<std::pair<Operand, Definition>> pc; 17617ec681f3Smrg success = 17627ec681f3Smrg get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size}); 17637ec681f3Smrg 17647ec681f3Smrg if (!success) { 17657ec681f3Smrg if (!increase_register_file(ctx, temp.type())) { 17667ec681f3Smrg /* use the fallback algorithm in get_reg() */ 17677ec681f3Smrg return get_reg(ctx, reg_file, temp, parallelcopies, instr); 17687ec681f3Smrg } 17697ec681f3Smrg return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr); 17707ec681f3Smrg } 17717ec681f3Smrg 17727ec681f3Smrg parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); 17737ec681f3Smrg adjust_max_used_regs(ctx, rc, best_pos); 17747ec681f3Smrg 17757ec681f3Smrg return best_pos; 17767ec681f3Smrg} 17777ec681f3Smrg 17787ec681f3Smrgvoid 17797ec681f3Smrghandle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) 17807ec681f3Smrg{ 17817ec681f3Smrg if (instr->format != Format::PSEUDO) 17827ec681f3Smrg return; 17837ec681f3Smrg 17847ec681f3Smrg /* all instructions which use handle_operands() need this information */ 17857ec681f3Smrg switch (instr->opcode) { 17867ec681f3Smrg case aco_opcode::p_extract_vector: 17877ec681f3Smrg case aco_opcode::p_create_vector: 17887ec681f3Smrg case aco_opcode::p_split_vector: 17897ec681f3Smrg case aco_opcode::p_parallelcopy: 17907ec681f3Smrg case aco_opcode::p_wqm: break; 17917ec681f3Smrg default: return; 17927ec681f3Smrg } 17937ec681f3Smrg 17947ec681f3Smrg bool writes_linear = false; 17957ec681f3Smrg /* if all definitions are logical vgpr, no need to care for SCC */ 17967ec681f3Smrg for (Definition& def : instr->definitions) { 17977ec681f3Smrg if (def.getTemp().regClass().is_linear()) 17987ec681f3Smrg writes_linear = true; 17997ec681f3Smrg } 18007ec681f3Smrg /* if all operands are constant, no need to care either */ 18017ec681f3Smrg bool reads_linear = false; 18027ec681f3Smrg bool reads_subdword = false; 18037ec681f3Smrg for (Operand& op : instr->operands) { 18047ec681f3Smrg if (op.isTemp() && op.getTemp().regClass().is_linear()) 18057ec681f3Smrg reads_linear = true; 18067ec681f3Smrg if (op.isTemp() && op.regClass().is_subdword()) 18077ec681f3Smrg reads_subdword = true; 18087ec681f3Smrg } 18097ec681f3Smrg bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) || 18107ec681f3Smrg (ctx.program->chip_class <= GFX7 && reads_subdword); 18117ec681f3Smrg if (!needs_scratch_reg) 18127ec681f3Smrg return; 18137ec681f3Smrg 18147ec681f3Smrg instr->pseudo().tmp_in_scc = reg_file[scc]; 18157ec681f3Smrg 18167ec681f3Smrg int reg = ctx.max_used_sgpr; 18177ec681f3Smrg for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--) 18187ec681f3Smrg ; 18197ec681f3Smrg if (reg < 0) { 18207ec681f3Smrg reg = ctx.max_used_sgpr + 1; 18217ec681f3Smrg for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++) 18227ec681f3Smrg ; 18237ec681f3Smrg if (reg == ctx.program->max_reg_demand.sgpr) { 18247ec681f3Smrg assert(reads_subdword && reg_file[m0] == 0); 18257ec681f3Smrg reg = m0; 18267ec681f3Smrg } 18277ec681f3Smrg } 18287ec681f3Smrg 18297ec681f3Smrg adjust_max_used_regs(ctx, s1, reg); 18307ec681f3Smrg instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg}; 18317ec681f3Smrg} 18327ec681f3Smrg 18337ec681f3Smrgbool 18347ec681f3Smrgoperand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg, 18357ec681f3Smrg RegClass rc) 18367ec681f3Smrg{ 18377ec681f3Smrg if (instr->operands[idx].isFixed()) 18387ec681f3Smrg return instr->operands[idx].physReg() == reg; 18397ec681f3Smrg 18407ec681f3Smrg bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 || 18417ec681f3Smrg instr->opcode == aco_opcode::v_writelane_b32_e64; 18427ec681f3Smrg if (chip <= GFX9 && is_writelane && idx <= 1) { 18437ec681f3Smrg /* v_writelane_b32 can take two sgprs but only if one is m0. */ 18447ec681f3Smrg bool is_other_sgpr = 18457ec681f3Smrg instr->operands[!idx].isTemp() && 18467ec681f3Smrg (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0); 18477ec681f3Smrg if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) { 18487ec681f3Smrg instr->operands[idx].setFixed(m0); 18497ec681f3Smrg return reg == m0; 18507ec681f3Smrg } 18517ec681f3Smrg } 18527ec681f3Smrg 18537ec681f3Smrg if (reg.byte()) { 18547ec681f3Smrg unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc); 18557ec681f3Smrg if (reg.byte() % stride) 18567ec681f3Smrg return false; 18577ec681f3Smrg } 18587ec681f3Smrg 18597ec681f3Smrg switch (instr->format) { 18607ec681f3Smrg case Format::SMEM: 18617ec681f3Smrg return reg != scc && reg != exec && 18627ec681f3Smrg (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */ 18637ec681f3Smrg (reg != vcc || (instr->definitions.empty() && idx == 2) || 18647ec681f3Smrg chip >= GFX10); /* sdata can be vcc */ 18657ec681f3Smrg default: 18667ec681f3Smrg // TODO: there are more instructions with restrictions on registers 18677ec681f3Smrg return true; 18687ec681f3Smrg } 18697ec681f3Smrg} 18707ec681f3Smrg 18717ec681f3Smrgvoid 18727ec681f3Smrgget_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, 18737ec681f3Smrg std::vector<std::pair<Operand, Definition>>& parallelcopy, 18747ec681f3Smrg aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index) 18757ec681f3Smrg{ 18767ec681f3Smrg /* check if the operand is fixed */ 18777ec681f3Smrg PhysReg src = ctx.assignments[operand.tempId()].reg; 18787ec681f3Smrg PhysReg dst; 18797ec681f3Smrg if (operand.isFixed()) { 18807ec681f3Smrg assert(operand.physReg() != src); 18817ec681f3Smrg 18827ec681f3Smrg /* check if target reg is blocked, and move away the blocking var */ 18837ec681f3Smrg if (register_file.test(operand.physReg(), operand.bytes())) { 18847ec681f3Smrg PhysRegInterval target{operand.physReg(), operand.size()}; 18857ec681f3Smrg 18867ec681f3Smrg RegisterFile tmp_file(register_file); 18877ec681f3Smrg 18887ec681f3Smrg std::set<std::pair<unsigned, unsigned>> blocking_vars = 18897ec681f3Smrg collect_vars(ctx, tmp_file, target); 18907ec681f3Smrg 18917ec681f3Smrg tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src 18927ec681f3Smrg tmp_file.block(operand.physReg(), operand.regClass()); 18937ec681f3Smrg 18947ec681f3Smrg DefInfo info(ctx, instr, operand.regClass(), -1); 18957ec681f3Smrg get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr, 18967ec681f3Smrg PhysRegInterval()); 18977ec681f3Smrg } 18987ec681f3Smrg dst = operand.physReg(); 18997ec681f3Smrg 19007ec681f3Smrg } else { 19017ec681f3Smrg /* clear the operand in case it's only a stride mismatch */ 19027ec681f3Smrg register_file.clear(src, operand.regClass()); 19037ec681f3Smrg dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index); 19047ec681f3Smrg } 19057ec681f3Smrg 19067ec681f3Smrg Operand pc_op = operand; 19077ec681f3Smrg pc_op.setFixed(src); 19087ec681f3Smrg Definition pc_def = Definition(dst, pc_op.regClass()); 19097ec681f3Smrg parallelcopy.emplace_back(pc_op, pc_def); 19107ec681f3Smrg update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops); 19117ec681f3Smrg} 19127ec681f3Smrg 19137ec681f3Smrgvoid 19147ec681f3Smrgget_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file, 19157ec681f3Smrg std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in) 19167ec681f3Smrg{ 19177ec681f3Smrg /* assign phis with all-matching registers to that register */ 19187ec681f3Smrg for (aco_ptr<Instruction>& phi : block.instructions) { 19197ec681f3Smrg if (!is_phi(phi)) 19207ec681f3Smrg break; 19217ec681f3Smrg Definition& definition = phi->definitions[0]; 19227ec681f3Smrg if (definition.isKill() || definition.isFixed()) 19237ec681f3Smrg continue; 19247ec681f3Smrg 19257ec681f3Smrg if (!phi->operands[0].isTemp()) 19267ec681f3Smrg continue; 19277ec681f3Smrg 19287ec681f3Smrg PhysReg reg = phi->operands[0].physReg(); 19297ec681f3Smrg auto OpsSame = [=](const Operand& op) -> bool 19307ec681f3Smrg { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); }; 19317ec681f3Smrg bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame); 19327ec681f3Smrg if (!all_same) 19337ec681f3Smrg continue; 19347ec681f3Smrg 19357ec681f3Smrg if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) 19367ec681f3Smrg continue; 19377ec681f3Smrg 19387ec681f3Smrg definition.setFixed(reg); 19397ec681f3Smrg register_file.fill(definition); 19407ec681f3Smrg ctx.assignments[definition.tempId()].set(definition); 19417ec681f3Smrg } 19427ec681f3Smrg 19437ec681f3Smrg /* try to find a register that is used by at least one operand */ 19447ec681f3Smrg for (aco_ptr<Instruction>& phi : block.instructions) { 19457ec681f3Smrg if (!is_phi(phi)) 19467ec681f3Smrg break; 19477ec681f3Smrg Definition& definition = phi->definitions[0]; 19487ec681f3Smrg if (definition.isKill() || definition.isFixed()) 19497ec681f3Smrg continue; 19507ec681f3Smrg 19517ec681f3Smrg /* use affinity if available */ 19527ec681f3Smrg if (ctx.assignments[definition.tempId()].affinity && 19537ec681f3Smrg ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) { 19547ec681f3Smrg assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity]; 19557ec681f3Smrg assert(affinity.rc == definition.regClass()); 19567ec681f3Smrg if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) { 19577ec681f3Smrg definition.setFixed(affinity.reg); 19587ec681f3Smrg register_file.fill(definition); 19597ec681f3Smrg ctx.assignments[definition.tempId()].set(definition); 19607ec681f3Smrg continue; 19617ec681f3Smrg } 19627ec681f3Smrg } 19637ec681f3Smrg 19647ec681f3Smrg /* by going backwards, we aim to avoid copies in else-blocks */ 19657ec681f3Smrg for (int i = phi->operands.size() - 1; i >= 0; i--) { 19667ec681f3Smrg const Operand& op = phi->operands[i]; 19677ec681f3Smrg if (!op.isTemp() || !op.isFixed()) 19687ec681f3Smrg continue; 19697ec681f3Smrg 19707ec681f3Smrg PhysReg reg = op.physReg(); 19717ec681f3Smrg if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) { 19727ec681f3Smrg definition.setFixed(reg); 19737ec681f3Smrg register_file.fill(definition); 19747ec681f3Smrg ctx.assignments[definition.tempId()].set(definition); 19757ec681f3Smrg break; 19767ec681f3Smrg } 19777ec681f3Smrg } 19787ec681f3Smrg } 19797ec681f3Smrg 19807ec681f3Smrg /* find registers for phis where the register was blocked or no operand was assigned */ 19817ec681f3Smrg for (aco_ptr<Instruction>& phi : block.instructions) { 19827ec681f3Smrg if (!is_phi(phi)) 19837ec681f3Smrg break; 19847ec681f3Smrg 19857ec681f3Smrg Definition& definition = phi->definitions[0]; 19867ec681f3Smrg if (definition.isKill()) 19877ec681f3Smrg continue; 19887ec681f3Smrg 19897ec681f3Smrg if (definition.isFixed()) { 19907ec681f3Smrg instructions.emplace_back(std::move(phi)); 19917ec681f3Smrg continue; 19927ec681f3Smrg } 19937ec681f3Smrg 19947ec681f3Smrg std::vector<std::pair<Operand, Definition>> parallelcopy; 19957ec681f3Smrg definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi)); 19967ec681f3Smrg update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops); 19977ec681f3Smrg 19987ec681f3Smrg /* process parallelcopy */ 19997ec681f3Smrg for (std::pair<Operand, Definition> pc : parallelcopy) { 20007ec681f3Smrg /* see if it's a copy from a different phi */ 20017ec681f3Smrg // TODO: prefer moving some previous phis over live-ins 20027ec681f3Smrg // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a 20037ec681f3Smrg // problem in practice since they can only be fixed to exec) 20047ec681f3Smrg Instruction* prev_phi = NULL; 20057ec681f3Smrg std::vector<aco_ptr<Instruction>>::iterator phi_it; 20067ec681f3Smrg for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) { 20077ec681f3Smrg if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) 20087ec681f3Smrg prev_phi = phi_it->get(); 20097ec681f3Smrg } 20107ec681f3Smrg if (prev_phi) { 20117ec681f3Smrg /* if so, just update that phi's register */ 20127ec681f3Smrg prev_phi->definitions[0].setFixed(pc.second.physReg()); 20137ec681f3Smrg ctx.assignments[prev_phi->definitions[0].tempId()].set(pc.second); 20147ec681f3Smrg continue; 20157ec681f3Smrg } 20167ec681f3Smrg 20177ec681f3Smrg /* rename */ 20187ec681f3Smrg std::unordered_map<unsigned, Temp>::iterator orig_it = 20197ec681f3Smrg ctx.orig_names.find(pc.first.tempId()); 20207ec681f3Smrg Temp orig = pc.first.getTemp(); 20217ec681f3Smrg if (orig_it != ctx.orig_names.end()) 20227ec681f3Smrg orig = orig_it->second; 20237ec681f3Smrg else 20247ec681f3Smrg ctx.orig_names[pc.second.tempId()] = orig; 20257ec681f3Smrg ctx.renames[block.index][orig.id()] = pc.second.getTemp(); 20267ec681f3Smrg 20277ec681f3Smrg /* otherwise, this is a live-in and we need to create a new phi 20287ec681f3Smrg * to move it in this block's predecessors */ 20297ec681f3Smrg aco_opcode opcode = 20307ec681f3Smrg pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; 20317ec681f3Smrg std::vector<unsigned>& preds = 20327ec681f3Smrg pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; 20337ec681f3Smrg aco_ptr<Instruction> new_phi{ 20347ec681f3Smrg create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)}; 20357ec681f3Smrg new_phi->definitions[0] = pc.second; 20367ec681f3Smrg for (unsigned i = 0; i < preds.size(); i++) 20377ec681f3Smrg new_phi->operands[i] = Operand(pc.first); 20387ec681f3Smrg instructions.emplace_back(std::move(new_phi)); 20397ec681f3Smrg 20407ec681f3Smrg /* Remove from live_out_per_block (now used for live-in), because handle_loop_phis() 20417ec681f3Smrg * would re-create this phi later if this is a loop header. 20427ec681f3Smrg */ 20437ec681f3Smrg live_in.erase(orig.id()); 20447ec681f3Smrg } 20457ec681f3Smrg 20467ec681f3Smrg register_file.fill(definition); 20477ec681f3Smrg ctx.assignments[definition.tempId()].set(definition); 20487ec681f3Smrg instructions.emplace_back(std::move(phi)); 20497ec681f3Smrg } 20507ec681f3Smrg} 20517ec681f3Smrg 20527ec681f3SmrgTemp 20537ec681f3Smrgread_variable(ra_ctx& ctx, Temp val, unsigned block_idx) 20547ec681f3Smrg{ 20557ec681f3Smrg std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id()); 20567ec681f3Smrg if (it == ctx.renames[block_idx].end()) 20577ec681f3Smrg return val; 20587ec681f3Smrg else 20597ec681f3Smrg return it->second; 20607ec681f3Smrg} 20617ec681f3Smrg 20627ec681f3SmrgTemp 20637ec681f3Smrghandle_live_in(ra_ctx& ctx, Temp val, Block* block) 20647ec681f3Smrg{ 20657ec681f3Smrg std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds; 20667ec681f3Smrg if (preds.size() == 0) 20677ec681f3Smrg return val; 20687ec681f3Smrg 20697ec681f3Smrg if (preds.size() == 1) { 20707ec681f3Smrg /* if the block has only one predecessor, just look there for the name */ 20717ec681f3Smrg return read_variable(ctx, val, preds[0]); 20727ec681f3Smrg } 20737ec681f3Smrg 20747ec681f3Smrg /* there are multiple predecessors and the block is sealed */ 20757ec681f3Smrg Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp)); 20767ec681f3Smrg 20777ec681f3Smrg /* get the rename from each predecessor and check if they are the same */ 20787ec681f3Smrg Temp new_val; 20797ec681f3Smrg bool needs_phi = false; 20807ec681f3Smrg for (unsigned i = 0; i < preds.size(); i++) { 20817ec681f3Smrg ops[i] = read_variable(ctx, val, preds[i]); 20827ec681f3Smrg if (i == 0) 20837ec681f3Smrg new_val = ops[i]; 20847ec681f3Smrg else 20857ec681f3Smrg needs_phi |= !(new_val == ops[i]); 20867ec681f3Smrg } 20877ec681f3Smrg 20887ec681f3Smrg if (needs_phi) { 20897ec681f3Smrg assert(!val.regClass().is_linear_vgpr()); 20907ec681f3Smrg 20917ec681f3Smrg /* the variable has been renamed differently in the predecessors: we need to insert a phi */ 20927ec681f3Smrg aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; 20937ec681f3Smrg aco_ptr<Instruction> phi{ 20947ec681f3Smrg create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)}; 20957ec681f3Smrg new_val = ctx.program->allocateTmp(val.regClass()); 20967ec681f3Smrg phi->definitions[0] = Definition(new_val); 20977ec681f3Smrg ctx.assignments.emplace_back(); 20987ec681f3Smrg assert(ctx.assignments.size() == ctx.program->peekAllocationId()); 20997ec681f3Smrg for (unsigned i = 0; i < preds.size(); i++) { 21007ec681f3Smrg /* update the operands so that it uses the new affinity */ 21017ec681f3Smrg phi->operands[i] = Operand(ops[i]); 21027ec681f3Smrg assert(ctx.assignments[ops[i].id()].assigned); 21037ec681f3Smrg assert(ops[i].regClass() == new_val.regClass()); 21047ec681f3Smrg phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg); 21057ec681f3Smrg } 21067ec681f3Smrg block->instructions.insert(block->instructions.begin(), std::move(phi)); 21077ec681f3Smrg } 21087ec681f3Smrg 21097ec681f3Smrg return new_val; 21107ec681f3Smrg} 21117ec681f3Smrg 21127ec681f3Smrgvoid 21137ec681f3Smrghandle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx, 21147ec681f3Smrg uint32_t loop_exit_idx) 21157ec681f3Smrg{ 21167ec681f3Smrg Block& loop_header = ctx.program->blocks[loop_header_idx]; 21177ec681f3Smrg std::unordered_map<unsigned, Temp> renames; 21187ec681f3Smrg 21197ec681f3Smrg /* create phis for variables renamed during the loop */ 21207ec681f3Smrg for (unsigned t : live_in) { 21217ec681f3Smrg Temp val = Temp(t, ctx.program->temp_rc[t]); 21227ec681f3Smrg Temp prev = read_variable(ctx, val, loop_header_idx - 1); 21237ec681f3Smrg Temp renamed = handle_live_in(ctx, val, &loop_header); 21247ec681f3Smrg if (renamed == prev) 21257ec681f3Smrg continue; 21267ec681f3Smrg 21277ec681f3Smrg /* insert additional renames at block end, but don't overwrite */ 21287ec681f3Smrg renames[prev.id()] = renamed; 21297ec681f3Smrg ctx.orig_names[renamed.id()] = val; 21307ec681f3Smrg for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) { 21317ec681f3Smrg auto it = ctx.renames[idx].emplace(val.id(), renamed); 21327ec681f3Smrg /* if insertion is unsuccessful, update if necessary */ 21337ec681f3Smrg if (!it.second && it.first->second == prev) 21347ec681f3Smrg it.first->second = renamed; 21357ec681f3Smrg } 21367ec681f3Smrg 21377ec681f3Smrg /* update loop-carried values of the phi created by handle_live_in() */ 21387ec681f3Smrg for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) { 21397ec681f3Smrg Operand& op = loop_header.instructions[0]->operands[i]; 21407ec681f3Smrg if (op.getTemp() == prev) 21417ec681f3Smrg op.setTemp(renamed); 21427ec681f3Smrg } 21437ec681f3Smrg 21447ec681f3Smrg /* use the assignment from the loop preheader and fix def reg */ 21457ec681f3Smrg assignment& var = ctx.assignments[prev.id()]; 21467ec681f3Smrg ctx.assignments[renamed.id()] = var; 21477ec681f3Smrg loop_header.instructions[0]->definitions[0].setFixed(var.reg); 21487ec681f3Smrg } 21497ec681f3Smrg 21507ec681f3Smrg /* rename loop carried phi operands */ 21517ec681f3Smrg for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) { 21527ec681f3Smrg aco_ptr<Instruction>& phi = loop_header.instructions[i]; 21537ec681f3Smrg if (!is_phi(phi)) 21547ec681f3Smrg break; 21557ec681f3Smrg const std::vector<unsigned>& preds = 21567ec681f3Smrg phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds; 21577ec681f3Smrg for (unsigned j = 1; j < phi->operands.size(); j++) { 21587ec681f3Smrg Operand& op = phi->operands[j]; 21597ec681f3Smrg if (!op.isTemp()) 21607ec681f3Smrg continue; 21617ec681f3Smrg 21627ec681f3Smrg /* Find the original name, since this operand might not use the original name if the phi 21637ec681f3Smrg * was created after init_reg_file(). 21647ec681f3Smrg */ 21657ec681f3Smrg std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(op.tempId()); 21667ec681f3Smrg Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp(); 21677ec681f3Smrg 21687ec681f3Smrg op.setTemp(read_variable(ctx, orig, preds[j])); 21697ec681f3Smrg op.setFixed(ctx.assignments[op.tempId()].reg); 21707ec681f3Smrg } 21717ec681f3Smrg } 21727ec681f3Smrg 21737ec681f3Smrg /* return early if no new phi was created */ 21747ec681f3Smrg if (renames.empty()) 21757ec681f3Smrg return; 21767ec681f3Smrg 21777ec681f3Smrg /* propagate new renames through loop */ 21787ec681f3Smrg for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) { 21797ec681f3Smrg Block& current = ctx.program->blocks[idx]; 21807ec681f3Smrg /* rename all uses in this block */ 21817ec681f3Smrg for (aco_ptr<Instruction>& instr : current.instructions) { 21827ec681f3Smrg /* phis are renamed after RA */ 21837ec681f3Smrg if (idx == loop_header_idx && is_phi(instr)) 21847ec681f3Smrg continue; 21857ec681f3Smrg 21867ec681f3Smrg for (Operand& op : instr->operands) { 21877ec681f3Smrg if (!op.isTemp()) 21887ec681f3Smrg continue; 21897ec681f3Smrg 21907ec681f3Smrg auto rename = renames.find(op.tempId()); 21917ec681f3Smrg if (rename != renames.end()) { 21927ec681f3Smrg assert(rename->second.id()); 21937ec681f3Smrg op.setTemp(rename->second); 21947ec681f3Smrg } 21957ec681f3Smrg } 21967ec681f3Smrg } 21977ec681f3Smrg } 21987ec681f3Smrg} 21997ec681f3Smrg 22007ec681f3Smrg/** 22017ec681f3Smrg * This function serves the purpose to correctly initialize the register file 22027ec681f3Smrg * at the beginning of a block (before any existing phis). 22037ec681f3Smrg * In order to do so, all live-in variables are entered into the RegisterFile. 22047ec681f3Smrg * Reg-to-reg moves (renames) from previous blocks are taken into account and 22057ec681f3Smrg * the SSA is repaired by inserting corresponding phi-nodes. 22067ec681f3Smrg */ 22077ec681f3SmrgRegisterFile 22087ec681f3Smrginit_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block) 22097ec681f3Smrg{ 22107ec681f3Smrg if (block.kind & block_kind_loop_exit) { 22117ec681f3Smrg uint32_t header = ctx.loop_header.back(); 22127ec681f3Smrg ctx.loop_header.pop_back(); 22137ec681f3Smrg handle_loop_phis(ctx, live_out_per_block[header], header, block.index); 22147ec681f3Smrg } 22157ec681f3Smrg 22167ec681f3Smrg RegisterFile register_file; 22177ec681f3Smrg const IDSet& live_in = live_out_per_block[block.index]; 22187ec681f3Smrg assert(block.index != 0 || live_in.empty()); 22197ec681f3Smrg 22207ec681f3Smrg if (block.kind & block_kind_loop_header) { 22217ec681f3Smrg ctx.loop_header.emplace_back(block.index); 22227ec681f3Smrg /* already rename phis incoming value */ 22237ec681f3Smrg for (aco_ptr<Instruction>& instr : block.instructions) { 22247ec681f3Smrg if (!is_phi(instr)) 22257ec681f3Smrg break; 22267ec681f3Smrg Operand& operand = instr->operands[0]; 22277ec681f3Smrg if (operand.isTemp()) { 22287ec681f3Smrg operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1)); 22297ec681f3Smrg operand.setFixed(ctx.assignments[operand.tempId()].reg); 22307ec681f3Smrg } 22317ec681f3Smrg } 22327ec681f3Smrg for (unsigned t : live_in) { 22337ec681f3Smrg Temp val = Temp(t, ctx.program->temp_rc[t]); 22347ec681f3Smrg Temp renamed = read_variable(ctx, val, block.index - 1); 22357ec681f3Smrg if (renamed != val) 22367ec681f3Smrg ctx.renames[block.index][val.id()] = renamed; 22377ec681f3Smrg assignment& var = ctx.assignments[renamed.id()]; 22387ec681f3Smrg assert(var.assigned); 22397ec681f3Smrg register_file.fill(Definition(renamed.id(), var.reg, var.rc)); 22407ec681f3Smrg } 22417ec681f3Smrg } else { 22427ec681f3Smrg /* rename phi operands */ 22437ec681f3Smrg for (aco_ptr<Instruction>& instr : block.instructions) { 22447ec681f3Smrg if (!is_phi(instr)) 22457ec681f3Smrg break; 22467ec681f3Smrg const std::vector<unsigned>& preds = 22477ec681f3Smrg instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; 22487ec681f3Smrg 22497ec681f3Smrg for (unsigned i = 0; i < instr->operands.size(); i++) { 22507ec681f3Smrg Operand& operand = instr->operands[i]; 22517ec681f3Smrg if (!operand.isTemp()) 22527ec681f3Smrg continue; 22537ec681f3Smrg operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i])); 22547ec681f3Smrg operand.setFixed(ctx.assignments[operand.tempId()].reg); 22557ec681f3Smrg } 22567ec681f3Smrg } 22577ec681f3Smrg for (unsigned t : live_in) { 22587ec681f3Smrg Temp val = Temp(t, ctx.program->temp_rc[t]); 22597ec681f3Smrg Temp renamed = handle_live_in(ctx, val, &block); 22607ec681f3Smrg assignment& var = ctx.assignments[renamed.id()]; 22617ec681f3Smrg /* due to live-range splits, the live-in might be a phi, now */ 22627ec681f3Smrg if (var.assigned) { 22637ec681f3Smrg register_file.fill(Definition(renamed.id(), var.reg, var.rc)); 22647ec681f3Smrg } 22657ec681f3Smrg if (renamed != val) { 22667ec681f3Smrg ctx.renames[block.index].emplace(t, renamed); 22677ec681f3Smrg ctx.orig_names[renamed.id()] = val; 22687ec681f3Smrg } 22697ec681f3Smrg } 22707ec681f3Smrg } 22717ec681f3Smrg 22727ec681f3Smrg return register_file; 22737ec681f3Smrg} 22747ec681f3Smrg 22757ec681f3Smrgvoid 22767ec681f3Smrgget_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block) 22777ec681f3Smrg{ 22787ec681f3Smrg std::vector<std::vector<Temp>> phi_ressources; 22797ec681f3Smrg std::unordered_map<unsigned, unsigned> temp_to_phi_ressources; 22807ec681f3Smrg 22817ec681f3Smrg for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend(); 22827ec681f3Smrg block_rit++) { 22837ec681f3Smrg Block& block = *block_rit; 22847ec681f3Smrg 22857ec681f3Smrg /* first, compute the death points of all live vars within the block */ 22867ec681f3Smrg IDSet& live = live_out_per_block[block.index]; 22877ec681f3Smrg 22887ec681f3Smrg std::vector<aco_ptr<Instruction>>::reverse_iterator rit; 22897ec681f3Smrg for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) { 22907ec681f3Smrg aco_ptr<Instruction>& instr = *rit; 22917ec681f3Smrg if (is_phi(instr)) 22927ec681f3Smrg break; 22937ec681f3Smrg 22947ec681f3Smrg /* add vector affinities */ 22957ec681f3Smrg if (instr->opcode == aco_opcode::p_create_vector) { 22967ec681f3Smrg for (const Operand& op : instr->operands) { 22977ec681f3Smrg if (op.isTemp() && op.isFirstKill() && 22987ec681f3Smrg op.getTemp().type() == instr->definitions[0].getTemp().type()) 22997ec681f3Smrg ctx.vectors[op.tempId()] = instr.get(); 23007ec681f3Smrg } 23017ec681f3Smrg } else if (instr->format == Format::MIMG && instr->operands.size() > 4) { 23027ec681f3Smrg for (unsigned i = 3; i < instr->operands.size(); i++) 23037ec681f3Smrg ctx.vectors[instr->operands[i].tempId()] = instr.get(); 23047ec681f3Smrg } 23057ec681f3Smrg 23067ec681f3Smrg if (instr->opcode == aco_opcode::p_split_vector && 23077ec681f3Smrg instr->operands[0].isFirstKillBeforeDef()) 23087ec681f3Smrg ctx.split_vectors[instr->operands[0].tempId()] = instr.get(); 23097ec681f3Smrg 23107ec681f3Smrg /* add operands to live variables */ 23117ec681f3Smrg for (const Operand& op : instr->operands) { 23127ec681f3Smrg if (op.isTemp()) 23137ec681f3Smrg live.insert(op.tempId()); 23147ec681f3Smrg } 23157ec681f3Smrg 23167ec681f3Smrg /* erase definitions from live */ 23177ec681f3Smrg for (unsigned i = 0; i < instr->definitions.size(); i++) { 23187ec681f3Smrg const Definition& def = instr->definitions[i]; 23197ec681f3Smrg if (!def.isTemp()) 23207ec681f3Smrg continue; 23217ec681f3Smrg live.erase(def.tempId()); 23227ec681f3Smrg /* mark last-seen phi operand */ 23237ec681f3Smrg std::unordered_map<unsigned, unsigned>::iterator it = 23247ec681f3Smrg temp_to_phi_ressources.find(def.tempId()); 23257ec681f3Smrg if (it != temp_to_phi_ressources.end() && 23267ec681f3Smrg def.regClass() == phi_ressources[it->second][0].regClass()) { 23277ec681f3Smrg phi_ressources[it->second][0] = def.getTemp(); 23287ec681f3Smrg /* try to coalesce phi affinities with parallelcopies */ 23297ec681f3Smrg Operand op = Operand(); 23307ec681f3Smrg switch (instr->opcode) { 23317ec681f3Smrg case aco_opcode::p_parallelcopy: op = instr->operands[i]; break; 23327ec681f3Smrg 23337ec681f3Smrg case aco_opcode::v_interp_p2_f32: 23347ec681f3Smrg case aco_opcode::v_writelane_b32: 23357ec681f3Smrg case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break; 23367ec681f3Smrg 23377ec681f3Smrg case aco_opcode::v_fma_f32: 23387ec681f3Smrg case aco_opcode::v_fma_f16: 23397ec681f3Smrg case aco_opcode::v_pk_fma_f16: 23407ec681f3Smrg if (ctx.program->chip_class < GFX10) 23417ec681f3Smrg continue; 23427ec681f3Smrg FALLTHROUGH; 23437ec681f3Smrg case aco_opcode::v_mad_f32: 23447ec681f3Smrg case aco_opcode::v_mad_f16: 23457ec681f3Smrg if (instr->usesModifiers()) 23467ec681f3Smrg continue; 23477ec681f3Smrg op = instr->operands[2]; 23487ec681f3Smrg break; 23497ec681f3Smrg 23507ec681f3Smrg default: continue; 23517ec681f3Smrg } 23527ec681f3Smrg 23537ec681f3Smrg if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) { 23547ec681f3Smrg phi_ressources[it->second].emplace_back(op.getTemp()); 23557ec681f3Smrg temp_to_phi_ressources[op.tempId()] = it->second; 23567ec681f3Smrg } 23577ec681f3Smrg } 23587ec681f3Smrg } 23597ec681f3Smrg } 23607ec681f3Smrg 23617ec681f3Smrg /* collect phi affinities */ 23627ec681f3Smrg for (; rit != block.instructions.rend(); ++rit) { 23637ec681f3Smrg aco_ptr<Instruction>& instr = *rit; 23647ec681f3Smrg assert(is_phi(instr)); 23657ec681f3Smrg 23667ec681f3Smrg live.erase(instr->definitions[0].tempId()); 23677ec681f3Smrg if (instr->definitions[0].isKill() || instr->definitions[0].isFixed()) 23687ec681f3Smrg continue; 23697ec681f3Smrg 23707ec681f3Smrg assert(instr->definitions[0].isTemp()); 23717ec681f3Smrg std::unordered_map<unsigned, unsigned>::iterator it = 23727ec681f3Smrg temp_to_phi_ressources.find(instr->definitions[0].tempId()); 23737ec681f3Smrg unsigned index = phi_ressources.size(); 23747ec681f3Smrg std::vector<Temp>* affinity_related; 23757ec681f3Smrg if (it != temp_to_phi_ressources.end()) { 23767ec681f3Smrg index = it->second; 23777ec681f3Smrg phi_ressources[index][0] = instr->definitions[0].getTemp(); 23787ec681f3Smrg affinity_related = &phi_ressources[index]; 23797ec681f3Smrg } else { 23807ec681f3Smrg phi_ressources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()}); 23817ec681f3Smrg affinity_related = &phi_ressources.back(); 23827ec681f3Smrg } 23837ec681f3Smrg 23847ec681f3Smrg for (const Operand& op : instr->operands) { 23857ec681f3Smrg if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) { 23867ec681f3Smrg affinity_related->emplace_back(op.getTemp()); 23877ec681f3Smrg if (block.kind & block_kind_loop_header) 23887ec681f3Smrg continue; 23897ec681f3Smrg temp_to_phi_ressources[op.tempId()] = index; 23907ec681f3Smrg } 23917ec681f3Smrg } 23927ec681f3Smrg } 23937ec681f3Smrg 23947ec681f3Smrg /* visit the loop header phis first in order to create nested affinities */ 23957ec681f3Smrg if (block.kind & block_kind_loop_exit) { 23967ec681f3Smrg /* find loop header */ 23977ec681f3Smrg auto header_rit = block_rit; 23987ec681f3Smrg while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth) 23997ec681f3Smrg header_rit++; 24007ec681f3Smrg 24017ec681f3Smrg for (aco_ptr<Instruction>& phi : header_rit->instructions) { 24027ec681f3Smrg if (!is_phi(phi)) 24037ec681f3Smrg break; 24047ec681f3Smrg if (phi->definitions[0].isKill() || phi->definitions[0].isFixed()) 24057ec681f3Smrg continue; 24067ec681f3Smrg 24077ec681f3Smrg /* create an (empty) merge-set for the phi-related variables */ 24087ec681f3Smrg auto it = temp_to_phi_ressources.find(phi->definitions[0].tempId()); 24097ec681f3Smrg unsigned index = phi_ressources.size(); 24107ec681f3Smrg if (it == temp_to_phi_ressources.end()) { 24117ec681f3Smrg temp_to_phi_ressources[phi->definitions[0].tempId()] = index; 24127ec681f3Smrg phi_ressources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()}); 24137ec681f3Smrg } else { 24147ec681f3Smrg index = it->second; 24157ec681f3Smrg } 24167ec681f3Smrg for (unsigned i = 1; i < phi->operands.size(); i++) { 24177ec681f3Smrg const Operand& op = phi->operands[i]; 24187ec681f3Smrg if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) { 24197ec681f3Smrg temp_to_phi_ressources[op.tempId()] = index; 24207ec681f3Smrg } 24217ec681f3Smrg } 24227ec681f3Smrg } 24237ec681f3Smrg } 24247ec681f3Smrg } 24257ec681f3Smrg /* create affinities */ 24267ec681f3Smrg for (std::vector<Temp>& vec : phi_ressources) { 24277ec681f3Smrg for (unsigned i = 1; i < vec.size(); i++) 24287ec681f3Smrg if (vec[i].id() != vec[0].id()) 24297ec681f3Smrg ctx.assignments[vec[i].id()].affinity = vec[0].id(); 24307ec681f3Smrg } 24317ec681f3Smrg} 24327ec681f3Smrg 24337ec681f3Smrg} /* end namespace */ 24347ec681f3Smrg 24357ec681f3Smrgvoid 24367ec681f3Smrgregister_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy) 24377ec681f3Smrg{ 24387ec681f3Smrg ra_ctx ctx(program, policy); 24397ec681f3Smrg get_affinities(ctx, live_out_per_block); 24407ec681f3Smrg 24417ec681f3Smrg /* state of register file after phis */ 24427ec681f3Smrg std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size()); 24437ec681f3Smrg 24447ec681f3Smrg for (Block& block : program->blocks) { 24457ec681f3Smrg ctx.block = █ 24467ec681f3Smrg 24477ec681f3Smrg /* initialize register file */ 24487ec681f3Smrg RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block); 24497ec681f3Smrg ctx.war_hint.reset(); 24507ec681f3Smrg 24517ec681f3Smrg std::vector<aco_ptr<Instruction>> instructions; 24527ec681f3Smrg 24537ec681f3Smrg /* this is a slight adjustment from the paper as we already have phi nodes: 24547ec681f3Smrg * We consider them incomplete phis and only handle the definition. */ 24557ec681f3Smrg get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]); 24567ec681f3Smrg 24577ec681f3Smrg /* fill in sgpr_live_in */ 24587ec681f3Smrg for (unsigned i = 0; i <= ctx.max_used_sgpr; i++) 24597ec681f3Smrg sgpr_live_in[block.index][i] = register_file[PhysReg{i}]; 24607ec681f3Smrg sgpr_live_in[block.index][127] = register_file[scc]; 24617ec681f3Smrg 24627ec681f3Smrg /* Handle all other instructions of the block */ 24637ec681f3Smrg auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); }; 24647ec681f3Smrg std::vector<aco_ptr<Instruction>>::iterator instr_it = 24657ec681f3Smrg std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi); 24667ec681f3Smrg for (; instr_it != block.instructions.end(); ++instr_it) { 24677ec681f3Smrg aco_ptr<Instruction>& instr = *instr_it; 24687ec681f3Smrg 24697ec681f3Smrg /* parallelcopies from p_phi are inserted here which means 24707ec681f3Smrg * live ranges of killed operands end here as well */ 24717ec681f3Smrg if (instr->opcode == aco_opcode::p_logical_end) { 24727ec681f3Smrg /* no need to process this instruction any further */ 24737ec681f3Smrg if (block.logical_succs.size() != 1) { 24747ec681f3Smrg instructions.emplace_back(std::move(instr)); 24757ec681f3Smrg continue; 24767ec681f3Smrg } 24777ec681f3Smrg 24787ec681f3Smrg Block& succ = program->blocks[block.logical_succs[0]]; 24797ec681f3Smrg unsigned idx = 0; 24807ec681f3Smrg for (; idx < succ.logical_preds.size(); idx++) { 24817ec681f3Smrg if (succ.logical_preds[idx] == block.index) 24827ec681f3Smrg break; 24837ec681f3Smrg } 24847ec681f3Smrg for (aco_ptr<Instruction>& phi : succ.instructions) { 24857ec681f3Smrg if (phi->opcode == aco_opcode::p_phi) { 24867ec681f3Smrg if (phi->operands[idx].isTemp() && 24877ec681f3Smrg phi->operands[idx].getTemp().type() == RegType::sgpr && 24887ec681f3Smrg phi->operands[idx].isFirstKillBeforeDef()) { 24897ec681f3Smrg Definition phi_op( 24907ec681f3Smrg read_variable(ctx, phi->operands[idx].getTemp(), block.index)); 24917ec681f3Smrg phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg); 24927ec681f3Smrg register_file.clear(phi_op); 24937ec681f3Smrg } 24947ec681f3Smrg } else if (phi->opcode != aco_opcode::p_linear_phi) { 24957ec681f3Smrg break; 24967ec681f3Smrg } 24977ec681f3Smrg } 24987ec681f3Smrg instructions.emplace_back(std::move(instr)); 24997ec681f3Smrg continue; 25007ec681f3Smrg } 25017ec681f3Smrg 25027ec681f3Smrg std::vector<std::pair<Operand, Definition>> parallelcopy; 25037ec681f3Smrg 25047ec681f3Smrg assert(!is_phi(instr)); 25057ec681f3Smrg 25067ec681f3Smrg bool temp_in_scc = register_file[scc]; 25077ec681f3Smrg 25087ec681f3Smrg /* handle operands */ 25097ec681f3Smrg for (unsigned i = 0; i < instr->operands.size(); ++i) { 25107ec681f3Smrg auto& operand = instr->operands[i]; 25117ec681f3Smrg if (!operand.isTemp()) 25127ec681f3Smrg continue; 25137ec681f3Smrg 25147ec681f3Smrg /* rename operands */ 25157ec681f3Smrg operand.setTemp(read_variable(ctx, operand.getTemp(), block.index)); 25167ec681f3Smrg assert(ctx.assignments[operand.tempId()].assigned); 25177ec681f3Smrg 25187ec681f3Smrg PhysReg reg = ctx.assignments[operand.tempId()].reg; 25197ec681f3Smrg if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass())) 25207ec681f3Smrg operand.setFixed(reg); 25217ec681f3Smrg else 25227ec681f3Smrg get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i); 25237ec681f3Smrg 25247ec681f3Smrg if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) || 25257ec681f3Smrg (instr->isDS() && instr->ds().gds)) { 25267ec681f3Smrg for (unsigned j = 0; j < operand.size(); j++) 25277ec681f3Smrg ctx.war_hint.set(operand.physReg().reg() + j); 25287ec681f3Smrg } 25297ec681f3Smrg } 25307ec681f3Smrg 25317ec681f3Smrg /* remove dead vars from register file */ 25327ec681f3Smrg for (const Operand& op : instr->operands) { 25337ec681f3Smrg if (op.isTemp() && op.isFirstKillBeforeDef()) 25347ec681f3Smrg register_file.clear(op); 25357ec681f3Smrg } 25367ec681f3Smrg 25377ec681f3Smrg /* try to optimize v_mad_f32 -> v_mac_f32 */ 25387ec681f3Smrg if ((instr->opcode == aco_opcode::v_mad_f32 || 25397ec681f3Smrg (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) || 25407ec681f3Smrg instr->opcode == aco_opcode::v_mad_f16 || 25417ec681f3Smrg instr->opcode == aco_opcode::v_mad_legacy_f16 || 25427ec681f3Smrg (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) || 25437ec681f3Smrg (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10) || 25447ec681f3Smrg (instr->opcode == aco_opcode::v_dot4_i32_i8 && program->family != CHIP_VEGA20)) && 25457ec681f3Smrg instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() && 25467ec681f3Smrg instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() && 25477ec681f3Smrg instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() && 25487ec681f3Smrg instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 && 25497ec681f3Smrg instr->operands[2].physReg().byte() == 0) { 25507ec681f3Smrg unsigned def_id = instr->definitions[0].tempId(); 25517ec681f3Smrg bool use_vop2 = true; 25527ec681f3Smrg if (ctx.assignments[def_id].affinity) { 25537ec681f3Smrg assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity]; 25547ec681f3Smrg if (affinity.assigned && affinity.reg != instr->operands[2].physReg() && 25557ec681f3Smrg !register_file.test(affinity.reg, instr->operands[2].bytes())) 25567ec681f3Smrg use_vop2 = false; 25577ec681f3Smrg } 25587ec681f3Smrg if (use_vop2) { 25597ec681f3Smrg instr->format = Format::VOP2; 25607ec681f3Smrg switch (instr->opcode) { 25617ec681f3Smrg case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break; 25627ec681f3Smrg case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break; 25637ec681f3Smrg case aco_opcode::v_mad_f16: 25647ec681f3Smrg case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break; 25657ec681f3Smrg case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break; 25667ec681f3Smrg case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break; 25677ec681f3Smrg case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break; 25687ec681f3Smrg default: break; 25697ec681f3Smrg } 25707ec681f3Smrg } 25717ec681f3Smrg } 25727ec681f3Smrg 25737ec681f3Smrg /* handle definitions which must have the same register as an operand */ 25747ec681f3Smrg if (instr->opcode == aco_opcode::v_interp_p2_f32 || 25757ec681f3Smrg instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 || 25767ec681f3Smrg instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 || 25777ec681f3Smrg instr->opcode == aco_opcode::v_pk_fmac_f16 || 25787ec681f3Smrg instr->opcode == aco_opcode::v_writelane_b32 || 25797ec681f3Smrg instr->opcode == aco_opcode::v_writelane_b32_e64 || 25807ec681f3Smrg instr->opcode == aco_opcode::v_dot4c_i32_i8) { 25817ec681f3Smrg instr->definitions[0].setFixed(instr->operands[2].physReg()); 25827ec681f3Smrg } else if (instr->opcode == aco_opcode::s_addk_i32 || 25837ec681f3Smrg instr->opcode == aco_opcode::s_mulk_i32) { 25847ec681f3Smrg instr->definitions[0].setFixed(instr->operands[0].physReg()); 25857ec681f3Smrg } else if (instr->isMUBUF() && instr->definitions.size() == 1 && 25867ec681f3Smrg instr->operands.size() == 4) { 25877ec681f3Smrg instr->definitions[0].setFixed(instr->operands[3].physReg()); 25887ec681f3Smrg } else if (instr->isMIMG() && instr->definitions.size() == 1 && 25897ec681f3Smrg !instr->operands[2].isUndefined()) { 25907ec681f3Smrg instr->definitions[0].setFixed(instr->operands[2].physReg()); 25917ec681f3Smrg } 25927ec681f3Smrg 25937ec681f3Smrg ctx.defs_done.reset(); 25947ec681f3Smrg 25957ec681f3Smrg /* handle fixed definitions first */ 25967ec681f3Smrg for (unsigned i = 0; i < instr->definitions.size(); ++i) { 25977ec681f3Smrg auto& definition = instr->definitions[i]; 25987ec681f3Smrg if (!definition.isFixed()) 25997ec681f3Smrg continue; 26007ec681f3Smrg 26017ec681f3Smrg adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); 26027ec681f3Smrg /* check if the target register is blocked */ 26037ec681f3Smrg if (register_file.test(definition.physReg(), definition.bytes())) { 26047ec681f3Smrg const PhysRegInterval def_regs{definition.physReg(), definition.size()}; 26057ec681f3Smrg 26067ec681f3Smrg /* create parallelcopy pair to move blocking vars */ 26077ec681f3Smrg std::set<std::pair<unsigned, unsigned>> vars = 26087ec681f3Smrg collect_vars(ctx, register_file, def_regs); 26097ec681f3Smrg 26107ec681f3Smrg RegisterFile tmp_file(register_file); 26117ec681f3Smrg /* re-enable the killed operands, so that we don't move the blocking vars there */ 26127ec681f3Smrg for (const Operand& op : instr->operands) { 26137ec681f3Smrg if (op.isTemp() && op.isFirstKillBeforeDef()) 26147ec681f3Smrg tmp_file.fill(op); 26157ec681f3Smrg } 26167ec681f3Smrg 26177ec681f3Smrg ASSERTED bool success = false; 26187ec681f3Smrg DefInfo info(ctx, instr, definition.regClass(), -1); 26197ec681f3Smrg success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr, 26207ec681f3Smrg def_regs); 26217ec681f3Smrg assert(success); 26227ec681f3Smrg 26237ec681f3Smrg update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); 26247ec681f3Smrg } 26257ec681f3Smrg ctx.defs_done.set(i); 26267ec681f3Smrg 26277ec681f3Smrg if (!definition.isTemp()) 26287ec681f3Smrg continue; 26297ec681f3Smrg 26307ec681f3Smrg ctx.assignments[definition.tempId()].set(definition); 26317ec681f3Smrg register_file.fill(definition); 26327ec681f3Smrg } 26337ec681f3Smrg 26347ec681f3Smrg /* handle all other definitions */ 26357ec681f3Smrg for (unsigned i = 0; i < instr->definitions.size(); ++i) { 26367ec681f3Smrg Definition* definition = &instr->definitions[i]; 26377ec681f3Smrg 26387ec681f3Smrg if (definition->isFixed() || !definition->isTemp()) 26397ec681f3Smrg continue; 26407ec681f3Smrg 26417ec681f3Smrg /* find free reg */ 26427ec681f3Smrg if (definition->hasHint() && 26437ec681f3Smrg get_reg_specified(ctx, register_file, definition->regClass(), instr, 26447ec681f3Smrg definition->physReg())) { 26457ec681f3Smrg definition->setFixed(definition->physReg()); 26467ec681f3Smrg } else if (instr->opcode == aco_opcode::p_split_vector) { 26477ec681f3Smrg PhysReg reg = instr->operands[0].physReg(); 26487ec681f3Smrg for (unsigned j = 0; j < i; j++) 26497ec681f3Smrg reg.reg_b += instr->definitions[j].bytes(); 26507ec681f3Smrg if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg)) 26517ec681f3Smrg definition->setFixed(reg); 26527ec681f3Smrg } else if (instr->opcode == aco_opcode::p_wqm || 26537ec681f3Smrg instr->opcode == aco_opcode::p_parallelcopy) { 26547ec681f3Smrg PhysReg reg = instr->operands[i].physReg(); 26557ec681f3Smrg if (instr->operands[i].isTemp() && 26567ec681f3Smrg instr->operands[i].getTemp().type() == definition->getTemp().type() && 26577ec681f3Smrg !register_file.test(reg, definition->bytes())) 26587ec681f3Smrg definition->setFixed(reg); 26597ec681f3Smrg } else if (instr->opcode == aco_opcode::p_extract_vector) { 26607ec681f3Smrg PhysReg reg = instr->operands[0].physReg(); 26617ec681f3Smrg reg.reg_b += definition->bytes() * instr->operands[1].constantValue(); 26627ec681f3Smrg if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg)) 26637ec681f3Smrg definition->setFixed(reg); 26647ec681f3Smrg } else if (instr->opcode == aco_opcode::p_create_vector) { 26657ec681f3Smrg PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(), 26667ec681f3Smrg parallelcopy, instr); 26677ec681f3Smrg update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); 26687ec681f3Smrg definition->setFixed(reg); 26697ec681f3Smrg } 26707ec681f3Smrg 26717ec681f3Smrg if (!definition->isFixed()) { 26727ec681f3Smrg Temp tmp = definition->getTemp(); 26737ec681f3Smrg if (definition->regClass().is_subdword() && definition->bytes() < 4) { 26747ec681f3Smrg PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr); 26757ec681f3Smrg definition->setFixed(reg); 26767ec681f3Smrg if (reg.byte() || register_file.test(reg, 4)) { 26777ec681f3Smrg add_subdword_definition(program, instr, reg); 26787ec681f3Smrg definition = &instr->definitions[i]; /* add_subdword_definition can invalidate 26797ec681f3Smrg the reference */ 26807ec681f3Smrg } 26817ec681f3Smrg } else { 26827ec681f3Smrg definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr)); 26837ec681f3Smrg } 26847ec681f3Smrg update_renames(ctx, register_file, parallelcopy, instr, 26857ec681f3Smrg instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops 26867ec681f3Smrg : (UpdateRenames)0); 26877ec681f3Smrg } 26887ec681f3Smrg 26897ec681f3Smrg assert( 26907ec681f3Smrg definition->isFixed() && 26917ec681f3Smrg ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) || 26927ec681f3Smrg (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256))); 26937ec681f3Smrg ctx.defs_done.set(i); 26947ec681f3Smrg ctx.assignments[definition->tempId()].set(*definition); 26957ec681f3Smrg register_file.fill(*definition); 26967ec681f3Smrg } 26977ec681f3Smrg 26987ec681f3Smrg handle_pseudo(ctx, register_file, instr.get()); 26997ec681f3Smrg 27007ec681f3Smrg /* kill definitions and late-kill operands and ensure that sub-dword operands can actually 27017ec681f3Smrg * be read */ 27027ec681f3Smrg for (const Definition& def : instr->definitions) { 27037ec681f3Smrg if (def.isTemp() && def.isKill()) 27047ec681f3Smrg register_file.clear(def); 27057ec681f3Smrg } 27067ec681f3Smrg for (unsigned i = 0; i < instr->operands.size(); i++) { 27077ec681f3Smrg const Operand& op = instr->operands[i]; 27087ec681f3Smrg if (op.isTemp() && op.isFirstKill() && op.isLateKill()) 27097ec681f3Smrg register_file.clear(op); 27107ec681f3Smrg if (op.isTemp() && op.physReg().byte() != 0) 27117ec681f3Smrg add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass()); 27127ec681f3Smrg } 27137ec681f3Smrg 27147ec681f3Smrg /* emit parallelcopy */ 27157ec681f3Smrg if (!parallelcopy.empty()) { 27167ec681f3Smrg aco_ptr<Pseudo_instruction> pc; 27177ec681f3Smrg pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, 27187ec681f3Smrg Format::PSEUDO, parallelcopy.size(), 27197ec681f3Smrg parallelcopy.size())); 27207ec681f3Smrg bool linear_vgpr = false; 27217ec681f3Smrg bool sgpr_operands_alias_defs = false; 27227ec681f3Smrg uint64_t sgpr_operands[4] = {0, 0, 0, 0}; 27237ec681f3Smrg for (unsigned i = 0; i < parallelcopy.size(); i++) { 27247ec681f3Smrg linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr(); 27257ec681f3Smrg 27267ec681f3Smrg if (temp_in_scc && parallelcopy[i].first.isTemp() && 27277ec681f3Smrg parallelcopy[i].first.getTemp().type() == RegType::sgpr) { 27287ec681f3Smrg if (!sgpr_operands_alias_defs) { 27297ec681f3Smrg unsigned reg = parallelcopy[i].first.physReg().reg(); 27307ec681f3Smrg unsigned size = parallelcopy[i].first.getTemp().size(); 27317ec681f3Smrg sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); 27327ec681f3Smrg 27337ec681f3Smrg reg = parallelcopy[i].second.physReg().reg(); 27347ec681f3Smrg size = parallelcopy[i].second.getTemp().size(); 27357ec681f3Smrg if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) 27367ec681f3Smrg sgpr_operands_alias_defs = true; 27377ec681f3Smrg } 27387ec681f3Smrg } 27397ec681f3Smrg 27407ec681f3Smrg pc->operands[i] = parallelcopy[i].first; 27417ec681f3Smrg pc->definitions[i] = parallelcopy[i].second; 27427ec681f3Smrg assert(pc->operands[i].size() == pc->definitions[i].size()); 27437ec681f3Smrg 27447ec681f3Smrg /* it might happen that the operand is already renamed. we have to restore the 27457ec681f3Smrg * original name. */ 27467ec681f3Smrg std::unordered_map<unsigned, Temp>::iterator it = 27477ec681f3Smrg ctx.orig_names.find(pc->operands[i].tempId()); 27487ec681f3Smrg Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp(); 27497ec681f3Smrg ctx.orig_names[pc->definitions[i].tempId()] = orig; 27507ec681f3Smrg ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp(); 27517ec681f3Smrg } 27527ec681f3Smrg 27537ec681f3Smrg if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) { 27547ec681f3Smrg /* disable definitions and re-enable operands */ 27557ec681f3Smrg RegisterFile tmp_file(register_file); 27567ec681f3Smrg for (const Definition& def : instr->definitions) { 27577ec681f3Smrg if (def.isTemp() && !def.isKill()) 27587ec681f3Smrg tmp_file.clear(def); 27597ec681f3Smrg } 27607ec681f3Smrg for (const Operand& op : instr->operands) { 27617ec681f3Smrg if (op.isTemp() && op.isFirstKill()) 27627ec681f3Smrg tmp_file.block(op.physReg(), op.regClass()); 27637ec681f3Smrg } 27647ec681f3Smrg 27657ec681f3Smrg handle_pseudo(ctx, tmp_file, pc.get()); 27667ec681f3Smrg } else { 27677ec681f3Smrg pc->tmp_in_scc = false; 27687ec681f3Smrg } 27697ec681f3Smrg 27707ec681f3Smrg instructions.emplace_back(std::move(pc)); 27717ec681f3Smrg } 27727ec681f3Smrg 27737ec681f3Smrg /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */ 27747ec681f3Smrg bool instr_needs_vop3 = 27757ec681f3Smrg !instr->isVOP3() && 27767ec681f3Smrg ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || 27777ec681f3Smrg (instr->opcode == aco_opcode::v_cndmask_b32 && 27787ec681f3Smrg !(instr->operands[2].physReg() == vcc)) || 27797ec681f3Smrg ((instr->opcode == aco_opcode::v_add_co_u32 || 27807ec681f3Smrg instr->opcode == aco_opcode::v_addc_co_u32 || 27817ec681f3Smrg instr->opcode == aco_opcode::v_sub_co_u32 || 27827ec681f3Smrg instr->opcode == aco_opcode::v_subb_co_u32 || 27837ec681f3Smrg instr->opcode == aco_opcode::v_subrev_co_u32 || 27847ec681f3Smrg instr->opcode == aco_opcode::v_subbrev_co_u32) && 27857ec681f3Smrg !(instr->definitions[1].physReg() == vcc)) || 27867ec681f3Smrg ((instr->opcode == aco_opcode::v_addc_co_u32 || 27877ec681f3Smrg instr->opcode == aco_opcode::v_subb_co_u32 || 27887ec681f3Smrg instr->opcode == aco_opcode::v_subbrev_co_u32) && 27897ec681f3Smrg !(instr->operands[2].physReg() == vcc))); 27907ec681f3Smrg if (instr_needs_vop3) { 27917ec681f3Smrg 27927ec681f3Smrg /* if the first operand is a literal, we have to move it to a reg */ 27937ec681f3Smrg if (instr->operands.size() && instr->operands[0].isLiteral() && 27947ec681f3Smrg program->chip_class < GFX10) { 27957ec681f3Smrg bool can_sgpr = true; 27967ec681f3Smrg /* check, if we have to move to vgpr */ 27977ec681f3Smrg for (const Operand& op : instr->operands) { 27987ec681f3Smrg if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { 27997ec681f3Smrg can_sgpr = false; 28007ec681f3Smrg break; 28017ec681f3Smrg } 28027ec681f3Smrg } 28037ec681f3Smrg /* disable definitions and re-enable operands */ 28047ec681f3Smrg RegisterFile tmp_file(register_file); 28057ec681f3Smrg for (const Definition& def : instr->definitions) 28067ec681f3Smrg tmp_file.clear(def); 28077ec681f3Smrg for (const Operand& op : instr->operands) { 28087ec681f3Smrg if (op.isTemp() && op.isFirstKill()) 28097ec681f3Smrg tmp_file.block(op.physReg(), op.regClass()); 28107ec681f3Smrg } 28117ec681f3Smrg Temp tmp = program->allocateTmp(can_sgpr ? s1 : v1); 28127ec681f3Smrg ctx.assignments.emplace_back(); 28137ec681f3Smrg PhysReg reg = get_reg(ctx, tmp_file, tmp, parallelcopy, instr); 28147ec681f3Smrg update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops); 28157ec681f3Smrg 28167ec681f3Smrg aco_ptr<Instruction> mov; 28177ec681f3Smrg if (can_sgpr) 28187ec681f3Smrg mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, 28197ec681f3Smrg Format::SOP1, 1, 1)); 28207ec681f3Smrg else 28217ec681f3Smrg mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, 28227ec681f3Smrg Format::VOP1, 1, 1)); 28237ec681f3Smrg mov->operands[0] = instr->operands[0]; 28247ec681f3Smrg mov->definitions[0] = Definition(tmp); 28257ec681f3Smrg mov->definitions[0].setFixed(reg); 28267ec681f3Smrg 28277ec681f3Smrg instr->operands[0] = Operand(tmp); 28287ec681f3Smrg instr->operands[0].setFixed(reg); 28297ec681f3Smrg instr->operands[0].setFirstKill(true); 28307ec681f3Smrg 28317ec681f3Smrg instructions.emplace_back(std::move(mov)); 28327ec681f3Smrg } 28337ec681f3Smrg 28347ec681f3Smrg /* change the instruction to VOP3 to enable an arbitrary register pair as dst */ 28357ec681f3Smrg aco_ptr<Instruction> tmp = std::move(instr); 28367ec681f3Smrg Format format = asVOP3(tmp->format); 28377ec681f3Smrg instr.reset(create_instruction<VOP3_instruction>( 28387ec681f3Smrg tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); 28397ec681f3Smrg std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin()); 28407ec681f3Smrg std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin()); 28417ec681f3Smrg } 28427ec681f3Smrg 28437ec681f3Smrg instructions.emplace_back(std::move(*instr_it)); 28447ec681f3Smrg 28457ec681f3Smrg } /* end for Instr */ 28467ec681f3Smrg 28477ec681f3Smrg block.instructions = std::move(instructions); 28487ec681f3Smrg } /* end for BB */ 28497ec681f3Smrg 28507ec681f3Smrg /* find scc spill registers which may be needed for parallelcopies created by phis */ 28517ec681f3Smrg for (Block& block : program->blocks) { 28527ec681f3Smrg if (block.linear_preds.size() <= 1) 28537ec681f3Smrg continue; 28547ec681f3Smrg 28557ec681f3Smrg std::bitset<128> regs = sgpr_live_in[block.index]; 28567ec681f3Smrg if (!regs[127]) 28577ec681f3Smrg continue; 28587ec681f3Smrg 28597ec681f3Smrg /* choose a register */ 28607ec681f3Smrg int16_t reg = 0; 28617ec681f3Smrg for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++) 28627ec681f3Smrg ; 28637ec681f3Smrg assert(reg < ctx.program->max_reg_demand.sgpr); 28647ec681f3Smrg adjust_max_used_regs(ctx, s1, reg); 28657ec681f3Smrg 28667ec681f3Smrg /* update predecessors */ 28677ec681f3Smrg for (unsigned& pred_index : block.linear_preds) { 28687ec681f3Smrg Block& pred = program->blocks[pred_index]; 28697ec681f3Smrg pred.scc_live_out = true; 28707ec681f3Smrg pred.scratch_sgpr = PhysReg{(uint16_t)reg}; 28717ec681f3Smrg } 28727ec681f3Smrg } 28737ec681f3Smrg 28747ec681f3Smrg /* num_gpr = rnd_up(max_used_gpr + 1) */ 28757ec681f3Smrg program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1); 28767ec681f3Smrg program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1); 28777ec681f3Smrg 28787ec681f3Smrg program->progress = CompilationProgress::after_ra; 28797ec681f3Smrg} 28807ec681f3Smrg 28817ec681f3Smrg} // namespace aco 2882