17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2018 Valve Corporation
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
217ec681f3Smrg * IN THE SOFTWARE.
227ec681f3Smrg *
237ec681f3Smrg */
247ec681f3Smrg
257ec681f3Smrg#include "aco_ir.h"
267ec681f3Smrg
277ec681f3Smrg#include <algorithm>
287ec681f3Smrg#include <array>
297ec681f3Smrg#include <bitset>
307ec681f3Smrg#include <map>
317ec681f3Smrg#include <set>
327ec681f3Smrg#include <unordered_map>
337ec681f3Smrg#include <vector>
347ec681f3Smrg
357ec681f3Smrgnamespace aco {
367ec681f3Smrgnamespace {
377ec681f3Smrg
387ec681f3Smrgstruct ra_ctx;
397ec681f3Smrg
407ec681f3Smrgunsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr,
417ec681f3Smrg                                     unsigned idx, RegClass rc);
427ec681f3Smrgvoid add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
437ec681f3Smrg                          RegClass rc);
447ec681f3Smrgstd::pair<unsigned, unsigned>
457ec681f3Smrgget_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc);
467ec681f3Smrgvoid add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg);
477ec681f3Smrg
487ec681f3Smrgstruct assignment {
497ec681f3Smrg   PhysReg reg;
507ec681f3Smrg   RegClass rc;
517ec681f3Smrg   bool assigned = false;
527ec681f3Smrg   uint32_t affinity = 0;
537ec681f3Smrg   assignment() = default;
547ec681f3Smrg   assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {}
557ec681f3Smrg   void set(const Definition& def)
567ec681f3Smrg   {
577ec681f3Smrg      assigned = true;
587ec681f3Smrg      reg = def.physReg();
597ec681f3Smrg      rc = def.regClass();
607ec681f3Smrg   }
617ec681f3Smrg};
627ec681f3Smrg
637ec681f3Smrgstruct ra_ctx {
647ec681f3Smrg
657ec681f3Smrg   Program* program;
667ec681f3Smrg   Block* block = NULL;
677ec681f3Smrg   std::vector<assignment> assignments;
687ec681f3Smrg   std::vector<std::unordered_map<unsigned, Temp>> renames;
697ec681f3Smrg   std::vector<uint32_t> loop_header;
707ec681f3Smrg   std::unordered_map<unsigned, Temp> orig_names;
717ec681f3Smrg   std::unordered_map<unsigned, Instruction*> vectors;
727ec681f3Smrg   std::unordered_map<unsigned, Instruction*> split_vectors;
737ec681f3Smrg   aco_ptr<Instruction> pseudo_dummy;
747ec681f3Smrg   uint16_t max_used_sgpr = 0;
757ec681f3Smrg   uint16_t max_used_vgpr = 0;
767ec681f3Smrg   uint16_t sgpr_limit;
777ec681f3Smrg   uint16_t vgpr_limit;
787ec681f3Smrg   std::bitset<512> war_hint;
797ec681f3Smrg   std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
807ec681f3Smrg
817ec681f3Smrg   ra_test_policy policy;
827ec681f3Smrg
837ec681f3Smrg   ra_ctx(Program* program_, ra_test_policy policy_)
847ec681f3Smrg       : program(program_), assignments(program->peekAllocationId()),
857ec681f3Smrg         renames(program->blocks.size()), policy(policy_)
867ec681f3Smrg   {
877ec681f3Smrg      pseudo_dummy.reset(
887ec681f3Smrg         create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
897ec681f3Smrg      sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
907ec681f3Smrg      vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
917ec681f3Smrg   }
927ec681f3Smrg};
937ec681f3Smrg
947ec681f3Smrg/* Iterator type for making PhysRegInterval compatible with range-based for */
957ec681f3Smrgstruct PhysRegIterator {
967ec681f3Smrg   using difference_type = int;
977ec681f3Smrg   using value_type = unsigned;
987ec681f3Smrg   using reference = const unsigned&;
997ec681f3Smrg   using pointer = const unsigned*;
1007ec681f3Smrg   using iterator_category = std::bidirectional_iterator_tag;
1017ec681f3Smrg
1027ec681f3Smrg   PhysReg reg;
1037ec681f3Smrg
1047ec681f3Smrg   PhysReg operator*() const { return reg; }
1057ec681f3Smrg
1067ec681f3Smrg   PhysRegIterator& operator++()
1077ec681f3Smrg   {
1087ec681f3Smrg      reg.reg_b += 4;
1097ec681f3Smrg      return *this;
1107ec681f3Smrg   }
1117ec681f3Smrg
1127ec681f3Smrg   PhysRegIterator& operator--()
1137ec681f3Smrg   {
1147ec681f3Smrg      reg.reg_b -= 4;
1157ec681f3Smrg      return *this;
1167ec681f3Smrg   }
1177ec681f3Smrg
1187ec681f3Smrg   bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }
1197ec681f3Smrg
1207ec681f3Smrg   bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }
1217ec681f3Smrg
1227ec681f3Smrg   bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
1237ec681f3Smrg};
1247ec681f3Smrg
1257ec681f3Smrg/* Half-open register interval used in "sliding window"-style for-loops */
1267ec681f3Smrgstruct PhysRegInterval {
1277ec681f3Smrg   PhysReg lo_;
1287ec681f3Smrg   unsigned size;
1297ec681f3Smrg
1307ec681f3Smrg   /* Inclusive lower bound */
1317ec681f3Smrg   PhysReg lo() const { return lo_; }
1327ec681f3Smrg
1337ec681f3Smrg   /* Exclusive upper bound */
1347ec681f3Smrg   PhysReg hi() const { return PhysReg{lo() + size}; }
1357ec681f3Smrg
1367ec681f3Smrg   PhysRegInterval& operator+=(uint32_t stride)
1377ec681f3Smrg   {
1387ec681f3Smrg      lo_ = PhysReg{lo_.reg() + stride};
1397ec681f3Smrg      return *this;
1407ec681f3Smrg   }
1417ec681f3Smrg
1427ec681f3Smrg   bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }
1437ec681f3Smrg
1447ec681f3Smrg   /* Construct a half-open interval, excluding the end register */
1457ec681f3Smrg   static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }
1467ec681f3Smrg
1477ec681f3Smrg   bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }
1487ec681f3Smrg
1497ec681f3Smrg   bool contains(const PhysRegInterval& needle) const
1507ec681f3Smrg   {
1517ec681f3Smrg      return needle.lo() >= lo() && needle.hi() <= hi();
1527ec681f3Smrg   }
1537ec681f3Smrg
1547ec681f3Smrg   PhysRegIterator begin() const { return {lo_}; }
1557ec681f3Smrg
1567ec681f3Smrg   PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
1577ec681f3Smrg};
1587ec681f3Smrg
1597ec681f3Smrgbool
1607ec681f3Smrgintersects(const PhysRegInterval& a, const PhysRegInterval& b)
1617ec681f3Smrg{
1627ec681f3Smrg   return a.hi() > b.lo() && b.hi() > a.lo();
1637ec681f3Smrg}
1647ec681f3Smrg
1657ec681f3Smrg/* Gets the stride for full (non-subdword) registers */
1667ec681f3Smrguint32_t
1677ec681f3Smrgget_stride(RegClass rc)
1687ec681f3Smrg{
1697ec681f3Smrg   if (rc.type() == RegType::vgpr) {
1707ec681f3Smrg      return 1;
1717ec681f3Smrg   } else {
1727ec681f3Smrg      uint32_t size = rc.size();
1737ec681f3Smrg      if (size == 2) {
1747ec681f3Smrg         return 2;
1757ec681f3Smrg      } else if (size >= 4) {
1767ec681f3Smrg         return 4;
1777ec681f3Smrg      } else {
1787ec681f3Smrg         return 1;
1797ec681f3Smrg      }
1807ec681f3Smrg   }
1817ec681f3Smrg}
1827ec681f3Smrg
1837ec681f3SmrgPhysRegInterval
1847ec681f3Smrgget_reg_bounds(Program* program, RegType type)
1857ec681f3Smrg{
1867ec681f3Smrg   if (type == RegType::vgpr) {
1877ec681f3Smrg      return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr};
1887ec681f3Smrg   } else {
1897ec681f3Smrg      return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr};
1907ec681f3Smrg   }
1917ec681f3Smrg}
1927ec681f3Smrg
1937ec681f3Smrgstruct DefInfo {
1947ec681f3Smrg   PhysRegInterval bounds;
1957ec681f3Smrg   uint8_t size;
1967ec681f3Smrg   uint8_t stride;
1977ec681f3Smrg   RegClass rc;
1987ec681f3Smrg
1997ec681f3Smrg   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
2007ec681f3Smrg   {
2017ec681f3Smrg      size = rc.size();
2027ec681f3Smrg      stride = get_stride(rc);
2037ec681f3Smrg
2047ec681f3Smrg      bounds = get_reg_bounds(ctx.program, rc.type());
2057ec681f3Smrg
2067ec681f3Smrg      if (rc.is_subdword() && operand >= 0) {
2077ec681f3Smrg         /* stride in bytes */
2087ec681f3Smrg         stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc);
2097ec681f3Smrg      } else if (rc.is_subdword()) {
2107ec681f3Smrg         std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
2117ec681f3Smrg         stride = info.first;
2127ec681f3Smrg         if (info.second > rc.bytes()) {
2137ec681f3Smrg            rc = RegClass::get(rc.type(), info.second);
2147ec681f3Smrg            size = rc.size();
2157ec681f3Smrg            /* we might still be able to put the definition in the high half,
2167ec681f3Smrg             * but that's only useful for affinities and this information isn't
2177ec681f3Smrg             * used for them */
2187ec681f3Smrg            stride = align(stride, info.second);
2197ec681f3Smrg            if (!rc.is_subdword())
2207ec681f3Smrg               stride = DIV_ROUND_UP(stride, 4);
2217ec681f3Smrg         }
2227ec681f3Smrg         assert(stride > 0);
2237ec681f3Smrg      }
2247ec681f3Smrg   }
2257ec681f3Smrg};
2267ec681f3Smrg
2277ec681f3Smrgclass RegisterFile {
2287ec681f3Smrgpublic:
2297ec681f3Smrg   RegisterFile() { regs.fill(0); }
2307ec681f3Smrg
2317ec681f3Smrg   std::array<uint32_t, 512> regs;
2327ec681f3Smrg   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;
2337ec681f3Smrg
2347ec681f3Smrg   const uint32_t& operator[](PhysReg index) const { return regs[index]; }
2357ec681f3Smrg
2367ec681f3Smrg   uint32_t& operator[](PhysReg index) { return regs[index]; }
2377ec681f3Smrg
2387ec681f3Smrg   unsigned count_zero(PhysRegInterval reg_interval)
2397ec681f3Smrg   {
2407ec681f3Smrg      unsigned res = 0;
2417ec681f3Smrg      for (PhysReg reg : reg_interval)
2427ec681f3Smrg         res += !regs[reg];
2437ec681f3Smrg      return res;
2447ec681f3Smrg   }
2457ec681f3Smrg
2467ec681f3Smrg   /* Returns true if any of the bytes in the given range are allocated or blocked */
2477ec681f3Smrg   bool test(PhysReg start, unsigned num_bytes)
2487ec681f3Smrg   {
2497ec681f3Smrg      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
2507ec681f3Smrg         assert(i <= 511);
2517ec681f3Smrg         if (regs[i] & 0x0FFFFFFF)
2527ec681f3Smrg            return true;
2537ec681f3Smrg         if (regs[i] == 0xF0000000) {
2547ec681f3Smrg            assert(subdword_regs.find(i) != subdword_regs.end());
2557ec681f3Smrg            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
2567ec681f3Smrg               if (subdword_regs[i][j])
2577ec681f3Smrg                  return true;
2587ec681f3Smrg            }
2597ec681f3Smrg         }
2607ec681f3Smrg      }
2617ec681f3Smrg      return false;
2627ec681f3Smrg   }
2637ec681f3Smrg
2647ec681f3Smrg   void block(PhysReg start, RegClass rc)
2657ec681f3Smrg   {
2667ec681f3Smrg      if (rc.is_subdword())
2677ec681f3Smrg         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
2687ec681f3Smrg      else
2697ec681f3Smrg         fill(start, rc.size(), 0xFFFFFFFF);
2707ec681f3Smrg   }
2717ec681f3Smrg
2727ec681f3Smrg   bool is_blocked(PhysReg start)
2737ec681f3Smrg   {
2747ec681f3Smrg      if (regs[start] == 0xFFFFFFFF)
2757ec681f3Smrg         return true;
2767ec681f3Smrg      if (regs[start] == 0xF0000000) {
2777ec681f3Smrg         for (unsigned i = start.byte(); i < 4; i++)
2787ec681f3Smrg            if (subdword_regs[start][i] == 0xFFFFFFFF)
2797ec681f3Smrg               return true;
2807ec681f3Smrg      }
2817ec681f3Smrg      return false;
2827ec681f3Smrg   }
2837ec681f3Smrg
2847ec681f3Smrg   bool is_empty_or_blocked(PhysReg start)
2857ec681f3Smrg   {
2867ec681f3Smrg      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
2877ec681f3Smrg       * incremented value to 1 */
2887ec681f3Smrg      if (regs[start] == 0xF0000000) {
2897ec681f3Smrg         return subdword_regs[start][start.byte()] + 1 <= 1;
2907ec681f3Smrg      }
2917ec681f3Smrg      return regs[start] + 1 <= 1;
2927ec681f3Smrg   }
2937ec681f3Smrg
2947ec681f3Smrg   void clear(PhysReg start, RegClass rc)
2957ec681f3Smrg   {
2967ec681f3Smrg      if (rc.is_subdword())
2977ec681f3Smrg         fill_subdword(start, rc.bytes(), 0);
2987ec681f3Smrg      else
2997ec681f3Smrg         fill(start, rc.size(), 0);
3007ec681f3Smrg   }
3017ec681f3Smrg
3027ec681f3Smrg   void fill(Operand op)
3037ec681f3Smrg   {
3047ec681f3Smrg      if (op.regClass().is_subdword())
3057ec681f3Smrg         fill_subdword(op.physReg(), op.bytes(), op.tempId());
3067ec681f3Smrg      else
3077ec681f3Smrg         fill(op.physReg(), op.size(), op.tempId());
3087ec681f3Smrg   }
3097ec681f3Smrg
3107ec681f3Smrg   void clear(Operand op) { clear(op.physReg(), op.regClass()); }
3117ec681f3Smrg
3127ec681f3Smrg   void fill(Definition def)
3137ec681f3Smrg   {
3147ec681f3Smrg      if (def.regClass().is_subdword())
3157ec681f3Smrg         fill_subdword(def.physReg(), def.bytes(), def.tempId());
3167ec681f3Smrg      else
3177ec681f3Smrg         fill(def.physReg(), def.size(), def.tempId());
3187ec681f3Smrg   }
3197ec681f3Smrg
3207ec681f3Smrg   void clear(Definition def) { clear(def.physReg(), def.regClass()); }
3217ec681f3Smrg
3227ec681f3Smrg   unsigned get_id(PhysReg reg)
3237ec681f3Smrg   {
3247ec681f3Smrg      return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
3257ec681f3Smrg   }
3267ec681f3Smrg
3277ec681f3Smrgprivate:
3287ec681f3Smrg   void fill(PhysReg start, unsigned size, uint32_t val)
3297ec681f3Smrg   {
3307ec681f3Smrg      for (unsigned i = 0; i < size; i++)
3317ec681f3Smrg         regs[start + i] = val;
3327ec681f3Smrg   }
3337ec681f3Smrg
3347ec681f3Smrg   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
3357ec681f3Smrg   {
3367ec681f3Smrg      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
3377ec681f3Smrg      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
3387ec681f3Smrg         /* emplace or get */
3397ec681f3Smrg         std::array<uint32_t, 4>& sub =
3407ec681f3Smrg            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
3417ec681f3Smrg         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
3427ec681f3Smrg            sub[j] = val;
3437ec681f3Smrg
3447ec681f3Smrg         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
3457ec681f3Smrg            subdword_regs.erase(i);
3467ec681f3Smrg            regs[i] = 0;
3477ec681f3Smrg         }
3487ec681f3Smrg      }
3497ec681f3Smrg   }
3507ec681f3Smrg};
3517ec681f3Smrg
3527ec681f3Smrgstd::set<std::pair<unsigned, unsigned>> find_vars(ra_ctx& ctx, RegisterFile& reg_file,
3537ec681f3Smrg                                                  const PhysRegInterval reg_interval);
3547ec681f3Smrg
3557ec681f3Smrg/* helper function for debugging */
3567ec681f3SmrgUNUSED void
3577ec681f3Smrgprint_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
3587ec681f3Smrg{
3597ec681f3Smrg   if (reg_file[reg] == 0xFFFFFFFF) {
3607ec681f3Smrg      printf("☐");
3617ec681f3Smrg   } else if (reg_file[reg]) {
3627ec681f3Smrg      const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
3637ec681f3Smrg      if (show_subdword_alloc) {
3647ec681f3Smrg         const char* block_chars[] = {
3657ec681f3Smrg            // clang-format off
3667ec681f3Smrg            "?", "▘", "▝", "▀",
3677ec681f3Smrg            "▖", "▌", "▞", "▛",
3687ec681f3Smrg            "▗", "▚", "▐", "▜",
3697ec681f3Smrg            "▄", "▙", "▟", "▉"
3707ec681f3Smrg            // clang-format on
3717ec681f3Smrg         };
3727ec681f3Smrg         unsigned index = 0;
3737ec681f3Smrg         for (int i = 0; i < 4; ++i) {
3747ec681f3Smrg            if (reg_file.subdword_regs.at(reg)[i]) {
3757ec681f3Smrg               index |= 1 << i;
3767ec681f3Smrg            }
3777ec681f3Smrg         }
3787ec681f3Smrg         printf("%s", block_chars[index]);
3797ec681f3Smrg      } else {
3807ec681f3Smrg         /* Indicate filled register slot */
3817ec681f3Smrg         if (!has_adjacent_variable) {
3827ec681f3Smrg            printf("█");
3837ec681f3Smrg         } else {
3847ec681f3Smrg            /* Use a slightly shorter box to leave a small gap between adjacent variables */
3857ec681f3Smrg            printf("▉");
3867ec681f3Smrg         }
3877ec681f3Smrg      }
3887ec681f3Smrg   } else {
3897ec681f3Smrg      printf("·");
3907ec681f3Smrg   }
3917ec681f3Smrg}
3927ec681f3Smrg
3937ec681f3Smrg/* helper function for debugging */
3947ec681f3SmrgUNUSED void
3957ec681f3Smrgprint_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
3967ec681f3Smrg{
3977ec681f3Smrg   PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr);
3987ec681f3Smrg   char reg_char = vgprs ? 'v' : 's';
3997ec681f3Smrg   const int max_regs_per_line = 64;
4007ec681f3Smrg
4017ec681f3Smrg   /* print markers */
4027ec681f3Smrg   printf("       ");
4037ec681f3Smrg   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
4047ec681f3Smrg      printf("%-3.2u ", i);
4057ec681f3Smrg   }
4067ec681f3Smrg   printf("\n");
4077ec681f3Smrg
4087ec681f3Smrg   /* print usage */
4097ec681f3Smrg   auto line_begin_it = regs.begin();
4107ec681f3Smrg   while (line_begin_it != regs.end()) {
4117ec681f3Smrg      const int regs_in_line =
4127ec681f3Smrg         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));
4137ec681f3Smrg
4147ec681f3Smrg      if (line_begin_it == regs.begin()) {
4157ec681f3Smrg         printf("%cgprs: ", reg_char);
4167ec681f3Smrg      } else {
4177ec681f3Smrg         printf("  %+4d ", std::distance(regs.begin(), line_begin_it));
4187ec681f3Smrg      }
4197ec681f3Smrg      const auto line_end_it = std::next(line_begin_it, regs_in_line);
4207ec681f3Smrg
4217ec681f3Smrg      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
4227ec681f3Smrg         bool has_adjacent_variable =
4237ec681f3Smrg            (std::next(reg_it) != line_end_it &&
4247ec681f3Smrg             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
4257ec681f3Smrg         print_reg(reg_file, *reg_it, has_adjacent_variable);
4267ec681f3Smrg      }
4277ec681f3Smrg
4287ec681f3Smrg      line_begin_it = line_end_it;
4297ec681f3Smrg      printf("\n");
4307ec681f3Smrg   }
4317ec681f3Smrg
4327ec681f3Smrg   const unsigned free_regs =
4337ec681f3Smrg      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
4347ec681f3Smrg   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);
4357ec681f3Smrg
4367ec681f3Smrg   /* print assignments ordered by registers */
4377ec681f3Smrg   std::map<PhysReg, std::pair<unsigned, unsigned>>
4387ec681f3Smrg      regs_to_vars; /* maps to byte size and temp id */
4397ec681f3Smrg   for (const auto& size_id : find_vars(ctx, reg_file, regs)) {
4407ec681f3Smrg      auto reg = ctx.assignments[size_id.second].reg;
4417ec681f3Smrg      ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id);
4427ec681f3Smrg      assert(inserted.second);
4437ec681f3Smrg   }
4447ec681f3Smrg
4457ec681f3Smrg   for (const auto& reg_and_var : regs_to_vars) {
4467ec681f3Smrg      const auto& first_reg = reg_and_var.first;
4477ec681f3Smrg      const auto& size_id = reg_and_var.second;
4487ec681f3Smrg
4497ec681f3Smrg      printf("%%%u ", size_id.second);
4507ec681f3Smrg      if (ctx.orig_names.count(size_id.second) &&
4517ec681f3Smrg          ctx.orig_names[size_id.second].id() != size_id.second) {
4527ec681f3Smrg         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
4537ec681f3Smrg      }
4547ec681f3Smrg      printf("= %c[%d", reg_char, first_reg.reg() - regs.lo());
4557ec681f3Smrg      PhysReg last_reg = first_reg.advance(size_id.first - 1);
4567ec681f3Smrg      if (first_reg.reg() != last_reg.reg()) {
4577ec681f3Smrg         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
4587ec681f3Smrg         printf("-%d", last_reg.reg() - regs.lo());
4597ec681f3Smrg      }
4607ec681f3Smrg      printf("]");
4617ec681f3Smrg      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
4627ec681f3Smrg         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
4637ec681f3Smrg      }
4647ec681f3Smrg      printf("\n");
4657ec681f3Smrg   }
4667ec681f3Smrg}
4677ec681f3Smrg
4687ec681f3Smrgunsigned
4697ec681f3Smrgget_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx,
4707ec681f3Smrg                            RegClass rc)
4717ec681f3Smrg{
4727ec681f3Smrg   if (instr->isPseudo()) {
4737ec681f3Smrg      /* v_readfirstlane_b32 cannot use SDWA */
4747ec681f3Smrg      if (instr->opcode == aco_opcode::p_as_uniform)
4757ec681f3Smrg         return 4;
4767ec681f3Smrg      else if (chip >= GFX8)
4777ec681f3Smrg         return rc.bytes() % 2 == 0 ? 2 : 1;
4787ec681f3Smrg      else
4797ec681f3Smrg         return 4;
4807ec681f3Smrg   }
4817ec681f3Smrg
4827ec681f3Smrg   assert(rc.bytes() <= 2);
4837ec681f3Smrg   if (instr->isVALU()) {
4847ec681f3Smrg      if (can_use_SDWA(chip, instr, false))
4857ec681f3Smrg         return rc.bytes();
4867ec681f3Smrg      if (can_use_opsel(chip, instr->opcode, idx, true))
4877ec681f3Smrg         return 2;
4887ec681f3Smrg      if (instr->format == Format::VOP3P)
4897ec681f3Smrg         return 2;
4907ec681f3Smrg   }
4917ec681f3Smrg
4927ec681f3Smrg   switch (instr->opcode) {
4937ec681f3Smrg   case aco_opcode::v_cvt_f32_ubyte0: return 1;
4947ec681f3Smrg   case aco_opcode::ds_write_b8:
4957ec681f3Smrg   case aco_opcode::ds_write_b16: return chip >= GFX9 ? 2 : 4;
4967ec681f3Smrg   case aco_opcode::buffer_store_byte:
4977ec681f3Smrg   case aco_opcode::buffer_store_short:
4987ec681f3Smrg   case aco_opcode::flat_store_byte:
4997ec681f3Smrg   case aco_opcode::flat_store_short:
5007ec681f3Smrg   case aco_opcode::scratch_store_byte:
5017ec681f3Smrg   case aco_opcode::scratch_store_short:
5027ec681f3Smrg   case aco_opcode::global_store_byte:
5037ec681f3Smrg   case aco_opcode::global_store_short: return chip >= GFX9 ? 2 : 4;
5047ec681f3Smrg   default: return 4;
5057ec681f3Smrg   }
5067ec681f3Smrg}
5077ec681f3Smrg
5087ec681f3Smrgvoid
5097ec681f3Smrgadd_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
5107ec681f3Smrg                     RegClass rc)
5117ec681f3Smrg{
5127ec681f3Smrg   chip_class chip = ctx.program->chip_class;
5137ec681f3Smrg   if (instr->isPseudo() || byte == 0)
5147ec681f3Smrg      return;
5157ec681f3Smrg
5167ec681f3Smrg   assert(rc.bytes() <= 2);
5177ec681f3Smrg   if (instr->isVALU()) {
5187ec681f3Smrg      /* check if we can use opsel */
5197ec681f3Smrg      if (instr->format == Format::VOP3) {
5207ec681f3Smrg         assert(byte == 2);
5217ec681f3Smrg         instr->vop3().opsel |= 1 << idx;
5227ec681f3Smrg         return;
5237ec681f3Smrg      }
5247ec681f3Smrg      if (instr->isVOP3P()) {
5257ec681f3Smrg         assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx)));
5267ec681f3Smrg         instr->vop3p().opsel_lo |= 1 << idx;
5277ec681f3Smrg         instr->vop3p().opsel_hi |= 1 << idx;
5287ec681f3Smrg         return;
5297ec681f3Smrg      }
5307ec681f3Smrg      if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
5317ec681f3Smrg         switch (byte) {
5327ec681f3Smrg         case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
5337ec681f3Smrg         case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
5347ec681f3Smrg         case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
5357ec681f3Smrg         case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
5367ec681f3Smrg         }
5377ec681f3Smrg         return;
5387ec681f3Smrg      }
5397ec681f3Smrg
5407ec681f3Smrg      /* use SDWA */
5417ec681f3Smrg      assert(can_use_SDWA(chip, instr, false));
5427ec681f3Smrg      convert_to_SDWA(chip, instr);
5437ec681f3Smrg      return;
5447ec681f3Smrg   }
5457ec681f3Smrg
5467ec681f3Smrg   assert(byte == 2);
5477ec681f3Smrg   if (instr->opcode == aco_opcode::ds_write_b8)
5487ec681f3Smrg      instr->opcode = aco_opcode::ds_write_b8_d16_hi;
5497ec681f3Smrg   else if (instr->opcode == aco_opcode::ds_write_b16)
5507ec681f3Smrg      instr->opcode = aco_opcode::ds_write_b16_d16_hi;
5517ec681f3Smrg   else if (instr->opcode == aco_opcode::buffer_store_byte)
5527ec681f3Smrg      instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
5537ec681f3Smrg   else if (instr->opcode == aco_opcode::buffer_store_short)
5547ec681f3Smrg      instr->opcode = aco_opcode::buffer_store_short_d16_hi;
5557ec681f3Smrg   else if (instr->opcode == aco_opcode::flat_store_byte)
5567ec681f3Smrg      instr->opcode = aco_opcode::flat_store_byte_d16_hi;
5577ec681f3Smrg   else if (instr->opcode == aco_opcode::flat_store_short)
5587ec681f3Smrg      instr->opcode = aco_opcode::flat_store_short_d16_hi;
5597ec681f3Smrg   else if (instr->opcode == aco_opcode::scratch_store_byte)
5607ec681f3Smrg      instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
5617ec681f3Smrg   else if (instr->opcode == aco_opcode::scratch_store_short)
5627ec681f3Smrg      instr->opcode = aco_opcode::scratch_store_short_d16_hi;
5637ec681f3Smrg   else if (instr->opcode == aco_opcode::global_store_byte)
5647ec681f3Smrg      instr->opcode = aco_opcode::global_store_byte_d16_hi;
5657ec681f3Smrg   else if (instr->opcode == aco_opcode::global_store_short)
5667ec681f3Smrg      instr->opcode = aco_opcode::global_store_short_d16_hi;
5677ec681f3Smrg   else
5687ec681f3Smrg      unreachable("Something went wrong: Impossible register assignment.");
5697ec681f3Smrg   return;
5707ec681f3Smrg}
5717ec681f3Smrg
5727ec681f3Smrg/* minimum_stride, bytes_written */
5737ec681f3Smrgstd::pair<unsigned, unsigned>
5747ec681f3Smrgget_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc)
5757ec681f3Smrg{
5767ec681f3Smrg   chip_class chip = program->chip_class;
5777ec681f3Smrg
5787ec681f3Smrg   if (instr->isPseudo()) {
5797ec681f3Smrg      if (chip >= GFX8)
5807ec681f3Smrg         return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
5817ec681f3Smrg      else
5827ec681f3Smrg         return std::make_pair(4, rc.size() * 4u);
5837ec681f3Smrg   }
5847ec681f3Smrg
5857ec681f3Smrg   if (instr->isVALU() || instr->isVINTRP()) {
5867ec681f3Smrg      assert(rc.bytes() <= 2);
5877ec681f3Smrg
5887ec681f3Smrg      if (can_use_SDWA(chip, instr, false))
5897ec681f3Smrg         return std::make_pair(rc.bytes(), rc.bytes());
5907ec681f3Smrg
5917ec681f3Smrg      unsigned bytes_written = 4u;
5927ec681f3Smrg      if (instr_is_16bit(chip, instr->opcode))
5937ec681f3Smrg         bytes_written = 2u;
5947ec681f3Smrg
5957ec681f3Smrg      unsigned stride = 4u;
5967ec681f3Smrg      if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
5977ec681f3Smrg          can_use_opsel(chip, instr->opcode, -1, true))
5987ec681f3Smrg         stride = 2u;
5997ec681f3Smrg
6007ec681f3Smrg      return std::make_pair(stride, bytes_written);
6017ec681f3Smrg   }
6027ec681f3Smrg
6037ec681f3Smrg   switch (instr->opcode) {
6047ec681f3Smrg   case aco_opcode::ds_read_u8_d16:
6057ec681f3Smrg   case aco_opcode::ds_read_i8_d16:
6067ec681f3Smrg   case aco_opcode::ds_read_u16_d16:
6077ec681f3Smrg   case aco_opcode::flat_load_ubyte_d16:
6087ec681f3Smrg   case aco_opcode::flat_load_sbyte_d16:
6097ec681f3Smrg   case aco_opcode::flat_load_short_d16:
6107ec681f3Smrg   case aco_opcode::global_load_ubyte_d16:
6117ec681f3Smrg   case aco_opcode::global_load_sbyte_d16:
6127ec681f3Smrg   case aco_opcode::global_load_short_d16:
6137ec681f3Smrg   case aco_opcode::scratch_load_ubyte_d16:
6147ec681f3Smrg   case aco_opcode::scratch_load_sbyte_d16:
6157ec681f3Smrg   case aco_opcode::scratch_load_short_d16:
6167ec681f3Smrg   case aco_opcode::buffer_load_ubyte_d16:
6177ec681f3Smrg   case aco_opcode::buffer_load_sbyte_d16:
6187ec681f3Smrg   case aco_opcode::buffer_load_short_d16: {
6197ec681f3Smrg      assert(chip >= GFX9);
6207ec681f3Smrg      if (!program->dev.sram_ecc_enabled)
6217ec681f3Smrg         return std::make_pair(2u, 2u);
6227ec681f3Smrg      else
6237ec681f3Smrg         return std::make_pair(2u, 4u);
6247ec681f3Smrg   }
6257ec681f3Smrg
6267ec681f3Smrg   default: return std::make_pair(4, rc.size() * 4u);
6277ec681f3Smrg   }
6287ec681f3Smrg}
6297ec681f3Smrg
6307ec681f3Smrgvoid
6317ec681f3Smrgadd_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg)
6327ec681f3Smrg{
6337ec681f3Smrg   if (instr->isPseudo())
6347ec681f3Smrg      return;
6357ec681f3Smrg
6367ec681f3Smrg   if (instr->isVALU()) {
6377ec681f3Smrg      chip_class chip = program->chip_class;
6387ec681f3Smrg      assert(instr->definitions[0].bytes() <= 2);
6397ec681f3Smrg
6407ec681f3Smrg      if (reg.byte() == 0 && instr_is_16bit(chip, instr->opcode))
6417ec681f3Smrg         return;
6427ec681f3Smrg
6437ec681f3Smrg      /* check if we can use opsel */
6447ec681f3Smrg      if (instr->format == Format::VOP3) {
6457ec681f3Smrg         assert(reg.byte() == 2);
6467ec681f3Smrg         assert(can_use_opsel(chip, instr->opcode, -1, true));
6477ec681f3Smrg         instr->vop3().opsel |= (1 << 3); /* dst in high half */
6487ec681f3Smrg         return;
6497ec681f3Smrg      }
6507ec681f3Smrg
6517ec681f3Smrg      if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
6527ec681f3Smrg         instr->opcode = aco_opcode::v_fma_mixhi_f16;
6537ec681f3Smrg         return;
6547ec681f3Smrg      }
6557ec681f3Smrg
6567ec681f3Smrg      /* use SDWA */
6577ec681f3Smrg      assert(can_use_SDWA(chip, instr, false));
6587ec681f3Smrg      convert_to_SDWA(chip, instr);
6597ec681f3Smrg      return;
6607ec681f3Smrg   }
6617ec681f3Smrg
6627ec681f3Smrg   if (reg.byte() == 0)
6637ec681f3Smrg      return;
6647ec681f3Smrg   else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
6657ec681f3Smrg      instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
6667ec681f3Smrg   else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
6677ec681f3Smrg      instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
6687ec681f3Smrg   else if (instr->opcode == aco_opcode::buffer_load_short_d16)
6697ec681f3Smrg      instr->opcode = aco_opcode::buffer_load_short_d16_hi;
6707ec681f3Smrg   else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
6717ec681f3Smrg      instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
6727ec681f3Smrg   else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
6737ec681f3Smrg      instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
6747ec681f3Smrg   else if (instr->opcode == aco_opcode::flat_load_short_d16)
6757ec681f3Smrg      instr->opcode = aco_opcode::flat_load_short_d16_hi;
6767ec681f3Smrg   else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
6777ec681f3Smrg      instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
6787ec681f3Smrg   else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
6797ec681f3Smrg      instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
6807ec681f3Smrg   else if (instr->opcode == aco_opcode::scratch_load_short_d16)
6817ec681f3Smrg      instr->opcode = aco_opcode::scratch_load_short_d16_hi;
6827ec681f3Smrg   else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
6837ec681f3Smrg      instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
6847ec681f3Smrg   else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
6857ec681f3Smrg      instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
6867ec681f3Smrg   else if (instr->opcode == aco_opcode::global_load_short_d16)
6877ec681f3Smrg      instr->opcode = aco_opcode::global_load_short_d16_hi;
6887ec681f3Smrg   else if (instr->opcode == aco_opcode::ds_read_u8_d16)
6897ec681f3Smrg      instr->opcode = aco_opcode::ds_read_u8_d16_hi;
6907ec681f3Smrg   else if (instr->opcode == aco_opcode::ds_read_i8_d16)
6917ec681f3Smrg      instr->opcode = aco_opcode::ds_read_i8_d16_hi;
6927ec681f3Smrg   else if (instr->opcode == aco_opcode::ds_read_u16_d16)
6937ec681f3Smrg      instr->opcode = aco_opcode::ds_read_u16_d16_hi;
6947ec681f3Smrg   else
6957ec681f3Smrg      unreachable("Something went wrong: Impossible register assignment.");
6967ec681f3Smrg}
6977ec681f3Smrg
6987ec681f3Smrgvoid
6997ec681f3Smrgadjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
7007ec681f3Smrg{
7017ec681f3Smrg   uint16_t max_addressible_sgpr = ctx.sgpr_limit;
7027ec681f3Smrg   unsigned size = rc.size();
7037ec681f3Smrg   if (rc.type() == RegType::vgpr) {
7047ec681f3Smrg      assert(reg >= 256);
7057ec681f3Smrg      uint16_t hi = reg - 256 + size - 1;
7067ec681f3Smrg      ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
7077ec681f3Smrg   } else if (reg + rc.size() <= max_addressible_sgpr) {
7087ec681f3Smrg      uint16_t hi = reg + size - 1;
7097ec681f3Smrg      ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
7107ec681f3Smrg   }
7117ec681f3Smrg}
7127ec681f3Smrg
7137ec681f3Smrgenum UpdateRenames {
7147ec681f3Smrg   rename_not_killed_ops = 0x1,
7157ec681f3Smrg   fill_killed_ops = 0x2,
7167ec681f3Smrg};
7177ec681f3SmrgMESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);
7187ec681f3Smrg
/* Finalizes all parallelcopies whose definition has no temporary yet.
 *
 * Copies whose definition already is a temporary are skipped. For every other
 * copy the source operand is first cleared from reg_file. Then, per copy:
 *  - if the copied temporary is a definition of instr, that definition is
 *    moved to the target register and the copy is erased;
 *  - else if the copied temporary is the definition of another parallelcopy,
 *    that other copy is retargeted and this copy is erased;
 *  - otherwise a new temporary is allocated for the copy, an assignment is
 *    appended to ctx.assignments and matching operands of instr are renamed.
 *
 * flags: rename_not_killed_ops forces renaming of operands which are not
 * killed before the definition; fill_killed_ops fills reg_file for a copy
 * even if the renamed operand is killed.
 */
7197ec681f3Smrgvoid
7207ec681f3Smrgupdate_renames(ra_ctx& ctx, RegisterFile& reg_file,
7217ec681f3Smrg               std::vector<std::pair<Operand, Definition>>& parallelcopies,
7227ec681f3Smrg               aco_ptr<Instruction>& instr, UpdateRenames flags)
7237ec681f3Smrg{
7247ec681f3Smrg   /* clear operands */
7257ec681f3Smrg   for (std::pair<Operand, Definition>& copy : parallelcopies) {
7267ec681f3Smrg      /* the definitions with id are not from this function and already handled */
7277ec681f3Smrg      if (copy.second.isTemp())
7287ec681f3Smrg         continue;
7297ec681f3Smrg      reg_file.clear(copy.first);
7307ec681f3Smrg   }
7317ec681f3Smrg
7327ec681f3Smrg   /* allocate id's and rename operands: this is done transparently here */
   /* the iterator is advanced manually because entries may be erased below */
7337ec681f3Smrg   auto it = parallelcopies.begin();
7347ec681f3Smrg   while (it != parallelcopies.end()) {
7357ec681f3Smrg      if (it->second.isTemp()) {
7367ec681f3Smrg         ++it;
7377ec681f3Smrg         continue;
7387ec681f3Smrg      }
7397ec681f3Smrg
7407ec681f3Smrg      /* check if we moved a definition: change the register and remove copy */
7417ec681f3Smrg      bool is_def = false;
7427ec681f3Smrg      for (Definition& def : instr->definitions) {
7437ec681f3Smrg         if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
7447ec681f3Smrg            // FIXME: ensure that the definition can use this reg
7457ec681f3Smrg            def.setFixed(it->second.physReg());
7467ec681f3Smrg            reg_file.fill(def);
7477ec681f3Smrg            ctx.assignments[def.tempId()].reg = def.physReg();
7487ec681f3Smrg            it = parallelcopies.erase(it);
7497ec681f3Smrg            is_def = true;
7507ec681f3Smrg            break;
7517ec681f3Smrg         }
7527ec681f3Smrg      }
7537ec681f3Smrg      if (is_def)
7547ec681f3Smrg         continue;
7557ec681f3Smrg
7567ec681f3Smrg      /* check if we moved another parallelcopy definition */
7577ec681f3Smrg      for (std::pair<Operand, Definition>& other : parallelcopies) {
7587ec681f3Smrg         if (!other.second.isTemp())
7597ec681f3Smrg            continue;
7607ec681f3Smrg         if (it->first.getTemp() == other.second.getTemp()) {
            /* retarget the existing copy instead of chaining a second copy */
7617ec681f3Smrg            other.second.setFixed(it->second.physReg());
7627ec681f3Smrg            ctx.assignments[other.second.tempId()].reg = other.second.physReg();
            /* NOTE(review): 'other' is still used after this erase; this is
             * only valid if 'other' precedes 'it' in the vector - confirm */
7637ec681f3Smrg            it = parallelcopies.erase(it);
7647ec681f3Smrg            is_def = true;
7657ec681f3Smrg            /* check if we moved an operand, again */
7667ec681f3Smrg            bool fill = true;
7677ec681f3Smrg            for (Operand& op : instr->operands) {
7687ec681f3Smrg               if (op.isTemp() && op.tempId() == other.second.tempId()) {
7697ec681f3Smrg                  // FIXME: ensure that the operand can use this reg
7707ec681f3Smrg                  op.setFixed(other.second.physReg());
7717ec681f3Smrg                  fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
7727ec681f3Smrg               }
7737ec681f3Smrg            }
7747ec681f3Smrg            if (fill)
7757ec681f3Smrg               reg_file.fill(other.second);
7767ec681f3Smrg            break;
7777ec681f3Smrg         }
7787ec681f3Smrg      }
7797ec681f3Smrg      if (is_def)
7807ec681f3Smrg         continue;
7817ec681f3Smrg
      /* regular case: give the copy's definition a fresh temporary and
       * record its register assignment */
7827ec681f3Smrg      std::pair<Operand, Definition>& copy = *it;
7837ec681f3Smrg      copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
7847ec681f3Smrg      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
7857ec681f3Smrg      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
7867ec681f3Smrg
7877ec681f3Smrg      /* check if we moved an operand */
7887ec681f3Smrg      bool first = true;
7897ec681f3Smrg      bool fill = true;
7907ec681f3Smrg      for (unsigned i = 0; i < instr->operands.size(); i++) {
7917ec681f3Smrg         Operand& op = instr->operands[i];
7927ec681f3Smrg         if (!op.isTemp())
7937ec681f3Smrg            continue;
7947ec681f3Smrg         if (op.tempId() == copy.first.tempId()) {
            /* renaming may only be omitted if the flag doesn't force it, the
             * operand is not killed before the definition and no parallelcopy
             * definition overlaps the operand's current registers */
7957ec681f3Smrg            bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef();
7967ec681f3Smrg            for (std::pair<Operand, Definition>& pc : parallelcopies) {
7977ec681f3Smrg               PhysReg def_reg = pc.second.physReg();
7987ec681f3Smrg               omit_renaming &= def_reg > copy.first.physReg()
7997ec681f3Smrg                                   ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
8007ec681f3Smrg                                   : (def_reg + pc.second.size() <= copy.first.physReg().reg());
8017ec681f3Smrg            }
8027ec681f3Smrg            if (omit_renaming) {
               /* keep the operand on its old temporary and mark it as killed;
                * only the first such operand is marked as first kill */
8037ec681f3Smrg               if (first)
8047ec681f3Smrg                  op.setFirstKill(true);
8057ec681f3Smrg               else
8067ec681f3Smrg                  op.setKill(true);
8077ec681f3Smrg               first = false;
8087ec681f3Smrg               continue;
8097ec681f3Smrg            }
8107ec681f3Smrg            op.setTemp(copy.second.getTemp());
8117ec681f3Smrg            op.setFixed(copy.second.physReg());
8127ec681f3Smrg
8137ec681f3Smrg            fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
8147ec681f3Smrg         }
8157ec681f3Smrg      }
8167ec681f3Smrg
8177ec681f3Smrg      if (fill)
8187ec681f3Smrg         reg_file.fill(copy.second);
8197ec681f3Smrg
8207ec681f3Smrg      ++it;
8217ec681f3Smrg   }
8227ec681f3Smrg}
8237ec681f3Smrg
8247ec681f3Smrgstd::pair<PhysReg, bool>
8257ec681f3Smrgget_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
8267ec681f3Smrg{
8277ec681f3Smrg   const PhysRegInterval& bounds = info.bounds;
8287ec681f3Smrg   uint32_t size = info.size;
8297ec681f3Smrg   uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
8307ec681f3Smrg   RegClass rc = info.rc;
8317ec681f3Smrg
8327ec681f3Smrg   DefInfo new_info = info;
8337ec681f3Smrg   new_info.rc = RegClass(rc.type(), size);
8347ec681f3Smrg   for (unsigned new_stride = 16; new_stride > stride; new_stride /= 2) {
8357ec681f3Smrg      if (size % new_stride)
8367ec681f3Smrg         continue;
8377ec681f3Smrg      new_info.stride = new_stride;
8387ec681f3Smrg      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info);
8397ec681f3Smrg      if (res.second)
8407ec681f3Smrg         return res;
8417ec681f3Smrg   }
8427ec681f3Smrg
8437ec681f3Smrg   auto is_free = [&](PhysReg reg_index)
8447ec681f3Smrg   { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };
8457ec681f3Smrg
8467ec681f3Smrg   if (stride == 1) {
8477ec681f3Smrg      /* best fit algorithm: find the smallest gap to fit in the variable */
8487ec681f3Smrg      PhysRegInterval best_gap{PhysReg{0}, UINT_MAX};
8497ec681f3Smrg      const unsigned max_gpr =
8507ec681f3Smrg         (rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr;
8517ec681f3Smrg
8527ec681f3Smrg      PhysRegIterator reg_it = bounds.begin();
8537ec681f3Smrg      const PhysRegIterator end_it =
8547ec681f3Smrg         std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it));
8557ec681f3Smrg      while (reg_it != bounds.end()) {
8567ec681f3Smrg         /* Find the next chunk of available register slots */
8577ec681f3Smrg         reg_it = std::find_if(reg_it, end_it, is_free);
8587ec681f3Smrg         auto next_nonfree_it = std::find_if_not(reg_it, end_it, is_free);
8597ec681f3Smrg         if (reg_it == bounds.end()) {
8607ec681f3Smrg            break;
8617ec681f3Smrg         }
8627ec681f3Smrg
8637ec681f3Smrg         if (next_nonfree_it == end_it) {
8647ec681f3Smrg            /* All registers past max_used_gpr are free */
8657ec681f3Smrg            next_nonfree_it = bounds.end();
8667ec681f3Smrg         }
8677ec681f3Smrg
8687ec681f3Smrg         PhysRegInterval gap = PhysRegInterval::from_until(*reg_it, *next_nonfree_it);
8697ec681f3Smrg
8707ec681f3Smrg         /* early return on exact matches */
8717ec681f3Smrg         if (size == gap.size) {
8727ec681f3Smrg            adjust_max_used_regs(ctx, rc, gap.lo());
8737ec681f3Smrg            return {gap.lo(), true};
8747ec681f3Smrg         }
8757ec681f3Smrg
8767ec681f3Smrg         /* check if it fits and the gap size is smaller */
8777ec681f3Smrg         if (size < gap.size && gap.size < best_gap.size) {
8787ec681f3Smrg            best_gap = gap;
8797ec681f3Smrg         }
8807ec681f3Smrg
8817ec681f3Smrg         /* Move past the processed chunk */
8827ec681f3Smrg         reg_it = next_nonfree_it;
8837ec681f3Smrg      }
8847ec681f3Smrg
8857ec681f3Smrg      if (best_gap.size == UINT_MAX)
8867ec681f3Smrg         return {{}, false};
8877ec681f3Smrg
8887ec681f3Smrg      /* find best position within gap by leaving a good stride for other variables*/
8897ec681f3Smrg      unsigned buffer = best_gap.size - size;
8907ec681f3Smrg      if (buffer > 1) {
8917ec681f3Smrg         if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) ||
8927ec681f3Smrg             ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) ||
8937ec681f3Smrg             ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0))
8947ec681f3Smrg            best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer};
8957ec681f3Smrg      }
8967ec681f3Smrg
8977ec681f3Smrg      adjust_max_used_regs(ctx, rc, best_gap.lo());
8987ec681f3Smrg      return {best_gap.lo(), true};
8997ec681f3Smrg   }
9007ec681f3Smrg
9017ec681f3Smrg   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
9027ec681f3Smrg        reg_win += stride) {
9037ec681f3Smrg      if (reg_file[reg_win.lo()] != 0) {
9047ec681f3Smrg         continue;
9057ec681f3Smrg      }
9067ec681f3Smrg
9077ec681f3Smrg      bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free);
9087ec681f3Smrg      if (is_valid) {
9097ec681f3Smrg         adjust_max_used_regs(ctx, rc, reg_win.lo());
9107ec681f3Smrg         return {reg_win.lo(), true};
9117ec681f3Smrg      }
9127ec681f3Smrg   }
9137ec681f3Smrg
9147ec681f3Smrg   /* do this late because using the upper bytes of a register can require
9157ec681f3Smrg    * larger instruction encodings or copies
9167ec681f3Smrg    * TODO: don't do this in situations where it doesn't benefit */
9177ec681f3Smrg   if (rc.is_subdword()) {
9187ec681f3Smrg      for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) {
9197ec681f3Smrg         assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
9207ec681f3Smrg         if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
9217ec681f3Smrg            continue;
9227ec681f3Smrg
9237ec681f3Smrg         for (unsigned i = 0; i < 4; i += info.stride) {
9247ec681f3Smrg            /* check if there's a block of free bytes large enough to hold the register */
9257ec681f3Smrg            bool reg_found =
9267ec681f3Smrg               std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())],
9277ec681f3Smrg                           [](unsigned v) { return v == 0; });
9287ec681f3Smrg
9297ec681f3Smrg            /* check if also the neighboring reg is free if needed */
9307ec681f3Smrg            if (reg_found && i + rc.bytes() > 4)
9317ec681f3Smrg               reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);
9327ec681f3Smrg
9337ec681f3Smrg            if (reg_found) {
9347ec681f3Smrg               PhysReg res{entry.first};
9357ec681f3Smrg               res.reg_b += i;
9367ec681f3Smrg               adjust_max_used_regs(ctx, rc, entry.first);
9377ec681f3Smrg               return {res, true};
9387ec681f3Smrg            }
9397ec681f3Smrg         }
9407ec681f3Smrg      }
9417ec681f3Smrg   }
9427ec681f3Smrg
9437ec681f3Smrg   return {{}, false};
9447ec681f3Smrg}
9457ec681f3Smrg
9467ec681f3Smrg/* collect variables from a register area and clear reg_file */
9477ec681f3Smrgstd::set<std::pair<unsigned, unsigned>>
9487ec681f3Smrgfind_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
9497ec681f3Smrg{
9507ec681f3Smrg   std::set<std::pair<unsigned, unsigned>> vars;
9517ec681f3Smrg   for (PhysReg j : reg_interval) {
9527ec681f3Smrg      if (reg_file.is_blocked(j))
9537ec681f3Smrg         continue;
9547ec681f3Smrg      if (reg_file[j] == 0xF0000000) {
9557ec681f3Smrg         for (unsigned k = 0; k < 4; k++) {
9567ec681f3Smrg            unsigned id = reg_file.subdword_regs[j][k];
9577ec681f3Smrg            if (id) {
9587ec681f3Smrg               assignment& var = ctx.assignments[id];
9597ec681f3Smrg               vars.emplace(var.rc.bytes(), id);
9607ec681f3Smrg            }
9617ec681f3Smrg         }
9627ec681f3Smrg      } else if (reg_file[j] != 0) {
9637ec681f3Smrg         unsigned id = reg_file[j];
9647ec681f3Smrg         assignment& var = ctx.assignments[id];
9657ec681f3Smrg         vars.emplace(var.rc.bytes(), id);
9667ec681f3Smrg      }
9677ec681f3Smrg   }
9687ec681f3Smrg   return vars;
9697ec681f3Smrg}
9707ec681f3Smrg
9717ec681f3Smrg/* collect variables from a register area and clear reg_file */
9727ec681f3Smrgstd::set<std::pair<unsigned, unsigned>>
9737ec681f3Smrgcollect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
9747ec681f3Smrg{
9757ec681f3Smrg   std::set<std::pair<unsigned, unsigned>> vars = find_vars(ctx, reg_file, reg_interval);
9767ec681f3Smrg   for (std::pair<unsigned, unsigned> size_id : vars) {
9777ec681f3Smrg      assignment& var = ctx.assignments[size_id.second];
9787ec681f3Smrg      reg_file.clear(var.reg, var.rc);
9797ec681f3Smrg   }
9807ec681f3Smrg   return vars;
9817ec681f3Smrg}
9827ec681f3Smrg
/* Finds new registers for all variables in vars and appends one copy per
 * variable (operand fixed to the old register, definition without temp id)
 * to parallelcopies.
 *
 * vars holds (size in bytes, temp id) pairs. bounds is the register range
 * which may be used and def_reg the range reserved for the definition of
 * instr. Operands of instr which are killed before the definition may be
 * placed inside def_reg; all other variables have to be placed outside of it.
 *
 * For each variable, get_reg_simple() is tried first. If that fails, a
 * sliding-window search selects a window, the variables inside it are
 * collected from reg_file and this function recurses to relocate them.
 *
 * Returns false if no position could be found for some variable. */
9837ec681f3Smrgbool
9847ec681f3Smrgget_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
9857ec681f3Smrg                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
9867ec681f3Smrg                    const std::set<std::pair<unsigned, unsigned>>& vars,
9877ec681f3Smrg                    const PhysRegInterval bounds, aco_ptr<Instruction>& instr,
9887ec681f3Smrg                    const PhysRegInterval def_reg)
9897ec681f3Smrg{
9907ec681f3Smrg   /* variables are sorted from small sized to large */
9917ec681f3Smrg   /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders
9927ec681f3Smrg    * slightly though. */
   /* the set is walked in reverse, so the largest variables are placed first */
9937ec681f3Smrg   for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin();
9947ec681f3Smrg        it != vars.rend(); ++it) {
9957ec681f3Smrg      unsigned id = it->second;
9967ec681f3Smrg      assignment& var = ctx.assignments[id];
9977ec681f3Smrg      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
9987ec681f3Smrg      uint32_t size = info.size;
9997ec681f3Smrg
10007ec681f3Smrg      /* check if this is a dead operand, then we can re-use the space from the definition
10017ec681f3Smrg       * also use the correct stride for sub-dword operands */
10027ec681f3Smrg      bool is_dead_operand = false;
10037ec681f3Smrg      for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
10047ec681f3Smrg         if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
10057ec681f3Smrg            if (instr->operands[i].isKillBeforeDef())
10067ec681f3Smrg               is_dead_operand = true;
10077ec681f3Smrg            info = DefInfo(ctx, instr, var.rc, i);
10087ec681f3Smrg            break;
10097ec681f3Smrg         }
10107ec681f3Smrg      }
10117ec681f3Smrg
10127ec681f3Smrg      std::pair<PhysReg, bool> res;
10137ec681f3Smrg      if (is_dead_operand) {
10147ec681f3Smrg         if (instr->opcode == aco_opcode::p_create_vector) {
            /* try the operand's own position inside the vector definition:
             * reg is advanced by the byte size of each preceding operand */
10157ec681f3Smrg            PhysReg reg(def_reg.lo());
10167ec681f3Smrg            for (unsigned i = 0; i < instr->operands.size(); i++) {
10177ec681f3Smrg               if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
10187ec681f3Smrg                  res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) &&
10197ec681f3Smrg                                 !reg_file.test(reg, var.rc.bytes())};
10207ec681f3Smrg                  break;
10217ec681f3Smrg               }
10227ec681f3Smrg               reg.reg_b += instr->operands[i].bytes();
10237ec681f3Smrg            }
            /* otherwise, try to keep the variable in its current register */
10247ec681f3Smrg            if (!res.second)
10257ec681f3Smrg               res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
10267ec681f3Smrg         } else {
10277ec681f3Smrg            info.bounds = def_reg;
10287ec681f3Smrg            res = get_reg_simple(ctx, reg_file, info);
10297ec681f3Smrg         }
10307ec681f3Smrg      } else {
10317ec681f3Smrg         /* Try to find space within the bounds but outside of the definition */
10327ec681f3Smrg         info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
10337ec681f3Smrg         res = get_reg_simple(ctx, reg_file, info);
10347ec681f3Smrg         if (!res.second && def_reg.hi() <= bounds.hi()) {
            /* search above the definition, starting at the next register
             * which is a multiple of info.stride */
10357ec681f3Smrg            unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
10367ec681f3Smrg            info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
10377ec681f3Smrg            res = get_reg_simple(ctx, reg_file, info);
10387ec681f3Smrg         }
10397ec681f3Smrg      }
10407ec681f3Smrg
10417ec681f3Smrg      if (res.second) {
10427ec681f3Smrg         /* mark the area as blocked */
10437ec681f3Smrg         reg_file.block(res.first, var.rc);
10447ec681f3Smrg
10457ec681f3Smrg         /* create parallelcopy pair (without definition id) */
10467ec681f3Smrg         Temp tmp = Temp(id, var.rc);
10477ec681f3Smrg         Operand pc_op = Operand(tmp);
10487ec681f3Smrg         pc_op.setFixed(var.reg);
10497ec681f3Smrg         Definition pc_def = Definition(res.first, pc_op.regClass());
10507ec681f3Smrg         parallelcopies.emplace_back(pc_op, pc_def);
10517ec681f3Smrg         continue;
10527ec681f3Smrg      }
10537ec681f3Smrg
      /* no free space: look for a window whose current content can be moved
       * away. num_moves == 0xFF means that no window has been found yet */
10547ec681f3Smrg      PhysReg best_pos = bounds.lo();
10557ec681f3Smrg      unsigned num_moves = 0xFF;
10567ec681f3Smrg      unsigned num_vars = 0;
10577ec681f3Smrg
10587ec681f3Smrg      /* we use a sliding window to find potential positions */
10597ec681f3Smrg      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
10607ec681f3Smrg      for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
10617ec681f3Smrg           reg_win += stride) {
10627ec681f3Smrg         if (!is_dead_operand && intersects(reg_win, def_reg))
10637ec681f3Smrg            continue;
10647ec681f3Smrg
10657ec681f3Smrg         /* second, check that we have at most k=num_moves elements in the window
10667ec681f3Smrg          * and no element is larger than the currently processed one */
         /* k counts the registers to move, n the number of variables */
10677ec681f3Smrg         unsigned k = 0;
10687ec681f3Smrg         unsigned n = 0;
10697ec681f3Smrg         unsigned last_var = 0;
10707ec681f3Smrg         bool found = true;
10717ec681f3Smrg         for (PhysReg j : reg_win) {
10727ec681f3Smrg            if (reg_file[j] == 0 || reg_file[j] == last_var)
10737ec681f3Smrg               continue;
10747ec681f3Smrg
10757ec681f3Smrg            if (reg_file.is_blocked(j) || k > num_moves) {
10767ec681f3Smrg               found = false;
10777ec681f3Smrg               break;
10787ec681f3Smrg            }
            /* register tracked per byte (sub-dword marker): counted as one
             * moved register and one variable */
10797ec681f3Smrg            if (reg_file[j] == 0xF0000000) {
10807ec681f3Smrg               k += 1;
10817ec681f3Smrg               n++;
10827ec681f3Smrg               continue;
10837ec681f3Smrg            }
10847ec681f3Smrg            /* we cannot split live ranges of linear vgprs inside control flow */
10857ec681f3Smrg            if (!(ctx.block->kind & block_kind_top_level) &&
10867ec681f3Smrg                ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
10877ec681f3Smrg               found = false;
10887ec681f3Smrg               break;
10897ec681f3Smrg            }
            /* variables which are at least as large as the current one are
             * only accepted if they are killed operands of instr */
10907ec681f3Smrg            bool is_kill = false;
10917ec681f3Smrg            for (const Operand& op : instr->operands) {
10927ec681f3Smrg               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
10937ec681f3Smrg                  is_kill = true;
10947ec681f3Smrg                  break;
10957ec681f3Smrg               }
10967ec681f3Smrg            }
10977ec681f3Smrg            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
10987ec681f3Smrg               found = false;
10997ec681f3Smrg               break;
11007ec681f3Smrg            }
11017ec681f3Smrg
11027ec681f3Smrg            k += ctx.assignments[reg_file[j]].rc.size();
11037ec681f3Smrg            last_var = reg_file[j];
11047ec681f3Smrg            n++;
            /* give up on this window as soon as it cannot beat the best one */
11057ec681f3Smrg            if (k > num_moves || (k == num_moves && n <= num_vars)) {
11067ec681f3Smrg               found = false;
11077ec681f3Smrg               break;
11087ec681f3Smrg            }
11097ec681f3Smrg         }
11107ec681f3Smrg
11117ec681f3Smrg         if (found) {
11127ec681f3Smrg            best_pos = reg_win.lo();
11137ec681f3Smrg            num_moves = k;
11147ec681f3Smrg            num_vars = n;
11157ec681f3Smrg         }
11167ec681f3Smrg      }
11177ec681f3Smrg
11187ec681f3Smrg      /* FIXME: we messed up and couldn't find space for the variables to be copied */
11197ec681f3Smrg      if (num_moves == 0xFF)
11207ec681f3Smrg         return false;
11217ec681f3Smrg
11227ec681f3Smrg      PhysRegInterval reg_win{best_pos, size};
11237ec681f3Smrg
11247ec681f3Smrg      /* collect variables and block reg file */
11257ec681f3Smrg      std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, reg_win);
11267ec681f3Smrg
11277ec681f3Smrg      /* mark the area as blocked */
11287ec681f3Smrg      reg_file.block(reg_win.lo(), var.rc);
11297ec681f3Smrg      adjust_max_used_regs(ctx, var.rc, reg_win.lo());
11307ec681f3Smrg
      /* relocate the variables which were evicted from the chosen window */
11317ec681f3Smrg      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg))
11327ec681f3Smrg         return false;
11337ec681f3Smrg
11347ec681f3Smrg      /* create parallelcopy pair (without definition id) */
11357ec681f3Smrg      Temp tmp = Temp(id, var.rc);
11367ec681f3Smrg      Operand pc_op = Operand(tmp);
11377ec681f3Smrg      pc_op.setFixed(var.reg);
11387ec681f3Smrg      Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
11397ec681f3Smrg      parallelcopies.emplace_back(pc_op, pc_def);
11407ec681f3Smrg   }
11417ec681f3Smrg
11427ec681f3Smrg   return true;
11437ec681f3Smrg}
11447ec681f3Smrg
/* Finds a register window for a definition described by info, moving other
 * variables out of the way if needed.
 *
 * A sliding window over info.bounds (step info.stride) selects the position
 * with the lowest estimated number of moved registers. The variables inside
 * the chosen window are then relocated with get_regs_for_copies(), which
 * operates on a copy of reg_file. The resulting copies are appended to
 * parallelcopies only if the relocation succeeds.
 *
 * Returns {first register of the window, true} on success and {{}, false}
 * if no window qualified or the relocation failed. */
11457ec681f3Smrgstd::pair<PhysReg, bool>
11467ec681f3Smrgget_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
11477ec681f3Smrg             std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
11487ec681f3Smrg             aco_ptr<Instruction>& instr)
11497ec681f3Smrg{
11507ec681f3Smrg   const PhysRegInterval& bounds = info.bounds;
11517ec681f3Smrg   uint32_t size = info.size;
11527ec681f3Smrg   uint32_t stride = info.stride;
11537ec681f3Smrg   RegClass rc = info.rc;
11547ec681f3Smrg
11557ec681f3Smrg   /* check how many free regs we have */
11567ec681f3Smrg   unsigned regs_free = reg_file.count_zero(bounds);
11577ec681f3Smrg
11587ec681f3Smrg   /* mark and count killed operands */
   /* NOTE(review): only operands for which reg_file.test() returns false are
    * counted - presumably those whose registers are already free; confirm */
11597ec681f3Smrg   unsigned killed_ops = 0;
11607ec681f3Smrg   std::bitset<256> is_killed_operand; /* per-register */
11617ec681f3Smrg   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
11627ec681f3Smrg      Operand& op = instr->operands[j];
11637ec681f3Smrg      if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
11647ec681f3Smrg          !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
11657ec681f3Smrg         assert(op.isFixed());
11667ec681f3Smrg
         /* the bitset is indexed by the low 8 bits of the register number */
11677ec681f3Smrg         for (unsigned i = 0; i < op.size(); ++i) {
11687ec681f3Smrg            is_killed_operand[(op.physReg() & 0xff) + i] = true;
11697ec681f3Smrg         }
11707ec681f3Smrg
11717ec681f3Smrg         killed_ops += op.getTemp().size();
11727ec681f3Smrg      }
11737ec681f3Smrg   }
11747ec681f3Smrg
11757ec681f3Smrg   assert(regs_free >= size);
11767ec681f3Smrg   /* we might have to move dead operands to dst in order to make space */
11777ec681f3Smrg   unsigned op_moves = 0;
11787ec681f3Smrg
11797ec681f3Smrg   if (size > (regs_free - killed_ops))
11807ec681f3Smrg      op_moves = size - (regs_free - killed_ops);
11817ec681f3Smrg
11827ec681f3Smrg   /* find the best position to place the definition */
   /* num_moves == 0xFF means that no window has been accepted yet */
11837ec681f3Smrg   PhysRegInterval best_win = {bounds.lo(), size};
11847ec681f3Smrg   unsigned num_moves = 0xFF;
11857ec681f3Smrg   unsigned num_vars = 0;
11867ec681f3Smrg
11877ec681f3Smrg   /* we use a sliding window to check potential positions */
11887ec681f3Smrg   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
11897ec681f3Smrg        reg_win += stride) {
11907ec681f3Smrg      /* first check if the register window starts in the middle of an
11917ec681f3Smrg       * allocated variable: this is what we have to fix to allow for
11927ec681f3Smrg       * num_moves > size */
11937ec681f3Smrg      if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
11947ec681f3Smrg          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
11957ec681f3Smrg         continue;
      /* likewise, skip windows which end in the middle of a variable */
11967ec681f3Smrg      if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
11977ec681f3Smrg          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
11987ec681f3Smrg         continue;
11997ec681f3Smrg
12007ec681f3Smrg      /* second, check that we have at most k=num_moves elements in the window
12017ec681f3Smrg       * and no element is larger than the currently processed one */
      /* k estimates the registers to move (starting at op_moves), n counts
       * the variables in the window */
12027ec681f3Smrg      unsigned k = op_moves;
12037ec681f3Smrg      unsigned n = 0;
12047ec681f3Smrg      unsigned remaining_op_moves = op_moves;
12057ec681f3Smrg      unsigned last_var = 0;
12067ec681f3Smrg      bool found = true;
12077ec681f3Smrg      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
12087ec681f3Smrg      for (const PhysReg j : reg_win) {
12097ec681f3Smrg         /* dead operands effectively reduce the number of estimated moves */
12107ec681f3Smrg         if (is_killed_operand[j & 0xFF]) {
12117ec681f3Smrg            if (remaining_op_moves) {
12127ec681f3Smrg               k--;
12137ec681f3Smrg               remaining_op_moves--;
12147ec681f3Smrg            }
12157ec681f3Smrg            continue;
12167ec681f3Smrg         }
12177ec681f3Smrg
12187ec681f3Smrg         if (reg_file[j] == 0 || reg_file[j] == last_var)
12197ec681f3Smrg            continue;
12207ec681f3Smrg
         /* register tracked per byte (sub-dword marker): counted as one
          * moved register and one variable */
12217ec681f3Smrg         if (reg_file[j] == 0xF0000000) {
12227ec681f3Smrg            k += 1;
12237ec681f3Smrg            n++;
12247ec681f3Smrg            continue;
12257ec681f3Smrg         }
12267ec681f3Smrg
         /* never move a variable which is at least as large as the definition */
12277ec681f3Smrg         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
12287ec681f3Smrg            found = false;
12297ec681f3Smrg            break;
12307ec681f3Smrg         }
12317ec681f3Smrg
12327ec681f3Smrg         /* we cannot split live ranges of linear vgprs inside control flow */
12337ec681f3Smrg         // TODO: ensure that live range splits inside control flow are never necessary
12347ec681f3Smrg         if (!(ctx.block->kind & block_kind_top_level) &&
12357ec681f3Smrg             ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
12367ec681f3Smrg            found = false;
12377ec681f3Smrg            break;
12387ec681f3Smrg         }
12397ec681f3Smrg
12407ec681f3Smrg         k += ctx.assignments[reg_file[j]].rc.size();
12417ec681f3Smrg         n++;
12427ec681f3Smrg         last_var = reg_file[j];
12437ec681f3Smrg      }
12447ec681f3Smrg
      /* a window replaces the current best if it needs fewer moves; on equal
       * moves it needs more variables, or equally many and an aligned v4 */
12457ec681f3Smrg      if (!found || k > num_moves)
12467ec681f3Smrg         continue;
12477ec681f3Smrg      if (k == num_moves && n < num_vars)
12487ec681f3Smrg         continue;
12497ec681f3Smrg      if (!aligned && k == num_moves && n == num_vars)
12507ec681f3Smrg         continue;
12517ec681f3Smrg
12527ec681f3Smrg      if (found) {
12537ec681f3Smrg         best_win = reg_win;
12547ec681f3Smrg         num_moves = k;
12557ec681f3Smrg         num_vars = n;
12567ec681f3Smrg      }
12577ec681f3Smrg   }
12587ec681f3Smrg
12597ec681f3Smrg   if (num_moves == 0xFF)
12607ec681f3Smrg      return {{}, false};
12617ec681f3Smrg
12627ec681f3Smrg   /* now, we figured the placement for our definition */
   /* work on a copy of the register file: nothing below writes to reg_file */
12637ec681f3Smrg   RegisterFile tmp_file(reg_file);
12647ec681f3Smrg   std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win);
12657ec681f3Smrg
12667ec681f3Smrg   if (instr->opcode == aco_opcode::p_create_vector) {
12677ec681f3Smrg      /* move killed operands which aren't yet at the correct position (GFX9+)
12687ec681f3Smrg       * or which are in the definition space */
      /* reg tracks the position of the current operand inside the window */
12697ec681f3Smrg      PhysReg reg = best_win.lo();
12707ec681f3Smrg      for (Operand& op : instr->operands) {
12717ec681f3Smrg         if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) {
12727ec681f3Smrg            if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 ||
12737ec681f3Smrg                                        (op.physReg().advance(op.bytes()) > best_win.lo() &&
12747ec681f3Smrg                                         op.physReg() < best_win.hi()))) {
12757ec681f3Smrg               vars.emplace(op.bytes(), op.tempId());
12767ec681f3Smrg               tmp_file.clear(op);
12777ec681f3Smrg            } else {
12787ec681f3Smrg               tmp_file.fill(op);
12797ec681f3Smrg            }
12807ec681f3Smrg         }
12817ec681f3Smrg         reg.reg_b += op.bytes();
12827ec681f3Smrg      }
12837ec681f3Smrg   } else if (!is_phi(instr)) {
12847ec681f3Smrg      /* re-enable killed operands */
12857ec681f3Smrg      for (Operand& op : instr->operands) {
12867ec681f3Smrg         if (op.isTemp() && op.isFirstKillBeforeDef())
12877ec681f3Smrg            tmp_file.fill(op);
12887ec681f3Smrg      }
12897ec681f3Smrg   }
12907ec681f3Smrg
   /* collect the copies locally so that parallelcopies stays untouched on failure */
12917ec681f3Smrg   std::vector<std::pair<Operand, Definition>> pc;
12927ec681f3Smrg   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win))
12937ec681f3Smrg      return {{}, false};
12947ec681f3Smrg
12957ec681f3Smrg   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
12967ec681f3Smrg
12977ec681f3Smrg   adjust_max_used_regs(ctx, rc, best_win.lo());
12987ec681f3Smrg   return {best_win.lo(), true};
12997ec681f3Smrg}
13007ec681f3Smrg
13017ec681f3Smrgbool
13027ec681f3Smrgget_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,
13037ec681f3Smrg                  PhysReg reg)
13047ec681f3Smrg{
13057ec681f3Smrg   /* catch out-of-range registers */
13067ec681f3Smrg   if (reg >= PhysReg{512})
13077ec681f3Smrg      return false;
13087ec681f3Smrg
13097ec681f3Smrg   std::pair<unsigned, unsigned> sdw_def_info;
13107ec681f3Smrg   if (rc.is_subdword())
13117ec681f3Smrg      sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);
13127ec681f3Smrg
13137ec681f3Smrg   if (rc.is_subdword() && reg.byte() % sdw_def_info.first)
13147ec681f3Smrg      return false;
13157ec681f3Smrg   if (!rc.is_subdword() && reg.byte())
13167ec681f3Smrg      return false;
13177ec681f3Smrg
13187ec681f3Smrg   if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0)
13197ec681f3Smrg      return false;
13207ec681f3Smrg
13217ec681f3Smrg   PhysRegInterval reg_win = {reg, rc.size()};
13227ec681f3Smrg   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
13237ec681f3Smrg   PhysRegInterval vcc_win = {vcc, 2};
13247ec681f3Smrg   /* VCC is outside the bounds */
13257ec681f3Smrg   bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win);
13267ec681f3Smrg   bool is_m0 = rc == s1 && reg == m0;
13277ec681f3Smrg   if (!bounds.contains(reg_win) && !is_vcc && !is_m0)
13287ec681f3Smrg      return false;
13297ec681f3Smrg
13307ec681f3Smrg   if (rc.is_subdword()) {
13317ec681f3Smrg      PhysReg test_reg;
13327ec681f3Smrg      test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);
13337ec681f3Smrg      if (reg_file.test(test_reg, sdw_def_info.second))
13347ec681f3Smrg         return false;
13357ec681f3Smrg   } else {
13367ec681f3Smrg      if (reg_file.test(reg, rc.bytes()))
13377ec681f3Smrg         return false;
13387ec681f3Smrg   }
13397ec681f3Smrg
13407ec681f3Smrg   adjust_max_used_regs(ctx, rc, reg_win.lo());
13417ec681f3Smrg   return true;
13427ec681f3Smrg}
13437ec681f3Smrg
13447ec681f3Smrgbool
13457ec681f3Smrgincrease_register_file(ra_ctx& ctx, RegType type)
13467ec681f3Smrg{
13477ec681f3Smrg   if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
13487ec681f3Smrg      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
13497ec681f3Smrg                                                          ctx.program->max_reg_demand.sgpr));
13507ec681f3Smrg   } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
13517ec681f3Smrg      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
13527ec681f3Smrg                                                          ctx.program->max_reg_demand.sgpr + 1));
13537ec681f3Smrg   } else {
13547ec681f3Smrg      return false;
13557ec681f3Smrg   }
13567ec681f3Smrg   return true;
13577ec681f3Smrg}
13587ec681f3Smrg
13597ec681f3Smrgstruct IDAndRegClass {
13607ec681f3Smrg   IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}
13617ec681f3Smrg
13627ec681f3Smrg   unsigned id;
13637ec681f3Smrg   RegClass rc;
13647ec681f3Smrg};
13657ec681f3Smrg
13667ec681f3Smrgstruct IDAndInfo {
13677ec681f3Smrg   IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}
13687ec681f3Smrg
13697ec681f3Smrg   unsigned id;
13707ec681f3Smrg   DefInfo info;
13717ec681f3Smrg};
13727ec681f3Smrg
13737ec681f3Smrg/* Reallocates vars by sorting them and placing each variable after the previous
13747ec681f3Smrg * one. If one of the variables has 0xffffffff as an ID, the register assigned
13757ec681f3Smrg * for that variable will be returned.
13767ec681f3Smrg */
13777ec681f3SmrgPhysReg
13787ec681f3Smrgcompact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
13797ec681f3Smrg                      std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
13807ec681f3Smrg{
13817ec681f3Smrg   /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
13827ec681f3Smrg    * temporary sizes to dwords.
13837ec681f3Smrg    */
13847ec681f3Smrg   std::vector<IDAndInfo> sorted;
13857ec681f3Smrg   for (IDAndRegClass var : vars) {
13867ec681f3Smrg      DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
13877ec681f3Smrg      sorted.emplace_back(var.id, info);
13887ec681f3Smrg   }
13897ec681f3Smrg
13907ec681f3Smrg   std::sort(
13917ec681f3Smrg      sorted.begin(), sorted.end(),
13927ec681f3Smrg      [&ctx](const IDAndInfo& a, const IDAndInfo& b)
13937ec681f3Smrg      {
13947ec681f3Smrg         unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
13957ec681f3Smrg         unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
13967ec681f3Smrg         if (a_stride > b_stride)
13977ec681f3Smrg            return true;
13987ec681f3Smrg         if (a_stride < b_stride)
13997ec681f3Smrg            return false;
14007ec681f3Smrg         if (a.id == 0xffffffff || b.id == 0xffffffff)
14017ec681f3Smrg            return a.id ==
14027ec681f3Smrg                   0xffffffff; /* place 0xffffffff before others if possible, not for any reason */
14037ec681f3Smrg         return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
14047ec681f3Smrg      });
14057ec681f3Smrg
14067ec681f3Smrg   PhysReg next_reg = start;
14077ec681f3Smrg   PhysReg space_reg;
14087ec681f3Smrg   for (IDAndInfo& var : sorted) {
14097ec681f3Smrg      unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
14107ec681f3Smrg      next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));
14117ec681f3Smrg
14127ec681f3Smrg      /* 0xffffffff is a special variable ID used reserve a space for killed
14137ec681f3Smrg       * operands and definitions.
14147ec681f3Smrg       */
14157ec681f3Smrg      if (var.id != 0xffffffff) {
14167ec681f3Smrg         if (next_reg != ctx.assignments[var.id].reg) {
14177ec681f3Smrg            RegClass rc = ctx.assignments[var.id].rc;
14187ec681f3Smrg            Temp tmp(var.id, rc);
14197ec681f3Smrg
14207ec681f3Smrg            Operand pc_op(tmp);
14217ec681f3Smrg            pc_op.setFixed(ctx.assignments[var.id].reg);
14227ec681f3Smrg            Definition pc_def(next_reg, rc);
14237ec681f3Smrg            parallelcopies.emplace_back(pc_op, pc_def);
14247ec681f3Smrg         }
14257ec681f3Smrg      } else {
14267ec681f3Smrg         space_reg = next_reg;
14277ec681f3Smrg      }
14287ec681f3Smrg
14297ec681f3Smrg      adjust_max_used_regs(ctx, var.info.rc, next_reg);
14307ec681f3Smrg
14317ec681f3Smrg      next_reg = next_reg.advance(var.info.rc.size() * 4);
14327ec681f3Smrg   }
14337ec681f3Smrg
14347ec681f3Smrg   return space_reg;
14357ec681f3Smrg}
14367ec681f3Smrg
14377ec681f3Smrgbool
14387ec681f3Smrgis_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
14397ec681f3Smrg{
14407ec681f3Smrg   PhysReg first{512};
14417ec681f3Smrg   for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
14427ec681f3Smrg      Operand op = instr->operands[i + 3];
14437ec681f3Smrg
14447ec681f3Smrg      if (ctx.assignments[op.tempId()].assigned) {
14457ec681f3Smrg         PhysReg reg = ctx.assignments[op.tempId()].reg;
14467ec681f3Smrg
14477ec681f3Smrg         if (first.reg() == 512) {
14487ec681f3Smrg            PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr);
14497ec681f3Smrg            first = reg.advance(i * -4);
14507ec681f3Smrg            PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
14517ec681f3Smrg            if (!bounds.contains(vec)) /* not enough space for other operands */
14527ec681f3Smrg               return false;
14537ec681f3Smrg         } else {
14547ec681f3Smrg            if (reg != first.advance(i * 4)) /* not at the best position */
14557ec681f3Smrg               return false;
14567ec681f3Smrg         }
14577ec681f3Smrg      } else {
14587ec681f3Smrg         /* If there's an unexpected temporary, this operand is unlikely to be
14597ec681f3Smrg          * placed in the best position.
14607ec681f3Smrg          */
14617ec681f3Smrg         if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4))
14627ec681f3Smrg            return false;
14637ec681f3Smrg      }
14647ec681f3Smrg   }
14657ec681f3Smrg
14667ec681f3Smrg   return true;
14677ec681f3Smrg}
14687ec681f3Smrg
14697ec681f3Smrgstd::pair<PhysReg, bool>
14707ec681f3Smrgget_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
14717ec681f3Smrg{
14727ec681f3Smrg   Instruction* vec = ctx.vectors[temp.id()];
14737ec681f3Smrg   unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
14747ec681f3Smrg   unsigned our_offset = 0;
14757ec681f3Smrg   for (unsigned i = first_operand; i < vec->operands.size(); i++) {
14767ec681f3Smrg      Operand& op = vec->operands[i];
14777ec681f3Smrg      if (op.isTemp() && op.tempId() == temp.id())
14787ec681f3Smrg         break;
14797ec681f3Smrg      else
14807ec681f3Smrg         our_offset += op.bytes();
14817ec681f3Smrg   }
14827ec681f3Smrg
14837ec681f3Smrg   if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {
14847ec681f3Smrg      unsigned their_offset = 0;
14857ec681f3Smrg      /* check for every operand of the vector
14867ec681f3Smrg       * - whether the operand is assigned and
14877ec681f3Smrg       * - we can use the register relative to that operand
14887ec681f3Smrg       */
14897ec681f3Smrg      for (unsigned i = first_operand; i < vec->operands.size(); i++) {
14907ec681f3Smrg         Operand& op = vec->operands[i];
14917ec681f3Smrg         if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
14927ec681f3Smrg             ctx.assignments[op.tempId()].assigned) {
14937ec681f3Smrg            PhysReg reg = ctx.assignments[op.tempId()].reg;
14947ec681f3Smrg            reg.reg_b += (our_offset - their_offset);
14957ec681f3Smrg            if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
14967ec681f3Smrg               return {reg, true};
14977ec681f3Smrg
14987ec681f3Smrg            /* return if MIMG vaddr components don't remain vector-aligned */
14997ec681f3Smrg            if (vec->format == Format::MIMG)
15007ec681f3Smrg               return {{}, false};
15017ec681f3Smrg         }
15027ec681f3Smrg         their_offset += op.bytes();
15037ec681f3Smrg      }
15047ec681f3Smrg
15057ec681f3Smrg      /* We didn't find a register relative to other vector operands.
15067ec681f3Smrg       * Try to find new space which fits the whole vector.
15077ec681f3Smrg       */
15087ec681f3Smrg      RegClass vec_rc = RegClass::get(temp.type(), their_offset);
15097ec681f3Smrg      DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
15107ec681f3Smrg      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
15117ec681f3Smrg      PhysReg reg = res.first;
15127ec681f3Smrg      if (res.second) {
15137ec681f3Smrg         reg.reg_b += our_offset;
15147ec681f3Smrg         /* make sure to only use byte offset if the instruction supports it */
15157ec681f3Smrg         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
15167ec681f3Smrg            return {reg, true};
15177ec681f3Smrg      }
15187ec681f3Smrg   }
15197ec681f3Smrg   return {{}, false};
15207ec681f3Smrg}
15217ec681f3Smrg
15227ec681f3SmrgPhysReg
15237ec681f3Smrgget_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
15247ec681f3Smrg        std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
15257ec681f3Smrg        int operand_index = -1)
15267ec681f3Smrg{
15277ec681f3Smrg   auto split_vec = ctx.split_vectors.find(temp.id());
15287ec681f3Smrg   if (split_vec != ctx.split_vectors.end()) {
15297ec681f3Smrg      unsigned offset = 0;
15307ec681f3Smrg      for (Definition def : split_vec->second->definitions) {
15317ec681f3Smrg         if (ctx.assignments[def.tempId()].affinity) {
15327ec681f3Smrg            assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
15337ec681f3Smrg            if (affinity.assigned) {
15347ec681f3Smrg               PhysReg reg = affinity.reg;
15357ec681f3Smrg               reg.reg_b -= offset;
15367ec681f3Smrg               if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
15377ec681f3Smrg                  return reg;
15387ec681f3Smrg            }
15397ec681f3Smrg         }
15407ec681f3Smrg         offset += def.bytes();
15417ec681f3Smrg      }
15427ec681f3Smrg   }
15437ec681f3Smrg
15447ec681f3Smrg   if (ctx.assignments[temp.id()].affinity) {
15457ec681f3Smrg      assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
15467ec681f3Smrg      if (affinity.assigned) {
15477ec681f3Smrg         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg))
15487ec681f3Smrg            return affinity.reg;
15497ec681f3Smrg      }
15507ec681f3Smrg   }
15517ec681f3Smrg
15527ec681f3Smrg   std::pair<PhysReg, bool> res;
15537ec681f3Smrg
15547ec681f3Smrg   if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
15557ec681f3Smrg      res = get_reg_vector(ctx, reg_file, temp, instr);
15567ec681f3Smrg      if (res.second)
15577ec681f3Smrg         return res.first;
15587ec681f3Smrg   }
15597ec681f3Smrg
15607ec681f3Smrg   DefInfo info(ctx, instr, temp.regClass(), operand_index);
15617ec681f3Smrg
15627ec681f3Smrg   if (!ctx.policy.skip_optimistic_path) {
15637ec681f3Smrg      /* try to find space without live-range splits */
15647ec681f3Smrg      res = get_reg_simple(ctx, reg_file, info);
15657ec681f3Smrg
15667ec681f3Smrg      if (res.second)
15677ec681f3Smrg         return res.first;
15687ec681f3Smrg   }
15697ec681f3Smrg
15707ec681f3Smrg   /* try to find space with live-range splits */
15717ec681f3Smrg   res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
15727ec681f3Smrg
15737ec681f3Smrg   if (res.second)
15747ec681f3Smrg      return res.first;
15757ec681f3Smrg
15767ec681f3Smrg   /* try using more registers */
15777ec681f3Smrg
15787ec681f3Smrg   /* We should only fail here because keeping under the limit would require
15797ec681f3Smrg    * too many moves. */
15807ec681f3Smrg   assert(reg_file.count_zero(info.bounds) >= info.size);
15817ec681f3Smrg
15827ec681f3Smrg   if (!increase_register_file(ctx, info.rc.type())) {
15837ec681f3Smrg      /* fallback algorithm: reallocate all variables at once */
15847ec681f3Smrg      unsigned def_size = info.rc.size();
15857ec681f3Smrg      for (Definition def : instr->definitions) {
15867ec681f3Smrg         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
15877ec681f3Smrg            def_size += def.regClass().size();
15887ec681f3Smrg      }
15897ec681f3Smrg
15907ec681f3Smrg      unsigned killed_op_size = 0;
15917ec681f3Smrg      for (Operand op : instr->operands) {
15927ec681f3Smrg         if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
15937ec681f3Smrg            killed_op_size += op.regClass().size();
15947ec681f3Smrg      }
15957ec681f3Smrg
15967ec681f3Smrg      const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());
15977ec681f3Smrg
15987ec681f3Smrg      /* reallocate passthrough variables and non-killed operands */
15997ec681f3Smrg      std::vector<IDAndRegClass> vars;
16007ec681f3Smrg      for (const std::pair<unsigned, unsigned>& var : find_vars(ctx, reg_file, regs))
16017ec681f3Smrg         vars.emplace_back(var.second, ctx.assignments[var.second].rc);
16027ec681f3Smrg      vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));
16037ec681f3Smrg
16047ec681f3Smrg      PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());
16057ec681f3Smrg
16067ec681f3Smrg      /* reallocate killed operands */
16077ec681f3Smrg      std::vector<IDAndRegClass> killed_op_vars;
16087ec681f3Smrg      for (Operand op : instr->operands) {
16097ec681f3Smrg         if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
16107ec681f3Smrg            killed_op_vars.emplace_back(op.tempId(), op.regClass());
16117ec681f3Smrg      }
16127ec681f3Smrg      compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);
16137ec681f3Smrg
16147ec681f3Smrg      /* reallocate definitions */
16157ec681f3Smrg      std::vector<IDAndRegClass> def_vars;
16167ec681f3Smrg      for (Definition def : instr->definitions) {
16177ec681f3Smrg         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
16187ec681f3Smrg            def_vars.emplace_back(def.tempId(), def.regClass());
16197ec681f3Smrg      }
16207ec681f3Smrg      def_vars.emplace_back(0xffffffff, info.rc);
16217ec681f3Smrg      return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
16227ec681f3Smrg   }
16237ec681f3Smrg
16247ec681f3Smrg   return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
16257ec681f3Smrg}
16267ec681f3Smrg
16277ec681f3SmrgPhysReg
16287ec681f3Smrgget_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
16297ec681f3Smrg                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
16307ec681f3Smrg                      aco_ptr<Instruction>& instr)
16317ec681f3Smrg{
16327ec681f3Smrg   RegClass rc = temp.regClass();
16337ec681f3Smrg   /* create_vector instructions have different costs w.r.t. register coalescing */
16347ec681f3Smrg   uint32_t size = rc.size();
16357ec681f3Smrg   uint32_t bytes = rc.bytes();
16367ec681f3Smrg   uint32_t stride = get_stride(rc);
16377ec681f3Smrg   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
16387ec681f3Smrg
16397ec681f3Smrg   // TODO: improve p_create_vector for sub-dword vectors
16407ec681f3Smrg
16417ec681f3Smrg   PhysReg best_pos{0xFFF};
16427ec681f3Smrg   unsigned num_moves = 0xFF;
16437ec681f3Smrg   bool best_avoid = true;
16447ec681f3Smrg
16457ec681f3Smrg   /* test for each operand which definition placement causes the least shuffle instructions */
16467ec681f3Smrg   for (unsigned i = 0, offset = 0; i < instr->operands.size();
16477ec681f3Smrg        offset += instr->operands[i].bytes(), i++) {
16487ec681f3Smrg      // TODO: think about, if we can alias live operands on the same register
16497ec681f3Smrg      if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
16507ec681f3Smrg          instr->operands[i].getTemp().type() != rc.type())
16517ec681f3Smrg         continue;
16527ec681f3Smrg
16537ec681f3Smrg      if (offset > instr->operands[i].physReg().reg_b)
16547ec681f3Smrg         continue;
16557ec681f3Smrg
16567ec681f3Smrg      unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
16577ec681f3Smrg      if (reg_lower % 4)
16587ec681f3Smrg         continue;
16597ec681f3Smrg      PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
16607ec681f3Smrg      unsigned k = 0;
16617ec681f3Smrg
16627ec681f3Smrg      /* no need to check multiple times */
16637ec681f3Smrg      if (reg_win.lo() == best_pos)
16647ec681f3Smrg         continue;
16657ec681f3Smrg
16667ec681f3Smrg      /* check borders */
16677ec681f3Smrg      // TODO: this can be improved */
16687ec681f3Smrg      if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
16697ec681f3Smrg         continue;
16707ec681f3Smrg      if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
16717ec681f3Smrg          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
16727ec681f3Smrg         continue;
16737ec681f3Smrg      if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
16747ec681f3Smrg          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
16757ec681f3Smrg         continue;
16767ec681f3Smrg
16777ec681f3Smrg      /* count variables to be moved and check "avoid" */
16787ec681f3Smrg      bool avoid = false;
16797ec681f3Smrg      bool linear_vgpr = false;
16807ec681f3Smrg      for (PhysReg j : reg_win) {
16817ec681f3Smrg         if (reg_file[j] != 0) {
16827ec681f3Smrg            if (reg_file[j] == 0xF0000000) {
16837ec681f3Smrg               PhysReg reg;
16847ec681f3Smrg               reg.reg_b = j * 4;
16857ec681f3Smrg               unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
16867ec681f3Smrg               for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
16877ec681f3Smrg                  k += reg_file.test(reg, 1);
16887ec681f3Smrg            } else {
16897ec681f3Smrg               k += 4;
16907ec681f3Smrg               linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
16917ec681f3Smrg            }
16927ec681f3Smrg         }
16937ec681f3Smrg         avoid |= ctx.war_hint[j];
16947ec681f3Smrg      }
16957ec681f3Smrg
16967ec681f3Smrg      if (linear_vgpr) {
16977ec681f3Smrg         /* we cannot split live ranges of linear vgprs inside control flow */
16987ec681f3Smrg         if (ctx.block->kind & block_kind_top_level)
16997ec681f3Smrg            avoid = true;
17007ec681f3Smrg         else
17017ec681f3Smrg            continue;
17027ec681f3Smrg      }
17037ec681f3Smrg
17047ec681f3Smrg      if (avoid && !best_avoid)
17057ec681f3Smrg         continue;
17067ec681f3Smrg
17077ec681f3Smrg      /* count operands in wrong positions */
17087ec681f3Smrg      for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
17097ec681f3Smrg           offset2 += instr->operands[j].bytes(), j++) {
17107ec681f3Smrg         if (j == i || !instr->operands[j].isTemp() ||
17117ec681f3Smrg             instr->operands[j].getTemp().type() != rc.type())
17127ec681f3Smrg            continue;
17137ec681f3Smrg         if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2)
17147ec681f3Smrg            k += instr->operands[j].bytes();
17157ec681f3Smrg      }
17167ec681f3Smrg      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
17177ec681f3Smrg      if (k > num_moves || (!aligned && k == num_moves))
17187ec681f3Smrg         continue;
17197ec681f3Smrg
17207ec681f3Smrg      best_pos = reg_win.lo();
17217ec681f3Smrg      num_moves = k;
17227ec681f3Smrg      best_avoid = avoid;
17237ec681f3Smrg   }
17247ec681f3Smrg
17257ec681f3Smrg   if (num_moves >= bytes)
17267ec681f3Smrg      return get_reg(ctx, reg_file, temp, parallelcopies, instr);
17277ec681f3Smrg
17287ec681f3Smrg   /* re-enable killed operands which are in the wrong position */
17297ec681f3Smrg   RegisterFile tmp_file(reg_file);
17307ec681f3Smrg   for (unsigned i = 0, offset = 0; i < instr->operands.size();
17317ec681f3Smrg        offset += instr->operands[i].bytes(), i++) {
17327ec681f3Smrg      if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
17337ec681f3Smrg          instr->operands[i].physReg().reg_b != best_pos.reg_b + offset)
17347ec681f3Smrg         tmp_file.fill(instr->operands[i]);
17357ec681f3Smrg   }
17367ec681f3Smrg
17377ec681f3Smrg   /* collect variables to be moved */
17387ec681f3Smrg   std::set<std::pair<unsigned, unsigned>> vars =
17397ec681f3Smrg      collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
17407ec681f3Smrg
17417ec681f3Smrg   for (unsigned i = 0, offset = 0; i < instr->operands.size();
17427ec681f3Smrg        offset += instr->operands[i].bytes(), i++) {
17437ec681f3Smrg      if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
17447ec681f3Smrg          instr->operands[i].getTemp().type() != rc.type())
17457ec681f3Smrg         continue;
17467ec681f3Smrg      bool correct_pos = instr->operands[i].physReg().reg_b == best_pos.reg_b + offset;
17477ec681f3Smrg      /* GFX9+: move killed operands which aren't yet at the correct position
17487ec681f3Smrg       * Moving all killed operands generally leads to more register swaps.
17497ec681f3Smrg       * This is only done on GFX9+ because of the cheap v_swap instruction.
17507ec681f3Smrg       */
17517ec681f3Smrg      if (ctx.program->chip_class >= GFX9 && !correct_pos) {
17527ec681f3Smrg         vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
17537ec681f3Smrg         tmp_file.clear(instr->operands[i]);
17547ec681f3Smrg         /* fill operands which are in the correct position to avoid overwriting */
17557ec681f3Smrg      } else if (correct_pos) {
17567ec681f3Smrg         tmp_file.fill(instr->operands[i]);
17577ec681f3Smrg      }
17587ec681f3Smrg   }
17597ec681f3Smrg   bool success = false;
17607ec681f3Smrg   std::vector<std::pair<Operand, Definition>> pc;
17617ec681f3Smrg   success =
17627ec681f3Smrg      get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size});
17637ec681f3Smrg
17647ec681f3Smrg   if (!success) {
17657ec681f3Smrg      if (!increase_register_file(ctx, temp.type())) {
17667ec681f3Smrg         /* use the fallback algorithm in get_reg() */
17677ec681f3Smrg         return get_reg(ctx, reg_file, temp, parallelcopies, instr);
17687ec681f3Smrg      }
17697ec681f3Smrg      return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
17707ec681f3Smrg   }
17717ec681f3Smrg
17727ec681f3Smrg   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
17737ec681f3Smrg   adjust_max_used_regs(ctx, rc, best_pos);
17747ec681f3Smrg
17757ec681f3Smrg   return best_pos;
17767ec681f3Smrg}
17777ec681f3Smrg
17787ec681f3Smrgvoid
17797ec681f3Smrghandle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
17807ec681f3Smrg{
17817ec681f3Smrg   if (instr->format != Format::PSEUDO)
17827ec681f3Smrg      return;
17837ec681f3Smrg
17847ec681f3Smrg   /* all instructions which use handle_operands() need this information */
17857ec681f3Smrg   switch (instr->opcode) {
17867ec681f3Smrg   case aco_opcode::p_extract_vector:
17877ec681f3Smrg   case aco_opcode::p_create_vector:
17887ec681f3Smrg   case aco_opcode::p_split_vector:
17897ec681f3Smrg   case aco_opcode::p_parallelcopy:
17907ec681f3Smrg   case aco_opcode::p_wqm: break;
17917ec681f3Smrg   default: return;
17927ec681f3Smrg   }
17937ec681f3Smrg
17947ec681f3Smrg   bool writes_linear = false;
17957ec681f3Smrg   /* if all definitions are logical vgpr, no need to care for SCC */
17967ec681f3Smrg   for (Definition& def : instr->definitions) {
17977ec681f3Smrg      if (def.getTemp().regClass().is_linear())
17987ec681f3Smrg         writes_linear = true;
17997ec681f3Smrg   }
18007ec681f3Smrg   /* if all operands are constant, no need to care either */
18017ec681f3Smrg   bool reads_linear = false;
18027ec681f3Smrg   bool reads_subdword = false;
18037ec681f3Smrg   for (Operand& op : instr->operands) {
18047ec681f3Smrg      if (op.isTemp() && op.getTemp().regClass().is_linear())
18057ec681f3Smrg         reads_linear = true;
18067ec681f3Smrg      if (op.isTemp() && op.regClass().is_subdword())
18077ec681f3Smrg         reads_subdword = true;
18087ec681f3Smrg   }
18097ec681f3Smrg   bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
18107ec681f3Smrg                            (ctx.program->chip_class <= GFX7 && reads_subdword);
18117ec681f3Smrg   if (!needs_scratch_reg)
18127ec681f3Smrg      return;
18137ec681f3Smrg
18147ec681f3Smrg   instr->pseudo().tmp_in_scc = reg_file[scc];
18157ec681f3Smrg
18167ec681f3Smrg   int reg = ctx.max_used_sgpr;
18177ec681f3Smrg   for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
18187ec681f3Smrg      ;
18197ec681f3Smrg   if (reg < 0) {
18207ec681f3Smrg      reg = ctx.max_used_sgpr + 1;
18217ec681f3Smrg      for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
18227ec681f3Smrg         ;
18237ec681f3Smrg      if (reg == ctx.program->max_reg_demand.sgpr) {
18247ec681f3Smrg         assert(reads_subdword && reg_file[m0] == 0);
18257ec681f3Smrg         reg = m0;
18267ec681f3Smrg      }
18277ec681f3Smrg   }
18287ec681f3Smrg
18297ec681f3Smrg   adjust_max_used_regs(ctx, s1, reg);
18307ec681f3Smrg   instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
18317ec681f3Smrg}
18327ec681f3Smrg
18337ec681f3Smrgbool
18347ec681f3Smrgoperand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
18357ec681f3Smrg                    RegClass rc)
18367ec681f3Smrg{
18377ec681f3Smrg   if (instr->operands[idx].isFixed())
18387ec681f3Smrg      return instr->operands[idx].physReg() == reg;
18397ec681f3Smrg
18407ec681f3Smrg   bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
18417ec681f3Smrg                       instr->opcode == aco_opcode::v_writelane_b32_e64;
18427ec681f3Smrg   if (chip <= GFX9 && is_writelane && idx <= 1) {
18437ec681f3Smrg      /* v_writelane_b32 can take two sgprs but only if one is m0. */
18447ec681f3Smrg      bool is_other_sgpr =
18457ec681f3Smrg         instr->operands[!idx].isTemp() &&
18467ec681f3Smrg         (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0);
18477ec681f3Smrg      if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) {
18487ec681f3Smrg         instr->operands[idx].setFixed(m0);
18497ec681f3Smrg         return reg == m0;
18507ec681f3Smrg      }
18517ec681f3Smrg   }
18527ec681f3Smrg
18537ec681f3Smrg   if (reg.byte()) {
18547ec681f3Smrg      unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc);
18557ec681f3Smrg      if (reg.byte() % stride)
18567ec681f3Smrg         return false;
18577ec681f3Smrg   }
18587ec681f3Smrg
18597ec681f3Smrg   switch (instr->format) {
18607ec681f3Smrg   case Format::SMEM:
18617ec681f3Smrg      return reg != scc && reg != exec &&
18627ec681f3Smrg             (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */
18637ec681f3Smrg             (reg != vcc || (instr->definitions.empty() && idx == 2) ||
18647ec681f3Smrg              chip >= GFX10); /* sdata can be vcc */
18657ec681f3Smrg   default:
18667ec681f3Smrg      // TODO: there are more instructions with restrictions on registers
18677ec681f3Smrg      return true;
18687ec681f3Smrg   }
18697ec681f3Smrg}
18707ec681f3Smrg
18717ec681f3Smrgvoid
18727ec681f3Smrgget_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
18737ec681f3Smrg                    std::vector<std::pair<Operand, Definition>>& parallelcopy,
18747ec681f3Smrg                    aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
18757ec681f3Smrg{
18767ec681f3Smrg   /* check if the operand is fixed */
18777ec681f3Smrg   PhysReg src = ctx.assignments[operand.tempId()].reg;
18787ec681f3Smrg   PhysReg dst;
18797ec681f3Smrg   if (operand.isFixed()) {
18807ec681f3Smrg      assert(operand.physReg() != src);
18817ec681f3Smrg
18827ec681f3Smrg      /* check if target reg is blocked, and move away the blocking var */
18837ec681f3Smrg      if (register_file.test(operand.physReg(), operand.bytes())) {
18847ec681f3Smrg         PhysRegInterval target{operand.physReg(), operand.size()};
18857ec681f3Smrg
18867ec681f3Smrg         RegisterFile tmp_file(register_file);
18877ec681f3Smrg
18887ec681f3Smrg         std::set<std::pair<unsigned, unsigned>> blocking_vars =
18897ec681f3Smrg            collect_vars(ctx, tmp_file, target);
18907ec681f3Smrg
18917ec681f3Smrg         tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src
18927ec681f3Smrg         tmp_file.block(operand.physReg(), operand.regClass());
18937ec681f3Smrg
18947ec681f3Smrg         DefInfo info(ctx, instr, operand.regClass(), -1);
18957ec681f3Smrg         get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr,
18967ec681f3Smrg                             PhysRegInterval());
18977ec681f3Smrg      }
18987ec681f3Smrg      dst = operand.physReg();
18997ec681f3Smrg
19007ec681f3Smrg   } else {
19017ec681f3Smrg      /* clear the operand in case it's only a stride mismatch */
19027ec681f3Smrg      register_file.clear(src, operand.regClass());
19037ec681f3Smrg      dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
19047ec681f3Smrg   }
19057ec681f3Smrg
19067ec681f3Smrg   Operand pc_op = operand;
19077ec681f3Smrg   pc_op.setFixed(src);
19087ec681f3Smrg   Definition pc_def = Definition(dst, pc_op.regClass());
19097ec681f3Smrg   parallelcopy.emplace_back(pc_op, pc_def);
19107ec681f3Smrg   update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops);
19117ec681f3Smrg}
19127ec681f3Smrg
19137ec681f3Smrgvoid
19147ec681f3Smrgget_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file,
19157ec681f3Smrg                  std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in)
19167ec681f3Smrg{
19177ec681f3Smrg   /* assign phis with all-matching registers to that register */
19187ec681f3Smrg   for (aco_ptr<Instruction>& phi : block.instructions) {
19197ec681f3Smrg      if (!is_phi(phi))
19207ec681f3Smrg         break;
19217ec681f3Smrg      Definition& definition = phi->definitions[0];
19227ec681f3Smrg      if (definition.isKill() || definition.isFixed())
19237ec681f3Smrg         continue;
19247ec681f3Smrg
19257ec681f3Smrg      if (!phi->operands[0].isTemp())
19267ec681f3Smrg         continue;
19277ec681f3Smrg
19287ec681f3Smrg      PhysReg reg = phi->operands[0].physReg();
19297ec681f3Smrg      auto OpsSame = [=](const Operand& op) -> bool
19307ec681f3Smrg      { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); };
19317ec681f3Smrg      bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame);
19327ec681f3Smrg      if (!all_same)
19337ec681f3Smrg         continue;
19347ec681f3Smrg
19357ec681f3Smrg      if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg))
19367ec681f3Smrg         continue;
19377ec681f3Smrg
19387ec681f3Smrg      definition.setFixed(reg);
19397ec681f3Smrg      register_file.fill(definition);
19407ec681f3Smrg      ctx.assignments[definition.tempId()].set(definition);
19417ec681f3Smrg   }
19427ec681f3Smrg
19437ec681f3Smrg   /* try to find a register that is used by at least one operand */
19447ec681f3Smrg   for (aco_ptr<Instruction>& phi : block.instructions) {
19457ec681f3Smrg      if (!is_phi(phi))
19467ec681f3Smrg         break;
19477ec681f3Smrg      Definition& definition = phi->definitions[0];
19487ec681f3Smrg      if (definition.isKill() || definition.isFixed())
19497ec681f3Smrg         continue;
19507ec681f3Smrg
19517ec681f3Smrg      /* use affinity if available */
19527ec681f3Smrg      if (ctx.assignments[definition.tempId()].affinity &&
19537ec681f3Smrg          ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
19547ec681f3Smrg         assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
19557ec681f3Smrg         assert(affinity.rc == definition.regClass());
19567ec681f3Smrg         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) {
19577ec681f3Smrg            definition.setFixed(affinity.reg);
19587ec681f3Smrg            register_file.fill(definition);
19597ec681f3Smrg            ctx.assignments[definition.tempId()].set(definition);
19607ec681f3Smrg            continue;
19617ec681f3Smrg         }
19627ec681f3Smrg      }
19637ec681f3Smrg
19647ec681f3Smrg      /* by going backwards, we aim to avoid copies in else-blocks */
19657ec681f3Smrg      for (int i = phi->operands.size() - 1; i >= 0; i--) {
19667ec681f3Smrg         const Operand& op = phi->operands[i];
19677ec681f3Smrg         if (!op.isTemp() || !op.isFixed())
19687ec681f3Smrg            continue;
19697ec681f3Smrg
19707ec681f3Smrg         PhysReg reg = op.physReg();
19717ec681f3Smrg         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
19727ec681f3Smrg            definition.setFixed(reg);
19737ec681f3Smrg            register_file.fill(definition);
19747ec681f3Smrg            ctx.assignments[definition.tempId()].set(definition);
19757ec681f3Smrg            break;
19767ec681f3Smrg         }
19777ec681f3Smrg      }
19787ec681f3Smrg   }
19797ec681f3Smrg
19807ec681f3Smrg   /* find registers for phis where the register was blocked or no operand was assigned */
19817ec681f3Smrg   for (aco_ptr<Instruction>& phi : block.instructions) {
19827ec681f3Smrg      if (!is_phi(phi))
19837ec681f3Smrg         break;
19847ec681f3Smrg
19857ec681f3Smrg      Definition& definition = phi->definitions[0];
19867ec681f3Smrg      if (definition.isKill())
19877ec681f3Smrg         continue;
19887ec681f3Smrg
19897ec681f3Smrg      if (definition.isFixed()) {
19907ec681f3Smrg         instructions.emplace_back(std::move(phi));
19917ec681f3Smrg         continue;
19927ec681f3Smrg      }
19937ec681f3Smrg
19947ec681f3Smrg      std::vector<std::pair<Operand, Definition>> parallelcopy;
19957ec681f3Smrg      definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
19967ec681f3Smrg      update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
19977ec681f3Smrg
19987ec681f3Smrg      /* process parallelcopy */
19997ec681f3Smrg      for (std::pair<Operand, Definition> pc : parallelcopy) {
20007ec681f3Smrg         /* see if it's a copy from a different phi */
20017ec681f3Smrg         // TODO: prefer moving some previous phis over live-ins
20027ec681f3Smrg         // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
20037ec681f3Smrg         // problem in practice since they can only be fixed to exec)
20047ec681f3Smrg         Instruction* prev_phi = NULL;
20057ec681f3Smrg         std::vector<aco_ptr<Instruction>>::iterator phi_it;
20067ec681f3Smrg         for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
20077ec681f3Smrg            if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
20087ec681f3Smrg               prev_phi = phi_it->get();
20097ec681f3Smrg         }
20107ec681f3Smrg         if (prev_phi) {
20117ec681f3Smrg            /* if so, just update that phi's register */
20127ec681f3Smrg            prev_phi->definitions[0].setFixed(pc.second.physReg());
20137ec681f3Smrg            ctx.assignments[prev_phi->definitions[0].tempId()].set(pc.second);
20147ec681f3Smrg            continue;
20157ec681f3Smrg         }
20167ec681f3Smrg
20177ec681f3Smrg         /* rename */
20187ec681f3Smrg         std::unordered_map<unsigned, Temp>::iterator orig_it =
20197ec681f3Smrg            ctx.orig_names.find(pc.first.tempId());
20207ec681f3Smrg         Temp orig = pc.first.getTemp();
20217ec681f3Smrg         if (orig_it != ctx.orig_names.end())
20227ec681f3Smrg            orig = orig_it->second;
20237ec681f3Smrg         else
20247ec681f3Smrg            ctx.orig_names[pc.second.tempId()] = orig;
20257ec681f3Smrg         ctx.renames[block.index][orig.id()] = pc.second.getTemp();
20267ec681f3Smrg
20277ec681f3Smrg         /* otherwise, this is a live-in and we need to create a new phi
20287ec681f3Smrg          * to move it in this block's predecessors */
20297ec681f3Smrg         aco_opcode opcode =
20307ec681f3Smrg            pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
20317ec681f3Smrg         std::vector<unsigned>& preds =
20327ec681f3Smrg            pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
20337ec681f3Smrg         aco_ptr<Instruction> new_phi{
20347ec681f3Smrg            create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
20357ec681f3Smrg         new_phi->definitions[0] = pc.second;
20367ec681f3Smrg         for (unsigned i = 0; i < preds.size(); i++)
20377ec681f3Smrg            new_phi->operands[i] = Operand(pc.first);
20387ec681f3Smrg         instructions.emplace_back(std::move(new_phi));
20397ec681f3Smrg
20407ec681f3Smrg         /* Remove from live_out_per_block (now used for live-in), because handle_loop_phis()
20417ec681f3Smrg          * would re-create this phi later if this is a loop header.
20427ec681f3Smrg          */
20437ec681f3Smrg         live_in.erase(orig.id());
20447ec681f3Smrg      }
20457ec681f3Smrg
20467ec681f3Smrg      register_file.fill(definition);
20477ec681f3Smrg      ctx.assignments[definition.tempId()].set(definition);
20487ec681f3Smrg      instructions.emplace_back(std::move(phi));
20497ec681f3Smrg   }
20507ec681f3Smrg}
20517ec681f3Smrg
20527ec681f3SmrgTemp
20537ec681f3Smrgread_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
20547ec681f3Smrg{
20557ec681f3Smrg   std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
20567ec681f3Smrg   if (it == ctx.renames[block_idx].end())
20577ec681f3Smrg      return val;
20587ec681f3Smrg   else
20597ec681f3Smrg      return it->second;
20607ec681f3Smrg}
20617ec681f3Smrg
20627ec681f3SmrgTemp
20637ec681f3Smrghandle_live_in(ra_ctx& ctx, Temp val, Block* block)
20647ec681f3Smrg{
20657ec681f3Smrg   std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
20667ec681f3Smrg   if (preds.size() == 0)
20677ec681f3Smrg      return val;
20687ec681f3Smrg
20697ec681f3Smrg   if (preds.size() == 1) {
20707ec681f3Smrg      /* if the block has only one predecessor, just look there for the name */
20717ec681f3Smrg      return read_variable(ctx, val, preds[0]);
20727ec681f3Smrg   }
20737ec681f3Smrg
20747ec681f3Smrg   /* there are multiple predecessors and the block is sealed */
20757ec681f3Smrg   Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));
20767ec681f3Smrg
20777ec681f3Smrg   /* get the rename from each predecessor and check if they are the same */
20787ec681f3Smrg   Temp new_val;
20797ec681f3Smrg   bool needs_phi = false;
20807ec681f3Smrg   for (unsigned i = 0; i < preds.size(); i++) {
20817ec681f3Smrg      ops[i] = read_variable(ctx, val, preds[i]);
20827ec681f3Smrg      if (i == 0)
20837ec681f3Smrg         new_val = ops[i];
20847ec681f3Smrg      else
20857ec681f3Smrg         needs_phi |= !(new_val == ops[i]);
20867ec681f3Smrg   }
20877ec681f3Smrg
20887ec681f3Smrg   if (needs_phi) {
20897ec681f3Smrg      assert(!val.regClass().is_linear_vgpr());
20907ec681f3Smrg
20917ec681f3Smrg      /* the variable has been renamed differently in the predecessors: we need to insert a phi */
20927ec681f3Smrg      aco_opcode opcode = val.is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
20937ec681f3Smrg      aco_ptr<Instruction> phi{
20947ec681f3Smrg         create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
20957ec681f3Smrg      new_val = ctx.program->allocateTmp(val.regClass());
20967ec681f3Smrg      phi->definitions[0] = Definition(new_val);
20977ec681f3Smrg      ctx.assignments.emplace_back();
20987ec681f3Smrg      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
20997ec681f3Smrg      for (unsigned i = 0; i < preds.size(); i++) {
21007ec681f3Smrg         /* update the operands so that it uses the new affinity */
21017ec681f3Smrg         phi->operands[i] = Operand(ops[i]);
21027ec681f3Smrg         assert(ctx.assignments[ops[i].id()].assigned);
21037ec681f3Smrg         assert(ops[i].regClass() == new_val.regClass());
21047ec681f3Smrg         phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
21057ec681f3Smrg      }
21067ec681f3Smrg      block->instructions.insert(block->instructions.begin(), std::move(phi));
21077ec681f3Smrg   }
21087ec681f3Smrg
21097ec681f3Smrg   return new_val;
21107ec681f3Smrg}
21117ec681f3Smrg
21127ec681f3Smrgvoid
21137ec681f3Smrghandle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
21147ec681f3Smrg                 uint32_t loop_exit_idx)
21157ec681f3Smrg{
21167ec681f3Smrg   Block& loop_header = ctx.program->blocks[loop_header_idx];
21177ec681f3Smrg   std::unordered_map<unsigned, Temp> renames;
21187ec681f3Smrg
21197ec681f3Smrg   /* create phis for variables renamed during the loop */
21207ec681f3Smrg   for (unsigned t : live_in) {
21217ec681f3Smrg      Temp val = Temp(t, ctx.program->temp_rc[t]);
21227ec681f3Smrg      Temp prev = read_variable(ctx, val, loop_header_idx - 1);
21237ec681f3Smrg      Temp renamed = handle_live_in(ctx, val, &loop_header);
21247ec681f3Smrg      if (renamed == prev)
21257ec681f3Smrg         continue;
21267ec681f3Smrg
21277ec681f3Smrg      /* insert additional renames at block end, but don't overwrite */
21287ec681f3Smrg      renames[prev.id()] = renamed;
21297ec681f3Smrg      ctx.orig_names[renamed.id()] = val;
21307ec681f3Smrg      for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
21317ec681f3Smrg         auto it = ctx.renames[idx].emplace(val.id(), renamed);
21327ec681f3Smrg         /* if insertion is unsuccessful, update if necessary */
21337ec681f3Smrg         if (!it.second && it.first->second == prev)
21347ec681f3Smrg            it.first->second = renamed;
21357ec681f3Smrg      }
21367ec681f3Smrg
21377ec681f3Smrg      /* update loop-carried values of the phi created by handle_live_in() */
21387ec681f3Smrg      for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
21397ec681f3Smrg         Operand& op = loop_header.instructions[0]->operands[i];
21407ec681f3Smrg         if (op.getTemp() == prev)
21417ec681f3Smrg            op.setTemp(renamed);
21427ec681f3Smrg      }
21437ec681f3Smrg
21447ec681f3Smrg      /* use the assignment from the loop preheader and fix def reg */
21457ec681f3Smrg      assignment& var = ctx.assignments[prev.id()];
21467ec681f3Smrg      ctx.assignments[renamed.id()] = var;
21477ec681f3Smrg      loop_header.instructions[0]->definitions[0].setFixed(var.reg);
21487ec681f3Smrg   }
21497ec681f3Smrg
21507ec681f3Smrg   /* rename loop carried phi operands */
21517ec681f3Smrg   for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
21527ec681f3Smrg      aco_ptr<Instruction>& phi = loop_header.instructions[i];
21537ec681f3Smrg      if (!is_phi(phi))
21547ec681f3Smrg         break;
21557ec681f3Smrg      const std::vector<unsigned>& preds =
21567ec681f3Smrg         phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
21577ec681f3Smrg      for (unsigned j = 1; j < phi->operands.size(); j++) {
21587ec681f3Smrg         Operand& op = phi->operands[j];
21597ec681f3Smrg         if (!op.isTemp())
21607ec681f3Smrg            continue;
21617ec681f3Smrg
21627ec681f3Smrg         /* Find the original name, since this operand might not use the original name if the phi
21637ec681f3Smrg          * was created after init_reg_file().
21647ec681f3Smrg          */
21657ec681f3Smrg         std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(op.tempId());
21667ec681f3Smrg         Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();
21677ec681f3Smrg
21687ec681f3Smrg         op.setTemp(read_variable(ctx, orig, preds[j]));
21697ec681f3Smrg         op.setFixed(ctx.assignments[op.tempId()].reg);
21707ec681f3Smrg      }
21717ec681f3Smrg   }
21727ec681f3Smrg
21737ec681f3Smrg   /* return early if no new phi was created */
21747ec681f3Smrg   if (renames.empty())
21757ec681f3Smrg      return;
21767ec681f3Smrg
21777ec681f3Smrg   /* propagate new renames through loop */
21787ec681f3Smrg   for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
21797ec681f3Smrg      Block& current = ctx.program->blocks[idx];
21807ec681f3Smrg      /* rename all uses in this block */
21817ec681f3Smrg      for (aco_ptr<Instruction>& instr : current.instructions) {
21827ec681f3Smrg         /* phis are renamed after RA */
21837ec681f3Smrg         if (idx == loop_header_idx && is_phi(instr))
21847ec681f3Smrg            continue;
21857ec681f3Smrg
21867ec681f3Smrg         for (Operand& op : instr->operands) {
21877ec681f3Smrg            if (!op.isTemp())
21887ec681f3Smrg               continue;
21897ec681f3Smrg
21907ec681f3Smrg            auto rename = renames.find(op.tempId());
21917ec681f3Smrg            if (rename != renames.end()) {
21927ec681f3Smrg               assert(rename->second.id());
21937ec681f3Smrg               op.setTemp(rename->second);
21947ec681f3Smrg            }
21957ec681f3Smrg         }
21967ec681f3Smrg      }
21977ec681f3Smrg   }
21987ec681f3Smrg}
21997ec681f3Smrg
22007ec681f3Smrg/**
22017ec681f3Smrg * This function serves the purpose to correctly initialize the register file
22027ec681f3Smrg * at the beginning of a block (before any existing phis).
22037ec681f3Smrg * In order to do so, all live-in variables are entered into the RegisterFile.
22047ec681f3Smrg * Reg-to-reg moves (renames) from previous blocks are taken into account and
22057ec681f3Smrg * the SSA is repaired by inserting corresponding phi-nodes.
22067ec681f3Smrg */
22077ec681f3SmrgRegisterFile
22087ec681f3Smrginit_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
22097ec681f3Smrg{
22107ec681f3Smrg   if (block.kind & block_kind_loop_exit) {
22117ec681f3Smrg      uint32_t header = ctx.loop_header.back();
22127ec681f3Smrg      ctx.loop_header.pop_back();
22137ec681f3Smrg      handle_loop_phis(ctx, live_out_per_block[header], header, block.index);
22147ec681f3Smrg   }
22157ec681f3Smrg
22167ec681f3Smrg   RegisterFile register_file;
22177ec681f3Smrg   const IDSet& live_in = live_out_per_block[block.index];
22187ec681f3Smrg   assert(block.index != 0 || live_in.empty());
22197ec681f3Smrg
22207ec681f3Smrg   if (block.kind & block_kind_loop_header) {
22217ec681f3Smrg      ctx.loop_header.emplace_back(block.index);
22227ec681f3Smrg      /* already rename phis incoming value */
22237ec681f3Smrg      for (aco_ptr<Instruction>& instr : block.instructions) {
22247ec681f3Smrg         if (!is_phi(instr))
22257ec681f3Smrg            break;
22267ec681f3Smrg         Operand& operand = instr->operands[0];
22277ec681f3Smrg         if (operand.isTemp()) {
22287ec681f3Smrg            operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
22297ec681f3Smrg            operand.setFixed(ctx.assignments[operand.tempId()].reg);
22307ec681f3Smrg         }
22317ec681f3Smrg      }
22327ec681f3Smrg      for (unsigned t : live_in) {
22337ec681f3Smrg         Temp val = Temp(t, ctx.program->temp_rc[t]);
22347ec681f3Smrg         Temp renamed = read_variable(ctx, val, block.index - 1);
22357ec681f3Smrg         if (renamed != val)
22367ec681f3Smrg            ctx.renames[block.index][val.id()] = renamed;
22377ec681f3Smrg         assignment& var = ctx.assignments[renamed.id()];
22387ec681f3Smrg         assert(var.assigned);
22397ec681f3Smrg         register_file.fill(Definition(renamed.id(), var.reg, var.rc));
22407ec681f3Smrg      }
22417ec681f3Smrg   } else {
22427ec681f3Smrg      /* rename phi operands */
22437ec681f3Smrg      for (aco_ptr<Instruction>& instr : block.instructions) {
22447ec681f3Smrg         if (!is_phi(instr))
22457ec681f3Smrg            break;
22467ec681f3Smrg         const std::vector<unsigned>& preds =
22477ec681f3Smrg            instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
22487ec681f3Smrg
22497ec681f3Smrg         for (unsigned i = 0; i < instr->operands.size(); i++) {
22507ec681f3Smrg            Operand& operand = instr->operands[i];
22517ec681f3Smrg            if (!operand.isTemp())
22527ec681f3Smrg               continue;
22537ec681f3Smrg            operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
22547ec681f3Smrg            operand.setFixed(ctx.assignments[operand.tempId()].reg);
22557ec681f3Smrg         }
22567ec681f3Smrg      }
22577ec681f3Smrg      for (unsigned t : live_in) {
22587ec681f3Smrg         Temp val = Temp(t, ctx.program->temp_rc[t]);
22597ec681f3Smrg         Temp renamed = handle_live_in(ctx, val, &block);
22607ec681f3Smrg         assignment& var = ctx.assignments[renamed.id()];
22617ec681f3Smrg         /* due to live-range splits, the live-in might be a phi, now */
22627ec681f3Smrg         if (var.assigned) {
22637ec681f3Smrg            register_file.fill(Definition(renamed.id(), var.reg, var.rc));
22647ec681f3Smrg         }
22657ec681f3Smrg         if (renamed != val) {
22667ec681f3Smrg            ctx.renames[block.index].emplace(t, renamed);
22677ec681f3Smrg            ctx.orig_names[renamed.id()] = val;
22687ec681f3Smrg         }
22697ec681f3Smrg      }
22707ec681f3Smrg   }
22717ec681f3Smrg
22727ec681f3Smrg   return register_file;
22737ec681f3Smrg}
22747ec681f3Smrg
22757ec681f3Smrgvoid
22767ec681f3Smrgget_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
22777ec681f3Smrg{
22787ec681f3Smrg   std::vector<std::vector<Temp>> phi_ressources;
22797ec681f3Smrg   std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
22807ec681f3Smrg
22817ec681f3Smrg   for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
22827ec681f3Smrg        block_rit++) {
22837ec681f3Smrg      Block& block = *block_rit;
22847ec681f3Smrg
22857ec681f3Smrg      /* first, compute the death points of all live vars within the block */
22867ec681f3Smrg      IDSet& live = live_out_per_block[block.index];
22877ec681f3Smrg
22887ec681f3Smrg      std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
22897ec681f3Smrg      for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
22907ec681f3Smrg         aco_ptr<Instruction>& instr = *rit;
22917ec681f3Smrg         if (is_phi(instr))
22927ec681f3Smrg            break;
22937ec681f3Smrg
22947ec681f3Smrg         /* add vector affinities */
22957ec681f3Smrg         if (instr->opcode == aco_opcode::p_create_vector) {
22967ec681f3Smrg            for (const Operand& op : instr->operands) {
22977ec681f3Smrg               if (op.isTemp() && op.isFirstKill() &&
22987ec681f3Smrg                   op.getTemp().type() == instr->definitions[0].getTemp().type())
22997ec681f3Smrg                  ctx.vectors[op.tempId()] = instr.get();
23007ec681f3Smrg            }
23017ec681f3Smrg         } else if (instr->format == Format::MIMG && instr->operands.size() > 4) {
23027ec681f3Smrg            for (unsigned i = 3; i < instr->operands.size(); i++)
23037ec681f3Smrg               ctx.vectors[instr->operands[i].tempId()] = instr.get();
23047ec681f3Smrg         }
23057ec681f3Smrg
23067ec681f3Smrg         if (instr->opcode == aco_opcode::p_split_vector &&
23077ec681f3Smrg             instr->operands[0].isFirstKillBeforeDef())
23087ec681f3Smrg            ctx.split_vectors[instr->operands[0].tempId()] = instr.get();
23097ec681f3Smrg
23107ec681f3Smrg         /* add operands to live variables */
23117ec681f3Smrg         for (const Operand& op : instr->operands) {
23127ec681f3Smrg            if (op.isTemp())
23137ec681f3Smrg               live.insert(op.tempId());
23147ec681f3Smrg         }
23157ec681f3Smrg
23167ec681f3Smrg         /* erase definitions from live */
23177ec681f3Smrg         for (unsigned i = 0; i < instr->definitions.size(); i++) {
23187ec681f3Smrg            const Definition& def = instr->definitions[i];
23197ec681f3Smrg            if (!def.isTemp())
23207ec681f3Smrg               continue;
23217ec681f3Smrg            live.erase(def.tempId());
23227ec681f3Smrg            /* mark last-seen phi operand */
23237ec681f3Smrg            std::unordered_map<unsigned, unsigned>::iterator it =
23247ec681f3Smrg               temp_to_phi_ressources.find(def.tempId());
23257ec681f3Smrg            if (it != temp_to_phi_ressources.end() &&
23267ec681f3Smrg                def.regClass() == phi_ressources[it->second][0].regClass()) {
23277ec681f3Smrg               phi_ressources[it->second][0] = def.getTemp();
23287ec681f3Smrg               /* try to coalesce phi affinities with parallelcopies */
23297ec681f3Smrg               Operand op = Operand();
23307ec681f3Smrg               switch (instr->opcode) {
23317ec681f3Smrg               case aco_opcode::p_parallelcopy: op = instr->operands[i]; break;
23327ec681f3Smrg
23337ec681f3Smrg               case aco_opcode::v_interp_p2_f32:
23347ec681f3Smrg               case aco_opcode::v_writelane_b32:
23357ec681f3Smrg               case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break;
23367ec681f3Smrg
23377ec681f3Smrg               case aco_opcode::v_fma_f32:
23387ec681f3Smrg               case aco_opcode::v_fma_f16:
23397ec681f3Smrg               case aco_opcode::v_pk_fma_f16:
23407ec681f3Smrg                  if (ctx.program->chip_class < GFX10)
23417ec681f3Smrg                     continue;
23427ec681f3Smrg                  FALLTHROUGH;
23437ec681f3Smrg               case aco_opcode::v_mad_f32:
23447ec681f3Smrg               case aco_opcode::v_mad_f16:
23457ec681f3Smrg                  if (instr->usesModifiers())
23467ec681f3Smrg                     continue;
23477ec681f3Smrg                  op = instr->operands[2];
23487ec681f3Smrg                  break;
23497ec681f3Smrg
23507ec681f3Smrg               default: continue;
23517ec681f3Smrg               }
23527ec681f3Smrg
23537ec681f3Smrg               if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
23547ec681f3Smrg                  phi_ressources[it->second].emplace_back(op.getTemp());
23557ec681f3Smrg                  temp_to_phi_ressources[op.tempId()] = it->second;
23567ec681f3Smrg               }
23577ec681f3Smrg            }
23587ec681f3Smrg         }
23597ec681f3Smrg      }
23607ec681f3Smrg
23617ec681f3Smrg      /* collect phi affinities */
23627ec681f3Smrg      for (; rit != block.instructions.rend(); ++rit) {
23637ec681f3Smrg         aco_ptr<Instruction>& instr = *rit;
23647ec681f3Smrg         assert(is_phi(instr));
23657ec681f3Smrg
23667ec681f3Smrg         live.erase(instr->definitions[0].tempId());
23677ec681f3Smrg         if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
23687ec681f3Smrg            continue;
23697ec681f3Smrg
23707ec681f3Smrg         assert(instr->definitions[0].isTemp());
23717ec681f3Smrg         std::unordered_map<unsigned, unsigned>::iterator it =
23727ec681f3Smrg            temp_to_phi_ressources.find(instr->definitions[0].tempId());
23737ec681f3Smrg         unsigned index = phi_ressources.size();
23747ec681f3Smrg         std::vector<Temp>* affinity_related;
23757ec681f3Smrg         if (it != temp_to_phi_ressources.end()) {
23767ec681f3Smrg            index = it->second;
23777ec681f3Smrg            phi_ressources[index][0] = instr->definitions[0].getTemp();
23787ec681f3Smrg            affinity_related = &phi_ressources[index];
23797ec681f3Smrg         } else {
23807ec681f3Smrg            phi_ressources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
23817ec681f3Smrg            affinity_related = &phi_ressources.back();
23827ec681f3Smrg         }
23837ec681f3Smrg
23847ec681f3Smrg         for (const Operand& op : instr->operands) {
23857ec681f3Smrg            if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
23867ec681f3Smrg               affinity_related->emplace_back(op.getTemp());
23877ec681f3Smrg               if (block.kind & block_kind_loop_header)
23887ec681f3Smrg                  continue;
23897ec681f3Smrg               temp_to_phi_ressources[op.tempId()] = index;
23907ec681f3Smrg            }
23917ec681f3Smrg         }
23927ec681f3Smrg      }
23937ec681f3Smrg
23947ec681f3Smrg      /* visit the loop header phis first in order to create nested affinities */
23957ec681f3Smrg      if (block.kind & block_kind_loop_exit) {
23967ec681f3Smrg         /* find loop header */
23977ec681f3Smrg         auto header_rit = block_rit;
23987ec681f3Smrg         while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
23997ec681f3Smrg            header_rit++;
24007ec681f3Smrg
24017ec681f3Smrg         for (aco_ptr<Instruction>& phi : header_rit->instructions) {
24027ec681f3Smrg            if (!is_phi(phi))
24037ec681f3Smrg               break;
24047ec681f3Smrg            if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
24057ec681f3Smrg               continue;
24067ec681f3Smrg
24077ec681f3Smrg            /* create an (empty) merge-set for the phi-related variables */
24087ec681f3Smrg            auto it = temp_to_phi_ressources.find(phi->definitions[0].tempId());
24097ec681f3Smrg            unsigned index = phi_ressources.size();
24107ec681f3Smrg            if (it == temp_to_phi_ressources.end()) {
24117ec681f3Smrg               temp_to_phi_ressources[phi->definitions[0].tempId()] = index;
24127ec681f3Smrg               phi_ressources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
24137ec681f3Smrg            } else {
24147ec681f3Smrg               index = it->second;
24157ec681f3Smrg            }
24167ec681f3Smrg            for (unsigned i = 1; i < phi->operands.size(); i++) {
24177ec681f3Smrg               const Operand& op = phi->operands[i];
24187ec681f3Smrg               if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
24197ec681f3Smrg                  temp_to_phi_ressources[op.tempId()] = index;
24207ec681f3Smrg               }
24217ec681f3Smrg            }
24227ec681f3Smrg         }
24237ec681f3Smrg      }
24247ec681f3Smrg   }
24257ec681f3Smrg   /* create affinities */
24267ec681f3Smrg   for (std::vector<Temp>& vec : phi_ressources) {
24277ec681f3Smrg      for (unsigned i = 1; i < vec.size(); i++)
24287ec681f3Smrg         if (vec[i].id() != vec[0].id())
24297ec681f3Smrg            ctx.assignments[vec[i].id()].affinity = vec[0].id();
24307ec681f3Smrg   }
24317ec681f3Smrg}
24327ec681f3Smrg
24337ec681f3Smrg} /* end namespace */
24347ec681f3Smrg
24357ec681f3Smrgvoid
24367ec681f3Smrgregister_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
24377ec681f3Smrg{
24387ec681f3Smrg   ra_ctx ctx(program, policy);
24397ec681f3Smrg   get_affinities(ctx, live_out_per_block);
24407ec681f3Smrg
24417ec681f3Smrg   /* state of register file after phis */
24427ec681f3Smrg   std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());
24437ec681f3Smrg
24447ec681f3Smrg   for (Block& block : program->blocks) {
24457ec681f3Smrg      ctx.block = &block;
24467ec681f3Smrg
24477ec681f3Smrg      /* initialize register file */
24487ec681f3Smrg      RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
24497ec681f3Smrg      ctx.war_hint.reset();
24507ec681f3Smrg
24517ec681f3Smrg      std::vector<aco_ptr<Instruction>> instructions;
24527ec681f3Smrg
24537ec681f3Smrg      /* this is a slight adjustment from the paper as we already have phi nodes:
24547ec681f3Smrg       * We consider them incomplete phis and only handle the definition. */
24557ec681f3Smrg      get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]);
24567ec681f3Smrg
24577ec681f3Smrg      /* fill in sgpr_live_in */
24587ec681f3Smrg      for (unsigned i = 0; i <= ctx.max_used_sgpr; i++)
24597ec681f3Smrg         sgpr_live_in[block.index][i] = register_file[PhysReg{i}];
24607ec681f3Smrg      sgpr_live_in[block.index][127] = register_file[scc];
24617ec681f3Smrg
24627ec681f3Smrg      /* Handle all other instructions of the block */
24637ec681f3Smrg      auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); };
24647ec681f3Smrg      std::vector<aco_ptr<Instruction>>::iterator instr_it =
24657ec681f3Smrg         std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
24667ec681f3Smrg      for (; instr_it != block.instructions.end(); ++instr_it) {
24677ec681f3Smrg         aco_ptr<Instruction>& instr = *instr_it;
24687ec681f3Smrg
24697ec681f3Smrg         /* parallelcopies from p_phi are inserted here which means
24707ec681f3Smrg          * live ranges of killed operands end here as well */
24717ec681f3Smrg         if (instr->opcode == aco_opcode::p_logical_end) {
24727ec681f3Smrg            /* no need to process this instruction any further */
24737ec681f3Smrg            if (block.logical_succs.size() != 1) {
24747ec681f3Smrg               instructions.emplace_back(std::move(instr));
24757ec681f3Smrg               continue;
24767ec681f3Smrg            }
24777ec681f3Smrg
24787ec681f3Smrg            Block& succ = program->blocks[block.logical_succs[0]];
24797ec681f3Smrg            unsigned idx = 0;
24807ec681f3Smrg            for (; idx < succ.logical_preds.size(); idx++) {
24817ec681f3Smrg               if (succ.logical_preds[idx] == block.index)
24827ec681f3Smrg                  break;
24837ec681f3Smrg            }
24847ec681f3Smrg            for (aco_ptr<Instruction>& phi : succ.instructions) {
24857ec681f3Smrg               if (phi->opcode == aco_opcode::p_phi) {
24867ec681f3Smrg                  if (phi->operands[idx].isTemp() &&
24877ec681f3Smrg                      phi->operands[idx].getTemp().type() == RegType::sgpr &&
24887ec681f3Smrg                      phi->operands[idx].isFirstKillBeforeDef()) {
24897ec681f3Smrg                     Definition phi_op(
24907ec681f3Smrg                        read_variable(ctx, phi->operands[idx].getTemp(), block.index));
24917ec681f3Smrg                     phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg);
24927ec681f3Smrg                     register_file.clear(phi_op);
24937ec681f3Smrg                  }
24947ec681f3Smrg               } else if (phi->opcode != aco_opcode::p_linear_phi) {
24957ec681f3Smrg                  break;
24967ec681f3Smrg               }
24977ec681f3Smrg            }
24987ec681f3Smrg            instructions.emplace_back(std::move(instr));
24997ec681f3Smrg            continue;
25007ec681f3Smrg         }
25017ec681f3Smrg
25027ec681f3Smrg         std::vector<std::pair<Operand, Definition>> parallelcopy;
25037ec681f3Smrg
25047ec681f3Smrg         assert(!is_phi(instr));
25057ec681f3Smrg
25067ec681f3Smrg         bool temp_in_scc = register_file[scc];
25077ec681f3Smrg
25087ec681f3Smrg         /* handle operands */
25097ec681f3Smrg         for (unsigned i = 0; i < instr->operands.size(); ++i) {
25107ec681f3Smrg            auto& operand = instr->operands[i];
25117ec681f3Smrg            if (!operand.isTemp())
25127ec681f3Smrg               continue;
25137ec681f3Smrg
25147ec681f3Smrg            /* rename operands */
25157ec681f3Smrg            operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
25167ec681f3Smrg            assert(ctx.assignments[operand.tempId()].assigned);
25177ec681f3Smrg
25187ec681f3Smrg            PhysReg reg = ctx.assignments[operand.tempId()].reg;
25197ec681f3Smrg            if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass()))
25207ec681f3Smrg               operand.setFixed(reg);
25217ec681f3Smrg            else
25227ec681f3Smrg               get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
25237ec681f3Smrg
25247ec681f3Smrg            if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
25257ec681f3Smrg                (instr->isDS() && instr->ds().gds)) {
25267ec681f3Smrg               for (unsigned j = 0; j < operand.size(); j++)
25277ec681f3Smrg                  ctx.war_hint.set(operand.physReg().reg() + j);
25287ec681f3Smrg            }
25297ec681f3Smrg         }
25307ec681f3Smrg
25317ec681f3Smrg         /* remove dead vars from register file */
25327ec681f3Smrg         for (const Operand& op : instr->operands) {
25337ec681f3Smrg            if (op.isTemp() && op.isFirstKillBeforeDef())
25347ec681f3Smrg               register_file.clear(op);
25357ec681f3Smrg         }
25367ec681f3Smrg
25377ec681f3Smrg         /* try to optimize v_mad_f32 -> v_mac_f32 */
25387ec681f3Smrg         if ((instr->opcode == aco_opcode::v_mad_f32 ||
25397ec681f3Smrg              (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
25407ec681f3Smrg              instr->opcode == aco_opcode::v_mad_f16 ||
25417ec681f3Smrg              instr->opcode == aco_opcode::v_mad_legacy_f16 ||
25427ec681f3Smrg              (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) ||
25437ec681f3Smrg              (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10) ||
25447ec681f3Smrg              (instr->opcode == aco_opcode::v_dot4_i32_i8 && program->family != CHIP_VEGA20)) &&
25457ec681f3Smrg             instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() &&
25467ec681f3Smrg             instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() &&
25477ec681f3Smrg             instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() &&
25487ec681f3Smrg             instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 &&
25497ec681f3Smrg             instr->operands[2].physReg().byte() == 0) {
25507ec681f3Smrg            unsigned def_id = instr->definitions[0].tempId();
25517ec681f3Smrg            bool use_vop2 = true;
25527ec681f3Smrg            if (ctx.assignments[def_id].affinity) {
25537ec681f3Smrg               assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity];
25547ec681f3Smrg               if (affinity.assigned && affinity.reg != instr->operands[2].physReg() &&
25557ec681f3Smrg                   !register_file.test(affinity.reg, instr->operands[2].bytes()))
25567ec681f3Smrg                  use_vop2 = false;
25577ec681f3Smrg            }
25587ec681f3Smrg            if (use_vop2) {
25597ec681f3Smrg               instr->format = Format::VOP2;
25607ec681f3Smrg               switch (instr->opcode) {
25617ec681f3Smrg               case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break;
25627ec681f3Smrg               case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break;
25637ec681f3Smrg               case aco_opcode::v_mad_f16:
25647ec681f3Smrg               case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break;
25657ec681f3Smrg               case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break;
25667ec681f3Smrg               case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break;
25677ec681f3Smrg               case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break;
25687ec681f3Smrg               default: break;
25697ec681f3Smrg               }
25707ec681f3Smrg            }
25717ec681f3Smrg         }
25727ec681f3Smrg
25737ec681f3Smrg         /* handle definitions which must have the same register as an operand */
25747ec681f3Smrg         if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
25757ec681f3Smrg             instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 ||
25767ec681f3Smrg             instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 ||
25777ec681f3Smrg             instr->opcode == aco_opcode::v_pk_fmac_f16 ||
25787ec681f3Smrg             instr->opcode == aco_opcode::v_writelane_b32 ||
25797ec681f3Smrg             instr->opcode == aco_opcode::v_writelane_b32_e64 ||
25807ec681f3Smrg             instr->opcode == aco_opcode::v_dot4c_i32_i8) {
25817ec681f3Smrg            instr->definitions[0].setFixed(instr->operands[2].physReg());
25827ec681f3Smrg         } else if (instr->opcode == aco_opcode::s_addk_i32 ||
25837ec681f3Smrg                    instr->opcode == aco_opcode::s_mulk_i32) {
25847ec681f3Smrg            instr->definitions[0].setFixed(instr->operands[0].physReg());
25857ec681f3Smrg         } else if (instr->isMUBUF() && instr->definitions.size() == 1 &&
25867ec681f3Smrg                    instr->operands.size() == 4) {
25877ec681f3Smrg            instr->definitions[0].setFixed(instr->operands[3].physReg());
25887ec681f3Smrg         } else if (instr->isMIMG() && instr->definitions.size() == 1 &&
25897ec681f3Smrg                    !instr->operands[2].isUndefined()) {
25907ec681f3Smrg            instr->definitions[0].setFixed(instr->operands[2].physReg());
25917ec681f3Smrg         }
25927ec681f3Smrg
25937ec681f3Smrg         ctx.defs_done.reset();
25947ec681f3Smrg
25957ec681f3Smrg         /* handle fixed definitions first */
25967ec681f3Smrg         for (unsigned i = 0; i < instr->definitions.size(); ++i) {
25977ec681f3Smrg            auto& definition = instr->definitions[i];
25987ec681f3Smrg            if (!definition.isFixed())
25997ec681f3Smrg               continue;
26007ec681f3Smrg
26017ec681f3Smrg            adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
26027ec681f3Smrg            /* check if the target register is blocked */
26037ec681f3Smrg            if (register_file.test(definition.physReg(), definition.bytes())) {
26047ec681f3Smrg               const PhysRegInterval def_regs{definition.physReg(), definition.size()};
26057ec681f3Smrg
26067ec681f3Smrg               /* create parallelcopy pair to move blocking vars */
26077ec681f3Smrg               std::set<std::pair<unsigned, unsigned>> vars =
26087ec681f3Smrg                  collect_vars(ctx, register_file, def_regs);
26097ec681f3Smrg
26107ec681f3Smrg               RegisterFile tmp_file(register_file);
26117ec681f3Smrg               /* re-enable the killed operands, so that we don't move the blocking vars there */
26127ec681f3Smrg               for (const Operand& op : instr->operands) {
26137ec681f3Smrg                  if (op.isTemp() && op.isFirstKillBeforeDef())
26147ec681f3Smrg                     tmp_file.fill(op);
26157ec681f3Smrg               }
26167ec681f3Smrg
26177ec681f3Smrg               ASSERTED bool success = false;
26187ec681f3Smrg               DefInfo info(ctx, instr, definition.regClass(), -1);
26197ec681f3Smrg               success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr,
26207ec681f3Smrg                                             def_regs);
26217ec681f3Smrg               assert(success);
26227ec681f3Smrg
26237ec681f3Smrg               update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
26247ec681f3Smrg            }
26257ec681f3Smrg            ctx.defs_done.set(i);
26267ec681f3Smrg
26277ec681f3Smrg            if (!definition.isTemp())
26287ec681f3Smrg               continue;
26297ec681f3Smrg
26307ec681f3Smrg            ctx.assignments[definition.tempId()].set(definition);
26317ec681f3Smrg            register_file.fill(definition);
26327ec681f3Smrg         }
26337ec681f3Smrg
26347ec681f3Smrg         /* handle all other definitions */
26357ec681f3Smrg         for (unsigned i = 0; i < instr->definitions.size(); ++i) {
26367ec681f3Smrg            Definition* definition = &instr->definitions[i];
26377ec681f3Smrg
26387ec681f3Smrg            if (definition->isFixed() || !definition->isTemp())
26397ec681f3Smrg               continue;
26407ec681f3Smrg
26417ec681f3Smrg            /* find free reg */
26427ec681f3Smrg            if (definition->hasHint() &&
26437ec681f3Smrg                get_reg_specified(ctx, register_file, definition->regClass(), instr,
26447ec681f3Smrg                                  definition->physReg())) {
26457ec681f3Smrg               definition->setFixed(definition->physReg());
26467ec681f3Smrg            } else if (instr->opcode == aco_opcode::p_split_vector) {
26477ec681f3Smrg               PhysReg reg = instr->operands[0].physReg();
26487ec681f3Smrg               for (unsigned j = 0; j < i; j++)
26497ec681f3Smrg                  reg.reg_b += instr->definitions[j].bytes();
26507ec681f3Smrg               if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))
26517ec681f3Smrg                  definition->setFixed(reg);
26527ec681f3Smrg            } else if (instr->opcode == aco_opcode::p_wqm ||
26537ec681f3Smrg                       instr->opcode == aco_opcode::p_parallelcopy) {
26547ec681f3Smrg               PhysReg reg = instr->operands[i].physReg();
26557ec681f3Smrg               if (instr->operands[i].isTemp() &&
26567ec681f3Smrg                   instr->operands[i].getTemp().type() == definition->getTemp().type() &&
26577ec681f3Smrg                   !register_file.test(reg, definition->bytes()))
26587ec681f3Smrg                  definition->setFixed(reg);
26597ec681f3Smrg            } else if (instr->opcode == aco_opcode::p_extract_vector) {
26607ec681f3Smrg               PhysReg reg = instr->operands[0].physReg();
26617ec681f3Smrg               reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
26627ec681f3Smrg               if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg))
26637ec681f3Smrg                  definition->setFixed(reg);
26647ec681f3Smrg            } else if (instr->opcode == aco_opcode::p_create_vector) {
26657ec681f3Smrg               PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
26667ec681f3Smrg                                                   parallelcopy, instr);
26677ec681f3Smrg               update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
26687ec681f3Smrg               definition->setFixed(reg);
26697ec681f3Smrg            }
26707ec681f3Smrg
26717ec681f3Smrg            if (!definition->isFixed()) {
26727ec681f3Smrg               Temp tmp = definition->getTemp();
26737ec681f3Smrg               if (definition->regClass().is_subdword() && definition->bytes() < 4) {
26747ec681f3Smrg                  PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
26757ec681f3Smrg                  definition->setFixed(reg);
26767ec681f3Smrg                  if (reg.byte() || register_file.test(reg, 4)) {
26777ec681f3Smrg                     add_subdword_definition(program, instr, reg);
26787ec681f3Smrg                     definition = &instr->definitions[i]; /* add_subdword_definition can invalidate
26797ec681f3Smrg                                                             the reference */
26807ec681f3Smrg                  }
26817ec681f3Smrg               } else {
26827ec681f3Smrg                  definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
26837ec681f3Smrg               }
26847ec681f3Smrg               update_renames(ctx, register_file, parallelcopy, instr,
26857ec681f3Smrg                              instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops
26867ec681f3Smrg                                                                           : (UpdateRenames)0);
26877ec681f3Smrg            }
26887ec681f3Smrg
26897ec681f3Smrg            assert(
26907ec681f3Smrg               definition->isFixed() &&
26917ec681f3Smrg               ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
26927ec681f3Smrg                (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
26937ec681f3Smrg            ctx.defs_done.set(i);
26947ec681f3Smrg            ctx.assignments[definition->tempId()].set(*definition);
26957ec681f3Smrg            register_file.fill(*definition);
26967ec681f3Smrg         }
26977ec681f3Smrg
26987ec681f3Smrg         handle_pseudo(ctx, register_file, instr.get());
26997ec681f3Smrg
27007ec681f3Smrg         /* kill definitions and late-kill operands and ensure that sub-dword operands can actually
27017ec681f3Smrg          * be read */
27027ec681f3Smrg         for (const Definition& def : instr->definitions) {
27037ec681f3Smrg            if (def.isTemp() && def.isKill())
27047ec681f3Smrg               register_file.clear(def);
27057ec681f3Smrg         }
27067ec681f3Smrg         for (unsigned i = 0; i < instr->operands.size(); i++) {
27077ec681f3Smrg            const Operand& op = instr->operands[i];
27087ec681f3Smrg            if (op.isTemp() && op.isFirstKill() && op.isLateKill())
27097ec681f3Smrg               register_file.clear(op);
27107ec681f3Smrg            if (op.isTemp() && op.physReg().byte() != 0)
27117ec681f3Smrg               add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass());
27127ec681f3Smrg         }
27137ec681f3Smrg
27147ec681f3Smrg         /* emit parallelcopy */
27157ec681f3Smrg         if (!parallelcopy.empty()) {
27167ec681f3Smrg            aco_ptr<Pseudo_instruction> pc;
27177ec681f3Smrg            pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy,
27187ec681f3Smrg                                                            Format::PSEUDO, parallelcopy.size(),
27197ec681f3Smrg                                                            parallelcopy.size()));
27207ec681f3Smrg            bool linear_vgpr = false;
27217ec681f3Smrg            bool sgpr_operands_alias_defs = false;
27227ec681f3Smrg            uint64_t sgpr_operands[4] = {0, 0, 0, 0};
27237ec681f3Smrg            for (unsigned i = 0; i < parallelcopy.size(); i++) {
27247ec681f3Smrg               linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
27257ec681f3Smrg
27267ec681f3Smrg               if (temp_in_scc && parallelcopy[i].first.isTemp() &&
27277ec681f3Smrg                   parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
27287ec681f3Smrg                  if (!sgpr_operands_alias_defs) {
27297ec681f3Smrg                     unsigned reg = parallelcopy[i].first.physReg().reg();
27307ec681f3Smrg                     unsigned size = parallelcopy[i].first.getTemp().size();
27317ec681f3Smrg                     sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
27327ec681f3Smrg
27337ec681f3Smrg                     reg = parallelcopy[i].second.physReg().reg();
27347ec681f3Smrg                     size = parallelcopy[i].second.getTemp().size();
27357ec681f3Smrg                     if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
27367ec681f3Smrg                        sgpr_operands_alias_defs = true;
27377ec681f3Smrg                  }
27387ec681f3Smrg               }
27397ec681f3Smrg
27407ec681f3Smrg               pc->operands[i] = parallelcopy[i].first;
27417ec681f3Smrg               pc->definitions[i] = parallelcopy[i].second;
27427ec681f3Smrg               assert(pc->operands[i].size() == pc->definitions[i].size());
27437ec681f3Smrg
27447ec681f3Smrg               /* it might happen that the operand is already renamed. we have to restore the
27457ec681f3Smrg                * original name. */
27467ec681f3Smrg               std::unordered_map<unsigned, Temp>::iterator it =
27477ec681f3Smrg                  ctx.orig_names.find(pc->operands[i].tempId());
27487ec681f3Smrg               Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
27497ec681f3Smrg               ctx.orig_names[pc->definitions[i].tempId()] = orig;
27507ec681f3Smrg               ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp();
27517ec681f3Smrg            }
27527ec681f3Smrg
27537ec681f3Smrg            if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
27547ec681f3Smrg               /* disable definitions and re-enable operands */
27557ec681f3Smrg               RegisterFile tmp_file(register_file);
27567ec681f3Smrg               for (const Definition& def : instr->definitions) {
27577ec681f3Smrg                  if (def.isTemp() && !def.isKill())
27587ec681f3Smrg                     tmp_file.clear(def);
27597ec681f3Smrg               }
27607ec681f3Smrg               for (const Operand& op : instr->operands) {
27617ec681f3Smrg                  if (op.isTemp() && op.isFirstKill())
27627ec681f3Smrg                     tmp_file.block(op.physReg(), op.regClass());
27637ec681f3Smrg               }
27647ec681f3Smrg
27657ec681f3Smrg               handle_pseudo(ctx, tmp_file, pc.get());
27667ec681f3Smrg            } else {
27677ec681f3Smrg               pc->tmp_in_scc = false;
27687ec681f3Smrg            }
27697ec681f3Smrg
27707ec681f3Smrg            instructions.emplace_back(std::move(pc));
27717ec681f3Smrg         }
27727ec681f3Smrg
27737ec681f3Smrg         /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */
27747ec681f3Smrg         bool instr_needs_vop3 =
27757ec681f3Smrg            !instr->isVOP3() &&
27767ec681f3Smrg            ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) ||
27777ec681f3Smrg             (instr->opcode == aco_opcode::v_cndmask_b32 &&
27787ec681f3Smrg              !(instr->operands[2].physReg() == vcc)) ||
27797ec681f3Smrg             ((instr->opcode == aco_opcode::v_add_co_u32 ||
27807ec681f3Smrg               instr->opcode == aco_opcode::v_addc_co_u32 ||
27817ec681f3Smrg               instr->opcode == aco_opcode::v_sub_co_u32 ||
27827ec681f3Smrg               instr->opcode == aco_opcode::v_subb_co_u32 ||
27837ec681f3Smrg               instr->opcode == aco_opcode::v_subrev_co_u32 ||
27847ec681f3Smrg               instr->opcode == aco_opcode::v_subbrev_co_u32) &&
27857ec681f3Smrg              !(instr->definitions[1].physReg() == vcc)) ||
27867ec681f3Smrg             ((instr->opcode == aco_opcode::v_addc_co_u32 ||
27877ec681f3Smrg               instr->opcode == aco_opcode::v_subb_co_u32 ||
27887ec681f3Smrg               instr->opcode == aco_opcode::v_subbrev_co_u32) &&
27897ec681f3Smrg              !(instr->operands[2].physReg() == vcc)));
27907ec681f3Smrg         if (instr_needs_vop3) {
27917ec681f3Smrg
27927ec681f3Smrg            /* if the first operand is a literal, we have to move it to a reg */
27937ec681f3Smrg            if (instr->operands.size() && instr->operands[0].isLiteral() &&
27947ec681f3Smrg                program->chip_class < GFX10) {
27957ec681f3Smrg               bool can_sgpr = true;
27967ec681f3Smrg               /* check, if we have to move to vgpr */
27977ec681f3Smrg               for (const Operand& op : instr->operands) {
27987ec681f3Smrg                  if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
27997ec681f3Smrg                     can_sgpr = false;
28007ec681f3Smrg                     break;
28017ec681f3Smrg                  }
28027ec681f3Smrg               }
28037ec681f3Smrg               /* disable definitions and re-enable operands */
28047ec681f3Smrg               RegisterFile tmp_file(register_file);
28057ec681f3Smrg               for (const Definition& def : instr->definitions)
28067ec681f3Smrg                  tmp_file.clear(def);
28077ec681f3Smrg               for (const Operand& op : instr->operands) {
28087ec681f3Smrg                  if (op.isTemp() && op.isFirstKill())
28097ec681f3Smrg                     tmp_file.block(op.physReg(), op.regClass());
28107ec681f3Smrg               }
28117ec681f3Smrg               Temp tmp = program->allocateTmp(can_sgpr ? s1 : v1);
28127ec681f3Smrg               ctx.assignments.emplace_back();
28137ec681f3Smrg               PhysReg reg = get_reg(ctx, tmp_file, tmp, parallelcopy, instr);
28147ec681f3Smrg               update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
28157ec681f3Smrg
28167ec681f3Smrg               aco_ptr<Instruction> mov;
28177ec681f3Smrg               if (can_sgpr)
28187ec681f3Smrg                  mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32,
28197ec681f3Smrg                                                                 Format::SOP1, 1, 1));
28207ec681f3Smrg               else
28217ec681f3Smrg                  mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32,
28227ec681f3Smrg                                                                 Format::VOP1, 1, 1));
28237ec681f3Smrg               mov->operands[0] = instr->operands[0];
28247ec681f3Smrg               mov->definitions[0] = Definition(tmp);
28257ec681f3Smrg               mov->definitions[0].setFixed(reg);
28267ec681f3Smrg
28277ec681f3Smrg               instr->operands[0] = Operand(tmp);
28287ec681f3Smrg               instr->operands[0].setFixed(reg);
28297ec681f3Smrg               instr->operands[0].setFirstKill(true);
28307ec681f3Smrg
28317ec681f3Smrg               instructions.emplace_back(std::move(mov));
28327ec681f3Smrg            }
28337ec681f3Smrg
28347ec681f3Smrg            /* change the instruction to VOP3 to enable an arbitrary register pair as dst */
28357ec681f3Smrg            aco_ptr<Instruction> tmp = std::move(instr);
28367ec681f3Smrg            Format format = asVOP3(tmp->format);
28377ec681f3Smrg            instr.reset(create_instruction<VOP3_instruction>(
28387ec681f3Smrg               tmp->opcode, format, tmp->operands.size(), tmp->definitions.size()));
28397ec681f3Smrg            std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin());
28407ec681f3Smrg            std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
28417ec681f3Smrg         }
28427ec681f3Smrg
28437ec681f3Smrg         instructions.emplace_back(std::move(*instr_it));
28447ec681f3Smrg
28457ec681f3Smrg      } /* end for Instr */
28467ec681f3Smrg
28477ec681f3Smrg      block.instructions = std::move(instructions);
28487ec681f3Smrg   } /* end for BB */
28497ec681f3Smrg
28507ec681f3Smrg   /* find scc spill registers which may be needed for parallelcopies created by phis */
28517ec681f3Smrg   for (Block& block : program->blocks) {
28527ec681f3Smrg      if (block.linear_preds.size() <= 1)
28537ec681f3Smrg         continue;
28547ec681f3Smrg
28557ec681f3Smrg      std::bitset<128> regs = sgpr_live_in[block.index];
28567ec681f3Smrg      if (!regs[127])
28577ec681f3Smrg         continue;
28587ec681f3Smrg
28597ec681f3Smrg      /* choose a register */
28607ec681f3Smrg      int16_t reg = 0;
28617ec681f3Smrg      for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++)
28627ec681f3Smrg         ;
28637ec681f3Smrg      assert(reg < ctx.program->max_reg_demand.sgpr);
28647ec681f3Smrg      adjust_max_used_regs(ctx, s1, reg);
28657ec681f3Smrg
28667ec681f3Smrg      /* update predecessors */
28677ec681f3Smrg      for (unsigned& pred_index : block.linear_preds) {
28687ec681f3Smrg         Block& pred = program->blocks[pred_index];
28697ec681f3Smrg         pred.scc_live_out = true;
28707ec681f3Smrg         pred.scratch_sgpr = PhysReg{(uint16_t)reg};
28717ec681f3Smrg      }
28727ec681f3Smrg   }
28737ec681f3Smrg
28747ec681f3Smrg   /* num_gpr = rnd_up(max_used_gpr + 1) */
28757ec681f3Smrg   program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1);
28767ec681f3Smrg   program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1);
28777ec681f3Smrg
28787ec681f3Smrg   program->progress = CompilationProgress::after_ra;
28797ec681f3Smrg}
28807ec681f3Smrg
28817ec681f3Smrg} // namespace aco
2882