// aco_register_allocation.cpp revision 7ec681f3
1/* 2 * Copyright © 2018 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
22 * 23 */ 24 25#include "aco_ir.h" 26 27#include <algorithm> 28#include <array> 29#include <bitset> 30#include <map> 31#include <set> 32#include <unordered_map> 33#include <vector> 34 35namespace aco { 36namespace { 37 38struct ra_ctx; 39 40unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, 41 unsigned idx, RegClass rc); 42void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, 43 RegClass rc); 44std::pair<unsigned, unsigned> 45get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc); 46void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg); 47 48struct assignment { 49 PhysReg reg; 50 RegClass rc; 51 bool assigned = false; 52 uint32_t affinity = 0; 53 assignment() = default; 54 assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {} 55 void set(const Definition& def) 56 { 57 assigned = true; 58 reg = def.physReg(); 59 rc = def.regClass(); 60 } 61}; 62 63struct ra_ctx { 64 65 Program* program; 66 Block* block = NULL; 67 std::vector<assignment> assignments; 68 std::vector<std::unordered_map<unsigned, Temp>> renames; 69 std::vector<uint32_t> loop_header; 70 std::unordered_map<unsigned, Temp> orig_names; 71 std::unordered_map<unsigned, Instruction*> vectors; 72 std::unordered_map<unsigned, Instruction*> split_vectors; 73 aco_ptr<Instruction> pseudo_dummy; 74 uint16_t max_used_sgpr = 0; 75 uint16_t max_used_vgpr = 0; 76 uint16_t sgpr_limit; 77 uint16_t vgpr_limit; 78 std::bitset<512> war_hint; 79 std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */ 80 81 ra_test_policy policy; 82 83 ra_ctx(Program* program_, ra_test_policy policy_) 84 : program(program_), assignments(program->peekAllocationId()), 85 renames(program->blocks.size()), policy(policy_) 86 { 87 pseudo_dummy.reset( 88 create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0)); 89 
sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); 90 vgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); 91 } 92}; 93 94/* Iterator type for making PhysRegInterval compatible with range-based for */ 95struct PhysRegIterator { 96 using difference_type = int; 97 using value_type = unsigned; 98 using reference = const unsigned&; 99 using pointer = const unsigned*; 100 using iterator_category = std::bidirectional_iterator_tag; 101 102 PhysReg reg; 103 104 PhysReg operator*() const { return reg; } 105 106 PhysRegIterator& operator++() 107 { 108 reg.reg_b += 4; 109 return *this; 110 } 111 112 PhysRegIterator& operator--() 113 { 114 reg.reg_b -= 4; 115 return *this; 116 } 117 118 bool operator==(PhysRegIterator oth) const { return reg == oth.reg; } 119 120 bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; } 121 122 bool operator<(PhysRegIterator oth) const { return reg < oth.reg; } 123}; 124 125/* Half-open register interval used in "sliding window"-style for-loops */ 126struct PhysRegInterval { 127 PhysReg lo_; 128 unsigned size; 129 130 /* Inclusive lower bound */ 131 PhysReg lo() const { return lo_; } 132 133 /* Exclusive upper bound */ 134 PhysReg hi() const { return PhysReg{lo() + size}; } 135 136 PhysRegInterval& operator+=(uint32_t stride) 137 { 138 lo_ = PhysReg{lo_.reg() + stride}; 139 return *this; 140 } 141 142 bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; } 143 144 /* Construct a half-open interval, excluding the end register */ 145 static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; } 146 147 bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); } 148 149 bool contains(const PhysRegInterval& needle) const 150 { 151 return needle.lo() >= lo() && needle.hi() <= hi(); 152 } 153 154 PhysRegIterator begin() const { return {lo_}; } 155 156 PhysRegIterator end() const { return {PhysReg{lo_ + size}}; } 157}; 158 
/* Returns true iff the two half-open register intervals overlap. */
bool
intersects(const PhysRegInterval& a, const PhysRegInterval& b)
{
   return a.hi() > b.lo() && b.hi() > a.lo();
}

/* Gets the stride for full (non-subdword) registers */
uint32_t
get_stride(RegClass rc)
{
   if (rc.type() == RegType::vgpr) {
      return 1;
   } else {
      /* SGPRs: larger classes must be aligned to a 2- or 4-register boundary. */
      uint32_t size = rc.size();
      if (size == 2) {
         return 2;
      } else if (size >= 4) {
         return 4;
      } else {
         return 1;
      }
   }
}

/* Returns the allocatable register window for the given register type,
 * bounded by the program's current maximum register demand. */
PhysRegInterval
get_reg_bounds(Program* program, RegType type)
{
   if (type == RegType::vgpr) {
      return {PhysReg{256}, (unsigned)program->max_reg_demand.vgpr};
   } else {
      return {PhysReg{0}, (unsigned)program->max_reg_demand.sgpr};
   }
}

/* Size, stride and search bounds for allocating one definition (or operand,
 * when `operand` >= 0) of register class `rc_`. */
struct DefInfo {
   PhysRegInterval bounds; /* window to search for a register */
   uint8_t size;           /* size in dwords */
   uint8_t stride;         /* alignment; in bytes for subdword classes, else in dwords */
   RegClass rc;

   DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_)
   {
      size = rc.size();
      stride = get_stride(rc);

      bounds = get_reg_bounds(ctx.program, rc.type());

      if (rc.is_subdword() && operand >= 0) {
         /* stride in bytes */
         stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc);
      } else if (rc.is_subdword()) {
         std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
         stride = info.first;
         if (info.second > rc.bytes()) {
            /* the instruction writes more bytes than the class holds: widen */
            rc = RegClass::get(rc.type(), info.second);
            size = rc.size();
            /* we might still be able to put the definition in the high half,
             * but that's only useful for affinities and this information isn't
             * used for them */
            stride = align(stride, info.second);
            if (!rc.is_subdword())
               stride = DIV_ROUND_UP(stride, 4);
         }
         assert(stride > 0);
      }
   }
};

/* Tracks which temp occupies each physical register.
 * Each dword entry holds: 0 (free), a temp id, 0xFFFFFFFF (blocked), or the
 * sentinel 0xF0000000 meaning "subdword-allocated" — in that case the per-byte
 * owners live in subdword_regs. */
class RegisterFile {
public:
   RegisterFile() { regs.fill(0); }

   std::array<uint32_t, 512> regs;
   std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;

   const uint32_t& operator[](PhysReg index) const { return regs[index]; }

   uint32_t& operator[](PhysReg index) { return regs[index]; }

   /* Number of completely free dword registers in the interval. */
   unsigned count_zero(PhysRegInterval reg_interval)
   {
      unsigned res = 0;
      for (PhysReg reg : reg_interval)
         res += !regs[reg];
      return res;
   }

   /* Returns true if any of the bytes in the given range are allocated or blocked */
   bool test(PhysReg start, unsigned num_bytes)
   {
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         assert(i <= 511);
         if (regs[i] & 0x0FFFFFFF)
            return true;
         if (regs[i] == 0xF0000000) {
            /* subdword-allocated: check the individual bytes */
            assert(subdword_regs.find(i) != subdword_regs.end());
            for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++) {
               if (subdword_regs[i][j])
                  return true;
            }
         }
      }
      return false;
   }

   /* Mark the register range as blocked (in use by a pending parallelcopy). */
   void block(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
      else
         fill(start, rc.size(), 0xFFFFFFFF);
   }

   bool is_blocked(PhysReg start)
   {
      if (regs[start] == 0xFFFFFFFF)
         return true;
      if (regs[start] == 0xF0000000) {
         for (unsigned i = start.byte(); i < 4; i++)
            if (subdword_regs[start][i] == 0xFFFFFFFF)
               return true;
      }
      return false;
   }

   bool is_empty_or_blocked(PhysReg start)
   {
      /* Empty is 0, blocked is 0xFFFFFFFF, so to check both we compare the
       * incremented value to 1 */
      if (regs[start] == 0xF0000000) {
         return subdword_regs[start][start.byte()] + 1 <= 1;
      }
      return regs[start] + 1 <= 1;
   }

   void clear(PhysReg start, RegClass rc)
   {
      if (rc.is_subdword())
         fill_subdword(start, rc.bytes(), 0);
      else
         fill(start, rc.size(), 0);
   }

   void fill(Operand op)
   {
      if (op.regClass().is_subdword())
         fill_subdword(op.physReg(), op.bytes(), op.tempId());
      else
         fill(op.physReg(), op.size(), op.tempId());
   }

   void clear(Operand op) { clear(op.physReg(), op.regClass()); }

   void fill(Definition def)
   {
      if (def.regClass().is_subdword())
         fill_subdword(def.physReg(), def.bytes(), def.tempId());
      else
         fill(def.physReg(), def.size(), def.tempId());
   }

   void clear(Definition def) { clear(def.physReg(), def.regClass()); }

   /* Temp id occupying the (byte-precise) register, resolving subdword entries. */
   unsigned get_id(PhysReg reg)
   {
      return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
   }

private:
   void fill(PhysReg start, unsigned size, uint32_t val)
   {
      for (unsigned i = 0; i < size; i++)
         regs[start + i] = val;
   }

   void fill_subdword(PhysReg start, unsigned num_bytes, uint32_t val)
   {
      fill(start, DIV_ROUND_UP(num_bytes, 4), 0xF0000000);
      for (PhysReg i = start; i.reg_b < start.reg_b + num_bytes; i = PhysReg(i + 1)) {
         /* emplace or get */
         std::array<uint32_t, 4>& sub =
            subdword_regs.emplace(i, std::array<uint32_t, 4>{0, 0, 0, 0}).first->second;
         for (unsigned j = i.byte(); i * 4 + j < start.reg_b + num_bytes && j < 4; j++)
            sub[j] = val;

         /* if all bytes became free, collapse back to a plain free dword */
         if (sub == std::array<uint32_t, 4>{0, 0, 0, 0}) {
            subdword_regs.erase(i);
            regs[i] = 0;
         }
      }
   }
};

std::set<std::pair<unsigned, unsigned>> find_vars(ra_ctx& ctx, RegisterFile& reg_file,
                                                  const PhysRegInterval reg_interval);

/* helper function for debugging */
UNUSED void
print_reg(const RegisterFile& reg_file, PhysReg reg, bool has_adjacent_variable)
{
   if (reg_file[reg] == 0xFFFFFFFF) {
      printf("☐");
   } else if (reg_file[reg]) {
      const bool show_subdword_alloc = (reg_file[reg] == 0xF0000000);
      if (show_subdword_alloc) {
         /* quadrant glyphs indexed by the bitmask of allocated bytes */
         const char* block_chars[] = {
            // clang-format off
            "?", "▘", "▝", "▀",
            "▖", "▌", "▞", "▛",
            "▗", "▚", "▐", "▜",
            "▄", "▙", "▟", "▉"
            // clang-format on
         };
         unsigned index = 0;
         for (int i = 0; i < 4; ++i) {
            if (reg_file.subdword_regs.at(reg)[i]) {
               index |= 1 << i;
            }
         }
         printf("%s", block_chars[index]);
      } else {
         /* Indicate filled register slot */
         if (!has_adjacent_variable) {
            printf("█");
         } else {
            /* Use a slightly shorter box to leave a small gap between adjacent variables */
            printf("▉");
         }
      }
   } else {
      printf("·");
   }
}

/* helper function for debugging */
UNUSED void
print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file)
{
   PhysRegInterval regs = get_reg_bounds(ctx.program, vgprs ? RegType::vgpr : RegType::sgpr);
   char reg_char = vgprs ? 'v' : 's';
   const int max_regs_per_line = 64;

   /* print markers */
   printf("       ");
   for (int i = 0; i < std::min<int>(max_regs_per_line, ROUND_DOWN_TO(regs.size, 4)); i += 4) {
      printf("%-3.2u ", i);
   }
   printf("\n");

   /* print usage */
   auto line_begin_it = regs.begin();
   while (line_begin_it != regs.end()) {
      const int regs_in_line =
         std::min<int>(max_regs_per_line, std::distance(line_begin_it, regs.end()));

      if (line_begin_it == regs.begin()) {
         printf("%cgprs: ", reg_char);
      } else {
         printf("  %+4d ", std::distance(regs.begin(), line_begin_it));
      }
      const auto line_end_it = std::next(line_begin_it, regs_in_line);

      for (auto reg_it = line_begin_it; reg_it != line_end_it; ++reg_it) {
         bool has_adjacent_variable =
            (std::next(reg_it) != line_end_it &&
             reg_file[*reg_it] != reg_file[*std::next(reg_it)] && reg_file[*std::next(reg_it)]);
         print_reg(reg_file, *reg_it, has_adjacent_variable);
      }

      line_begin_it = line_end_it;
      printf("\n");
   }

   const unsigned free_regs =
      std::count_if(regs.begin(), regs.end(), [&](auto reg) { return !reg_file[reg]; });
   printf("%u/%u used, %u/%u free\n", regs.size - free_regs, regs.size, free_regs, regs.size);

   /* print assignments ordered by registers */
   std::map<PhysReg, std::pair<unsigned, unsigned>>
      regs_to_vars; /* maps to byte size and temp id */
   for (const auto& size_id : find_vars(ctx, reg_file, regs)) {
      auto reg = ctx.assignments[size_id.second].reg;
      ASSERTED auto inserted = regs_to_vars.emplace(reg, size_id);
      assert(inserted.second);
   }

   for (const auto& reg_and_var : regs_to_vars) {
      const auto& first_reg = reg_and_var.first;
      const auto& size_id = reg_and_var.second;

      printf("%%%u ", size_id.second);
      if (ctx.orig_names.count(size_id.second) &&
          ctx.orig_names[size_id.second].id() != size_id.second) {
         printf("(was %%%d) ", ctx.orig_names[size_id.second].id());
      }
      printf("= %c[%d", reg_char, first_reg.reg() - regs.lo());
      PhysReg last_reg = first_reg.advance(size_id.first - 1);
      if (first_reg.reg() != last_reg.reg()) {
         assert(first_reg.byte() == 0 && last_reg.byte() == 3);
         printf("-%d", last_reg.reg() - regs.lo());
      }
      printf("]");
      if (first_reg.byte() != 0 || last_reg.byte() != 3) {
         /* print byte-precise subrange for subdword variables */
         printf("[%d:%d]", first_reg.byte() * 8, (last_reg.byte() + 1) * 8);
      }
      printf("\n");
   }
}

/* Returns the byte alignment required for a subdword operand of `instr`. */
unsigned
get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx,
                            RegClass rc)
{
   if (instr->isPseudo()) {
      /* v_readfirstlane_b32 cannot use SDWA */
      if (instr->opcode == aco_opcode::p_as_uniform)
         return 4;
      else if (chip >= GFX8)
         return rc.bytes() % 2 == 0 ? 2 : 1;
      else
         return 4;
   }

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      if (can_use_SDWA(chip, instr, false))
         return rc.bytes();
      if (can_use_opsel(chip, instr->opcode, idx, true))
         return 2;
      if (instr->format == Format::VOP3P)
         return 2;
   }

   switch (instr->opcode) {
   case aco_opcode::v_cvt_f32_ubyte0: return 1;
   case aco_opcode::ds_write_b8:
   case aco_opcode::ds_write_b16: return chip >= GFX9 ?
                                                       2 : 4;
   case aco_opcode::buffer_store_byte:
   case aco_opcode::buffer_store_short:
   case aco_opcode::flat_store_byte:
   case aco_opcode::flat_store_short:
   case aco_opcode::scratch_store_byte:
   case aco_opcode::scratch_store_short:
   case aco_opcode::global_store_byte:
   case aco_opcode::global_store_short: return chip >= GFX9 ? 2 : 4;
   default: return 4;
   }
}

/* Rewrites `instr` so that operand `idx` can read from byte offset `byte`
 * (e.g. via opsel, SDWA, or a *_d16_hi opcode variant). */
void
add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte,
                     RegClass rc)
{
   chip_class chip = ctx.program->chip_class;
   if (instr->isPseudo() || byte == 0)
      return;

   assert(rc.bytes() <= 2);
   if (instr->isVALU()) {
      /* check if we can use opsel */
      if (instr->format == Format::VOP3) {
         assert(byte == 2);
         instr->vop3().opsel |= 1 << idx;
         return;
      }
      if (instr->isVOP3P()) {
         assert(byte == 2 && !(instr->vop3p().opsel_lo & (1 << idx)));
         instr->vop3p().opsel_lo |= 1 << idx;
         instr->vop3p().opsel_hi |= 1 << idx;
         return;
      }
      if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
         /* select the matching byte-extract variant */
         switch (byte) {
         case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
         case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
         case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
         case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
         }
         return;
      }

      /* use SDWA */
      assert(can_use_SDWA(chip, instr, false));
      convert_to_SDWA(chip, instr);
      return;
   }

   /* memory stores: switch to the d16_hi opcode variant */
   assert(byte == 2);
   if (instr->opcode == aco_opcode::ds_write_b8)
      instr->opcode = aco_opcode::ds_write_b8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_write_b16)
      instr->opcode = aco_opcode::ds_write_b16_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_byte)
      instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_store_short)
      instr->opcode = aco_opcode::buffer_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::flat_store_byte)
      instr->opcode = aco_opcode::flat_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_store_short)
      instr->opcode = aco_opcode::flat_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_byte)
      instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_store_short)
      instr->opcode = aco_opcode::scratch_store_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_byte)
      instr->opcode = aco_opcode::global_store_byte_d16_hi;
   else if (instr->opcode == aco_opcode::global_store_short)
      instr->opcode = aco_opcode::global_store_short_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
   return;
}

/* minimum_stride, bytes_written */
std::pair<unsigned, unsigned>
get_subdword_definition_info(Program* program, const aco_ptr<Instruction>& instr, RegClass rc)
{
   chip_class chip = program->chip_class;

   if (instr->isPseudo()) {
      if (chip >= GFX8)
         return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
      else
         return std::make_pair(4, rc.size() * 4u);
   }

   if (instr->isVALU() || instr->isVINTRP()) {
      assert(rc.bytes() <= 2);

      if (can_use_SDWA(chip, instr, false))
         return std::make_pair(rc.bytes(), rc.bytes());

      unsigned bytes_written = 4u;
      if (instr_is_16bit(chip, instr->opcode))
         bytes_written = 2u;

      unsigned stride = 4u;
      if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
          can_use_opsel(chip, instr->opcode, -1, true))
         stride = 2u;

      return std::make_pair(stride, bytes_written);
   }

   switch (instr->opcode) {
   case aco_opcode::ds_read_u8_d16:
   case aco_opcode::ds_read_i8_d16:
   case aco_opcode::ds_read_u16_d16:
   case aco_opcode::flat_load_ubyte_d16:
   case aco_opcode::flat_load_sbyte_d16:
   case aco_opcode::flat_load_short_d16:
   case aco_opcode::global_load_ubyte_d16:
   case aco_opcode::global_load_sbyte_d16:
   case aco_opcode::global_load_short_d16:
   case aco_opcode::scratch_load_ubyte_d16:
   case aco_opcode::scratch_load_sbyte_d16:
   case aco_opcode::scratch_load_short_d16:
   case aco_opcode::buffer_load_ubyte_d16:
   case aco_opcode::buffer_load_sbyte_d16:
   case aco_opcode::buffer_load_short_d16: {
      assert(chip >= GFX9);
      /* with SRAM ECC, d16 loads clobber the full dword */
      if (!program->dev.sram_ecc_enabled)
         return std::make_pair(2u, 2u);
      else
         return std::make_pair(2u, 4u);
   }

   default: return std::make_pair(4, rc.size() * 4u);
   }
}

/* Rewrites `instr` so that its definition can be written at byte offset
 * reg.byte() (via opsel, mixhi, SDWA, or a *_d16_hi load variant). */
void
add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg)
{
   if (instr->isPseudo())
      return;

   if (instr->isVALU()) {
      chip_class chip = program->chip_class;
      assert(instr->definitions[0].bytes() <= 2);

      if (reg.byte() == 0 && instr_is_16bit(chip, instr->opcode))
         return;

      /* check if we can use opsel */
      if (instr->format == Format::VOP3) {
         assert(reg.byte() == 2);
         assert(can_use_opsel(chip, instr->opcode, -1, true));
         instr->vop3().opsel |= (1 << 3); /* dst in high half */
         return;
      }

      if (instr->opcode == aco_opcode::v_fma_mixlo_f16) {
         instr->opcode = aco_opcode::v_fma_mixhi_f16;
         return;
      }

      /* use SDWA */
      assert(can_use_SDWA(chip, instr, false));
      convert_to_SDWA(chip, instr);
      return;
   }

   /* memory loads: switch to the d16_hi opcode variant */
   if (reg.byte() == 0)
      return;
   else if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
      instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_sbyte_d16)
      instr->opcode = aco_opcode::buffer_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::buffer_load_short_d16)
      instr->opcode = aco_opcode::buffer_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
      instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_sbyte_d16)
      instr->opcode = aco_opcode::flat_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::flat_load_short_d16)
      instr->opcode = aco_opcode::flat_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
      instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_sbyte_d16)
      instr->opcode = aco_opcode::scratch_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::scratch_load_short_d16)
      instr->opcode = aco_opcode::scratch_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
      instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_sbyte_d16)
      instr->opcode = aco_opcode::global_load_sbyte_d16_hi;
   else if (instr->opcode == aco_opcode::global_load_short_d16)
      instr->opcode = aco_opcode::global_load_short_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u8_d16)
      instr->opcode = aco_opcode::ds_read_u8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_i8_d16)
      instr->opcode = aco_opcode::ds_read_i8_d16_hi;
   else if (instr->opcode == aco_opcode::ds_read_u16_d16)
      instr->opcode = aco_opcode::ds_read_u16_d16_hi;
   else
      unreachable("Something went wrong: Impossible register assignment.");
}

/* Updates the high-water marks of used registers for the final program. */
void
adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
{
   uint16_t max_addressible_sgpr = ctx.sgpr_limit;
   unsigned size = rc.size();
   if (rc.type() == RegType::vgpr) {
      assert(reg >= 256);
      uint16_t hi = reg - 256 + size - 1;
      ctx.max_used_vgpr = std::max(ctx.max_used_vgpr, hi);
   } else if (reg + rc.size() <= max_addressible_sgpr) {
      uint16_t hi = reg + size - 1;
      ctx.max_used_sgpr = std::max(ctx.max_used_sgpr, std::min(hi, max_addressible_sgpr));
   }
}

enum UpdateRenames {
   rename_not_killed_ops = 0x1,
   fill_killed_ops = 0x2,
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);

/* Integrates freshly created parallelcopies into the register file and
 * renames the affected operands of `instr` and of other pending copies. */
void
update_renames(ra_ctx& ctx, RegisterFile& reg_file,
               std::vector<std::pair<Operand, Definition>>& parallelcopies,
               aco_ptr<Instruction>& instr, UpdateRenames flags)
{
   /* clear operands */
   for (std::pair<Operand, Definition>& copy : parallelcopies) {
      /* the definitions with id are not from this function and already handled */
      if (copy.second.isTemp())
         continue;
      reg_file.clear(copy.first);
   }

   /* allocate id's and rename operands: this is done transparently here */
   auto it = parallelcopies.begin();
   while (it != parallelcopies.end()) {
      if (it->second.isTemp()) {
         ++it;
         continue;
      }

      /* check if we moved a definition: change the register and remove copy */
      bool is_def = false;
      for (Definition& def : instr->definitions) {
         if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
            // FIXME: ensure that the definition can use this reg
            def.setFixed(it->second.physReg());
            reg_file.fill(def);
            ctx.assignments[def.tempId()].reg = def.physReg();
            it =
                parallelcopies.erase(it);
            is_def = true;
            break;
         }
      }
      if (is_def)
         continue;

      /* check if we moved another parallelcopy definition */
      for (std::pair<Operand, Definition>& other : parallelcopies) {
         if (!other.second.isTemp())
            continue;
         if (it->first.getTemp() == other.second.getTemp()) {
            other.second.setFixed(it->second.physReg());
            ctx.assignments[other.second.tempId()].reg = other.second.physReg();
            it = parallelcopies.erase(it);
            is_def = true;
            /* check if we moved an operand, again */
            bool fill = true;
            for (Operand& op : instr->operands) {
               if (op.isTemp() && op.tempId() == other.second.tempId()) {
                  // FIXME: ensure that the operand can use this reg
                  op.setFixed(other.second.physReg());
                  fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
               }
            }
            if (fill)
               reg_file.fill(other.second);
            break;
         }
      }
      if (is_def)
         continue;

      /* a real copy: allocate a fresh temp id for the destination */
      std::pair<Operand, Definition>& copy = *it;
      copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
      ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());

      /* check if we moved an operand */
      bool first = true;
      bool fill = true;
      for (unsigned i = 0; i < instr->operands.size(); i++) {
         Operand& op = instr->operands[i];
         if (!op.isTemp())
            continue;
         if (op.tempId() == copy.first.tempId()) {
            /* renaming is unnecessary if the killed operand's register is not
             * overwritten by any copy destination */
            bool omit_renaming = !(flags & rename_not_killed_ops) && !op.isKillBeforeDef();
            for (std::pair<Operand, Definition>& pc : parallelcopies) {
               PhysReg def_reg = pc.second.physReg();
               omit_renaming &= def_reg > copy.first.physReg()
                                   ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
                                   : (def_reg + pc.second.size() <= copy.first.physReg().reg());
            }
            if (omit_renaming) {
               if (first)
                  op.setFirstKill(true);
               else
                  op.setKill(true);
               first = false;
               continue;
            }
            op.setTemp(copy.second.getTemp());
            op.setFixed(copy.second.physReg());

            fill = (flags & fill_killed_ops) || !op.isKillBeforeDef();
         }
      }

      if (fill)
         reg_file.fill(copy.second);

      ++it;
   }
}

/* Tries to find a free register window without moving any other variables.
 * Returns {register, true} on success, {undefined, false} otherwise. */
std::pair<PhysReg, bool>
get_reg_simple(ra_ctx& ctx, RegisterFile& reg_file, DefInfo info)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
   RegClass rc = info.rc;

   /* first try more aligned placements, which help later allocations */
   DefInfo new_info = info;
   new_info.rc = RegClass(rc.type(), size);
   for (unsigned new_stride = 16; new_stride > stride; new_stride /= 2) {
      if (size % new_stride)
         continue;
      new_info.stride = new_stride;
      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, new_info);
      if (res.second)
         return res;
   }

   auto is_free = [&](PhysReg reg_index)
   { return reg_file[reg_index] == 0 && !ctx.war_hint[reg_index]; };

   if (stride == 1) {
      /* best fit algorithm: find the smallest gap to fit in the variable */
      PhysRegInterval best_gap{PhysReg{0}, UINT_MAX};
      const unsigned max_gpr =
         (rc.type() == RegType::vgpr) ? (256 + ctx.max_used_vgpr) : ctx.max_used_sgpr;

      PhysRegIterator reg_it = bounds.begin();
      const PhysRegIterator end_it =
         std::min(bounds.end(), std::max(PhysRegIterator{PhysReg{max_gpr + 1}}, reg_it));
      while (reg_it != bounds.end()) {
         /* Find the next chunk of available register slots */
         reg_it = std::find_if(reg_it, end_it, is_free);
         auto next_nonfree_it = std::find_if_not(reg_it, end_it, is_free);
         if (reg_it == bounds.end()) {
            break;
         }

         if (next_nonfree_it == end_it) {
            /* All registers past max_used_gpr are free */
            next_nonfree_it = bounds.end();
         }

         PhysRegInterval gap = PhysRegInterval::from_until(*reg_it, *next_nonfree_it);

         /* early return on exact matches */
         if (size == gap.size) {
            adjust_max_used_regs(ctx, rc, gap.lo());
            return {gap.lo(), true};
         }

         /* check if it fits and the gap size is smaller */
         if (size < gap.size && gap.size < best_gap.size) {
            best_gap = gap;
         }

         /* Move past the processed chunk */
         reg_it = next_nonfree_it;
      }

      if (best_gap.size == UINT_MAX)
         return {{}, false};

      /* find best position within gap by leaving a good stride for other variables*/
      unsigned buffer = best_gap.size - size;
      if (buffer > 1) {
         if (((best_gap.lo() + size) % 8 != 0 && (best_gap.lo() + buffer) % 8 == 0) ||
             ((best_gap.lo() + size) % 4 != 0 && (best_gap.lo() + buffer) % 4 == 0) ||
             ((best_gap.lo() + size) % 2 != 0 && (best_gap.lo() + buffer) % 2 == 0))
            best_gap = {PhysReg{best_gap.lo() + buffer}, best_gap.size - buffer};
      }

      adjust_max_used_regs(ctx, rc, best_gap.lo());
      return {best_gap.lo(), true};
   }

   /* strided search: slide an aligned window over the bounds */
   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      if (reg_file[reg_win.lo()] != 0) {
         continue;
      }

      bool is_valid = std::all_of(std::next(reg_win.begin()), reg_win.end(), is_free);
      if (is_valid) {
         adjust_max_used_regs(ctx, rc, reg_win.lo());
         return {reg_win.lo(), true};
      }
   }

   /* do this late because using the upper bytes of a register can require
    * larger instruction encodings or copies
    * TODO: don't do this in situations where it doesn't benefit */
   if (rc.is_subdword()) {
      for (std::pair<const uint32_t, std::array<uint32_t, 4>>& entry : reg_file.subdword_regs) {
         assert(reg_file[PhysReg{entry.first}] == 0xF0000000);
         if (!bounds.contains({PhysReg{entry.first}, rc.size()}))
            continue;

         for (unsigned i = 0; i < 4; i += info.stride) {
            /* check if there's a block of free bytes large enough to hold the register */
            bool reg_found =
               std::all_of(&entry.second[i], &entry.second[std::min(4u, i + rc.bytes())],
                           [](unsigned v) { return v == 0; });

            /* check if also the neighboring reg is free if needed */
            if (reg_found && i + rc.bytes() > 4)
               reg_found = (reg_file[PhysReg{entry.first + 1}] == 0);

            if (reg_found) {
               PhysReg res{entry.first};
               res.reg_b += i;
               adjust_max_used_regs(ctx, rc, entry.first);
               return {res, true};
            }
         }
      }
   }

   return {{}, false};
}

/* collect variables from a register area and clear reg_file */
std::set<std::pair<unsigned, unsigned>>
find_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   std::set<std::pair<unsigned, unsigned>> vars;
   for (PhysReg j : reg_interval) {
      if (reg_file.is_blocked(j))
         continue;
      if (reg_file[j] == 0xF0000000) {
         /* subdword slot: gather every byte-resident temp */
         for (unsigned k = 0; k < 4; k++) {
            unsigned id = reg_file.subdword_regs[j][k];
            if (id) {
               assignment& var = ctx.assignments[id];
               vars.emplace(var.rc.bytes(), id);
            }
         }
      } else if (reg_file[j] != 0) {
         unsigned id = reg_file[j];
         assignment& var = ctx.assignments[id];
         vars.emplace(var.rc.bytes(), id);
      }
   }
   return vars;
}

/* collect variables from a register area and clear
 * reg_file */
std::set<std::pair<unsigned, unsigned>>
collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_interval)
{
   /* gather the variables, then free their registers so the area can be reused */
   std::set<std::pair<unsigned, unsigned>> vars = find_vars(ctx, reg_file, reg_interval);
   for (std::pair<unsigned, unsigned> size_id : vars) {
      assignment& var = ctx.assignments[size_id.second];
      reg_file.clear(var.reg, var.rc);
   }
   return vars;
}

/* Find new registers for all variables in "vars" (which were evicted from the
 * area around the definition) and append the necessary moves to
 * "parallelcopies". May recurse when placing one variable requires evicting
 * others. Returns false if no placement could be found. */
bool
get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
                    std::vector<std::pair<Operand, Definition>>& parallelcopies,
                    const std::set<std::pair<unsigned, unsigned>>& vars,
                    const PhysRegInterval bounds, aco_ptr<Instruction>& instr,
                    const PhysRegInterval def_reg)
{
   /* variables are sorted from small sized to large */
   /* NOTE: variables are also sorted by ID. this only affects a very small number of shaders
    * slightly though. */
   /* iterate in reverse: place the largest variables first */
   for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin();
        it != vars.rend(); ++it) {
      unsigned id = it->second;
      assignment& var = ctx.assignments[id];
      DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
      uint32_t size = info.size;

      /* check if this is a dead operand, then we can re-use the space from the definition
       * also use the correct stride for sub-dword operands */
      bool is_dead_operand = false;
      for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
         if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
            if (instr->operands[i].isKillBeforeDef())
               is_dead_operand = true;
            info = DefInfo(ctx, instr, var.rc, i);
            break;
         }
      }

      std::pair<PhysReg, bool> res;
      if (is_dead_operand) {
         if (instr->opcode == aco_opcode::p_create_vector) {
            /* for p_create_vector, a killed operand's ideal position is its slot
             * inside the result vector */
            PhysReg reg(def_reg.lo());
            for (unsigned i = 0; i < instr->operands.size(); i++) {
               if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
                  res = {reg,
                         (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) &&
                            !reg_file.test(reg, var.rc.bytes())};
                  break;
               }
               reg.reg_b += instr->operands[i].bytes();
            }
            if (!res.second)
               res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
         } else {
            /* other dead operands may live anywhere inside the definition space */
            info.bounds = def_reg;
            res = get_reg_simple(ctx, reg_file, info);
         }
      } else {
         /* Try to find space within the bounds but outside of the definition */
         info.bounds = PhysRegInterval::from_until(bounds.lo(), MIN2(def_reg.lo(), bounds.hi()));
         res = get_reg_simple(ctx, reg_file, info);
         if (!res.second && def_reg.hi() <= bounds.hi()) {
            /* retry above the definition, aligned up to the required stride */
            unsigned lo = (def_reg.hi() + info.stride - 1) & ~(info.stride - 1);
            info.bounds = PhysRegInterval::from_until(PhysReg{lo}, bounds.hi());
            res = get_reg_simple(ctx, reg_file, info);
         }
      }

      if (res.second) {
         /* mark the area as blocked */
         reg_file.block(res.first, var.rc);

         /* create parallelcopy pair (without definition id) */
         Temp tmp = Temp(id, var.rc);
         Operand pc_op = Operand(tmp);
         pc_op.setFixed(var.reg);
         Definition pc_def = Definition(res.first, pc_op.regClass());
         parallelcopies.emplace_back(pc_op, pc_def);
         continue;
      }

      /* no free space: pick a window whose occupants are cheapest to evict */
      PhysReg best_pos = bounds.lo();
      unsigned num_moves = 0xFF;
      unsigned num_vars = 0;

      /* we use a sliding window to find potential positions */
      unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
      for (PhysRegInterval reg_win{bounds.lo(), size}; reg_win.hi() <= bounds.hi();
           reg_win += stride) {
         if (!is_dead_operand && intersects(reg_win, def_reg))
            continue;

         /* second, check that we have at most k=num_moves elements in the window
          * and no element is larger than the currently processed one */
         unsigned k = 0; /* estimated cost (dwords to move) */
         unsigned n = 0; /* number of distinct variables in the window */
         unsigned last_var = 0;
         bool found = true;
         for (PhysReg j : reg_win) {
            if (reg_file[j] == 0 || reg_file[j] == last_var)
               continue;

            if (reg_file.is_blocked(j) || k > num_moves) {
               found = false;
               break;
            }
            if (reg_file[j] == 0xF0000000) {
               /* subdword-occupied dword: counted as one move */
               k += 1;
               n++;
               continue;
            }
            /* we cannot split live ranges of linear vgprs inside control flow */
            if (!(ctx.block->kind & block_kind_top_level) &&
                ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
               found = false;
               break;
            }
            bool is_kill = false;
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isKillBeforeDef() && op.tempId() == reg_file[j]) {
                  is_kill = true;
                  break;
               }
            }
            /* never evict a same-or-larger live variable: recursion would not shrink */
            if (!is_kill && ctx.assignments[reg_file[j]].rc.size() >= size) {
               found = false;
               break;
            }

            k += ctx.assignments[reg_file[j]].rc.size();
            last_var = reg_file[j];
            n++;
            if (k > num_moves || (k == num_moves && n <= num_vars)) {
               found = false;
               break;
            }
         }

         if (found) {
            best_pos = reg_win.lo();
            num_moves = k;
            num_vars = n;
         }
      }

      /* FIXME: we messed up and couldn't find space for the variables to be copied */
      if (num_moves == 0xFF)
         return false;

      PhysRegInterval reg_win{best_pos, size};

      /* collect variables and block reg file */
      std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, reg_win);

      /* mark the area as blocked */
      reg_file.block(reg_win.lo(), var.rc);
      adjust_max_used_regs(ctx, var.rc, reg_win.lo());

      /* recursively place the variables we just evicted */
      if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, bounds, instr, def_reg))
         return false;

      /* create parallelcopy pair (without definition id) */
      Temp tmp = Temp(id, var.rc);
      Operand pc_op = Operand(tmp);
      pc_op.setFixed(var.reg);
      Definition pc_def = Definition(reg_win.lo(), pc_op.regClass());
      parallelcopies.emplace_back(pc_op, pc_def);
   }

   return true;
}

/* Find a register for the definition described by "info", allowing live-range
 * splits: occupants of the chosen window are evicted via get_regs_for_copies()
 * and the required moves are appended to "parallelcopies". */
std::pair<PhysReg, bool>
get_reg_impl(ra_ctx& ctx, RegisterFile& reg_file,
             std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
             aco_ptr<Instruction>& instr)
{
   const PhysRegInterval& bounds = info.bounds;
   uint32_t size = info.size;
   uint32_t stride = info.stride;
   RegClass rc = info.rc;

   /* check how many free regs we have */
   unsigned regs_free = reg_file.count_zero(bounds);

   /* mark and count killed operands */
   unsigned killed_ops = 0;
   std::bitset<256> is_killed_operand; /* per-register */
   for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
      Operand& op = instr->operands[j];
      if (op.isTemp() && op.isFirstKillBeforeDef() && bounds.contains(op.physReg()) &&
          !reg_file.test(PhysReg{op.physReg().reg()}, align(op.bytes() + op.physReg().byte(), 4))) {
         assert(op.isFixed());

         /* index by register number within the bank (& 0xff strips the VGPR offset) */
         for (unsigned i = 0; i < op.size(); ++i) {
            is_killed_operand[(op.physReg() & 0xff) + i] = true;
         }

         killed_ops += op.getTemp().size();
      }
   }

   assert(regs_free >= size);
   /* we might have to move dead operands to dst in order to make space */
   unsigned op_moves = 0;

   if (size > (regs_free - killed_ops))
      op_moves = size - (regs_free - killed_ops);

   /* find the best position to place the definition */
   PhysRegInterval best_win = {bounds.lo(), size};
   unsigned num_moves = 0xFF;
   unsigned
num_vars = 0;

   /* we use a sliding window to check potential positions */
   for (PhysRegInterval reg_win = {bounds.lo(), size}; reg_win.hi() <= bounds.hi();
        reg_win += stride) {
      /* first check if the register window starts in the middle of an
       * allocated variable: this is what we have to fix to allow for
       * num_moves > size */
      if (reg_win.lo() > bounds.lo() && !reg_file.is_empty_or_blocked(reg_win.lo()) &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      /* likewise, the window must not end in the middle of a variable */
      if (reg_win.hi() < bounds.hi() && !reg_file.is_empty_or_blocked(reg_win.hi().advance(-1)) &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* second, check that we have at most k=num_moves elements in the window
       * and no element is larger than the currently processed one */
      unsigned k = op_moves; /* estimated move cost for this window */
      unsigned n = 0;        /* number of distinct variables to move */
      unsigned remaining_op_moves = op_moves;
      unsigned last_var = 0;
      bool found = true;
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      for (const PhysReg j : reg_win) {
         /* dead operands effectively reduce the number of estimated moves */
         if (is_killed_operand[j & 0xFF]) {
            if (remaining_op_moves) {
               k--;
               remaining_op_moves--;
            }
            continue;
         }

         /* skip free registers and registers of an already-counted variable */
         if (reg_file[j] == 0 || reg_file[j] == last_var)
            continue;

         if (reg_file[j] == 0xF0000000) {
            /* subdword-occupied dword counts as a single move */
            k += 1;
            n++;
            continue;
         }

         /* moving a same-or-larger variable would not make progress */
         if (ctx.assignments[reg_file[j]].rc.size() >= size) {
            found = false;
            break;
         }

         /* we cannot split live ranges of linear vgprs inside control flow */
         // TODO: ensure that live range splits inside control flow are never necessary
         if (!(ctx.block->kind & block_kind_top_level) &&
             ctx.assignments[reg_file[j]].rc.is_linear_vgpr()) {
            found = false;
            break;
         }

         k += ctx.assignments[reg_file[j]].rc.size();
         n++;
         last_var = reg_file[j];
      }

      /* keep the window with the fewest moves; ties prefer more (smaller)
       * variables and 4-dword-aligned positions for v4 */
      if (!found || k > num_moves)
         continue;
      if (k == num_moves && n < num_vars)
         continue;
      if (!aligned && k == num_moves && n == num_vars)
         continue;

      if (found) {
         best_win = reg_win;
         num_moves = k;
         num_vars = n;
      }
   }

   if (num_moves == 0xFF)
      return {{}, false};

   /* now, we figured the placement for our definition */
   RegisterFile tmp_file(reg_file);
   std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, tmp_file, best_win);

   if (instr->opcode == aco_opcode::p_create_vector) {
      /* move killed operands which aren't yet at the correct position (GFX9+)
       * or which are in the definition space */
      PhysReg reg = best_win.lo();
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef() && op.getTemp().type() == rc.type()) {
            if (op.physReg() != reg && (ctx.program->chip_class >= GFX9 ||
                                        (op.physReg().advance(op.bytes()) > best_win.lo() &&
                                         op.physReg() < best_win.hi()))) {
               vars.emplace(op.bytes(), op.tempId());
               tmp_file.clear(op);
            } else {
               tmp_file.fill(op);
            }
         }
         reg.reg_b += op.bytes();
      }
   } else if (!is_phi(instr)) {
      /* re-enable killed operands */
      for (Operand& op : instr->operands) {
         if (op.isTemp() && op.isFirstKillBeforeDef())
            tmp_file.fill(op);
      }
   }

   /* place the evicted variables; only commit the copies on success */
   std::vector<std::pair<Operand, Definition>> pc;
   if (!get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, best_win))
      return {{}, false};

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());

   adjust_max_used_regs(ctx, rc, best_win.lo());
   return {best_win.lo(), true};
}

/* Check whether the fixed register "reg" can legally hold a definition of
 * class rc (alignment, bounds, and current occupancy). Marks register usage
 * and returns true on success. */
bool
get_reg_specified(ra_ctx& ctx, RegisterFile& reg_file, RegClass rc, aco_ptr<Instruction>& instr,
                  PhysReg reg)
{
   /* catch out-of-range registers */
   if (reg >= PhysReg{512})
      return false;

   /* sub-dword definitions have instruction-specific (stride, size) requirements */
   std::pair<unsigned, unsigned> sdw_def_info;
   if (rc.is_subdword())
      sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);

   if (rc.is_subdword() && reg.byte() % sdw_def_info.first)
      return false;
   /* only sub-dword classes may start at a non-zero byte offset */
   if (!rc.is_subdword() && reg.byte())
      return false;

   if (rc.type() == RegType::sgpr && reg % get_stride(rc) != 0)
      return false;

   PhysRegInterval reg_win = {reg, rc.size()};
   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());
   PhysRegInterval vcc_win = {vcc, 2};
   /* VCC is outside the bounds */
   bool is_vcc = rc.type() == RegType::sgpr && vcc_win.contains(reg_win);
   bool is_m0 = rc == s1 && reg == m0;
   if (!bounds.contains(reg_win) && !is_vcc && !is_m0)
      return false;

   if (rc.is_subdword()) {
      /* test the whole aligned granule the definition writes */
      PhysReg test_reg;
      test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);
      if (reg_file.test(test_reg, sdw_def_info.second))
         return false;
   } else {
      if (reg_file.test(reg, rc.bytes()))
         return false;
   }

   adjust_max_used_regs(ctx, rc, reg_win.lo());
   return true;
}

/* Raise the program's register demand by one register of the given type, if
 * the hardware limit allows it. Returns false when the limit is reached. */
bool
increase_register_file(ra_ctx& ctx, RegType type)
{
   if (type == RegType::vgpr && ctx.program->max_reg_demand.vgpr < ctx.vgpr_limit) {
      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1,
                                                          ctx.program->max_reg_demand.sgpr));
   } else if (type == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
      update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr,
                                                          ctx.program->max_reg_demand.sgpr + 1));
   } else {
      return false;
   }
   return true;
}

/* helper pair: variable ID with its register class */
struct IDAndRegClass {
   IDAndRegClass(unsigned id_, RegClass rc_) : id(id_), rc(rc_) {}

   unsigned id;
   RegClass rc;
};

/* helper pair: variable ID with its placement info */
struct IDAndInfo {
   IDAndInfo(unsigned id_, DefInfo info_) : id(id_), info(info_) {}

   unsigned id;
   DefInfo info;
};

/* Reallocates vars by sorting them and placing each variable after the previous
 * one. If one of the variables has 0xffffffff as an ID, the register assigned
 * for that variable will be returned.
 */
PhysReg
compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
{
   /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
    * temporary sizes to dwords.
    */
   std::vector<IDAndInfo> sorted;
   for (IDAndRegClass var : vars) {
      DefInfo info(ctx, ctx.pseudo_dummy, var.rc, -1);
      sorted.emplace_back(var.id, info);
   }

   /* sort by descending stride (in bytes) so alignment padding is minimized,
    * then by current register for determinism */
   std::sort(
      sorted.begin(), sorted.end(),
      [&ctx](const IDAndInfo& a, const IDAndInfo& b)
      {
         unsigned a_stride = a.info.stride * (a.info.rc.is_subdword() ? 1 : 4);
         unsigned b_stride = b.info.stride * (b.info.rc.is_subdword() ? 1 : 4);
         if (a_stride > b_stride)
            return true;
         if (a_stride < b_stride)
            return false;
         if (a.id == 0xffffffff || b.id == 0xffffffff)
            return a.id ==
                   0xffffffff; /* place 0xffffffff before others if possible, not for any reason */
         return ctx.assignments[a.id].reg < ctx.assignments[b.id].reg;
      });

   /* pack the variables one after another, starting at "start" */
   PhysReg next_reg = start;
   PhysReg space_reg;
   for (IDAndInfo& var : sorted) {
      unsigned stride = var.info.rc.is_subdword() ? var.info.stride : var.info.stride * 4;
      next_reg.reg_b = align(next_reg.reg_b, MAX2(stride, 4));

      /* 0xffffffff is a special variable ID used to reserve a space for killed
       * operands and definitions.
       */
      if (var.id != 0xffffffff) {
         /* emit a copy only if the variable actually has to move */
         if (next_reg != ctx.assignments[var.id].reg) {
            RegClass rc = ctx.assignments[var.id].rc;
            Temp tmp(var.id, rc);

            Operand pc_op(tmp);
            pc_op.setFixed(ctx.assignments[var.id].reg);
            Definition pc_def(next_reg, rc);
            parallelcopies.emplace_back(pc_op, pc_def);
         }
      } else {
         space_reg = next_reg;
      }

      adjust_max_used_regs(ctx, var.info.rc, next_reg);

      next_reg = next_reg.advance(var.info.rc.size() * 4);
   }

   return space_reg;
}

/* Check whether the already-assigned MIMG vaddr operands (operands 3..n) still
 * form (or can still form) one contiguous, vector-aligned register sequence.
 * PhysReg{512} serves as a "no operand seen yet" sentinel. */
bool
is_mimg_vaddr_intact(ra_ctx& ctx, RegisterFile& reg_file, Instruction* instr)
{
   PhysReg first{512};
   for (unsigned i = 0; i < instr->operands.size() - 3u; i++) {
      Operand op = instr->operands[i + 3];

      if (ctx.assignments[op.tempId()].assigned) {
         PhysReg reg = ctx.assignments[op.tempId()].reg;

         if (first.reg() == 512) {
            /* first assigned operand determines where the whole vector would start */
            PhysRegInterval bounds = get_reg_bounds(ctx.program, RegType::vgpr);
            first = reg.advance(i * -4);
            PhysRegInterval vec = PhysRegInterval{first, instr->operands.size() - 3u};
            if (!bounds.contains(vec)) /* not enough space for other operands */
               return false;
         } else {
            if (reg != first.advance(i * 4)) /* not at the best position */
               return false;
         }
      } else {
         /* If there's an unexpected temporary, this operand is unlikely to be
          * placed in the best position.
          */
         if (first.reg() != 512 && reg_file.test(first.advance(i * 4), 4))
            return false;
      }
   }

   return true;
}

/* Try to place "temp" so that the p_create_vector (or MIMG vaddr) it feeds can
 * be formed without extra copies: either relative to an already-assigned
 * sibling operand, or in fresh space big enough for the whole vector. */
std::pair<PhysReg, bool>
get_reg_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp, aco_ptr<Instruction>& instr)
{
   Instruction* vec = ctx.vectors[temp.id()];
   /* MIMG vaddr starts at operand 3; plain vectors at operand 0 */
   unsigned first_operand = vec->format == Format::MIMG ? 3 : 0;
   unsigned our_offset = 0;
   for (unsigned i = first_operand; i < vec->operands.size(); i++) {
      Operand& op = vec->operands[i];
      if (op.isTemp() && op.tempId() == temp.id())
         break;
      else
         our_offset += op.bytes();
   }

   if (vec->format != Format::MIMG || is_mimg_vaddr_intact(ctx, reg_file, vec)) {
      unsigned their_offset = 0;
      /* check for every operand of the vector
       * - whether the operand is assigned and
       * - we can use the register relative to that operand
       */
      for (unsigned i = first_operand; i < vec->operands.size(); i++) {
         Operand& op = vec->operands[i];
         if (op.isTemp() && op.tempId() != temp.id() && op.getTemp().type() == temp.type() &&
             ctx.assignments[op.tempId()].assigned) {
            PhysReg reg = ctx.assignments[op.tempId()].reg;
            reg.reg_b += (our_offset - their_offset);
            if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
               return {reg, true};

            /* return if MIMG vaddr components don't remain vector-aligned */
            if (vec->format == Format::MIMG)
               return {{}, false};
         }
         their_offset += op.bytes();
      }

      /* We didn't find a register relative to other vector operands.
       * Try to find new space which fits the whole vector.
       */
      /* their_offset now equals the total byte size of the vector */
      RegClass vec_rc = RegClass::get(temp.type(), their_offset);
      DefInfo info(ctx, ctx.pseudo_dummy, vec_rc, -1);
      std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
      PhysReg reg = res.first;
      if (res.second) {
         reg.reg_b += our_offset;
         /* make sure to only use byte offset if the instruction supports it */
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
            return {reg, true};
      }
   }
   return {{}, false};
}

/* Main register-selection entry point for a definition: tries affinities and
 * vector hints, then a copy-free placement, then live-range splitting, and
 * finally grows the register file or compacts everything as a fallback. */
PhysReg
get_reg(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
        std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
        int operand_index = -1)
{
   /* if this temp is split later, try to place it where the split results want it */
   auto split_vec = ctx.split_vectors.find(temp.id());
   if (split_vec != ctx.split_vectors.end()) {
      unsigned offset = 0;
      for (Definition def : split_vec->second->definitions) {
         if (ctx.assignments[def.tempId()].affinity) {
            assignment& affinity = ctx.assignments[ctx.assignments[def.tempId()].affinity];
            if (affinity.assigned) {
               PhysReg reg = affinity.reg;
               reg.reg_b -= offset;
               if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, reg))
                  return reg;
            }
         }
         offset += def.bytes();
      }
   }

   /* honor a direct affinity if its partner already has a register */
   if (ctx.assignments[temp.id()].affinity) {
      assignment& affinity = ctx.assignments[ctx.assignments[temp.id()].affinity];
      if (affinity.assigned) {
         if (get_reg_specified(ctx, reg_file, temp.regClass(), instr, affinity.reg))
            return affinity.reg;
      }
   }

   std::pair<PhysReg, bool> res;

   if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
      res = get_reg_vector(ctx, reg_file, temp, instr);
      if (res.second)
         return res.first;
   }

   DefInfo info(ctx, instr, temp.regClass(), operand_index);

   if (!ctx.policy.skip_optimistic_path) {
      /* try to find space without live-range splits */
      res = get_reg_simple(ctx, reg_file, info);

      if (res.second)
         return res.first;
   }

   /* try to find space with live-range splits */
   res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);

   if (res.second)
      return res.first;

   /* try using more registers */

   /* We should only fail here because keeping under the limit would require
    * too many moves. */
   assert(reg_file.count_zero(info.bounds) >= info.size);

   if (!increase_register_file(ctx, info.rc.type())) {
      /* fallback algorithm: reallocate all variables at once */
      unsigned def_size = info.rc.size();
      for (Definition def : instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_size += def.regClass().size();
      }

      unsigned killed_op_size = 0;
      for (Operand op : instr->operands) {
         if (op.isTemp() && op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_size += op.regClass().size();
      }

      const PhysRegInterval regs = get_reg_bounds(ctx.program, info.rc.type());

      /* reallocate passthrough variables and non-killed operands */
      std::vector<IDAndRegClass> vars;
      for (const std::pair<unsigned, unsigned>& var : find_vars(ctx, reg_file, regs))
         vars.emplace_back(var.second, ctx.assignments[var.second].rc);
      /* reserve one contiguous hole big enough for operands or definitions */
      vars.emplace_back(0xffffffff, RegClass(info.rc.type(), MAX2(def_size, killed_op_size)));

      PhysReg space = compact_relocate_vars(ctx, vars, parallelcopies, regs.lo());

      /* reallocate killed operands */
      std::vector<IDAndRegClass> killed_op_vars;
      for (Operand op : instr->operands) {
         if (op.isKillBeforeDef() && op.regClass().type() == info.rc.type())
            killed_op_vars.emplace_back(op.tempId(), op.regClass());
      }
      compact_relocate_vars(ctx, killed_op_vars, parallelcopies, space);

      /* reallocate definitions */
      std::vector<IDAndRegClass> def_vars;
      for (Definition def :
instr->definitions) {
         if (ctx.assignments[def.tempId()].assigned && def.regClass().type() == info.rc.type())
            def_vars.emplace_back(def.tempId(), def.regClass());
      }
      def_vars.emplace_back(0xffffffff, info.rc);
      /* the 0xffffffff slot's register is where this definition goes */
      return compact_relocate_vars(ctx, def_vars, parallelcopies, space);
   }

   /* register file was enlarged: retry the normal path */
   return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
}

/* Choose a register for a p_create_vector definition, preferring positions
 * where killed operands already sit in their final vector slots. */
PhysReg
get_reg_create_vector(ra_ctx& ctx, RegisterFile& reg_file, Temp temp,
                      std::vector<std::pair<Operand, Definition>>& parallelcopies,
                      aco_ptr<Instruction>& instr)
{
   RegClass rc = temp.regClass();
   /* create_vector instructions have different costs w.r.t. register coalescing */
   uint32_t size = rc.size();
   uint32_t bytes = rc.bytes();
   uint32_t stride = get_stride(rc);
   PhysRegInterval bounds = get_reg_bounds(ctx.program, rc.type());

   // TODO: improve p_create_vector for sub-dword vectors

   PhysReg best_pos{0xFFF};
   unsigned num_moves = 0xFF; /* cost in bytes of the best candidate so far */
   bool best_avoid = true;

   /* test for each operand which definition placement causes the least shuffle instructions */
   for (unsigned i = 0, offset = 0; i < instr->operands.size();
        offset += instr->operands[i].bytes(), i++) {
      // TODO: think about, if we can alias live operands on the same register
      if (!instr->operands[i].isTemp() || !instr->operands[i].isKillBeforeDef() ||
          instr->operands[i].getTemp().type() != rc.type())
         continue;

      /* the candidate window starts at the operand's register minus its slot offset */
      if (offset > instr->operands[i].physReg().reg_b)
         continue;

      unsigned reg_lower = instr->operands[i].physReg().reg_b - offset;
      if (reg_lower % 4)
         continue;
      PhysRegInterval reg_win = {PhysReg{reg_lower / 4}, size};
      unsigned k = 0;

      /* no need to check multiple times */
      if (reg_win.lo() == best_pos)
         continue;

      /* check borders */
      // TODO: this can be improved */
      if (!bounds.contains(reg_win) || reg_win.lo() % stride != 0)
         continue;
      /* the window must not start or end in the middle of another variable */
      if (reg_win.lo() > bounds.lo() && reg_file[reg_win.lo()] != 0 &&
          reg_file.get_id(reg_win.lo()) == reg_file.get_id(reg_win.lo().advance(-1)))
         continue;
      if (reg_win.hi() < bounds.hi() && reg_file[reg_win.hi().advance(-4)] != 0 &&
          reg_file.get_id(reg_win.hi().advance(-1)) == reg_file.get_id(reg_win.hi()))
         continue;

      /* count variables to be moved and check "avoid" */
      bool avoid = false;
      bool linear_vgpr = false;
      for (PhysReg j : reg_win) {
         if (reg_file[j] != 0) {
            if (reg_file[j] == 0xF0000000) {
               /* subdword-occupied dword: count only the occupied bytes */
               PhysReg reg;
               reg.reg_b = j * 4;
               unsigned bytes_left = bytes - ((unsigned)j - reg_win.lo()) * 4;
               for (unsigned byte_idx = 0; byte_idx < MIN2(bytes_left, 4); byte_idx++, reg.reg_b++)
                  k += reg_file.test(reg, 1);
            } else {
               k += 4;
               linear_vgpr |= ctx.assignments[reg_file[j]].rc.is_linear_vgpr();
            }
         }
         avoid |= ctx.war_hint[j];
      }

      if (linear_vgpr) {
         /* we cannot split live ranges of linear vgprs inside control flow */
         if (ctx.block->kind & block_kind_top_level)
            avoid = true;
         else
            continue;
      }

      if (avoid && !best_avoid)
         continue;

      /* count operands in wrong positions */
      for (unsigned j = 0, offset2 = 0; j < instr->operands.size();
           offset2 += instr->operands[j].bytes(), j++) {
         if (j == i || !instr->operands[j].isTemp() ||
             instr->operands[j].getTemp().type() != rc.type())
            continue;
         if (instr->operands[j].physReg().reg_b != reg_win.lo() * 4 + offset2)
            k += instr->operands[j].bytes();
      }
      bool aligned = rc == RegClass::v4 && reg_win.lo() % 4 == 0;
      if (k > num_moves || (!aligned && k == num_moves))
         continue;

      best_pos = reg_win.lo();
      num_moves = k;
      best_avoid = avoid;
   }

   /* if no candidate beats simply copying every byte, use the generic path */
   if (num_moves >= bytes)
      return get_reg(ctx, reg_file, temp, parallelcopies, instr);

   /* re-enable killed operands which are in the wrong position */
   RegisterFile tmp_file(reg_file);
   for (unsigned i = 0, offset = 0; i < instr->operands.size();
        offset += instr->operands[i].bytes(), i++) {
      if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
          instr->operands[i].physReg().reg_b != best_pos.reg_b + offset)
         tmp_file.fill(instr->operands[i]);
   }

   /* collect variables to be moved */
   std::set<std::pair<unsigned, unsigned>> vars =
      collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});

   for (unsigned i = 0, offset = 0; i < instr->operands.size();
        offset += instr->operands[i].bytes(), i++) {
      if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
          instr->operands[i].getTemp().type() != rc.type())
         continue;
      bool correct_pos = instr->operands[i].physReg().reg_b == best_pos.reg_b + offset;
      /* GFX9+: move killed operands which aren't yet at the correct position
       * Moving all killed operands generally leads to more register swaps.
       * This is only done on GFX9+ because of the cheap v_swap instruction.
       */
      if (ctx.program->chip_class >= GFX9 && !correct_pos) {
         vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
         tmp_file.clear(instr->operands[i]);
         /* fill operands which are in the correct position to avoid overwriting */
      } else if (correct_pos) {
         tmp_file.fill(instr->operands[i]);
      }
   }
   bool success = false;
   std::vector<std::pair<Operand, Definition>> pc;
   success =
      get_regs_for_copies(ctx, tmp_file, pc, vars, bounds, instr, PhysRegInterval{best_pos, size});

   if (!success) {
      if (!increase_register_file(ctx, temp.type())) {
         /* use the fallback algorithm in get_reg() */
         return get_reg(ctx, reg_file, temp, parallelcopies, instr);
      }
      /* more registers are available now: retry from scratch */
      return get_reg_create_vector(ctx, reg_file, temp, parallelcopies, instr);
   }

   parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
   adjust_max_used_regs(ctx, rc, best_pos);

   return best_pos;
}

/* For pseudo instructions that are lowered with helper copies, record whether
 * SCC is live across the instruction and pick a free scratch SGPR for the
 * lowering code to use. */
void
handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
{
   if (instr->format != Format::PSEUDO)
      return;

   /* all instructions which use handle_operands() need this information */
   switch (instr->opcode) {
   case aco_opcode::p_extract_vector:
   case aco_opcode::p_create_vector:
   case aco_opcode::p_split_vector:
   case aco_opcode::p_parallelcopy:
   case aco_opcode::p_wqm: break;
   default: return;
   }

   bool writes_linear = false;
   /* if all definitions are logical vgpr, no need to care for SCC */
   for (Definition& def : instr->definitions) {
      if (def.getTemp().regClass().is_linear())
         writes_linear = true;
   }
   /* if all operands are constant, no need to care either */
   bool reads_linear = false;
   bool reads_subdword = false;
   for (Operand& op : instr->operands) {
      if (op.isTemp() && op.getTemp().regClass().is_linear())
         reads_linear = true;
      if (op.isTemp() && op.regClass().is_subdword())
         reads_subdword = true;
   }
   /* a scratch SGPR is needed to preserve SCC, or for sub-dword handling on GFX7 and older */
   bool needs_scratch_reg = (writes_linear && reads_linear && reg_file[scc]) ||
                            (ctx.program->chip_class <= GFX7 && reads_subdword);
   if (!needs_scratch_reg)
      return;

   instr->pseudo().tmp_in_scc = reg_file[scc];

   /* search downwards from the highest used SGPR for a free register */
   int reg = ctx.max_used_sgpr;
   for (; reg >= 0 && reg_file[PhysReg{(unsigned)reg}]; reg--)
      ;
   if (reg < 0) {
      /* no free SGPR below: search upwards instead */
      reg = ctx.max_used_sgpr + 1;
      for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
         ;
      if (reg == ctx.program->max_reg_demand.sgpr) {
         /* last resort: m0 (only valid when SCC need not be preserved) */
         assert(reads_subdword && reg_file[m0] == 0);
         reg = m0;
      }
   }

   adjust_max_used_regs(ctx, s1, reg);
   instr->pseudo().scratch_sgpr = PhysReg{(unsigned)reg};
}

/* Check hardware restrictions on which register an operand may occupy
 * (writelane m0 rules, sub-dword strides, SMEM special registers, ...). */
bool
operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
                    RegClass rc)
{
   if (instr->operands[idx].isFixed())
      return instr->operands[idx].physReg() == reg;

   bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
                       instr->opcode == aco_opcode::v_writelane_b32_e64;
   if (chip <= GFX9 && is_writelane && idx <= 1) {
      /* v_writelane_b32 can take two sgprs but only if one is m0.
*/ 1844 bool is_other_sgpr = 1845 instr->operands[!idx].isTemp() && 1846 (!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0); 1847 if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) { 1848 instr->operands[idx].setFixed(m0); 1849 return reg == m0; 1850 } 1851 } 1852 1853 if (reg.byte()) { 1854 unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc); 1855 if (reg.byte() % stride) 1856 return false; 1857 } 1858 1859 switch (instr->format) { 1860 case Format::SMEM: 1861 return reg != scc && reg != exec && 1862 (reg != m0 || idx == 1 || idx == 3) && /* offset can be m0 */ 1863 (reg != vcc || (instr->definitions.empty() && idx == 2) || 1864 chip >= GFX10); /* sdata can be vcc */ 1865 default: 1866 // TODO: there are more instructions with restrictions on registers 1867 return true; 1868 } 1869} 1870 1871void 1872get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, 1873 std::vector<std::pair<Operand, Definition>>& parallelcopy, 1874 aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index) 1875{ 1876 /* check if the operand is fixed */ 1877 PhysReg src = ctx.assignments[operand.tempId()].reg; 1878 PhysReg dst; 1879 if (operand.isFixed()) { 1880 assert(operand.physReg() != src); 1881 1882 /* check if target reg is blocked, and move away the blocking var */ 1883 if (register_file.test(operand.physReg(), operand.bytes())) { 1884 PhysRegInterval target{operand.physReg(), operand.size()}; 1885 1886 RegisterFile tmp_file(register_file); 1887 1888 std::set<std::pair<unsigned, unsigned>> blocking_vars = 1889 collect_vars(ctx, tmp_file, target); 1890 1891 tmp_file.clear(src, operand.regClass()); // TODO: try to avoid moving block vars to src 1892 tmp_file.block(operand.physReg(), operand.regClass()); 1893 1894 DefInfo info(ctx, instr, operand.regClass(), -1); 1895 get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, info.bounds, instr, 1896 PhysRegInterval()); 1897 } 1898 dst = 
operand.physReg(); 1899 1900 } else { 1901 /* clear the operand in case it's only a stride mismatch */ 1902 register_file.clear(src, operand.regClass()); 1903 dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index); 1904 } 1905 1906 Operand pc_op = operand; 1907 pc_op.setFixed(src); 1908 Definition pc_def = Definition(dst, pc_op.regClass()); 1909 parallelcopy.emplace_back(pc_op, pc_def); 1910 update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops | fill_killed_ops); 1911} 1912 1913void 1914get_regs_for_phis(ra_ctx& ctx, Block& block, RegisterFile& register_file, 1915 std::vector<aco_ptr<Instruction>>& instructions, IDSet& live_in) 1916{ 1917 /* assign phis with all-matching registers to that register */ 1918 for (aco_ptr<Instruction>& phi : block.instructions) { 1919 if (!is_phi(phi)) 1920 break; 1921 Definition& definition = phi->definitions[0]; 1922 if (definition.isKill() || definition.isFixed()) 1923 continue; 1924 1925 if (!phi->operands[0].isTemp()) 1926 continue; 1927 1928 PhysReg reg = phi->operands[0].physReg(); 1929 auto OpsSame = [=](const Operand& op) -> bool 1930 { return op.isTemp() && (!op.isFixed() || op.physReg() == reg); }; 1931 bool all_same = std::all_of(phi->operands.cbegin() + 1, phi->operands.cend(), OpsSame); 1932 if (!all_same) 1933 continue; 1934 1935 if (!get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) 1936 continue; 1937 1938 definition.setFixed(reg); 1939 register_file.fill(definition); 1940 ctx.assignments[definition.tempId()].set(definition); 1941 } 1942 1943 /* try to find a register that is used by at least one operand */ 1944 for (aco_ptr<Instruction>& phi : block.instructions) { 1945 if (!is_phi(phi)) 1946 break; 1947 Definition& definition = phi->definitions[0]; 1948 if (definition.isKill() || definition.isFixed()) 1949 continue; 1950 1951 /* use affinity if available */ 1952 if (ctx.assignments[definition.tempId()].affinity && 1953 
          ctx.assignments[ctx.assignments[definition.tempId()].affinity].assigned) {
         assignment& affinity = ctx.assignments[ctx.assignments[definition.tempId()].affinity];
         assert(affinity.rc == definition.regClass());
         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, affinity.reg)) {
            definition.setFixed(affinity.reg);
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            continue;
         }
      }

      /* by going backwards, we aim to avoid copies in else-blocks */
      for (int i = phi->operands.size() - 1; i >= 0; i--) {
         const Operand& op = phi->operands[i];
         if (!op.isTemp() || !op.isFixed())
            continue;

         PhysReg reg = op.physReg();
         if (get_reg_specified(ctx, register_file, definition.regClass(), phi, reg)) {
            definition.setFixed(reg);
            register_file.fill(definition);
            ctx.assignments[definition.tempId()].set(definition);
            break;
         }
      }
   }

   /* find registers for phis where the register was blocked or no operand was assigned */
   for (aco_ptr<Instruction>& phi : block.instructions) {
      if (!is_phi(phi))
         break;

      Definition& definition = phi->definitions[0];
      if (definition.isKill())
         continue;

      if (definition.isFixed()) {
         instructions.emplace_back(std::move(phi));
         continue;
      }

      std::vector<std::pair<Operand, Definition>> parallelcopy;
      definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, phi));
      update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);

      /* process parallelcopy */
      for (std::pair<Operand, Definition> pc : parallelcopy) {
         /* see if it's a copy from a different phi */
         // TODO: prefer moving some previous phis over live-ins
         // TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
         // problem in practice since they can only be fixed to exec)
         Instruction* prev_phi = NULL;
         std::vector<aco_ptr<Instruction>>::iterator phi_it;
         for (phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
            if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
               prev_phi = phi_it->get();
         }
         if (prev_phi) {
            /* if so, just update that phi's register */
            prev_phi->definitions[0].setFixed(pc.second.physReg());
            ctx.assignments[prev_phi->definitions[0].tempId()].set(pc.second);
            continue;
         }

         /* rename */
         std::unordered_map<unsigned, Temp>::iterator orig_it =
            ctx.orig_names.find(pc.first.tempId());
         Temp orig = pc.first.getTemp();
         if (orig_it != ctx.orig_names.end())
            orig = orig_it->second;
         else
            ctx.orig_names[pc.second.tempId()] = orig;
         ctx.renames[block.index][orig.id()] = pc.second.getTemp();

         /* otherwise, this is a live-in and we need to create a new phi
          * to move it in this block's predecessors */
         aco_opcode opcode =
            pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
         std::vector<unsigned>& preds =
            pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
         aco_ptr<Instruction> new_phi{
            create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
         new_phi->definitions[0] = pc.second;
         for (unsigned i = 0; i < preds.size(); i++)
            new_phi->operands[i] = Operand(pc.first);
         instructions.emplace_back(std::move(new_phi));

         /* Remove from live_out_per_block (now used for live-in), because handle_loop_phis()
          * would re-create this phi later if this is a loop header.
          */
         live_in.erase(orig.id());
      }

      register_file.fill(definition);
      ctx.assignments[definition.tempId()].set(definition);
      instructions.emplace_back(std::move(phi));
   }
}

/* Return the current name of `val` within block `block_idx`: either the rename
 * recorded for that block or, if none exists, the temp itself.
 */
Temp
read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
{
   std::unordered_map<unsigned, Temp>::iterator it = ctx.renames[block_idx].find(val.id());
   if (it == ctx.renames[block_idx].end())
      return val;
   else
      return it->second;
}

/* Resolve the name of live-in `val` at the start of `block`. If the
 * predecessors disagree on the rename, SSA is repaired by inserting a new phi
 * at the beginning of the block and returning its (freshly allocated) result.
 */
Temp
handle_live_in(ra_ctx& ctx, Temp val, Block* block)
{
   std::vector<unsigned>& preds = val.is_linear() ? block->linear_preds : block->logical_preds;
   if (preds.size() == 0)
      return val;

   if (preds.size() == 1) {
      /* if the block has only one predecessor, just look there for the name */
      return read_variable(ctx, val, preds[0]);
   }

   /* there are multiple predecessors and the block is sealed */
   Temp* const ops = (Temp*)alloca(preds.size() * sizeof(Temp));

   /* get the rename from each predecessor and check if they are the same */
   Temp new_val;
   bool needs_phi = false;
   for (unsigned i = 0; i < preds.size(); i++) {
      ops[i] = read_variable(ctx, val, preds[i]);
      if (i == 0)
         new_val = ops[i];
      else
         needs_phi |= !(new_val == ops[i]);
   }

   if (needs_phi) {
      assert(!val.regClass().is_linear_vgpr());

      /* the variable has been renamed differently in the predecessors: we need to insert a phi */
      aco_opcode opcode = val.is_linear() ?
                             aco_opcode::p_linear_phi : aco_opcode::p_phi;
      aco_ptr<Instruction> phi{
         create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, preds.size(), 1)};
      new_val = ctx.program->allocateTmp(val.regClass());
      phi->definitions[0] = Definition(new_val);
      ctx.assignments.emplace_back();
      assert(ctx.assignments.size() == ctx.program->peekAllocationId());
      for (unsigned i = 0; i < preds.size(); i++) {
         /* update the operands so that it uses the new affinity */
         phi->operands[i] = Operand(ops[i]);
         assert(ctx.assignments[ops[i].id()].assigned);
         assert(ops[i].regClass() == new_val.regClass());
         phi->operands[i].setFixed(ctx.assignments[ops[i].id()].reg);
      }
      block->instructions.insert(block->instructions.begin(), std::move(phi));
   }

   return new_val;
}

/* Fix up a finished loop [loop_header_idx, loop_exit_idx): variables that were
 * renamed inside the loop get header phis (via handle_live_in()), the rename
 * maps of all loop blocks are updated, and the new names are propagated to all
 * uses inside the loop.
 */
void
handle_loop_phis(ra_ctx& ctx, const IDSet& live_in, uint32_t loop_header_idx,
                 uint32_t loop_exit_idx)
{
   Block& loop_header = ctx.program->blocks[loop_header_idx];
   std::unordered_map<unsigned, Temp> renames;

   /* create phis for variables renamed during the loop */
   for (unsigned t : live_in) {
      Temp val = Temp(t, ctx.program->temp_rc[t]);
      /* name of the variable in the preheader vs. after running the loop body */
      Temp prev = read_variable(ctx, val, loop_header_idx - 1);
      Temp renamed = handle_live_in(ctx, val, &loop_header);
      if (renamed == prev)
         continue;

      /* insert additional renames at block end, but don't overwrite */
      renames[prev.id()] = renamed;
      ctx.orig_names[renamed.id()] = val;
      for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
         auto it = ctx.renames[idx].emplace(val.id(), renamed);
         /* if insertion is unsuccessful, update if necessary */
         if (!it.second && it.first->second == prev)
            it.first->second = renamed;
      }

      /* update loop-carried values of the phi created by handle_live_in() */
      for (unsigned i = 1; i < loop_header.instructions[0]->operands.size(); i++) {
         Operand& op = loop_header.instructions[0]->operands[i];
         if (op.getTemp() == prev)
            op.setTemp(renamed);
      }

      /* use the assignment from the loop preheader and fix def reg */
      assignment& var = ctx.assignments[prev.id()];
      ctx.assignments[renamed.id()] = var;
      loop_header.instructions[0]->definitions[0].setFixed(var.reg);
   }

   /* rename loop carried phi operands */
   /* NOTE(review): the first renames.size() instructions appear to be the phis
    * handle_live_in() just inserted at the block start, hence the start index. */
   for (unsigned i = renames.size(); i < loop_header.instructions.size(); i++) {
      aco_ptr<Instruction>& phi = loop_header.instructions[i];
      if (!is_phi(phi))
         break;
      const std::vector<unsigned>& preds =
         phi->opcode == aco_opcode::p_phi ? loop_header.logical_preds : loop_header.linear_preds;
      for (unsigned j = 1; j < phi->operands.size(); j++) {
         Operand& op = phi->operands[j];
         if (!op.isTemp())
            continue;

         /* Find the original name, since this operand might not use the original name if the phi
          * was created after init_reg_file().
          */
         std::unordered_map<unsigned, Temp>::iterator it = ctx.orig_names.find(op.tempId());
         Temp orig = it != ctx.orig_names.end() ? it->second : op.getTemp();

         op.setTemp(read_variable(ctx, orig, preds[j]));
         op.setFixed(ctx.assignments[op.tempId()].reg);
      }
   }

   /* return early if no new phi was created */
   if (renames.empty())
      return;

   /* propagate new renames through loop */
   for (unsigned idx = loop_header_idx; idx < loop_exit_idx; idx++) {
      Block& current = ctx.program->blocks[idx];
      /* rename all uses in this block */
      for (aco_ptr<Instruction>& instr : current.instructions) {
         /* phis are renamed after RA */
         if (idx == loop_header_idx && is_phi(instr))
            continue;

         for (Operand& op : instr->operands) {
            if (!op.isTemp())
               continue;

            auto rename = renames.find(op.tempId());
            if (rename != renames.end()) {
               assert(rename->second.id());
               op.setTemp(rename->second);
            }
         }
      }
   }
}

/**
 * This function serves the purpose to correctly initialize the register file
 * at the beginning of a block (before any existing phis).
 * In order to do so, all live-in variables are entered into the RegisterFile.
 * Reg-to-reg moves (renames) from previous blocks are taken into account and
 * the SSA is repaired by inserting corresponding phi-nodes.
 */
RegisterFile
init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block& block)
{
   /* A loop-exit block first finalizes the loop it leaves (see handle_loop_phis()). */
   if (block.kind & block_kind_loop_exit) {
      uint32_t header = ctx.loop_header.back();
      ctx.loop_header.pop_back();
      handle_loop_phis(ctx, live_out_per_block[header], header, block.index);
   }

   RegisterFile register_file;
   const IDSet& live_in = live_out_per_block[block.index];
   assert(block.index != 0 || live_in.empty());

   if (block.kind & block_kind_loop_header) {
      ctx.loop_header.emplace_back(block.index);
      /* already rename phis incoming value */
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (!is_phi(instr))
            break;
         Operand& operand = instr->operands[0];
         if (operand.isTemp()) {
            operand.setTemp(read_variable(ctx, operand.getTemp(), block.index - 1));
            operand.setFixed(ctx.assignments[operand.tempId()].reg);
         }
      }
      /* live-ins of a loop header are simply taken from the preheader's renames */
      for (unsigned t : live_in) {
         Temp val = Temp(t, ctx.program->temp_rc[t]);
         Temp renamed = read_variable(ctx, val, block.index - 1);
         if (renamed != val)
            ctx.renames[block.index][val.id()] = renamed;
         assignment& var = ctx.assignments[renamed.id()];
         assert(var.assigned);
         register_file.fill(Definition(renamed.id(), var.reg, var.rc));
      }
   } else {
      /* rename phi operands */
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (!is_phi(instr))
            break;
         const std::vector<unsigned>& preds =
            instr->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;

         for (unsigned i = 0; i < instr->operands.size(); i++) {
            Operand& operand = instr->operands[i];
            if (!operand.isTemp())
               continue;
            operand.setTemp(read_variable(ctx, operand.getTemp(), preds[i]));
            operand.setFixed(ctx.assignments[operand.tempId()].reg);
         }
      }
      for (unsigned t : live_in) {
         Temp val = Temp(t, ctx.program->temp_rc[t]);
         Temp renamed = handle_live_in(ctx, val, &block);
         assignment& var = ctx.assignments[renamed.id()];
         /* due to live-range splits, the live-in might be a phi, now */
         if (var.assigned) {
            register_file.fill(Definition(renamed.id(), var.reg, var.rc));
         }
         if (renamed != val) {
            ctx.renames[block.index].emplace(t, renamed);
            ctx.orig_names[renamed.id()] = val;
         }
      }
   }

   return register_file;
}

/* Walk the program backwards to record which temps would best share a
 * register: phi results with their operands, p_create_vector sources with the
 * vector, trailing MIMG operands, and certain read-modify-write style
 * instructions (mac/writelane/etc.). The merge sets are finally stored as
 * `affinity` links in ctx.assignments.
 */
void
get_affinities(ra_ctx& ctx, std::vector<IDSet>& live_out_per_block)
{
   std::vector<std::vector<Temp>> phi_ressources;
   std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;

   for (auto block_rit = ctx.program->blocks.rbegin(); block_rit != ctx.program->blocks.rend();
        block_rit++) {
      Block& block = *block_rit;

      /* first, compute the death points of all live vars within the block */
      IDSet& live = live_out_per_block[block.index];

      std::vector<aco_ptr<Instruction>>::reverse_iterator rit;
      for (rit = block.instructions.rbegin(); rit != block.instructions.rend(); ++rit) {
         aco_ptr<Instruction>& instr = *rit;
         if (is_phi(instr))
            break;

         /* add vector affinities */
         if (instr->opcode == aco_opcode::p_create_vector) {
            for (const Operand& op : instr->operands) {
               if (op.isTemp() && op.isFirstKill() &&
                   op.getTemp().type() == instr->definitions[0].getTemp().type())
                  ctx.vectors[op.tempId()] = instr.get();
            }
         } else if (instr->format == Format::MIMG && instr->operands.size() > 4)
         {
            /* operands 3+ of a NSA MIMG instruction form the address vector */
            for (unsigned i = 3; i < instr->operands.size(); i++)
               ctx.vectors[instr->operands[i].tempId()] = instr.get();
         }

         if (instr->opcode == aco_opcode::p_split_vector &&
             instr->operands[0].isFirstKillBeforeDef())
            ctx.split_vectors[instr->operands[0].tempId()] = instr.get();

         /* add operands to live variables */
         for (const Operand& op : instr->operands) {
            if (op.isTemp())
               live.insert(op.tempId());
         }

         /* erase definitions from live */
         for (unsigned i = 0; i < instr->definitions.size(); i++) {
            const Definition& def = instr->definitions[i];
            if (!def.isTemp())
               continue;
            live.erase(def.tempId());
            /* mark last-seen phi operand */
            std::unordered_map<unsigned, unsigned>::iterator it =
               temp_to_phi_ressources.find(def.tempId());
            if (it != temp_to_phi_ressources.end() &&
                def.regClass() == phi_ressources[it->second][0].regClass()) {
               phi_ressources[it->second][0] = def.getTemp();
               /* try to coalesce phi affinities with parallelcopies */
               Operand op = Operand();
               switch (instr->opcode) {
               case aco_opcode::p_parallelcopy: op = instr->operands[i]; break;

               case aco_opcode::v_interp_p2_f32:
               case aco_opcode::v_writelane_b32:
               case aco_opcode::v_writelane_b32_e64: op = instr->operands[2]; break;

               case aco_opcode::v_fma_f32:
               case aco_opcode::v_fma_f16:
               case aco_opcode::v_pk_fma_f16:
                  /* fma can only take the mac-like VOP2 form on GFX10+ */
                  if (ctx.program->chip_class < GFX10)
                     continue;
                  FALLTHROUGH;
               case aco_opcode::v_mad_f32:
               case aco_opcode::v_mad_f16:
                  if (instr->usesModifiers())
                     continue;
                  op = instr->operands[2];
                  break;

               default: continue;
               }

               if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
                  phi_ressources[it->second].emplace_back(op.getTemp());
                  temp_to_phi_ressources[op.tempId()] = it->second;
               }
            }
         }
      }

      /* collect phi affinities */
      for (; rit != block.instructions.rend(); ++rit) {
         aco_ptr<Instruction>& instr = *rit;
         assert(is_phi(instr));

         live.erase(instr->definitions[0].tempId());
         if (instr->definitions[0].isKill() || instr->definitions[0].isFixed())
            continue;

         assert(instr->definitions[0].isTemp());
         std::unordered_map<unsigned, unsigned>::iterator it =
            temp_to_phi_ressources.find(instr->definitions[0].tempId());
         unsigned index = phi_ressources.size();
         std::vector<Temp>* affinity_related;
         if (it != temp_to_phi_ressources.end()) {
            /* join the phi into an existing merge set; slot 0 tracks the representative */
            index = it->second;
            phi_ressources[index][0] = instr->definitions[0].getTemp();
            affinity_related = &phi_ressources[index];
         } else {
            phi_ressources.emplace_back(std::vector<Temp>{instr->definitions[0].getTemp()});
            affinity_related = &phi_ressources.back();
         }

         for (const Operand& op : instr->operands) {
            if (op.isTemp() && op.isKill() && op.regClass() == instr->definitions[0].regClass()) {
               affinity_related->emplace_back(op.getTemp());
               if (block.kind & block_kind_loop_header)
                  continue;
               temp_to_phi_ressources[op.tempId()] = index;
            }
         }
      }

      /* visit the loop header phis first in order to create nested affinities */
      if (block.kind & block_kind_loop_exit) {
         /* find loop header */
         auto header_rit = block_rit;
         while ((header_rit + 1)->loop_nest_depth > block.loop_nest_depth)
            header_rit++;

         for (aco_ptr<Instruction>& phi : header_rit->instructions) {
            if (!is_phi(phi))
               break;
            if (phi->definitions[0].isKill() || phi->definitions[0].isFixed())
               continue;

            /* create an (empty) merge-set for the phi-related variables */
            auto it = temp_to_phi_ressources.find(phi->definitions[0].tempId());
            unsigned index = phi_ressources.size();
            if (it == temp_to_phi_ressources.end()) {
               temp_to_phi_ressources[phi->definitions[0].tempId()] = index;
               phi_ressources.emplace_back(std::vector<Temp>{phi->definitions[0].getTemp()});
            } else {
               index = it->second;
            }
            for (unsigned i = 1; i < phi->operands.size(); i++) {
               const Operand& op = phi->operands[i];
               if (op.isTemp() && op.isKill() && op.regClass() == phi->definitions[0].regClass()) {
                  temp_to_phi_ressources[op.tempId()] = index;
               }
            }
         }
      }
   }
   /* create affinities */
   for (std::vector<Temp>& vec : phi_ressources) {
      for (unsigned i = 1; i < vec.size(); i++)
         if (vec[i].id() != vec[0].id())
            ctx.assignments[vec[i].id()].affinity = vec[0].id();
   }
}

} /* end namespace */

void
register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra_test_policy policy)
{
   ra_ctx ctx(program, policy);
   get_affinities(ctx, live_out_per_block);

   /* state of register file after phis */
   std::vector<std::bitset<128>> sgpr_live_in(program->blocks.size());

   for (Block& block : program->blocks) {
      ctx.block = &block;

      /* initialize register file */
      RegisterFile register_file = init_reg_file(ctx, live_out_per_block, block);
      ctx.war_hint.reset();

      std::vector<aco_ptr<Instruction>> instructions;

      /* this is a slight adjustment from the paper as we already have phi nodes:
       * We consider them incomplete phis and only handle the definition.
*/ 2455 get_regs_for_phis(ctx, block, register_file, instructions, live_out_per_block[block.index]); 2456 2457 /* fill in sgpr_live_in */ 2458 for (unsigned i = 0; i <= ctx.max_used_sgpr; i++) 2459 sgpr_live_in[block.index][i] = register_file[PhysReg{i}]; 2460 sgpr_live_in[block.index][127] = register_file[scc]; 2461 2462 /* Handle all other instructions of the block */ 2463 auto NonPhi = [](aco_ptr<Instruction>& instr) -> bool { return instr && !is_phi(instr); }; 2464 std::vector<aco_ptr<Instruction>>::iterator instr_it = 2465 std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi); 2466 for (; instr_it != block.instructions.end(); ++instr_it) { 2467 aco_ptr<Instruction>& instr = *instr_it; 2468 2469 /* parallelcopies from p_phi are inserted here which means 2470 * live ranges of killed operands end here as well */ 2471 if (instr->opcode == aco_opcode::p_logical_end) { 2472 /* no need to process this instruction any further */ 2473 if (block.logical_succs.size() != 1) { 2474 instructions.emplace_back(std::move(instr)); 2475 continue; 2476 } 2477 2478 Block& succ = program->blocks[block.logical_succs[0]]; 2479 unsigned idx = 0; 2480 for (; idx < succ.logical_preds.size(); idx++) { 2481 if (succ.logical_preds[idx] == block.index) 2482 break; 2483 } 2484 for (aco_ptr<Instruction>& phi : succ.instructions) { 2485 if (phi->opcode == aco_opcode::p_phi) { 2486 if (phi->operands[idx].isTemp() && 2487 phi->operands[idx].getTemp().type() == RegType::sgpr && 2488 phi->operands[idx].isFirstKillBeforeDef()) { 2489 Definition phi_op( 2490 read_variable(ctx, phi->operands[idx].getTemp(), block.index)); 2491 phi_op.setFixed(ctx.assignments[phi_op.tempId()].reg); 2492 register_file.clear(phi_op); 2493 } 2494 } else if (phi->opcode != aco_opcode::p_linear_phi) { 2495 break; 2496 } 2497 } 2498 instructions.emplace_back(std::move(instr)); 2499 continue; 2500 } 2501 2502 std::vector<std::pair<Operand, Definition>> parallelcopy; 2503 2504 assert(!is_phi(instr)); 2505 
2506 bool temp_in_scc = register_file[scc]; 2507 2508 /* handle operands */ 2509 for (unsigned i = 0; i < instr->operands.size(); ++i) { 2510 auto& operand = instr->operands[i]; 2511 if (!operand.isTemp()) 2512 continue; 2513 2514 /* rename operands */ 2515 operand.setTemp(read_variable(ctx, operand.getTemp(), block.index)); 2516 assert(ctx.assignments[operand.tempId()].assigned); 2517 2518 PhysReg reg = ctx.assignments[operand.tempId()].reg; 2519 if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass())) 2520 operand.setFixed(reg); 2521 else 2522 get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i); 2523 2524 if (instr->isEXP() || (instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) || 2525 (instr->isDS() && instr->ds().gds)) { 2526 for (unsigned j = 0; j < operand.size(); j++) 2527 ctx.war_hint.set(operand.physReg().reg() + j); 2528 } 2529 } 2530 2531 /* remove dead vars from register file */ 2532 for (const Operand& op : instr->operands) { 2533 if (op.isTemp() && op.isFirstKillBeforeDef()) 2534 register_file.clear(op); 2535 } 2536 2537 /* try to optimize v_mad_f32 -> v_mac_f32 */ 2538 if ((instr->opcode == aco_opcode::v_mad_f32 || 2539 (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) || 2540 instr->opcode == aco_opcode::v_mad_f16 || 2541 instr->opcode == aco_opcode::v_mad_legacy_f16 || 2542 (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10) || 2543 (instr->opcode == aco_opcode::v_pk_fma_f16 && program->chip_class >= GFX10) || 2544 (instr->opcode == aco_opcode::v_dot4_i32_i8 && program->family != CHIP_VEGA20)) && 2545 instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() && 2546 instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() && 2547 instr->operands[1].getTemp().type() == RegType::vgpr && !instr->usesModifiers() && 2548 instr->operands[0].physReg().byte() == 0 && instr->operands[1].physReg().byte() == 0 && 2549 
instr->operands[2].physReg().byte() == 0) { 2550 unsigned def_id = instr->definitions[0].tempId(); 2551 bool use_vop2 = true; 2552 if (ctx.assignments[def_id].affinity) { 2553 assignment& affinity = ctx.assignments[ctx.assignments[def_id].affinity]; 2554 if (affinity.assigned && affinity.reg != instr->operands[2].physReg() && 2555 !register_file.test(affinity.reg, instr->operands[2].bytes())) 2556 use_vop2 = false; 2557 } 2558 if (use_vop2) { 2559 instr->format = Format::VOP2; 2560 switch (instr->opcode) { 2561 case aco_opcode::v_mad_f32: instr->opcode = aco_opcode::v_mac_f32; break; 2562 case aco_opcode::v_fma_f32: instr->opcode = aco_opcode::v_fmac_f32; break; 2563 case aco_opcode::v_mad_f16: 2564 case aco_opcode::v_mad_legacy_f16: instr->opcode = aco_opcode::v_mac_f16; break; 2565 case aco_opcode::v_fma_f16: instr->opcode = aco_opcode::v_fmac_f16; break; 2566 case aco_opcode::v_pk_fma_f16: instr->opcode = aco_opcode::v_pk_fmac_f16; break; 2567 case aco_opcode::v_dot4_i32_i8: instr->opcode = aco_opcode::v_dot4c_i32_i8; break; 2568 default: break; 2569 } 2570 } 2571 } 2572 2573 /* handle definitions which must have the same register as an operand */ 2574 if (instr->opcode == aco_opcode::v_interp_p2_f32 || 2575 instr->opcode == aco_opcode::v_mac_f32 || instr->opcode == aco_opcode::v_fmac_f32 || 2576 instr->opcode == aco_opcode::v_mac_f16 || instr->opcode == aco_opcode::v_fmac_f16 || 2577 instr->opcode == aco_opcode::v_pk_fmac_f16 || 2578 instr->opcode == aco_opcode::v_writelane_b32 || 2579 instr->opcode == aco_opcode::v_writelane_b32_e64 || 2580 instr->opcode == aco_opcode::v_dot4c_i32_i8) { 2581 instr->definitions[0].setFixed(instr->operands[2].physReg()); 2582 } else if (instr->opcode == aco_opcode::s_addk_i32 || 2583 instr->opcode == aco_opcode::s_mulk_i32) { 2584 instr->definitions[0].setFixed(instr->operands[0].physReg()); 2585 } else if (instr->isMUBUF() && instr->definitions.size() == 1 && 2586 instr->operands.size() == 4) { 2587 
instr->definitions[0].setFixed(instr->operands[3].physReg()); 2588 } else if (instr->isMIMG() && instr->definitions.size() == 1 && 2589 !instr->operands[2].isUndefined()) { 2590 instr->definitions[0].setFixed(instr->operands[2].physReg()); 2591 } 2592 2593 ctx.defs_done.reset(); 2594 2595 /* handle fixed definitions first */ 2596 for (unsigned i = 0; i < instr->definitions.size(); ++i) { 2597 auto& definition = instr->definitions[i]; 2598 if (!definition.isFixed()) 2599 continue; 2600 2601 adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); 2602 /* check if the target register is blocked */ 2603 if (register_file.test(definition.physReg(), definition.bytes())) { 2604 const PhysRegInterval def_regs{definition.physReg(), definition.size()}; 2605 2606 /* create parallelcopy pair to move blocking vars */ 2607 std::set<std::pair<unsigned, unsigned>> vars = 2608 collect_vars(ctx, register_file, def_regs); 2609 2610 RegisterFile tmp_file(register_file); 2611 /* re-enable the killed operands, so that we don't move the blocking vars there */ 2612 for (const Operand& op : instr->operands) { 2613 if (op.isTemp() && op.isFirstKillBeforeDef()) 2614 tmp_file.fill(op); 2615 } 2616 2617 ASSERTED bool success = false; 2618 DefInfo info(ctx, instr, definition.regClass(), -1); 2619 success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, info.bounds, instr, 2620 def_regs); 2621 assert(success); 2622 2623 update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); 2624 } 2625 ctx.defs_done.set(i); 2626 2627 if (!definition.isTemp()) 2628 continue; 2629 2630 ctx.assignments[definition.tempId()].set(definition); 2631 register_file.fill(definition); 2632 } 2633 2634 /* handle all other definitions */ 2635 for (unsigned i = 0; i < instr->definitions.size(); ++i) { 2636 Definition* definition = &instr->definitions[i]; 2637 2638 if (definition->isFixed() || !definition->isTemp()) 2639 continue; 2640 2641 /* find free reg */ 2642 if 
(definition->hasHint() && 2643 get_reg_specified(ctx, register_file, definition->regClass(), instr, 2644 definition->physReg())) { 2645 definition->setFixed(definition->physReg()); 2646 } else if (instr->opcode == aco_opcode::p_split_vector) { 2647 PhysReg reg = instr->operands[0].physReg(); 2648 for (unsigned j = 0; j < i; j++) 2649 reg.reg_b += instr->definitions[j].bytes(); 2650 if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg)) 2651 definition->setFixed(reg); 2652 } else if (instr->opcode == aco_opcode::p_wqm || 2653 instr->opcode == aco_opcode::p_parallelcopy) { 2654 PhysReg reg = instr->operands[i].physReg(); 2655 if (instr->operands[i].isTemp() && 2656 instr->operands[i].getTemp().type() == definition->getTemp().type() && 2657 !register_file.test(reg, definition->bytes())) 2658 definition->setFixed(reg); 2659 } else if (instr->opcode == aco_opcode::p_extract_vector) { 2660 PhysReg reg = instr->operands[0].physReg(); 2661 reg.reg_b += definition->bytes() * instr->operands[1].constantValue(); 2662 if (get_reg_specified(ctx, register_file, definition->regClass(), instr, reg)) 2663 definition->setFixed(reg); 2664 } else if (instr->opcode == aco_opcode::p_create_vector) { 2665 PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(), 2666 parallelcopy, instr); 2667 update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); 2668 definition->setFixed(reg); 2669 } 2670 2671 if (!definition->isFixed()) { 2672 Temp tmp = definition->getTemp(); 2673 if (definition->regClass().is_subdword() && definition->bytes() < 4) { 2674 PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr); 2675 definition->setFixed(reg); 2676 if (reg.byte() || register_file.test(reg, 4)) { 2677 add_subdword_definition(program, instr, reg); 2678 definition = &instr->definitions[i]; /* add_subdword_definition can invalidate 2679 the reference */ 2680 } 2681 } else { 2682 definition->setFixed(get_reg(ctx, register_file, 
tmp, parallelcopy, instr)); 2683 } 2684 update_renames(ctx, register_file, parallelcopy, instr, 2685 instr->opcode != aco_opcode::p_create_vector ? rename_not_killed_ops 2686 : (UpdateRenames)0); 2687 } 2688 2689 assert( 2690 definition->isFixed() && 2691 ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) || 2692 (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256))); 2693 ctx.defs_done.set(i); 2694 ctx.assignments[definition->tempId()].set(*definition); 2695 register_file.fill(*definition); 2696 } 2697 2698 handle_pseudo(ctx, register_file, instr.get()); 2699 2700 /* kill definitions and late-kill operands and ensure that sub-dword operands can actually 2701 * be read */ 2702 for (const Definition& def : instr->definitions) { 2703 if (def.isTemp() && def.isKill()) 2704 register_file.clear(def); 2705 } 2706 for (unsigned i = 0; i < instr->operands.size(); i++) { 2707 const Operand& op = instr->operands[i]; 2708 if (op.isTemp() && op.isFirstKill() && op.isLateKill()) 2709 register_file.clear(op); 2710 if (op.isTemp() && op.physReg().byte() != 0) 2711 add_subdword_operand(ctx, instr, i, op.physReg().byte(), op.regClass()); 2712 } 2713 2714 /* emit parallelcopy */ 2715 if (!parallelcopy.empty()) { 2716 aco_ptr<Pseudo_instruction> pc; 2717 pc.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_parallelcopy, 2718 Format::PSEUDO, parallelcopy.size(), 2719 parallelcopy.size())); 2720 bool linear_vgpr = false; 2721 bool sgpr_operands_alias_defs = false; 2722 uint64_t sgpr_operands[4] = {0, 0, 0, 0}; 2723 for (unsigned i = 0; i < parallelcopy.size(); i++) { 2724 linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr(); 2725 2726 if (temp_in_scc && parallelcopy[i].first.isTemp() && 2727 parallelcopy[i].first.getTemp().type() == RegType::sgpr) { 2728 if (!sgpr_operands_alias_defs) { 2729 unsigned reg = parallelcopy[i].first.physReg().reg(); 2730 unsigned size = parallelcopy[i].first.getTemp().size(); 2731 
sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); 2732 2733 reg = parallelcopy[i].second.physReg().reg(); 2734 size = parallelcopy[i].second.getTemp().size(); 2735 if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) 2736 sgpr_operands_alias_defs = true; 2737 } 2738 } 2739 2740 pc->operands[i] = parallelcopy[i].first; 2741 pc->definitions[i] = parallelcopy[i].second; 2742 assert(pc->operands[i].size() == pc->definitions[i].size()); 2743 2744 /* it might happen that the operand is already renamed. we have to restore the 2745 * original name. */ 2746 std::unordered_map<unsigned, Temp>::iterator it = 2747 ctx.orig_names.find(pc->operands[i].tempId()); 2748 Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp(); 2749 ctx.orig_names[pc->definitions[i].tempId()] = orig; 2750 ctx.renames[block.index][orig.id()] = pc->definitions[i].getTemp(); 2751 } 2752 2753 if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) { 2754 /* disable definitions and re-enable operands */ 2755 RegisterFile tmp_file(register_file); 2756 for (const Definition& def : instr->definitions) { 2757 if (def.isTemp() && !def.isKill()) 2758 tmp_file.clear(def); 2759 } 2760 for (const Operand& op : instr->operands) { 2761 if (op.isTemp() && op.isFirstKill()) 2762 tmp_file.block(op.physReg(), op.regClass()); 2763 } 2764 2765 handle_pseudo(ctx, tmp_file, pc.get()); 2766 } else { 2767 pc->tmp_in_scc = false; 2768 } 2769 2770 instructions.emplace_back(std::move(pc)); 2771 } 2772 2773 /* some instructions need VOP3 encoding if operand/definition is not assigned to VCC */ 2774 bool instr_needs_vop3 = 2775 !instr->isVOP3() && 2776 ((instr->format == Format::VOPC && !(instr->definitions[0].physReg() == vcc)) || 2777 (instr->opcode == aco_opcode::v_cndmask_b32 && 2778 !(instr->operands[2].physReg() == vcc)) || 2779 ((instr->opcode == aco_opcode::v_add_co_u32 || 2780 instr->opcode == aco_opcode::v_addc_co_u32 || 2781 instr->opcode == 
aco_opcode::v_sub_co_u32 || 2782 instr->opcode == aco_opcode::v_subb_co_u32 || 2783 instr->opcode == aco_opcode::v_subrev_co_u32 || 2784 instr->opcode == aco_opcode::v_subbrev_co_u32) && 2785 !(instr->definitions[1].physReg() == vcc)) || 2786 ((instr->opcode == aco_opcode::v_addc_co_u32 || 2787 instr->opcode == aco_opcode::v_subb_co_u32 || 2788 instr->opcode == aco_opcode::v_subbrev_co_u32) && 2789 !(instr->operands[2].physReg() == vcc))); 2790 if (instr_needs_vop3) { 2791 2792 /* if the first operand is a literal, we have to move it to a reg */ 2793 if (instr->operands.size() && instr->operands[0].isLiteral() && 2794 program->chip_class < GFX10) { 2795 bool can_sgpr = true; 2796 /* check, if we have to move to vgpr */ 2797 for (const Operand& op : instr->operands) { 2798 if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { 2799 can_sgpr = false; 2800 break; 2801 } 2802 } 2803 /* disable definitions and re-enable operands */ 2804 RegisterFile tmp_file(register_file); 2805 for (const Definition& def : instr->definitions) 2806 tmp_file.clear(def); 2807 for (const Operand& op : instr->operands) { 2808 if (op.isTemp() && op.isFirstKill()) 2809 tmp_file.block(op.physReg(), op.regClass()); 2810 } 2811 Temp tmp = program->allocateTmp(can_sgpr ? 
s1 : v1); 2812 ctx.assignments.emplace_back(); 2813 PhysReg reg = get_reg(ctx, tmp_file, tmp, parallelcopy, instr); 2814 update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops); 2815 2816 aco_ptr<Instruction> mov; 2817 if (can_sgpr) 2818 mov.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b32, 2819 Format::SOP1, 1, 1)); 2820 else 2821 mov.reset(create_instruction<VOP1_instruction>(aco_opcode::v_mov_b32, 2822 Format::VOP1, 1, 1)); 2823 mov->operands[0] = instr->operands[0]; 2824 mov->definitions[0] = Definition(tmp); 2825 mov->definitions[0].setFixed(reg); 2826 2827 instr->operands[0] = Operand(tmp); 2828 instr->operands[0].setFixed(reg); 2829 instr->operands[0].setFirstKill(true); 2830 2831 instructions.emplace_back(std::move(mov)); 2832 } 2833 2834 /* change the instruction to VOP3 to enable an arbitrary register pair as dst */ 2835 aco_ptr<Instruction> tmp = std::move(instr); 2836 Format format = asVOP3(tmp->format); 2837 instr.reset(create_instruction<VOP3_instruction>( 2838 tmp->opcode, format, tmp->operands.size(), tmp->definitions.size())); 2839 std::copy(tmp->operands.begin(), tmp->operands.end(), instr->operands.begin()); 2840 std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin()); 2841 } 2842 2843 instructions.emplace_back(std::move(*instr_it)); 2844 2845 } /* end for Instr */ 2846 2847 block.instructions = std::move(instructions); 2848 } /* end for BB */ 2849 2850 /* find scc spill registers which may be needed for parallelcopies created by phis */ 2851 for (Block& block : program->blocks) { 2852 if (block.linear_preds.size() <= 1) 2853 continue; 2854 2855 std::bitset<128> regs = sgpr_live_in[block.index]; 2856 if (!regs[127]) 2857 continue; 2858 2859 /* choose a register */ 2860 int16_t reg = 0; 2861 for (; reg < ctx.program->max_reg_demand.sgpr && regs[reg]; reg++) 2862 ; 2863 assert(reg < ctx.program->max_reg_demand.sgpr); 2864 adjust_max_used_regs(ctx, s1, reg); 2865 2866 /* update 
predecessors */ 2867 for (unsigned& pred_index : block.linear_preds) { 2868 Block& pred = program->blocks[pred_index]; 2869 pred.scc_live_out = true; 2870 pred.scratch_sgpr = PhysReg{(uint16_t)reg}; 2871 } 2872 } 2873 2874 /* num_gpr = rnd_up(max_used_gpr + 1) */ 2875 program->config->num_vgprs = get_vgpr_alloc(program, ctx.max_used_vgpr + 1); 2876 program->config->num_sgprs = get_sgpr_alloc(program, ctx.max_used_sgpr + 1); 2877 2878 program->progress = CompilationProgress::after_ra; 2879} 2880 2881} // namespace aco 2882