/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/amdgfxregs.h"

#include <algorithm>
#include <unordered_set>
#include <vector>

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (256 - ctx.num_waves * 16)
/* creating clauses decreases def-use distances, so be less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
#define POS_EXP_MAX_MOVES 512

namespace aco {

enum MoveResult {
   move_success,
   move_fail_ssa,
   move_fail_rar,
   move_fail_pressure,
};

/**
 * Cursor for downwards moves, where a single instruction is moved towards
 * or below a group of instructions that the hardware can execute as a clause.
 */
struct DownwardsCursor {
   int source_idx; /* Current instruction to consider for moving */

   int insert_idx_clause; /* First clause instruction */
   int insert_idx;        /* First instruction *after* the clause */

   /* Maximum demand of all clause instructions,
    * i.e. from insert_idx_clause (inclusive) to insert_idx (exclusive) */
   RegisterDemand clause_demand;
   /* Maximum demand of instructions from source_idx to insert_idx_clause (both exclusive) */
   RegisterDemand total_demand;

   DownwardsCursor(int current_idx, RegisterDemand initial_clause_demand)
       : source_idx(current_idx - 1), insert_idx_clause(current_idx), insert_idx(current_idx + 1),
         clause_demand(initial_clause_demand)
   {}

   void verify_invariants(const RegisterDemand* register_demand);
};
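
/* Illustrative layout of a DownwardsCursor (a sketch derived from the
 * constructor and invariants above, not additional semantics):
 *
 *    [source_idx]          candidate instruction to consider moving
 *    ...                   skipped instructions (tracked by total_demand)
 *    [insert_idx_clause]   first instruction of the clause being formed
 *    ...                   clause instructions (tracked by clause_demand)
 *    [insert_idx]          first instruction after the clause
 */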

/**
 * Cursor for upwards moves, where a single instruction is moved below
 * another instruction.
 */
struct UpwardsCursor {
   int source_idx; /* Current instruction to consider for moving */
   int insert_idx; /* Instruction to move in front of */

   /* Maximum demand of instructions from insert_idx (inclusive) to source_idx (exclusive) */
   RegisterDemand total_demand;

   UpwardsCursor(int source_idx_) : source_idx(source_idx_)
   {
      insert_idx = -1; /* to be initialized later */
   }

   bool has_insert_idx() const { return insert_idx != -1; }
   void verify_invariants(const RegisterDemand* register_demand);
};

struct MoveState {
   RegisterDemand max_registers;

   Block* block;
   Instruction* current;
   RegisterDemand* register_demand; /* demand per instruction */
   bool improved_rar;

   std::vector<bool> depends_on;
   /* Two are needed because, for downwards VMEM scheduling, one needs to
    * exclude the instructions in the clause, since new instructions in the
    * clause are not moved past any other instructions in the clause. */
   std::vector<bool> RAR_dependencies;
   std::vector<bool> RAR_dependencies_clause;

   /* for moving instructions before the current instruction to after it */
   DownwardsCursor downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
   MoveResult downwards_move(DownwardsCursor&, bool clause);
   void downwards_skip(DownwardsCursor&);

   /* for moving instructions after the first use of the current instruction upwards */
   UpwardsCursor upwards_init(int source_idx, bool improved_rar);
   bool upwards_check_deps(UpwardsCursor&);
   void upwards_update_insert_idx(UpwardsCursor&);
   MoveResult upwards_move(UpwardsCursor&);
   void upwards_skip(UpwardsCursor&);
};

struct sched_ctx {
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
   MoveState mv;
   bool schedule_pos_exports = true;
   unsigned schedule_pos_export_div = 1;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the pass tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler cares about the current register pressure:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */
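
/* Example for move_element() below (illustrative, assuming a vector
 * v = {a,b,c,d,e}): move_element(v.begin(), 4, 1) rotates {b,c,d,e} right by
 * one, giving {a,e,b,c,d}; move_element(v.begin(), 1, 4) rotates {b,c,d} left
 * by one, giving {a,c,d,b,e}. In both cases the element at idx ends up
 * directly in front of the element originally at `before`. */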

template <typename T>
void
move_element(T begin_it, size_t idx, size_t before)
{
   if (idx < before) {
      auto begin = std::next(begin_it, idx);
      auto end = std::next(begin_it, before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(begin_it, before);
      auto end = std::next(begin_it, idx + 1);
      std::rotate(begin, end - 1, end);
   }
}

void
DownwardsCursor::verify_invariants(const RegisterDemand* register_demand)
{
   assert(source_idx < insert_idx_clause);
   assert(insert_idx_clause < insert_idx);

#ifndef NDEBUG
   RegisterDemand reference_demand;
   for (int i = source_idx + 1; i < insert_idx_clause; ++i) {
      reference_demand.update(register_demand[i]);
   }
   assert(total_demand == reference_demand);

   reference_demand = {};
   for (int i = insert_idx_clause; i < insert_idx; ++i) {
      reference_demand.update(register_demand[i]);
   }
   assert(clause_demand == reference_demand);
#endif
}

DownwardsCursor
MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
   improved_rar = improved_rar_;

   std::fill(depends_on.begin(), depends_on.end(), false);
   if (improved_rar) {
      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
      if (may_form_clauses)
         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
   }

   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill())
            RAR_dependencies[op.tempId()] = true;
      }
   }

   DownwardsCursor cursor(current_idx, register_demand[current_idx]);
   cursor.verify_invariants(register_demand);
   return cursor;
}
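
/* Worked example for downwards_move() (illustrative numbers): with
 * source_idx=4, insert_idx_clause=7 and insert_idx=9, moving the candidate
 * past the clause places it at index 8 and shifts the clause down to [6,8);
 * extending the clause instead places the candidate at index 6, growing the
 * clause to [6,9). */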

/* If add_to_clause is true, the current clause is extended by moving the
 * instruction at source_idx in front of the clause. Otherwise, the instruction
 * is moved past the end of the clause without extending it */
MoveResult
MoveState::downwards_move(DownwardsCursor& cursor, bool add_to_clause)
{
   aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];

   for (const Definition& def : instr->definitions)
      if (def.isTemp() && depends_on[def.tempId()])
         return move_fail_ssa;

   /* check if one of the candidate's operands is killed by a depending instruction */
   std::vector<bool>& RAR_deps =
      improved_rar ? (add_to_clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && RAR_deps[op.tempId()]) {
         // FIXME: account for difference in register pressure
         return move_fail_rar;
      }
   }

   if (add_to_clause) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp()) {
            depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               RAR_dependencies[op.tempId()] = true;
         }
      }
   }

   const int dest_insert_idx = add_to_clause ? cursor.insert_idx_clause : cursor.insert_idx;
   RegisterDemand register_pressure = cursor.total_demand;
   if (!add_to_clause) {
      register_pressure.update(cursor.clause_demand);
   }

   /* Check the new demand of the instructions being moved over */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
      return move_fail_pressure;

   /* New demand for the moved instruction */
   const RegisterDemand temp = get_temp_registers(instr);
   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate below the memory load */
   move_element(block->instructions.begin(), cursor.source_idx, dest_insert_idx);

   /* update register pressure */
   move_element(register_demand, cursor.source_idx, dest_insert_idx);
   for (int i = cursor.source_idx; i < dest_insert_idx - 1; i++)
      register_demand[i] -= candidate_diff;
   register_demand[dest_insert_idx - 1] = new_demand;
   cursor.insert_idx_clause--;
   if (cursor.source_idx != cursor.insert_idx_clause) {
      /* Update demand if we moved over any instructions before the clause */
      cursor.total_demand -= candidate_diff;
   } else {
      assert(cursor.total_demand == RegisterDemand{});
   }
   if (add_to_clause) {
      cursor.clause_demand.update(new_demand);
   } else {
      cursor.clause_demand -= candidate_diff;
      cursor.insert_idx--;
   }

   cursor.source_idx--;
   cursor.verify_invariants(register_demand);
   return move_success;
}

void
MoveState::downwards_skip(DownwardsCursor& cursor)
{
   aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];

   for (const Operand& op : instr->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill()) {
            RAR_dependencies[op.tempId()] = true;
            RAR_dependencies_clause[op.tempId()] = true;
         }
      }
   }
   cursor.total_demand.update(register_demand[cursor.source_idx]);
   cursor.source_idx--;
   cursor.verify_invariants(register_demand);
}

void
UpwardsCursor::verify_invariants(const RegisterDemand* register_demand)
{
#ifndef NDEBUG
   if (!has_insert_idx()) {
      return;
   }

   assert(insert_idx < source_idx);

   RegisterDemand reference_demand;
   for (int i = insert_idx; i < source_idx; ++i) {
      reference_demand.update(register_demand[i]);
   }
   assert(total_demand == reference_demand);
#endif
}

UpwardsCursor
MoveState::upwards_init(int source_idx, bool improved_rar_)
{
   improved_rar = improved_rar_;

   std::fill(depends_on.begin(), depends_on.end(), false);
   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);

   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         depends_on[def.tempId()] = true;
   }

   return UpwardsCursor(source_idx);
}

bool
MoveState::upwards_check_deps(UpwardsCursor& cursor)
{
   aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return false;
   }
   return true;
}
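
/* Illustrative layout of an UpwardsCursor once insert_idx is set (a sketch
 * derived from the invariants above):
 *
 *    [insert_idx]   first dependency of the current instruction; candidates
 *    ...            moved instructions/dependencies (tracked by total_demand)
 *    [source_idx]   candidate instruction to consider moving
 */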

void
MoveState::upwards_update_insert_idx(UpwardsCursor& cursor)
{
   cursor.insert_idx = cursor.source_idx;
   cursor.total_demand = register_demand[cursor.insert_idx];
}

MoveResult
MoveState::upwards_move(UpwardsCursor& cursor)
{
   assert(cursor.has_insert_idx());

   aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return move_fail_ssa;
   }

   /* check if candidate uses/kills an operand which is used by a dependency */
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
         return move_fail_rar;
   }

   /* check if register pressure is low enough: the diff is negative if register pressure is
    * decreased */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(cursor.total_demand + candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[cursor.insert_idx - 1]);
   const RegisterDemand new_demand =
      register_demand[cursor.insert_idx - 1] - temp2 + candidate_diff + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate above the insert_idx */
   move_element(block->instructions.begin(), cursor.source_idx, cursor.insert_idx);

   /* update register pressure */
   move_element(register_demand, cursor.source_idx, cursor.insert_idx);
   register_demand[cursor.insert_idx] = new_demand;
   for (int i = cursor.insert_idx + 1; i <= cursor.source_idx; i++)
      register_demand[i] += candidate_diff;
   cursor.total_demand += candidate_diff;

   cursor.total_demand.update(register_demand[cursor.source_idx]);

   cursor.insert_idx++;
   cursor.source_idx++;

   cursor.verify_invariants(register_demand);

   return move_success;
}

void
MoveState::upwards_skip(UpwardsCursor& cursor)
{
   if (cursor.has_insert_idx()) {
      aco_ptr<Instruction>& instr = block->instructions[cursor.source_idx];
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            depends_on[def.tempId()] = true;
      }
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            RAR_dependencies[op.tempId()] = true;
      }
      cursor.total_demand.update(register_demand[cursor.source_idx]);
   }

   cursor.source_idx++;

   cursor.verify_invariants(register_demand);
}

bool
is_gs_or_done_sendmsg(const Instruction* instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = instr->sopp().imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs || (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool
is_done_sendmsg(const Instruction* instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg)
      return (instr->sopp().imm & sendmsg_id_mask) == _sendmsg_gs_done;
   return false;
}

memory_sync_info
get_sync_info_with_hack(const Instruction* instr)
{
   memory_sync_info sync = get_sync_info(instr);
   if (instr->isSMEM() && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
      // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
      sync.storage = (storage_class)(sync.storage | storage_buffer);
      sync.semantics =
         (memory_semantics)((sync.semantics | semantic_private) & ~semantic_can_reorder);
   }
   return sync;
}
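
/* Overview of the hazard machinery below (summary, not additional semantics):
 * the schedulers accumulate the memory events and accessed storage classes of
 * all skipped instructions into a hazard_query; perform_hazard_query() then
 * decides whether a candidate may be reordered across that whole set. */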

struct memory_event_set {
   bool has_control_barrier;

   unsigned bar_acquire;
   unsigned bar_release;
   unsigned bar_classes;

   unsigned access_acquire;
   unsigned access_release;
   unsigned access_relaxed;
   unsigned access_atomic;
};

struct hazard_query {
   bool contains_spill;
   bool contains_sendmsg;
   bool uses_exec;
   memory_event_set mem_events;
   unsigned aliasing_storage;      /* storage classes which are accessed (non-SMEM) */
   unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};

void
init_hazard_query(hazard_query* query)
{
   query->contains_spill = false;
   query->contains_sendmsg = false;
   query->uses_exec = false;
   memset(&query->mem_events, 0, sizeof(query->mem_events));
   query->aliasing_storage = 0;
   query->aliasing_storage_smem = 0;
}

void
add_memory_event(memory_event_set* set, Instruction* instr, memory_sync_info* sync)
{
   set->has_control_barrier |= is_done_sendmsg(instr);
   if (instr->opcode == aco_opcode::p_barrier) {
      Pseudo_barrier_instruction& bar = instr->barrier();
      if (bar.sync.semantics & semantic_acquire)
         set->bar_acquire |= bar.sync.storage;
      if (bar.sync.semantics & semantic_release)
         set->bar_release |= bar.sync.storage;
      set->bar_classes |= bar.sync.storage;

      set->has_control_barrier |= bar.exec_scope > scope_invocation;
   }

   if (!sync->storage)
      return;

   if (sync->semantics & semantic_acquire)
      set->access_acquire |= sync->storage;
   if (sync->semantics & semantic_release)
      set->access_release |= sync->storage;

   if (!(sync->semantics & semantic_private)) {
      if (sync->semantics & semantic_atomic)
         set->access_atomic |= sync->storage;
      else
         set->access_relaxed |= sync->storage;
   }
}

void
add_to_hazard_query(hazard_query* query, Instruction* instr)
{
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;
   query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;
   query->uses_exec |= needs_exec_mask(instr);

   memory_sync_info sync = get_sync_info_with_hack(instr);

   add_memory_event(&query->mem_events, instr, &sync);

   if (!(sync.semantics & semantic_can_reorder)) {
      unsigned storage = sync.storage;
      /* images and buffer/global memory can alias */
      // TODO: more precisely, buffer images and buffer/global memory can alias
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      if (instr->isSMEM())
         query->aliasing_storage_smem |= storage;
      else
         query->aliasing_storage |= storage;
   }
}
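
/* Example (illustrative): after add_to_hazard_query() sees a buffer store, the
 * widening above sets both storage_buffer and storage_image in
 * aliasing_storage, so a later image-load candidate intersects it and fails
 * the aliasing check in perform_hazard_query(). */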

enum HazardResult {
   hazard_success,
   hazard_fail_reorder_vmem_smem,
   hazard_fail_reorder_ds,
   hazard_fail_reorder_sendmsg,
   hazard_fail_spill,
   hazard_fail_export,
   hazard_fail_barrier,
   /* Must stop at these failures. The hazard query code doesn't consider them
    * when instructions are added. */
   hazard_fail_exec,
   hazard_fail_unreorderable,
};

HazardResult
perform_hazard_query(hazard_query* query, Instruction* instr, bool upwards)
{
   /* don't schedule discards downwards */
   if (!upwards && instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_unreorderable;

   if (query->uses_exec) {
      for (const Definition& def : instr->definitions) {
         if (def.isFixed() && def.physReg() == exec)
            return hazard_fail_exec;
      }
   }

   /* don't move exports so that they stay closer together */
   if (instr->isEXP())
      return hazard_fail_export;

   /* don't move non-reorderable instructions */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime ||
       instr->opcode == aco_opcode::s_setprio || instr->opcode == aco_opcode::s_getreg_b32)
      return hazard_fail_unreorderable;

   memory_event_set instr_set;
   memset(&instr_set, 0, sizeof(instr_set));
   memory_sync_info sync = get_sync_info_with_hack(instr);
   add_memory_event(&instr_set, instr, &sync);

   memory_event_set* first = &instr_set;
   memory_event_set* second = &query->mem_events;
   if (upwards)
      std::swap(first, second);

   /* everything after barrier(acquire) happens after the atomics/control_barriers before it;
    * everything after load(acquire) happens after the load
    */
   if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
      return hazard_fail_barrier;
   if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
       ((first->access_acquire | first->bar_acquire) &
        (second->access_relaxed | second->access_atomic)))
      return hazard_fail_barrier;

   /* everything before barrier(release) happens before the atomics/control_barriers after it;
    * everything before store(release) happens before the store
    */
   if (first->bar_release && (second->has_control_barrier || second->access_atomic))
      return hazard_fail_barrier;
   if ((first->bar_classes && (second->bar_release || second->access_release)) ||
       ((first->access_relaxed | first->access_atomic) &
        (second->bar_release | second->access_release)))
      return hazard_fail_barrier;

   /* don't move memory barriers around other memory barriers */
   if (first->bar_classes && second->bar_classes)
      return hazard_fail_barrier;

   /* Don't move memory accesses to before control barriers. I don't think
    * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
   unsigned control_classes =
      storage_buffer | storage_atomic_counter | storage_image | storage_shared;
   if (first->has_control_barrier &&
       ((second->access_atomic | second->access_relaxed) & control_classes))
      return hazard_fail_barrier;

   /* don't move memory loads/stores past potentially aliasing loads/stores */
   unsigned aliasing_storage =
      instr->isSMEM() ? query->aliasing_storage_smem : query->aliasing_storage;
   if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
      unsigned intersect = sync.storage & aliasing_storage;
      if (intersect & storage_shared)
         return hazard_fail_reorder_ds;
      return hazard_fail_reorder_vmem_smem;
   }

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
      return hazard_fail_reorder_sendmsg;

   return hazard_success;
}
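
/* Typical usage (a sketch mirroring the schedulers below, not a separate API):
 *
 *    hazard_query hq;
 *    init_hazard_query(&hq);
 *    add_to_hazard_query(&hq, current);
 *    ...
 *    if (perform_hazard_query(&hq, candidate, false) != hazard_success)
 *       ; // skip the candidate (adding it to the query) or stop
 */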

void
schedule_SMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* first, check if we have instructions before current to move down */
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   DownwardsCursor cursor = ctx.mv.downwards_init(idx, false, false);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
        candidate_idx--) {
      assert(candidate_idx >= 0);
      assert(candidate_idx == cursor.source_idx);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem =
         idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      /* only move VMEM instructions below descriptor loads. Be more aggressive at higher
       * num_waves to help create more vmem clauses */
      if (candidate->isVMEM() && (cursor.insert_idx - cursor.source_idx > (ctx.num_waves * 4) ||
                                  current->operands[0].size() == 4))
         break;
      /* don't move descriptor loads below buffer loads */
      if (candidate->format == Format::SMEM && current->operands[0].size() == 4 &&
          candidate->operands[0].size() == 2)
         break;

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      /* don't use LDS/GDS instructions to hide latency since it can
       * significantly worsen LDS scheduling */
      if (candidate->isDS() || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(cursor, false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another MEM */
   UpwardsCursor up_cursor = ctx.mv.upwards_init(idx + 1, false);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
        candidate_idx++) {
      assert(candidate_idx == up_cursor.source_idx);
      assert(candidate_idx < (int)block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps(up_cursor);
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_update_insert_idx(up_cursor);
            init_hazard_query(&hq);
            found_dependency = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip(up_cursor);
         continue;
      }

      MoveResult res = ctx.mv.upwards_move(up_cursor);
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
         if (res == move_fail_ssa && candidate->isVMEM())
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip(up_cursor);
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
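
   /* Bookkeeping for the next SMEM load (descriptive note): last_SMEM_stall
    * starts out more negative the more waves and the more moved independent
    * instructions (k) are available to hide latency. Each later candidate
    * moved below this load increments it, and once it reaches 0 the downwards
    * loops stop moving candidates across it to avoid a stall. */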
   ctx.last_SMEM_dep_idx = found_dependency ? up_cursor.insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

void
schedule_VMEM(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& register_demand,
              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   bool only_clauses = false;
   int16_t k = 0;

   /* first, check if we have instructions before current to move down */
   hazard_query indep_hq;
   hazard_query clause_hq;
   init_hazard_query(&indep_hq);
   init_hazard_query(&clause_hq);
   add_to_hazard_query(&indep_hq, current);
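
   /* Why two queries (descriptive note): candidates that join the clause are
    * inserted in front of it and never move past existing clause members, so
    * instructions added to the clause only go into indep_hq, while clause_hq
    * tracks just the instructions a clause candidate would actually move
    * across. */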

   DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
        candidate_idx--) {
      assert(candidate_idx == cursor.source_idx);
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatLike();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem =
         idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         int grab_dist = cursor.insert_idx_clause - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause =
            grab_dist < clause_max_grab_dist + k && should_form_clause(current, candidate.get());
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause || candidate->definitions.empty();
      if (only_clauses) {
         /* In case of high register pressure, only try to form clauses,
          * and only if the previous clause is not larger
          * than the current one will be.
          */
         if (part_of_clause) {
            int clause_size = cursor.insert_idx - cursor.insert_idx_clause;
            int prev_clause_size = 1;
            while (should_form_clause(current,
                                      block->instructions[candidate_idx - prev_clause_size].get()))
               prev_clause_size++;
            if (prev_clause_size > clause_size + 1)
               break;
         } else {
            can_move_down = false;
         }
      }
      HazardResult haz =
         perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      if (!can_move_down) {
         if (part_of_clause)
            break;
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
         continue;
      }

      Instruction* candidate_ptr = candidate.get();
      MoveResult res = ctx.mv.downwards_move(cursor, part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
         if (part_of_clause)
            break;
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
         continue;
      } else if (res == move_fail_pressure) {
         only_clauses = true;
         if (part_of_clause)
            break;
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
         continue;
      }
      if (part_of_clause)
         add_to_hazard_query(&indep_hq, candidate_ptr);
      else
         k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* find the first instruction depending on current or find another VMEM */
   UpwardsCursor up_cursor = ctx.mv.upwards_init(idx + 1, true);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int)idx + window_size;
        candidate_idx++) {
      assert(candidate_idx == up_cursor.source_idx);
      assert(candidate_idx < (int)block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatLike();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps(up_cursor);
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_update_insert_idx(up_cursor);
            init_hazard_query(&indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.mv.depends_on[def.tempId()] = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&indep_hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip(up_cursor);
         continue;
      }

      MoveResult res = ctx.mv.upwards_move(up_cursor);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip(up_cursor);
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void
schedule_position_export(sched_ctx& ctx, Block* block,
                         std::vector<RegisterDemand>& register_demand, Instruction* current,
                         int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE / ctx.schedule_pos_export_div;
   int max_moves = POS_EXP_MAX_MOVES / ctx.schedule_pos_export_div;
   int16_t k = 0;

   DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, false);

   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int)idx - window_size;
        candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->isSMEM() || candidate->isFlatLike())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

      if (haz != hazard_success) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(cursor, false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip(cursor);
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void
schedule_block(sched_ctx& ctx, Program* program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (block->kind & block_kind_export_end && current->isEXP() && ctx.schedule_pos_exports) {
         unsigned target = current->exp().dest;
         if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PRIM) {
            ctx.mv.current = current;
            schedule_position_export(ctx, block, live_vars.register_demand[block->index], current,
                                     idx);
         }
      }

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatLike()) {
         ctx.mv.current = current;
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }

      if (current->isSMEM()) {
         ctx.mv.current = current;
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}

void
schedule_program(Program* program, live& live_vars)
{
   /* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
   RegisterDemand demand;
   for (Block& block : program->blocks)
      demand.update(block.register_demand);
   demand.vgpr += program->config->num_shared_vgprs / 2;

   sched_ctx ctx;
   ctx.mv.depends_on.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
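
   /* Worked example for the heuristic below (illustrative, assuming wave_fac=1
    * and min_waves <= 6): a shader with num_waves = 8 and demand.vgpr = 27 gets
    * ctx.num_waves = 6, i.e. SMEM_WINDOW_SIZE = 350 - 6 * 35 = 140 and
    * VMEM_MAX_MOVES = 256 - 6 * 16 = 160. */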
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   // TODO: account for possible uneven num_waves on GFX10+
   unsigned wave_fac = program->dev.physical_vgprs / 256;
   if (program->num_waves <= 5 * wave_fac)
      ctx.num_waves = program->num_waves;
   else if (demand.vgpr >= 29)
      ctx.num_waves = 5 * wave_fac;
   else if (demand.vgpr >= 25)
      ctx.num_waves = 6 * wave_fac;
   else
      ctx.num_waves = 7 * wave_fac;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->num_waves);

   /* VMEM_MAX_MOVES and such assume pre-GFX10 wave count */
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves / wave_fac, 1);

   assert(ctx.num_waves > 0);
   ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
                           int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};

   /* NGG culling shaders are very sensitive to position export scheduling.
    * Schedule less aggressively when early primitive export is used, and
    * keep the position export at the very bottom when late primitive export is used.
    */
   if (program->info->has_ngg_culling && program->stage.num_sw_stages() == 1) {
      if (!program->info->has_ngg_early_prim_export)
         ctx.schedule_pos_exports = false;
      else
         ctx.schedule_pos_export_div = 4;
   }

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

/* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   live live_vars2 = aco::live_var_analysis(program);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block& b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}

} // namespace aco