/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
227ec681f3Smrg * 237ec681f3Smrg */ 247ec681f3Smrg 257ec681f3Smrg#include "aco_ir.h" 267ec681f3Smrg 277ec681f3Smrg#include "util/crc32.h" 287ec681f3Smrg 297ec681f3Smrg#include <algorithm> 307ec681f3Smrg#include <deque> 317ec681f3Smrg#include <set> 327ec681f3Smrg#include <vector> 337ec681f3Smrg 347ec681f3Smrgnamespace aco { 357ec681f3Smrg 367ec681f3Smrg/* sgpr_presched/vgpr_presched */ 377ec681f3Smrgvoid 387ec681f3Smrgcollect_presched_stats(Program* program) 397ec681f3Smrg{ 407ec681f3Smrg RegisterDemand presched_demand; 417ec681f3Smrg for (Block& block : program->blocks) 427ec681f3Smrg presched_demand.update(block.register_demand); 437ec681f3Smrg program->statistics[statistic_sgpr_presched] = presched_demand.sgpr; 447ec681f3Smrg program->statistics[statistic_vgpr_presched] = presched_demand.vgpr; 457ec681f3Smrg} 467ec681f3Smrg 477ec681f3Smrgclass BlockCycleEstimator { 487ec681f3Smrgpublic: 497ec681f3Smrg enum resource { 507ec681f3Smrg null = 0, 517ec681f3Smrg scalar, 527ec681f3Smrg branch_sendmsg, 537ec681f3Smrg valu, 547ec681f3Smrg valu_complex, 557ec681f3Smrg lds, 567ec681f3Smrg export_gds, 577ec681f3Smrg vmem, 587ec681f3Smrg resource_count, 597ec681f3Smrg }; 607ec681f3Smrg 617ec681f3Smrg BlockCycleEstimator(Program* program_) : program(program_) {} 627ec681f3Smrg 637ec681f3Smrg Program* program; 647ec681f3Smrg 657ec681f3Smrg int32_t cur_cycle = 0; 667ec681f3Smrg int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0}; 677ec681f3Smrg unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0}; 687ec681f3Smrg int32_t reg_available[512] = {0}; 697ec681f3Smrg std::deque<int32_t> lgkm; 707ec681f3Smrg std::deque<int32_t> exp; 717ec681f3Smrg std::deque<int32_t> vm; 727ec681f3Smrg std::deque<int32_t> vs; 737ec681f3Smrg 747ec681f3Smrg unsigned predict_cost(aco_ptr<Instruction>& instr); 757ec681f3Smrg void add(aco_ptr<Instruction>& instr); 767ec681f3Smrg void join(const BlockCycleEstimator& other); 777ec681f3Smrg 787ec681f3Smrgprivate: 797ec681f3Smrg 
unsigned get_waitcnt_cost(wait_imm imm); 807ec681f3Smrg unsigned get_dependency_cost(aco_ptr<Instruction>& instr); 817ec681f3Smrg 827ec681f3Smrg void use_resources(aco_ptr<Instruction>& instr); 837ec681f3Smrg int32_t cycles_until_res_available(aco_ptr<Instruction>& instr); 847ec681f3Smrg}; 857ec681f3Smrg 867ec681f3Smrgstruct wait_counter_info { 877ec681f3Smrg wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_) 887ec681f3Smrg : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) 897ec681f3Smrg {} 907ec681f3Smrg 917ec681f3Smrg unsigned vm; 927ec681f3Smrg unsigned exp; 937ec681f3Smrg unsigned lgkm; 947ec681f3Smrg unsigned vs; 957ec681f3Smrg}; 967ec681f3Smrg 977ec681f3Smrgstruct perf_info { 987ec681f3Smrg int latency; 997ec681f3Smrg 1007ec681f3Smrg BlockCycleEstimator::resource rsrc0; 1017ec681f3Smrg unsigned cost0; 1027ec681f3Smrg 1037ec681f3Smrg BlockCycleEstimator::resource rsrc1; 1047ec681f3Smrg unsigned cost1; 1057ec681f3Smrg}; 1067ec681f3Smrg 1077ec681f3Smrgstatic perf_info 1087ec681f3Smrgget_perf_info(Program* program, aco_ptr<Instruction>& instr) 1097ec681f3Smrg{ 1107ec681f3Smrg instr_class cls = instr_info.classes[(int)instr->opcode]; 1117ec681f3Smrg 1127ec681f3Smrg#define WAIT(res) BlockCycleEstimator::res, 0 1137ec681f3Smrg#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt 1147ec681f3Smrg 1157ec681f3Smrg if (program->chip_class >= GFX10) { 1167ec681f3Smrg /* fp64 might be incorrect */ 1177ec681f3Smrg switch (cls) { 1187ec681f3Smrg case instr_class::valu32: 1197ec681f3Smrg case instr_class::valu_convert32: 1207ec681f3Smrg case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)}; 1217ec681f3Smrg case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)}; 1227ec681f3Smrg case instr_class::valu_quarter_rate32: 1237ec681f3Smrg return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)}; 1247ec681f3Smrg case instr_class::valu_transcendental32: 1257ec681f3Smrg return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)}; 
1267ec681f3Smrg case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; 1277ec681f3Smrg case instr_class::valu_double_add: 1287ec681f3Smrg return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; 1297ec681f3Smrg case instr_class::valu_double_convert: 1307ec681f3Smrg return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; 1317ec681f3Smrg case instr_class::valu_double_transcendental: 1327ec681f3Smrg return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)}; 1337ec681f3Smrg case instr_class::salu: return {2, WAIT_USE(scalar, 1)}; 1347ec681f3Smrg case instr_class::smem: return {0, WAIT_USE(scalar, 1)}; 1357ec681f3Smrg case instr_class::branch: 1367ec681f3Smrg case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)}; 1377ec681f3Smrg case instr_class::ds: 1387ec681f3Smrg return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)} 1397ec681f3Smrg : perf_info{0, WAIT_USE(lds, 1)}; 1407ec681f3Smrg case instr_class::exp: return {0, WAIT_USE(export_gds, 1)}; 1417ec681f3Smrg case instr_class::vmem: return {0, WAIT_USE(vmem, 1)}; 1427ec681f3Smrg case instr_class::barrier: 1437ec681f3Smrg case instr_class::waitcnt: 1447ec681f3Smrg case instr_class::other: 1457ec681f3Smrg default: return {0}; 1467ec681f3Smrg } 1477ec681f3Smrg } else { 1487ec681f3Smrg switch (cls) { 1497ec681f3Smrg case instr_class::valu32: return {4, WAIT_USE(valu, 4)}; 1507ec681f3Smrg case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)}; 1517ec681f3Smrg case instr_class::valu64: return {8, WAIT_USE(valu, 8)}; 1527ec681f3Smrg case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)}; 1537ec681f3Smrg case instr_class::valu_fma: 1547ec681f3Smrg return program->dev.has_fast_fma32 ? 
perf_info{4, WAIT_USE(valu, 4)} 1557ec681f3Smrg : perf_info{16, WAIT_USE(valu, 16)}; 1567ec681f3Smrg case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)}; 1577ec681f3Smrg case instr_class::valu_double: return {64, WAIT_USE(valu, 64)}; 1587ec681f3Smrg case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)}; 1597ec681f3Smrg case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)}; 1607ec681f3Smrg case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)}; 1617ec681f3Smrg case instr_class::salu: return {4, WAIT_USE(scalar, 4)}; 1627ec681f3Smrg case instr_class::smem: return {4, WAIT_USE(scalar, 4)}; 1637ec681f3Smrg case instr_class::branch: 1647ec681f3Smrg return {8, WAIT_USE(branch_sendmsg, 8)}; 1657ec681f3Smrg return {4, WAIT_USE(branch_sendmsg, 4)}; 1667ec681f3Smrg case instr_class::ds: 1677ec681f3Smrg return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)} 1687ec681f3Smrg : perf_info{4, WAIT_USE(lds, 4)}; 1697ec681f3Smrg case instr_class::exp: return {16, WAIT_USE(export_gds, 16)}; 1707ec681f3Smrg case instr_class::vmem: return {4, WAIT_USE(vmem, 4)}; 1717ec681f3Smrg case instr_class::barrier: 1727ec681f3Smrg case instr_class::waitcnt: 1737ec681f3Smrg case instr_class::other: 1747ec681f3Smrg default: return {4}; 1757ec681f3Smrg } 1767ec681f3Smrg } 1777ec681f3Smrg 1787ec681f3Smrg#undef WAIT_USE 1797ec681f3Smrg#undef WAIT 1807ec681f3Smrg} 1817ec681f3Smrg 1827ec681f3Smrgvoid 1837ec681f3SmrgBlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr) 1847ec681f3Smrg{ 1857ec681f3Smrg perf_info perf = get_perf_info(program, instr); 1867ec681f3Smrg 1877ec681f3Smrg if (perf.rsrc0 != resource_count) { 1887ec681f3Smrg res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0; 1897ec681f3Smrg res_usage[(int)perf.rsrc0] += perf.cost0; 1907ec681f3Smrg } 1917ec681f3Smrg 1927ec681f3Smrg if (perf.rsrc1 != resource_count) { 1937ec681f3Smrg res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1; 
1947ec681f3Smrg res_usage[(int)perf.rsrc1] += perf.cost1; 1957ec681f3Smrg } 1967ec681f3Smrg} 1977ec681f3Smrg 1987ec681f3Smrgint32_t 1997ec681f3SmrgBlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr) 2007ec681f3Smrg{ 2017ec681f3Smrg perf_info perf = get_perf_info(program, instr); 2027ec681f3Smrg 2037ec681f3Smrg int32_t cost = 0; 2047ec681f3Smrg if (perf.rsrc0 != resource_count) 2057ec681f3Smrg cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle); 2067ec681f3Smrg if (perf.rsrc1 != resource_count) 2077ec681f3Smrg cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle); 2087ec681f3Smrg 2097ec681f3Smrg return cost; 2107ec681f3Smrg} 2117ec681f3Smrg 2127ec681f3Smrgstatic wait_counter_info 2137ec681f3Smrgget_wait_counter_info(aco_ptr<Instruction>& instr) 2147ec681f3Smrg{ 2157ec681f3Smrg /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance 2167ec681f3Smrg * depends a lot on the situation. */ 2177ec681f3Smrg 2187ec681f3Smrg if (instr->isEXP()) 2197ec681f3Smrg return wait_counter_info(0, 16, 0, 0); 2207ec681f3Smrg 2217ec681f3Smrg if (instr->isFlatLike()) { 2227ec681f3Smrg unsigned lgkm = instr->isFlat() ? 20 : 0; 2237ec681f3Smrg if (!instr->definitions.empty()) 2247ec681f3Smrg return wait_counter_info(230, 0, lgkm, 0); 2257ec681f3Smrg else 2267ec681f3Smrg return wait_counter_info(0, 0, lgkm, 230); 2277ec681f3Smrg } 2287ec681f3Smrg 2297ec681f3Smrg if (instr->isSMEM()) { 2307ec681f3Smrg if (instr->definitions.empty()) 2317ec681f3Smrg return wait_counter_info(0, 0, 200, 0); 2327ec681f3Smrg if (instr->operands.empty()) /* s_memtime and s_memrealtime */ 2337ec681f3Smrg return wait_counter_info(0, 0, 1, 0); 2347ec681f3Smrg 2357ec681f3Smrg bool likely_desc_load = instr->operands[0].size() == 2; 2367ec681f3Smrg bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 
3 : 4); 2377ec681f3Smrg bool const_offset = 2387ec681f3Smrg instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant()); 2397ec681f3Smrg 2407ec681f3Smrg if (likely_desc_load || const_offset) 2417ec681f3Smrg return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */ 2427ec681f3Smrg 2437ec681f3Smrg return wait_counter_info(0, 0, 200, 0); 2447ec681f3Smrg } 2457ec681f3Smrg 2467ec681f3Smrg if (instr->format == Format::DS) 2477ec681f3Smrg return wait_counter_info(0, 0, 20, 0); 2487ec681f3Smrg 2497ec681f3Smrg if (instr->isVMEM() && !instr->definitions.empty()) 2507ec681f3Smrg return wait_counter_info(320, 0, 0, 0); 2517ec681f3Smrg 2527ec681f3Smrg if (instr->isVMEM() && instr->definitions.empty()) 2537ec681f3Smrg return wait_counter_info(0, 0, 0, 320); 2547ec681f3Smrg 2557ec681f3Smrg return wait_counter_info(0, 0, 0, 0); 2567ec681f3Smrg} 2577ec681f3Smrg 2587ec681f3Smrgstatic wait_imm 2597ec681f3Smrgget_wait_imm(Program* program, aco_ptr<Instruction>& instr) 2607ec681f3Smrg{ 2617ec681f3Smrg if (instr->opcode == aco_opcode::s_endpgm) { 2627ec681f3Smrg return wait_imm(0, 0, 0, 0); 2637ec681f3Smrg } else if (instr->opcode == aco_opcode::s_waitcnt) { 2647ec681f3Smrg return wait_imm(GFX10_3, instr->sopp().imm); 2657ec681f3Smrg } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) { 2667ec681f3Smrg return wait_imm(0, 0, 0, instr->sopk().imm); 2677ec681f3Smrg } else { 2687ec681f3Smrg unsigned max_lgkm_cnt = program->chip_class >= GFX10 ? 62 : 14; 2697ec681f3Smrg unsigned max_exp_cnt = 6; 2707ec681f3Smrg unsigned max_vm_cnt = program->chip_class >= GFX9 ? 62 : 14; 2717ec681f3Smrg unsigned max_vs_cnt = 62; 2727ec681f3Smrg 2737ec681f3Smrg wait_counter_info wait_info = get_wait_counter_info(instr); 2747ec681f3Smrg wait_imm imm; 2757ec681f3Smrg imm.lgkm = wait_info.lgkm ? max_lgkm_cnt : wait_imm::unset_counter; 2767ec681f3Smrg imm.exp = wait_info.exp ? max_exp_cnt : wait_imm::unset_counter; 2777ec681f3Smrg imm.vm = wait_info.vm ? 
max_vm_cnt : wait_imm::unset_counter; 2787ec681f3Smrg imm.vs = wait_info.vs ? max_vs_cnt : wait_imm::unset_counter; 2797ec681f3Smrg return imm; 2807ec681f3Smrg } 2817ec681f3Smrg} 2827ec681f3Smrg 2837ec681f3Smrgunsigned 2847ec681f3SmrgBlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr) 2857ec681f3Smrg{ 2867ec681f3Smrg int deps_available = cur_cycle; 2877ec681f3Smrg 2887ec681f3Smrg wait_imm imm = get_wait_imm(program, instr); 2897ec681f3Smrg if (imm.vm != wait_imm::unset_counter) { 2907ec681f3Smrg for (int i = 0; i < (int)vm.size() - imm.vm; i++) 2917ec681f3Smrg deps_available = MAX2(deps_available, vm[i]); 2927ec681f3Smrg } 2937ec681f3Smrg if (imm.exp != wait_imm::unset_counter) { 2947ec681f3Smrg for (int i = 0; i < (int)exp.size() - imm.exp; i++) 2957ec681f3Smrg deps_available = MAX2(deps_available, exp[i]); 2967ec681f3Smrg } 2977ec681f3Smrg if (imm.lgkm != wait_imm::unset_counter) { 2987ec681f3Smrg for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++) 2997ec681f3Smrg deps_available = MAX2(deps_available, lgkm[i]); 3007ec681f3Smrg } 3017ec681f3Smrg if (imm.vs != wait_imm::unset_counter) { 3027ec681f3Smrg for (int i = 0; i < (int)vs.size() - imm.vs; i++) 3037ec681f3Smrg deps_available = MAX2(deps_available, vs[i]); 3047ec681f3Smrg } 3057ec681f3Smrg 3067ec681f3Smrg if (instr->opcode == aco_opcode::s_endpgm) { 3077ec681f3Smrg for (unsigned i = 0; i < 512; i++) 3087ec681f3Smrg deps_available = MAX2(deps_available, reg_available[i]); 3097ec681f3Smrg } else if (program->chip_class >= GFX10) { 3107ec681f3Smrg for (Operand& op : instr->operands) { 3117ec681f3Smrg if (op.isConstant() || op.isUndefined()) 3127ec681f3Smrg continue; 3137ec681f3Smrg for (unsigned i = 0; i < op.size(); i++) 3147ec681f3Smrg deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]); 3157ec681f3Smrg } 3167ec681f3Smrg } 3177ec681f3Smrg 3187ec681f3Smrg if (program->chip_class < GFX10) 3197ec681f3Smrg deps_available = align(deps_available, 4); 3207ec681f3Smrg 
3217ec681f3Smrg return deps_available - cur_cycle; 3227ec681f3Smrg} 3237ec681f3Smrg 3247ec681f3Smrgunsigned 3257ec681f3SmrgBlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr) 3267ec681f3Smrg{ 3277ec681f3Smrg int32_t dep = get_dependency_cost(instr); 3287ec681f3Smrg return dep + std::max(cycles_until_res_available(instr) - dep, 0); 3297ec681f3Smrg} 3307ec681f3Smrg 3317ec681f3Smrgstatic bool 3327ec681f3Smrgis_vector(aco_opcode op) 3337ec681f3Smrg{ 3347ec681f3Smrg switch (instr_info.classes[(int)op]) { 3357ec681f3Smrg case instr_class::valu32: 3367ec681f3Smrg case instr_class::valu_convert32: 3377ec681f3Smrg case instr_class::valu_fma: 3387ec681f3Smrg case instr_class::valu_double: 3397ec681f3Smrg case instr_class::valu_double_add: 3407ec681f3Smrg case instr_class::valu_double_convert: 3417ec681f3Smrg case instr_class::valu_double_transcendental: 3427ec681f3Smrg case instr_class::vmem: 3437ec681f3Smrg case instr_class::ds: 3447ec681f3Smrg case instr_class::exp: 3457ec681f3Smrg case instr_class::valu64: 3467ec681f3Smrg case instr_class::valu_quarter_rate32: 3477ec681f3Smrg case instr_class::valu_transcendental32: return true; 3487ec681f3Smrg default: return false; 3497ec681f3Smrg } 3507ec681f3Smrg} 3517ec681f3Smrg 3527ec681f3Smrgvoid 3537ec681f3SmrgBlockCycleEstimator::add(aco_ptr<Instruction>& instr) 3547ec681f3Smrg{ 3557ec681f3Smrg perf_info perf = get_perf_info(program, instr); 3567ec681f3Smrg 3577ec681f3Smrg cur_cycle += get_dependency_cost(instr); 3587ec681f3Smrg 3597ec681f3Smrg unsigned start; 3607ec681f3Smrg bool dual_issue = program->chip_class >= GFX10 && program->wave_size == 64 && 3617ec681f3Smrg is_vector(instr->opcode) && program->workgroup_size > 32; 3627ec681f3Smrg for (unsigned i = 0; i < (dual_issue ? 
2 : 1); i++) { 3637ec681f3Smrg cur_cycle += cycles_until_res_available(instr); 3647ec681f3Smrg 3657ec681f3Smrg start = cur_cycle; 3667ec681f3Smrg use_resources(instr); 3677ec681f3Smrg 3687ec681f3Smrg /* GCN is in-order and doesn't begin the next instruction until the current one finishes */ 3697ec681f3Smrg cur_cycle += program->chip_class >= GFX10 ? 1 : perf.latency; 3707ec681f3Smrg } 3717ec681f3Smrg 3727ec681f3Smrg wait_imm imm = get_wait_imm(program, instr); 3737ec681f3Smrg while (lgkm.size() > imm.lgkm) 3747ec681f3Smrg lgkm.pop_front(); 3757ec681f3Smrg while (exp.size() > imm.exp) 3767ec681f3Smrg exp.pop_front(); 3777ec681f3Smrg while (vm.size() > imm.vm) 3787ec681f3Smrg vm.pop_front(); 3797ec681f3Smrg while (vs.size() > imm.vs) 3807ec681f3Smrg vs.pop_front(); 3817ec681f3Smrg 3827ec681f3Smrg wait_counter_info wait_info = get_wait_counter_info(instr); 3837ec681f3Smrg if (wait_info.exp) 3847ec681f3Smrg exp.push_back(cur_cycle + wait_info.exp); 3857ec681f3Smrg if (wait_info.lgkm) 3867ec681f3Smrg lgkm.push_back(cur_cycle + wait_info.lgkm); 3877ec681f3Smrg if (wait_info.vm) 3887ec681f3Smrg vm.push_back(cur_cycle + wait_info.vm); 3897ec681f3Smrg if (wait_info.vs) 3907ec681f3Smrg vs.push_back(cur_cycle + wait_info.vs); 3917ec681f3Smrg 3927ec681f3Smrg /* This is inaccurate but shouldn't affect anything after waitcnt insertion. 3937ec681f3Smrg * Before waitcnt insertion, this is necessary to consider memory operations. 
3947ec681f3Smrg */ 3957ec681f3Smrg int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm); 3967ec681f3Smrg int32_t result_available = start + MAX2(perf.latency, latency); 3977ec681f3Smrg 3987ec681f3Smrg for (Definition& def : instr->definitions) { 3997ec681f3Smrg int32_t* available = ®_available[def.physReg().reg()]; 4007ec681f3Smrg for (unsigned i = 0; i < def.size(); i++) 4017ec681f3Smrg available[i] = MAX2(available[i], result_available); 4027ec681f3Smrg } 4037ec681f3Smrg} 4047ec681f3Smrg 4057ec681f3Smrgstatic void 4067ec681f3Smrgjoin_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff) 4077ec681f3Smrg{ 4087ec681f3Smrg for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++) 4097ec681f3Smrg queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff); 4107ec681f3Smrg for (int i = pred.size() - queue.size() - 1; i >= 0; i--) 4117ec681f3Smrg queue.push_front(pred[i] + cycle_diff); 4127ec681f3Smrg} 4137ec681f3Smrg 4147ec681f3Smrgvoid 4157ec681f3SmrgBlockCycleEstimator::join(const BlockCycleEstimator& pred) 4167ec681f3Smrg{ 4177ec681f3Smrg assert(cur_cycle == 0); 4187ec681f3Smrg 4197ec681f3Smrg for (unsigned i = 0; i < (unsigned)resource_count; i++) { 4207ec681f3Smrg assert(res_usage[i] == 0); 4217ec681f3Smrg res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle); 4227ec681f3Smrg } 4237ec681f3Smrg 4247ec681f3Smrg for (unsigned i = 0; i < 512; i++) 4257ec681f3Smrg reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle); 4267ec681f3Smrg 4277ec681f3Smrg join_queue(lgkm, pred.lgkm, -pred.cur_cycle); 4287ec681f3Smrg join_queue(exp, pred.exp, -pred.cur_cycle); 4297ec681f3Smrg join_queue(vm, pred.vm, -pred.cur_cycle); 4307ec681f3Smrg join_queue(vs, pred.vs, -pred.cur_cycle); 4317ec681f3Smrg} 4327ec681f3Smrg 4337ec681f3Smrg/* instructions/branches/vmem_clauses/smem_clauses/cycles */ 4347ec681f3Smrgvoid 4357ec681f3Smrgcollect_preasm_stats(Program* 
program) 4367ec681f3Smrg{ 4377ec681f3Smrg for (Block& block : program->blocks) { 4387ec681f3Smrg std::set<Instruction*> vmem_clause; 4397ec681f3Smrg std::set<Instruction*> smem_clause; 4407ec681f3Smrg 4417ec681f3Smrg program->statistics[statistic_instructions] += block.instructions.size(); 4427ec681f3Smrg 4437ec681f3Smrg for (aco_ptr<Instruction>& instr : block.instructions) { 4447ec681f3Smrg if (instr->isSOPP() && instr->sopp().block != -1) 4457ec681f3Smrg program->statistics[statistic_branches]++; 4467ec681f3Smrg 4477ec681f3Smrg if (instr->opcode == aco_opcode::p_constaddr) 4487ec681f3Smrg program->statistics[statistic_instructions] += 2; 4497ec681f3Smrg 4507ec681f3Smrg if (instr->isVMEM() && !instr->operands.empty()) { 4517ec681f3Smrg if (std::none_of(vmem_clause.begin(), vmem_clause.end(), 4527ec681f3Smrg [&](Instruction* other) 4537ec681f3Smrg { return should_form_clause(instr.get(), other); })) 4547ec681f3Smrg program->statistics[statistic_vmem_clauses]++; 4557ec681f3Smrg vmem_clause.insert(instr.get()); 4567ec681f3Smrg } else { 4577ec681f3Smrg vmem_clause.clear(); 4587ec681f3Smrg } 4597ec681f3Smrg 4607ec681f3Smrg if (instr->isSMEM() && !instr->operands.empty()) { 4617ec681f3Smrg if (std::none_of(smem_clause.begin(), smem_clause.end(), 4627ec681f3Smrg [&](Instruction* other) 4637ec681f3Smrg { return should_form_clause(instr.get(), other); })) 4647ec681f3Smrg program->statistics[statistic_smem_clauses]++; 4657ec681f3Smrg smem_clause.insert(instr.get()); 4667ec681f3Smrg } else { 4677ec681f3Smrg smem_clause.clear(); 4687ec681f3Smrg } 4697ec681f3Smrg } 4707ec681f3Smrg } 4717ec681f3Smrg 4727ec681f3Smrg double latency = 0; 4737ec681f3Smrg double usage[(int)BlockCycleEstimator::resource_count] = {0}; 4747ec681f3Smrg std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program); 4757ec681f3Smrg 4767ec681f3Smrg if (program->stage.has(SWStage::VS) && program->info->vs.has_prolog) { 4777ec681f3Smrg unsigned vs_input_latency = 320; 4787ec681f3Smrg for 
(Definition def : program->vs_inputs) { 4797ec681f3Smrg blocks[0].vm.push_back(vs_input_latency); 4807ec681f3Smrg for (unsigned i = 0; i < def.size(); i++) 4817ec681f3Smrg blocks[0].reg_available[def.physReg().reg() + i] = vs_input_latency; 4827ec681f3Smrg } 4837ec681f3Smrg } 4847ec681f3Smrg 4857ec681f3Smrg for (Block& block : program->blocks) { 4867ec681f3Smrg BlockCycleEstimator& block_est = blocks[block.index]; 4877ec681f3Smrg for (unsigned pred : block.linear_preds) 4887ec681f3Smrg block_est.join(blocks[pred]); 4897ec681f3Smrg 4907ec681f3Smrg for (aco_ptr<Instruction>& instr : block.instructions) { 4917ec681f3Smrg unsigned before = block_est.cur_cycle; 4927ec681f3Smrg block_est.add(instr); 4937ec681f3Smrg instr->pass_flags = block_est.cur_cycle - before; 4947ec681f3Smrg } 4957ec681f3Smrg 4967ec681f3Smrg /* TODO: it would be nice to be able to consider estimated loop trip 4977ec681f3Smrg * counts used for loop unrolling. 4987ec681f3Smrg */ 4997ec681f3Smrg 5007ec681f3Smrg /* TODO: estimate the trip_count of divergent loops (those which break 5017ec681f3Smrg * divergent) higher than of uniform loops 5027ec681f3Smrg */ 5037ec681f3Smrg 5047ec681f3Smrg /* Assume loops execute 8-2 times, uniform branches are taken 50% the time, 5057ec681f3Smrg * and any lane in the wave takes a side of a divergent branch 75% of the 5067ec681f3Smrg * time. 5077ec681f3Smrg */ 5087ec681f3Smrg double iter = 1.0; 5097ec681f3Smrg iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0; 5107ec681f3Smrg iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0; 5117ec681f3Smrg iter *= block.loop_nest_depth > 2 ? 
pow(2.0, block.loop_nest_depth - 2) : 1.0; 5127ec681f3Smrg iter *= pow(0.5, block.uniform_if_depth); 5137ec681f3Smrg iter *= pow(0.75, block.divergent_if_logical_depth); 5147ec681f3Smrg 5157ec681f3Smrg bool divergent_if_linear_else = 5167ec681f3Smrg block.logical_preds.empty() && block.linear_preds.size() == 1 && 5177ec681f3Smrg block.linear_succs.size() == 1 && 5187ec681f3Smrg program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert); 5197ec681f3Smrg if (divergent_if_linear_else) 5207ec681f3Smrg iter *= 0.25; 5217ec681f3Smrg 5227ec681f3Smrg latency += block_est.cur_cycle * iter; 5237ec681f3Smrg for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) 5247ec681f3Smrg usage[i] += block_est.res_usage[i] * iter; 5257ec681f3Smrg } 5267ec681f3Smrg 5277ec681f3Smrg /* This likely exaggerates the effectiveness of parallelism because it 5287ec681f3Smrg * ignores instruction ordering. It can assume there might be SALU/VALU/etc 5297ec681f3Smrg * work to from other waves while one is idle but that might not be the case 5307ec681f3Smrg * because those other waves have not reached such a point yet. 
5317ec681f3Smrg */ 5327ec681f3Smrg 5337ec681f3Smrg double parallelism = program->num_waves; 5347ec681f3Smrg for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) { 5357ec681f3Smrg if (usage[i] > 0.0) 5367ec681f3Smrg parallelism = MIN2(parallelism, latency / usage[i]); 5377ec681f3Smrg } 5387ec681f3Smrg double waves_per_cycle = 1.0 / latency * parallelism; 5397ec681f3Smrg double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0); 5407ec681f3Smrg 5417ec681f3Smrg double max_utilization = 1.0; 5427ec681f3Smrg if (program->workgroup_size != UINT_MAX) 5437ec681f3Smrg max_utilization = 5447ec681f3Smrg program->workgroup_size / (double)align(program->workgroup_size, program->wave_size); 5457ec681f3Smrg wave64_per_cycle *= max_utilization; 5467ec681f3Smrg 5477ec681f3Smrg program->statistics[statistic_latency] = round(latency); 5487ec681f3Smrg program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle); 5497ec681f3Smrg 5507ec681f3Smrg if (debug_flags & DEBUG_PERF_INFO) { 5517ec681f3Smrg aco_print_program(program, stderr, print_no_ssa | print_perf_info); 5527ec681f3Smrg 5537ec681f3Smrg fprintf(stderr, "num_waves: %u\n", program->num_waves); 5547ec681f3Smrg fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]); 5557ec681f3Smrg fprintf(stderr, "branch_sendmsg_usage: %f\n", 5567ec681f3Smrg usage[(int)BlockCycleEstimator::branch_sendmsg]); 5577ec681f3Smrg fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]); 5587ec681f3Smrg fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]); 5597ec681f3Smrg fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]); 5607ec681f3Smrg fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]); 5617ec681f3Smrg fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]); 5627ec681f3Smrg fprintf(stderr, "latency: %f\n", latency); 5637ec681f3Smrg 
fprintf(stderr, "parallelism: %f\n", parallelism); 5647ec681f3Smrg fprintf(stderr, "max_utilization: %f\n", max_utilization); 5657ec681f3Smrg fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle); 5667ec681f3Smrg fprintf(stderr, "\n"); 5677ec681f3Smrg } 5687ec681f3Smrg} 5697ec681f3Smrg 5707ec681f3Smrgvoid 5717ec681f3Smrgcollect_postasm_stats(Program* program, const std::vector<uint32_t>& code) 5727ec681f3Smrg{ 5737ec681f3Smrg program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4); 5747ec681f3Smrg} 5757ec681f3Smrg 5767ec681f3Smrg} // namespace aco 577