17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2020 Valve Corporation
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
217ec681f3Smrg * IN THE SOFTWARE.
227ec681f3Smrg *
237ec681f3Smrg */
247ec681f3Smrg
257ec681f3Smrg#include "aco_ir.h"
267ec681f3Smrg
277ec681f3Smrg#include "util/crc32.h"
287ec681f3Smrg
297ec681f3Smrg#include <algorithm>
307ec681f3Smrg#include <deque>
317ec681f3Smrg#include <set>
327ec681f3Smrg#include <vector>
337ec681f3Smrg
347ec681f3Smrgnamespace aco {
357ec681f3Smrg
367ec681f3Smrg/* sgpr_presched/vgpr_presched */
377ec681f3Smrgvoid
387ec681f3Smrgcollect_presched_stats(Program* program)
397ec681f3Smrg{
407ec681f3Smrg   RegisterDemand presched_demand;
417ec681f3Smrg   for (Block& block : program->blocks)
427ec681f3Smrg      presched_demand.update(block.register_demand);
437ec681f3Smrg   program->statistics[statistic_sgpr_presched] = presched_demand.sgpr;
447ec681f3Smrg   program->statistics[statistic_vgpr_presched] = presched_demand.vgpr;
457ec681f3Smrg}
467ec681f3Smrg
477ec681f3Smrgclass BlockCycleEstimator {
487ec681f3Smrgpublic:
497ec681f3Smrg   enum resource {
507ec681f3Smrg      null = 0,
517ec681f3Smrg      scalar,
527ec681f3Smrg      branch_sendmsg,
537ec681f3Smrg      valu,
547ec681f3Smrg      valu_complex,
557ec681f3Smrg      lds,
567ec681f3Smrg      export_gds,
577ec681f3Smrg      vmem,
587ec681f3Smrg      resource_count,
597ec681f3Smrg   };
607ec681f3Smrg
617ec681f3Smrg   BlockCycleEstimator(Program* program_) : program(program_) {}
627ec681f3Smrg
637ec681f3Smrg   Program* program;
647ec681f3Smrg
657ec681f3Smrg   int32_t cur_cycle = 0;
667ec681f3Smrg   int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
677ec681f3Smrg   unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};
687ec681f3Smrg   int32_t reg_available[512] = {0};
697ec681f3Smrg   std::deque<int32_t> lgkm;
707ec681f3Smrg   std::deque<int32_t> exp;
717ec681f3Smrg   std::deque<int32_t> vm;
727ec681f3Smrg   std::deque<int32_t> vs;
737ec681f3Smrg
747ec681f3Smrg   unsigned predict_cost(aco_ptr<Instruction>& instr);
757ec681f3Smrg   void add(aco_ptr<Instruction>& instr);
767ec681f3Smrg   void join(const BlockCycleEstimator& other);
777ec681f3Smrg
787ec681f3Smrgprivate:
797ec681f3Smrg   unsigned get_waitcnt_cost(wait_imm imm);
807ec681f3Smrg   unsigned get_dependency_cost(aco_ptr<Instruction>& instr);
817ec681f3Smrg
827ec681f3Smrg   void use_resources(aco_ptr<Instruction>& instr);
837ec681f3Smrg   int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);
847ec681f3Smrg};
857ec681f3Smrg
867ec681f3Smrgstruct wait_counter_info {
877ec681f3Smrg   wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_)
887ec681f3Smrg       : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
897ec681f3Smrg   {}
907ec681f3Smrg
917ec681f3Smrg   unsigned vm;
927ec681f3Smrg   unsigned exp;
937ec681f3Smrg   unsigned lgkm;
947ec681f3Smrg   unsigned vs;
957ec681f3Smrg};
967ec681f3Smrg
977ec681f3Smrgstruct perf_info {
987ec681f3Smrg   int latency;
997ec681f3Smrg
1007ec681f3Smrg   BlockCycleEstimator::resource rsrc0;
1017ec681f3Smrg   unsigned cost0;
1027ec681f3Smrg
1037ec681f3Smrg   BlockCycleEstimator::resource rsrc1;
1047ec681f3Smrg   unsigned cost1;
1057ec681f3Smrg};
1067ec681f3Smrg
1077ec681f3Smrgstatic perf_info
1087ec681f3Smrgget_perf_info(Program* program, aco_ptr<Instruction>& instr)
1097ec681f3Smrg{
1107ec681f3Smrg   instr_class cls = instr_info.classes[(int)instr->opcode];
1117ec681f3Smrg
1127ec681f3Smrg#define WAIT(res)          BlockCycleEstimator::res, 0
1137ec681f3Smrg#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt
1147ec681f3Smrg
1157ec681f3Smrg   if (program->chip_class >= GFX10) {
1167ec681f3Smrg      /* fp64 might be incorrect */
1177ec681f3Smrg      switch (cls) {
1187ec681f3Smrg      case instr_class::valu32:
1197ec681f3Smrg      case instr_class::valu_convert32:
1207ec681f3Smrg      case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
1217ec681f3Smrg      case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
1227ec681f3Smrg      case instr_class::valu_quarter_rate32:
1237ec681f3Smrg         return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
1247ec681f3Smrg      case instr_class::valu_transcendental32:
1257ec681f3Smrg         return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
1267ec681f3Smrg      case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
1277ec681f3Smrg      case instr_class::valu_double_add:
1287ec681f3Smrg         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
1297ec681f3Smrg      case instr_class::valu_double_convert:
1307ec681f3Smrg         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
1317ec681f3Smrg      case instr_class::valu_double_transcendental:
1327ec681f3Smrg         return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
1337ec681f3Smrg      case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
1347ec681f3Smrg      case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
1357ec681f3Smrg      case instr_class::branch:
1367ec681f3Smrg      case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};
1377ec681f3Smrg      case instr_class::ds:
1387ec681f3Smrg         return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
1397ec681f3Smrg                                : perf_info{0, WAIT_USE(lds, 1)};
1407ec681f3Smrg      case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
1417ec681f3Smrg      case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
1427ec681f3Smrg      case instr_class::barrier:
1437ec681f3Smrg      case instr_class::waitcnt:
1447ec681f3Smrg      case instr_class::other:
1457ec681f3Smrg      default: return {0};
1467ec681f3Smrg      }
1477ec681f3Smrg   } else {
1487ec681f3Smrg      switch (cls) {
1497ec681f3Smrg      case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
1507ec681f3Smrg      case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
1517ec681f3Smrg      case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
1527ec681f3Smrg      case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
1537ec681f3Smrg      case instr_class::valu_fma:
1547ec681f3Smrg         return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
1557ec681f3Smrg                                            : perf_info{16, WAIT_USE(valu, 16)};
1567ec681f3Smrg      case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
1577ec681f3Smrg      case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
1587ec681f3Smrg      case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
1597ec681f3Smrg      case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
1607ec681f3Smrg      case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
1617ec681f3Smrg      case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
1627ec681f3Smrg      case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
1637ec681f3Smrg      case instr_class::branch:
1647ec681f3Smrg         return {8, WAIT_USE(branch_sendmsg, 8)};
1657ec681f3Smrg         return {4, WAIT_USE(branch_sendmsg, 4)};
1667ec681f3Smrg      case instr_class::ds:
1677ec681f3Smrg         return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
1687ec681f3Smrg                                : perf_info{4, WAIT_USE(lds, 4)};
1697ec681f3Smrg      case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
1707ec681f3Smrg      case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
1717ec681f3Smrg      case instr_class::barrier:
1727ec681f3Smrg      case instr_class::waitcnt:
1737ec681f3Smrg      case instr_class::other:
1747ec681f3Smrg      default: return {4};
1757ec681f3Smrg      }
1767ec681f3Smrg   }
1777ec681f3Smrg
1787ec681f3Smrg#undef WAIT_USE
1797ec681f3Smrg#undef WAIT
1807ec681f3Smrg}
1817ec681f3Smrg
1827ec681f3Smrgvoid
1837ec681f3SmrgBlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
1847ec681f3Smrg{
1857ec681f3Smrg   perf_info perf = get_perf_info(program, instr);
1867ec681f3Smrg
1877ec681f3Smrg   if (perf.rsrc0 != resource_count) {
1887ec681f3Smrg      res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
1897ec681f3Smrg      res_usage[(int)perf.rsrc0] += perf.cost0;
1907ec681f3Smrg   }
1917ec681f3Smrg
1927ec681f3Smrg   if (perf.rsrc1 != resource_count) {
1937ec681f3Smrg      res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;
1947ec681f3Smrg      res_usage[(int)perf.rsrc1] += perf.cost1;
1957ec681f3Smrg   }
1967ec681f3Smrg}
1977ec681f3Smrg
1987ec681f3Smrgint32_t
1997ec681f3SmrgBlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
2007ec681f3Smrg{
2017ec681f3Smrg   perf_info perf = get_perf_info(program, instr);
2027ec681f3Smrg
2037ec681f3Smrg   int32_t cost = 0;
2047ec681f3Smrg   if (perf.rsrc0 != resource_count)
2057ec681f3Smrg      cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);
2067ec681f3Smrg   if (perf.rsrc1 != resource_count)
2077ec681f3Smrg      cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);
2087ec681f3Smrg
2097ec681f3Smrg   return cost;
2107ec681f3Smrg}
2117ec681f3Smrg
2127ec681f3Smrgstatic wait_counter_info
2137ec681f3Smrgget_wait_counter_info(aco_ptr<Instruction>& instr)
2147ec681f3Smrg{
2157ec681f3Smrg   /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
2167ec681f3Smrg    * depends a lot on the situation. */
2177ec681f3Smrg
2187ec681f3Smrg   if (instr->isEXP())
2197ec681f3Smrg      return wait_counter_info(0, 16, 0, 0);
2207ec681f3Smrg
2217ec681f3Smrg   if (instr->isFlatLike()) {
2227ec681f3Smrg      unsigned lgkm = instr->isFlat() ? 20 : 0;
2237ec681f3Smrg      if (!instr->definitions.empty())
2247ec681f3Smrg         return wait_counter_info(230, 0, lgkm, 0);
2257ec681f3Smrg      else
2267ec681f3Smrg         return wait_counter_info(0, 0, lgkm, 230);
2277ec681f3Smrg   }
2287ec681f3Smrg
2297ec681f3Smrg   if (instr->isSMEM()) {
2307ec681f3Smrg      if (instr->definitions.empty())
2317ec681f3Smrg         return wait_counter_info(0, 0, 200, 0);
2327ec681f3Smrg      if (instr->operands.empty()) /* s_memtime and s_memrealtime */
2337ec681f3Smrg         return wait_counter_info(0, 0, 1, 0);
2347ec681f3Smrg
2357ec681f3Smrg      bool likely_desc_load = instr->operands[0].size() == 2;
2367ec681f3Smrg      bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
2377ec681f3Smrg      bool const_offset =
2387ec681f3Smrg         instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());
2397ec681f3Smrg
2407ec681f3Smrg      if (likely_desc_load || const_offset)
2417ec681f3Smrg         return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */
2427ec681f3Smrg
2437ec681f3Smrg      return wait_counter_info(0, 0, 200, 0);
2447ec681f3Smrg   }
2457ec681f3Smrg
2467ec681f3Smrg   if (instr->format == Format::DS)
2477ec681f3Smrg      return wait_counter_info(0, 0, 20, 0);
2487ec681f3Smrg
2497ec681f3Smrg   if (instr->isVMEM() && !instr->definitions.empty())
2507ec681f3Smrg      return wait_counter_info(320, 0, 0, 0);
2517ec681f3Smrg
2527ec681f3Smrg   if (instr->isVMEM() && instr->definitions.empty())
2537ec681f3Smrg      return wait_counter_info(0, 0, 0, 320);
2547ec681f3Smrg
2557ec681f3Smrg   return wait_counter_info(0, 0, 0, 0);
2567ec681f3Smrg}
2577ec681f3Smrg
2587ec681f3Smrgstatic wait_imm
2597ec681f3Smrgget_wait_imm(Program* program, aco_ptr<Instruction>& instr)
2607ec681f3Smrg{
2617ec681f3Smrg   if (instr->opcode == aco_opcode::s_endpgm) {
2627ec681f3Smrg      return wait_imm(0, 0, 0, 0);
2637ec681f3Smrg   } else if (instr->opcode == aco_opcode::s_waitcnt) {
2647ec681f3Smrg      return wait_imm(GFX10_3, instr->sopp().imm);
2657ec681f3Smrg   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
2667ec681f3Smrg      return wait_imm(0, 0, 0, instr->sopk().imm);
2677ec681f3Smrg   } else {
2687ec681f3Smrg      unsigned max_lgkm_cnt = program->chip_class >= GFX10 ? 62 : 14;
2697ec681f3Smrg      unsigned max_exp_cnt = 6;
2707ec681f3Smrg      unsigned max_vm_cnt = program->chip_class >= GFX9 ? 62 : 14;
2717ec681f3Smrg      unsigned max_vs_cnt = 62;
2727ec681f3Smrg
2737ec681f3Smrg      wait_counter_info wait_info = get_wait_counter_info(instr);
2747ec681f3Smrg      wait_imm imm;
2757ec681f3Smrg      imm.lgkm = wait_info.lgkm ? max_lgkm_cnt : wait_imm::unset_counter;
2767ec681f3Smrg      imm.exp = wait_info.exp ? max_exp_cnt : wait_imm::unset_counter;
2777ec681f3Smrg      imm.vm = wait_info.vm ? max_vm_cnt : wait_imm::unset_counter;
2787ec681f3Smrg      imm.vs = wait_info.vs ? max_vs_cnt : wait_imm::unset_counter;
2797ec681f3Smrg      return imm;
2807ec681f3Smrg   }
2817ec681f3Smrg}
2827ec681f3Smrg
2837ec681f3Smrgunsigned
2847ec681f3SmrgBlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
2857ec681f3Smrg{
2867ec681f3Smrg   int deps_available = cur_cycle;
2877ec681f3Smrg
2887ec681f3Smrg   wait_imm imm = get_wait_imm(program, instr);
2897ec681f3Smrg   if (imm.vm != wait_imm::unset_counter) {
2907ec681f3Smrg      for (int i = 0; i < (int)vm.size() - imm.vm; i++)
2917ec681f3Smrg         deps_available = MAX2(deps_available, vm[i]);
2927ec681f3Smrg   }
2937ec681f3Smrg   if (imm.exp != wait_imm::unset_counter) {
2947ec681f3Smrg      for (int i = 0; i < (int)exp.size() - imm.exp; i++)
2957ec681f3Smrg         deps_available = MAX2(deps_available, exp[i]);
2967ec681f3Smrg   }
2977ec681f3Smrg   if (imm.lgkm != wait_imm::unset_counter) {
2987ec681f3Smrg      for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++)
2997ec681f3Smrg         deps_available = MAX2(deps_available, lgkm[i]);
3007ec681f3Smrg   }
3017ec681f3Smrg   if (imm.vs != wait_imm::unset_counter) {
3027ec681f3Smrg      for (int i = 0; i < (int)vs.size() - imm.vs; i++)
3037ec681f3Smrg         deps_available = MAX2(deps_available, vs[i]);
3047ec681f3Smrg   }
3057ec681f3Smrg
3067ec681f3Smrg   if (instr->opcode == aco_opcode::s_endpgm) {
3077ec681f3Smrg      for (unsigned i = 0; i < 512; i++)
3087ec681f3Smrg         deps_available = MAX2(deps_available, reg_available[i]);
3097ec681f3Smrg   } else if (program->chip_class >= GFX10) {
3107ec681f3Smrg      for (Operand& op : instr->operands) {
3117ec681f3Smrg         if (op.isConstant() || op.isUndefined())
3127ec681f3Smrg            continue;
3137ec681f3Smrg         for (unsigned i = 0; i < op.size(); i++)
3147ec681f3Smrg            deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);
3157ec681f3Smrg      }
3167ec681f3Smrg   }
3177ec681f3Smrg
3187ec681f3Smrg   if (program->chip_class < GFX10)
3197ec681f3Smrg      deps_available = align(deps_available, 4);
3207ec681f3Smrg
3217ec681f3Smrg   return deps_available - cur_cycle;
3227ec681f3Smrg}
3237ec681f3Smrg
3247ec681f3Smrgunsigned
3257ec681f3SmrgBlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
3267ec681f3Smrg{
3277ec681f3Smrg   int32_t dep = get_dependency_cost(instr);
3287ec681f3Smrg   return dep + std::max(cycles_until_res_available(instr) - dep, 0);
3297ec681f3Smrg}
3307ec681f3Smrg
3317ec681f3Smrgstatic bool
3327ec681f3Smrgis_vector(aco_opcode op)
3337ec681f3Smrg{
3347ec681f3Smrg   switch (instr_info.classes[(int)op]) {
3357ec681f3Smrg   case instr_class::valu32:
3367ec681f3Smrg   case instr_class::valu_convert32:
3377ec681f3Smrg   case instr_class::valu_fma:
3387ec681f3Smrg   case instr_class::valu_double:
3397ec681f3Smrg   case instr_class::valu_double_add:
3407ec681f3Smrg   case instr_class::valu_double_convert:
3417ec681f3Smrg   case instr_class::valu_double_transcendental:
3427ec681f3Smrg   case instr_class::vmem:
3437ec681f3Smrg   case instr_class::ds:
3447ec681f3Smrg   case instr_class::exp:
3457ec681f3Smrg   case instr_class::valu64:
3467ec681f3Smrg   case instr_class::valu_quarter_rate32:
3477ec681f3Smrg   case instr_class::valu_transcendental32: return true;
3487ec681f3Smrg   default: return false;
3497ec681f3Smrg   }
3507ec681f3Smrg}
3517ec681f3Smrg
3527ec681f3Smrgvoid
3537ec681f3SmrgBlockCycleEstimator::add(aco_ptr<Instruction>& instr)
3547ec681f3Smrg{
3557ec681f3Smrg   perf_info perf = get_perf_info(program, instr);
3567ec681f3Smrg
3577ec681f3Smrg   cur_cycle += get_dependency_cost(instr);
3587ec681f3Smrg
3597ec681f3Smrg   unsigned start;
3607ec681f3Smrg   bool dual_issue = program->chip_class >= GFX10 && program->wave_size == 64 &&
3617ec681f3Smrg                     is_vector(instr->opcode) && program->workgroup_size > 32;
3627ec681f3Smrg   for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
3637ec681f3Smrg      cur_cycle += cycles_until_res_available(instr);
3647ec681f3Smrg
3657ec681f3Smrg      start = cur_cycle;
3667ec681f3Smrg      use_resources(instr);
3677ec681f3Smrg
3687ec681f3Smrg      /* GCN is in-order and doesn't begin the next instruction until the current one finishes */
3697ec681f3Smrg      cur_cycle += program->chip_class >= GFX10 ? 1 : perf.latency;
3707ec681f3Smrg   }
3717ec681f3Smrg
3727ec681f3Smrg   wait_imm imm = get_wait_imm(program, instr);
3737ec681f3Smrg   while (lgkm.size() > imm.lgkm)
3747ec681f3Smrg      lgkm.pop_front();
3757ec681f3Smrg   while (exp.size() > imm.exp)
3767ec681f3Smrg      exp.pop_front();
3777ec681f3Smrg   while (vm.size() > imm.vm)
3787ec681f3Smrg      vm.pop_front();
3797ec681f3Smrg   while (vs.size() > imm.vs)
3807ec681f3Smrg      vs.pop_front();
3817ec681f3Smrg
3827ec681f3Smrg   wait_counter_info wait_info = get_wait_counter_info(instr);
3837ec681f3Smrg   if (wait_info.exp)
3847ec681f3Smrg      exp.push_back(cur_cycle + wait_info.exp);
3857ec681f3Smrg   if (wait_info.lgkm)
3867ec681f3Smrg      lgkm.push_back(cur_cycle + wait_info.lgkm);
3877ec681f3Smrg   if (wait_info.vm)
3887ec681f3Smrg      vm.push_back(cur_cycle + wait_info.vm);
3897ec681f3Smrg   if (wait_info.vs)
3907ec681f3Smrg      vs.push_back(cur_cycle + wait_info.vs);
3917ec681f3Smrg
3927ec681f3Smrg   /* This is inaccurate but shouldn't affect anything after waitcnt insertion.
3937ec681f3Smrg    * Before waitcnt insertion, this is necessary to consider memory operations.
3947ec681f3Smrg    */
3957ec681f3Smrg   int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm);
3967ec681f3Smrg   int32_t result_available = start + MAX2(perf.latency, latency);
3977ec681f3Smrg
3987ec681f3Smrg   for (Definition& def : instr->definitions) {
3997ec681f3Smrg      int32_t* available = &reg_available[def.physReg().reg()];
4007ec681f3Smrg      for (unsigned i = 0; i < def.size(); i++)
4017ec681f3Smrg         available[i] = MAX2(available[i], result_available);
4027ec681f3Smrg   }
4037ec681f3Smrg}
4047ec681f3Smrg
4057ec681f3Smrgstatic void
4067ec681f3Smrgjoin_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
4077ec681f3Smrg{
4087ec681f3Smrg   for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++)
4097ec681f3Smrg      queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff);
4107ec681f3Smrg   for (int i = pred.size() - queue.size() - 1; i >= 0; i--)
4117ec681f3Smrg      queue.push_front(pred[i] + cycle_diff);
4127ec681f3Smrg}
4137ec681f3Smrg
4147ec681f3Smrgvoid
4157ec681f3SmrgBlockCycleEstimator::join(const BlockCycleEstimator& pred)
4167ec681f3Smrg{
4177ec681f3Smrg   assert(cur_cycle == 0);
4187ec681f3Smrg
4197ec681f3Smrg   for (unsigned i = 0; i < (unsigned)resource_count; i++) {
4207ec681f3Smrg      assert(res_usage[i] == 0);
4217ec681f3Smrg      res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);
4227ec681f3Smrg   }
4237ec681f3Smrg
4247ec681f3Smrg   for (unsigned i = 0; i < 512; i++)
4257ec681f3Smrg      reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);
4267ec681f3Smrg
4277ec681f3Smrg   join_queue(lgkm, pred.lgkm, -pred.cur_cycle);
4287ec681f3Smrg   join_queue(exp, pred.exp, -pred.cur_cycle);
4297ec681f3Smrg   join_queue(vm, pred.vm, -pred.cur_cycle);
4307ec681f3Smrg   join_queue(vs, pred.vs, -pred.cur_cycle);
4317ec681f3Smrg}
4327ec681f3Smrg
4337ec681f3Smrg/* instructions/branches/vmem_clauses/smem_clauses/cycles */
4347ec681f3Smrgvoid
4357ec681f3Smrgcollect_preasm_stats(Program* program)
4367ec681f3Smrg{
4377ec681f3Smrg   for (Block& block : program->blocks) {
4387ec681f3Smrg      std::set<Instruction*> vmem_clause;
4397ec681f3Smrg      std::set<Instruction*> smem_clause;
4407ec681f3Smrg
4417ec681f3Smrg      program->statistics[statistic_instructions] += block.instructions.size();
4427ec681f3Smrg
4437ec681f3Smrg      for (aco_ptr<Instruction>& instr : block.instructions) {
4447ec681f3Smrg         if (instr->isSOPP() && instr->sopp().block != -1)
4457ec681f3Smrg            program->statistics[statistic_branches]++;
4467ec681f3Smrg
4477ec681f3Smrg         if (instr->opcode == aco_opcode::p_constaddr)
4487ec681f3Smrg            program->statistics[statistic_instructions] += 2;
4497ec681f3Smrg
4507ec681f3Smrg         if (instr->isVMEM() && !instr->operands.empty()) {
4517ec681f3Smrg            if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
4527ec681f3Smrg                             [&](Instruction* other)
4537ec681f3Smrg                             { return should_form_clause(instr.get(), other); }))
4547ec681f3Smrg               program->statistics[statistic_vmem_clauses]++;
4557ec681f3Smrg            vmem_clause.insert(instr.get());
4567ec681f3Smrg         } else {
4577ec681f3Smrg            vmem_clause.clear();
4587ec681f3Smrg         }
4597ec681f3Smrg
4607ec681f3Smrg         if (instr->isSMEM() && !instr->operands.empty()) {
4617ec681f3Smrg            if (std::none_of(smem_clause.begin(), smem_clause.end(),
4627ec681f3Smrg                             [&](Instruction* other)
4637ec681f3Smrg                             { return should_form_clause(instr.get(), other); }))
4647ec681f3Smrg               program->statistics[statistic_smem_clauses]++;
4657ec681f3Smrg            smem_clause.insert(instr.get());
4667ec681f3Smrg         } else {
4677ec681f3Smrg            smem_clause.clear();
4687ec681f3Smrg         }
4697ec681f3Smrg      }
4707ec681f3Smrg   }
4717ec681f3Smrg
4727ec681f3Smrg   double latency = 0;
4737ec681f3Smrg   double usage[(int)BlockCycleEstimator::resource_count] = {0};
4747ec681f3Smrg   std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);
4757ec681f3Smrg
4767ec681f3Smrg   if (program->stage.has(SWStage::VS) && program->info->vs.has_prolog) {
4777ec681f3Smrg      unsigned vs_input_latency = 320;
4787ec681f3Smrg      for (Definition def : program->vs_inputs) {
4797ec681f3Smrg         blocks[0].vm.push_back(vs_input_latency);
4807ec681f3Smrg         for (unsigned i = 0; i < def.size(); i++)
4817ec681f3Smrg            blocks[0].reg_available[def.physReg().reg() + i] = vs_input_latency;
4827ec681f3Smrg      }
4837ec681f3Smrg   }
4847ec681f3Smrg
4857ec681f3Smrg   for (Block& block : program->blocks) {
4867ec681f3Smrg      BlockCycleEstimator& block_est = blocks[block.index];
4877ec681f3Smrg      for (unsigned pred : block.linear_preds)
4887ec681f3Smrg         block_est.join(blocks[pred]);
4897ec681f3Smrg
4907ec681f3Smrg      for (aco_ptr<Instruction>& instr : block.instructions) {
4917ec681f3Smrg         unsigned before = block_est.cur_cycle;
4927ec681f3Smrg         block_est.add(instr);
4937ec681f3Smrg         instr->pass_flags = block_est.cur_cycle - before;
4947ec681f3Smrg      }
4957ec681f3Smrg
4967ec681f3Smrg      /* TODO: it would be nice to be able to consider estimated loop trip
4977ec681f3Smrg       * counts used for loop unrolling.
4987ec681f3Smrg       */
4997ec681f3Smrg
5007ec681f3Smrg      /* TODO: estimate the trip_count of divergent loops (those which break
5017ec681f3Smrg       * divergent) higher than of uniform loops
5027ec681f3Smrg       */
5037ec681f3Smrg
5047ec681f3Smrg      /* Assume loops execute 8-2 times, uniform branches are taken 50% the time,
5057ec681f3Smrg       * and any lane in the wave takes a side of a divergent branch 75% of the
5067ec681f3Smrg       * time.
5077ec681f3Smrg       */
5087ec681f3Smrg      double iter = 1.0;
5097ec681f3Smrg      iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;
5107ec681f3Smrg      iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;
5117ec681f3Smrg      iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;
5127ec681f3Smrg      iter *= pow(0.5, block.uniform_if_depth);
5137ec681f3Smrg      iter *= pow(0.75, block.divergent_if_logical_depth);
5147ec681f3Smrg
5157ec681f3Smrg      bool divergent_if_linear_else =
5167ec681f3Smrg         block.logical_preds.empty() && block.linear_preds.size() == 1 &&
5177ec681f3Smrg         block.linear_succs.size() == 1 &&
5187ec681f3Smrg         program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
5197ec681f3Smrg      if (divergent_if_linear_else)
5207ec681f3Smrg         iter *= 0.25;
5217ec681f3Smrg
5227ec681f3Smrg      latency += block_est.cur_cycle * iter;
5237ec681f3Smrg      for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)
5247ec681f3Smrg         usage[i] += block_est.res_usage[i] * iter;
5257ec681f3Smrg   }
5267ec681f3Smrg
5277ec681f3Smrg   /* This likely exaggerates the effectiveness of parallelism because it
5287ec681f3Smrg    * ignores instruction ordering. It can assume there might be SALU/VALU/etc
5297ec681f3Smrg    * work to from other waves while one is idle but that might not be the case
5307ec681f3Smrg    * because those other waves have not reached such a point yet.
5317ec681f3Smrg    */
5327ec681f3Smrg
5337ec681f3Smrg   double parallelism = program->num_waves;
5347ec681f3Smrg   for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {
5357ec681f3Smrg      if (usage[i] > 0.0)
5367ec681f3Smrg         parallelism = MIN2(parallelism, latency / usage[i]);
5377ec681f3Smrg   }
5387ec681f3Smrg   double waves_per_cycle = 1.0 / latency * parallelism;
5397ec681f3Smrg   double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);
5407ec681f3Smrg
5417ec681f3Smrg   double max_utilization = 1.0;
5427ec681f3Smrg   if (program->workgroup_size != UINT_MAX)
5437ec681f3Smrg      max_utilization =
5447ec681f3Smrg         program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
5457ec681f3Smrg   wave64_per_cycle *= max_utilization;
5467ec681f3Smrg
5477ec681f3Smrg   program->statistics[statistic_latency] = round(latency);
5487ec681f3Smrg   program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle);
5497ec681f3Smrg
5507ec681f3Smrg   if (debug_flags & DEBUG_PERF_INFO) {
5517ec681f3Smrg      aco_print_program(program, stderr, print_no_ssa | print_perf_info);
5527ec681f3Smrg
5537ec681f3Smrg      fprintf(stderr, "num_waves: %u\n", program->num_waves);
5547ec681f3Smrg      fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
5557ec681f3Smrg      fprintf(stderr, "branch_sendmsg_usage: %f\n",
5567ec681f3Smrg              usage[(int)BlockCycleEstimator::branch_sendmsg]);
5577ec681f3Smrg      fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
5587ec681f3Smrg      fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
5597ec681f3Smrg      fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
5607ec681f3Smrg      fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
5617ec681f3Smrg      fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
5627ec681f3Smrg      fprintf(stderr, "latency: %f\n", latency);
5637ec681f3Smrg      fprintf(stderr, "parallelism: %f\n", parallelism);
5647ec681f3Smrg      fprintf(stderr, "max_utilization: %f\n", max_utilization);
5657ec681f3Smrg      fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
5667ec681f3Smrg      fprintf(stderr, "\n");
5677ec681f3Smrg   }
5687ec681f3Smrg}
5697ec681f3Smrg
5707ec681f3Smrgvoid
5717ec681f3Smrgcollect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
5727ec681f3Smrg{
5737ec681f3Smrg   program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
5747ec681f3Smrg}
5757ec681f3Smrg
5767ec681f3Smrg} // namespace aco
577