17ec681f3Smrg/*
27ec681f3Smrg * Copyright © 2018 Valve Corporation
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
217ec681f3Smrg * IN THE SOFTWARE.
227ec681f3Smrg *
237ec681f3Smrg */
247ec681f3Smrg
257ec681f3Smrg#include "aco_builder.h"
267ec681f3Smrg#include "aco_ir.h"
277ec681f3Smrg
287ec681f3Smrg#include "util/half_float.h"
297ec681f3Smrg#include "util/memstream.h"
307ec681f3Smrg
317ec681f3Smrg#include <algorithm>
327ec681f3Smrg#include <array>
337ec681f3Smrg#include <vector>
347ec681f3Smrg
357ec681f3Smrgnamespace aco {
367ec681f3Smrg
377ec681f3Smrg#ifndef NDEBUG
387ec681f3Smrgvoid
397ec681f3Smrgperfwarn(Program* program, bool cond, const char* msg, Instruction* instr)
407ec681f3Smrg{
417ec681f3Smrg   if (cond) {
427ec681f3Smrg      char* out;
437ec681f3Smrg      size_t outsize;
447ec681f3Smrg      struct u_memstream mem;
457ec681f3Smrg      u_memstream_open(&mem, &out, &outsize);
467ec681f3Smrg      FILE* const memf = u_memstream_get(&mem);
477ec681f3Smrg
487ec681f3Smrg      fprintf(memf, "%s: ", msg);
497ec681f3Smrg      aco_print_instr(instr, memf);
507ec681f3Smrg      u_memstream_close(&mem);
517ec681f3Smrg
527ec681f3Smrg      aco_perfwarn(program, out);
537ec681f3Smrg      free(out);
547ec681f3Smrg
557ec681f3Smrg      if (debug_flags & DEBUG_PERFWARN)
567ec681f3Smrg         exit(1);
577ec681f3Smrg   }
587ec681f3Smrg}
597ec681f3Smrg#endif
607ec681f3Smrg
617ec681f3Smrg/**
627ec681f3Smrg * The optimizer works in 4 phases:
637ec681f3Smrg * (1) The first pass collects information for each ssa-def,
647ec681f3Smrg *     propagates reg->reg operands of the same type, inline constants
657ec681f3Smrg *     and neg/abs input modifiers.
667ec681f3Smrg * (2) The second pass combines instructions like mad, omod, clamp and
677ec681f3Smrg *     propagates sgpr's on VALU instructions.
687ec681f3Smrg *     This pass depends on information collected in the first pass.
697ec681f3Smrg * (3) The third pass goes backwards, and selects instructions,
707ec681f3Smrg *     i.e. decides if a mad instruction is profitable and eliminates dead code.
717ec681f3Smrg * (4) The fourth pass cleans up the sequence: literals get applied and dead
727ec681f3Smrg *     instructions are removed from the sequence.
737ec681f3Smrg */
747ec681f3Smrg
757ec681f3Smrgstruct mad_info {
767ec681f3Smrg   aco_ptr<Instruction> add_instr;
777ec681f3Smrg   uint32_t mul_temp_id;
787ec681f3Smrg   uint16_t literal_idx;
797ec681f3Smrg   bool check_literal;
807ec681f3Smrg
817ec681f3Smrg   mad_info(aco_ptr<Instruction> instr, uint32_t id)
827ec681f3Smrg       : add_instr(std::move(instr)), mul_temp_id(id), literal_idx(0), check_literal(false)
837ec681f3Smrg   {}
847ec681f3Smrg};
857ec681f3Smrg
/**
 * Bit flags describing what is known about an SSA definition.
 *
 * Every enumerator is a single bit. The values are spelled with a 64-bit
 * literal throughout because the enum uses bits above 31; bits 11 and 13 are
 * not assigned here.
 */
enum Label {
   label_vec = 1ull << 0,
   label_constant_32bit = 1ull << 1,
   /* label_{abs,neg,mul,omod2,omod4,omod5,clamp} are shared between 16-bit and
    * 32-bit operations. This shouldn't cause any issues because the optimizer
    * doesn't look through any conversions. */
   label_abs = 1ull << 2,
   label_neg = 1ull << 3,
   label_mul = 1ull << 4,
   label_temp = 1ull << 5,
   label_literal = 1ull << 6,
   label_mad = 1ull << 7,
   label_omod2 = 1ull << 8,
   label_omod4 = 1ull << 9,
   label_omod5 = 1ull << 10,
   label_clamp = 1ull << 12,
   label_undefined = 1ull << 14,
   label_vcc = 1ull << 15,
   label_b2f = 1ull << 16,
   label_add_sub = 1ull << 17,
   label_bitwise = 1ull << 18,
   label_minmax = 1ull << 19,
   label_vopc = 1ull << 20,
   label_uniform_bool = 1ull << 21,
   label_constant_64bit = 1ull << 22,
   label_uniform_bitwise = 1ull << 23,
   label_scc_invert = 1ull << 24,
   label_vcc_hint = 1ull << 25,
   label_scc_needed = 1ull << 26,
   label_b2i = 1ull << 27,
   label_fcanonicalize = 1ull << 28,
   label_constant_16bit = 1ull << 29,
   label_usedef = 1ull << 30, /* generic label */
   label_vop3p = 1ull << 31,  /* must stay unsigned so the value is not sign-extended */
   label_canonicalized = 1ull << 32,
   label_extract = 1ull << 33,
   label_insert = 1ull << 34,
   label_dpp = 1ull << 35,
};
1257ec681f3Smrg
1267ec681f3Smrgstatic constexpr uint64_t instr_usedef_labels =
1277ec681f3Smrg   label_vec | label_mul | label_mad | label_add_sub | label_vop3p | label_bitwise |
1287ec681f3Smrg   label_uniform_bitwise | label_minmax | label_vopc | label_usedef | label_extract | label_dpp;
1297ec681f3Smrgstatic constexpr uint64_t instr_mod_labels =
1307ec681f3Smrg   label_omod2 | label_omod4 | label_omod5 | label_clamp | label_insert;
1317ec681f3Smrg
1327ec681f3Smrgstatic constexpr uint64_t instr_labels = instr_usedef_labels | instr_mod_labels;
1337ec681f3Smrgstatic constexpr uint64_t temp_labels = label_abs | label_neg | label_temp | label_vcc | label_b2f |
1347ec681f3Smrg                                        label_uniform_bool | label_scc_invert | label_b2i |
1357ec681f3Smrg                                        label_fcanonicalize;
1367ec681f3Smrgstatic constexpr uint32_t val_labels =
1377ec681f3Smrg   label_constant_32bit | label_constant_64bit | label_constant_16bit | label_literal;
1387ec681f3Smrg
1397ec681f3Smrgstatic_assert((instr_labels & temp_labels) == 0, "labels cannot intersect");
1407ec681f3Smrgstatic_assert((instr_labels & val_labels) == 0, "labels cannot intersect");
1417ec681f3Smrgstatic_assert((temp_labels & val_labels) == 0, "labels cannot intersect");
1427ec681f3Smrg
1437ec681f3Smrgstruct ssa_info {
1447ec681f3Smrg   uint64_t label;
1457ec681f3Smrg   union {
1467ec681f3Smrg      uint32_t val;
1477ec681f3Smrg      Temp temp;
1487ec681f3Smrg      Instruction* instr;
1497ec681f3Smrg   };
1507ec681f3Smrg
1517ec681f3Smrg   ssa_info() : label(0) {}
1527ec681f3Smrg
1537ec681f3Smrg   void add_label(Label new_label)
1547ec681f3Smrg   {
1557ec681f3Smrg      /* Since all the instr_usedef_labels use instr for the same thing
1567ec681f3Smrg       * (indicating the defining instruction), there is usually no need to
1577ec681f3Smrg       * clear any other instr labels. */
1587ec681f3Smrg      if (new_label & instr_usedef_labels)
1597ec681f3Smrg         label &= ~(instr_mod_labels | temp_labels | val_labels); /* instr, temp and val alias */
1607ec681f3Smrg
1617ec681f3Smrg      if (new_label & instr_mod_labels) {
1627ec681f3Smrg         label &= ~instr_labels;
1637ec681f3Smrg         label &= ~(temp_labels | val_labels); /* instr, temp and val alias */
1647ec681f3Smrg      }
1657ec681f3Smrg
1667ec681f3Smrg      if (new_label & temp_labels) {
1677ec681f3Smrg         label &= ~temp_labels;
1687ec681f3Smrg         label &= ~(instr_labels | val_labels); /* instr, temp and val alias */
1697ec681f3Smrg      }
1707ec681f3Smrg
1717ec681f3Smrg      uint32_t const_labels =
1727ec681f3Smrg         label_literal | label_constant_32bit | label_constant_64bit | label_constant_16bit;
1737ec681f3Smrg      if (new_label & const_labels) {
1747ec681f3Smrg         label &= ~val_labels | const_labels;
1757ec681f3Smrg         label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
1767ec681f3Smrg      } else if (new_label & val_labels) {
1777ec681f3Smrg         label &= ~val_labels;
1787ec681f3Smrg         label &= ~(instr_labels | temp_labels); /* instr, temp and val alias */
1797ec681f3Smrg      }
1807ec681f3Smrg
1817ec681f3Smrg      label |= new_label;
1827ec681f3Smrg   }
1837ec681f3Smrg
1847ec681f3Smrg   void set_vec(Instruction* vec)
1857ec681f3Smrg   {
1867ec681f3Smrg      add_label(label_vec);
1877ec681f3Smrg      instr = vec;
1887ec681f3Smrg   }
1897ec681f3Smrg
1907ec681f3Smrg   bool is_vec() { return label & label_vec; }
1917ec681f3Smrg
1927ec681f3Smrg   void set_constant(chip_class chip, uint64_t constant)
1937ec681f3Smrg   {
1947ec681f3Smrg      Operand op16 = Operand::c16(constant);
1957ec681f3Smrg      Operand op32 = Operand::get_const(chip, constant, 4);
1967ec681f3Smrg      add_label(label_literal);
1977ec681f3Smrg      val = constant;
1987ec681f3Smrg
1997ec681f3Smrg      /* check that no upper bits are lost in case of packed 16bit constants */
2007ec681f3Smrg      if (chip >= GFX8 && !op16.isLiteral() && op16.constantValue64() == constant)
2017ec681f3Smrg         add_label(label_constant_16bit);
2027ec681f3Smrg
2037ec681f3Smrg      if (!op32.isLiteral())
2047ec681f3Smrg         add_label(label_constant_32bit);
2057ec681f3Smrg
2067ec681f3Smrg      if (Operand::is_constant_representable(constant, 8))
2077ec681f3Smrg         add_label(label_constant_64bit);
2087ec681f3Smrg
2097ec681f3Smrg      if (label & label_constant_64bit) {
2107ec681f3Smrg         val = Operand::c64(constant).constantValue();
2117ec681f3Smrg         if (val != constant)
2127ec681f3Smrg            label &= ~(label_literal | label_constant_16bit | label_constant_32bit);
2137ec681f3Smrg      }
2147ec681f3Smrg   }
2157ec681f3Smrg
2167ec681f3Smrg   bool is_constant(unsigned bits)
2177ec681f3Smrg   {
2187ec681f3Smrg      switch (bits) {
2197ec681f3Smrg      case 8: return label & label_literal;
2207ec681f3Smrg      case 16: return label & label_constant_16bit;
2217ec681f3Smrg      case 32: return label & label_constant_32bit;
2227ec681f3Smrg      case 64: return label & label_constant_64bit;
2237ec681f3Smrg      }
2247ec681f3Smrg      return false;
2257ec681f3Smrg   }
2267ec681f3Smrg
2277ec681f3Smrg   bool is_literal(unsigned bits)
2287ec681f3Smrg   {
2297ec681f3Smrg      bool is_lit = label & label_literal;
2307ec681f3Smrg      switch (bits) {
2317ec681f3Smrg      case 8: return false;
2327ec681f3Smrg      case 16: return is_lit && ~(label & label_constant_16bit);
2337ec681f3Smrg      case 32: return is_lit && ~(label & label_constant_32bit);
2347ec681f3Smrg      case 64: return false;
2357ec681f3Smrg      }
2367ec681f3Smrg      return false;
2377ec681f3Smrg   }
2387ec681f3Smrg
2397ec681f3Smrg   bool is_constant_or_literal(unsigned bits)
2407ec681f3Smrg   {
2417ec681f3Smrg      if (bits == 64)
2427ec681f3Smrg         return label & label_constant_64bit;
2437ec681f3Smrg      else
2447ec681f3Smrg         return label & label_literal;
2457ec681f3Smrg   }
2467ec681f3Smrg
2477ec681f3Smrg   void set_abs(Temp abs_temp)
2487ec681f3Smrg   {
2497ec681f3Smrg      add_label(label_abs);
2507ec681f3Smrg      temp = abs_temp;
2517ec681f3Smrg   }
2527ec681f3Smrg
2537ec681f3Smrg   bool is_abs() { return label & label_abs; }
2547ec681f3Smrg
2557ec681f3Smrg   void set_neg(Temp neg_temp)
2567ec681f3Smrg   {
2577ec681f3Smrg      add_label(label_neg);
2587ec681f3Smrg      temp = neg_temp;
2597ec681f3Smrg   }
2607ec681f3Smrg
2617ec681f3Smrg   bool is_neg() { return label & label_neg; }
2627ec681f3Smrg
2637ec681f3Smrg   void set_neg_abs(Temp neg_abs_temp)
2647ec681f3Smrg   {
2657ec681f3Smrg      add_label((Label)((uint32_t)label_abs | (uint32_t)label_neg));
2667ec681f3Smrg      temp = neg_abs_temp;
2677ec681f3Smrg   }
2687ec681f3Smrg
2697ec681f3Smrg   void set_mul(Instruction* mul)
2707ec681f3Smrg   {
2717ec681f3Smrg      add_label(label_mul);
2727ec681f3Smrg      instr = mul;
2737ec681f3Smrg   }
2747ec681f3Smrg
2757ec681f3Smrg   bool is_mul() { return label & label_mul; }
2767ec681f3Smrg
2777ec681f3Smrg   void set_temp(Temp tmp)
2787ec681f3Smrg   {
2797ec681f3Smrg      add_label(label_temp);
2807ec681f3Smrg      temp = tmp;
2817ec681f3Smrg   }
2827ec681f3Smrg
2837ec681f3Smrg   bool is_temp() { return label & label_temp; }
2847ec681f3Smrg
2857ec681f3Smrg   void set_mad(Instruction* mad, uint32_t mad_info_idx)
2867ec681f3Smrg   {
2877ec681f3Smrg      add_label(label_mad);
2887ec681f3Smrg      mad->pass_flags = mad_info_idx;
2897ec681f3Smrg      instr = mad;
2907ec681f3Smrg   }
2917ec681f3Smrg
2927ec681f3Smrg   bool is_mad() { return label & label_mad; }
2937ec681f3Smrg
2947ec681f3Smrg   void set_omod2(Instruction* mul)
2957ec681f3Smrg   {
2967ec681f3Smrg      add_label(label_omod2);
2977ec681f3Smrg      instr = mul;
2987ec681f3Smrg   }
2997ec681f3Smrg
3007ec681f3Smrg   bool is_omod2() { return label & label_omod2; }
3017ec681f3Smrg
3027ec681f3Smrg   void set_omod4(Instruction* mul)
3037ec681f3Smrg   {
3047ec681f3Smrg      add_label(label_omod4);
3057ec681f3Smrg      instr = mul;
3067ec681f3Smrg   }
3077ec681f3Smrg
3087ec681f3Smrg   bool is_omod4() { return label & label_omod4; }
3097ec681f3Smrg
3107ec681f3Smrg   void set_omod5(Instruction* mul)
3117ec681f3Smrg   {
3127ec681f3Smrg      add_label(label_omod5);
3137ec681f3Smrg      instr = mul;
3147ec681f3Smrg   }
3157ec681f3Smrg
3167ec681f3Smrg   bool is_omod5() { return label & label_omod5; }
3177ec681f3Smrg
3187ec681f3Smrg   void set_clamp(Instruction* med3)
3197ec681f3Smrg   {
3207ec681f3Smrg      add_label(label_clamp);
3217ec681f3Smrg      instr = med3;
3227ec681f3Smrg   }
3237ec681f3Smrg
3247ec681f3Smrg   bool is_clamp() { return label & label_clamp; }
3257ec681f3Smrg
3267ec681f3Smrg   void set_undefined() { add_label(label_undefined); }
3277ec681f3Smrg
3287ec681f3Smrg   bool is_undefined() { return label & label_undefined; }
3297ec681f3Smrg
3307ec681f3Smrg   void set_vcc(Temp vcc_val)
3317ec681f3Smrg   {
3327ec681f3Smrg      add_label(label_vcc);
3337ec681f3Smrg      temp = vcc_val;
3347ec681f3Smrg   }
3357ec681f3Smrg
3367ec681f3Smrg   bool is_vcc() { return label & label_vcc; }
3377ec681f3Smrg
3387ec681f3Smrg   void set_b2f(Temp b2f_val)
3397ec681f3Smrg   {
3407ec681f3Smrg      add_label(label_b2f);
3417ec681f3Smrg      temp = b2f_val;
3427ec681f3Smrg   }
3437ec681f3Smrg
3447ec681f3Smrg   bool is_b2f() { return label & label_b2f; }
3457ec681f3Smrg
3467ec681f3Smrg   void set_add_sub(Instruction* add_sub_instr)
3477ec681f3Smrg   {
3487ec681f3Smrg      add_label(label_add_sub);
3497ec681f3Smrg      instr = add_sub_instr;
3507ec681f3Smrg   }
3517ec681f3Smrg
3527ec681f3Smrg   bool is_add_sub() { return label & label_add_sub; }
3537ec681f3Smrg
3547ec681f3Smrg   void set_bitwise(Instruction* bitwise_instr)
3557ec681f3Smrg   {
3567ec681f3Smrg      add_label(label_bitwise);
3577ec681f3Smrg      instr = bitwise_instr;
3587ec681f3Smrg   }
3597ec681f3Smrg
3607ec681f3Smrg   bool is_bitwise() { return label & label_bitwise; }
3617ec681f3Smrg
3627ec681f3Smrg   void set_uniform_bitwise() { add_label(label_uniform_bitwise); }
3637ec681f3Smrg
3647ec681f3Smrg   bool is_uniform_bitwise() { return label & label_uniform_bitwise; }
3657ec681f3Smrg
3667ec681f3Smrg   void set_minmax(Instruction* minmax_instr)
3677ec681f3Smrg   {
3687ec681f3Smrg      add_label(label_minmax);
3697ec681f3Smrg      instr = minmax_instr;
3707ec681f3Smrg   }
3717ec681f3Smrg
3727ec681f3Smrg   bool is_minmax() { return label & label_minmax; }
3737ec681f3Smrg
3747ec681f3Smrg   void set_vopc(Instruction* vopc_instr)
3757ec681f3Smrg   {
3767ec681f3Smrg      add_label(label_vopc);
3777ec681f3Smrg      instr = vopc_instr;
3787ec681f3Smrg   }
3797ec681f3Smrg
3807ec681f3Smrg   bool is_vopc() { return label & label_vopc; }
3817ec681f3Smrg
3827ec681f3Smrg   void set_scc_needed() { add_label(label_scc_needed); }
3837ec681f3Smrg
3847ec681f3Smrg   bool is_scc_needed() { return label & label_scc_needed; }
3857ec681f3Smrg
3867ec681f3Smrg   void set_scc_invert(Temp scc_inv)
3877ec681f3Smrg   {
3887ec681f3Smrg      add_label(label_scc_invert);
3897ec681f3Smrg      temp = scc_inv;
3907ec681f3Smrg   }
3917ec681f3Smrg
3927ec681f3Smrg   bool is_scc_invert() { return label & label_scc_invert; }
3937ec681f3Smrg
3947ec681f3Smrg   void set_uniform_bool(Temp uniform_bool)
3957ec681f3Smrg   {
3967ec681f3Smrg      add_label(label_uniform_bool);
3977ec681f3Smrg      temp = uniform_bool;
3987ec681f3Smrg   }
3997ec681f3Smrg
4007ec681f3Smrg   bool is_uniform_bool() { return label & label_uniform_bool; }
4017ec681f3Smrg
4027ec681f3Smrg   void set_vcc_hint() { add_label(label_vcc_hint); }
4037ec681f3Smrg
4047ec681f3Smrg   bool is_vcc_hint() { return label & label_vcc_hint; }
4057ec681f3Smrg
4067ec681f3Smrg   void set_b2i(Temp b2i_val)
4077ec681f3Smrg   {
4087ec681f3Smrg      add_label(label_b2i);
4097ec681f3Smrg      temp = b2i_val;
4107ec681f3Smrg   }
4117ec681f3Smrg
4127ec681f3Smrg   bool is_b2i() { return label & label_b2i; }
4137ec681f3Smrg
4147ec681f3Smrg   void set_usedef(Instruction* label_instr)
4157ec681f3Smrg   {
4167ec681f3Smrg      add_label(label_usedef);
4177ec681f3Smrg      instr = label_instr;
4187ec681f3Smrg   }
4197ec681f3Smrg
4207ec681f3Smrg   bool is_usedef() { return label & label_usedef; }
4217ec681f3Smrg
4227ec681f3Smrg   void set_vop3p(Instruction* vop3p_instr)
4237ec681f3Smrg   {
4247ec681f3Smrg      add_label(label_vop3p);
4257ec681f3Smrg      instr = vop3p_instr;
4267ec681f3Smrg   }
4277ec681f3Smrg
4287ec681f3Smrg   bool is_vop3p() { return label & label_vop3p; }
4297ec681f3Smrg
4307ec681f3Smrg   void set_fcanonicalize(Temp tmp)
4317ec681f3Smrg   {
4327ec681f3Smrg      add_label(label_fcanonicalize);
4337ec681f3Smrg      temp = tmp;
4347ec681f3Smrg   }
4357ec681f3Smrg
4367ec681f3Smrg   bool is_fcanonicalize() { return label & label_fcanonicalize; }
4377ec681f3Smrg
4387ec681f3Smrg   void set_canonicalized() { add_label(label_canonicalized); }
4397ec681f3Smrg
4407ec681f3Smrg   bool is_canonicalized() { return label & label_canonicalized; }
4417ec681f3Smrg
4427ec681f3Smrg   void set_extract(Instruction* extract)
4437ec681f3Smrg   {
4447ec681f3Smrg      add_label(label_extract);
4457ec681f3Smrg      instr = extract;
4467ec681f3Smrg   }
4477ec681f3Smrg
4487ec681f3Smrg   bool is_extract() { return label & label_extract; }
4497ec681f3Smrg
4507ec681f3Smrg   void set_insert(Instruction* insert)
4517ec681f3Smrg   {
4527ec681f3Smrg      add_label(label_insert);
4537ec681f3Smrg      instr = insert;
4547ec681f3Smrg   }
4557ec681f3Smrg
4567ec681f3Smrg   bool is_insert() { return label & label_insert; }
4577ec681f3Smrg
4587ec681f3Smrg   void set_dpp(Instruction* mov)
4597ec681f3Smrg   {
4607ec681f3Smrg      add_label(label_dpp);
4617ec681f3Smrg      instr = mov;
4627ec681f3Smrg   }
4637ec681f3Smrg
4647ec681f3Smrg   bool is_dpp() { return label & label_dpp; }
4657ec681f3Smrg};
4667ec681f3Smrg
4677ec681f3Smrgstruct opt_ctx {
4687ec681f3Smrg   Program* program;
4697ec681f3Smrg   float_mode fp_mode;
4707ec681f3Smrg   std::vector<aco_ptr<Instruction>> instructions;
4717ec681f3Smrg   ssa_info* info;
4727ec681f3Smrg   std::pair<uint32_t, Temp> last_literal;
4737ec681f3Smrg   std::vector<mad_info> mad_infos;
4747ec681f3Smrg   std::vector<uint16_t> uses;
4757ec681f3Smrg};
4767ec681f3Smrg
4777ec681f3Smrgbool
4787ec681f3Smrgcan_use_VOP3(opt_ctx& ctx, const aco_ptr<Instruction>& instr)
4797ec681f3Smrg{
4807ec681f3Smrg   if (instr->isVOP3())
4817ec681f3Smrg      return true;
4827ec681f3Smrg
4837ec681f3Smrg   if (instr->isVOP3P())
4847ec681f3Smrg      return false;
4857ec681f3Smrg
4867ec681f3Smrg   if (instr->operands.size() && instr->operands[0].isLiteral() && ctx.program->chip_class < GFX10)
4877ec681f3Smrg      return false;
4887ec681f3Smrg
4897ec681f3Smrg   if (instr->isDPP() || instr->isSDWA())
4907ec681f3Smrg      return false;
4917ec681f3Smrg
4927ec681f3Smrg   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
4937ec681f3Smrg          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
4947ec681f3Smrg          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
4957ec681f3Smrg          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
4967ec681f3Smrg          instr->opcode != aco_opcode::v_readlane_b32 &&
4977ec681f3Smrg          instr->opcode != aco_opcode::v_writelane_b32 &&
4987ec681f3Smrg          instr->opcode != aco_opcode::v_readfirstlane_b32;
4997ec681f3Smrg}
5007ec681f3Smrg
5017ec681f3Smrgbool
5027ec681f3Smrgpseudo_propagate_temp(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp temp, unsigned index)
5037ec681f3Smrg{
5047ec681f3Smrg   if (instr->definitions.empty())
5057ec681f3Smrg      return false;
5067ec681f3Smrg
5077ec681f3Smrg   const bool vgpr =
5087ec681f3Smrg      instr->opcode == aco_opcode::p_as_uniform ||
5097ec681f3Smrg      std::all_of(instr->definitions.begin(), instr->definitions.end(),
5107ec681f3Smrg                  [](const Definition& def) { return def.regClass().type() == RegType::vgpr; });
5117ec681f3Smrg
5127ec681f3Smrg   /* don't propagate VGPRs into SGPR instructions */
5137ec681f3Smrg   if (temp.type() == RegType::vgpr && !vgpr)
5147ec681f3Smrg      return false;
5157ec681f3Smrg
5167ec681f3Smrg   bool can_accept_sgpr =
5177ec681f3Smrg      ctx.program->chip_class >= GFX9 ||
5187ec681f3Smrg      std::none_of(instr->definitions.begin(), instr->definitions.end(),
5197ec681f3Smrg                   [](const Definition& def) { return def.regClass().is_subdword(); });
5207ec681f3Smrg
5217ec681f3Smrg   switch (instr->opcode) {
5227ec681f3Smrg   case aco_opcode::p_phi:
5237ec681f3Smrg   case aco_opcode::p_linear_phi:
5247ec681f3Smrg   case aco_opcode::p_parallelcopy:
5257ec681f3Smrg   case aco_opcode::p_create_vector:
5267ec681f3Smrg      if (temp.bytes() != instr->operands[index].bytes())
5277ec681f3Smrg         return false;
5287ec681f3Smrg      break;
5297ec681f3Smrg   case aco_opcode::p_extract_vector:
5307ec681f3Smrg      if (temp.type() == RegType::sgpr && !can_accept_sgpr)
5317ec681f3Smrg         return false;
5327ec681f3Smrg      break;
5337ec681f3Smrg   case aco_opcode::p_split_vector: {
5347ec681f3Smrg      if (temp.type() == RegType::sgpr && !can_accept_sgpr)
5357ec681f3Smrg         return false;
5367ec681f3Smrg      /* don't increase the vector size */
5377ec681f3Smrg      if (temp.bytes() > instr->operands[index].bytes())
5387ec681f3Smrg         return false;
5397ec681f3Smrg      /* We can decrease the vector size as smaller temporaries are only
5407ec681f3Smrg       * propagated by p_as_uniform instructions.
5417ec681f3Smrg       * If this propagation leads to invalid IR or hits the assertion below,
5427ec681f3Smrg       * it means that some undefined bytes within a dword are begin accessed
5437ec681f3Smrg       * and a bug in instruction_selection is likely. */
5447ec681f3Smrg      int decrease = instr->operands[index].bytes() - temp.bytes();
5457ec681f3Smrg      while (decrease > 0) {
5467ec681f3Smrg         decrease -= instr->definitions.back().bytes();
5477ec681f3Smrg         instr->definitions.pop_back();
5487ec681f3Smrg      }
5497ec681f3Smrg      assert(decrease == 0);
5507ec681f3Smrg      break;
5517ec681f3Smrg   }
5527ec681f3Smrg   case aco_opcode::p_as_uniform:
5537ec681f3Smrg      if (temp.regClass() == instr->definitions[0].regClass())
5547ec681f3Smrg         instr->opcode = aco_opcode::p_parallelcopy;
5557ec681f3Smrg      break;
5567ec681f3Smrg   default: return false;
5577ec681f3Smrg   }
5587ec681f3Smrg
5597ec681f3Smrg   instr->operands[index].setTemp(temp);
5607ec681f3Smrg   return true;
5617ec681f3Smrg}
5627ec681f3Smrg
5637ec681f3Smrg/* This expects the DPP modifier to be removed. */
5647ec681f3Smrgbool
5657ec681f3Smrgcan_apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
5667ec681f3Smrg{
5677ec681f3Smrg   if (instr->isSDWA() && ctx.program->chip_class < GFX9)
5687ec681f3Smrg      return false;
5697ec681f3Smrg   return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
5707ec681f3Smrg          instr->opcode != aco_opcode::v_readlane_b32 &&
5717ec681f3Smrg          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
5727ec681f3Smrg          instr->opcode != aco_opcode::v_writelane_b32 &&
5737ec681f3Smrg          instr->opcode != aco_opcode::v_writelane_b32_e64 &&
5747ec681f3Smrg          instr->opcode != aco_opcode::v_permlane16_b32 &&
5757ec681f3Smrg          instr->opcode != aco_opcode::v_permlanex16_b32;
5767ec681f3Smrg}
5777ec681f3Smrg
5787ec681f3Smrgvoid
5797ec681f3Smrgto_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
5807ec681f3Smrg{
5817ec681f3Smrg   if (instr->isVOP3())
5827ec681f3Smrg      return;
5837ec681f3Smrg
5847ec681f3Smrg   aco_ptr<Instruction> tmp = std::move(instr);
5857ec681f3Smrg   Format format = asVOP3(tmp->format);
5867ec681f3Smrg   instr.reset(create_instruction<VOP3_instruction>(tmp->opcode, format, tmp->operands.size(),
5877ec681f3Smrg                                                    tmp->definitions.size()));
5887ec681f3Smrg   std::copy(tmp->operands.cbegin(), tmp->operands.cend(), instr->operands.begin());
5897ec681f3Smrg   for (unsigned i = 0; i < instr->definitions.size(); i++) {
5907ec681f3Smrg      instr->definitions[i] = tmp->definitions[i];
5917ec681f3Smrg      if (instr->definitions[i].isTemp()) {
5927ec681f3Smrg         ssa_info& info = ctx.info[instr->definitions[i].tempId()];
5937ec681f3Smrg         if (info.label & instr_usedef_labels && info.instr == tmp.get())
5947ec681f3Smrg            info.instr = instr.get();
5957ec681f3Smrg      }
5967ec681f3Smrg   }
5977ec681f3Smrg   /* we don't need to update any instr_mod_labels because they either haven't
5987ec681f3Smrg    * been applied yet or this instruction isn't dead and so they've been ignored */
5997ec681f3Smrg}
6007ec681f3Smrg
6017ec681f3Smrgbool
6027ec681f3Smrgis_operand_vgpr(Operand op)
6037ec681f3Smrg{
6047ec681f3Smrg   return op.isTemp() && op.getTemp().type() == RegType::vgpr;
6057ec681f3Smrg}
6067ec681f3Smrg
6077ec681f3Smrgvoid
6087ec681f3Smrgto_SDWA(opt_ctx& ctx, aco_ptr<Instruction>& instr)
6097ec681f3Smrg{
6107ec681f3Smrg   aco_ptr<Instruction> tmp = convert_to_SDWA(ctx.program->chip_class, instr);
6117ec681f3Smrg   if (!tmp)
6127ec681f3Smrg      return;
6137ec681f3Smrg
6147ec681f3Smrg   for (unsigned i = 0; i < instr->definitions.size(); i++) {
6157ec681f3Smrg      ssa_info& info = ctx.info[instr->definitions[i].tempId()];
6167ec681f3Smrg      if (info.label & instr_labels && info.instr == tmp.get())
6177ec681f3Smrg         info.instr = instr.get();
6187ec681f3Smrg   }
6197ec681f3Smrg}
6207ec681f3Smrg
6217ec681f3Smrg/* only covers special cases */
6227ec681f3Smrgbool
6237ec681f3Smrgalu_can_accept_constant(aco_opcode opcode, unsigned operand)
6247ec681f3Smrg{
6257ec681f3Smrg   switch (opcode) {
6267ec681f3Smrg   case aco_opcode::v_interp_p2_f32:
6277ec681f3Smrg   case aco_opcode::v_mac_f32:
6287ec681f3Smrg   case aco_opcode::v_writelane_b32:
6297ec681f3Smrg   case aco_opcode::v_writelane_b32_e64:
6307ec681f3Smrg   case aco_opcode::v_cndmask_b32: return operand != 2;
6317ec681f3Smrg   case aco_opcode::s_addk_i32:
6327ec681f3Smrg   case aco_opcode::s_mulk_i32:
6337ec681f3Smrg   case aco_opcode::p_wqm:
6347ec681f3Smrg   case aco_opcode::p_extract_vector:
6357ec681f3Smrg   case aco_opcode::p_split_vector:
6367ec681f3Smrg   case aco_opcode::v_readlane_b32:
6377ec681f3Smrg   case aco_opcode::v_readlane_b32_e64:
6387ec681f3Smrg   case aco_opcode::v_readfirstlane_b32:
6397ec681f3Smrg   case aco_opcode::p_extract:
6407ec681f3Smrg   case aco_opcode::p_insert: return operand != 0;
6417ec681f3Smrg   default: return true;
6427ec681f3Smrg   }
6437ec681f3Smrg}
6447ec681f3Smrg
6457ec681f3Smrgbool
6467ec681f3Smrgvalu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
6477ec681f3Smrg{
6487ec681f3Smrg   if (instr->opcode == aco_opcode::v_readlane_b32 ||
6497ec681f3Smrg       instr->opcode == aco_opcode::v_readlane_b32_e64 ||
6507ec681f3Smrg       instr->opcode == aco_opcode::v_writelane_b32 ||
6517ec681f3Smrg       instr->opcode == aco_opcode::v_writelane_b32_e64)
6527ec681f3Smrg      return operand != 1;
6537ec681f3Smrg   if (instr->opcode == aco_opcode::v_permlane16_b32 ||
6547ec681f3Smrg       instr->opcode == aco_opcode::v_permlanex16_b32)
6557ec681f3Smrg      return operand == 0;
6567ec681f3Smrg   return true;
6577ec681f3Smrg}
6587ec681f3Smrg
6597ec681f3Smrg/* check constant bus and literal limitations */
6607ec681f3Smrgbool
6617ec681f3Smrgcheck_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand* operands)
6627ec681f3Smrg{
6637ec681f3Smrg   int limit = ctx.program->chip_class >= GFX10 ? 2 : 1;
6647ec681f3Smrg   Operand literal32(s1);
6657ec681f3Smrg   Operand literal64(s2);
6667ec681f3Smrg   unsigned num_sgprs = 0;
6677ec681f3Smrg   unsigned sgpr[] = {0, 0};
6687ec681f3Smrg
6697ec681f3Smrg   for (unsigned i = 0; i < num_operands; i++) {
6707ec681f3Smrg      Operand op = operands[i];
6717ec681f3Smrg
6727ec681f3Smrg      if (op.hasRegClass() && op.regClass().type() == RegType::sgpr) {
6737ec681f3Smrg         /* two reads of the same SGPR count as 1 to the limit */
6747ec681f3Smrg         if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
6757ec681f3Smrg            if (num_sgprs < 2)
6767ec681f3Smrg               sgpr[num_sgprs++] = op.tempId();
6777ec681f3Smrg            limit--;
6787ec681f3Smrg            if (limit < 0)
6797ec681f3Smrg               return false;
6807ec681f3Smrg         }
6817ec681f3Smrg      } else if (op.isLiteral()) {
6827ec681f3Smrg         if (ctx.program->chip_class < GFX10)
6837ec681f3Smrg            return false;
6847ec681f3Smrg
6857ec681f3Smrg         if (!literal32.isUndefined() && literal32.constantValue() != op.constantValue())
6867ec681f3Smrg            return false;
6877ec681f3Smrg         if (!literal64.isUndefined() && literal64.constantValue() != op.constantValue())
6887ec681f3Smrg            return false;
6897ec681f3Smrg
6907ec681f3Smrg         /* Any number of 32-bit literals counts as only 1 to the limit. Same
6917ec681f3Smrg          * (but separately) for 64-bit literals. */
6927ec681f3Smrg         if (op.size() == 1 && literal32.isUndefined()) {
6937ec681f3Smrg            limit--;
6947ec681f3Smrg            literal32 = op;
6957ec681f3Smrg         } else if (op.size() == 2 && literal64.isUndefined()) {
6967ec681f3Smrg            limit--;
6977ec681f3Smrg            literal64 = op;
6987ec681f3Smrg         }
6997ec681f3Smrg
7007ec681f3Smrg         if (limit < 0)
7017ec681f3Smrg            return false;
7027ec681f3Smrg      }
7037ec681f3Smrg   }
7047ec681f3Smrg
7057ec681f3Smrg   return true;
7067ec681f3Smrg}
7077ec681f3Smrg
7087ec681f3Smrgbool
7097ec681f3Smrgparse_base_offset(opt_ctx& ctx, Instruction* instr, unsigned op_index, Temp* base, uint32_t* offset,
7107ec681f3Smrg                  bool prevent_overflow)
7117ec681f3Smrg{
7127ec681f3Smrg   Operand op = instr->operands[op_index];
7137ec681f3Smrg
7147ec681f3Smrg   if (!op.isTemp())
7157ec681f3Smrg      return false;
7167ec681f3Smrg   Temp tmp = op.getTemp();
7177ec681f3Smrg   if (!ctx.info[tmp.id()].is_add_sub())
7187ec681f3Smrg      return false;
7197ec681f3Smrg
7207ec681f3Smrg   Instruction* add_instr = ctx.info[tmp.id()].instr;
7217ec681f3Smrg
7227ec681f3Smrg   switch (add_instr->opcode) {
7237ec681f3Smrg   case aco_opcode::v_add_u32:
7247ec681f3Smrg   case aco_opcode::v_add_co_u32:
7257ec681f3Smrg   case aco_opcode::v_add_co_u32_e64:
7267ec681f3Smrg   case aco_opcode::s_add_i32:
7277ec681f3Smrg   case aco_opcode::s_add_u32: break;
7287ec681f3Smrg   default: return false;
7297ec681f3Smrg   }
7307ec681f3Smrg   if (prevent_overflow && !add_instr->definitions[0].isNUW())
7317ec681f3Smrg      return false;
7327ec681f3Smrg
7337ec681f3Smrg   if (add_instr->usesModifiers())
7347ec681f3Smrg      return false;
7357ec681f3Smrg
7367ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
7377ec681f3Smrg      if (add_instr->operands[i].isConstant()) {
7387ec681f3Smrg         *offset = add_instr->operands[i].constantValue();
7397ec681f3Smrg      } else if (add_instr->operands[i].isTemp() &&
7407ec681f3Smrg                 ctx.info[add_instr->operands[i].tempId()].is_constant_or_literal(32)) {
7417ec681f3Smrg         *offset = ctx.info[add_instr->operands[i].tempId()].val;
7427ec681f3Smrg      } else {
7437ec681f3Smrg         continue;
7447ec681f3Smrg      }
7457ec681f3Smrg      if (!add_instr->operands[!i].isTemp())
7467ec681f3Smrg         continue;
7477ec681f3Smrg
7487ec681f3Smrg      uint32_t offset2 = 0;
7497ec681f3Smrg      if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) {
7507ec681f3Smrg         *offset += offset2;
7517ec681f3Smrg      } else {
7527ec681f3Smrg         *base = add_instr->operands[!i].getTemp();
7537ec681f3Smrg      }
7547ec681f3Smrg      return true;
7557ec681f3Smrg   }
7567ec681f3Smrg
7577ec681f3Smrg   return false;
7587ec681f3Smrg}
7597ec681f3Smrg
7607ec681f3Smrgunsigned
7617ec681f3Smrgget_operand_size(aco_ptr<Instruction>& instr, unsigned index)
7627ec681f3Smrg{
7637ec681f3Smrg   if (instr->isPseudo())
7647ec681f3Smrg      return instr->operands[index].bytes() * 8u;
7657ec681f3Smrg   else if (instr->opcode == aco_opcode::v_mad_u64_u32 ||
7667ec681f3Smrg            instr->opcode == aco_opcode::v_mad_i64_i32)
7677ec681f3Smrg      return index == 2 ? 64 : 32;
7687ec681f3Smrg   else if (instr->isVALU() || instr->isSALU())
7697ec681f3Smrg      return instr_info.operand_size[(int)instr->opcode];
7707ec681f3Smrg   else
7717ec681f3Smrg      return 0;
7727ec681f3Smrg}
7737ec681f3Smrg
7747ec681f3SmrgOperand
7757ec681f3Smrgget_constant_op(opt_ctx& ctx, ssa_info info, uint32_t bits)
7767ec681f3Smrg{
7777ec681f3Smrg   if (bits == 64)
7787ec681f3Smrg      return Operand::c32_or_c64(info.val, true);
7797ec681f3Smrg   return Operand::get_const(ctx.program->chip_class, info.val, bits / 8u);
7807ec681f3Smrg}
7817ec681f3Smrg
7827ec681f3Smrgbool
7837ec681f3Smrgfixed_to_exec(Operand op)
7847ec681f3Smrg{
7857ec681f3Smrg   return op.isFixed() && op.physReg() == exec;
7867ec681f3Smrg}
7877ec681f3Smrg
7887ec681f3SmrgSubdwordSel
7897ec681f3Smrgparse_extract(Instruction* instr)
7907ec681f3Smrg{
7917ec681f3Smrg   if (instr->opcode == aco_opcode::p_extract) {
7927ec681f3Smrg      unsigned size = instr->operands[2].constantValue() / 8;
7937ec681f3Smrg      unsigned offset = instr->operands[1].constantValue() * size;
7947ec681f3Smrg      bool sext = instr->operands[3].constantEquals(1);
7957ec681f3Smrg      return SubdwordSel(size, offset, sext);
7967ec681f3Smrg   } else if (instr->opcode == aco_opcode::p_insert && instr->operands[1].constantEquals(0)) {
7977ec681f3Smrg      return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
7987ec681f3Smrg   } else {
7997ec681f3Smrg      return SubdwordSel();
8007ec681f3Smrg   }
8017ec681f3Smrg}
8027ec681f3Smrg
8037ec681f3SmrgSubdwordSel
8047ec681f3Smrgparse_insert(Instruction* instr)
8057ec681f3Smrg{
8067ec681f3Smrg   if (instr->opcode == aco_opcode::p_extract && instr->operands[3].constantEquals(0) &&
8077ec681f3Smrg       instr->operands[1].constantEquals(0)) {
8087ec681f3Smrg      return instr->operands[2].constantEquals(8) ? SubdwordSel::ubyte : SubdwordSel::uword;
8097ec681f3Smrg   } else if (instr->opcode == aco_opcode::p_insert) {
8107ec681f3Smrg      unsigned size = instr->operands[2].constantValue() / 8;
8117ec681f3Smrg      unsigned offset = instr->operands[1].constantValue() * size;
8127ec681f3Smrg      return SubdwordSel(size, offset, false);
8137ec681f3Smrg   } else {
8147ec681f3Smrg      return SubdwordSel();
8157ec681f3Smrg   }
8167ec681f3Smrg}
8177ec681f3Smrg
8187ec681f3Smrgbool
8197ec681f3Smrgcan_apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
8207ec681f3Smrg{
8217ec681f3Smrg   if (idx >= 2)
8227ec681f3Smrg      return false;
8237ec681f3Smrg
8247ec681f3Smrg   Temp tmp = info.instr->operands[0].getTemp();
8257ec681f3Smrg   SubdwordSel sel = parse_extract(info.instr);
8267ec681f3Smrg
8277ec681f3Smrg   if (!sel) {
8287ec681f3Smrg      return false;
8297ec681f3Smrg   } else if (sel.size() == 4) {
8307ec681f3Smrg      return true;
8317ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) {
8327ec681f3Smrg      return true;
8337ec681f3Smrg   } else if (can_use_SDWA(ctx.program->chip_class, instr, true) &&
8347ec681f3Smrg              (tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) {
8357ec681f3Smrg      if (instr->isSDWA() && instr->sdwa().sel[idx] != SubdwordSel::dword)
8367ec681f3Smrg         return false;
8377ec681f3Smrg      return true;
8387ec681f3Smrg   } else if (instr->isVOP3() && sel.size() == 2 &&
8397ec681f3Smrg              can_use_opsel(ctx.program->chip_class, instr->opcode, idx, sel.offset()) &&
8407ec681f3Smrg              !(instr->vop3().opsel & (1 << idx))) {
8417ec681f3Smrg      return true;
8427ec681f3Smrg   } else {
8437ec681f3Smrg      return false;
8447ec681f3Smrg   }
8457ec681f3Smrg}
8467ec681f3Smrg
8477ec681f3Smrg/* Combine an p_extract (or p_insert, in some cases) instruction with instr.
8487ec681f3Smrg * instr(p_extract(...)) -> instr()
8497ec681f3Smrg */
8507ec681f3Smrgvoid
8517ec681f3Smrgapply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info& info)
8527ec681f3Smrg{
8537ec681f3Smrg   Temp tmp = info.instr->operands[0].getTemp();
8547ec681f3Smrg   SubdwordSel sel = parse_extract(info.instr);
8557ec681f3Smrg   assert(sel);
8567ec681f3Smrg
8577ec681f3Smrg   instr->operands[idx].set16bit(false);
8587ec681f3Smrg   instr->operands[idx].set24bit(false);
8597ec681f3Smrg
8607ec681f3Smrg   ctx.info[tmp.id()].label &= ~label_insert;
8617ec681f3Smrg
8627ec681f3Smrg   if (sel.size() == 4) {
8637ec681f3Smrg      /* full dword selection */
8647ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_cvt_f32_u32 && sel.size() == 1 && !sel.sign_extend()) {
8657ec681f3Smrg      switch (sel.offset()) {
8667ec681f3Smrg      case 0: instr->opcode = aco_opcode::v_cvt_f32_ubyte0; break;
8677ec681f3Smrg      case 1: instr->opcode = aco_opcode::v_cvt_f32_ubyte1; break;
8687ec681f3Smrg      case 2: instr->opcode = aco_opcode::v_cvt_f32_ubyte2; break;
8697ec681f3Smrg      case 3: instr->opcode = aco_opcode::v_cvt_f32_ubyte3; break;
8707ec681f3Smrg      }
8717ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && instr->operands[0].isConstant() &&
8727ec681f3Smrg              sel.offset() == 0 &&
8737ec681f3Smrg              ((sel.size() == 2 && instr->operands[0].constantValue() >= 16u) ||
8747ec681f3Smrg               (sel.size() == 1 && instr->operands[0].constantValue() >= 24u))) {
8757ec681f3Smrg      /* The undesireable upper bits are already shifted out. */
8767ec681f3Smrg      return;
8777ec681f3Smrg   } else if (can_use_SDWA(ctx.program->chip_class, instr, true) &&
8787ec681f3Smrg              (tmp.type() == RegType::vgpr || ctx.program->chip_class >= GFX9)) {
8797ec681f3Smrg      to_SDWA(ctx, instr);
8807ec681f3Smrg      static_cast<SDWA_instruction*>(instr.get())->sel[idx] = sel;
8817ec681f3Smrg   } else if (instr->isVOP3()) {
8827ec681f3Smrg      if (sel.offset())
8837ec681f3Smrg         instr->vop3().opsel |= 1 << idx;
8847ec681f3Smrg   }
8857ec681f3Smrg
8867ec681f3Smrg   /* label_vopc seems to be the only one worth keeping at the moment */
8877ec681f3Smrg   for (Definition& def : instr->definitions)
8887ec681f3Smrg      ctx.info[def.tempId()].label &= label_vopc;
8897ec681f3Smrg}
8907ec681f3Smrg
8917ec681f3Smrgvoid
8927ec681f3Smrgcheck_sdwa_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr)
8937ec681f3Smrg{
8947ec681f3Smrg   for (unsigned i = 0; i < instr->operands.size(); i++) {
8957ec681f3Smrg      Operand op = instr->operands[i];
8967ec681f3Smrg      if (!op.isTemp())
8977ec681f3Smrg         continue;
8987ec681f3Smrg      ssa_info& info = ctx.info[op.tempId()];
8997ec681f3Smrg      if (info.is_extract() && (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
9007ec681f3Smrg                                op.getTemp().type() == RegType::sgpr)) {
9017ec681f3Smrg         if (!can_apply_extract(ctx, instr, i, info))
9027ec681f3Smrg            info.label &= ~label_extract;
9037ec681f3Smrg      }
9047ec681f3Smrg   }
9057ec681f3Smrg}
9067ec681f3Smrg
9077ec681f3Smrgbool
9087ec681f3Smrgdoes_fp_op_flush_denorms(opt_ctx& ctx, aco_opcode op)
9097ec681f3Smrg{
9107ec681f3Smrg   if (ctx.program->chip_class <= GFX8) {
9117ec681f3Smrg      switch (op) {
9127ec681f3Smrg      case aco_opcode::v_min_f32:
9137ec681f3Smrg      case aco_opcode::v_max_f32:
9147ec681f3Smrg      case aco_opcode::v_med3_f32:
9157ec681f3Smrg      case aco_opcode::v_min3_f32:
9167ec681f3Smrg      case aco_opcode::v_max3_f32:
9177ec681f3Smrg      case aco_opcode::v_min_f16:
9187ec681f3Smrg      case aco_opcode::v_max_f16: return false;
9197ec681f3Smrg      default: break;
9207ec681f3Smrg      }
9217ec681f3Smrg   }
9227ec681f3Smrg   return op != aco_opcode::v_cndmask_b32;
9237ec681f3Smrg}
9247ec681f3Smrg
9257ec681f3Smrgbool
9267ec681f3Smrgcan_eliminate_fcanonicalize(opt_ctx& ctx, aco_ptr<Instruction>& instr, Temp tmp)
9277ec681f3Smrg{
9287ec681f3Smrg   float_mode* fp = &ctx.fp_mode;
9297ec681f3Smrg   if (ctx.info[tmp.id()].is_canonicalized() ||
9307ec681f3Smrg       (tmp.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
9317ec681f3Smrg      return true;
9327ec681f3Smrg
9337ec681f3Smrg   aco_opcode op = instr->opcode;
9347ec681f3Smrg   return instr_info.can_use_input_modifiers[(int)op] && does_fp_op_flush_denorms(ctx, op);
9357ec681f3Smrg}
9367ec681f3Smrg
9377ec681f3Smrgbool
9387ec681f3Smrgis_copy_label(opt_ctx& ctx, aco_ptr<Instruction>& instr, ssa_info& info)
9397ec681f3Smrg{
9407ec681f3Smrg   return info.is_temp() ||
9417ec681f3Smrg          (info.is_fcanonicalize() && can_eliminate_fcanonicalize(ctx, instr, info.temp));
9427ec681f3Smrg}
9437ec681f3Smrg
9447ec681f3Smrgbool
9457ec681f3Smrgis_op_canonicalized(opt_ctx& ctx, Operand op)
9467ec681f3Smrg{
9477ec681f3Smrg   float_mode* fp = &ctx.fp_mode;
9487ec681f3Smrg   if ((op.isTemp() && ctx.info[op.tempId()].is_canonicalized()) ||
9497ec681f3Smrg       (op.bytes() == 4 ? fp->denorm32 : fp->denorm16_64) == fp_denorm_keep)
9507ec681f3Smrg      return true;
9517ec681f3Smrg
9527ec681f3Smrg   if (op.isConstant() || (op.isTemp() && ctx.info[op.tempId()].is_constant_or_literal(32))) {
9537ec681f3Smrg      uint32_t val = op.isTemp() ? ctx.info[op.tempId()].val : op.constantValue();
9547ec681f3Smrg      if (op.bytes() == 2)
9557ec681f3Smrg         return (val & 0x7fff) == 0 || (val & 0x7fff) > 0x3ff;
9567ec681f3Smrg      else if (op.bytes() == 4)
9577ec681f3Smrg         return (val & 0x7fffffff) == 0 || (val & 0x7fffffff) > 0x7fffff;
9587ec681f3Smrg   }
9597ec681f3Smrg   return false;
9607ec681f3Smrg}
9617ec681f3Smrg
9627ec681f3Smrgvoid
9637ec681f3Smrglabel_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
9647ec681f3Smrg{
9657ec681f3Smrg   if (instr->isSALU() || instr->isVALU() || instr->isPseudo()) {
9667ec681f3Smrg      ASSERTED bool all_const = false;
9677ec681f3Smrg      for (Operand& op : instr->operands)
9687ec681f3Smrg         all_const =
9697ec681f3Smrg            all_const && (!op.isTemp() || ctx.info[op.tempId()].is_constant_or_literal(32));
9707ec681f3Smrg      perfwarn(ctx.program, all_const, "All instruction operands are constant", instr.get());
9717ec681f3Smrg
9727ec681f3Smrg      ASSERTED bool is_copy = instr->opcode == aco_opcode::s_mov_b32 ||
9737ec681f3Smrg                              instr->opcode == aco_opcode::s_mov_b64 ||
9747ec681f3Smrg                              instr->opcode == aco_opcode::v_mov_b32;
9757ec681f3Smrg      perfwarn(ctx.program, is_copy && !instr->usesModifiers(), "Use p_parallelcopy instead",
9767ec681f3Smrg               instr.get());
9777ec681f3Smrg   }
9787ec681f3Smrg
9797ec681f3Smrg   for (unsigned i = 0; i < instr->operands.size(); i++) {
9807ec681f3Smrg      if (!instr->operands[i].isTemp())
9817ec681f3Smrg         continue;
9827ec681f3Smrg
9837ec681f3Smrg      ssa_info info = ctx.info[instr->operands[i].tempId()];
9847ec681f3Smrg      /* propagate undef */
9857ec681f3Smrg      if (info.is_undefined() && is_phi(instr))
9867ec681f3Smrg         instr->operands[i] = Operand(instr->operands[i].regClass());
9877ec681f3Smrg      /* propagate reg->reg of same type */
9887ec681f3Smrg      while (info.is_temp() && info.temp.regClass() == instr->operands[i].getTemp().regClass()) {
9897ec681f3Smrg         instr->operands[i].setTemp(ctx.info[instr->operands[i].tempId()].temp);
9907ec681f3Smrg         info = ctx.info[info.temp.id()];
9917ec681f3Smrg      }
9927ec681f3Smrg
9937ec681f3Smrg      /* PSEUDO: propagate temporaries */
9947ec681f3Smrg      if (instr->isPseudo()) {
9957ec681f3Smrg         while (info.is_temp()) {
9967ec681f3Smrg            pseudo_propagate_temp(ctx, instr, info.temp, i);
9977ec681f3Smrg            info = ctx.info[info.temp.id()];
9987ec681f3Smrg         }
9997ec681f3Smrg      }
10007ec681f3Smrg
10017ec681f3Smrg      /* SALU / PSEUDO: propagate inline constants */
10027ec681f3Smrg      if (instr->isSALU() || instr->isPseudo()) {
10037ec681f3Smrg         unsigned bits = get_operand_size(instr, i);
10047ec681f3Smrg         if ((info.is_constant(bits) || (info.is_literal(bits) && instr->isPseudo())) &&
10057ec681f3Smrg             !instr->operands[i].isFixed() && alu_can_accept_constant(instr->opcode, i)) {
10067ec681f3Smrg            instr->operands[i] = get_constant_op(ctx, info, bits);
10077ec681f3Smrg            continue;
10087ec681f3Smrg         }
10097ec681f3Smrg      }
10107ec681f3Smrg
10117ec681f3Smrg      /* VALU: propagate neg, abs & inline constants */
10127ec681f3Smrg      else if (instr->isVALU()) {
10137ec681f3Smrg         if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::vgpr &&
10147ec681f3Smrg             valu_can_accept_vgpr(instr, i)) {
10157ec681f3Smrg            instr->operands[i].setTemp(info.temp);
10167ec681f3Smrg            info = ctx.info[info.temp.id()];
10177ec681f3Smrg         }
10187ec681f3Smrg         /* applying SGPRs to VOP1 doesn't increase code size and DCE is helped by doing it earlier */
10197ec681f3Smrg         if (info.is_temp() && info.temp.type() == RegType::sgpr && can_apply_sgprs(ctx, instr) &&
10207ec681f3Smrg             instr->operands.size() == 1) {
10217ec681f3Smrg            instr->format = withoutDPP(instr->format);
10227ec681f3Smrg            instr->operands[i].setTemp(info.temp);
10237ec681f3Smrg            info = ctx.info[info.temp.id()];
10247ec681f3Smrg         }
10257ec681f3Smrg
10267ec681f3Smrg         /* for instructions other than v_cndmask_b32, the size of the instruction should match the
10277ec681f3Smrg          * operand size */
10287ec681f3Smrg         unsigned can_use_mod =
10297ec681f3Smrg            instr->opcode != aco_opcode::v_cndmask_b32 || instr->operands[i].getTemp().bytes() == 4;
10307ec681f3Smrg         can_use_mod = can_use_mod && instr_info.can_use_input_modifiers[(int)instr->opcode];
10317ec681f3Smrg
10327ec681f3Smrg         if (instr->isSDWA())
10337ec681f3Smrg            can_use_mod = can_use_mod && instr->sdwa().sel[i].size() == 4;
10347ec681f3Smrg         else
10357ec681f3Smrg            can_use_mod = can_use_mod && (instr->isDPP() || can_use_VOP3(ctx, instr));
10367ec681f3Smrg
10377ec681f3Smrg         if (info.is_neg() && instr->opcode == aco_opcode::v_add_f32) {
10387ec681f3Smrg            instr->opcode = i ? aco_opcode::v_sub_f32 : aco_opcode::v_subrev_f32;
10397ec681f3Smrg            instr->operands[i].setTemp(info.temp);
10407ec681f3Smrg         } else if (info.is_neg() && instr->opcode == aco_opcode::v_add_f16) {
10417ec681f3Smrg            instr->opcode = i ? aco_opcode::v_sub_f16 : aco_opcode::v_subrev_f16;
10427ec681f3Smrg            instr->operands[i].setTemp(info.temp);
10437ec681f3Smrg         } else if (info.is_neg() && can_use_mod &&
10447ec681f3Smrg                    can_eliminate_fcanonicalize(ctx, instr, info.temp)) {
10457ec681f3Smrg            if (!instr->isDPP() && !instr->isSDWA())
10467ec681f3Smrg               to_VOP3(ctx, instr);
10477ec681f3Smrg            instr->operands[i].setTemp(info.temp);
10487ec681f3Smrg            if (instr->isDPP() && !instr->dpp().abs[i])
10497ec681f3Smrg               instr->dpp().neg[i] = true;
10507ec681f3Smrg            else if (instr->isSDWA() && !instr->sdwa().abs[i])
10517ec681f3Smrg               instr->sdwa().neg[i] = true;
10527ec681f3Smrg            else if (instr->isVOP3() && !instr->vop3().abs[i])
10537ec681f3Smrg               instr->vop3().neg[i] = true;
10547ec681f3Smrg         }
10557ec681f3Smrg         if (info.is_abs() && can_use_mod && can_eliminate_fcanonicalize(ctx, instr, info.temp)) {
10567ec681f3Smrg            if (!instr->isDPP() && !instr->isSDWA())
10577ec681f3Smrg               to_VOP3(ctx, instr);
10587ec681f3Smrg            instr->operands[i] = Operand(info.temp);
10597ec681f3Smrg            if (instr->isDPP())
10607ec681f3Smrg               instr->dpp().abs[i] = true;
10617ec681f3Smrg            else if (instr->isSDWA())
10627ec681f3Smrg               instr->sdwa().abs[i] = true;
10637ec681f3Smrg            else
10647ec681f3Smrg               instr->vop3().abs[i] = true;
10657ec681f3Smrg            continue;
10667ec681f3Smrg         }
10677ec681f3Smrg
10687ec681f3Smrg         unsigned bits = get_operand_size(instr, i);
10697ec681f3Smrg         if (info.is_constant(bits) && alu_can_accept_constant(instr->opcode, i) &&
10707ec681f3Smrg             (!instr->isSDWA() || ctx.program->chip_class >= GFX9)) {
10717ec681f3Smrg            Operand op = get_constant_op(ctx, info, bits);
10727ec681f3Smrg            perfwarn(ctx.program, instr->opcode == aco_opcode::v_cndmask_b32 && i == 2,
10737ec681f3Smrg                     "v_cndmask_b32 with a constant selector", instr.get());
10747ec681f3Smrg            if (i == 0 || instr->isSDWA() || instr->isVOP3P() ||
10757ec681f3Smrg                instr->opcode == aco_opcode::v_readlane_b32 ||
10767ec681f3Smrg                instr->opcode == aco_opcode::v_writelane_b32) {
10777ec681f3Smrg               instr->format = withoutDPP(instr->format);
10787ec681f3Smrg               instr->operands[i] = op;
10797ec681f3Smrg               continue;
10807ec681f3Smrg            } else if (!instr->isVOP3() && can_swap_operands(instr, &instr->opcode)) {
10817ec681f3Smrg               instr->operands[i] = instr->operands[0];
10827ec681f3Smrg               instr->operands[0] = op;
10837ec681f3Smrg               continue;
10847ec681f3Smrg            } else if (can_use_VOP3(ctx, instr)) {
10857ec681f3Smrg               to_VOP3(ctx, instr);
10867ec681f3Smrg               instr->operands[i] = op;
10877ec681f3Smrg               continue;
10887ec681f3Smrg            }
10897ec681f3Smrg         }
10907ec681f3Smrg      }
10917ec681f3Smrg
10927ec681f3Smrg      /* MUBUF: propagate constants and combine additions */
10937ec681f3Smrg      else if (instr->isMUBUF()) {
10947ec681f3Smrg         MUBUF_instruction& mubuf = instr->mubuf();
10957ec681f3Smrg         Temp base;
10967ec681f3Smrg         uint32_t offset;
10977ec681f3Smrg         while (info.is_temp())
10987ec681f3Smrg            info = ctx.info[info.temp.id()];
10997ec681f3Smrg
11007ec681f3Smrg         /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr
11017ec681f3Smrg          * overflow for scratch accesses works only on GFX9+ and saddr overflow
11027ec681f3Smrg          * never works. Since swizzling is the only thing that separates
11037ec681f3Smrg          * scratch accesses and other accesses and swizzling changing how
11047ec681f3Smrg          * addressing works significantly, this probably applies to swizzled
11057ec681f3Smrg          * MUBUF accesses. */
11067ec681f3Smrg         bool vaddr_prevent_overflow = mubuf.swizzled && ctx.program->chip_class < GFX9;
11077ec681f3Smrg         bool saddr_prevent_overflow = mubuf.swizzled;
11087ec681f3Smrg
11097ec681f3Smrg         if (mubuf.offen && i == 1 && info.is_constant_or_literal(32) &&
11107ec681f3Smrg             mubuf.offset + info.val < 4096) {
11117ec681f3Smrg            assert(!mubuf.idxen);
11127ec681f3Smrg            instr->operands[1] = Operand(v1);
11137ec681f3Smrg            mubuf.offset += info.val;
11147ec681f3Smrg            mubuf.offen = false;
11157ec681f3Smrg            continue;
11167ec681f3Smrg         } else if (i == 2 && info.is_constant_or_literal(32) && mubuf.offset + info.val < 4096) {
11177ec681f3Smrg            instr->operands[2] = Operand::c32(0);
11187ec681f3Smrg            mubuf.offset += info.val;
11197ec681f3Smrg            continue;
11207ec681f3Smrg         } else if (mubuf.offen && i == 1 &&
11217ec681f3Smrg                    parse_base_offset(ctx, instr.get(), i, &base, &offset,
11227ec681f3Smrg                                      vaddr_prevent_overflow) &&
11237ec681f3Smrg                    base.regClass() == v1 && mubuf.offset + offset < 4096) {
11247ec681f3Smrg            assert(!mubuf.idxen);
11257ec681f3Smrg            instr->operands[1].setTemp(base);
11267ec681f3Smrg            mubuf.offset += offset;
11277ec681f3Smrg            continue;
11287ec681f3Smrg         } else if (i == 2 &&
11297ec681f3Smrg                    parse_base_offset(ctx, instr.get(), i, &base, &offset,
11307ec681f3Smrg                                      saddr_prevent_overflow) &&
11317ec681f3Smrg                    base.regClass() == s1 && mubuf.offset + offset < 4096) {
11327ec681f3Smrg            instr->operands[i].setTemp(base);
11337ec681f3Smrg            mubuf.offset += offset;
11347ec681f3Smrg            continue;
11357ec681f3Smrg         }
11367ec681f3Smrg      }
11377ec681f3Smrg
11387ec681f3Smrg      /* DS: combine additions */
11397ec681f3Smrg      else if (instr->isDS()) {
11407ec681f3Smrg
11417ec681f3Smrg         DS_instruction& ds = instr->ds();
11427ec681f3Smrg         Temp base;
11437ec681f3Smrg         uint32_t offset;
11447ec681f3Smrg         bool has_usable_ds_offset = ctx.program->chip_class >= GFX7;
11457ec681f3Smrg         if (has_usable_ds_offset && i == 0 &&
11467ec681f3Smrg             parse_base_offset(ctx, instr.get(), i, &base, &offset, false) &&
11477ec681f3Smrg             base.regClass() == instr->operands[i].regClass() &&
11487ec681f3Smrg             instr->opcode != aco_opcode::ds_swizzle_b32) {
11497ec681f3Smrg            if (instr->opcode == aco_opcode::ds_write2_b32 ||
11507ec681f3Smrg                instr->opcode == aco_opcode::ds_read2_b32 ||
11517ec681f3Smrg                instr->opcode == aco_opcode::ds_write2_b64 ||
11527ec681f3Smrg                instr->opcode == aco_opcode::ds_read2_b64) {
11537ec681f3Smrg               unsigned mask = (instr->opcode == aco_opcode::ds_write2_b64 ||
11547ec681f3Smrg                                instr->opcode == aco_opcode::ds_read2_b64)
11557ec681f3Smrg                                  ? 0x7
11567ec681f3Smrg                                  : 0x3;
11577ec681f3Smrg               unsigned shifts = (instr->opcode == aco_opcode::ds_write2_b64 ||
11587ec681f3Smrg                                  instr->opcode == aco_opcode::ds_read2_b64)
11597ec681f3Smrg                                    ? 3
11607ec681f3Smrg                                    : 2;
11617ec681f3Smrg
11627ec681f3Smrg               if ((offset & mask) == 0 && ds.offset0 + (offset >> shifts) <= 255 &&
11637ec681f3Smrg                   ds.offset1 + (offset >> shifts) <= 255) {
11647ec681f3Smrg                  instr->operands[i].setTemp(base);
11657ec681f3Smrg                  ds.offset0 += offset >> shifts;
11667ec681f3Smrg                  ds.offset1 += offset >> shifts;
11677ec681f3Smrg               }
11687ec681f3Smrg            } else {
11697ec681f3Smrg               if (ds.offset0 + offset <= 65535) {
11707ec681f3Smrg                  instr->operands[i].setTemp(base);
11717ec681f3Smrg                  ds.offset0 += offset;
11727ec681f3Smrg               }
11737ec681f3Smrg            }
11747ec681f3Smrg         }
11757ec681f3Smrg      }
11767ec681f3Smrg
11777ec681f3Smrg      /* SMEM: propagate constants and combine additions */
11787ec681f3Smrg      else if (instr->isSMEM()) {
11797ec681f3Smrg
11807ec681f3Smrg         SMEM_instruction& smem = instr->smem();
11817ec681f3Smrg         Temp base;
11827ec681f3Smrg         uint32_t offset;
11837ec681f3Smrg         bool prevent_overflow = smem.operands[0].size() > 2 || smem.prevent_overflow;
11847ec681f3Smrg         if (i == 1 && info.is_constant_or_literal(32) &&
11857ec681f3Smrg             ((ctx.program->chip_class == GFX6 && info.val <= 0x3FF) ||
11867ec681f3Smrg              (ctx.program->chip_class == GFX7 && info.val <= 0xFFFFFFFF) ||
11877ec681f3Smrg              (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) {
11887ec681f3Smrg            instr->operands[i] = Operand::c32(info.val);
11897ec681f3Smrg            continue;
11907ec681f3Smrg         } else if (i == 1 &&
11917ec681f3Smrg                    parse_base_offset(ctx, instr.get(), i, &base, &offset, prevent_overflow) &&
11927ec681f3Smrg                    base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) {
11937ec681f3Smrg            bool soe = smem.operands.size() >= (!smem.definitions.empty() ? 3 : 4);
11947ec681f3Smrg            if (soe && (!ctx.info[smem.operands.back().tempId()].is_constant_or_literal(32) ||
11957ec681f3Smrg                        ctx.info[smem.operands.back().tempId()].val != 0)) {
11967ec681f3Smrg               continue;
11977ec681f3Smrg            }
11987ec681f3Smrg            if (soe) {
11997ec681f3Smrg               smem.operands[1] = Operand::c32(offset);
12007ec681f3Smrg               smem.operands.back() = Operand(base);
12017ec681f3Smrg            } else {
12027ec681f3Smrg               SMEM_instruction* new_instr = create_instruction<SMEM_instruction>(
12037ec681f3Smrg                  smem.opcode, Format::SMEM, smem.operands.size() + 1, smem.definitions.size());
12047ec681f3Smrg               new_instr->operands[0] = smem.operands[0];
12057ec681f3Smrg               new_instr->operands[1] = Operand::c32(offset);
12067ec681f3Smrg               if (smem.definitions.empty())
12077ec681f3Smrg                  new_instr->operands[2] = smem.operands[2];
12087ec681f3Smrg               new_instr->operands.back() = Operand(base);
12097ec681f3Smrg               if (!smem.definitions.empty())
12107ec681f3Smrg                  new_instr->definitions[0] = smem.definitions[0];
12117ec681f3Smrg               new_instr->sync = smem.sync;
12127ec681f3Smrg               new_instr->glc = smem.glc;
12137ec681f3Smrg               new_instr->dlc = smem.dlc;
12147ec681f3Smrg               new_instr->nv = smem.nv;
12157ec681f3Smrg               new_instr->disable_wqm = smem.disable_wqm;
12167ec681f3Smrg               instr.reset(new_instr);
12177ec681f3Smrg            }
12187ec681f3Smrg            continue;
12197ec681f3Smrg         }
12207ec681f3Smrg      }
12217ec681f3Smrg
12227ec681f3Smrg      else if (instr->isBranch()) {
12237ec681f3Smrg         if (ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
12247ec681f3Smrg            /* Flip the branch instruction to get rid of the scc_invert instruction */
12257ec681f3Smrg            instr->opcode = instr->opcode == aco_opcode::p_cbranch_z ? aco_opcode::p_cbranch_nz
12267ec681f3Smrg                                                                     : aco_opcode::p_cbranch_z;
12277ec681f3Smrg            instr->operands[0].setTemp(ctx.info[instr->operands[0].tempId()].temp);
12287ec681f3Smrg         }
12297ec681f3Smrg      }
12307ec681f3Smrg   }
12317ec681f3Smrg
12327ec681f3Smrg   /* if this instruction doesn't define anything, return */
12337ec681f3Smrg   if (instr->definitions.empty()) {
12347ec681f3Smrg      check_sdwa_extract(ctx, instr);
12357ec681f3Smrg      return;
12367ec681f3Smrg   }
12377ec681f3Smrg
12387ec681f3Smrg   if (instr->isVALU() || instr->isVINTRP()) {
12397ec681f3Smrg      if (instr_info.can_use_output_modifiers[(int)instr->opcode] || instr->isVINTRP() ||
12407ec681f3Smrg          instr->opcode == aco_opcode::v_cndmask_b32) {
12417ec681f3Smrg         bool canonicalized = true;
12427ec681f3Smrg         if (!does_fp_op_flush_denorms(ctx, instr->opcode)) {
12437ec681f3Smrg            unsigned ops = instr->opcode == aco_opcode::v_cndmask_b32 ? 2 : instr->operands.size();
12447ec681f3Smrg            for (unsigned i = 0; canonicalized && (i < ops); i++)
12457ec681f3Smrg               canonicalized = is_op_canonicalized(ctx, instr->operands[i]);
12467ec681f3Smrg         }
12477ec681f3Smrg         if (canonicalized)
12487ec681f3Smrg            ctx.info[instr->definitions[0].tempId()].set_canonicalized();
12497ec681f3Smrg      }
12507ec681f3Smrg
12517ec681f3Smrg      if (instr->isVOPC()) {
12527ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_vopc(instr.get());
12537ec681f3Smrg         check_sdwa_extract(ctx, instr);
12547ec681f3Smrg         return;
12557ec681f3Smrg      }
12567ec681f3Smrg      if (instr->isVOP3P()) {
12577ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
12587ec681f3Smrg         return;
12597ec681f3Smrg      }
12607ec681f3Smrg   }
12617ec681f3Smrg
12627ec681f3Smrg   switch (instr->opcode) {
12637ec681f3Smrg   case aco_opcode::p_create_vector: {
12647ec681f3Smrg      bool copy_prop = instr->operands.size() == 1 && instr->operands[0].isTemp() &&
12657ec681f3Smrg                       instr->operands[0].regClass() == instr->definitions[0].regClass();
12667ec681f3Smrg      if (copy_prop) {
12677ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
12687ec681f3Smrg         break;
12697ec681f3Smrg      }
12707ec681f3Smrg
12717ec681f3Smrg      /* expand vector operands */
12727ec681f3Smrg      std::vector<Operand> ops;
12737ec681f3Smrg      unsigned offset = 0;
12747ec681f3Smrg      for (const Operand& op : instr->operands) {
12757ec681f3Smrg         /* ensure that any expanded operands are properly aligned */
12767ec681f3Smrg         bool aligned = offset % 4 == 0 || op.bytes() < 4;
12777ec681f3Smrg         offset += op.bytes();
12787ec681f3Smrg         if (aligned && op.isTemp() && ctx.info[op.tempId()].is_vec()) {
12797ec681f3Smrg            Instruction* vec = ctx.info[op.tempId()].instr;
12807ec681f3Smrg            for (const Operand& vec_op : vec->operands)
12817ec681f3Smrg               ops.emplace_back(vec_op);
12827ec681f3Smrg         } else {
12837ec681f3Smrg            ops.emplace_back(op);
12847ec681f3Smrg         }
12857ec681f3Smrg      }
12867ec681f3Smrg
12877ec681f3Smrg      /* combine expanded operands to new vector */
12887ec681f3Smrg      if (ops.size() != instr->operands.size()) {
12897ec681f3Smrg         assert(ops.size() > instr->operands.size());
12907ec681f3Smrg         Definition def = instr->definitions[0];
12917ec681f3Smrg         instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
12927ec681f3Smrg                                                            Format::PSEUDO, ops.size(), 1));
12937ec681f3Smrg         for (unsigned i = 0; i < ops.size(); i++) {
12947ec681f3Smrg            if (ops[i].isTemp() && ctx.info[ops[i].tempId()].is_temp() &&
12957ec681f3Smrg                ops[i].regClass() == ctx.info[ops[i].tempId()].temp.regClass())
12967ec681f3Smrg               ops[i].setTemp(ctx.info[ops[i].tempId()].temp);
12977ec681f3Smrg            instr->operands[i] = ops[i];
12987ec681f3Smrg         }
12997ec681f3Smrg         instr->definitions[0] = def;
13007ec681f3Smrg      } else {
13017ec681f3Smrg         for (unsigned i = 0; i < ops.size(); i++) {
13027ec681f3Smrg            assert(instr->operands[i] == ops[i]);
13037ec681f3Smrg         }
13047ec681f3Smrg      }
13057ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
13067ec681f3Smrg      break;
13077ec681f3Smrg   }
13087ec681f3Smrg   case aco_opcode::p_split_vector: {
13097ec681f3Smrg      ssa_info& info = ctx.info[instr->operands[0].tempId()];
13107ec681f3Smrg
13117ec681f3Smrg      if (info.is_constant_or_literal(32)) {
13127ec681f3Smrg         uint32_t val = info.val;
13137ec681f3Smrg         for (Definition def : instr->definitions) {
13147ec681f3Smrg            uint32_t mask = u_bit_consecutive(0, def.bytes() * 8u);
13157ec681f3Smrg            ctx.info[def.tempId()].set_constant(ctx.program->chip_class, val & mask);
13167ec681f3Smrg            val >>= def.bytes() * 8u;
13177ec681f3Smrg         }
13187ec681f3Smrg         break;
13197ec681f3Smrg      } else if (!info.is_vec()) {
13207ec681f3Smrg         break;
13217ec681f3Smrg      }
13227ec681f3Smrg
13237ec681f3Smrg      Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
13247ec681f3Smrg      unsigned split_offset = 0;
13257ec681f3Smrg      unsigned vec_offset = 0;
13267ec681f3Smrg      unsigned vec_index = 0;
13277ec681f3Smrg      for (unsigned i = 0; i < instr->definitions.size();
13287ec681f3Smrg           split_offset += instr->definitions[i++].bytes()) {
13297ec681f3Smrg         while (vec_offset < split_offset && vec_index < vec->operands.size())
13307ec681f3Smrg            vec_offset += vec->operands[vec_index++].bytes();
13317ec681f3Smrg
13327ec681f3Smrg         if (vec_offset != split_offset ||
13337ec681f3Smrg             vec->operands[vec_index].bytes() != instr->definitions[i].bytes())
13347ec681f3Smrg            continue;
13357ec681f3Smrg
13367ec681f3Smrg         Operand vec_op = vec->operands[vec_index];
13377ec681f3Smrg         if (vec_op.isConstant()) {
13387ec681f3Smrg            ctx.info[instr->definitions[i].tempId()].set_constant(ctx.program->chip_class,
13397ec681f3Smrg                                                                  vec_op.constantValue64());
13407ec681f3Smrg         } else if (vec_op.isUndefined()) {
13417ec681f3Smrg            ctx.info[instr->definitions[i].tempId()].set_undefined();
13427ec681f3Smrg         } else {
13437ec681f3Smrg            assert(vec_op.isTemp());
13447ec681f3Smrg            ctx.info[instr->definitions[i].tempId()].set_temp(vec_op.getTemp());
13457ec681f3Smrg         }
13467ec681f3Smrg      }
13477ec681f3Smrg      break;
13487ec681f3Smrg   }
13497ec681f3Smrg   case aco_opcode::p_extract_vector: { /* mov */
13507ec681f3Smrg      ssa_info& info = ctx.info[instr->operands[0].tempId()];
13517ec681f3Smrg      const unsigned index = instr->operands[1].constantValue();
13527ec681f3Smrg      const unsigned dst_offset = index * instr->definitions[0].bytes();
13537ec681f3Smrg
13547ec681f3Smrg      if (info.is_vec()) {
13557ec681f3Smrg         /* check if we index directly into a vector element */
13567ec681f3Smrg         Instruction* vec = info.instr;
13577ec681f3Smrg         unsigned offset = 0;
13587ec681f3Smrg
13597ec681f3Smrg         for (const Operand& op : vec->operands) {
13607ec681f3Smrg            if (offset < dst_offset) {
13617ec681f3Smrg               offset += op.bytes();
13627ec681f3Smrg               continue;
13637ec681f3Smrg            } else if (offset != dst_offset || op.bytes() != instr->definitions[0].bytes()) {
13647ec681f3Smrg               break;
13657ec681f3Smrg            }
13667ec681f3Smrg            instr->operands[0] = op;
13677ec681f3Smrg            break;
13687ec681f3Smrg         }
13697ec681f3Smrg      } else if (info.is_constant_or_literal(32)) {
13707ec681f3Smrg         /* propagate constants */
13717ec681f3Smrg         uint32_t mask = u_bit_consecutive(0, instr->definitions[0].bytes() * 8u);
13727ec681f3Smrg         uint32_t val = (info.val >> (dst_offset * 8u)) & mask;
13737ec681f3Smrg         instr->operands[0] =
13747ec681f3Smrg            Operand::get_const(ctx.program->chip_class, val, instr->definitions[0].bytes());
13757ec681f3Smrg         ;
13767ec681f3Smrg      } else if (index == 0 && instr->operands[0].size() == instr->definitions[0].size()) {
13777ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
13787ec681f3Smrg      }
13797ec681f3Smrg
13807ec681f3Smrg      if (instr->operands[0].bytes() != instr->definitions[0].bytes())
13817ec681f3Smrg         break;
13827ec681f3Smrg
13837ec681f3Smrg      /* convert this extract into a copy instruction */
13847ec681f3Smrg      instr->opcode = aco_opcode::p_parallelcopy;
13857ec681f3Smrg      instr->operands.pop_back();
13867ec681f3Smrg      FALLTHROUGH;
13877ec681f3Smrg   }
13887ec681f3Smrg   case aco_opcode::p_parallelcopy: /* propagate */
13897ec681f3Smrg      if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() &&
13907ec681f3Smrg          instr->operands[0].regClass() != instr->definitions[0].regClass()) {
13917ec681f3Smrg         /* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so
13927ec681f3Smrg          * duplicate the vector instead.
13937ec681f3Smrg          */
13947ec681f3Smrg         Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
13957ec681f3Smrg         aco_ptr<Instruction> old_copy = std::move(instr);
13967ec681f3Smrg
13977ec681f3Smrg         instr.reset(create_instruction<Pseudo_instruction>(
13987ec681f3Smrg            aco_opcode::p_create_vector, Format::PSEUDO, vec->operands.size(), 1));
13997ec681f3Smrg         instr->definitions[0] = old_copy->definitions[0];
14007ec681f3Smrg         std::copy(vec->operands.begin(), vec->operands.end(), instr->operands.begin());
14017ec681f3Smrg         for (unsigned i = 0; i < vec->operands.size(); i++) {
14027ec681f3Smrg            Operand& op = instr->operands[i];
14037ec681f3Smrg            if (op.isTemp() && ctx.info[op.tempId()].is_temp() &&
14047ec681f3Smrg                ctx.info[op.tempId()].temp.type() == instr->definitions[0].regClass().type())
14057ec681f3Smrg               op.setTemp(ctx.info[op.tempId()].temp);
14067ec681f3Smrg         }
14077ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_vec(instr.get());
14087ec681f3Smrg         break;
14097ec681f3Smrg      }
14107ec681f3Smrg      FALLTHROUGH;
14117ec681f3Smrg   case aco_opcode::p_as_uniform:
14127ec681f3Smrg      if (instr->definitions[0].isFixed()) {
14137ec681f3Smrg         /* don't copy-propagate copies into fixed registers */
14147ec681f3Smrg      } else if (instr->usesModifiers()) {
14157ec681f3Smrg         // TODO
14167ec681f3Smrg      } else if (instr->operands[0].isConstant()) {
14177ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_constant(
14187ec681f3Smrg            ctx.program->chip_class, instr->operands[0].constantValue64());
14197ec681f3Smrg      } else if (instr->operands[0].isTemp()) {
14207ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
14217ec681f3Smrg         if (ctx.info[instr->operands[0].tempId()].is_canonicalized())
14227ec681f3Smrg            ctx.info[instr->definitions[0].tempId()].set_canonicalized();
14237ec681f3Smrg      } else {
14247ec681f3Smrg         assert(instr->operands[0].isFixed());
14257ec681f3Smrg      }
14267ec681f3Smrg      break;
14277ec681f3Smrg   case aco_opcode::v_mov_b32:
14287ec681f3Smrg      if (instr->isDPP()) {
14297ec681f3Smrg         /* anything else doesn't make sense in SSA */
14307ec681f3Smrg         assert(instr->dpp().row_mask == 0xf && instr->dpp().bank_mask == 0xf);
14317ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_dpp(instr.get());
14327ec681f3Smrg      }
14337ec681f3Smrg      break;
14347ec681f3Smrg   case aco_opcode::p_is_helper:
14357ec681f3Smrg      if (!ctx.program->needs_wqm)
14367ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u);
14377ec681f3Smrg      break;
14387ec681f3Smrg   case aco_opcode::v_mul_f64: ctx.info[instr->definitions[0].tempId()].set_mul(instr.get()); break;
14397ec681f3Smrg   case aco_opcode::v_mul_f16:
14407ec681f3Smrg   case aco_opcode::v_mul_f32: { /* omod */
14417ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
14427ec681f3Smrg
14437ec681f3Smrg      /* TODO: try to move the negate/abs modifier to the consumer instead */
14447ec681f3Smrg      bool uses_mods = instr->usesModifiers();
14457ec681f3Smrg      bool fp16 = instr->opcode == aco_opcode::v_mul_f16;
14467ec681f3Smrg
14477ec681f3Smrg      for (unsigned i = 0; i < 2; i++) {
14487ec681f3Smrg         if (instr->operands[!i].isConstant() && instr->operands[i].isTemp()) {
14497ec681f3Smrg            if (!instr->isDPP() && !instr->isSDWA() &&
14507ec681f3Smrg                (instr->operands[!i].constantEquals(fp16 ? 0x3c00 : 0x3f800000) ||   /* 1.0 */
14517ec681f3Smrg                 instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u))) { /* -1.0 */
14527ec681f3Smrg               bool neg1 = instr->operands[!i].constantEquals(fp16 ? 0xbc00 : 0xbf800000u);
14537ec681f3Smrg
14547ec681f3Smrg               VOP3_instruction* vop3 = instr->isVOP3() ? &instr->vop3() : NULL;
14557ec681f3Smrg               if (vop3 && (vop3->abs[!i] || vop3->neg[!i] || vop3->clamp || vop3->omod))
14567ec681f3Smrg                  continue;
14577ec681f3Smrg
14587ec681f3Smrg               bool abs = vop3 && vop3->abs[i];
14597ec681f3Smrg               bool neg = neg1 ^ (vop3 && vop3->neg[i]);
14607ec681f3Smrg
14617ec681f3Smrg               Temp other = instr->operands[i].getTemp();
14627ec681f3Smrg               if (abs && neg && other.type() == RegType::vgpr)
14637ec681f3Smrg                  ctx.info[instr->definitions[0].tempId()].set_neg_abs(other);
14647ec681f3Smrg               else if (abs && !neg && other.type() == RegType::vgpr)
14657ec681f3Smrg                  ctx.info[instr->definitions[0].tempId()].set_abs(other);
14667ec681f3Smrg               else if (!abs && neg && other.type() == RegType::vgpr)
14677ec681f3Smrg                  ctx.info[instr->definitions[0].tempId()].set_neg(other);
14687ec681f3Smrg               else if (!abs && !neg)
14697ec681f3Smrg                  ctx.info[instr->definitions[0].tempId()].set_fcanonicalize(other);
14707ec681f3Smrg            } else if (uses_mods) {
14717ec681f3Smrg               continue;
14727ec681f3Smrg            } else if (instr->operands[!i].constantValue() ==
14737ec681f3Smrg                       (fp16 ? 0x4000 : 0x40000000)) { /* 2.0 */
14747ec681f3Smrg               ctx.info[instr->operands[i].tempId()].set_omod2(instr.get());
14757ec681f3Smrg            } else if (instr->operands[!i].constantValue() ==
14767ec681f3Smrg                       (fp16 ? 0x4400 : 0x40800000)) { /* 4.0 */
14777ec681f3Smrg               ctx.info[instr->operands[i].tempId()].set_omod4(instr.get());
14787ec681f3Smrg            } else if (instr->operands[!i].constantValue() ==
14797ec681f3Smrg                       (fp16 ? 0x3800 : 0x3f000000)) { /* 0.5 */
14807ec681f3Smrg               ctx.info[instr->operands[i].tempId()].set_omod5(instr.get());
14817ec681f3Smrg            } else if (instr->operands[!i].constantValue() == 0u &&
14827ec681f3Smrg                       !(fp16 ? ctx.fp_mode.preserve_signed_zero_inf_nan16_64
14837ec681f3Smrg                              : ctx.fp_mode.preserve_signed_zero_inf_nan32)) { /* 0.0 */
14847ec681f3Smrg               ctx.info[instr->definitions[0].tempId()].set_constant(ctx.program->chip_class, 0u);
14857ec681f3Smrg            } else {
14867ec681f3Smrg               continue;
14877ec681f3Smrg            }
14887ec681f3Smrg            break;
14897ec681f3Smrg         }
14907ec681f3Smrg      }
14917ec681f3Smrg      break;
14927ec681f3Smrg   }
14937ec681f3Smrg   case aco_opcode::v_mul_lo_u16:
14947ec681f3Smrg   case aco_opcode::v_mul_lo_u16_e64:
14957ec681f3Smrg   case aco_opcode::v_mul_u32_u24:
14967ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
14977ec681f3Smrg      break;
14987ec681f3Smrg   case aco_opcode::v_med3_f16:
14997ec681f3Smrg   case aco_opcode::v_med3_f32: { /* clamp */
15007ec681f3Smrg      VOP3_instruction& vop3 = instr->vop3();
15017ec681f3Smrg      if (vop3.abs[0] || vop3.abs[1] || vop3.abs[2] || vop3.neg[0] || vop3.neg[1] || vop3.neg[2] ||
15027ec681f3Smrg          vop3.omod != 0 || vop3.opsel != 0)
15037ec681f3Smrg         break;
15047ec681f3Smrg
15057ec681f3Smrg      unsigned idx = 0;
15067ec681f3Smrg      bool found_zero = false, found_one = false;
15077ec681f3Smrg      bool is_fp16 = instr->opcode == aco_opcode::v_med3_f16;
15087ec681f3Smrg      for (unsigned i = 0; i < 3; i++) {
15097ec681f3Smrg         if (instr->operands[i].constantEquals(0))
15107ec681f3Smrg            found_zero = true;
15117ec681f3Smrg         else if (instr->operands[i].constantEquals(is_fp16 ? 0x3c00 : 0x3f800000)) /* 1.0 */
15127ec681f3Smrg            found_one = true;
15137ec681f3Smrg         else
15147ec681f3Smrg            idx = i;
15157ec681f3Smrg      }
15167ec681f3Smrg      if (found_zero && found_one && instr->operands[idx].isTemp())
15177ec681f3Smrg         ctx.info[instr->operands[idx].tempId()].set_clamp(instr.get());
15187ec681f3Smrg      break;
15197ec681f3Smrg   }
15207ec681f3Smrg   case aco_opcode::v_cndmask_b32:
15217ec681f3Smrg      if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(0xFFFFFFFF))
15227ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_vcc(instr->operands[2].getTemp());
15237ec681f3Smrg      else if (instr->operands[0].constantEquals(0) &&
15247ec681f3Smrg               instr->operands[1].constantEquals(0x3f800000u))
15257ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_b2f(instr->operands[2].getTemp());
15267ec681f3Smrg      else if (instr->operands[0].constantEquals(0) && instr->operands[1].constantEquals(1))
15277ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_b2i(instr->operands[2].getTemp());
15287ec681f3Smrg
15297ec681f3Smrg      ctx.info[instr->operands[2].tempId()].set_vcc_hint();
15307ec681f3Smrg      break;
15317ec681f3Smrg   case aco_opcode::v_cmp_lg_u32:
15327ec681f3Smrg      if (instr->format == Format::VOPC && /* don't optimize VOP3 / SDWA / DPP */
15337ec681f3Smrg          instr->operands[0].constantEquals(0) && instr->operands[1].isTemp() &&
15347ec681f3Smrg          ctx.info[instr->operands[1].tempId()].is_vcc())
15357ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_temp(
15367ec681f3Smrg            ctx.info[instr->operands[1].tempId()].temp);
15377ec681f3Smrg      break;
15387ec681f3Smrg   case aco_opcode::p_linear_phi: {
15397ec681f3Smrg      /* lower_bool_phis() can create phis like this */
15407ec681f3Smrg      bool all_same_temp = instr->operands[0].isTemp();
15417ec681f3Smrg      /* this check is needed when moving uniform loop counters out of a divergent loop */
15427ec681f3Smrg      if (all_same_temp)
15437ec681f3Smrg         all_same_temp = instr->definitions[0].regClass() == instr->operands[0].regClass();
15447ec681f3Smrg      for (unsigned i = 1; all_same_temp && (i < instr->operands.size()); i++) {
15457ec681f3Smrg         if (!instr->operands[i].isTemp() ||
15467ec681f3Smrg             instr->operands[i].tempId() != instr->operands[0].tempId())
15477ec681f3Smrg            all_same_temp = false;
15487ec681f3Smrg      }
15497ec681f3Smrg      if (all_same_temp) {
15507ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
15517ec681f3Smrg      } else {
15527ec681f3Smrg         bool all_undef = instr->operands[0].isUndefined();
15537ec681f3Smrg         for (unsigned i = 1; all_undef && (i < instr->operands.size()); i++) {
15547ec681f3Smrg            if (!instr->operands[i].isUndefined())
15557ec681f3Smrg               all_undef = false;
15567ec681f3Smrg         }
15577ec681f3Smrg         if (all_undef)
15587ec681f3Smrg            ctx.info[instr->definitions[0].tempId()].set_undefined();
15597ec681f3Smrg      }
15607ec681f3Smrg      break;
15617ec681f3Smrg   }
15627ec681f3Smrg   case aco_opcode::v_add_u32:
15637ec681f3Smrg   case aco_opcode::v_add_co_u32:
15647ec681f3Smrg   case aco_opcode::v_add_co_u32_e64:
15657ec681f3Smrg   case aco_opcode::s_add_i32:
15667ec681f3Smrg   case aco_opcode::s_add_u32:
15677ec681f3Smrg   case aco_opcode::v_subbrev_co_u32:
15687ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
15697ec681f3Smrg      break;
15707ec681f3Smrg   case aco_opcode::s_not_b32:
15717ec681f3Smrg   case aco_opcode::s_not_b64:
15727ec681f3Smrg      if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
15737ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
15747ec681f3Smrg         ctx.info[instr->definitions[1].tempId()].set_scc_invert(
15757ec681f3Smrg            ctx.info[instr->operands[0].tempId()].temp);
15767ec681f3Smrg      } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
15777ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
15787ec681f3Smrg         ctx.info[instr->definitions[1].tempId()].set_scc_invert(
15797ec681f3Smrg            ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
15807ec681f3Smrg      }
15817ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
15827ec681f3Smrg      break;
15837ec681f3Smrg   case aco_opcode::s_and_b32:
15847ec681f3Smrg   case aco_opcode::s_and_b64:
15857ec681f3Smrg      if (fixed_to_exec(instr->operands[1]) && instr->operands[0].isTemp()) {
15867ec681f3Smrg         if (ctx.info[instr->operands[0].tempId()].is_uniform_bool()) {
15877ec681f3Smrg            /* Try to get rid of the superfluous s_cselect + s_and_b64 that comes from turning a
15887ec681f3Smrg             * uniform bool into divergent */
15897ec681f3Smrg            ctx.info[instr->definitions[1].tempId()].set_temp(
15907ec681f3Smrg               ctx.info[instr->operands[0].tempId()].temp);
15917ec681f3Smrg            ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
15927ec681f3Smrg               ctx.info[instr->operands[0].tempId()].temp);
15937ec681f3Smrg            break;
15947ec681f3Smrg         } else if (ctx.info[instr->operands[0].tempId()].is_uniform_bitwise()) {
15957ec681f3Smrg            /* Try to get rid of the superfluous s_and_b64, since the uniform bitwise instruction
15967ec681f3Smrg             * already produces the same SCC */
15977ec681f3Smrg            ctx.info[instr->definitions[1].tempId()].set_temp(
15987ec681f3Smrg               ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
15997ec681f3Smrg            ctx.info[instr->definitions[0].tempId()].set_uniform_bool(
16007ec681f3Smrg               ctx.info[instr->operands[0].tempId()].instr->definitions[1].getTemp());
16017ec681f3Smrg            break;
16027ec681f3Smrg         } else if ((ctx.program->stage.num_sw_stages() > 1 ||
16037ec681f3Smrg                     ctx.program->stage.hw == HWStage::NGG) &&
16047ec681f3Smrg                    instr->pass_flags == 1) {
16057ec681f3Smrg            /* In case of merged shaders, pass_flags=1 means that all lanes are active (exec=-1), so
16067ec681f3Smrg             * s_and is unnecessary. */
16077ec681f3Smrg            ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
16087ec681f3Smrg            break;
16097ec681f3Smrg         } else if (ctx.info[instr->operands[0].tempId()].is_vopc()) {
16107ec681f3Smrg            Instruction* vopc_instr = ctx.info[instr->operands[0].tempId()].instr;
16117ec681f3Smrg            /* Remove superfluous s_and when the VOPC instruction uses the same exec and thus
16127ec681f3Smrg             * already produces the same result */
16137ec681f3Smrg            if (vopc_instr->pass_flags == instr->pass_flags) {
16147ec681f3Smrg               assert(instr->pass_flags > 0);
16157ec681f3Smrg               ctx.info[instr->definitions[0].tempId()].set_temp(
16167ec681f3Smrg                  vopc_instr->definitions[0].getTemp());
16177ec681f3Smrg               break;
16187ec681f3Smrg            }
16197ec681f3Smrg         }
16207ec681f3Smrg      }
16217ec681f3Smrg      FALLTHROUGH;
16227ec681f3Smrg   case aco_opcode::s_or_b32:
16237ec681f3Smrg   case aco_opcode::s_or_b64:
16247ec681f3Smrg   case aco_opcode::s_xor_b32:
16257ec681f3Smrg   case aco_opcode::s_xor_b64:
16267ec681f3Smrg      if (std::all_of(instr->operands.begin(), instr->operands.end(),
16277ec681f3Smrg                      [&ctx](const Operand& op)
16287ec681f3Smrg                      {
16297ec681f3Smrg                         return op.isTemp() && (ctx.info[op.tempId()].is_uniform_bool() ||
16307ec681f3Smrg                                                ctx.info[op.tempId()].is_uniform_bitwise());
16317ec681f3Smrg                      })) {
16327ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_uniform_bitwise();
16337ec681f3Smrg      }
16347ec681f3Smrg      FALLTHROUGH;
16357ec681f3Smrg   case aco_opcode::s_lshl_b32:
16367ec681f3Smrg   case aco_opcode::v_or_b32:
16377ec681f3Smrg   case aco_opcode::v_lshlrev_b32:
16387ec681f3Smrg   case aco_opcode::v_bcnt_u32_b32:
16397ec681f3Smrg   case aco_opcode::v_and_b32:
16407ec681f3Smrg   case aco_opcode::v_xor_b32:
16417ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
16427ec681f3Smrg      break;
16437ec681f3Smrg   case aco_opcode::v_min_f32:
16447ec681f3Smrg   case aco_opcode::v_min_f16:
16457ec681f3Smrg   case aco_opcode::v_min_u32:
16467ec681f3Smrg   case aco_opcode::v_min_i32:
16477ec681f3Smrg   case aco_opcode::v_min_u16:
16487ec681f3Smrg   case aco_opcode::v_min_i16:
16497ec681f3Smrg   case aco_opcode::v_max_f32:
16507ec681f3Smrg   case aco_opcode::v_max_f16:
16517ec681f3Smrg   case aco_opcode::v_max_u32:
16527ec681f3Smrg   case aco_opcode::v_max_i32:
16537ec681f3Smrg   case aco_opcode::v_max_u16:
16547ec681f3Smrg   case aco_opcode::v_max_i16:
16557ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_minmax(instr.get());
16567ec681f3Smrg      break;
16577ec681f3Smrg   case aco_opcode::s_cselect_b64:
16587ec681f3Smrg   case aco_opcode::s_cselect_b32:
16597ec681f3Smrg      if (instr->operands[0].constantEquals((unsigned)-1) && instr->operands[1].constantEquals(0)) {
16607ec681f3Smrg         /* Found a cselect that operates on a uniform bool that comes from eg. s_cmp */
16617ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_uniform_bool(instr->operands[2].getTemp());
16627ec681f3Smrg      }
16637ec681f3Smrg      if (instr->operands[2].isTemp() && ctx.info[instr->operands[2].tempId()].is_scc_invert()) {
16647ec681f3Smrg         /* Flip the operands to get rid of the scc_invert instruction */
16657ec681f3Smrg         std::swap(instr->operands[0], instr->operands[1]);
16667ec681f3Smrg         instr->operands[2].setTemp(ctx.info[instr->operands[2].tempId()].temp);
16677ec681f3Smrg      }
16687ec681f3Smrg      break;
16697ec681f3Smrg   case aco_opcode::p_wqm:
16707ec681f3Smrg      if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_scc_invert()) {
16717ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_temp(instr->operands[0].getTemp());
16727ec681f3Smrg      }
16737ec681f3Smrg      break;
16747ec681f3Smrg   case aco_opcode::s_mul_i32:
16757ec681f3Smrg      /* Testing every uint32_t shows that 0x3f800000*n is never a denormal.
16767ec681f3Smrg       * This pattern is created from a uniform nir_op_b2f. */
16777ec681f3Smrg      if (instr->operands[0].constantEquals(0x3f800000u))
16787ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_canonicalized();
16797ec681f3Smrg      break;
16807ec681f3Smrg   case aco_opcode::p_extract: {
16817ec681f3Smrg      if (instr->definitions[0].bytes() == 4) {
16827ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
16837ec681f3Smrg         if (instr->operands[0].regClass() == v1 && parse_insert(instr.get()))
16847ec681f3Smrg            ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
16857ec681f3Smrg      }
16867ec681f3Smrg      break;
16877ec681f3Smrg   }
16887ec681f3Smrg   case aco_opcode::p_insert: {
16897ec681f3Smrg      if (instr->operands[0].bytes() == 4) {
16907ec681f3Smrg         if (instr->operands[0].regClass() == v1)
16917ec681f3Smrg            ctx.info[instr->operands[0].tempId()].set_insert(instr.get());
16927ec681f3Smrg         if (parse_extract(instr.get()))
16937ec681f3Smrg            ctx.info[instr->definitions[0].tempId()].set_extract(instr.get());
16947ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_bitwise(instr.get());
16957ec681f3Smrg      }
16967ec681f3Smrg      break;
16977ec681f3Smrg   }
16987ec681f3Smrg   case aco_opcode::ds_read_u8:
16997ec681f3Smrg   case aco_opcode::ds_read_u8_d16:
17007ec681f3Smrg   case aco_opcode::ds_read_u16:
17017ec681f3Smrg   case aco_opcode::ds_read_u16_d16: {
17027ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get());
17037ec681f3Smrg      break;
17047ec681f3Smrg   }
17057ec681f3Smrg   default: break;
17067ec681f3Smrg   }
17077ec681f3Smrg
17087ec681f3Smrg   /* Don't remove label_extract if we can't apply the extract to
17097ec681f3Smrg    * neg/abs instructions because we'll likely combine it into another valu. */
17107ec681f3Smrg   if (!(ctx.info[instr->definitions[0].tempId()].label & (label_neg | label_abs)))
17117ec681f3Smrg      check_sdwa_extract(ctx, instr);
17127ec681f3Smrg}
17137ec681f3Smrg
17147ec681f3Smrgunsigned
17157ec681f3Smrgoriginal_temp_id(opt_ctx& ctx, Temp tmp)
17167ec681f3Smrg{
17177ec681f3Smrg   if (ctx.info[tmp.id()].is_temp())
17187ec681f3Smrg      return ctx.info[tmp.id()].temp.id();
17197ec681f3Smrg   else
17207ec681f3Smrg      return tmp.id();
17217ec681f3Smrg}
17227ec681f3Smrg
17237ec681f3Smrgvoid
17247ec681f3Smrgdecrease_uses(opt_ctx& ctx, Instruction* instr)
17257ec681f3Smrg{
17267ec681f3Smrg   if (!--ctx.uses[instr->definitions[0].tempId()]) {
17277ec681f3Smrg      for (const Operand& op : instr->operands) {
17287ec681f3Smrg         if (op.isTemp())
17297ec681f3Smrg            ctx.uses[op.tempId()]--;
17307ec681f3Smrg      }
17317ec681f3Smrg   }
17327ec681f3Smrg}
17337ec681f3Smrg
17347ec681f3SmrgInstruction*
17357ec681f3Smrgfollow_operand(opt_ctx& ctx, Operand op, bool ignore_uses = false)
17367ec681f3Smrg{
17377ec681f3Smrg   if (!op.isTemp() || !(ctx.info[op.tempId()].label & instr_usedef_labels))
17387ec681f3Smrg      return nullptr;
17397ec681f3Smrg   if (!ignore_uses && ctx.uses[op.tempId()] > 1)
17407ec681f3Smrg      return nullptr;
17417ec681f3Smrg
17427ec681f3Smrg   Instruction* instr = ctx.info[op.tempId()].instr;
17437ec681f3Smrg
17447ec681f3Smrg   if (instr->definitions.size() == 2) {
17457ec681f3Smrg      assert(instr->definitions[0].isTemp() && instr->definitions[0].tempId() == op.tempId());
17467ec681f3Smrg      if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
17477ec681f3Smrg         return nullptr;
17487ec681f3Smrg   }
17497ec681f3Smrg
17507ec681f3Smrg   return instr;
17517ec681f3Smrg}
17527ec681f3Smrg
17537ec681f3Smrg/* s_or_b64(neq(a, a), neq(b, b)) -> v_cmp_u_f32(a, b)
17547ec681f3Smrg * s_and_b64(eq(a, a), eq(b, b)) -> v_cmp_o_f32(a, b) */
17557ec681f3Smrgbool
17567ec681f3Smrgcombine_ordering_test(opt_ctx& ctx, aco_ptr<Instruction>& instr)
17577ec681f3Smrg{
17587ec681f3Smrg   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
17597ec681f3Smrg      return false;
17607ec681f3Smrg   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
17617ec681f3Smrg      return false;
17627ec681f3Smrg
17637ec681f3Smrg   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
17647ec681f3Smrg
17657ec681f3Smrg   bool neg[2] = {false, false};
17667ec681f3Smrg   bool abs[2] = {false, false};
17677ec681f3Smrg   uint8_t opsel = 0;
17687ec681f3Smrg   Instruction* op_instr[2];
17697ec681f3Smrg   Temp op[2];
17707ec681f3Smrg
17717ec681f3Smrg   unsigned bitsize = 0;
17727ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
17737ec681f3Smrg      op_instr[i] = follow_operand(ctx, instr->operands[i], true);
17747ec681f3Smrg      if (!op_instr[i])
17757ec681f3Smrg         return false;
17767ec681f3Smrg
17777ec681f3Smrg      aco_opcode expected_cmp = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
17787ec681f3Smrg      unsigned op_bitsize = get_cmp_bitsize(op_instr[i]->opcode);
17797ec681f3Smrg
17807ec681f3Smrg      if (get_f32_cmp(op_instr[i]->opcode) != expected_cmp)
17817ec681f3Smrg         return false;
17827ec681f3Smrg      if (bitsize && op_bitsize != bitsize)
17837ec681f3Smrg         return false;
17847ec681f3Smrg      if (!op_instr[i]->operands[0].isTemp() || !op_instr[i]->operands[1].isTemp())
17857ec681f3Smrg         return false;
17867ec681f3Smrg
17877ec681f3Smrg      if (op_instr[i]->isVOP3()) {
17887ec681f3Smrg         VOP3_instruction& vop3 = op_instr[i]->vop3();
17897ec681f3Smrg         if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 ||
17907ec681f3Smrg             vop3.opsel == 2)
17917ec681f3Smrg            return false;
17927ec681f3Smrg         neg[i] = vop3.neg[0];
17937ec681f3Smrg         abs[i] = vop3.abs[0];
17947ec681f3Smrg         opsel |= (vop3.opsel & 1) << i;
17957ec681f3Smrg      } else if (op_instr[i]->isSDWA()) {
17967ec681f3Smrg         return false;
17977ec681f3Smrg      }
17987ec681f3Smrg
17997ec681f3Smrg      Temp op0 = op_instr[i]->operands[0].getTemp();
18007ec681f3Smrg      Temp op1 = op_instr[i]->operands[1].getTemp();
18017ec681f3Smrg      if (original_temp_id(ctx, op0) != original_temp_id(ctx, op1))
18027ec681f3Smrg         return false;
18037ec681f3Smrg
18047ec681f3Smrg      op[i] = op1;
18057ec681f3Smrg      bitsize = op_bitsize;
18067ec681f3Smrg   }
18077ec681f3Smrg
18087ec681f3Smrg   if (op[1].type() == RegType::sgpr)
18097ec681f3Smrg      std::swap(op[0], op[1]);
18107ec681f3Smrg   unsigned num_sgprs = (op[0].type() == RegType::sgpr) + (op[1].type() == RegType::sgpr);
18117ec681f3Smrg   if (num_sgprs > (ctx.program->chip_class >= GFX10 ? 2 : 1))
18127ec681f3Smrg      return false;
18137ec681f3Smrg
18147ec681f3Smrg   ctx.uses[op[0].id()]++;
18157ec681f3Smrg   ctx.uses[op[1].id()]++;
18167ec681f3Smrg   decrease_uses(ctx, op_instr[0]);
18177ec681f3Smrg   decrease_uses(ctx, op_instr[1]);
18187ec681f3Smrg
18197ec681f3Smrg   aco_opcode new_op = aco_opcode::num_opcodes;
18207ec681f3Smrg   switch (bitsize) {
18217ec681f3Smrg   case 16: new_op = is_or ? aco_opcode::v_cmp_u_f16 : aco_opcode::v_cmp_o_f16; break;
18227ec681f3Smrg   case 32: new_op = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32; break;
18237ec681f3Smrg   case 64: new_op = is_or ? aco_opcode::v_cmp_u_f64 : aco_opcode::v_cmp_o_f64; break;
18247ec681f3Smrg   }
18257ec681f3Smrg   Instruction* new_instr;
18267ec681f3Smrg   if (neg[0] || neg[1] || abs[0] || abs[1] || opsel || num_sgprs > 1) {
18277ec681f3Smrg      VOP3_instruction* vop3 =
18287ec681f3Smrg         create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
18297ec681f3Smrg      for (unsigned i = 0; i < 2; i++) {
18307ec681f3Smrg         vop3->neg[i] = neg[i];
18317ec681f3Smrg         vop3->abs[i] = abs[i];
18327ec681f3Smrg      }
18337ec681f3Smrg      vop3->opsel = opsel;
18347ec681f3Smrg      new_instr = static_cast<Instruction*>(vop3);
18357ec681f3Smrg   } else {
18367ec681f3Smrg      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
18377ec681f3Smrg      instr->definitions[0].setHint(vcc);
18387ec681f3Smrg   }
18397ec681f3Smrg   new_instr->operands[0] = Operand(op[0]);
18407ec681f3Smrg   new_instr->operands[1] = Operand(op[1]);
18417ec681f3Smrg   new_instr->definitions[0] = instr->definitions[0];
18427ec681f3Smrg
18437ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].label = 0;
18447ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
18457ec681f3Smrg
18467ec681f3Smrg   instr.reset(new_instr);
18477ec681f3Smrg
18487ec681f3Smrg   return true;
18497ec681f3Smrg}
18507ec681f3Smrg
18517ec681f3Smrg/* s_or_b64(v_cmp_u_f32(a, b), cmp(a, b)) -> get_unordered(cmp)(a, b)
18527ec681f3Smrg * s_and_b64(v_cmp_o_f32(a, b), cmp(a, b)) -> get_ordered(cmp)(a, b) */
18537ec681f3Smrgbool
18547ec681f3Smrgcombine_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
18557ec681f3Smrg{
18567ec681f3Smrg   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
18577ec681f3Smrg      return false;
18587ec681f3Smrg   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
18597ec681f3Smrg      return false;
18607ec681f3Smrg
18617ec681f3Smrg   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
18627ec681f3Smrg   aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_u_f32 : aco_opcode::v_cmp_o_f32;
18637ec681f3Smrg
18647ec681f3Smrg   Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
18657ec681f3Smrg   Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
18667ec681f3Smrg   if (!nan_test || !cmp)
18677ec681f3Smrg      return false;
18687ec681f3Smrg   if (nan_test->isSDWA() || cmp->isSDWA())
18697ec681f3Smrg      return false;
18707ec681f3Smrg
18717ec681f3Smrg   if (get_f32_cmp(cmp->opcode) == expected_nan_test)
18727ec681f3Smrg      std::swap(nan_test, cmp);
18737ec681f3Smrg   else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
18747ec681f3Smrg      return false;
18757ec681f3Smrg
18767ec681f3Smrg   if (!is_cmp(cmp->opcode) || get_cmp_bitsize(cmp->opcode) != get_cmp_bitsize(nan_test->opcode))
18777ec681f3Smrg      return false;
18787ec681f3Smrg
18797ec681f3Smrg   if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
18807ec681f3Smrg      return false;
18817ec681f3Smrg   if (!cmp->operands[0].isTemp() || !cmp->operands[1].isTemp())
18827ec681f3Smrg      return false;
18837ec681f3Smrg
18847ec681f3Smrg   unsigned prop_cmp0 = original_temp_id(ctx, cmp->operands[0].getTemp());
18857ec681f3Smrg   unsigned prop_cmp1 = original_temp_id(ctx, cmp->operands[1].getTemp());
18867ec681f3Smrg   unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
18877ec681f3Smrg   unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
18887ec681f3Smrg   if (prop_cmp0 != prop_nan0 && prop_cmp0 != prop_nan1)
18897ec681f3Smrg      return false;
18907ec681f3Smrg   if (prop_cmp1 != prop_nan0 && prop_cmp1 != prop_nan1)
18917ec681f3Smrg      return false;
18927ec681f3Smrg
18937ec681f3Smrg   ctx.uses[cmp->operands[0].tempId()]++;
18947ec681f3Smrg   ctx.uses[cmp->operands[1].tempId()]++;
18957ec681f3Smrg   decrease_uses(ctx, nan_test);
18967ec681f3Smrg   decrease_uses(ctx, cmp);
18977ec681f3Smrg
18987ec681f3Smrg   aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
18997ec681f3Smrg   Instruction* new_instr;
19007ec681f3Smrg   if (cmp->isVOP3()) {
19017ec681f3Smrg      VOP3_instruction* new_vop3 =
19027ec681f3Smrg         create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
19037ec681f3Smrg      VOP3_instruction& cmp_vop3 = cmp->vop3();
19047ec681f3Smrg      memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
19057ec681f3Smrg      memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
19067ec681f3Smrg      new_vop3->clamp = cmp_vop3.clamp;
19077ec681f3Smrg      new_vop3->omod = cmp_vop3.omod;
19087ec681f3Smrg      new_vop3->opsel = cmp_vop3.opsel;
19097ec681f3Smrg      new_instr = new_vop3;
19107ec681f3Smrg   } else {
19117ec681f3Smrg      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
19127ec681f3Smrg      instr->definitions[0].setHint(vcc);
19137ec681f3Smrg   }
19147ec681f3Smrg   new_instr->operands[0] = cmp->operands[0];
19157ec681f3Smrg   new_instr->operands[1] = cmp->operands[1];
19167ec681f3Smrg   new_instr->definitions[0] = instr->definitions[0];
19177ec681f3Smrg
19187ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].label = 0;
19197ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
19207ec681f3Smrg
19217ec681f3Smrg   instr.reset(new_instr);
19227ec681f3Smrg
19237ec681f3Smrg   return true;
19247ec681f3Smrg}
19257ec681f3Smrg
19267ec681f3Smrgbool
19277ec681f3Smrgis_operand_constant(opt_ctx& ctx, Operand op, unsigned bit_size, uint64_t* value)
19287ec681f3Smrg{
19297ec681f3Smrg   if (op.isConstant()) {
19307ec681f3Smrg      *value = op.constantValue64();
19317ec681f3Smrg      return true;
19327ec681f3Smrg   } else if (op.isTemp()) {
19337ec681f3Smrg      unsigned id = original_temp_id(ctx, op.getTemp());
19347ec681f3Smrg      if (!ctx.info[id].is_constant_or_literal(bit_size))
19357ec681f3Smrg         return false;
19367ec681f3Smrg      *value = get_constant_op(ctx, ctx.info[id], bit_size).constantValue64();
19377ec681f3Smrg      return true;
19387ec681f3Smrg   }
19397ec681f3Smrg   return false;
19407ec681f3Smrg}
19417ec681f3Smrg
/* Returns whether the raw bit pattern "value" encodes a NaN when read as a
 * float of "bit_size" bits, i.e. the exponent field is all ones and the
 * mantissa field is non-zero. Bit sizes 16 and 32 are handled explicitly;
 * every other bit size is treated as a 64-bit float. */
bool
is_constant_nan(uint64_t value, unsigned bit_size)
{
   unsigned mantissa_bits;
   unsigned exponent_bits;
   switch (bit_size) {
   case 16:
      mantissa_bits = 10;
      exponent_bits = 5;
      break;
   case 32:
      mantissa_bits = 23;
      exponent_bits = 8;
      break;
   default:
      mantissa_bits = 52;
      exponent_bits = 11;
      break;
   }

   const uint64_t exponent_mask = (uint64_t(1) << exponent_bits) - 1;
   const uint64_t mantissa_mask = (uint64_t(1) << mantissa_bits) - 1;

   const bool exponent_all_ones = ((value >> mantissa_bits) & exponent_mask) == exponent_mask;
   const bool mantissa_non_zero = (value & mantissa_mask) != 0;
   return exponent_all_ones && mantissa_non_zero;
}
19527ec681f3Smrg
19537ec681f3Smrg/* s_or_b64(v_cmp_neq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_unordered(cmp)(a, b)
19547ec681f3Smrg * s_and_b64(v_cmp_eq_f32(a, a), cmp(a, #b)) and b is not NaN -> get_ordered(cmp)(a, b) */
19557ec681f3Smrgbool
19567ec681f3Smrgcombine_constant_comparison_ordering(opt_ctx& ctx, aco_ptr<Instruction>& instr)
19577ec681f3Smrg{
19587ec681f3Smrg   if (instr->definitions[0].regClass() != ctx.program->lane_mask)
19597ec681f3Smrg      return false;
19607ec681f3Smrg   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
19617ec681f3Smrg      return false;
19627ec681f3Smrg
19637ec681f3Smrg   bool is_or = instr->opcode == aco_opcode::s_or_b64 || instr->opcode == aco_opcode::s_or_b32;
19647ec681f3Smrg
19657ec681f3Smrg   Instruction* nan_test = follow_operand(ctx, instr->operands[0], true);
19667ec681f3Smrg   Instruction* cmp = follow_operand(ctx, instr->operands[1], true);
19677ec681f3Smrg
19687ec681f3Smrg   if (!nan_test || !cmp || nan_test->isSDWA() || cmp->isSDWA())
19697ec681f3Smrg      return false;
19707ec681f3Smrg   if (nan_test->isSDWA() || cmp->isSDWA())
19717ec681f3Smrg      return false;
19727ec681f3Smrg
19737ec681f3Smrg   aco_opcode expected_nan_test = is_or ? aco_opcode::v_cmp_neq_f32 : aco_opcode::v_cmp_eq_f32;
19747ec681f3Smrg   if (get_f32_cmp(cmp->opcode) == expected_nan_test)
19757ec681f3Smrg      std::swap(nan_test, cmp);
19767ec681f3Smrg   else if (get_f32_cmp(nan_test->opcode) != expected_nan_test)
19777ec681f3Smrg      return false;
19787ec681f3Smrg
19797ec681f3Smrg   unsigned bit_size = get_cmp_bitsize(cmp->opcode);
19807ec681f3Smrg   if (!is_cmp(cmp->opcode) || get_cmp_bitsize(nan_test->opcode) != bit_size)
19817ec681f3Smrg      return false;
19827ec681f3Smrg
19837ec681f3Smrg   if (!nan_test->operands[0].isTemp() || !nan_test->operands[1].isTemp())
19847ec681f3Smrg      return false;
19857ec681f3Smrg   if (!cmp->operands[0].isTemp() && !cmp->operands[1].isTemp())
19867ec681f3Smrg      return false;
19877ec681f3Smrg
19887ec681f3Smrg   unsigned prop_nan0 = original_temp_id(ctx, nan_test->operands[0].getTemp());
19897ec681f3Smrg   unsigned prop_nan1 = original_temp_id(ctx, nan_test->operands[1].getTemp());
19907ec681f3Smrg   if (prop_nan0 != prop_nan1)
19917ec681f3Smrg      return false;
19927ec681f3Smrg
19937ec681f3Smrg   if (nan_test->isVOP3()) {
19947ec681f3Smrg      VOP3_instruction& vop3 = nan_test->vop3();
19957ec681f3Smrg      if (vop3.neg[0] != vop3.neg[1] || vop3.abs[0] != vop3.abs[1] || vop3.opsel == 1 ||
19967ec681f3Smrg          vop3.opsel == 2)
19977ec681f3Smrg         return false;
19987ec681f3Smrg   }
19997ec681f3Smrg
20007ec681f3Smrg   int constant_operand = -1;
20017ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
20027ec681f3Smrg      if (cmp->operands[i].isTemp() &&
20037ec681f3Smrg          original_temp_id(ctx, cmp->operands[i].getTemp()) == prop_nan0) {
20047ec681f3Smrg         constant_operand = !i;
20057ec681f3Smrg         break;
20067ec681f3Smrg      }
20077ec681f3Smrg   }
20087ec681f3Smrg   if (constant_operand == -1)
20097ec681f3Smrg      return false;
20107ec681f3Smrg
20117ec681f3Smrg   uint64_t constant_value;
20127ec681f3Smrg   if (!is_operand_constant(ctx, cmp->operands[constant_operand], bit_size, &constant_value))
20137ec681f3Smrg      return false;
20147ec681f3Smrg   if (is_constant_nan(constant_value, bit_size))
20157ec681f3Smrg      return false;
20167ec681f3Smrg
20177ec681f3Smrg   if (cmp->operands[0].isTemp())
20187ec681f3Smrg      ctx.uses[cmp->operands[0].tempId()]++;
20197ec681f3Smrg   if (cmp->operands[1].isTemp())
20207ec681f3Smrg      ctx.uses[cmp->operands[1].tempId()]++;
20217ec681f3Smrg   decrease_uses(ctx, nan_test);
20227ec681f3Smrg   decrease_uses(ctx, cmp);
20237ec681f3Smrg
20247ec681f3Smrg   aco_opcode new_op = is_or ? get_unordered(cmp->opcode) : get_ordered(cmp->opcode);
20257ec681f3Smrg   Instruction* new_instr;
20267ec681f3Smrg   if (cmp->isVOP3()) {
20277ec681f3Smrg      VOP3_instruction* new_vop3 =
20287ec681f3Smrg         create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOPC), 2, 1);
20297ec681f3Smrg      VOP3_instruction& cmp_vop3 = cmp->vop3();
20307ec681f3Smrg      memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
20317ec681f3Smrg      memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
20327ec681f3Smrg      new_vop3->clamp = cmp_vop3.clamp;
20337ec681f3Smrg      new_vop3->omod = cmp_vop3.omod;
20347ec681f3Smrg      new_vop3->opsel = cmp_vop3.opsel;
20357ec681f3Smrg      new_instr = new_vop3;
20367ec681f3Smrg   } else {
20377ec681f3Smrg      new_instr = create_instruction<VOPC_instruction>(new_op, Format::VOPC, 2, 1);
20387ec681f3Smrg      instr->definitions[0].setHint(vcc);
20397ec681f3Smrg   }
20407ec681f3Smrg   new_instr->operands[0] = cmp->operands[0];
20417ec681f3Smrg   new_instr->operands[1] = cmp->operands[1];
20427ec681f3Smrg   new_instr->definitions[0] = instr->definitions[0];
20437ec681f3Smrg
20447ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].label = 0;
20457ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
20467ec681f3Smrg
20477ec681f3Smrg   instr.reset(new_instr);
20487ec681f3Smrg
20497ec681f3Smrg   return true;
20507ec681f3Smrg}
20517ec681f3Smrg
20527ec681f3Smrg/* s_andn2(exec, cmp(a, b)) -> get_inverse(cmp)(a, b) */
20537ec681f3Smrgbool
20547ec681f3Smrgcombine_inverse_comparison(opt_ctx& ctx, aco_ptr<Instruction>& instr)
20557ec681f3Smrg{
20567ec681f3Smrg   if (!instr->operands[0].isFixed() || instr->operands[0].physReg() != exec)
20577ec681f3Smrg      return false;
20587ec681f3Smrg   if (ctx.uses[instr->definitions[1].tempId()])
20597ec681f3Smrg      return false;
20607ec681f3Smrg
20617ec681f3Smrg   Instruction* cmp = follow_operand(ctx, instr->operands[1]);
20627ec681f3Smrg   if (!cmp)
20637ec681f3Smrg      return false;
20647ec681f3Smrg
20657ec681f3Smrg   aco_opcode new_opcode = get_inverse(cmp->opcode);
20667ec681f3Smrg   if (new_opcode == aco_opcode::num_opcodes)
20677ec681f3Smrg      return false;
20687ec681f3Smrg
20697ec681f3Smrg   if (cmp->operands[0].isTemp())
20707ec681f3Smrg      ctx.uses[cmp->operands[0].tempId()]++;
20717ec681f3Smrg   if (cmp->operands[1].isTemp())
20727ec681f3Smrg      ctx.uses[cmp->operands[1].tempId()]++;
20737ec681f3Smrg   decrease_uses(ctx, cmp);
20747ec681f3Smrg
20757ec681f3Smrg   /* This creates a new instruction instead of modifying the existing
20767ec681f3Smrg    * comparison so that the comparison is done with the correct exec mask. */
20777ec681f3Smrg   Instruction* new_instr;
20787ec681f3Smrg   if (cmp->isVOP3()) {
20797ec681f3Smrg      VOP3_instruction* new_vop3 =
20807ec681f3Smrg         create_instruction<VOP3_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);
20817ec681f3Smrg      VOP3_instruction& cmp_vop3 = cmp->vop3();
20827ec681f3Smrg      memcpy(new_vop3->abs, cmp_vop3.abs, sizeof(new_vop3->abs));
20837ec681f3Smrg      memcpy(new_vop3->neg, cmp_vop3.neg, sizeof(new_vop3->neg));
20847ec681f3Smrg      new_vop3->clamp = cmp_vop3.clamp;
20857ec681f3Smrg      new_vop3->omod = cmp_vop3.omod;
20867ec681f3Smrg      new_vop3->opsel = cmp_vop3.opsel;
20877ec681f3Smrg      new_instr = new_vop3;
20887ec681f3Smrg   } else if (cmp->isSDWA()) {
20897ec681f3Smrg      SDWA_instruction* new_sdwa = create_instruction<SDWA_instruction>(
20907ec681f3Smrg         new_opcode, (Format)((uint16_t)Format::SDWA | (uint16_t)Format::VOPC), 2, 1);
20917ec681f3Smrg      SDWA_instruction& cmp_sdwa = cmp->sdwa();
20927ec681f3Smrg      memcpy(new_sdwa->abs, cmp_sdwa.abs, sizeof(new_sdwa->abs));
20937ec681f3Smrg      memcpy(new_sdwa->sel, cmp_sdwa.sel, sizeof(new_sdwa->sel));
20947ec681f3Smrg      memcpy(new_sdwa->neg, cmp_sdwa.neg, sizeof(new_sdwa->neg));
20957ec681f3Smrg      new_sdwa->dst_sel = cmp_sdwa.dst_sel;
20967ec681f3Smrg      new_sdwa->clamp = cmp_sdwa.clamp;
20977ec681f3Smrg      new_sdwa->omod = cmp_sdwa.omod;
20987ec681f3Smrg      new_instr = new_sdwa;
20997ec681f3Smrg   } else if (cmp->isDPP()) {
21007ec681f3Smrg      DPP_instruction* new_dpp = create_instruction<DPP_instruction>(
21017ec681f3Smrg         new_opcode, (Format)((uint16_t)Format::DPP | (uint16_t)Format::VOPC), 2, 1);
21027ec681f3Smrg      DPP_instruction& cmp_dpp = cmp->dpp();
21037ec681f3Smrg      memcpy(new_dpp->abs, cmp_dpp.abs, sizeof(new_dpp->abs));
21047ec681f3Smrg      memcpy(new_dpp->neg, cmp_dpp.neg, sizeof(new_dpp->neg));
21057ec681f3Smrg      new_dpp->dpp_ctrl = cmp_dpp.dpp_ctrl;
21067ec681f3Smrg      new_dpp->row_mask = cmp_dpp.row_mask;
21077ec681f3Smrg      new_dpp->bank_mask = cmp_dpp.bank_mask;
21087ec681f3Smrg      new_dpp->bound_ctrl = cmp_dpp.bound_ctrl;
21097ec681f3Smrg      new_instr = new_dpp;
21107ec681f3Smrg   } else {
21117ec681f3Smrg      new_instr = create_instruction<VOPC_instruction>(new_opcode, Format::VOPC, 2, 1);
21127ec681f3Smrg      instr->definitions[0].setHint(vcc);
21137ec681f3Smrg   }
21147ec681f3Smrg   new_instr->operands[0] = cmp->operands[0];
21157ec681f3Smrg   new_instr->operands[1] = cmp->operands[1];
21167ec681f3Smrg   new_instr->definitions[0] = instr->definitions[0];
21177ec681f3Smrg
21187ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].label = 0;
21197ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].set_vopc(new_instr);
21207ec681f3Smrg
21217ec681f3Smrg   instr.reset(new_instr);
21227ec681f3Smrg
21237ec681f3Smrg   return true;
21247ec681f3Smrg}
21257ec681f3Smrg
21267ec681f3Smrg/* op1(op2(1, 2), 0) if swap = false
21277ec681f3Smrg * op1(0, op2(1, 2)) if swap = true */
21287ec681f3Smrgbool
21297ec681f3Smrgmatch_op3_for_vop3(opt_ctx& ctx, aco_opcode op1, aco_opcode op2, Instruction* op1_instr, bool swap,
21307ec681f3Smrg                   const char* shuffle_str, Operand operands[3], bool neg[3], bool abs[3],
21317ec681f3Smrg                   uint8_t* opsel, bool* op1_clamp, uint8_t* op1_omod, bool* inbetween_neg,
21327ec681f3Smrg                   bool* inbetween_abs, bool* inbetween_opsel, bool* precise)
21337ec681f3Smrg{
21347ec681f3Smrg   /* checks */
21357ec681f3Smrg   if (op1_instr->opcode != op1)
21367ec681f3Smrg      return false;
21377ec681f3Smrg
21387ec681f3Smrg   Instruction* op2_instr = follow_operand(ctx, op1_instr->operands[swap]);
21397ec681f3Smrg   if (!op2_instr || op2_instr->opcode != op2)
21407ec681f3Smrg      return false;
21417ec681f3Smrg   if (fixed_to_exec(op2_instr->operands[0]) || fixed_to_exec(op2_instr->operands[1]))
21427ec681f3Smrg      return false;
21437ec681f3Smrg
21447ec681f3Smrg   VOP3_instruction* op1_vop3 = op1_instr->isVOP3() ? &op1_instr->vop3() : NULL;
21457ec681f3Smrg   VOP3_instruction* op2_vop3 = op2_instr->isVOP3() ? &op2_instr->vop3() : NULL;
21467ec681f3Smrg
21477ec681f3Smrg   if (op1_instr->isSDWA() || op2_instr->isSDWA())
21487ec681f3Smrg      return false;
21497ec681f3Smrg   if (op1_instr->isDPP() || op2_instr->isDPP())
21507ec681f3Smrg      return false;
21517ec681f3Smrg
21527ec681f3Smrg   /* don't support inbetween clamp/omod */
21537ec681f3Smrg   if (op2_vop3 && (op2_vop3->clamp || op2_vop3->omod))
21547ec681f3Smrg      return false;
21557ec681f3Smrg
21567ec681f3Smrg   /* get operands and modifiers and check inbetween modifiers */
21577ec681f3Smrg   *op1_clamp = op1_vop3 ? op1_vop3->clamp : false;
21587ec681f3Smrg   *op1_omod = op1_vop3 ? op1_vop3->omod : 0u;
21597ec681f3Smrg
21607ec681f3Smrg   if (inbetween_neg)
21617ec681f3Smrg      *inbetween_neg = op1_vop3 ? op1_vop3->neg[swap] : false;
21627ec681f3Smrg   else if (op1_vop3 && op1_vop3->neg[swap])
21637ec681f3Smrg      return false;
21647ec681f3Smrg
21657ec681f3Smrg   if (inbetween_abs)
21667ec681f3Smrg      *inbetween_abs = op1_vop3 ? op1_vop3->abs[swap] : false;
21677ec681f3Smrg   else if (op1_vop3 && op1_vop3->abs[swap])
21687ec681f3Smrg      return false;
21697ec681f3Smrg
21707ec681f3Smrg   if (inbetween_opsel)
21717ec681f3Smrg      *inbetween_opsel = op1_vop3 ? op1_vop3->opsel & (1 << (unsigned)swap) : false;
21727ec681f3Smrg   else if (op1_vop3 && op1_vop3->opsel & (1 << (unsigned)swap))
21737ec681f3Smrg      return false;
21747ec681f3Smrg
21757ec681f3Smrg   *precise = op1_instr->definitions[0].isPrecise() || op2_instr->definitions[0].isPrecise();
21767ec681f3Smrg
21777ec681f3Smrg   int shuffle[3];
21787ec681f3Smrg   shuffle[shuffle_str[0] - '0'] = 0;
21797ec681f3Smrg   shuffle[shuffle_str[1] - '0'] = 1;
21807ec681f3Smrg   shuffle[shuffle_str[2] - '0'] = 2;
21817ec681f3Smrg
21827ec681f3Smrg   operands[shuffle[0]] = op1_instr->operands[!swap];
21837ec681f3Smrg   neg[shuffle[0]] = op1_vop3 ? op1_vop3->neg[!swap] : false;
21847ec681f3Smrg   abs[shuffle[0]] = op1_vop3 ? op1_vop3->abs[!swap] : false;
21857ec681f3Smrg   if (op1_vop3 && (op1_vop3->opsel & (1 << (unsigned)!swap)))
21867ec681f3Smrg      *opsel |= 1 << shuffle[0];
21877ec681f3Smrg
21887ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
21897ec681f3Smrg      operands[shuffle[i + 1]] = op2_instr->operands[i];
21907ec681f3Smrg      neg[shuffle[i + 1]] = op2_vop3 ? op2_vop3->neg[i] : false;
21917ec681f3Smrg      abs[shuffle[i + 1]] = op2_vop3 ? op2_vop3->abs[i] : false;
21927ec681f3Smrg      if (op2_vop3 && op2_vop3->opsel & (1 << i))
21937ec681f3Smrg         *opsel |= 1 << shuffle[i + 1];
21947ec681f3Smrg   }
21957ec681f3Smrg
21967ec681f3Smrg   /* check operands */
21977ec681f3Smrg   if (!check_vop3_operands(ctx, 3, operands))
21987ec681f3Smrg      return false;
21997ec681f3Smrg
22007ec681f3Smrg   return true;
22017ec681f3Smrg}
22027ec681f3Smrg
22037ec681f3Smrgvoid
22047ec681f3Smrgcreate_vop3_for_op3(opt_ctx& ctx, aco_opcode opcode, aco_ptr<Instruction>& instr,
22057ec681f3Smrg                    Operand operands[3], bool neg[3], bool abs[3], uint8_t opsel, bool clamp,
22067ec681f3Smrg                    unsigned omod)
22077ec681f3Smrg{
22087ec681f3Smrg   VOP3_instruction* new_instr = create_instruction<VOP3_instruction>(opcode, Format::VOP3, 3, 1);
22097ec681f3Smrg   memcpy(new_instr->abs, abs, sizeof(bool[3]));
22107ec681f3Smrg   memcpy(new_instr->neg, neg, sizeof(bool[3]));
22117ec681f3Smrg   new_instr->clamp = clamp;
22127ec681f3Smrg   new_instr->omod = omod;
22137ec681f3Smrg   new_instr->opsel = opsel;
22147ec681f3Smrg   new_instr->operands[0] = operands[0];
22157ec681f3Smrg   new_instr->operands[1] = operands[1];
22167ec681f3Smrg   new_instr->operands[2] = operands[2];
22177ec681f3Smrg   new_instr->definitions[0] = instr->definitions[0];
22187ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].label = 0;
22197ec681f3Smrg
22207ec681f3Smrg   instr.reset(new_instr);
22217ec681f3Smrg}
22227ec681f3Smrg
22237ec681f3Smrgbool
22247ec681f3Smrgcombine_three_valu_op(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode op2, aco_opcode new_op,
22257ec681f3Smrg                      const char* shuffle, uint8_t ops)
22267ec681f3Smrg{
22277ec681f3Smrg   for (unsigned swap = 0; swap < 2; swap++) {
22287ec681f3Smrg      if (!((1 << swap) & ops))
22297ec681f3Smrg         continue;
22307ec681f3Smrg
22317ec681f3Smrg      Operand operands[3];
22327ec681f3Smrg      bool neg[3], abs[3], clamp, precise;
22337ec681f3Smrg      uint8_t opsel = 0, omod = 0;
22347ec681f3Smrg      if (match_op3_for_vop3(ctx, instr->opcode, op2, instr.get(), swap, shuffle, operands, neg,
22357ec681f3Smrg                             abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
22367ec681f3Smrg         ctx.uses[instr->operands[swap].tempId()]--;
22377ec681f3Smrg         create_vop3_for_op3(ctx, new_op, instr, operands, neg, abs, opsel, clamp, omod);
22387ec681f3Smrg         return true;
22397ec681f3Smrg      }
22407ec681f3Smrg   }
22417ec681f3Smrg   return false;
22427ec681f3Smrg}
22437ec681f3Smrg
22447ec681f3Smrg/* creates v_lshl_add_u32, v_lshl_or_b32 or v_and_or_b32 */
22457ec681f3Smrgbool
22467ec681f3Smrgcombine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
22477ec681f3Smrg{
22487ec681f3Smrg   bool is_or = instr->opcode == aco_opcode::v_or_b32;
22497ec681f3Smrg   aco_opcode new_op_lshl = is_or ? aco_opcode::v_lshl_or_b32 : aco_opcode::v_lshl_add_u32;
22507ec681f3Smrg
22517ec681f3Smrg   if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::s_and_b32, aco_opcode::v_and_or_b32,
22527ec681f3Smrg                                      "120", 1 | 2))
22537ec681f3Smrg      return true;
22547ec681f3Smrg   if (is_or && combine_three_valu_op(ctx, instr, aco_opcode::v_and_b32, aco_opcode::v_and_or_b32,
22557ec681f3Smrg                                      "120", 1 | 2))
22567ec681f3Smrg      return true;
22577ec681f3Smrg   if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, new_op_lshl, "120", 1 | 2))
22587ec681f3Smrg      return true;
22597ec681f3Smrg   if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, new_op_lshl, "210", 1 | 2))
22607ec681f3Smrg      return true;
22617ec681f3Smrg
22627ec681f3Smrg   if (instr->isSDWA() || instr->isDPP())
22637ec681f3Smrg      return false;
22647ec681f3Smrg
22657ec681f3Smrg   /* v_or_b32(p_extract(a, 0, 8/16, 0), b) -> v_and_or_b32(a, 0xff/0xffff, b)
22667ec681f3Smrg    * v_or_b32(p_insert(a, 0, 8/16), b) -> v_and_or_b32(a, 0xff/0xffff, b)
22677ec681f3Smrg    * v_or_b32(p_insert(a, 24/16, 8/16), b) -> v_lshl_or_b32(a, 24/16, b)
22687ec681f3Smrg    * v_add_u32(p_insert(a, 24/16, 8/16), b) -> v_lshl_add_b32(a, 24/16, b)
22697ec681f3Smrg    */
22707ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
22717ec681f3Smrg      Instruction* extins = follow_operand(ctx, instr->operands[i]);
22727ec681f3Smrg      if (!extins)
22737ec681f3Smrg         continue;
22747ec681f3Smrg
22757ec681f3Smrg      aco_opcode op;
22767ec681f3Smrg      Operand operands[3];
22777ec681f3Smrg
22787ec681f3Smrg      if (extins->opcode == aco_opcode::p_insert &&
22797ec681f3Smrg          (extins->operands[1].constantValue() + 1) * extins->operands[2].constantValue() == 32) {
22807ec681f3Smrg         op = new_op_lshl;
22817ec681f3Smrg         operands[1] =
22827ec681f3Smrg            Operand::c32(extins->operands[1].constantValue() * extins->operands[2].constantValue());
22837ec681f3Smrg      } else if (is_or &&
22847ec681f3Smrg                 (extins->opcode == aco_opcode::p_insert ||
22857ec681f3Smrg                  (extins->opcode == aco_opcode::p_extract &&
22867ec681f3Smrg                   extins->operands[3].constantEquals(0))) &&
22877ec681f3Smrg                 extins->operands[1].constantEquals(0)) {
22887ec681f3Smrg         op = aco_opcode::v_and_or_b32;
22897ec681f3Smrg         operands[1] = Operand::c32(extins->operands[2].constantEquals(8) ? 0xffu : 0xffffu);
22907ec681f3Smrg      } else {
22917ec681f3Smrg         continue;
22927ec681f3Smrg      }
22937ec681f3Smrg
22947ec681f3Smrg      operands[0] = extins->operands[0];
22957ec681f3Smrg      operands[2] = instr->operands[!i];
22967ec681f3Smrg
22977ec681f3Smrg      if (!check_vop3_operands(ctx, 3, operands))
22987ec681f3Smrg         continue;
22997ec681f3Smrg
23007ec681f3Smrg      bool neg[3] = {}, abs[3] = {};
23017ec681f3Smrg      uint8_t opsel = 0, omod = 0;
23027ec681f3Smrg      bool clamp = false;
23037ec681f3Smrg      if (instr->isVOP3())
23047ec681f3Smrg         clamp = instr->vop3().clamp;
23057ec681f3Smrg
23067ec681f3Smrg      ctx.uses[instr->operands[i].tempId()]--;
23077ec681f3Smrg      create_vop3_for_op3(ctx, op, instr, operands, neg, abs, opsel, clamp, omod);
23087ec681f3Smrg      return true;
23097ec681f3Smrg   }
23107ec681f3Smrg
23117ec681f3Smrg   return false;
23127ec681f3Smrg}
23137ec681f3Smrg
23147ec681f3Smrgbool
23157ec681f3Smrgcombine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3)
23167ec681f3Smrg{
23177ec681f3Smrg   /* TODO: this can handle SDWA min/max instructions by using opsel */
23187ec681f3Smrg   if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "012", 1 | 2))
23197ec681f3Smrg      return true;
23207ec681f3Smrg
23217ec681f3Smrg   /* min(-max(a, b), c) -> min3(c, -a, -b) *
23227ec681f3Smrg    * max(-min(a, b), c) -> max3(c, -a, -b) */
23237ec681f3Smrg   for (unsigned swap = 0; swap < 2; swap++) {
23247ec681f3Smrg      Operand operands[3];
23257ec681f3Smrg      bool neg[3], abs[3], clamp, precise;
23267ec681f3Smrg      uint8_t opsel = 0, omod = 0;
23277ec681f3Smrg      bool inbetween_neg;
23287ec681f3Smrg      if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "012", operands, neg,
23297ec681f3Smrg                             abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
23307ec681f3Smrg          inbetween_neg) {
23317ec681f3Smrg         ctx.uses[instr->operands[swap].tempId()]--;
23327ec681f3Smrg         neg[1] = !neg[1];
23337ec681f3Smrg         neg[2] = !neg[2];
23347ec681f3Smrg         create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod);
23357ec681f3Smrg         return true;
23367ec681f3Smrg      }
23377ec681f3Smrg   }
23387ec681f3Smrg   return false;
23397ec681f3Smrg}
23407ec681f3Smrg
23417ec681f3Smrg/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
23427ec681f3Smrg * s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
23437ec681f3Smrg * s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
23447ec681f3Smrg * s_not_b64(s_and_b64(a, b)) -> s_nand_b64(a, b)
23457ec681f3Smrg * s_not_b64(s_or_b64(a, b)) -> s_nor_b64(a, b)
23467ec681f3Smrg * s_not_b64(s_xor_b64(a, b)) -> s_xnor_b64(a, b) */
23477ec681f3Smrgbool
23487ec681f3Smrgcombine_salu_not_bitwise(opt_ctx& ctx, aco_ptr<Instruction>& instr)
23497ec681f3Smrg{
23507ec681f3Smrg   /* checks */
23517ec681f3Smrg   if (!instr->operands[0].isTemp())
23527ec681f3Smrg      return false;
23537ec681f3Smrg   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
23547ec681f3Smrg      return false;
23557ec681f3Smrg
23567ec681f3Smrg   Instruction* op2_instr = follow_operand(ctx, instr->operands[0]);
23577ec681f3Smrg   if (!op2_instr)
23587ec681f3Smrg      return false;
23597ec681f3Smrg   switch (op2_instr->opcode) {
23607ec681f3Smrg   case aco_opcode::s_and_b32:
23617ec681f3Smrg   case aco_opcode::s_or_b32:
23627ec681f3Smrg   case aco_opcode::s_xor_b32:
23637ec681f3Smrg   case aco_opcode::s_and_b64:
23647ec681f3Smrg   case aco_opcode::s_or_b64:
23657ec681f3Smrg   case aco_opcode::s_xor_b64: break;
23667ec681f3Smrg   default: return false;
23677ec681f3Smrg   }
23687ec681f3Smrg
23697ec681f3Smrg   /* create instruction */
23707ec681f3Smrg   std::swap(instr->definitions[0], op2_instr->definitions[0]);
23717ec681f3Smrg   std::swap(instr->definitions[1], op2_instr->definitions[1]);
23727ec681f3Smrg   ctx.uses[instr->operands[0].tempId()]--;
23737ec681f3Smrg   ctx.info[op2_instr->definitions[0].tempId()].label = 0;
23747ec681f3Smrg
23757ec681f3Smrg   switch (op2_instr->opcode) {
23767ec681f3Smrg   case aco_opcode::s_and_b32: op2_instr->opcode = aco_opcode::s_nand_b32; break;
23777ec681f3Smrg   case aco_opcode::s_or_b32: op2_instr->opcode = aco_opcode::s_nor_b32; break;
23787ec681f3Smrg   case aco_opcode::s_xor_b32: op2_instr->opcode = aco_opcode::s_xnor_b32; break;
23797ec681f3Smrg   case aco_opcode::s_and_b64: op2_instr->opcode = aco_opcode::s_nand_b64; break;
23807ec681f3Smrg   case aco_opcode::s_or_b64: op2_instr->opcode = aco_opcode::s_nor_b64; break;
23817ec681f3Smrg   case aco_opcode::s_xor_b64: op2_instr->opcode = aco_opcode::s_xnor_b64; break;
23827ec681f3Smrg   default: break;
23837ec681f3Smrg   }
23847ec681f3Smrg
23857ec681f3Smrg   return true;
23867ec681f3Smrg}
23877ec681f3Smrg
23887ec681f3Smrg/* s_and_b32(a, s_not_b32(b)) -> s_andn2_b32(a, b)
23897ec681f3Smrg * s_or_b32(a, s_not_b32(b)) -> s_orn2_b32(a, b)
23907ec681f3Smrg * s_and_b64(a, s_not_b64(b)) -> s_andn2_b64(a, b)
23917ec681f3Smrg * s_or_b64(a, s_not_b64(b)) -> s_orn2_b64(a, b) */
23927ec681f3Smrgbool
23937ec681f3Smrgcombine_salu_n2(opt_ctx& ctx, aco_ptr<Instruction>& instr)
23947ec681f3Smrg{
23957ec681f3Smrg   if (instr->definitions[0].isTemp() && ctx.info[instr->definitions[0].tempId()].is_uniform_bool())
23967ec681f3Smrg      return false;
23977ec681f3Smrg
23987ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
23997ec681f3Smrg      Instruction* op2_instr = follow_operand(ctx, instr->operands[i]);
24007ec681f3Smrg      if (!op2_instr || (op2_instr->opcode != aco_opcode::s_not_b32 &&
24017ec681f3Smrg                         op2_instr->opcode != aco_opcode::s_not_b64))
24027ec681f3Smrg         continue;
24037ec681f3Smrg      if (ctx.uses[op2_instr->definitions[1].tempId()] || fixed_to_exec(op2_instr->operands[0]))
24047ec681f3Smrg         continue;
24057ec681f3Smrg
24067ec681f3Smrg      if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
24077ec681f3Smrg          instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
24087ec681f3Smrg         continue;
24097ec681f3Smrg
24107ec681f3Smrg      ctx.uses[instr->operands[i].tempId()]--;
24117ec681f3Smrg      instr->operands[0] = instr->operands[!i];
24127ec681f3Smrg      instr->operands[1] = op2_instr->operands[0];
24137ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].label = 0;
24147ec681f3Smrg
24157ec681f3Smrg      switch (instr->opcode) {
24167ec681f3Smrg      case aco_opcode::s_and_b32: instr->opcode = aco_opcode::s_andn2_b32; break;
24177ec681f3Smrg      case aco_opcode::s_or_b32: instr->opcode = aco_opcode::s_orn2_b32; break;
24187ec681f3Smrg      case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_andn2_b64; break;
24197ec681f3Smrg      case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_orn2_b64; break;
24207ec681f3Smrg      default: break;
24217ec681f3Smrg      }
24227ec681f3Smrg
24237ec681f3Smrg      return true;
24247ec681f3Smrg   }
24257ec681f3Smrg   return false;
24267ec681f3Smrg}
24277ec681f3Smrg
24287ec681f3Smrg/* s_add_{i32,u32}(a, s_lshl_b32(b, <n>)) -> s_lshl<n>_add_u32(a, b) */
24297ec681f3Smrgbool
24307ec681f3Smrgcombine_salu_lshl_add(opt_ctx& ctx, aco_ptr<Instruction>& instr)
24317ec681f3Smrg{
24327ec681f3Smrg   if (instr->opcode == aco_opcode::s_add_i32 && ctx.uses[instr->definitions[1].tempId()])
24337ec681f3Smrg      return false;
24347ec681f3Smrg
24357ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
24367ec681f3Smrg      Instruction* op2_instr = follow_operand(ctx, instr->operands[i], true);
24377ec681f3Smrg      if (!op2_instr || op2_instr->opcode != aco_opcode::s_lshl_b32 ||
24387ec681f3Smrg          ctx.uses[op2_instr->definitions[1].tempId()])
24397ec681f3Smrg         continue;
24407ec681f3Smrg      if (!op2_instr->operands[1].isConstant() || fixed_to_exec(op2_instr->operands[0]))
24417ec681f3Smrg         continue;
24427ec681f3Smrg
24437ec681f3Smrg      uint32_t shift = op2_instr->operands[1].constantValue();
24447ec681f3Smrg      if (shift < 1 || shift > 4)
24457ec681f3Smrg         continue;
24467ec681f3Smrg
24477ec681f3Smrg      if (instr->operands[!i].isLiteral() && op2_instr->operands[0].isLiteral() &&
24487ec681f3Smrg          instr->operands[!i].constantValue() != op2_instr->operands[0].constantValue())
24497ec681f3Smrg         continue;
24507ec681f3Smrg
24517ec681f3Smrg      ctx.uses[instr->operands[i].tempId()]--;
24527ec681f3Smrg      instr->operands[1] = instr->operands[!i];
24537ec681f3Smrg      instr->operands[0] = op2_instr->operands[0];
24547ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].label = 0;
24557ec681f3Smrg
24567ec681f3Smrg      instr->opcode = std::array<aco_opcode, 4>{
24577ec681f3Smrg         aco_opcode::s_lshl1_add_u32, aco_opcode::s_lshl2_add_u32, aco_opcode::s_lshl3_add_u32,
24587ec681f3Smrg         aco_opcode::s_lshl4_add_u32}[shift - 1];
24597ec681f3Smrg
24607ec681f3Smrg      return true;
24617ec681f3Smrg   }
24627ec681f3Smrg   return false;
24637ec681f3Smrg}
24647ec681f3Smrg
24657ec681f3Smrgbool
24667ec681f3Smrgcombine_add_sub_b2i(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode new_op, uint8_t ops)
24677ec681f3Smrg{
24687ec681f3Smrg   if (instr->usesModifiers())
24697ec681f3Smrg      return false;
24707ec681f3Smrg
24717ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
24727ec681f3Smrg      if (!((1 << i) & ops))
24737ec681f3Smrg         continue;
24747ec681f3Smrg      if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2i() &&
24757ec681f3Smrg          ctx.uses[instr->operands[i].tempId()] == 1) {
24767ec681f3Smrg
24777ec681f3Smrg         aco_ptr<Instruction> new_instr;
24787ec681f3Smrg         if (instr->operands[!i].isTemp() &&
24797ec681f3Smrg             instr->operands[!i].getTemp().type() == RegType::vgpr) {
24807ec681f3Smrg            new_instr.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 2));
24817ec681f3Smrg         } else if (ctx.program->chip_class >= GFX10 ||
24827ec681f3Smrg                    (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
24837ec681f3Smrg            new_instr.reset(
24847ec681f3Smrg               create_instruction<VOP3_instruction>(new_op, asVOP3(Format::VOP2), 3, 2));
24857ec681f3Smrg         } else {
24867ec681f3Smrg            return false;
24877ec681f3Smrg         }
24887ec681f3Smrg         ctx.uses[instr->operands[i].tempId()]--;
24897ec681f3Smrg         new_instr->definitions[0] = instr->definitions[0];
24907ec681f3Smrg         if (instr->definitions.size() == 2) {
24917ec681f3Smrg            new_instr->definitions[1] = instr->definitions[1];
24927ec681f3Smrg         } else {
24937ec681f3Smrg            new_instr->definitions[1] =
24947ec681f3Smrg               Definition(ctx.program->allocateTmp(ctx.program->lane_mask));
24957ec681f3Smrg            /* Make sure the uses vector is large enough and the number of
24967ec681f3Smrg             * uses properly initialized to 0.
24977ec681f3Smrg             */
24987ec681f3Smrg            ctx.uses.push_back(0);
24997ec681f3Smrg         }
25007ec681f3Smrg         new_instr->definitions[1].setHint(vcc);
25017ec681f3Smrg         new_instr->operands[0] = Operand::zero();
25027ec681f3Smrg         new_instr->operands[1] = instr->operands[!i];
25037ec681f3Smrg         new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
25047ec681f3Smrg         instr = std::move(new_instr);
25057ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].set_add_sub(instr.get());
25067ec681f3Smrg         return true;
25077ec681f3Smrg      }
25087ec681f3Smrg   }
25097ec681f3Smrg
25107ec681f3Smrg   return false;
25117ec681f3Smrg}
25127ec681f3Smrg
25137ec681f3Smrgbool
25147ec681f3Smrgcombine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
25157ec681f3Smrg{
25167ec681f3Smrg   if (instr->usesModifiers())
25177ec681f3Smrg      return false;
25187ec681f3Smrg
25197ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
25207ec681f3Smrg      Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
25217ec681f3Smrg      if (op_instr && op_instr->opcode == aco_opcode::v_bcnt_u32_b32 &&
25227ec681f3Smrg          !op_instr->usesModifiers() && op_instr->operands[0].isTemp() &&
25237ec681f3Smrg          op_instr->operands[0].getTemp().type() == RegType::vgpr &&
25247ec681f3Smrg          op_instr->operands[1].constantEquals(0)) {
25257ec681f3Smrg         aco_ptr<Instruction> new_instr{
25267ec681f3Smrg            create_instruction<VOP3_instruction>(aco_opcode::v_bcnt_u32_b32, Format::VOP3, 2, 1)};
25277ec681f3Smrg         ctx.uses[instr->operands[i].tempId()]--;
25287ec681f3Smrg         new_instr->operands[0] = op_instr->operands[0];
25297ec681f3Smrg         new_instr->operands[1] = instr->operands[!i];
25307ec681f3Smrg         new_instr->definitions[0] = instr->definitions[0];
25317ec681f3Smrg         instr = std::move(new_instr);
25327ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].label = 0;
25337ec681f3Smrg
25347ec681f3Smrg         return true;
25357ec681f3Smrg      }
25367ec681f3Smrg   }
25377ec681f3Smrg
25387ec681f3Smrg   return false;
25397ec681f3Smrg}
25407ec681f3Smrg
25417ec681f3Smrgbool
25427ec681f3Smrgget_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3,
25437ec681f3Smrg                aco_opcode* med3, bool* some_gfx9_only)
25447ec681f3Smrg{
25457ec681f3Smrg   switch (op) {
25467ec681f3Smrg#define MINMAX(type, gfx9)                                                                         \
25477ec681f3Smrg   case aco_opcode::v_min_##type:                                                                  \
25487ec681f3Smrg   case aco_opcode::v_max_##type:                                                                  \
25497ec681f3Smrg   case aco_opcode::v_med3_##type:                                                                 \
25507ec681f3Smrg      *min = aco_opcode::v_min_##type;                                                             \
25517ec681f3Smrg      *max = aco_opcode::v_max_##type;                                                             \
25527ec681f3Smrg      *med3 = aco_opcode::v_med3_##type;                                                           \
25537ec681f3Smrg      *min3 = aco_opcode::v_min3_##type;                                                           \
25547ec681f3Smrg      *max3 = aco_opcode::v_max3_##type;                                                           \
25557ec681f3Smrg      *some_gfx9_only = gfx9;                                                                      \
25567ec681f3Smrg      return true;
25577ec681f3Smrg      MINMAX(f32, false)
25587ec681f3Smrg      MINMAX(u32, false)
25597ec681f3Smrg      MINMAX(i32, false)
25607ec681f3Smrg      MINMAX(f16, true)
25617ec681f3Smrg      MINMAX(u16, true)
25627ec681f3Smrg      MINMAX(i16, true)
25637ec681f3Smrg#undef MINMAX
25647ec681f3Smrg   default: return false;
25657ec681f3Smrg   }
25667ec681f3Smrg}
25677ec681f3Smrg
25687ec681f3Smrg/* when ub > lb:
25697ec681f3Smrg * v_min_{f,u,i}{16,32}(v_max_{f,u,i}{16,32}(a, lb), ub) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
25707ec681f3Smrg * v_max_{f,u,i}{16,32}(v_min_{f,u,i}{16,32}(a, ub), lb) -> v_med3_{f,u,i}{16,32}(a, lb, ub)
25717ec681f3Smrg */
25727ec681f3Smrgbool
25737ec681f3Smrgcombine_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode min, aco_opcode max,
25747ec681f3Smrg              aco_opcode med)
25757ec681f3Smrg{
25767ec681f3Smrg   /* TODO: GLSL's clamp(x, minVal, maxVal) and SPIR-V's
25777ec681f3Smrg    * FClamp(x, minVal, maxVal)/NClamp(x, minVal, maxVal) are undefined if
25787ec681f3Smrg    * minVal > maxVal, which means we can always select it to a v_med3_f32 */
25797ec681f3Smrg   aco_opcode other_op;
25807ec681f3Smrg   if (instr->opcode == min)
25817ec681f3Smrg      other_op = max;
25827ec681f3Smrg   else if (instr->opcode == max)
25837ec681f3Smrg      other_op = min;
25847ec681f3Smrg   else
25857ec681f3Smrg      return false;
25867ec681f3Smrg
25877ec681f3Smrg   for (unsigned swap = 0; swap < 2; swap++) {
25887ec681f3Smrg      Operand operands[3];
25897ec681f3Smrg      bool neg[3], abs[3], clamp, precise;
25907ec681f3Smrg      uint8_t opsel = 0, omod = 0;
25917ec681f3Smrg      if (match_op3_for_vop3(ctx, instr->opcode, other_op, instr.get(), swap, "012", operands, neg,
25927ec681f3Smrg                             abs, &opsel, &clamp, &omod, NULL, NULL, NULL, &precise)) {
25937ec681f3Smrg         /* max(min(src, upper), lower) returns upper if src is NaN, but
25947ec681f3Smrg          * med3(src, lower, upper) returns lower.
25957ec681f3Smrg          */
25967ec681f3Smrg         if (precise && instr->opcode != min)
25977ec681f3Smrg            continue;
25987ec681f3Smrg
25997ec681f3Smrg         int const0_idx = -1, const1_idx = -1;
26007ec681f3Smrg         uint32_t const0 = 0, const1 = 0;
26017ec681f3Smrg         for (int i = 0; i < 3; i++) {
26027ec681f3Smrg            uint32_t val;
26037ec681f3Smrg            if (operands[i].isConstant()) {
26047ec681f3Smrg               val = operands[i].constantValue();
26057ec681f3Smrg            } else if (operands[i].isTemp() &&
26067ec681f3Smrg                       ctx.info[operands[i].tempId()].is_constant_or_literal(32)) {
26077ec681f3Smrg               val = ctx.info[operands[i].tempId()].val;
26087ec681f3Smrg            } else {
26097ec681f3Smrg               continue;
26107ec681f3Smrg            }
26117ec681f3Smrg            if (const0_idx >= 0) {
26127ec681f3Smrg               const1_idx = i;
26137ec681f3Smrg               const1 = val;
26147ec681f3Smrg            } else {
26157ec681f3Smrg               const0_idx = i;
26167ec681f3Smrg               const0 = val;
26177ec681f3Smrg            }
26187ec681f3Smrg         }
26197ec681f3Smrg         if (const0_idx < 0 || const1_idx < 0)
26207ec681f3Smrg            continue;
26217ec681f3Smrg
26227ec681f3Smrg         if (opsel & (1 << const0_idx))
26237ec681f3Smrg            const0 >>= 16;
26247ec681f3Smrg         if (opsel & (1 << const1_idx))
26257ec681f3Smrg            const1 >>= 16;
26267ec681f3Smrg
26277ec681f3Smrg         int lower_idx = const0_idx;
26287ec681f3Smrg         switch (min) {
26297ec681f3Smrg         case aco_opcode::v_min_f32:
26307ec681f3Smrg         case aco_opcode::v_min_f16: {
26317ec681f3Smrg            float const0_f, const1_f;
26327ec681f3Smrg            if (min == aco_opcode::v_min_f32) {
26337ec681f3Smrg               memcpy(&const0_f, &const0, 4);
26347ec681f3Smrg               memcpy(&const1_f, &const1, 4);
26357ec681f3Smrg            } else {
26367ec681f3Smrg               const0_f = _mesa_half_to_float(const0);
26377ec681f3Smrg               const1_f = _mesa_half_to_float(const1);
26387ec681f3Smrg            }
26397ec681f3Smrg            if (abs[const0_idx])
26407ec681f3Smrg               const0_f = fabsf(const0_f);
26417ec681f3Smrg            if (abs[const1_idx])
26427ec681f3Smrg               const1_f = fabsf(const1_f);
26437ec681f3Smrg            if (neg[const0_idx])
26447ec681f3Smrg               const0_f = -const0_f;
26457ec681f3Smrg            if (neg[const1_idx])
26467ec681f3Smrg               const1_f = -const1_f;
26477ec681f3Smrg            lower_idx = const0_f < const1_f ? const0_idx : const1_idx;
26487ec681f3Smrg            break;
26497ec681f3Smrg         }
26507ec681f3Smrg         case aco_opcode::v_min_u32: {
26517ec681f3Smrg            lower_idx = const0 < const1 ? const0_idx : const1_idx;
26527ec681f3Smrg            break;
26537ec681f3Smrg         }
26547ec681f3Smrg         case aco_opcode::v_min_u16: {
26557ec681f3Smrg            lower_idx = (uint16_t)const0 < (uint16_t)const1 ? const0_idx : const1_idx;
26567ec681f3Smrg            break;
26577ec681f3Smrg         }
26587ec681f3Smrg         case aco_opcode::v_min_i32: {
26597ec681f3Smrg            int32_t const0_i =
26607ec681f3Smrg               const0 & 0x80000000u ? -2147483648 + (int32_t)(const0 & 0x7fffffffu) : const0;
26617ec681f3Smrg            int32_t const1_i =
26627ec681f3Smrg               const1 & 0x80000000u ? -2147483648 + (int32_t)(const1 & 0x7fffffffu) : const1;
26637ec681f3Smrg            lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
26647ec681f3Smrg            break;
26657ec681f3Smrg         }
26667ec681f3Smrg         case aco_opcode::v_min_i16: {
26677ec681f3Smrg            int16_t const0_i = const0 & 0x8000u ? -32768 + (int16_t)(const0 & 0x7fffu) : const0;
26687ec681f3Smrg            int16_t const1_i = const1 & 0x8000u ? -32768 + (int16_t)(const1 & 0x7fffu) : const1;
26697ec681f3Smrg            lower_idx = const0_i < const1_i ? const0_idx : const1_idx;
26707ec681f3Smrg            break;
26717ec681f3Smrg         }
26727ec681f3Smrg         default: break;
26737ec681f3Smrg         }
26747ec681f3Smrg         int upper_idx = lower_idx == const0_idx ? const1_idx : const0_idx;
26757ec681f3Smrg
26767ec681f3Smrg         if (instr->opcode == min) {
26777ec681f3Smrg            if (upper_idx != 0 || lower_idx == 0)
26787ec681f3Smrg               return false;
26797ec681f3Smrg         } else {
26807ec681f3Smrg            if (upper_idx == 0 || lower_idx != 0)
26817ec681f3Smrg               return false;
26827ec681f3Smrg         }
26837ec681f3Smrg
26847ec681f3Smrg         ctx.uses[instr->operands[swap].tempId()]--;
26857ec681f3Smrg         create_vop3_for_op3(ctx, med, instr, operands, neg, abs, opsel, clamp, omod);
26867ec681f3Smrg
26877ec681f3Smrg         return true;
26887ec681f3Smrg      }
26897ec681f3Smrg   }
26907ec681f3Smrg
26917ec681f3Smrg   return false;
26927ec681f3Smrg}
26937ec681f3Smrg
26947ec681f3Smrgvoid
26957ec681f3Smrgapply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
26967ec681f3Smrg{
26977ec681f3Smrg   bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
26987ec681f3Smrg                     instr->opcode == aco_opcode::v_lshrrev_b64 ||
26997ec681f3Smrg                     instr->opcode == aco_opcode::v_ashrrev_i64;
27007ec681f3Smrg
27017ec681f3Smrg   /* find candidates and create the set of sgprs already read */
27027ec681f3Smrg   unsigned sgpr_ids[2] = {0, 0};
27037ec681f3Smrg   uint32_t operand_mask = 0;
27047ec681f3Smrg   bool has_literal = false;
27057ec681f3Smrg   for (unsigned i = 0; i < instr->operands.size(); i++) {
27067ec681f3Smrg      if (instr->operands[i].isLiteral())
27077ec681f3Smrg         has_literal = true;
27087ec681f3Smrg      if (!instr->operands[i].isTemp())
27097ec681f3Smrg         continue;
27107ec681f3Smrg      if (instr->operands[i].getTemp().type() == RegType::sgpr) {
27117ec681f3Smrg         if (instr->operands[i].tempId() != sgpr_ids[0])
27127ec681f3Smrg            sgpr_ids[!!sgpr_ids[0]] = instr->operands[i].tempId();
27137ec681f3Smrg      }
27147ec681f3Smrg      ssa_info& info = ctx.info[instr->operands[i].tempId()];
27157ec681f3Smrg      if (is_copy_label(ctx, instr, info) && info.temp.type() == RegType::sgpr)
27167ec681f3Smrg         operand_mask |= 1u << i;
27177ec681f3Smrg      if (info.is_extract() && info.instr->operands[0].getTemp().type() == RegType::sgpr)
27187ec681f3Smrg         operand_mask |= 1u << i;
27197ec681f3Smrg   }
27207ec681f3Smrg   unsigned max_sgprs = 1;
27217ec681f3Smrg   if (ctx.program->chip_class >= GFX10 && !is_shift64)
27227ec681f3Smrg      max_sgprs = 2;
27237ec681f3Smrg   if (has_literal)
27247ec681f3Smrg      max_sgprs--;
27257ec681f3Smrg
27267ec681f3Smrg   unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
27277ec681f3Smrg
27287ec681f3Smrg   /* keep on applying sgprs until there is nothing left to be done */
27297ec681f3Smrg   while (operand_mask) {
27307ec681f3Smrg      uint32_t sgpr_idx = 0;
27317ec681f3Smrg      uint32_t sgpr_info_id = 0;
27327ec681f3Smrg      uint32_t mask = operand_mask;
27337ec681f3Smrg      /* choose a sgpr */
27347ec681f3Smrg      while (mask) {
27357ec681f3Smrg         unsigned i = u_bit_scan(&mask);
27367ec681f3Smrg         uint16_t uses = ctx.uses[instr->operands[i].tempId()];
27377ec681f3Smrg         if (sgpr_info_id == 0 || uses < ctx.uses[sgpr_info_id]) {
27387ec681f3Smrg            sgpr_idx = i;
27397ec681f3Smrg            sgpr_info_id = instr->operands[i].tempId();
27407ec681f3Smrg         }
27417ec681f3Smrg      }
27427ec681f3Smrg      operand_mask &= ~(1u << sgpr_idx);
27437ec681f3Smrg
27447ec681f3Smrg      ssa_info& info = ctx.info[sgpr_info_id];
27457ec681f3Smrg
27467ec681f3Smrg      /* Applying two sgprs require making it VOP3, so don't do it unless it's
27477ec681f3Smrg       * definitively beneficial.
27487ec681f3Smrg       * TODO: this is too conservative because later the use count could be reduced to 1 */
27497ec681f3Smrg      if (!info.is_extract() && num_sgprs && ctx.uses[sgpr_info_id] > 1 && !instr->isVOP3() &&
27507ec681f3Smrg          !instr->isSDWA() && instr->format != Format::VOP3P)
27517ec681f3Smrg         break;
27527ec681f3Smrg
27537ec681f3Smrg      Temp sgpr = info.is_extract() ? info.instr->operands[0].getTemp() : info.temp;
27547ec681f3Smrg      bool new_sgpr = sgpr.id() != sgpr_ids[0] && sgpr.id() != sgpr_ids[1];
27557ec681f3Smrg      if (new_sgpr && num_sgprs >= max_sgprs)
27567ec681f3Smrg         continue;
27577ec681f3Smrg
27587ec681f3Smrg      if (sgpr_idx == 0)
27597ec681f3Smrg         instr->format = withoutDPP(instr->format);
27607ec681f3Smrg
27617ec681f3Smrg      if (sgpr_idx == 0 || instr->isVOP3() || instr->isSDWA() || instr->isVOP3P() ||
27627ec681f3Smrg          info.is_extract()) {
27637ec681f3Smrg         /* can_apply_extract() checks SGPR encoding restrictions */
27647ec681f3Smrg         if (info.is_extract() && can_apply_extract(ctx, instr, sgpr_idx, info))
27657ec681f3Smrg            apply_extract(ctx, instr, sgpr_idx, info);
27667ec681f3Smrg         else if (info.is_extract())
27677ec681f3Smrg            continue;
27687ec681f3Smrg         instr->operands[sgpr_idx] = Operand(sgpr);
27697ec681f3Smrg      } else if (can_swap_operands(instr, &instr->opcode)) {
27707ec681f3Smrg         instr->operands[sgpr_idx] = instr->operands[0];
27717ec681f3Smrg         instr->operands[0] = Operand(sgpr);
27727ec681f3Smrg         /* swap bits using a 4-entry LUT */
27737ec681f3Smrg         uint32_t swapped = (0x3120 >> (operand_mask & 0x3)) & 0xf;
27747ec681f3Smrg         operand_mask = (operand_mask & ~0x3) | swapped;
27757ec681f3Smrg      } else if (can_use_VOP3(ctx, instr) && !info.is_extract()) {
27767ec681f3Smrg         to_VOP3(ctx, instr);
27777ec681f3Smrg         instr->operands[sgpr_idx] = Operand(sgpr);
27787ec681f3Smrg      } else {
27797ec681f3Smrg         continue;
27807ec681f3Smrg      }
27817ec681f3Smrg
27827ec681f3Smrg      if (new_sgpr)
27837ec681f3Smrg         sgpr_ids[num_sgprs++] = sgpr.id();
27847ec681f3Smrg      ctx.uses[sgpr_info_id]--;
27857ec681f3Smrg      ctx.uses[sgpr.id()]++;
27867ec681f3Smrg
27877ec681f3Smrg      /* TODO: handle when it's a VGPR */
27887ec681f3Smrg      if ((ctx.info[sgpr.id()].label & (label_extract | label_temp)) &&
27897ec681f3Smrg          ctx.info[sgpr.id()].temp.type() == RegType::sgpr)
27907ec681f3Smrg         operand_mask |= 1u << sgpr_idx;
27917ec681f3Smrg   }
27927ec681f3Smrg}
27937ec681f3Smrg
27947ec681f3Smrgtemplate <typename T>
27957ec681f3Smrgbool
27967ec681f3Smrgapply_omod_clamp_helper(opt_ctx& ctx, T* instr, ssa_info& def_info)
27977ec681f3Smrg{
27987ec681f3Smrg   if (!def_info.is_clamp() && (instr->clamp || instr->omod))
27997ec681f3Smrg      return false;
28007ec681f3Smrg
28017ec681f3Smrg   if (def_info.is_omod2())
28027ec681f3Smrg      instr->omod = 1;
28037ec681f3Smrg   else if (def_info.is_omod4())
28047ec681f3Smrg      instr->omod = 2;
28057ec681f3Smrg   else if (def_info.is_omod5())
28067ec681f3Smrg      instr->omod = 3;
28077ec681f3Smrg   else if (def_info.is_clamp())
28087ec681f3Smrg      instr->clamp = true;
28097ec681f3Smrg
28107ec681f3Smrg   return true;
28117ec681f3Smrg}
28127ec681f3Smrg
28137ec681f3Smrg/* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
28147ec681f3Smrgbool
28157ec681f3Smrgapply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
28167ec681f3Smrg{
28177ec681f3Smrg   if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1 ||
28187ec681f3Smrg       !instr_info.can_use_output_modifiers[(int)instr->opcode])
28197ec681f3Smrg      return false;
28207ec681f3Smrg
28217ec681f3Smrg   bool can_vop3 = can_use_VOP3(ctx, instr);
28227ec681f3Smrg   if (!instr->isSDWA() && !can_vop3)
28237ec681f3Smrg      return false;
28247ec681f3Smrg
28257ec681f3Smrg   /* omod flushes -0 to +0 and has no effect if denormals are enabled */
28267ec681f3Smrg   bool can_use_omod = (can_vop3 || ctx.program->chip_class >= GFX9); /* SDWA omod is GFX9+ */
28277ec681f3Smrg   if (instr->definitions[0].bytes() == 4)
28287ec681f3Smrg      can_use_omod =
28297ec681f3Smrg         can_use_omod && ctx.fp_mode.denorm32 == 0 && !ctx.fp_mode.preserve_signed_zero_inf_nan32;
28307ec681f3Smrg   else
28317ec681f3Smrg      can_use_omod = can_use_omod && ctx.fp_mode.denorm16_64 == 0 &&
28327ec681f3Smrg                     !ctx.fp_mode.preserve_signed_zero_inf_nan16_64;
28337ec681f3Smrg
28347ec681f3Smrg   ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
28357ec681f3Smrg
28367ec681f3Smrg   uint64_t omod_labels = label_omod2 | label_omod4 | label_omod5;
28377ec681f3Smrg   if (!def_info.is_clamp() && !(can_use_omod && (def_info.label & omod_labels)))
28387ec681f3Smrg      return false;
28397ec681f3Smrg   /* if the omod/clamp instruction is dead, then the single user of this
28407ec681f3Smrg    * instruction is a different instruction */
28417ec681f3Smrg   if (!ctx.uses[def_info.instr->definitions[0].tempId()])
28427ec681f3Smrg      return false;
28437ec681f3Smrg
28447ec681f3Smrg   /* MADs/FMAs are created later, so we don't have to update the original add */
28457ec681f3Smrg   assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
28467ec681f3Smrg
28477ec681f3Smrg   if (instr->isSDWA()) {
28487ec681f3Smrg      if (!apply_omod_clamp_helper(ctx, &instr->sdwa(), def_info))
28497ec681f3Smrg         return false;
28507ec681f3Smrg   } else {
28517ec681f3Smrg      to_VOP3(ctx, instr);
28527ec681f3Smrg      if (!apply_omod_clamp_helper(ctx, &instr->vop3(), def_info))
28537ec681f3Smrg         return false;
28547ec681f3Smrg   }
28557ec681f3Smrg
28567ec681f3Smrg   instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
28577ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].label &= label_clamp | label_insert;
28587ec681f3Smrg   ctx.uses[def_info.instr->definitions[0].tempId()]--;
28597ec681f3Smrg
28607ec681f3Smrg   return true;
28617ec681f3Smrg}
28627ec681f3Smrg
28637ec681f3Smrg/* Combine an p_insert (or p_extract, in some cases) instruction with instr.
28647ec681f3Smrg * p_insert(instr(...)) -> instr_insert().
28657ec681f3Smrg */
28667ec681f3Smrgbool
28677ec681f3Smrgapply_insert(opt_ctx& ctx, aco_ptr<Instruction>& instr)
28687ec681f3Smrg{
28697ec681f3Smrg   if (instr->definitions.empty() || ctx.uses[instr->definitions[0].tempId()] != 1)
28707ec681f3Smrg      return false;
28717ec681f3Smrg
28727ec681f3Smrg   ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
28737ec681f3Smrg   if (!def_info.is_insert())
28747ec681f3Smrg      return false;
28757ec681f3Smrg   /* if the insert instruction is dead, then the single user of this
28767ec681f3Smrg    * instruction is a different instruction */
28777ec681f3Smrg   if (!ctx.uses[def_info.instr->definitions[0].tempId()])
28787ec681f3Smrg      return false;
28797ec681f3Smrg
28807ec681f3Smrg   /* MADs/FMAs are created later, so we don't have to update the original add */
28817ec681f3Smrg   assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
28827ec681f3Smrg
28837ec681f3Smrg   SubdwordSel sel = parse_insert(def_info.instr);
28847ec681f3Smrg   assert(sel);
28857ec681f3Smrg
28867ec681f3Smrg   if (instr->isVOP3() && sel.size() == 2 && !sel.sign_extend() &&
28877ec681f3Smrg       can_use_opsel(ctx.program->chip_class, instr->opcode, 3, sel.offset())) {
28887ec681f3Smrg      if (instr->vop3().opsel & (1 << 3))
28897ec681f3Smrg         return false;
28907ec681f3Smrg      if (sel.offset())
28917ec681f3Smrg         instr->vop3().opsel |= 1 << 3;
28927ec681f3Smrg   } else {
28937ec681f3Smrg      if (!can_use_SDWA(ctx.program->chip_class, instr, true))
28947ec681f3Smrg         return false;
28957ec681f3Smrg
28967ec681f3Smrg      to_SDWA(ctx, instr);
28977ec681f3Smrg      if (instr->sdwa().dst_sel.size() != 4)
28987ec681f3Smrg         return false;
28997ec681f3Smrg      static_cast<SDWA_instruction*>(instr.get())->dst_sel = sel;
29007ec681f3Smrg   }
29017ec681f3Smrg
29027ec681f3Smrg   instr->definitions[0].swapTemp(def_info.instr->definitions[0]);
29037ec681f3Smrg   ctx.info[instr->definitions[0].tempId()].label = 0;
29047ec681f3Smrg   ctx.uses[def_info.instr->definitions[0].tempId()]--;
29057ec681f3Smrg
29067ec681f3Smrg   return true;
29077ec681f3Smrg}
29087ec681f3Smrg
29097ec681f3Smrg/* Remove superfluous extract after ds_read like so:
29107ec681f3Smrg * p_extract(ds_read_uN(), 0, N, 0) -> ds_read_uN()
29117ec681f3Smrg */
29127ec681f3Smrgbool
29137ec681f3Smrgapply_ds_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract)
29147ec681f3Smrg{
29157ec681f3Smrg   /* Check if p_extract has a usedef operand and is the only user. */
29167ec681f3Smrg   if (!ctx.info[extract->operands[0].tempId()].is_usedef() ||
29177ec681f3Smrg       ctx.uses[extract->operands[0].tempId()] > 1)
29187ec681f3Smrg      return false;
29197ec681f3Smrg
29207ec681f3Smrg   /* Check if the usedef is a DS instruction. */
29217ec681f3Smrg   Instruction* ds = ctx.info[extract->operands[0].tempId()].instr;
29227ec681f3Smrg   if (ds->format != Format::DS)
29237ec681f3Smrg      return false;
29247ec681f3Smrg
29257ec681f3Smrg   unsigned extract_idx = extract->operands[1].constantValue();
29267ec681f3Smrg   unsigned bits_extracted = extract->operands[2].constantValue();
29277ec681f3Smrg   unsigned sign_ext = extract->operands[3].constantValue();
29287ec681f3Smrg   unsigned dst_bitsize = extract->definitions[0].bytes() * 8u;
29297ec681f3Smrg
29307ec681f3Smrg   /* TODO: These are doable, but probably don't occour too often. */
29317ec681f3Smrg   if (extract_idx || sign_ext || dst_bitsize != 32)
29327ec681f3Smrg      return false;
29337ec681f3Smrg
29347ec681f3Smrg   unsigned bits_loaded = 0;
29357ec681f3Smrg   if (ds->opcode == aco_opcode::ds_read_u8 || ds->opcode == aco_opcode::ds_read_u8_d16)
29367ec681f3Smrg      bits_loaded = 8;
29377ec681f3Smrg   else if (ds->opcode == aco_opcode::ds_read_u16 || ds->opcode == aco_opcode::ds_read_u16_d16)
29387ec681f3Smrg      bits_loaded = 16;
29397ec681f3Smrg   else
29407ec681f3Smrg      return false;
29417ec681f3Smrg
29427ec681f3Smrg   /* Shrink the DS load if the extracted bit size is smaller. */
29437ec681f3Smrg   bits_loaded = MIN2(bits_loaded, bits_extracted);
29447ec681f3Smrg
29457ec681f3Smrg   /* Change the DS opcode so it writes the full register. */
29467ec681f3Smrg   if (bits_loaded == 8)
29477ec681f3Smrg      ds->opcode = aco_opcode::ds_read_u8;
29487ec681f3Smrg   else if (bits_loaded == 16)
29497ec681f3Smrg      ds->opcode = aco_opcode::ds_read_u16;
29507ec681f3Smrg   else
29517ec681f3Smrg      unreachable("Forgot to add DS opcode above.");
29527ec681f3Smrg
29537ec681f3Smrg   /* The DS now produces the exact same thing as the extract, remove the extract. */
29547ec681f3Smrg   std::swap(ds->definitions[0], extract->definitions[0]);
29557ec681f3Smrg   ctx.uses[extract->definitions[0].tempId()] = 0;
29567ec681f3Smrg   ctx.info[ds->definitions[0].tempId()].label = 0;
29577ec681f3Smrg   return true;
29587ec681f3Smrg}
29597ec681f3Smrg
29607ec681f3Smrg/* v_and(a, v_subbrev_co(0, 0, vcc)) -> v_cndmask(0, a, vcc) */
29617ec681f3Smrgbool
29627ec681f3Smrgcombine_and_subbrev(opt_ctx& ctx, aco_ptr<Instruction>& instr)
29637ec681f3Smrg{
29647ec681f3Smrg   if (instr->usesModifiers())
29657ec681f3Smrg      return false;
29667ec681f3Smrg
29677ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
29687ec681f3Smrg      Instruction* op_instr = follow_operand(ctx, instr->operands[i], true);
29697ec681f3Smrg      if (op_instr && op_instr->opcode == aco_opcode::v_subbrev_co_u32 &&
29707ec681f3Smrg          op_instr->operands[0].constantEquals(0) && op_instr->operands[1].constantEquals(0) &&
29717ec681f3Smrg          !op_instr->usesModifiers()) {
29727ec681f3Smrg
29737ec681f3Smrg         aco_ptr<Instruction> new_instr;
29747ec681f3Smrg         if (instr->operands[!i].isTemp() &&
29757ec681f3Smrg             instr->operands[!i].getTemp().type() == RegType::vgpr) {
29767ec681f3Smrg            new_instr.reset(
29777ec681f3Smrg               create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1));
29787ec681f3Smrg         } else if (ctx.program->chip_class >= GFX10 ||
29797ec681f3Smrg                    (instr->operands[!i].isConstant() && !instr->operands[!i].isLiteral())) {
29807ec681f3Smrg            new_instr.reset(create_instruction<VOP3_instruction>(aco_opcode::v_cndmask_b32,
29817ec681f3Smrg                                                                 asVOP3(Format::VOP2), 3, 1));
29827ec681f3Smrg         } else {
29837ec681f3Smrg            return false;
29847ec681f3Smrg         }
29857ec681f3Smrg
29867ec681f3Smrg         ctx.uses[instr->operands[i].tempId()]--;
29877ec681f3Smrg         if (ctx.uses[instr->operands[i].tempId()])
29887ec681f3Smrg            ctx.uses[op_instr->operands[2].tempId()]++;
29897ec681f3Smrg
29907ec681f3Smrg         new_instr->operands[0] = Operand::zero();
29917ec681f3Smrg         new_instr->operands[1] = instr->operands[!i];
29927ec681f3Smrg         new_instr->operands[2] = Operand(op_instr->operands[2]);
29937ec681f3Smrg         new_instr->definitions[0] = instr->definitions[0];
29947ec681f3Smrg         instr = std::move(new_instr);
29957ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].label = 0;
29967ec681f3Smrg         return true;
29977ec681f3Smrg      }
29987ec681f3Smrg   }
29997ec681f3Smrg
30007ec681f3Smrg   return false;
30017ec681f3Smrg}
30027ec681f3Smrg
30037ec681f3Smrg/* v_add_co(c, s_lshl(a, b)) -> v_mad_u32_u24(a, 1<<b, c)
30047ec681f3Smrg * v_add_co(c, v_lshlrev(a, b)) -> v_mad_u32_u24(b, 1<<a, c)
30057ec681f3Smrg * v_sub(c, s_lshl(a, b)) -> v_mad_i32_i24(a, -(1<<b), c)
30067ec681f3Smrg * v_sub(c, v_lshlrev(a, b)) -> v_mad_i32_i24(b, -(1<<a), c)
30077ec681f3Smrg */
30087ec681f3Smrgbool
30097ec681f3Smrgcombine_add_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr, bool is_sub)
30107ec681f3Smrg{
30117ec681f3Smrg   if (instr->usesModifiers())
30127ec681f3Smrg      return false;
30137ec681f3Smrg
30147ec681f3Smrg   /* Substractions: start at operand 1 to avoid mixup such as
30157ec681f3Smrg    * turning v_sub(v_lshlrev(a, b), c) into v_mad_i32_i24(b, -(1<<a), c)
30167ec681f3Smrg    */
30177ec681f3Smrg   unsigned start_op_idx = is_sub ? 1 : 0;
30187ec681f3Smrg
30197ec681f3Smrg   /* Don't allow 24-bit operands on subtraction because
30207ec681f3Smrg    * v_mad_i32_i24 applies a sign extension.
30217ec681f3Smrg    */
30227ec681f3Smrg   bool allow_24bit = !is_sub;
30237ec681f3Smrg
30247ec681f3Smrg   for (unsigned i = start_op_idx; i < 2; i++) {
30257ec681f3Smrg      Instruction* op_instr = follow_operand(ctx, instr->operands[i]);
30267ec681f3Smrg      if (!op_instr)
30277ec681f3Smrg         continue;
30287ec681f3Smrg
30297ec681f3Smrg      if (op_instr->opcode != aco_opcode::s_lshl_b32 &&
30307ec681f3Smrg          op_instr->opcode != aco_opcode::v_lshlrev_b32)
30317ec681f3Smrg         continue;
30327ec681f3Smrg
30337ec681f3Smrg      int shift_op_idx = op_instr->opcode == aco_opcode::s_lshl_b32 ? 1 : 0;
30347ec681f3Smrg
30357ec681f3Smrg      if (op_instr->operands[shift_op_idx].isConstant() &&
30367ec681f3Smrg          ((allow_24bit && op_instr->operands[!shift_op_idx].is24bit()) ||
30377ec681f3Smrg           op_instr->operands[!shift_op_idx].is16bit())) {
30387ec681f3Smrg         uint32_t multiplier = 1 << (op_instr->operands[shift_op_idx].constantValue() % 32u);
30397ec681f3Smrg         if (is_sub)
30407ec681f3Smrg            multiplier = -multiplier;
30417ec681f3Smrg         if (is_sub ? (multiplier < 0xff800000) : (multiplier > 0xffffff))
30427ec681f3Smrg            continue;
30437ec681f3Smrg
30447ec681f3Smrg         Operand ops[3] = {
30457ec681f3Smrg            op_instr->operands[!shift_op_idx],
30467ec681f3Smrg            Operand::c32(multiplier),
30477ec681f3Smrg            instr->operands[!i],
30487ec681f3Smrg         };
30497ec681f3Smrg         if (!check_vop3_operands(ctx, 3, ops))
30507ec681f3Smrg            return false;
30517ec681f3Smrg
30527ec681f3Smrg         ctx.uses[instr->operands[i].tempId()]--;
30537ec681f3Smrg
30547ec681f3Smrg         aco_opcode mad_op = is_sub ? aco_opcode::v_mad_i32_i24 : aco_opcode::v_mad_u32_u24;
30557ec681f3Smrg         aco_ptr<VOP3_instruction> new_instr{
30567ec681f3Smrg            create_instruction<VOP3_instruction>(mad_op, Format::VOP3, 3, 1)};
30577ec681f3Smrg         for (unsigned op_idx = 0; op_idx < 3; ++op_idx)
30587ec681f3Smrg            new_instr->operands[op_idx] = ops[op_idx];
30597ec681f3Smrg         new_instr->definitions[0] = instr->definitions[0];
30607ec681f3Smrg         instr = std::move(new_instr);
30617ec681f3Smrg         ctx.info[instr->definitions[0].tempId()].label = 0;
30627ec681f3Smrg         return true;
30637ec681f3Smrg      }
30647ec681f3Smrg   }
30657ec681f3Smrg
30667ec681f3Smrg   return false;
30677ec681f3Smrg}
30687ec681f3Smrg
30697ec681f3Smrgvoid
30707ec681f3Smrgpropagate_swizzles(VOP3P_instruction* instr, uint8_t opsel_lo, uint8_t opsel_hi)
30717ec681f3Smrg{
30727ec681f3Smrg   /* propagate swizzles which apply to a result down to the instruction's operands:
30737ec681f3Smrg    * result = a.xy + b.xx -> result.yx = a.yx + b.xx */
30747ec681f3Smrg   assert((opsel_lo & 1) == opsel_lo);
30757ec681f3Smrg   assert((opsel_hi & 1) == opsel_hi);
30767ec681f3Smrg   uint8_t tmp_lo = instr->opsel_lo;
30777ec681f3Smrg   uint8_t tmp_hi = instr->opsel_hi;
30787ec681f3Smrg   bool neg_lo[3] = {instr->neg_lo[0], instr->neg_lo[1], instr->neg_lo[2]};
30797ec681f3Smrg   bool neg_hi[3] = {instr->neg_hi[0], instr->neg_hi[1], instr->neg_hi[2]};
30807ec681f3Smrg   if (opsel_lo == 1) {
30817ec681f3Smrg      instr->opsel_lo = tmp_hi;
30827ec681f3Smrg      for (unsigned i = 0; i < 3; i++)
30837ec681f3Smrg         instr->neg_lo[i] = neg_hi[i];
30847ec681f3Smrg   }
30857ec681f3Smrg   if (opsel_hi == 0) {
30867ec681f3Smrg      instr->opsel_hi = tmp_lo;
30877ec681f3Smrg      for (unsigned i = 0; i < 3; i++)
30887ec681f3Smrg         instr->neg_hi[i] = neg_lo[i];
30897ec681f3Smrg   }
30907ec681f3Smrg}
30917ec681f3Smrg
30927ec681f3Smrgvoid
30937ec681f3Smrgcombine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr)
30947ec681f3Smrg{
30957ec681f3Smrg   VOP3P_instruction* vop3p = &instr->vop3p();
30967ec681f3Smrg
30977ec681f3Smrg   /* apply clamp */
30987ec681f3Smrg   if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) &&
30997ec681f3Smrg       vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1) {
31007ec681f3Smrg
31017ec681f3Smrg      ssa_info& info = ctx.info[instr->operands[0].tempId()];
31027ec681f3Smrg      if (info.is_vop3p() && instr_info.can_use_output_modifiers[(int)info.instr->opcode]) {
31037ec681f3Smrg         VOP3P_instruction* candidate = &ctx.info[instr->operands[0].tempId()].instr->vop3p();
31047ec681f3Smrg         candidate->clamp = true;
31057ec681f3Smrg         propagate_swizzles(candidate, vop3p->opsel_lo, vop3p->opsel_hi);
31067ec681f3Smrg         instr->definitions[0].swapTemp(candidate->definitions[0]);
31077ec681f3Smrg         ctx.info[candidate->definitions[0].tempId()].instr = candidate;
31087ec681f3Smrg         ctx.uses[instr->definitions[0].tempId()]--;
31097ec681f3Smrg         return;
31107ec681f3Smrg      }
31117ec681f3Smrg   }
31127ec681f3Smrg
31137ec681f3Smrg   /* check for fneg modifiers */
31147ec681f3Smrg   if (instr_info.can_use_input_modifiers[(int)instr->opcode]) {
31157ec681f3Smrg      /* at this point, we only have 2-operand instructions */
31167ec681f3Smrg      assert(instr->operands.size() == 2);
31177ec681f3Smrg      for (unsigned i = 0; i < 2; i++) {
31187ec681f3Smrg         Operand& op = instr->operands[i];
31197ec681f3Smrg         if (!op.isTemp())
31207ec681f3Smrg            continue;
31217ec681f3Smrg
31227ec681f3Smrg         ssa_info& info = ctx.info[op.tempId()];
31237ec681f3Smrg         if (info.is_vop3p() && info.instr->opcode == aco_opcode::v_pk_mul_f16 &&
31247ec681f3Smrg             info.instr->operands[1].constantEquals(0xBC00)) {
31257ec681f3Smrg            Operand ops[2] = {instr->operands[!i], info.instr->operands[0]};
31267ec681f3Smrg            if (!check_vop3_operands(ctx, 2, ops))
31277ec681f3Smrg               continue;
31287ec681f3Smrg
31297ec681f3Smrg            VOP3P_instruction* fneg = &info.instr->vop3p();
31307ec681f3Smrg            if (fneg->clamp)
31317ec681f3Smrg               continue;
31327ec681f3Smrg            instr->operands[i] = fneg->operands[0];
31337ec681f3Smrg
31347ec681f3Smrg            /* opsel_lo/hi is either 0 or 1:
31357ec681f3Smrg             * if 0 - pick selection from fneg->lo
31367ec681f3Smrg             * if 1 - pick selection from fneg->hi
31377ec681f3Smrg             */
31387ec681f3Smrg            bool opsel_lo = (vop3p->opsel_lo >> i) & 1;
31397ec681f3Smrg            bool opsel_hi = (vop3p->opsel_hi >> i) & 1;
31407ec681f3Smrg            bool neg_lo = true ^ fneg->neg_lo[0] ^ fneg->neg_lo[1];
31417ec681f3Smrg            bool neg_hi = true ^ fneg->neg_hi[0] ^ fneg->neg_hi[1];
31427ec681f3Smrg            vop3p->neg_lo[i] ^= opsel_lo ? neg_hi : neg_lo;
31437ec681f3Smrg            vop3p->neg_hi[i] ^= opsel_hi ? neg_hi : neg_lo;
31447ec681f3Smrg            vop3p->opsel_lo ^= ((opsel_lo ? ~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;
31457ec681f3Smrg            vop3p->opsel_hi ^= ((opsel_hi ? ~fneg->opsel_hi : fneg->opsel_lo) & 1) << i;
31467ec681f3Smrg
31477ec681f3Smrg            if (--ctx.uses[fneg->definitions[0].tempId()])
31487ec681f3Smrg               ctx.uses[fneg->operands[0].tempId()]++;
31497ec681f3Smrg         }
31507ec681f3Smrg      }
31517ec681f3Smrg   }
31527ec681f3Smrg
31537ec681f3Smrg   if (instr->opcode == aco_opcode::v_pk_add_f16 || instr->opcode == aco_opcode::v_pk_add_u16) {
31547ec681f3Smrg      bool fadd = instr->opcode == aco_opcode::v_pk_add_f16;
31557ec681f3Smrg      if (fadd && instr->definitions[0].isPrecise())
31567ec681f3Smrg         return;
31577ec681f3Smrg
31587ec681f3Smrg      Instruction* mul_instr = nullptr;
31597ec681f3Smrg      unsigned add_op_idx = 0;
31607ec681f3Smrg      uint8_t opsel_lo = 0, opsel_hi = 0;
31617ec681f3Smrg      uint32_t uses = UINT32_MAX;
31627ec681f3Smrg
31637ec681f3Smrg      /* find the 'best' mul instruction to combine with the add */
31647ec681f3Smrg      for (unsigned i = 0; i < 2; i++) {
31657ec681f3Smrg         if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_vop3p())
31667ec681f3Smrg            continue;
31677ec681f3Smrg         ssa_info& info = ctx.info[instr->operands[i].tempId()];
31687ec681f3Smrg         if (fadd) {
31697ec681f3Smrg            if (info.instr->opcode != aco_opcode::v_pk_mul_f16 ||
31707ec681f3Smrg                info.instr->definitions[0].isPrecise())
31717ec681f3Smrg               continue;
31727ec681f3Smrg         } else {
31737ec681f3Smrg            if (info.instr->opcode != aco_opcode::v_pk_mul_lo_u16)
31747ec681f3Smrg               continue;
31757ec681f3Smrg         }
31767ec681f3Smrg
31777ec681f3Smrg         Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};
31787ec681f3Smrg         if (ctx.uses[instr->operands[i].tempId()] >= uses || !check_vop3_operands(ctx, 3, op))
31797ec681f3Smrg            continue;
31807ec681f3Smrg
31817ec681f3Smrg         /* no clamp allowed between mul and add */
31827ec681f3Smrg         if (info.instr->vop3p().clamp)
31837ec681f3Smrg            continue;
31847ec681f3Smrg
31857ec681f3Smrg         mul_instr = info.instr;
31867ec681f3Smrg         add_op_idx = 1 - i;
31877ec681f3Smrg         opsel_lo = (vop3p->opsel_lo >> i) & 1;
31887ec681f3Smrg         opsel_hi = (vop3p->opsel_hi >> i) & 1;
31897ec681f3Smrg         uses = ctx.uses[instr->operands[i].tempId()];
31907ec681f3Smrg      }
31917ec681f3Smrg
31927ec681f3Smrg      if (!mul_instr)
31937ec681f3Smrg         return;
31947ec681f3Smrg
31957ec681f3Smrg      /* convert to mad */
31967ec681f3Smrg      Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1], instr->operands[add_op_idx]};
31977ec681f3Smrg      ctx.uses[mul_instr->definitions[0].tempId()]--;
31987ec681f3Smrg      if (ctx.uses[mul_instr->definitions[0].tempId()]) {
31997ec681f3Smrg         if (op[0].isTemp())
32007ec681f3Smrg            ctx.uses[op[0].tempId()]++;
32017ec681f3Smrg         if (op[1].isTemp())
32027ec681f3Smrg            ctx.uses[op[1].tempId()]++;
32037ec681f3Smrg      }
32047ec681f3Smrg
32057ec681f3Smrg      /* turn packed mul+add into v_pk_fma_f16 */
32067ec681f3Smrg      assert(mul_instr->isVOP3P());
32077ec681f3Smrg      aco_opcode mad = fadd ? aco_opcode::v_pk_fma_f16 : aco_opcode::v_pk_mad_u16;
32087ec681f3Smrg      aco_ptr<VOP3P_instruction> fma{
32097ec681f3Smrg         create_instruction<VOP3P_instruction>(mad, Format::VOP3P, 3, 1)};
32107ec681f3Smrg      VOP3P_instruction* mul = &mul_instr->vop3p();
32117ec681f3Smrg      for (unsigned i = 0; i < 2; i++) {
32127ec681f3Smrg         fma->operands[i] = op[i];
32137ec681f3Smrg         fma->neg_lo[i] = mul->neg_lo[i];
32147ec681f3Smrg         fma->neg_hi[i] = mul->neg_hi[i];
32157ec681f3Smrg      }
32167ec681f3Smrg      fma->operands[2] = op[2];
32177ec681f3Smrg      fma->clamp = vop3p->clamp;
32187ec681f3Smrg      fma->opsel_lo = mul->opsel_lo;
32197ec681f3Smrg      fma->opsel_hi = mul->opsel_hi;
32207ec681f3Smrg      propagate_swizzles(fma.get(), opsel_lo, opsel_hi);
32217ec681f3Smrg      fma->opsel_lo |= (vop3p->opsel_lo << (2 - add_op_idx)) & 0x4;
32227ec681f3Smrg      fma->opsel_hi |= (vop3p->opsel_hi << (2 - add_op_idx)) & 0x4;
32237ec681f3Smrg      fma->neg_lo[2] = vop3p->neg_lo[add_op_idx];
32247ec681f3Smrg      fma->neg_hi[2] = vop3p->neg_hi[add_op_idx];
32257ec681f3Smrg      fma->neg_lo[1] = fma->neg_lo[1] ^ vop3p->neg_lo[1 - add_op_idx];
32267ec681f3Smrg      fma->neg_hi[1] = fma->neg_hi[1] ^ vop3p->neg_hi[1 - add_op_idx];
32277ec681f3Smrg      fma->definitions[0] = instr->definitions[0];
32287ec681f3Smrg      instr = std::move(fma);
32297ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_vop3p(instr.get());
32307ec681f3Smrg      return;
32317ec681f3Smrg   }
32327ec681f3Smrg}
32337ec681f3Smrg
32347ec681f3Smrg// TODO: we could possibly move the whole label_instruction pass to combine_instruction:
32357ec681f3Smrg// this would mean that we'd have to fix the instruction uses while value propagation
32367ec681f3Smrg
32377ec681f3Smrgvoid
32387ec681f3Smrgcombine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
32397ec681f3Smrg{
32407ec681f3Smrg   if (instr->definitions.empty() || is_dead(ctx.uses, instr.get()))
32417ec681f3Smrg      return;
32427ec681f3Smrg
32437ec681f3Smrg   if (instr->isVALU()) {
32447ec681f3Smrg      /* Apply SDWA. Do this after label_instruction() so it can remove
32457ec681f3Smrg       * label_extract if not all instructions can take SDWA. */
32467ec681f3Smrg      for (unsigned i = 0; i < instr->operands.size(); i++) {
32477ec681f3Smrg         Operand& op = instr->operands[i];
32487ec681f3Smrg         if (!op.isTemp())
32497ec681f3Smrg            continue;
32507ec681f3Smrg         ssa_info& info = ctx.info[op.tempId()];
32517ec681f3Smrg         if (!info.is_extract())
32527ec681f3Smrg            continue;
32537ec681f3Smrg         /* if there are that many uses, there are likely better combinations */
32547ec681f3Smrg         // TODO: delay applying extract to a point where we know better
32557ec681f3Smrg         if (ctx.uses[op.tempId()] > 4) {
32567ec681f3Smrg            info.label &= ~label_extract;
32577ec681f3Smrg            continue;
32587ec681f3Smrg         }
32597ec681f3Smrg         if (info.is_extract() &&
32607ec681f3Smrg             (info.instr->operands[0].getTemp().type() == RegType::vgpr ||
32617ec681f3Smrg              instr->operands[i].getTemp().type() == RegType::sgpr) &&
32627ec681f3Smrg             can_apply_extract(ctx, instr, i, info)) {
32637ec681f3Smrg            apply_extract(ctx, instr, i, info);
32647ec681f3Smrg            ctx.uses[instr->operands[i].tempId()]--;
32657ec681f3Smrg            instr->operands[i].setTemp(info.instr->operands[0].getTemp());
32667ec681f3Smrg         }
32677ec681f3Smrg      }
32687ec681f3Smrg
32697ec681f3Smrg      if (can_apply_sgprs(ctx, instr))
32707ec681f3Smrg         apply_sgprs(ctx, instr);
32717ec681f3Smrg      while (apply_omod_clamp(ctx, instr))
32727ec681f3Smrg         ;
32737ec681f3Smrg      apply_insert(ctx, instr);
32747ec681f3Smrg   }
32757ec681f3Smrg
32767ec681f3Smrg   if (instr->isVOP3P())
32777ec681f3Smrg      return combine_vop3p(ctx, instr);
32787ec681f3Smrg
32797ec681f3Smrg   if (ctx.info[instr->definitions[0].tempId()].is_vcc_hint()) {
32807ec681f3Smrg      instr->definitions[0].setHint(vcc);
32817ec681f3Smrg   }
32827ec681f3Smrg
32837ec681f3Smrg   if (instr->isSDWA() || instr->isDPP())
32847ec681f3Smrg      return;
32857ec681f3Smrg
32867ec681f3Smrg   if (instr->opcode == aco_opcode::p_extract)
32877ec681f3Smrg      apply_ds_extract(ctx, instr);
32887ec681f3Smrg
32897ec681f3Smrg   /* TODO: There are still some peephole optimizations that could be done:
32907ec681f3Smrg    * - abs(a - b) -> s_absdiff_i32
32917ec681f3Smrg    * - various patterns for s_bitcmp{0,1}_b32 and s_bitset{0,1}_b32
32927ec681f3Smrg    * - patterns for v_alignbit_b32 and v_alignbyte_b32
32937ec681f3Smrg    * These aren't probably too interesting though.
32947ec681f3Smrg    * There are also patterns for v_cmp_class_f{16,32,64}. This is difficult but
32957ec681f3Smrg    * probably more useful than the previously mentioned optimizations.
32967ec681f3Smrg    * The various comparison optimizations also currently only work with 32-bit
32977ec681f3Smrg    * floats. */
32987ec681f3Smrg
32997ec681f3Smrg   /* neg(mul(a, b)) -> mul(neg(a), b) */
33007ec681f3Smrg   if (ctx.info[instr->definitions[0].tempId()].is_neg() &&
33017ec681f3Smrg       ctx.uses[instr->operands[1].tempId()] == 1) {
33027ec681f3Smrg      Temp val = ctx.info[instr->definitions[0].tempId()].temp;
33037ec681f3Smrg
33047ec681f3Smrg      if (!ctx.info[val.id()].is_mul())
33057ec681f3Smrg         return;
33067ec681f3Smrg
33077ec681f3Smrg      Instruction* mul_instr = ctx.info[val.id()].instr;
33087ec681f3Smrg
33097ec681f3Smrg      if (mul_instr->operands[0].isLiteral())
33107ec681f3Smrg         return;
33117ec681f3Smrg      if (mul_instr->isVOP3() && mul_instr->vop3().clamp)
33127ec681f3Smrg         return;
33137ec681f3Smrg      if (mul_instr->isSDWA() || mul_instr->isDPP())
33147ec681f3Smrg         return;
33157ec681f3Smrg
33167ec681f3Smrg      /* convert to mul(neg(a), b) */
33177ec681f3Smrg      ctx.uses[mul_instr->definitions[0].tempId()]--;
33187ec681f3Smrg      Definition def = instr->definitions[0];
33197ec681f3Smrg      /* neg(abs(mul(a, b))) -> mul(neg(abs(a)), abs(b)) */
33207ec681f3Smrg      bool is_abs = ctx.info[instr->definitions[0].tempId()].is_abs();
33217ec681f3Smrg      instr.reset(
33227ec681f3Smrg         create_instruction<VOP3_instruction>(mul_instr->opcode, asVOP3(Format::VOP2), 2, 1));
33237ec681f3Smrg      instr->operands[0] = mul_instr->operands[0];
33247ec681f3Smrg      instr->operands[1] = mul_instr->operands[1];
33257ec681f3Smrg      instr->definitions[0] = def;
33267ec681f3Smrg      VOP3_instruction& new_mul = instr->vop3();
33277ec681f3Smrg      if (mul_instr->isVOP3()) {
33287ec681f3Smrg         VOP3_instruction& mul = mul_instr->vop3();
33297ec681f3Smrg         new_mul.neg[0] = mul.neg[0];
33307ec681f3Smrg         new_mul.neg[1] = mul.neg[1];
33317ec681f3Smrg         new_mul.abs[0] = mul.abs[0];
33327ec681f3Smrg         new_mul.abs[1] = mul.abs[1];
33337ec681f3Smrg         new_mul.omod = mul.omod;
33347ec681f3Smrg      }
33357ec681f3Smrg      if (is_abs) {
33367ec681f3Smrg         new_mul.neg[0] = new_mul.neg[1] = false;
33377ec681f3Smrg         new_mul.abs[0] = new_mul.abs[1] = true;
33387ec681f3Smrg      }
33397ec681f3Smrg      new_mul.neg[0] ^= true;
33407ec681f3Smrg      new_mul.clamp = false;
33417ec681f3Smrg
33427ec681f3Smrg      ctx.info[instr->definitions[0].tempId()].set_mul(instr.get());
33437ec681f3Smrg      return;
33447ec681f3Smrg   }
33457ec681f3Smrg
33467ec681f3Smrg   /* combine mul+add -> mad */
33477ec681f3Smrg   bool mad32 = instr->opcode == aco_opcode::v_add_f32 || instr->opcode == aco_opcode::v_sub_f32 ||
33487ec681f3Smrg                instr->opcode == aco_opcode::v_subrev_f32;
33497ec681f3Smrg   bool mad16 = instr->opcode == aco_opcode::v_add_f16 || instr->opcode == aco_opcode::v_sub_f16 ||
33507ec681f3Smrg                instr->opcode == aco_opcode::v_subrev_f16;
33517ec681f3Smrg   bool mad64 = instr->opcode == aco_opcode::v_add_f64;
33527ec681f3Smrg   if (mad16 || mad32 || mad64) {
33537ec681f3Smrg      bool need_fma =
33547ec681f3Smrg         mad32 ? (ctx.fp_mode.denorm32 != 0 || ctx.program->chip_class >= GFX10_3)
33557ec681f3Smrg               : (ctx.fp_mode.denorm16_64 != 0 || ctx.program->chip_class >= GFX10 || mad64);
33567ec681f3Smrg      if (need_fma && instr->definitions[0].isPrecise())
33577ec681f3Smrg         return;
33587ec681f3Smrg      if (need_fma && mad32 && !ctx.program->dev.has_fast_fma32)
33597ec681f3Smrg         return;
33607ec681f3Smrg
33617ec681f3Smrg      Instruction* mul_instr = nullptr;
33627ec681f3Smrg      unsigned add_op_idx = 0;
33637ec681f3Smrg      uint32_t uses = UINT32_MAX;
33647ec681f3Smrg      /* find the 'best' mul instruction to combine with the add */
33657ec681f3Smrg      for (unsigned i = 0; i < 2; i++) {
33667ec681f3Smrg         if (!instr->operands[i].isTemp() || !ctx.info[instr->operands[i].tempId()].is_mul())
33677ec681f3Smrg            continue;
33687ec681f3Smrg         /* check precision requirements */
33697ec681f3Smrg         ssa_info& info = ctx.info[instr->operands[i].tempId()];
33707ec681f3Smrg         if (need_fma && info.instr->definitions[0].isPrecise())
33717ec681f3Smrg            continue;
33727ec681f3Smrg
33737ec681f3Smrg         /* no clamp/omod allowed between mul and add */
33747ec681f3Smrg         if (info.instr->isVOP3() && (info.instr->vop3().clamp || info.instr->vop3().omod))
33757ec681f3Smrg            continue;
33767ec681f3Smrg
33777ec681f3Smrg         Operand op[3] = {info.instr->operands[0], info.instr->operands[1], instr->operands[1 - i]};
33787ec681f3Smrg         if (info.instr->isSDWA() || info.instr->isDPP() || !check_vop3_operands(ctx, 3, op) ||
33797ec681f3Smrg             ctx.uses[instr->operands[i].tempId()] >= uses)
33807ec681f3Smrg            continue;
33817ec681f3Smrg
33827ec681f3Smrg         mul_instr = info.instr;
33837ec681f3Smrg         add_op_idx = 1 - i;
33847ec681f3Smrg         uses = ctx.uses[instr->operands[i].tempId()];
33857ec681f3Smrg      }
33867ec681f3Smrg
33877ec681f3Smrg      if (mul_instr) {
33887ec681f3Smrg         /* turn mul+add into v_mad/v_fma */
33897ec681f3Smrg         Operand op[3] = {mul_instr->operands[0], mul_instr->operands[1],
33907ec681f3Smrg                          instr->operands[add_op_idx]};
33917ec681f3Smrg         ctx.uses[mul_instr->definitions[0].tempId()]--;
33927ec681f3Smrg         if (ctx.uses[mul_instr->definitions[0].tempId()]) {
33937ec681f3Smrg            if (op[0].isTemp())
33947ec681f3Smrg               ctx.uses[op[0].tempId()]++;
33957ec681f3Smrg            if (op[1].isTemp())
33967ec681f3Smrg               ctx.uses[op[1].tempId()]++;
33977ec681f3Smrg         }
33987ec681f3Smrg
33997ec681f3Smrg         bool neg[3] = {false, false, false};
34007ec681f3Smrg         bool abs[3] = {false, false, false};
34017ec681f3Smrg         unsigned omod = 0;
34027ec681f3Smrg         bool clamp = false;
34037ec681f3Smrg
34047ec681f3Smrg         if (mul_instr->isVOP3()) {
34057ec681f3Smrg            VOP3_instruction& vop3 = mul_instr->vop3();
34067ec681f3Smrg            neg[0] = vop3.neg[0];
34077ec681f3Smrg            neg[1] = vop3.neg[1];
34087ec681f3Smrg            abs[0] = vop3.abs[0];
34097ec681f3Smrg            abs[1] = vop3.abs[1];
34107ec681f3Smrg         }
34117ec681f3Smrg
34127ec681f3Smrg         if (instr->isVOP3()) {
34137ec681f3Smrg            VOP3_instruction& vop3 = instr->vop3();
34147ec681f3Smrg            neg[2] = vop3.neg[add_op_idx];
34157ec681f3Smrg            abs[2] = vop3.abs[add_op_idx];
34167ec681f3Smrg            omod = vop3.omod;
34177ec681f3Smrg            clamp = vop3.clamp;
34187ec681f3Smrg            /* abs of the multiplication result */
34197ec681f3Smrg            if (vop3.abs[1 - add_op_idx]) {
34207ec681f3Smrg               neg[0] = false;
34217ec681f3Smrg               neg[1] = false;
34227ec681f3Smrg               abs[0] = true;
34237ec681f3Smrg               abs[1] = true;
34247ec681f3Smrg            }
34257ec681f3Smrg            /* neg of the multiplication result */
34267ec681f3Smrg            neg[1] = neg[1] ^ vop3.neg[1 - add_op_idx];
34277ec681f3Smrg         }
34287ec681f3Smrg         if (instr->opcode == aco_opcode::v_sub_f32 || instr->opcode == aco_opcode::v_sub_f16)
34297ec681f3Smrg            neg[1 + add_op_idx] = neg[1 + add_op_idx] ^ true;
34307ec681f3Smrg         else if (instr->opcode == aco_opcode::v_subrev_f32 ||
34317ec681f3Smrg                  instr->opcode == aco_opcode::v_subrev_f16)
34327ec681f3Smrg            neg[2 - add_op_idx] = neg[2 - add_op_idx] ^ true;
34337ec681f3Smrg
34347ec681f3Smrg         aco_opcode mad_op = need_fma ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
34357ec681f3Smrg         if (mad16)
34367ec681f3Smrg            mad_op = need_fma ? (ctx.program->chip_class == GFX8 ? aco_opcode::v_fma_legacy_f16
34377ec681f3Smrg                                                                 : aco_opcode::v_fma_f16)
34387ec681f3Smrg                              : (ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_f16
34397ec681f3Smrg                                                                 : aco_opcode::v_mad_f16);
34407ec681f3Smrg         if (mad64)
34417ec681f3Smrg            mad_op = aco_opcode::v_fma_f64;
34427ec681f3Smrg
34437ec681f3Smrg         aco_ptr<VOP3_instruction> mad{
34447ec681f3Smrg            create_instruction<VOP3_instruction>(mad_op, Format::VOP3, 3, 1)};
34457ec681f3Smrg         for (unsigned i = 0; i < 3; i++) {
34467ec681f3Smrg            mad->operands[i] = op[i];
34477ec681f3Smrg            mad->neg[i] = neg[i];
34487ec681f3Smrg            mad->abs[i] = abs[i];
34497ec681f3Smrg         }
34507ec681f3Smrg         mad->omod = omod;
34517ec681f3Smrg         mad->clamp = clamp;
34527ec681f3Smrg         mad->definitions[0] = instr->definitions[0];
34537ec681f3Smrg
34547ec681f3Smrg         /* mark this ssa_def to be re-checked for profitability and literals */
34557ec681f3Smrg         ctx.mad_infos.emplace_back(std::move(instr), mul_instr->definitions[0].tempId());
34567ec681f3Smrg         ctx.info[mad->definitions[0].tempId()].set_mad(mad.get(), ctx.mad_infos.size() - 1);
34577ec681f3Smrg         instr = std::move(mad);
34587ec681f3Smrg         return;
34597ec681f3Smrg      }
34607ec681f3Smrg   }
34617ec681f3Smrg   /* v_mul_f32(v_cndmask_b32(0, 1.0, cond), a) -> v_cndmask_b32(0, a, cond) */
34627ec681f3Smrg   else if (instr->opcode == aco_opcode::v_mul_f32 && !instr->isVOP3()) {
34637ec681f3Smrg      for (unsigned i = 0; i < 2; i++) {
34647ec681f3Smrg         if (instr->operands[i].isTemp() && ctx.info[instr->operands[i].tempId()].is_b2f() &&
34657ec681f3Smrg             ctx.uses[instr->operands[i].tempId()] == 1 && instr->operands[!i].isTemp() &&
34667ec681f3Smrg             instr->operands[!i].getTemp().type() == RegType::vgpr) {
34677ec681f3Smrg            ctx.uses[instr->operands[i].tempId()]--;
34687ec681f3Smrg            ctx.uses[ctx.info[instr->operands[i].tempId()].temp.id()]++;
34697ec681f3Smrg
34707ec681f3Smrg            aco_ptr<VOP2_instruction> new_instr{
34717ec681f3Smrg               create_instruction<VOP2_instruction>(aco_opcode::v_cndmask_b32, Format::VOP2, 3, 1)};
34727ec681f3Smrg            new_instr->operands[0] = Operand::zero();
34737ec681f3Smrg            new_instr->operands[1] = instr->operands[!i];
34747ec681f3Smrg            new_instr->operands[2] = Operand(ctx.info[instr->operands[i].tempId()].temp);
34757ec681f3Smrg            new_instr->definitions[0] = instr->definitions[0];
34767ec681f3Smrg            instr = std::move(new_instr);
34777ec681f3Smrg            ctx.info[instr->definitions[0].tempId()].label = 0;
34787ec681f3Smrg            return;
34797ec681f3Smrg         }
34807ec681f3Smrg      }
34817ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_or_b32 && ctx.program->chip_class >= GFX9) {
34827ec681f3Smrg      if (combine_three_valu_op(ctx, instr, aco_opcode::s_or_b32, aco_opcode::v_or3_b32, "012",
34837ec681f3Smrg                                1 | 2)) {
34847ec681f3Smrg      } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_or_b32, aco_opcode::v_or3_b32,
34857ec681f3Smrg                                       "012", 1 | 2)) {
34867ec681f3Smrg      } else if (combine_add_or_then_and_lshl(ctx, instr)) {
34877ec681f3Smrg      }
34887ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_xor_b32 && ctx.program->chip_class >= GFX10) {
34897ec681f3Smrg      if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xor3_b32, "012",
34907ec681f3Smrg                                1 | 2)) {
34917ec681f3Smrg      } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xor3_b32,
34927ec681f3Smrg                                       "012", 1 | 2)) {
34937ec681f3Smrg      }
34947ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_add_u16) {
34957ec681f3Smrg      combine_three_valu_op(
34967ec681f3Smrg         ctx, instr, aco_opcode::v_mul_lo_u16,
34977ec681f3Smrg         ctx.program->chip_class == GFX8 ? aco_opcode::v_mad_legacy_u16 : aco_opcode::v_mad_u16,
34987ec681f3Smrg         "120", 1 | 2);
34997ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_add_u16_e64) {
35007ec681f3Smrg      combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16_e64, aco_opcode::v_mad_u16, "120",
35017ec681f3Smrg                            1 | 2);
35027ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_add_u32) {
35037ec681f3Smrg      if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
35047ec681f3Smrg      } else if (combine_add_bcnt(ctx, instr)) {
35057ec681f3Smrg      } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
35067ec681f3Smrg                                       aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
35077ec681f3Smrg      } else if (ctx.program->chip_class >= GFX9 && !instr->usesModifiers()) {
35087ec681f3Smrg         if (combine_three_valu_op(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::v_xad_u32, "120",
35097ec681f3Smrg                                   1 | 2)) {
35107ec681f3Smrg         } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_xor_b32, aco_opcode::v_xad_u32,
35117ec681f3Smrg                                          "120", 1 | 2)) {
35127ec681f3Smrg         } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_i32, aco_opcode::v_add3_u32,
35137ec681f3Smrg                                          "012", 1 | 2)) {
35147ec681f3Smrg         } else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32,
35157ec681f3Smrg                                          "012", 1 | 2)) {
35167ec681f3Smrg         } else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32,
35177ec681f3Smrg                                          "012", 1 | 2)) {
35187ec681f3Smrg         } else if (combine_add_or_then_and_lshl(ctx, instr)) {
35197ec681f3Smrg         }
35207ec681f3Smrg      }
35217ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_add_co_u32 ||
35227ec681f3Smrg              instr->opcode == aco_opcode::v_add_co_u32_e64) {
35237ec681f3Smrg      bool carry_out = ctx.uses[instr->definitions[1].tempId()] > 0;
35247ec681f3Smrg      if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_addc_co_u32, 1 | 2)) {
35257ec681f3Smrg      } else if (!carry_out && combine_add_bcnt(ctx, instr)) {
35267ec681f3Smrg      } else if (!carry_out && combine_three_valu_op(ctx, instr, aco_opcode::v_mul_u32_u24,
35277ec681f3Smrg                                                     aco_opcode::v_mad_u32_u24, "120", 1 | 2)) {
35287ec681f3Smrg      } else if (!carry_out && combine_add_lshl(ctx, instr, false)) {
35297ec681f3Smrg      }
35307ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_sub_u32 || instr->opcode == aco_opcode::v_sub_co_u32 ||
35317ec681f3Smrg              instr->opcode == aco_opcode::v_sub_co_u32_e64) {
35327ec681f3Smrg      bool carry_out =
35337ec681f3Smrg         instr->opcode != aco_opcode::v_sub_u32 && ctx.uses[instr->definitions[1].tempId()] > 0;
35347ec681f3Smrg      if (combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 2)) {
35357ec681f3Smrg      } else if (!carry_out && combine_add_lshl(ctx, instr, true)) {
35367ec681f3Smrg      }
35377ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_subrev_u32 ||
35387ec681f3Smrg              instr->opcode == aco_opcode::v_subrev_co_u32 ||
35397ec681f3Smrg              instr->opcode == aco_opcode::v_subrev_co_u32_e64) {
35407ec681f3Smrg      combine_add_sub_b2i(ctx, instr, aco_opcode::v_subbrev_co_u32, 1);
35417ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_lshlrev_b32 && ctx.program->chip_class >= GFX9) {
35427ec681f3Smrg      combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120",
35437ec681f3Smrg                            2);
35447ec681f3Smrg   } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) &&
35457ec681f3Smrg              ctx.program->chip_class >= GFX9) {
35467ec681f3Smrg      combine_salu_lshl_add(ctx, instr);
35477ec681f3Smrg   } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
35487ec681f3Smrg      combine_salu_not_bitwise(ctx, instr);
35497ec681f3Smrg   } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
35507ec681f3Smrg              instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
35517ec681f3Smrg      if (combine_ordering_test(ctx, instr)) {
35527ec681f3Smrg      } else if (combine_comparison_ordering(ctx, instr)) {
35537ec681f3Smrg      } else if (combine_constant_comparison_ordering(ctx, instr)) {
35547ec681f3Smrg      } else if (combine_salu_n2(ctx, instr)) {
35557ec681f3Smrg      }
35567ec681f3Smrg   } else if (instr->opcode == aco_opcode::v_and_b32) {
35577ec681f3Smrg      combine_and_subbrev(ctx, instr);
35587ec681f3Smrg   } else {
35597ec681f3Smrg      aco_opcode min, max, min3, max3, med3;
35607ec681f3Smrg      bool some_gfx9_only;
35617ec681f3Smrg      if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) &&
35627ec681f3Smrg          (!some_gfx9_only || ctx.program->chip_class >= GFX9)) {
35637ec681f3Smrg         if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,
35647ec681f3Smrg                            instr->opcode == min ? min3 : max3)) {
35657ec681f3Smrg         } else {
35667ec681f3Smrg            combine_clamp(ctx, instr, min, max, med3);
35677ec681f3Smrg         }
35687ec681f3Smrg      }
35697ec681f3Smrg   }
35707ec681f3Smrg
35717ec681f3Smrg   /* do this after combine_salu_n2() */
35727ec681f3Smrg   if (instr->opcode == aco_opcode::s_andn2_b32 || instr->opcode == aco_opcode::s_andn2_b64)
35737ec681f3Smrg      combine_inverse_comparison(ctx, instr);
35747ec681f3Smrg}
35757ec681f3Smrg
35767ec681f3Smrgbool
35777ec681f3Smrgto_uniform_bool_instr(opt_ctx& ctx, aco_ptr<Instruction>& instr)
35787ec681f3Smrg{
35797ec681f3Smrg   /* Check every operand to make sure they are suitable. */
35807ec681f3Smrg   for (Operand& op : instr->operands) {
35817ec681f3Smrg      if (!op.isTemp())
35827ec681f3Smrg         return false;
35837ec681f3Smrg      if (!ctx.info[op.tempId()].is_uniform_bool() && !ctx.info[op.tempId()].is_uniform_bitwise())
35847ec681f3Smrg         return false;
35857ec681f3Smrg   }
35867ec681f3Smrg
35877ec681f3Smrg   switch (instr->opcode) {
35887ec681f3Smrg   case aco_opcode::s_and_b32:
35897ec681f3Smrg   case aco_opcode::s_and_b64: instr->opcode = aco_opcode::s_and_b32; break;
35907ec681f3Smrg   case aco_opcode::s_or_b32:
35917ec681f3Smrg   case aco_opcode::s_or_b64: instr->opcode = aco_opcode::s_or_b32; break;
35927ec681f3Smrg   case aco_opcode::s_xor_b32:
35937ec681f3Smrg   case aco_opcode::s_xor_b64: instr->opcode = aco_opcode::s_absdiff_i32; break;
35947ec681f3Smrg   default:
35957ec681f3Smrg      /* Don't transform other instructions. They are very unlikely to appear here. */
35967ec681f3Smrg      return false;
35977ec681f3Smrg   }
35987ec681f3Smrg
35997ec681f3Smrg   for (Operand& op : instr->operands) {
36007ec681f3Smrg      ctx.uses[op.tempId()]--;
36017ec681f3Smrg
36027ec681f3Smrg      if (ctx.info[op.tempId()].is_uniform_bool()) {
36037ec681f3Smrg         /* Just use the uniform boolean temp. */
36047ec681f3Smrg         op.setTemp(ctx.info[op.tempId()].temp);
36057ec681f3Smrg      } else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
36067ec681f3Smrg         /* Use the SCC definition of the predecessor instruction.
36077ec681f3Smrg          * This allows the predecessor to get picked up by the same optimization (if it has no
36087ec681f3Smrg          * divergent users), and it also makes sure that the current instruction will keep working
36097ec681f3Smrg          * even if the predecessor won't be transformed.
36107ec681f3Smrg          */
36117ec681f3Smrg         Instruction* pred_instr = ctx.info[op.tempId()].instr;
36127ec681f3Smrg         assert(pred_instr->definitions.size() >= 2);
36137ec681f3Smrg         assert(pred_instr->definitions[1].isFixed() &&
36147ec681f3Smrg                pred_instr->definitions[1].physReg() == scc);
36157ec681f3Smrg         op.setTemp(pred_instr->definitions[1].getTemp());
36167ec681f3Smrg      } else {
36177ec681f3Smrg         unreachable("Invalid operand on uniform bitwise instruction.");
36187ec681f3Smrg      }
36197ec681f3Smrg
36207ec681f3Smrg      ctx.uses[op.tempId()]++;
36217ec681f3Smrg   }
36227ec681f3Smrg
36237ec681f3Smrg   instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
36247ec681f3Smrg   assert(instr->operands[0].regClass() == s1);
36257ec681f3Smrg   assert(instr->operands[1].regClass() == s1);
36267ec681f3Smrg   return true;
36277ec681f3Smrg}
36287ec681f3Smrg
/*
 * Per-instruction step of the backward selection pass (optimize() calls this while walking
 * blocks and instructions in reverse order).
 *
 * In order, this function:
 *  - frees the instruction if is_dead() reports it as dead,
 *  - rewrites a p_split_vector with exactly one used definition into a single-operand
 *    p_create_vector or a p_extract_vector,
 *  - for definitions labelled "mad": either swaps the stored add instruction back in, or records
 *    in mad_info which operand may become a literal,
 *  - sets the scc_needed label on temps consumed through SCC by branches, s_cselect and p_wqm,
 *  - converts uniform bitwise boolean instructions via to_uniform_bool_instr(),
 *  - folds a DPP-labelled operand into a VALU instruction,
 *  - picks at most one literal-defining temp per SALU/VALU instruction and decrements its use
 *    count; apply_literals() later substitutes the literal for temps whose count reached zero.
 *
 * ctx.uses is updated throughout. instr may be reset to null or replaced by a new instruction.
 */
36297ec681f3Smrgvoid
36307ec681f3Smrgselect_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
36317ec681f3Smrg{
   /* A literal-defining temp is only folded into its users while its use count is below this
    * value (see the "literal_uses < threshold" checks below). */
36327ec681f3Smrg   const uint32_t threshold = 4;
36337ec681f3Smrg
   /* Dead code elimination: the slot is left null and skipped later by apply_literals(). */
36347ec681f3Smrg   if (is_dead(ctx.uses, instr.get())) {
36357ec681f3Smrg      instr.reset();
36367ec681f3Smrg      return;
36377ec681f3Smrg   }
36387ec681f3Smrg
36397ec681f3Smrg   /* convert split_vector into a copy or extract_vector if only one definition is ever used */
36407ec681f3Smrg   if (instr->opcode == aco_opcode::p_split_vector) {
      /* Count the used definitions. For the last used one, remember its index and its byte
       * offset inside the source vector ("offset" accumulates the sizes of earlier definitions). */
36417ec681f3Smrg      unsigned num_used = 0;
36427ec681f3Smrg      unsigned idx = 0;
36437ec681f3Smrg      unsigned split_offset = 0;
36447ec681f3Smrg      for (unsigned i = 0, offset = 0; i < instr->definitions.size();
36457ec681f3Smrg           offset += instr->definitions[i++].bytes()) {
36467ec681f3Smrg         if (ctx.uses[instr->definitions[i].tempId()]) {
36477ec681f3Smrg            num_used++;
36487ec681f3Smrg            idx = i;
36497ec681f3Smrg            split_offset = offset;
36507ec681f3Smrg         }
36517ec681f3Smrg      }
36527ec681f3Smrg      bool done = false;
      /* Case 1: the source is labelled as a vector and this split is its only user. Look for the
       * operand of the vector's defining instruction that starts at split_offset. */
36537ec681f3Smrg      if (num_used == 1 && ctx.info[instr->operands[0].tempId()].is_vec() &&
36547ec681f3Smrg          ctx.uses[instr->operands[0].tempId()] == 1) {
36557ec681f3Smrg         Instruction* vec = ctx.info[instr->operands[0].tempId()].instr;
36567ec681f3Smrg
36577ec681f3Smrg         unsigned off = 0;
36587ec681f3Smrg         Operand op;
36597ec681f3Smrg         for (Operand& vec_op : vec->operands) {
36607ec681f3Smrg            if (off == split_offset) {
36617ec681f3Smrg               op = vec_op;
36627ec681f3Smrg               break;
36637ec681f3Smrg            }
36647ec681f3Smrg            off += vec_op.bytes();
36657ec681f3Smrg         }
         /* NOTE(review): "off" equal to the full source size presumably means the loop ran to
          * the end without finding an operand at split_offset. The operand must also have
          * exactly the size of the used definition. */
36667ec681f3Smrg         if (off != instr->operands[0].bytes() && op.bytes() == instr->definitions[idx].bytes()) {
            /* Drop this split's use of the vector and the uses held by the vector's operands
             * (presumably because the vector instruction becomes dead), then count the selected
             * operand once for the new copy. */
36677ec681f3Smrg            ctx.uses[instr->operands[0].tempId()]--;
36687ec681f3Smrg            for (Operand& vec_op : vec->operands) {
36697ec681f3Smrg               if (vec_op.isTemp())
36707ec681f3Smrg                  ctx.uses[vec_op.tempId()]--;
36717ec681f3Smrg            }
36727ec681f3Smrg            if (op.isTemp())
36737ec681f3Smrg               ctx.uses[op.tempId()]++;
36747ec681f3Smrg
            /* A one-operand p_create_vector acts as the copy of the selected operand. */
36757ec681f3Smrg            aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
36767ec681f3Smrg               aco_opcode::p_create_vector, Format::PSEUDO, 1, 1)};
36777ec681f3Smrg            extract->operands[0] = op;
36787ec681f3Smrg            extract->definitions[0] = instr->definitions[idx];
36797ec681f3Smrg            instr = std::move(extract);
36807ec681f3Smrg
36817ec681f3Smrg            done = true;
36827ec681f3Smrg         }
36837ec681f3Smrg      }
36847ec681f3Smrg
      /* Case 2: use p_extract_vector when the used definition's size evenly divides both the
       * source size and its byte offset, so it can be addressed by an element index. */
36857ec681f3Smrg      if (!done && num_used == 1 &&
36867ec681f3Smrg          instr->operands[0].bytes() % instr->definitions[idx].bytes() == 0 &&
36877ec681f3Smrg          split_offset % instr->definitions[idx].bytes() == 0) {
36887ec681f3Smrg         aco_ptr<Pseudo_instruction> extract{create_instruction<Pseudo_instruction>(
36897ec681f3Smrg            aco_opcode::p_extract_vector, Format::PSEUDO, 2, 1)};
36907ec681f3Smrg         extract->operands[0] = instr->operands[0];
36917ec681f3Smrg         extract->operands[1] =
36927ec681f3Smrg            Operand::c32((uint32_t)split_offset / instr->definitions[idx].bytes());
36937ec681f3Smrg         extract->definitions[0] = instr->definitions[idx];
36947ec681f3Smrg         instr = std::move(extract);
36957ec681f3Smrg      }
36967ec681f3Smrg   }
36977ec681f3Smrg
   /* The mad_info entry is found through the pass_flags of the instruction recorded in the
    * ssa_info of this instruction's first definition. */
36987ec681f3Smrg   mad_info* mad_info = NULL;
36997ec681f3Smrg   if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
37007ec681f3Smrg      mad_info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
37017ec681f3Smrg      /* re-check mad instructions */
      /* If the multiply result still has uses and the original add was kept, swap that add back
       * in: the multiply temp gains a use and operands 0 and 1 of the current instruction each
       * lose one. */
37027ec681f3Smrg      if (ctx.uses[mad_info->mul_temp_id] && mad_info->add_instr) {
37037ec681f3Smrg         ctx.uses[mad_info->mul_temp_id]++;
37047ec681f3Smrg         if (instr->operands[0].isTemp())
37057ec681f3Smrg            ctx.uses[instr->operands[0].tempId()]--;
37067ec681f3Smrg         if (instr->operands[1].isTemp())
37077ec681f3Smrg            ctx.uses[instr->operands[1].tempId()]--;
37087ec681f3Smrg         instr.swap(mad_info->add_instr);
37097ec681f3Smrg         mad_info = NULL;
37107ec681f3Smrg      }
37117ec681f3Smrg      /* check literals */
37127ec681f3Smrg      else if (!instr->usesModifiers() && instr->opcode != aco_opcode::v_fma_f64) {
37137ec681f3Smrg         /* FMA can only take literals on GFX10+ */
37147ec681f3Smrg         if ((instr->opcode == aco_opcode::v_fma_f32 || instr->opcode == aco_opcode::v_fma_f16) &&
37157ec681f3Smrg             ctx.program->chip_class < GFX10)
37167ec681f3Smrg            return;
37177ec681f3Smrg         /* There are no v_fmaak_legacy_f16/v_fmamk_legacy_f16 and on chips where VOP3 can take
37187ec681f3Smrg          * literals (GFX10+), these instructions don't exist.
37197ec681f3Smrg          */
37207ec681f3Smrg         if (instr->opcode == aco_opcode::v_fma_legacy_f16)
37217ec681f3Smrg            return;
37227ec681f3Smrg
         /* Search the operands for the literal-defining temp with the fewest uses.
          * literal_uses == UINT32_MAX means "no usable candidate". */
37237ec681f3Smrg         bool sgpr_used = false;
37247ec681f3Smrg         uint32_t literal_idx = 0;
37257ec681f3Smrg         uint32_t literal_uses = UINT32_MAX;
37267ec681f3Smrg         for (unsigned i = 0; i < instr->operands.size(); i++) {
            /* A constant in any operand other than the first rules out a literal entirely. */
37277ec681f3Smrg            if (instr->operands[i].isConstant() && i > 0) {
37287ec681f3Smrg               literal_uses = UINT32_MAX;
37297ec681f3Smrg               break;
37307ec681f3Smrg            }
37317ec681f3Smrg            if (!instr->operands[i].isTemp())
37327ec681f3Smrg               continue;
37337ec681f3Smrg            unsigned bits = get_operand_size(instr, i);
37347ec681f3Smrg            /* if one of the operands is sgpr, we cannot add a literal somewhere else on pre-GFX10
37357ec681f3Smrg             * or operands other than the 1st */
37367ec681f3Smrg            if (instr->operands[i].getTemp().type() == RegType::sgpr &&
37377ec681f3Smrg                (i > 0 || ctx.program->chip_class < GFX10)) {
37387ec681f3Smrg               if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits)) {
37397ec681f3Smrg                  literal_uses = ctx.uses[instr->operands[i].tempId()];
37407ec681f3Smrg                  literal_idx = i;
37417ec681f3Smrg               } else {
37427ec681f3Smrg                  literal_uses = UINT32_MAX;
37437ec681f3Smrg               }
37447ec681f3Smrg               sgpr_used = true;
37457ec681f3Smrg               /* don't break because we still need to check constants */
37467ec681f3Smrg            } else if (!sgpr_used && ctx.info[instr->operands[i].tempId()].is_literal(bits) &&
37477ec681f3Smrg                       ctx.uses[instr->operands[i].tempId()] < literal_uses) {
37487ec681f3Smrg               literal_uses = ctx.uses[instr->operands[i].tempId()];
37497ec681f3Smrg               literal_idx = i;
37507ec681f3Smrg            }
37517ec681f3Smrg         }
37527ec681f3Smrg
37537ec681f3Smrg         /* Limit the number of literals to apply to not increase the code
37547ec681f3Smrg          * size too much, but always apply literals for v_mad->v_madak
37557ec681f3Smrg          * because both instructions are 64-bit and this doesn't increase
37567ec681f3Smrg          * code size.
37577ec681f3Smrg          * TODO: try to apply the literals earlier to lower the number of
37587ec681f3Smrg          * uses below threshold
37597ec681f3Smrg          */
37607ec681f3Smrg         if (literal_uses < threshold || literal_idx == 2) {
            /* Tentatively release this instruction's use of the literal temp and record the
             * choice. apply_literals() performs the rewrite only if the temp's use count is
             * zero by then, or if the literal is operand 2. */
37617ec681f3Smrg            ctx.uses[instr->operands[literal_idx].tempId()]--;
37627ec681f3Smrg            mad_info->check_literal = true;
37637ec681f3Smrg            mad_info->literal_idx = literal_idx;
37647ec681f3Smrg            return;
37657ec681f3Smrg         }
37667ec681f3Smrg      }
37677ec681f3Smrg   }
37687ec681f3Smrg
37697ec681f3Smrg   /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions
37707ec681f3Smrg    * when it isn't beneficial */
37717ec681f3Smrg   if (instr->isBranch() && instr->operands.size() && instr->operands[0].isTemp() &&
37727ec681f3Smrg       instr->operands[0].isFixed() && instr->operands[0].physReg() == scc) {
37737ec681f3Smrg      ctx.info[instr->operands[0].tempId()].set_scc_needed();
37747ec681f3Smrg      return;
37757ec681f3Smrg   } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
37767ec681f3Smrg               instr->opcode == aco_opcode::s_cselect_b32) &&
37777ec681f3Smrg              instr->operands[2].isTemp()) {
37787ec681f3Smrg      ctx.info[instr->operands[2].tempId()].set_scc_needed();
37797ec681f3Smrg   } else if (instr->opcode == aco_opcode::p_wqm && instr->operands[0].isTemp() &&
37807ec681f3Smrg              ctx.info[instr->definitions[0].tempId()].is_scc_needed()) {
37817ec681f3Smrg      /* Propagate label so it is correctly detected by the uniform bool transform */
37827ec681f3Smrg      ctx.info[instr->operands[0].tempId()].set_scc_needed();
37837ec681f3Smrg
37847ec681f3Smrg      /* Fix definition to SCC, this will prevent RA from adding superfluous moves */
37857ec681f3Smrg      instr->definitions[0].setFixed(scc);
37867ec681f3Smrg   }
37877ec681f3Smrg
37887ec681f3Smrg   /* check for literals */
   /* Everything below only applies to SALU and VALU instructions. */
37897ec681f3Smrg   if (!instr->isSALU() && !instr->isVALU())
37907ec681f3Smrg      return;
37917ec681f3Smrg
37927ec681f3Smrg   /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
37937ec681f3Smrg   if (instr->definitions.size() && ctx.uses[instr->definitions[0].tempId()] == 0 &&
37947ec681f3Smrg       ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
37957ec681f3Smrg      bool transform_done = to_uniform_bool_instr(ctx, instr);
37967ec681f3Smrg
37977ec681f3Smrg      if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
37987ec681f3Smrg         /* Swap the two definition IDs in order to avoid overusing the SCC.
37997ec681f3Smrg          * This reduces extra moves generated by RA. */
38007ec681f3Smrg         uint32_t def0_id = instr->definitions[0].getTemp().id();
38017ec681f3Smrg         uint32_t def1_id = instr->definitions[1].getTemp().id();
38027ec681f3Smrg         instr->definitions[0].setTemp(Temp(def1_id, s1));
38037ec681f3Smrg         instr->definitions[1].setTemp(Temp(def0_id, s1));
38047ec681f3Smrg      }
38057ec681f3Smrg
      /* No literal selection is attempted for these instructions, transformed or not. */
38067ec681f3Smrg      return;
38077ec681f3Smrg   }
38087ec681f3Smrg
38097ec681f3Smrg   /* Combine DPP copies into VALU. This should be done after creating MAD/FMA. */
38107ec681f3Smrg   if (instr->isVALU()) {
38117ec681f3Smrg      for (unsigned i = 0; i < instr->operands.size(); i++) {
38127ec681f3Smrg         if (!instr->operands[i].isTemp())
38137ec681f3Smrg            continue;
         /* Local copy of the operand's ssa_info. */
38147ec681f3Smrg         ssa_info info = ctx.info[instr->operands[i].tempId()];
38157ec681f3Smrg
         /* The DPP-labelled instruction must have the same pass_flags as this one, and the
          * operand must already be operand 0 or be movable there by swapping operands. */
38167ec681f3Smrg         aco_opcode swapped_op;
38177ec681f3Smrg         if (info.is_dpp() && info.instr->pass_flags == instr->pass_flags &&
38187ec681f3Smrg             (i == 0 || can_swap_operands(instr, &swapped_op)) && can_use_DPP(instr, true) &&
38197ec681f3Smrg             !instr->isDPP()) {
38207ec681f3Smrg            convert_to_DPP(instr);
38217ec681f3Smrg            DPP_instruction* dpp = static_cast<DPP_instruction*>(instr.get());
38227ec681f3Smrg            if (i) {
38237ec681f3Smrg               instr->opcode = swapped_op;
38247ec681f3Smrg               std::swap(instr->operands[0], instr->operands[1]);
38257ec681f3Smrg               std::swap(dpp->neg[0], dpp->neg[1]);
38267ec681f3Smrg               std::swap(dpp->abs[0], dpp->abs[1]);
38277ec681f3Smrg            }
            /* The DPP-labelled instruction loses this user. If it still has other users, its
             * source gains an additional user (this instruction); otherwise the use count of
             * the source is left unchanged. */
38287ec681f3Smrg            if (--ctx.uses[info.instr->definitions[0].tempId()])
38297ec681f3Smrg               ctx.uses[info.instr->operands[0].tempId()]++;
38307ec681f3Smrg            instr->operands[0].setTemp(info.instr->operands[0].getTemp());
38317ec681f3Smrg            dpp->dpp_ctrl = info.instr->dpp().dpp_ctrl;
38327ec681f3Smrg            dpp->bound_ctrl = info.instr->dpp().bound_ctrl;
            /* Merge the source modifiers of the folded instruction into operand 0; its neg is
             * ignored when this instruction already applies abs to operand 0. */
38337ec681f3Smrg            dpp->neg[0] ^= info.instr->dpp().neg[0] && !dpp->abs[0];
38347ec681f3Smrg            dpp->abs[0] |= info.instr->dpp().abs[0];
38357ec681f3Smrg            break;
38367ec681f3Smrg         }
38377ec681f3Smrg      }
38387ec681f3Smrg   }
38397ec681f3Smrg
38407ec681f3Smrg   if (instr->isSDWA() || (instr->isVOP3() && ctx.program->chip_class < GFX10) ||
38417ec681f3Smrg       (instr->isVOP3P() && ctx.program->chip_class < GFX10))
38427ec681f3Smrg      return; /* some encodings can't ever take literals */
38437ec681f3Smrg
38447ec681f3Smrg   /* we do not apply the literals yet as we don't know if it is profitable */
   /* current_literal: a literal operand the instruction already has (undefined if none). */
38457ec681f3Smrg   Operand current_literal(s1);
38467ec681f3Smrg
   /* Best candidate so far: temp id, its use count and its value as a 32-bit constant. */
38477ec681f3Smrg   unsigned literal_id = 0;
38487ec681f3Smrg   unsigned literal_uses = UINT32_MAX;
38497ec681f3Smrg   Operand literal(s1);
   /* Only operand 0 is considered, unless the instruction is SALU, or the chip is GFX10+ and
    * the instruction can use VOP3 or is VOP3P. */
38507ec681f3Smrg   unsigned num_operands = 1;
38517ec681f3Smrg   if (instr->isSALU() ||
38527ec681f3Smrg       (ctx.program->chip_class >= GFX10 && (can_use_VOP3(ctx, instr) || instr->isVOP3P())))
38537ec681f3Smrg      num_operands = instr->operands.size();
38547ec681f3Smrg   /* catch VOP2 with a 3rd SGPR operand (e.g. v_cndmask_b32, v_addc_co_u32) */
38557ec681f3Smrg   else if (instr->isVALU() && instr->operands.size() >= 3)
38567ec681f3Smrg      return;
38577ec681f3Smrg
   /* sgpr_ids: SGPR temps read by a VALU instruction (0 means "slot empty").
    * mask: bit i is set when operand i reads the currently selected literal temp. */
38587ec681f3Smrg   unsigned sgpr_ids[2] = {0, 0};
38597ec681f3Smrg   bool is_literal_sgpr = false;
38607ec681f3Smrg   uint32_t mask = 0;
38617ec681f3Smrg
38627ec681f3Smrg   /* choose a literal to apply */
38637ec681f3Smrg   for (unsigned i = 0; i < num_operands; i++) {
38647ec681f3Smrg      Operand op = instr->operands[i];
38657ec681f3Smrg      unsigned bits = get_operand_size(instr, i);
38667ec681f3Smrg
      /* The first SGPR id fills slot 0; any later id that differs from slot 0 goes to slot 1. */
38677ec681f3Smrg      if (instr->isVALU() && op.isTemp() && op.getTemp().type() == RegType::sgpr &&
38687ec681f3Smrg          op.tempId() != sgpr_ids[0])
38697ec681f3Smrg         sgpr_ids[!!sgpr_ids[0]] = op.tempId();
38707ec681f3Smrg
38717ec681f3Smrg      if (op.isLiteral()) {
38727ec681f3Smrg         current_literal = op;
38737ec681f3Smrg         continue;
38747ec681f3Smrg      } else if (!op.isTemp() || !ctx.info[op.tempId()].is_literal(bits)) {
38757ec681f3Smrg         continue;
38767ec681f3Smrg      }
38777ec681f3Smrg
38787ec681f3Smrg      if (!alu_can_accept_constant(instr->opcode, i))
38797ec681f3Smrg         continue;
38807ec681f3Smrg
      /* Prefer the candidate with the fewest uses. Switching candidates restarts the mask. */
38817ec681f3Smrg      if (ctx.uses[op.tempId()] < literal_uses) {
38827ec681f3Smrg         is_literal_sgpr = op.getTemp().type() == RegType::sgpr;
38837ec681f3Smrg         mask = 0;
38847ec681f3Smrg         literal = Operand::c32(ctx.info[op.tempId()].val);
38857ec681f3Smrg         literal_uses = ctx.uses[op.tempId()];
38867ec681f3Smrg         literal_id = op.tempId();
38877ec681f3Smrg      }
38887ec681f3Smrg
38897ec681f3Smrg      mask |= (op.tempId() == literal_id) << i;
38907ec681f3Smrg   }
38917ec681f3Smrg
38927ec681f3Smrg   /* don't go over the constant bus limit */
   /* Limit used here: 1 for VALU and unlimited otherwise, raised to 2 on GFX10+ except for the
    * 64-bit shifts. */
38937ec681f3Smrg   bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
38947ec681f3Smrg                     instr->opcode == aco_opcode::v_lshrrev_b64 ||
38957ec681f3Smrg                     instr->opcode == aco_opcode::v_ashrrev_i64;
38967ec681f3Smrg   unsigned const_bus_limit = instr->isVALU() ? 1 : UINT32_MAX;
38977ec681f3Smrg   if (ctx.program->chip_class >= GFX10 && !is_shift64)
38987ec681f3Smrg      const_bus_limit = 2;
38997ec681f3Smrg
   /* If the SGPRs already reach the limit, a literal is only acceptable when the chosen
    * candidate is itself an SGPR. */
39007ec681f3Smrg   unsigned num_sgprs = !!sgpr_ids[0] + !!sgpr_ids[1];
39017ec681f3Smrg   if (num_sgprs == const_bus_limit && !is_literal_sgpr)
39027ec681f3Smrg      return;
39037ec681f3Smrg
   /* Accept the candidate only if it has few enough uses and does not conflict with a literal
    * the instruction already has (same size and value required). */
39047ec681f3Smrg   if (literal_id && literal_uses < threshold &&
39057ec681f3Smrg       (current_literal.isUndefined() ||
39067ec681f3Smrg        (current_literal.size() == literal.size() &&
39077ec681f3Smrg         current_literal.constantValue() == literal.constantValue()))) {
39087ec681f3Smrg      /* mark the literal to be applied */
      /* One use is dropped for every operand that reads the chosen temp. */
39097ec681f3Smrg      while (mask) {
39107ec681f3Smrg         unsigned i = u_bit_scan(&mask);
39117ec681f3Smrg         if (instr->operands[i].isTemp() && instr->operands[i].tempId() == literal_id)
39127ec681f3Smrg            ctx.uses[instr->operands[i].tempId()]--;
39137ec681f3Smrg      }
39147ec681f3Smrg   }
39157ec681f3Smrg}
39167ec681f3Smrg
39177ec681f3Smrgvoid
39187ec681f3Smrgapply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
39197ec681f3Smrg{
39207ec681f3Smrg   /* Cleanup Dead Instructions */
39217ec681f3Smrg   if (!instr)
39227ec681f3Smrg      return;
39237ec681f3Smrg
39247ec681f3Smrg   /* apply literals on MAD */
39257ec681f3Smrg   if (!instr->definitions.empty() && ctx.info[instr->definitions[0].tempId()].is_mad()) {
39267ec681f3Smrg      mad_info* info = &ctx.mad_infos[ctx.info[instr->definitions[0].tempId()].instr->pass_flags];
39277ec681f3Smrg      if (info->check_literal &&
39287ec681f3Smrg          (ctx.uses[instr->operands[info->literal_idx].tempId()] == 0 || info->literal_idx == 2)) {
39297ec681f3Smrg         aco_ptr<Instruction> new_mad;
39307ec681f3Smrg
39317ec681f3Smrg         aco_opcode new_op =
39327ec681f3Smrg            info->literal_idx == 2 ? aco_opcode::v_madak_f32 : aco_opcode::v_madmk_f32;
39337ec681f3Smrg         if (instr->opcode == aco_opcode::v_fma_f32)
39347ec681f3Smrg            new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_fmamk_f32;
39357ec681f3Smrg         else if (instr->opcode == aco_opcode::v_mad_f16 ||
39367ec681f3Smrg                  instr->opcode == aco_opcode::v_mad_legacy_f16)
39377ec681f3Smrg            new_op = info->literal_idx == 2 ? aco_opcode::v_madak_f16 : aco_opcode::v_madmk_f16;
39387ec681f3Smrg         else if (instr->opcode == aco_opcode::v_fma_f16)
39397ec681f3Smrg            new_op = info->literal_idx == 2 ? aco_opcode::v_fmaak_f16 : aco_opcode::v_fmamk_f16;
39407ec681f3Smrg
39417ec681f3Smrg         new_mad.reset(create_instruction<VOP2_instruction>(new_op, Format::VOP2, 3, 1));
39427ec681f3Smrg         if (info->literal_idx == 2) { /* add literal -> madak */
39437ec681f3Smrg            new_mad->operands[0] = instr->operands[0];
39447ec681f3Smrg            new_mad->operands[1] = instr->operands[1];
39457ec681f3Smrg         } else { /* mul literal -> madmk */
39467ec681f3Smrg            new_mad->operands[0] = instr->operands[1 - info->literal_idx];
39477ec681f3Smrg            new_mad->operands[1] = instr->operands[2];
39487ec681f3Smrg         }
39497ec681f3Smrg         new_mad->operands[2] =
39507ec681f3Smrg            Operand::c32(ctx.info[instr->operands[info->literal_idx].tempId()].val);
39517ec681f3Smrg         new_mad->definitions[0] = instr->definitions[0];
39527ec681f3Smrg         ctx.instructions.emplace_back(std::move(new_mad));
39537ec681f3Smrg         return;
39547ec681f3Smrg      }
39557ec681f3Smrg   }
39567ec681f3Smrg
39577ec681f3Smrg   /* apply literals on other SALU/VALU */
39587ec681f3Smrg   if (instr->isSALU() || instr->isVALU()) {
39597ec681f3Smrg      for (unsigned i = 0; i < instr->operands.size(); i++) {
39607ec681f3Smrg         Operand op = instr->operands[i];
39617ec681f3Smrg         unsigned bits = get_operand_size(instr, i);
39627ec681f3Smrg         if (op.isTemp() && ctx.info[op.tempId()].is_literal(bits) && ctx.uses[op.tempId()] == 0) {
39637ec681f3Smrg            Operand literal = Operand::c32(ctx.info[op.tempId()].val);
39647ec681f3Smrg            instr->format = withoutDPP(instr->format);
39657ec681f3Smrg            if (instr->isVALU() && i > 0 && instr->format != Format::VOP3P)
39667ec681f3Smrg               to_VOP3(ctx, instr);
39677ec681f3Smrg            instr->operands[i] = literal;
39687ec681f3Smrg         }
39697ec681f3Smrg      }
39707ec681f3Smrg   }
39717ec681f3Smrg
39727ec681f3Smrg   ctx.instructions.emplace_back(std::move(instr));
39737ec681f3Smrg}
39747ec681f3Smrg
39757ec681f3Smrgvoid
39767ec681f3Smrgoptimize(Program* program)
39777ec681f3Smrg{
39787ec681f3Smrg   opt_ctx ctx;
39797ec681f3Smrg   ctx.program = program;
39807ec681f3Smrg   std::vector<ssa_info> info(program->peekAllocationId());
39817ec681f3Smrg   ctx.info = info.data();
39827ec681f3Smrg
39837ec681f3Smrg   /* 1. Bottom-Up DAG pass (forward) to label all ssa-defs */
39847ec681f3Smrg   for (Block& block : program->blocks) {
39857ec681f3Smrg      ctx.fp_mode = block.fp_mode;
39867ec681f3Smrg      for (aco_ptr<Instruction>& instr : block.instructions)
39877ec681f3Smrg         label_instruction(ctx, instr);
39887ec681f3Smrg   }
39897ec681f3Smrg
39907ec681f3Smrg   ctx.uses = dead_code_analysis(program);
39917ec681f3Smrg
39927ec681f3Smrg   /* 2. Combine v_mad, omod, clamp and propagate sgpr on VALU instructions */
39937ec681f3Smrg   for (Block& block : program->blocks) {
39947ec681f3Smrg      ctx.fp_mode = block.fp_mode;
39957ec681f3Smrg      for (aco_ptr<Instruction>& instr : block.instructions)
39967ec681f3Smrg         combine_instruction(ctx, instr);
39977ec681f3Smrg   }
39987ec681f3Smrg
39997ec681f3Smrg   /* 3. Top-Down DAG pass (backward) to select instructions (includes DCE) */
40007ec681f3Smrg   for (auto block_rit = program->blocks.rbegin(); block_rit != program->blocks.rend();
40017ec681f3Smrg        ++block_rit) {
40027ec681f3Smrg      Block* block = &(*block_rit);
40037ec681f3Smrg      ctx.fp_mode = block->fp_mode;
40047ec681f3Smrg      for (auto instr_rit = block->instructions.rbegin(); instr_rit != block->instructions.rend();
40057ec681f3Smrg           ++instr_rit)
40067ec681f3Smrg         select_instruction(ctx, *instr_rit);
40077ec681f3Smrg   }
40087ec681f3Smrg
40097ec681f3Smrg   /* 4. Add literals to instructions */
40107ec681f3Smrg   for (Block& block : program->blocks) {
40117ec681f3Smrg      ctx.instructions.clear();
40127ec681f3Smrg      ctx.fp_mode = block.fp_mode;
40137ec681f3Smrg      for (aco_ptr<Instruction>& instr : block.instructions)
40147ec681f3Smrg         apply_literals(ctx, instr);
40157ec681f3Smrg      block.instructions.swap(ctx.instructions);
40167ec681f3Smrg   }
40177ec681f3Smrg}
40187ec681f3Smrg
40197ec681f3Smrg} // namespace aco
4020