17ec681f3Smrg/*
27ec681f3Smrg * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io>
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
217ec681f3Smrg * SOFTWARE.
227ec681f3Smrg */
237ec681f3Smrg
247ec681f3Smrg#include "agx_compiler.h"
257ec681f3Smrg#include "agx_minifloat.h"
267ec681f3Smrg
277ec681f3Smrg/* AGX peephole optimizer responsible for instruction combining. It operates in
287ec681f3Smrg * a forward direction and a backward direction, in each case traversing in
297ec681f3Smrg * source order. SSA means the forward pass satisfies the invariant:
307ec681f3Smrg *
317ec681f3Smrg *    Every def is visited before any of its uses.
327ec681f3Smrg *
 * Dually, the backward pass satisfies the invariant:
347ec681f3Smrg *
357ec681f3Smrg *    Every use of a def is visited before the def.
367ec681f3Smrg *
377ec681f3Smrg * This means the forward pass can propagate modifiers forward, whereas the
387ec681f3Smrg * backwards pass propagates modifiers backward. Consider an example:
397ec681f3Smrg *
407ec681f3Smrg *    1 = fabs 0
417ec681f3Smrg *    2 = fround 1
427ec681f3Smrg *    3 = fsat 1
437ec681f3Smrg *
447ec681f3Smrg * The forwards pass would propagate the fabs to the fround (since we can
457ec681f3Smrg * lookup the fabs from the fround source and do the replacement). By contrast
467ec681f3Smrg * the backwards pass would propagate the fsat back to the fround (since when
 * we see the fround we know it has only a single user, fsat).  Propagatable
 * instructions have natural directions (like pushforwards and pullbacks).
497ec681f3Smrg *
507ec681f3Smrg * We are careful to update the tracked state whenever we modify an instruction
517ec681f3Smrg * to ensure the passes are linear-time and converge in a single iteration.
527ec681f3Smrg *
537ec681f3Smrg * Size conversions are worth special discussion. Consider the snippet:
547ec681f3Smrg *
557ec681f3Smrg *    2 = fadd 0, 1
567ec681f3Smrg *    3 = f2f16 2
577ec681f3Smrg *    4 = fround 3
587ec681f3Smrg *
597ec681f3Smrg * A priori, we can move the f2f16 in either direction. But it's not equal --
607ec681f3Smrg * if we move it up to the fadd, we get FP16 for two instructions, whereas if
617ec681f3Smrg * we push it into the fround, we effectively get FP32 for two instructions. So
627ec681f3Smrg * f2f16 is backwards. Likewise, consider
637ec681f3Smrg *
 *    2 = fadd 0, 1
 *    3 = f2f32 2
 *    4 = fround 3
 *
 * This time if we move f2f32 up to the fadd, we get FP32 for two, but if we
 * move it down to the fround, we get FP16 for two. So f2f32 is forwards.
707ec681f3Smrg */
717ec681f3Smrg
727ec681f3Smrgstatic bool
737ec681f3Smrgagx_is_fmov(agx_instr *def)
747ec681f3Smrg{
757ec681f3Smrg   return (def->op == AGX_OPCODE_FADD)
767ec681f3Smrg      && agx_is_equiv(def->src[1], agx_negzero());
777ec681f3Smrg}
787ec681f3Smrg
797ec681f3Smrg/* Compose floating-point modifiers with floating-point sources */
807ec681f3Smrg
817ec681f3Smrgstatic agx_index
827ec681f3Smrgagx_compose_float_src(agx_index to, agx_index from)
837ec681f3Smrg{
847ec681f3Smrg   if (to.abs)
857ec681f3Smrg      from.neg = false;
867ec681f3Smrg
877ec681f3Smrg   from.abs |= to.abs;
887ec681f3Smrg   from.neg |= to.neg;
897ec681f3Smrg
907ec681f3Smrg   return from;
917ec681f3Smrg}
927ec681f3Smrg
937ec681f3Smrgstatic void
947ec681f3Smrgagx_optimizer_fmov(agx_instr **defs, agx_instr *ins, unsigned srcs)
957ec681f3Smrg{
967ec681f3Smrg   for (unsigned s = 0; s < srcs; ++s) {
977ec681f3Smrg      agx_index src = ins->src[s];
987ec681f3Smrg      if (src.type != AGX_INDEX_NORMAL) continue;
997ec681f3Smrg
1007ec681f3Smrg      agx_instr *def = defs[src.value];
1017ec681f3Smrg      if (!agx_is_fmov(def)) continue;
1027ec681f3Smrg      if (def->saturate) continue;
1037ec681f3Smrg
1047ec681f3Smrg      ins->src[s] = agx_compose_float_src(src, def->src[0]);
1057ec681f3Smrg   }
1067ec681f3Smrg}
1077ec681f3Smrg
/* Fold immediate sources into instructions that can encode them inline.
 * Immediates encode in 8 bits: integers must round-trip through uint8_t,
 * floats must be exactly representable as an AGX 8-bit minifloat.
 * Sources that cannot be encoded are left untouched. */
static void
agx_optimizer_inline_imm(agx_instr **defs, agx_instr *I,
      unsigned srcs, bool is_float)
{
   for (unsigned s = 0; s < srcs; ++s) {
      agx_index src = I->src[s];
      if (src.type != AGX_INDEX_NORMAL) continue;

      /* Only fold sources defined by an immediate move */
      agx_instr *def = defs[src.value];
      if (def->op != AGX_OPCODE_MOV_IMM) continue;

      /* Deliberate truncation to 8 bits; checked for round-trip below */
      uint8_t value = def->imm;
      bool float_src = is_float;

      /* cmpselsrc takes integer immediates only */
      if (s >= 2 && I->op == AGX_OPCODE_FCMPSEL) float_src = false;

      if (float_src) {
         bool fp16 = (def->dest[0].size == AGX_SIZE_16);
         assert(fp16 || (def->dest[0].size == AGX_SIZE_32));

         /* Decode the stored bits to a float, then re-encode as a
          * minifloat only if that encoding is exact */
         float f = fp16 ? _mesa_half_to_float(def->imm) : uif(def->imm);
         if (!agx_minifloat_exact(f)) continue;

         value = agx_minifloat_encode(f);
      } else if (value != def->imm) {
         /* Integer immediate does not fit in 8 bits; cannot inline */
         continue;
      }

      I->src[s].type = AGX_INDEX_IMMEDIATE;
      I->src[s].value = value;
   }
}
1417ec681f3Smrg
1427ec681f3Smrgstatic bool
1437ec681f3Smrgagx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
1447ec681f3Smrg{
1457ec681f3Smrg   if (!agx_is_fmov(use)) return false;
1467ec681f3Smrg   if (use->src[0].neg || use->src[0].abs) return false;
1477ec681f3Smrg
1487ec681f3Smrg   /* saturate(saturate(x)) = saturate(x) */
1497ec681f3Smrg   I->saturate |= use->saturate;
1507ec681f3Smrg   I->dest[0] = use->dest[0];
1517ec681f3Smrg   return true;
1527ec681f3Smrg}
1537ec681f3Smrg
1547ec681f3Smrgstatic void
1557ec681f3Smrgagx_optimizer_forward(agx_context *ctx)
1567ec681f3Smrg{
1577ec681f3Smrg   agx_instr **defs = calloc(ctx->alloc, sizeof(*defs));
1587ec681f3Smrg
1597ec681f3Smrg   agx_foreach_instr_global(ctx, I) {
1607ec681f3Smrg      struct agx_opcode_info info = agx_opcodes_info[I->op];
1617ec681f3Smrg
1627ec681f3Smrg      for (unsigned d = 0; d < info.nr_dests; ++d) {
1637ec681f3Smrg         if (I->dest[d].type == AGX_INDEX_NORMAL)
1647ec681f3Smrg            defs[I->dest[d].value] = I;
1657ec681f3Smrg      }
1667ec681f3Smrg
1677ec681f3Smrg      /* Propagate fmov down */
1687ec681f3Smrg      if (info.is_float)
1697ec681f3Smrg         agx_optimizer_fmov(defs, I, info.nr_srcs);
1707ec681f3Smrg
1717ec681f3Smrg      /* Inline immediates if we can. TODO: systematic */
1727ec681f3Smrg      if (I->op != AGX_OPCODE_ST_VARY && I->op != AGX_OPCODE_ST_TILE && I->op != AGX_OPCODE_P_EXTRACT && I->op != AGX_OPCODE_P_COMBINE)
1737ec681f3Smrg         agx_optimizer_inline_imm(defs, I, info.nr_srcs, info.is_float);
1747ec681f3Smrg   }
1757ec681f3Smrg
1767ec681f3Smrg   free(defs);
1777ec681f3Smrg}
1787ec681f3Smrg
/* Backward pass: walk instructions in reverse, tracking for each SSA value
 * its (unique) use, so that single-use defs can absorb a following fmov. */
static void
agx_optimizer_backward(agx_context *ctx)
{
   /* uses[v] = first use of value v seen in the reverse walk; the
    * `multiple` bitset marks values with more than one use, for which
    * uses[v] is not meaningful */
   agx_instr **uses = calloc(ctx->alloc, sizeof(*uses));
   BITSET_WORD *multiple = calloc(BITSET_WORDS(ctx->alloc), sizeof(*multiple));

   agx_foreach_instr_global_rev(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      /* Record this instruction as a use of each of its normal sources */
      for (unsigned s = 0; s < info.nr_srcs; ++s) {
         if (I->src[s].type == AGX_INDEX_NORMAL) {
            unsigned v = I->src[s].value;

            if (uses[v])
               BITSET_SET(multiple, v);
            else
               uses[v] = I;
         }
      }

      /* Only single-destination instructions are candidates */
      if (info.nr_dests != 1)
         continue;

      if (I->dest[0].type != AGX_INDEX_NORMAL)
         continue;

      /* Reverse order guarantees all uses of I's def were already seen */
      agx_instr *use = uses[I->dest[0].value];

      if (!use || BITSET_TEST(multiple, I->dest[0].value))
         continue;

      /* Destination has a single use, try to propagate */
      if (info.is_float && agx_optimizer_fmov_rev(I, use)) {
         agx_remove_instruction(use);
         continue;
      }
   }

   free(uses);
   free(multiple);
}
2207ec681f3Smrg
/* Run the peephole optimizer: the backward pass first (pulling fsat/f2f16
 * style modifiers back into defs), then the forward pass (pushing fmov
 * modifiers and immediates into uses). Each pass converges in one
 * linear-time traversal. */
void
agx_optimizer(agx_context *ctx)
{
   agx_optimizer_backward(ctx);
   agx_optimizer_forward(ctx);
}
227