/*
 * Copyright (C) 2021 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
237ec681f3Smrg
247ec681f3Smrg#include "ir3_ra.h"
257ec681f3Smrg#include "ir3_shader.h"
267ec681f3Smrg
/* Source of a single parallel-copy entry: either an immediate value, a
 * constant-file register, or a physical GPR, discriminated by "flags".
 */
struct copy_src {
   unsigned flags;         /* 0 (plain physreg), IR3_REG_IMMED, or IR3_REG_CONST */
   union {
      uint32_t imm;        /* valid when flags & IR3_REG_IMMED */
      physreg_t reg;       /* valid when flags == 0 */
      unsigned const_num;  /* valid when flags & IR3_REG_CONST */
   };
};
357ec681f3Smrg
/* One pending copy: move "src" into the physical register "dst".
 * "flags" carries IR3_REG_HALF/IR3_REG_SHARED describing the destination;
 * "done" marks entries already emitted while resolving the transfer graph.
 */
struct copy_entry {
   physreg_t dst;
   unsigned flags;
   bool done;

   struct copy_src src;
};
437ec681f3Smrg
447ec681f3Smrgstatic unsigned
457ec681f3Smrgcopy_entry_size(const struct copy_entry *entry)
467ec681f3Smrg{
477ec681f3Smrg   return (entry->flags & IR3_REG_HALF) ? 1 : 2;
487ec681f3Smrg}
497ec681f3Smrg
507ec681f3Smrgstatic struct copy_src
517ec681f3Smrgget_copy_src(const struct ir3_register *reg, unsigned offset)
527ec681f3Smrg{
537ec681f3Smrg   if (reg->flags & IR3_REG_IMMED) {
547ec681f3Smrg      return (struct copy_src){
557ec681f3Smrg         .flags = IR3_REG_IMMED,
567ec681f3Smrg         .imm = reg->uim_val,
577ec681f3Smrg      };
587ec681f3Smrg   } else if (reg->flags & IR3_REG_CONST) {
597ec681f3Smrg      return (struct copy_src){
607ec681f3Smrg         .flags = IR3_REG_CONST,
617ec681f3Smrg         .const_num = reg->num,
627ec681f3Smrg      };
637ec681f3Smrg   } else {
647ec681f3Smrg      return (struct copy_src){
657ec681f3Smrg         .flags = 0,
667ec681f3Smrg         .reg = ra_reg_get_physreg(reg) + offset,
677ec681f3Smrg      };
687ec681f3Smrg   }
697ec681f3Smrg}
707ec681f3Smrg
717ec681f3Smrgstatic void
727ec681f3Smrgdo_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num,
737ec681f3Smrg       unsigned src2_num, unsigned flags)
747ec681f3Smrg{
757ec681f3Smrg   struct ir3_instruction * xor
767ec681f3Smrg      = ir3_instr_create(instr->block, OPC_XOR_B, 1, 2);
777ec681f3Smrg   ir3_dst_create(xor, dst_num, flags);
787ec681f3Smrg   ir3_src_create(xor, src1_num, flags);
797ec681f3Smrg   ir3_src_create(xor, src2_num, flags);
807ec681f3Smrg
817ec681f3Smrg   ir3_instr_move_before(xor, instr);
827ec681f3Smrg}
837ec681f3Smrg
/* Emit instructions before "instr" that swap entry->src.reg with entry->dst.
 * The source must be a plain register (asserted below) -- immediates and
 * consts cannot be swap operands.
 */
static void
do_swap(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   assert(!entry->src.flags);

   if (entry->flags & IR3_REG_HALF) {
      /* We currently make sure to never emit parallel copies where the
       * source/destination is a half-reg above the range accessible to half
       * registers. However, when a full-reg source overlaps a half-reg
       * destination or vice versa, it can be very, very complicated to come
       * up with a series of "legal" swaps and copies to resolve the
       * parallel copy. So here we provide a fallback to implement the
       * "illegal" swap instead. This may also be useful for implementing
       * "spilling" half-regs to the inaccessible space.
       */
      if (entry->src.reg >= RA_HALF_SIZE) {
         /* Choose a temporary that doesn't overlap src or dst */
         physreg_t tmp = entry->dst < 2 ? 2 : 0;

         /* Swap src and the temporary (as full 32-bit registers, hence
          * clearing IR3_REG_HALF and aligning src down with & ~1u).
          */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* If src and dst are within the same full register, then swapping src
          * with tmp above will also move dst to tmp. Account for that here.
          */
         unsigned dst =
            (entry->src.reg & ~1u) == (entry->dst & ~1u) ?
            tmp + (entry->dst & 1u) : entry->dst;

         /* Do the original swap with src replaced with tmp */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = tmp + (entry->src.reg & 1)},
                    .dst = dst,
                    .flags = entry->flags,
                 });

         /* Swap src and the temporary back */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->src.reg & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* If dst is not addressable, we only need to swap the arguments and
       * let the case above handle it.
       */
      if (entry->dst >= RA_HALF_SIZE) {
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst},
                    .dst = entry->src.reg,
                    .flags = entry->flags,
                 });
         return;
      }
   }

   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* a5xx+ is known to support swz, which enables us to swap two registers
    * in-place. If unsupported we emulate it using the xor trick.
    */
   if (compiler->gen < 5) {
      /* Shared regs only exist since a5xx, so we don't have to provide a
       * fallback path for them.
       */
      assert(!(entry->flags & IR3_REG_SHARED));
      /* Classic three-xor in-place swap: dst ^= src; src ^= dst; dst ^= src */
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
      do_xor(instr, src_num, src_num, dst_num, entry->flags);
      do_xor(instr, dst_num, dst_num, src_num, entry->flags);
   } else {
      /* Use a macro for shared regs because any shared reg writes need to
       * be wrapped in a getone block to work correctly. Writing shared regs
       * with multiple threads active does not work, even if they all return
       * the same value.
       */
      unsigned opc =
         (entry->flags & IR3_REG_SHARED) ? OPC_SWZ_SHARED_MACRO : OPC_SWZ;
      /* swz with two dsts/srcs (and repeat = 1) performs both moves of the
       * swap in a single instruction.
       */
      struct ir3_instruction *swz = ir3_instr_create(instr->block, opc, 2, 2);
      ir3_dst_create(swz, dst_num, entry->flags);
      ir3_dst_create(swz, src_num, entry->flags);
      ir3_src_create(swz, src_num, entry->flags);
      ir3_src_create(swz, dst_num, entry->flags);
      swz->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
      swz->repeat = 1;
      ir3_instr_move_before(swz, instr);
   }
}
1847ec681f3Smrg
/* Emit instructions before "instr" that copy entry->src (immediate, const,
 * or register) into entry->dst, including fallbacks for half-reg
 * sources/destinations outside the half-addressable range.
 */
static void
do_copy(struct ir3_compiler *compiler, struct ir3_instruction *instr,
        const struct copy_entry *entry)
{
   if (entry->flags & IR3_REG_HALF) {
      /* See do_swap() for why this is here. */
      if (entry->dst >= RA_HALF_SIZE) {
         /* TODO: is there a hw instruction we can use for this case? */
         physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 2 : 0;

         /* Swap the full register containing dst into the addressable tmp,
          * copy into it there, then swap it back.
          */
         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });

         /* Similar to in do_swap(), account for src being swapped with tmp if
          * src and dst are in the same register.
          */
         struct copy_src src = entry->src;
         if (!src.flags && (src.reg & ~1u) == (entry->dst & ~1u))
            src.reg = tmp + (src.reg & 1u);

         do_copy(compiler, instr,
                 &(struct copy_entry){
                    .src = src,
                    .dst = tmp + (entry->dst & 1),
                    .flags = entry->flags,
                 });

         do_swap(compiler, instr,
                 &(struct copy_entry){
                    .src = {.reg = entry->dst & ~1u},
                    .dst = tmp,
                    .flags = entry->flags & ~IR3_REG_HALF,
                 });
         return;
      }

      /* Half-reg source above the addressable range: read the containing
       * full register instead and extract the right 16-bit half.
       */
      if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) {
         unsigned src_num = ra_physreg_to_num(entry->src.reg & ~1u,
                                              entry->flags & ~IR3_REG_HALF);
         unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

         if (entry->src.reg % 2 == 0) {
            /* Low half: cov.u32u16 dst, src truncates to the bottom 16 bits */
            struct ir3_instruction *cov =
               ir3_instr_create(instr->block, OPC_MOV, 1, 1);
            ir3_dst_create(cov, dst_num, entry->flags);
            ir3_src_create(cov, src_num, entry->flags & ~IR3_REG_HALF);
            cov->cat1.dst_type = TYPE_U16;
            cov->cat1.src_type = TYPE_U32;
            ir3_instr_move_before(cov, instr);
         } else {
            /* High half: shr.b dst, src, (16) */
            struct ir3_instruction *shr =
               ir3_instr_create(instr->block, OPC_SHR_B, 1, 2);
            ir3_dst_create(shr, dst_num, entry->flags);
            ir3_src_create(shr, src_num, entry->flags & ~IR3_REG_HALF);
            ir3_src_create(shr, 0, IR3_REG_IMMED)->uim_val = 16;
            ir3_instr_move_before(shr, instr);
         }
         return;
      }
   }

   /* src_num is only meaningful for plain-register sources; for
    * immediates/consts the src operand is patched below.
    */
   unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags);
   unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags);

   /* Similar to the swap case, we have to use a macro for shared regs. */
   unsigned opc =
      (entry->flags & IR3_REG_SHARED) ? OPC_READ_FIRST_MACRO : OPC_MOV;
   struct ir3_instruction *mov = ir3_instr_create(instr->block, opc, 1, 1);
   ir3_dst_create(mov, dst_num, entry->flags);
   ir3_src_create(mov, src_num, entry->flags | entry->src.flags);
   mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   if (entry->src.flags & IR3_REG_IMMED)
      mov->srcs[0]->uim_val = entry->src.imm;
   else if (entry->src.flags & IR3_REG_CONST)
      mov->srcs[0]->num = entry->src.const_num;
   ir3_instr_move_before(mov, instr);
}
2697ec681f3Smrg
/* Scratch state for sequentializing one set of parallel copies
 * (see _handle_copies()). Indexed by physreg.
 */
struct copy_ctx {
   /* For each physreg, the number of pending copy entries that use it as a
    * source. Once this drops to zero, then the physreg is unblocked and can
    * be moved to.
    */
   unsigned physreg_use_count[RA_MAX_FILE_SIZE];

   /* For each physreg, the pending copy_entry that uses it as a dest. */
   struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE];

   /* Worklist of copies; may grow during processing when full-reg copies
    * are split into half-reg copies (split_32bit_copy()).
    */
   struct copy_entry entries[RA_MAX_FILE_SIZE];
   unsigned entry_count;
};
2837ec681f3Smrg
2847ec681f3Smrgstatic bool
2857ec681f3Smrgentry_blocked(struct copy_entry *entry, struct copy_ctx *ctx)
2867ec681f3Smrg{
2877ec681f3Smrg   for (unsigned i = 0; i < copy_entry_size(entry); i++) {
2887ec681f3Smrg      if (ctx->physreg_use_count[entry->dst + i] != 0)
2897ec681f3Smrg         return true;
2907ec681f3Smrg   }
2917ec681f3Smrg
2927ec681f3Smrg   return false;
2937ec681f3Smrg}
2947ec681f3Smrg
/* Split a pending full-reg (32-bit) copy into two half-reg (16-bit) copies:
 * "entry" is narrowed in place to cover the low half, and a new entry is
 * appended for the high half. Only valid for register sources.
 */
static void
split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry)
{
   assert(!entry->done);
   assert(!(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST)));
   assert(copy_entry_size(entry) == 2);
   struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++];

   new_entry->dst = entry->dst + 1;
   new_entry->src.flags = entry->src.flags;
   new_entry->src.reg = entry->src.reg + 1;
   new_entry->done = false;
   /* Narrow the original entry first so the flag is inherited below. */
   entry->flags |= IR3_REG_HALF;
   new_entry->flags = entry->flags;
   /* The original entry already owns physreg_dst[dst]; register the new
    * entry as the writer of the high half.
    */
   ctx->physreg_dst[entry->dst + 1] = new_entry;
}
3117ec681f3Smrg
/* Sequentialize the parallel copies in ctx->entries, emitting moves/swaps
 * before "instr". All entries must belong to one register file (half, full,
 * or shared) so their physregs index a common namespace.
 */
static void
_handle_copies(struct ir3_compiler *compiler, struct ir3_instruction *instr,
               struct copy_ctx *ctx)
{
   /* Set up the bookkeeping */
   memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst));
   memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count));

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      for (unsigned j = 0; j < copy_entry_size(entry); j++) {
         if (!entry->src.flags)
            ctx->physreg_use_count[entry->src.reg + j]++;

         /* Copies should not have overlapping destinations. */
         assert(!ctx->physreg_dst[entry->dst + j]);
         ctx->physreg_dst[entry->dst + j] = entry;
      }
   }

   bool progress = true;
   while (progress) {
      progress = false;

      /* Step 1: resolve paths in the transfer graph. This means finding
       * copies whose destination aren't blocked by something else and then
       * emitting them, continuing this process until every copy is blocked
       * and there are only cycles left.
       *
       * TODO: We should note that src is also available in dst to unblock
       * cycles that src is involved in.
       */

      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (!entry->done && !entry_blocked(entry, ctx)) {
            entry->done = true;
            progress = true;
            do_copy(compiler, instr, entry);
            /* The source regs are no longer needed and the dest regs are no
             * longer pending writes; update the bookkeeping accordingly.
             */
            for (unsigned j = 0; j < copy_entry_size(entry); j++) {
               if (!entry->src.flags)
                  ctx->physreg_use_count[entry->src.reg + j]--;
               ctx->physreg_dst[entry->dst + j] = NULL;
            }
         }
      }

      if (progress)
         continue;

      /* Step 2: Find partially blocked copies and split them. In the
       * mergedregs case, we can split 32-bit copies which are only blocked
       * on one 16-bit half, and splitting them helps get things moving.
       *
       * We can skip splitting copies if the source isn't a register,
       * however, because it does not unblock anything and therefore doesn't
       * contribute to making forward progress with step 1. These copies
       * should still be resolved eventually in step 1 because they can't be
       * part of a cycle.
       */
      for (unsigned i = 0; i < ctx->entry_count; i++) {
         struct copy_entry *entry = &ctx->entries[i];
         if (entry->done || entry->flags & IR3_REG_HALF)
            continue;

         if (((ctx->physreg_use_count[entry->dst] == 0 ||
               ctx->physreg_use_count[entry->dst + 1] == 0)) &&
             !(entry->src.flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
            split_32bit_copy(ctx, entry);
            progress = true;
         }
      }
   }

   /* Step 3: resolve cycles through swapping.
    *
    * At this point, the transfer graph should consist of only cycles.
    * The reason is that, given any physreg n_1 that's the source of a
    * remaining entry, it has a destination n_2, which (because every
    * copy is blocked) is the source of some other copy whose destination
    * is n_3, and so we can follow the chain until we get a cycle. If we
    * reached some other node than n_1:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *          ^             |
    *          |-------------|
    *
    *  then n_2 would be the destination of 2 copies, which is illegal
    *  (checked above in an assert). So n_1 must be part of a cycle:
    *
    *  n_1 -> n_2 -> ... -> n_i
    *  ^                     |
    *  |---------------------|
    *
    *  and this must be only cycle n_1 is involved in, because any other
    *  path starting from n_1 would also have to end in n_1, resulting in
    *  a node somewhere along the way being the destination of 2 copies
    *  when the 2 paths merge.
    *
    *  The way we resolve the cycle is through picking a copy (n_1, n_2)
    *  and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
    *  out of the cycle:
    *
    *  n_1 -> ... -> n_i
    *  ^              |
    *  |--------------|
    *
    *  and we can keep repeating this until the cycle is empty.
    */

   for (unsigned i = 0; i < ctx->entry_count; i++) {
      struct copy_entry *entry = &ctx->entries[i];
      if (entry->done)
         continue;

      /* Immediates/consts can't be part of a cycle (nothing writes them). */
      assert(!entry->src.flags);

      /* catch trivial copies */
      if (entry->dst == entry->src.reg) {
         entry->done = true;
         continue;
      }

      do_swap(compiler, instr, entry);

      /* Split any blocking copies whose sources are only partially
       * contained within our destination.
       */
      if (entry->flags & IR3_REG_HALF) {
         for (unsigned j = 0; j < ctx->entry_count; j++) {
            struct copy_entry *blocking = &ctx->entries[j];

            if (blocking->done)
               continue;

            if (blocking->src.reg <= entry->dst &&
                blocking->src.reg + 1 >= entry->dst &&
                !(blocking->flags & IR3_REG_HALF)) {
               split_32bit_copy(ctx, blocking);
            }
         }
      }

      /* Update sources of blocking copies.
       *
       * Note: at this point, every blocking copy's source should be
       * contained within our destination.
       */
      for (unsigned j = 0; j < ctx->entry_count; j++) {
         struct copy_entry *blocking = &ctx->entries[j];
         if (blocking->src.reg >= entry->dst &&
             blocking->src.reg < entry->dst + copy_entry_size(entry)) {
            blocking->src.reg =
               entry->src.reg + (blocking->src.reg - entry->dst);
         }
      }

      entry->done = true;
   }
}
4727ec681f3Smrg
4737ec681f3Smrgstatic void
4747ec681f3Smrghandle_copies(struct ir3_shader_variant *v, struct ir3_instruction *instr,
4757ec681f3Smrg              struct copy_entry *entries, unsigned entry_count)
4767ec681f3Smrg{
4777ec681f3Smrg   struct copy_ctx ctx;
4787ec681f3Smrg
4797ec681f3Smrg   /* handle shared copies first */
4807ec681f3Smrg   ctx.entry_count = 0;
4817ec681f3Smrg   for (unsigned i = 0; i < entry_count; i++) {
4827ec681f3Smrg      if (entries[i].flags & IR3_REG_SHARED)
4837ec681f3Smrg         ctx.entries[ctx.entry_count++] = entries[i];
4847ec681f3Smrg   }
4857ec681f3Smrg   _handle_copies(v->shader->compiler, instr, &ctx);
4867ec681f3Smrg
4877ec681f3Smrg   if (v->mergedregs) {
4887ec681f3Smrg      /* Half regs and full regs are in the same file, so handle everything
4897ec681f3Smrg       * at once.
4907ec681f3Smrg       */
4917ec681f3Smrg      ctx.entry_count = 0;
4927ec681f3Smrg      for (unsigned i = 0; i < entry_count; i++) {
4937ec681f3Smrg         if (!(entries[i].flags & IR3_REG_SHARED))
4947ec681f3Smrg            ctx.entries[ctx.entry_count++] = entries[i];
4957ec681f3Smrg      }
4967ec681f3Smrg      _handle_copies(v->shader->compiler, instr, &ctx);
4977ec681f3Smrg   } else {
4987ec681f3Smrg      /* There may be both half copies and full copies, so we have to split
4997ec681f3Smrg       * them up since they don't interfere.
5007ec681f3Smrg       */
5017ec681f3Smrg      ctx.entry_count = 0;
5027ec681f3Smrg      for (unsigned i = 0; i < entry_count; i++) {
5037ec681f3Smrg         if (entries[i].flags & IR3_REG_HALF)
5047ec681f3Smrg            ctx.entries[ctx.entry_count++] = entries[i];
5057ec681f3Smrg      }
5067ec681f3Smrg      _handle_copies(v->shader->compiler, instr, &ctx);
5077ec681f3Smrg
5087ec681f3Smrg      ctx.entry_count = 0;
5097ec681f3Smrg      for (unsigned i = 0; i < entry_count; i++) {
5107ec681f3Smrg         if (!(entries[i].flags & (IR3_REG_HALF | IR3_REG_SHARED)))
5117ec681f3Smrg            ctx.entries[ctx.entry_count++] = entries[i];
5127ec681f3Smrg      }
5137ec681f3Smrg      _handle_copies(v->shader->compiler, instr, &ctx);
5147ec681f3Smrg   }
5157ec681f3Smrg}
5167ec681f3Smrg
/* Lower all remaining meta instructions (parallel copies, collects, splits,
 * phis) into real moves/swaps now that registers have been allocated, and
 * delete the meta instructions.
 */
void
ir3_lower_copies(struct ir3_shader_variant *v)
{
   /* Reusable scratch list of copies for the current instruction. */
   DECLARE_ARRAY(struct copy_entry, copies);
   copies_count = copies_sz = 0;
   copies = NULL;

   foreach_block (block, &v->ir->block_list) {
      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc == OPC_META_PARALLEL_COPY) {
            /* One copy entry per element of each dst/src pair. */
            copies_count = 0;
            for (unsigned i = 0; i < instr->dsts_count; i++) {
               struct ir3_register *dst = instr->dsts[i];
               struct ir3_register *src = instr->srcs[i];
               unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
               unsigned dst_physreg = ra_reg_get_physreg(dst);
               for (unsigned j = 0; j < reg_elems(dst); j++) {
                  array_insert(
                     NULL, copies,
                     (struct copy_entry){
                        .dst = dst_physreg + j * reg_elem_size(dst),
                        .src = get_copy_src(src, j * reg_elem_size(dst)),
                        .flags = flags,
                     });
               }
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_COLLECT) {
            /* Collect: gather each source into consecutive elements of dst. */
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            for (unsigned i = 0; i < instr->srcs_count; i++) {
               struct ir3_register *src = instr->srcs[i];
               array_insert(NULL, copies,
                            (struct copy_entry){
                               .dst = ra_num_to_physreg(dst->num + i, flags),
                               .src = get_copy_src(src, 0),
                               .flags = flags,
                            });
            }
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_SPLIT) {
            /* Split: copy one element (split.off) out of the source vector. */
            copies_count = 0;
            struct ir3_register *dst = instr->dsts[0];
            struct ir3_register *src = instr->srcs[0];
            unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED);
            array_insert(NULL, copies,
                         (struct copy_entry){
                            .dst = ra_reg_get_physreg(dst),
                            .src = get_copy_src(
                               src, instr->split.off * reg_elem_size(dst)),
                            .flags = flags,
                         });
            handle_copies(v, instr, copies, copies_count);
            list_del(&instr->node);
         } else if (instr->opc == OPC_META_PHI) {
            /* Phis are fully resolved by RA; just delete them. */
            list_del(&instr->node);
         }
      }
   }

   if (copies)
      ralloc_free(copies);
}
583