17ec681f3Smrg/*
27ec681f3Smrg * Copyright (C) 2021 Valve Corporation
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
217ec681f3Smrg * SOFTWARE.
227ec681f3Smrg */
237ec681f3Smrg
247ec681f3Smrg#include "ir3_compiler.h"
257ec681f3Smrg#include "ir3_ra.h"
267ec681f3Smrg#include "ralloc.h"
277ec681f3Smrg
287ec681f3Smrg/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
297ec681f3Smrg * of parallelcopy's to trivially turn the program into CSSA form. Then we
307ec681f3Smrg * try to "merge" SSA def's into "merge sets" which could be allocated to a
317ec681f3Smrg * single register in order to eliminate copies. First we merge phi nodes,
327ec681f3Smrg * which should always succeed because of the parallelcopy's we inserted, and
337ec681f3Smrg * then we try to coalesce the copies we introduced.
347ec681f3Smrg *
357ec681f3Smrg * The merged registers are used for three purposes:
367ec681f3Smrg *
377ec681f3Smrg * 1. We always use the same pvtmem slot for spilling all SSA defs in each
387ec681f3Smrg * merge set. This prevents us from having to insert memory-to-memory copies
 * in the spiller and makes sure we don't insert unnecessary copies.
407ec681f3Smrg * 2. When two values are live at the same time, part of the same merge
417ec681f3Smrg * set, and they overlap each other in the merge set, they always occupy
427ec681f3Smrg * overlapping physical registers in RA. This reduces register pressure and
437ec681f3Smrg * copies in several important scenarios:
447ec681f3Smrg *	- When sources of a collect are used later by something else, we don't
457ec681f3Smrg *	have to introduce copies.
467ec681f3Smrg *	- We can handle sequences of extracts that "explode" a vector into its
477ec681f3Smrg *	components without any additional copying.
487ec681f3Smrg * 3. We use the merge sets for affinities in register allocation: That is, we
497ec681f3Smrg * try to allocate all the definitions in the same merge set to the
507ec681f3Smrg * same/compatible registers. This helps us e.g. allocate sources of a collect
517ec681f3Smrg * to contiguous registers without too much special code in RA.
527ec681f3Smrg *
537ec681f3Smrg * In a "normal" register allocator, or when spilling, we'd just merge
547ec681f3Smrg * registers in the same merge set to the same register, but with SSA-based
557ec681f3Smrg * register allocation we may have to split the live interval.
567ec681f3Smrg *
577ec681f3Smrg * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
597ec681f3Smrg * implementation in nir_from_ssa, with the twist that we also try to coalesce
607ec681f3Smrg * META_SPLIT and META_COLLECT. This makes this pass more complicated but
617ec681f3Smrg * prevents us from needing to handle these specially in RA and the spiller,
627ec681f3Smrg * which are already complicated enough. This also forces us to implement that
637ec681f3Smrg * value-comparison optimization they explain, as without it we wouldn't be
647ec681f3Smrg * able to coalesce META_SPLIT even in the simplest of cases.
657ec681f3Smrg */
667ec681f3Smrg
677ec681f3Smrg/* In order to dynamically reconstruct the dominance forest, we need the
687ec681f3Smrg * instructions ordered by a preorder traversal of the dominance tree:
697ec681f3Smrg */
707ec681f3Smrg
717ec681f3Smrgstatic unsigned
727ec681f3Smrgindex_instrs(struct ir3_block *block, unsigned index)
737ec681f3Smrg{
747ec681f3Smrg   foreach_instr (instr, &block->instr_list)
757ec681f3Smrg      instr->ip = index++;
767ec681f3Smrg
777ec681f3Smrg   for (unsigned i = 0; i < block->dom_children_count; i++)
787ec681f3Smrg      index = index_instrs(block->dom_children[i], index);
797ec681f3Smrg
807ec681f3Smrg   return index;
817ec681f3Smrg}
827ec681f3Smrg
837ec681f3Smrg/* Definitions within a merge set are ordered by instr->ip as set above: */
847ec681f3Smrg
857ec681f3Smrgstatic bool
867ec681f3Smrgdef_after(struct ir3_register *a, struct ir3_register *b)
877ec681f3Smrg{
887ec681f3Smrg   return a->instr->ip > b->instr->ip;
897ec681f3Smrg}
907ec681f3Smrg
917ec681f3Smrgstatic bool
927ec681f3Smrgdef_dominates(struct ir3_register *a, struct ir3_register *b)
937ec681f3Smrg{
947ec681f3Smrg   if (def_after(a, b)) {
957ec681f3Smrg      return false;
967ec681f3Smrg   } else if (a->instr->block == b->instr->block) {
977ec681f3Smrg      return def_after(b, a);
987ec681f3Smrg   } else {
997ec681f3Smrg      return ir3_block_dominates(a->instr->block, b->instr->block);
1007ec681f3Smrg   }
1017ec681f3Smrg}
1027ec681f3Smrg
/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg; /* register containing the region */
   unsigned offset, size;    /* region start/extent, same units as reg_size() */
};
1107ec681f3Smrg
1117ec681f3Smrg/* Chase any copies to get the source of a region inside a register. This is
1127ec681f3Smrg * Value(a) in the paper.
1137ec681f3Smrg */
1147ec681f3Smrgstatic struct def_value
1157ec681f3Smrgchase_copies(struct def_value value)
1167ec681f3Smrg{
1177ec681f3Smrg   while (true) {
1187ec681f3Smrg      struct ir3_instruction *instr = value.reg->instr;
1197ec681f3Smrg      if (instr->opc == OPC_META_SPLIT) {
1207ec681f3Smrg         value.offset += instr->split.off * reg_elem_size(value.reg);
1217ec681f3Smrg         value.reg = instr->srcs[0]->def;
1227ec681f3Smrg      } else if (instr->opc == OPC_META_COLLECT) {
1237ec681f3Smrg         if (value.offset % reg_elem_size(value.reg) != 0 ||
1247ec681f3Smrg             value.size > reg_elem_size(value.reg) ||
1257ec681f3Smrg             value.offset + value.size > reg_size(value.reg))
1267ec681f3Smrg            break;
1277ec681f3Smrg         struct ir3_register *src =
1287ec681f3Smrg            instr->srcs[value.offset / reg_elem_size(value.reg)];
1297ec681f3Smrg         if (!src->def)
1307ec681f3Smrg            break;
1317ec681f3Smrg         value.offset = 0;
1327ec681f3Smrg         value.reg = src->def;
1337ec681f3Smrg      } else {
1347ec681f3Smrg         /* TODO: parallelcopy */
1357ec681f3Smrg         break;
1367ec681f3Smrg      }
1377ec681f3Smrg   }
1387ec681f3Smrg
1397ec681f3Smrg   return value;
1407ec681f3Smrg}
1417ec681f3Smrg
/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg; /* SSA def belonging to the merge set */
   unsigned offset;          /* offset of reg from the merge-set base */
};
1497ec681f3Smrg
1507ec681f3Smrgstatic bool
1517ec681f3Smrgcan_skip_interference(const struct merge_def *a, const struct merge_def *b)
1527ec681f3Smrg{
1537ec681f3Smrg   unsigned a_start = a->offset;
1547ec681f3Smrg   unsigned b_start = b->offset;
1557ec681f3Smrg   unsigned a_end = a_start + reg_size(a->reg);
1567ec681f3Smrg   unsigned b_end = b_start + reg_size(b->reg);
1577ec681f3Smrg
1587ec681f3Smrg   /* Registers that don't overlap never interfere */
1597ec681f3Smrg   if (a_end <= b_start || b_end <= a_start)
1607ec681f3Smrg      return true;
1617ec681f3Smrg
1627ec681f3Smrg   /* Disallow skipping interference unless one definition contains the
1637ec681f3Smrg    * other. This restriction is important for register allocation, because
1647ec681f3Smrg    * it means that at any given point in the program, the live values in a
1657ec681f3Smrg    * given merge set will form a tree. If they didn't, then one live value
1667ec681f3Smrg    * would partially overlap another, and they would have overlapping live
1677ec681f3Smrg    * ranges because they're live at the same point. This simplifies register
1687ec681f3Smrg    * allocation and spilling.
1697ec681f3Smrg    */
1707ec681f3Smrg   if (!((a_start <= b_start && a_end >= b_end) ||
1717ec681f3Smrg         (b_start <= a_start && b_end >= a_end)))
1727ec681f3Smrg      return false;
1737ec681f3Smrg
1747ec681f3Smrg   /* For each register, chase the intersection of a and b to find the
1757ec681f3Smrg    * ultimate source.
1767ec681f3Smrg    */
1777ec681f3Smrg   unsigned start = MAX2(a_start, b_start);
1787ec681f3Smrg   unsigned end = MIN2(a_end, b_end);
1797ec681f3Smrg   struct def_value a_value = chase_copies((struct def_value){
1807ec681f3Smrg      .reg = a->reg,
1817ec681f3Smrg      .offset = start - a_start,
1827ec681f3Smrg      .size = end - start,
1837ec681f3Smrg   });
1847ec681f3Smrg   struct def_value b_value = chase_copies((struct def_value){
1857ec681f3Smrg      .reg = b->reg,
1867ec681f3Smrg      .offset = start - b_start,
1877ec681f3Smrg      .size = end - start,
1887ec681f3Smrg   });
1897ec681f3Smrg   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
1907ec681f3Smrg}
1917ec681f3Smrg
1927ec681f3Smrgstatic struct ir3_merge_set *
1937ec681f3Smrgget_merge_set(struct ir3_register *def)
1947ec681f3Smrg{
1957ec681f3Smrg   if (def->merge_set)
1967ec681f3Smrg      return def->merge_set;
1977ec681f3Smrg
1987ec681f3Smrg   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
1997ec681f3Smrg   set->preferred_reg = ~0;
2007ec681f3Smrg   set->interval_start = ~0;
2017ec681f3Smrg   set->spill_slot = ~0;
2027ec681f3Smrg   set->size = reg_size(def);
2037ec681f3Smrg   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
2047ec681f3Smrg   set->regs_count = 1;
2057ec681f3Smrg   set->regs = ralloc(set, struct ir3_register *);
2067ec681f3Smrg   set->regs[0] = def;
2077ec681f3Smrg
2087ec681f3Smrg   return set;
2097ec681f3Smrg}
2107ec681f3Smrg
/* Merges b into a. b_offset is b's position relative to a's base; a negative
 * offset is handled by swapping the operands. The surviving set's regs array
 * stays sorted by instruction ip (dominance preorder), which
 * merge_sets_interfere() relies on.
 */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   /* Standard sorted-list merge of the two ip-ordered regs arrays. Members
    * coming from b get their merge_set_offset rebased and everything is
    * repointed at the surviving set a.
    */
   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but because alignment is only 1 or
    * 2 so far this should be ok.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   /* b may extend past a's end once shifted by b_offset. */
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}
2477ec681f3Smrg
/* Returns true if merging sets a and b (with b placed at b_offset inside a)
 * would put two interfering values in the same set. Walks the union of both
 * sets in dominance-preorder while maintaining a stack of defs that dominate
 * the current one (the "dominance forest" of the paper, reconstructed on the
 * fly), and tests the current def against that stack.
 */
static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   /* Canonicalize so b_offset is non-negative. */
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   /* Dominator stack; can never hold more than every def of both sets.
    * NOTE(review): this is a VLA sized by the two sets' reg counts —
    * presumably bounded enough in practice not to risk stack overflow.
    */
   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      /* Pick the next def in ip order from whichever set's head is
       * earlier; b's members are rebased by b_offset.
       */
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      /* Pop stack entries that don't dominate the current def. */
      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack then dom[dom_index]. In the paper there's a
       * description of a way to do less interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}
3217ec681f3Smrg
3227ec681f3Smrgstatic void
3237ec681f3Smrgtry_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
3247ec681f3Smrg               struct ir3_register *b, unsigned b_offset)
3257ec681f3Smrg{
3267ec681f3Smrg   struct ir3_merge_set *a_set = get_merge_set(a);
3277ec681f3Smrg   struct ir3_merge_set *b_set = get_merge_set(b);
3287ec681f3Smrg
3297ec681f3Smrg   if (a_set == b_set) {
3307ec681f3Smrg      /* Note: Even in this case we may not always successfully be able to
3317ec681f3Smrg       * coalesce this copy, if the offsets don't line up. But in any
3327ec681f3Smrg       * case, we can't do anything.
3337ec681f3Smrg       */
3347ec681f3Smrg      return;
3357ec681f3Smrg   }
3367ec681f3Smrg
3377ec681f3Smrg   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
3387ec681f3Smrg
3397ec681f3Smrg   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
3407ec681f3Smrg      merge_merge_sets(a_set, b_set, b_set_offset);
3417ec681f3Smrg}
3427ec681f3Smrg
3437ec681f3Smrgvoid
3447ec681f3Smrgir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset)
3457ec681f3Smrg{
3467ec681f3Smrg   struct ir3_merge_set *a_set = get_merge_set(a);
3477ec681f3Smrg   struct ir3_merge_set *b_set = get_merge_set(b);
3487ec681f3Smrg
3497ec681f3Smrg   if (a_set == b_set)
3507ec681f3Smrg      return;
3517ec681f3Smrg
3527ec681f3Smrg   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
3537ec681f3Smrg   merge_merge_sets(a_set, b_set, b_set_offset);
3547ec681f3Smrg}
3557ec681f3Smrg
3567ec681f3Smrgstatic void
3577ec681f3Smrgcoalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
3587ec681f3Smrg{
3597ec681f3Smrg   for (unsigned i = 0; i < phi->srcs_count; i++) {
3607ec681f3Smrg      if (phi->srcs[i]->def)
3617ec681f3Smrg         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
3627ec681f3Smrg   }
3637ec681f3Smrg}
3647ec681f3Smrg
3657ec681f3Smrgstatic void
3667ec681f3Smrgaggressive_coalesce_parallel_copy(struct ir3_liveness *live,
3677ec681f3Smrg                                  struct ir3_instruction *pcopy)
3687ec681f3Smrg{
3697ec681f3Smrg   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
3707ec681f3Smrg      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
3717ec681f3Smrg         continue;
3727ec681f3Smrg      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
3737ec681f3Smrg   }
3747ec681f3Smrg}
3757ec681f3Smrg
3767ec681f3Smrgstatic void
3777ec681f3Smrgaggressive_coalesce_split(struct ir3_liveness *live,
3787ec681f3Smrg                          struct ir3_instruction *split)
3797ec681f3Smrg{
3807ec681f3Smrg   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
3817ec681f3Smrg                  split->split.off * reg_elem_size(split->dsts[0]));
3827ec681f3Smrg}
3837ec681f3Smrg
3847ec681f3Smrgstatic void
3857ec681f3Smrgaggressive_coalesce_collect(struct ir3_liveness *live,
3867ec681f3Smrg                            struct ir3_instruction *collect)
3877ec681f3Smrg{
3887ec681f3Smrg   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
3897ec681f3Smrg        offset += reg_elem_size(collect->srcs[i]), i++) {
3907ec681f3Smrg      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
3917ec681f3Smrg         continue;
3927ec681f3Smrg      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
3937ec681f3Smrg   }
3947ec681f3Smrg}
3957ec681f3Smrg
/* Insert a parallel copy at the end of `block` feeding the phis of its
 * successor, rewriting each phi source to read the copy's dst instead. This
 * is the CSSA-form insertion step. Three passes over the successor's phis
 * must stay in lockstep: count, gather sources, then rewrite.
 */
static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      /* Pass 1: count phi sources coming from this edge. Phis are always
       * at the top of a block, so stop at the first non-phi.
       */
      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      /* Pass 2: gather the phi sources for this edge (same filter as the
       * counting pass, so indices line up).
       */
      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      /* Create the parallel copy with one dst/src pair per phi source. */
      struct ir3_instruction *pcopy =
         ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         /* Dst inherits the half/array-ness and size of the source. */
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = src[j]->size;
         reg->wrmask = src[j]->wrmask;
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      /* Pass 3: repoint each phi source at the corresponding copy dst. */
      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         j++;
      }
      assert(j == phi_count);
   }
}
4697ec681f3Smrg
4707ec681f3Smrgvoid
4717ec681f3Smrgir3_create_parallel_copies(struct ir3 *ir)
4727ec681f3Smrg{
4737ec681f3Smrg   foreach_block (block, &ir->block_list) {
4747ec681f3Smrg      create_parallel_copy(block);
4757ec681f3Smrg   }
4767ec681f3Smrg}
4777ec681f3Smrg
4787ec681f3Smrgstatic void
4797ec681f3Smrgindex_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
4807ec681f3Smrg{
4817ec681f3Smrg   unsigned offset = 0;
4827ec681f3Smrg   foreach_block (block, &ir->block_list) {
4837ec681f3Smrg      foreach_instr (instr, &block->instr_list) {
4847ec681f3Smrg         for (unsigned i = 0; i < instr->dsts_count; i++) {
4857ec681f3Smrg            struct ir3_register *dst = instr->dsts[i];
4867ec681f3Smrg
4877ec681f3Smrg            unsigned dst_offset;
4887ec681f3Smrg            struct ir3_merge_set *merge_set = dst->merge_set;
4897ec681f3Smrg            unsigned size = reg_size(dst);
4907ec681f3Smrg            if (merge_set) {
4917ec681f3Smrg               if (merge_set->interval_start == ~0) {
4927ec681f3Smrg                  merge_set->interval_start = offset;
4937ec681f3Smrg                  offset += merge_set->size;
4947ec681f3Smrg               }
4957ec681f3Smrg               dst_offset = merge_set->interval_start + dst->merge_set_offset;
4967ec681f3Smrg            } else {
4977ec681f3Smrg               dst_offset = offset;
4987ec681f3Smrg               offset += size;
4997ec681f3Smrg            }
5007ec681f3Smrg
5017ec681f3Smrg            dst->interval_start = dst_offset;
5027ec681f3Smrg            dst->interval_end = dst_offset + size;
5037ec681f3Smrg         }
5047ec681f3Smrg      }
5057ec681f3Smrg   }
5067ec681f3Smrg
5077ec681f3Smrg   live->interval_offset = offset;
5087ec681f3Smrg}
5097ec681f3Smrg
5107ec681f3Smrg#define RESET      "\x1b[0m"
5117ec681f3Smrg#define BLUE       "\x1b[0;34m"
5127ec681f3Smrg#define SYN_SSA(x) BLUE x RESET
5137ec681f3Smrg
5147ec681f3Smrgstatic void
5157ec681f3Smrgdump_merge_sets(struct ir3 *ir)
5167ec681f3Smrg{
5177ec681f3Smrg   d("merge sets:");
5187ec681f3Smrg   struct set *merge_sets = _mesa_pointer_set_create(NULL);
5197ec681f3Smrg   foreach_block (block, &ir->block_list) {
5207ec681f3Smrg      foreach_instr (instr, &block->instr_list) {
5217ec681f3Smrg         for (unsigned i = 0; i < instr->dsts_count; i++) {
5227ec681f3Smrg            struct ir3_register *dst = instr->dsts[i];
5237ec681f3Smrg
5247ec681f3Smrg            struct ir3_merge_set *merge_set = dst->merge_set;
5257ec681f3Smrg            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
5267ec681f3Smrg               continue;
5277ec681f3Smrg
5287ec681f3Smrg            d("merge set, size %u, align %u:", merge_set->size,
5297ec681f3Smrg              merge_set->alignment);
5307ec681f3Smrg            for (unsigned j = 0; j < merge_set->regs_count; j++) {
5317ec681f3Smrg               struct ir3_register *reg = merge_set->regs[j];
5327ec681f3Smrg               d("\t" SYN_SSA("ssa_%u") ":%u, offset %u",
5337ec681f3Smrg                 reg->instr->serialno, reg->name, reg->merge_set_offset);
5347ec681f3Smrg            }
5357ec681f3Smrg
5367ec681f3Smrg            _mesa_set_add(merge_sets, merge_set);
5377ec681f3Smrg         }
5387ec681f3Smrg      }
5397ec681f3Smrg   }
5407ec681f3Smrg
5417ec681f3Smrg   ralloc_free(merge_sets);
5427ec681f3Smrg}
5437ec681f3Smrg
5447ec681f3Smrgvoid
5457ec681f3Smrgir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
5467ec681f3Smrg{
5477ec681f3Smrg   index_instrs(ir3_start_block(ir), 0);
5487ec681f3Smrg
5497ec681f3Smrg   /* First pass: coalesce phis, which must be together. */
5507ec681f3Smrg   foreach_block (block, &ir->block_list) {
5517ec681f3Smrg      foreach_instr (instr, &block->instr_list) {
5527ec681f3Smrg         if (instr->opc != OPC_META_PHI)
5537ec681f3Smrg            break;
5547ec681f3Smrg
5557ec681f3Smrg         coalesce_phi(live, instr);
5567ec681f3Smrg      }
5577ec681f3Smrg   }
5587ec681f3Smrg
5597ec681f3Smrg   /* Second pass: aggressively coalesce parallelcopy, split, collect */
5607ec681f3Smrg   foreach_block (block, &ir->block_list) {
5617ec681f3Smrg      foreach_instr (instr, &block->instr_list) {
5627ec681f3Smrg         switch (instr->opc) {
5637ec681f3Smrg         case OPC_META_SPLIT:
5647ec681f3Smrg            aggressive_coalesce_split(live, instr);
5657ec681f3Smrg            break;
5667ec681f3Smrg         case OPC_META_COLLECT:
5677ec681f3Smrg            aggressive_coalesce_collect(live, instr);
5687ec681f3Smrg            break;
5697ec681f3Smrg         case OPC_META_PARALLEL_COPY:
5707ec681f3Smrg            aggressive_coalesce_parallel_copy(live, instr);
5717ec681f3Smrg            break;
5727ec681f3Smrg         default:
5737ec681f3Smrg            break;
5747ec681f3Smrg         }
5757ec681f3Smrg      }
5767ec681f3Smrg   }
5777ec681f3Smrg
5787ec681f3Smrg   index_merge_sets(live, ir);
5797ec681f3Smrg
5807ec681f3Smrg   if (ir3_shader_debug & IR3_DBG_RAMSGS)
5817ec681f3Smrg      dump_merge_sets(ir);
5827ec681f3Smrg}
583