17ec681f3Smrg/*
27ec681f3Smrg * Copyright (C) 2019 Google, Inc.
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
217ec681f3Smrg * SOFTWARE.
227ec681f3Smrg *
237ec681f3Smrg * Authors:
247ec681f3Smrg *    Rob Clark <robclark@freedesktop.org>
257ec681f3Smrg */
267ec681f3Smrg
277ec681f3Smrg#include "ir3.h"
287ec681f3Smrg
297ec681f3Smrg/* The maximum number of nop's we may need to insert between two instructions.
307ec681f3Smrg */
317ec681f3Smrg#define MAX_NOPS 6
327ec681f3Smrg
/* The soft delay for approximating the cost of (ss). On a6xx, the number of
 * delay slots it takes to get an SFU result back (i.e. when using nop's
 * instead of (ss)) is:
 *
 *     8 - single warp
 *     9 - two warps
 *    10 - four warps
 *
 * and so on. Not quite sure where it tapers out (i.e. how many warps share an
 * SFU unit), but 10 seems like a reasonable number to choose:
 */
447ec681f3Smrg#define SOFT_SS_NOPS 10
457ec681f3Smrg
467ec681f3Smrg/*
477ec681f3Smrg * Helpers to figure out the necessary delay slots between instructions.  Used
487ec681f3Smrg * both in scheduling pass(es) and the final pass to insert any required nop's
497ec681f3Smrg * so that the shader program is valid.
507ec681f3Smrg *
517ec681f3Smrg * Note that this needs to work both pre and post RA, so we can't assume ssa
527ec681f3Smrg * src iterators work.
537ec681f3Smrg */
547ec681f3Smrg
557ec681f3Smrg/* calculate required # of delay slots between the instruction that
567ec681f3Smrg * assigns a value and the one that consumes
577ec681f3Smrg */
587ec681f3Smrgint
597ec681f3Smrgir3_delayslots(struct ir3_instruction *assigner,
607ec681f3Smrg               struct ir3_instruction *consumer, unsigned n, bool soft)
617ec681f3Smrg{
627ec681f3Smrg   /* generally don't count false dependencies, since this can just be
637ec681f3Smrg    * something like a barrier, or SSBO store.
647ec681f3Smrg    */
657ec681f3Smrg   if (__is_false_dep(consumer, n))
667ec681f3Smrg      return 0;
677ec681f3Smrg
687ec681f3Smrg   /* worst case is cat1-3 (alu) -> cat4/5 needing 6 cycles, normal
697ec681f3Smrg    * alu -> alu needs 3 cycles, cat4 -> alu and texture fetch
707ec681f3Smrg    * handled with sync bits
717ec681f3Smrg    */
727ec681f3Smrg
737ec681f3Smrg   if (is_meta(assigner) || is_meta(consumer))
747ec681f3Smrg      return 0;
757ec681f3Smrg
767ec681f3Smrg   if (writes_addr0(assigner) || writes_addr1(assigner))
777ec681f3Smrg      return 6;
787ec681f3Smrg
797ec681f3Smrg   if (soft && is_sfu(assigner))
807ec681f3Smrg      return SOFT_SS_NOPS;
817ec681f3Smrg
827ec681f3Smrg   /* handled via sync flags: */
837ec681f3Smrg   if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
847ec681f3Smrg      return 0;
857ec681f3Smrg
867ec681f3Smrg   /* As far as we know, shader outputs don't need any delay. */
877ec681f3Smrg   if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
887ec681f3Smrg      return 0;
897ec681f3Smrg
907ec681f3Smrg   /* assigner must be alu: */
917ec681f3Smrg   if (is_flow(consumer) || is_sfu(consumer) || is_tex(consumer) ||
927ec681f3Smrg       is_mem(consumer) || (assigner->dsts[0]->flags & IR3_REG_SHARED)) {
937ec681f3Smrg      return 6;
947ec681f3Smrg   } else {
957ec681f3Smrg      /* In mergedregs mode, there is an extra 2-cycle penalty when half of
967ec681f3Smrg       * a full-reg is read as a half-reg or when a half-reg is read as a
977ec681f3Smrg       * full-reg.
987ec681f3Smrg       */
997ec681f3Smrg      bool mismatched_half = (assigner->dsts[0]->flags & IR3_REG_HALF) !=
1007ec681f3Smrg                             (consumer->srcs[n]->flags & IR3_REG_HALF);
1017ec681f3Smrg      unsigned penalty = mismatched_half ? 3 : 0;
1027ec681f3Smrg      if ((is_mad(consumer->opc) || is_madsh(consumer->opc)) && (n == 2)) {
1037ec681f3Smrg         /* special case, 3rd src to cat3 not required on first cycle */
1047ec681f3Smrg         return 1 + penalty;
1057ec681f3Smrg      } else {
1067ec681f3Smrg         return 3 + penalty;
1077ec681f3Smrg      }
1087ec681f3Smrg   }
1097ec681f3Smrg}
1107ec681f3Smrg
1117ec681f3Smrgstatic bool
1127ec681f3Smrgcount_instruction(struct ir3_instruction *n)
1137ec681f3Smrg{
1147ec681f3Smrg   /* NOTE: don't count branch/jump since we don't know yet if they will
1157ec681f3Smrg    * be eliminated later in resolve_jumps().. really should do that
1167ec681f3Smrg    * earlier so we don't have this constraint.
1177ec681f3Smrg    */
1187ec681f3Smrg   return is_alu(n) ||
1197ec681f3Smrg          (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
1207ec681f3Smrg}
1217ec681f3Smrg
1227ec681f3Smrgstatic unsigned
1237ec681f3Smrgdistance(struct ir3_block *block, struct ir3_instruction *instr, unsigned maxd)
1247ec681f3Smrg{
1257ec681f3Smrg   unsigned d = 0;
1267ec681f3Smrg
1277ec681f3Smrg   /* Note that this relies on incrementally building up the block's
1287ec681f3Smrg    * instruction list.. but this is how scheduling and nopsched
1297ec681f3Smrg    * work.
1307ec681f3Smrg    */
1317ec681f3Smrg   foreach_instr_rev (n, &block->instr_list) {
1327ec681f3Smrg      if ((n == instr) || (d >= maxd))
1337ec681f3Smrg         return MIN2(maxd, d + n->nop);
1347ec681f3Smrg      if (count_instruction(n))
1357ec681f3Smrg         d = MIN2(maxd, d + 1 + n->repeat + n->nop);
1367ec681f3Smrg   }
1377ec681f3Smrg
1387ec681f3Smrg   return maxd;
1397ec681f3Smrg}
1407ec681f3Smrg
1417ec681f3Smrgstatic unsigned
1427ec681f3Smrgdelay_calc_srcn_prera(struct ir3_block *block, struct ir3_instruction *assigner,
1437ec681f3Smrg                      struct ir3_instruction *consumer, unsigned srcn)
1447ec681f3Smrg{
1457ec681f3Smrg   unsigned delay = 0;
1467ec681f3Smrg
1477ec681f3Smrg   if (assigner->opc == OPC_META_PHI)
1487ec681f3Smrg      return 0;
1497ec681f3Smrg
1507ec681f3Smrg   if (is_meta(assigner)) {
1517ec681f3Smrg      foreach_src_n (src, n, assigner) {
1527ec681f3Smrg         unsigned d;
1537ec681f3Smrg
1547ec681f3Smrg         if (!src->def)
1557ec681f3Smrg            continue;
1567ec681f3Smrg
1577ec681f3Smrg         d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
1587ec681f3Smrg         delay = MAX2(delay, d);
1597ec681f3Smrg      }
1607ec681f3Smrg   } else {
1617ec681f3Smrg      delay = ir3_delayslots(assigner, consumer, srcn, false);
1627ec681f3Smrg      delay -= distance(block, assigner, delay);
1637ec681f3Smrg   }
1647ec681f3Smrg
1657ec681f3Smrg   return delay;
1667ec681f3Smrg}
1677ec681f3Smrg
1687ec681f3Smrg/**
1697ec681f3Smrg * Calculate delay for instruction before register allocation, using SSA
1707ec681f3Smrg * source pointers. This can't handle inter-block dependencies.
1717ec681f3Smrg */
1727ec681f3Smrgunsigned
1737ec681f3Smrgir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
1747ec681f3Smrg{
1757ec681f3Smrg   unsigned delay = 0;
1767ec681f3Smrg
1777ec681f3Smrg   foreach_src_n (src, i, instr) {
1787ec681f3Smrg      unsigned d = 0;
1797ec681f3Smrg
1807ec681f3Smrg      if (src->def && src->def->instr->block == block) {
1817ec681f3Smrg         d = delay_calc_srcn_prera(block, src->def->instr, instr, i);
1827ec681f3Smrg      }
1837ec681f3Smrg
1847ec681f3Smrg      delay = MAX2(delay, d);
1857ec681f3Smrg   }
1867ec681f3Smrg
1877ec681f3Smrg   return delay;
1887ec681f3Smrg}
1897ec681f3Smrg
1907ec681f3Smrg/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
1917ec681f3Smrg * and have to handle relative accesses specially.
1927ec681f3Smrg */
1937ec681f3Smrg
1947ec681f3Smrgstatic unsigned
1957ec681f3Smrgpost_ra_reg_elems(struct ir3_register *reg)
1967ec681f3Smrg{
1977ec681f3Smrg   if (reg->flags & IR3_REG_RELATIV)
1987ec681f3Smrg      return reg->size;
1997ec681f3Smrg   return reg_elems(reg);
2007ec681f3Smrg}
2017ec681f3Smrg
2027ec681f3Smrgstatic unsigned
2037ec681f3Smrgpost_ra_reg_num(struct ir3_register *reg)
2047ec681f3Smrg{
2057ec681f3Smrg   if (reg->flags & IR3_REG_RELATIV)
2067ec681f3Smrg      return reg->array.base;
2077ec681f3Smrg   return reg->num;
2087ec681f3Smrg}
2097ec681f3Smrg
2107ec681f3Smrgstatic unsigned
2117ec681f3Smrgdelay_calc_srcn_postra(struct ir3_instruction *assigner,
2127ec681f3Smrg                       struct ir3_instruction *consumer, unsigned assigner_n,
2137ec681f3Smrg                       unsigned consumer_n, bool soft, bool mergedregs)
2147ec681f3Smrg{
2157ec681f3Smrg   struct ir3_register *src = consumer->srcs[consumer_n];
2167ec681f3Smrg   struct ir3_register *dst = assigner->dsts[assigner_n];
2177ec681f3Smrg   bool mismatched_half =
2187ec681f3Smrg      (src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
2197ec681f3Smrg
2207ec681f3Smrg   /* In the mergedregs case or when the register is a special register,
2217ec681f3Smrg    * half-registers do not alias with full registers.
2227ec681f3Smrg    */
2237ec681f3Smrg   if ((!mergedregs || is_reg_special(src) || is_reg_special(dst)) &&
2247ec681f3Smrg       mismatched_half)
2257ec681f3Smrg      return 0;
2267ec681f3Smrg
2277ec681f3Smrg   unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
2287ec681f3Smrg   unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
2297ec681f3Smrg   unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
2307ec681f3Smrg   unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
2317ec681f3Smrg
2327ec681f3Smrg   if (dst_start >= src_end || src_start >= dst_end)
2337ec681f3Smrg      return 0;
2347ec681f3Smrg
2357ec681f3Smrg   unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, soft);
2367ec681f3Smrg
2377ec681f3Smrg   if (assigner->repeat == 0 && consumer->repeat == 0)
2387ec681f3Smrg      return delay;
2397ec681f3Smrg
2407ec681f3Smrg   /* If either side is a relative access, we can't really apply most of the
2417ec681f3Smrg    * reasoning below because we don't know which component aliases which.
2427ec681f3Smrg    * Just bail in this case.
2437ec681f3Smrg    */
2447ec681f3Smrg   if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
2457ec681f3Smrg      return delay;
2467ec681f3Smrg
2477ec681f3Smrg   /* MOVMSK seems to require that all users wait until the entire
2487ec681f3Smrg    * instruction is finished, so just bail here.
2497ec681f3Smrg    */
2507ec681f3Smrg   if (assigner->opc == OPC_MOVMSK)
2517ec681f3Smrg      return delay;
2527ec681f3Smrg
2537ec681f3Smrg   /* TODO: Handle the combination of (rpt) and different component sizes
2547ec681f3Smrg    * better like below. This complicates things significantly because the
2557ec681f3Smrg    * components don't line up.
2567ec681f3Smrg    */
2577ec681f3Smrg   if (mismatched_half)
2587ec681f3Smrg      return delay;
2597ec681f3Smrg
2607ec681f3Smrg   /* If an instruction has a (rpt), then it acts as a sequence of
2617ec681f3Smrg    * instructions, reading its non-(r) sources at each cycle. First, get the
2627ec681f3Smrg    * register num for the first instruction where they interfere:
2637ec681f3Smrg    */
2647ec681f3Smrg
2657ec681f3Smrg   unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
2667ec681f3Smrg
2677ec681f3Smrg   /* Now, for that first conflicting half/full register, figure out the
2687ec681f3Smrg    * sub-instruction within assigner/consumer it corresponds to. For (r)
2697ec681f3Smrg    * sources, this should already return the correct answer of 0. However we
2707ec681f3Smrg    * have to special-case the multi-mov instructions, where the
2717ec681f3Smrg    * sub-instructions sometimes come from the src/dst indices instead.
2727ec681f3Smrg    */
2737ec681f3Smrg   unsigned first_src_instr;
2747ec681f3Smrg   if (consumer->opc == OPC_SWZ || consumer->opc == OPC_GAT)
2757ec681f3Smrg      first_src_instr = consumer_n;
2767ec681f3Smrg   else
2777ec681f3Smrg      first_src_instr = first_num - src->num;
2787ec681f3Smrg
2797ec681f3Smrg   unsigned first_dst_instr;
2807ec681f3Smrg   if (assigner->opc == OPC_SWZ || assigner->opc == OPC_SCT)
2817ec681f3Smrg      first_dst_instr = assigner_n;
2827ec681f3Smrg   else
2837ec681f3Smrg      first_dst_instr = first_num - dst->num;
2847ec681f3Smrg
2857ec681f3Smrg   /* The delay we return is relative to the *end* of assigner and the
2867ec681f3Smrg    * *beginning* of consumer, because it's the number of nops (or other
2877ec681f3Smrg    * things) needed between them. Any instructions after first_dst_instr
2887ec681f3Smrg    * subtract from the delay, and so do any instructions before
2897ec681f3Smrg    * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
2907ec681f3Smrg    * delay to account for that.
2917ec681f3Smrg    *
2927ec681f3Smrg    * Now, a priori, we need to go through this process for every
2937ec681f3Smrg    * conflicting regnum and take the minimum of the offsets to make sure
2947ec681f3Smrg    * that the appropriate number of nop's is inserted for every conflicting
2957ec681f3Smrg    * pair of sub-instructions. However, as we go to the next conflicting
2967ec681f3Smrg    * regnum (if any), the number of instructions after first_dst_instr
2977ec681f3Smrg    * decreases by 1 and the number of source instructions before
2987ec681f3Smrg    * first_src_instr correspondingly increases by 1, so the offset stays the
2997ec681f3Smrg    * same for all conflicting registers.
3007ec681f3Smrg    */
3017ec681f3Smrg   unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
3027ec681f3Smrg   return offset > delay ? 0 : delay - offset;
3037ec681f3Smrg}
3047ec681f3Smrg
3057ec681f3Smrgstatic unsigned
3067ec681f3Smrgdelay_calc_postra(struct ir3_block *block, struct ir3_instruction *start,
3077ec681f3Smrg                  struct ir3_instruction *consumer, unsigned distance,
3087ec681f3Smrg                  bool soft, bool pred, bool mergedregs)
3097ec681f3Smrg{
3107ec681f3Smrg   unsigned delay = 0;
3117ec681f3Smrg   /* Search backwards starting at the instruction before start, unless it's
3127ec681f3Smrg    * NULL then search backwards from the block end.
3137ec681f3Smrg    */
3147ec681f3Smrg   struct list_head *start_list =
3157ec681f3Smrg      start ? start->node.prev : block->instr_list.prev;
3167ec681f3Smrg   list_for_each_entry_from_rev (struct ir3_instruction, assigner, start_list,
3177ec681f3Smrg                                 &block->instr_list, node) {
3187ec681f3Smrg      if (count_instruction(assigner))
3197ec681f3Smrg         distance += assigner->nop;
3207ec681f3Smrg
3217ec681f3Smrg      if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
3227ec681f3Smrg         return delay;
3237ec681f3Smrg
3247ec681f3Smrg      if (is_meta(assigner))
3257ec681f3Smrg         continue;
3267ec681f3Smrg
3277ec681f3Smrg      unsigned new_delay = 0;
3287ec681f3Smrg
3297ec681f3Smrg      foreach_dst_n (dst, dst_n, assigner) {
3307ec681f3Smrg         if (dst->wrmask == 0)
3317ec681f3Smrg            continue;
3327ec681f3Smrg         foreach_src_n (src, src_n, consumer) {
3337ec681f3Smrg            if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
3347ec681f3Smrg               continue;
3357ec681f3Smrg
3367ec681f3Smrg            unsigned src_delay = delay_calc_srcn_postra(
3377ec681f3Smrg               assigner, consumer, dst_n, src_n, soft, mergedregs);
3387ec681f3Smrg            new_delay = MAX2(new_delay, src_delay);
3397ec681f3Smrg         }
3407ec681f3Smrg      }
3417ec681f3Smrg
3427ec681f3Smrg      new_delay = new_delay > distance ? new_delay - distance : 0;
3437ec681f3Smrg      delay = MAX2(delay, new_delay);
3447ec681f3Smrg
3457ec681f3Smrg      if (count_instruction(assigner))
3467ec681f3Smrg         distance += 1 + assigner->repeat;
3477ec681f3Smrg   }
3487ec681f3Smrg
3497ec681f3Smrg   /* Note: this allows recursion into "block" if it has already been
3507ec681f3Smrg    * visited, but *not* recursion into its predecessors. We may have to
3517ec681f3Smrg    * visit the original block twice, for the loop case where we have to
3527ec681f3Smrg    * consider definititons in an earlier iterations of the same loop:
3537ec681f3Smrg    *
3547ec681f3Smrg    * while (...) {
3557ec681f3Smrg    *		mov.u32u32 ..., r0.x
3567ec681f3Smrg    *		...
3577ec681f3Smrg    *		mov.u32u32 r0.x, ...
3587ec681f3Smrg    * }
3597ec681f3Smrg    *
3607ec681f3Smrg    * However any other recursion would be unnecessary.
3617ec681f3Smrg    */
3627ec681f3Smrg
3637ec681f3Smrg   if (pred && block->data != block) {
3647ec681f3Smrg      block->data = block;
3657ec681f3Smrg
3667ec681f3Smrg      for (unsigned i = 0; i < block->predecessors_count; i++) {
3677ec681f3Smrg         struct ir3_block *pred = block->predecessors[i];
3687ec681f3Smrg         unsigned pred_delay = delay_calc_postra(pred, NULL, consumer, distance,
3697ec681f3Smrg                                                 soft, pred, mergedregs);
3707ec681f3Smrg         delay = MAX2(delay, pred_delay);
3717ec681f3Smrg      }
3727ec681f3Smrg
3737ec681f3Smrg      block->data = NULL;
3747ec681f3Smrg   }
3757ec681f3Smrg
3767ec681f3Smrg   return delay;
3777ec681f3Smrg}
3787ec681f3Smrg
/**
 * Calculate the delay for post-RA scheduling, based on physical registers.
 * The result is not exact: predecessor blocks are not searched, and the
 * impact of sync flags can optionally be estimated.
 *
 * @block: Block whose instruction list is searched, from its end backwards.
 * @instr: The consuming instruction.
 * @soft:  If true, add additional delay for situations where it would not
 *    be strictly required because a sync flag would be used (but the
 *    scheduler would prefer to schedule some other instructions first to
 *    avoid stalling on the sync flag).
 * @mergedregs: True if mergedregs is enabled.
 */
unsigned
ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
                      bool soft, bool mergedregs)
{
   const unsigned initial_distance = 0;
   const bool search_predecessors = false;

   return delay_calc_postra(block, NULL, instr, initial_distance, soft,
                            search_predecessors, mergedregs);
}
3967ec681f3Smrg
/**
 * Calculate the delay for nop insertion. This must exactly match hardware
 * requirements, so the search never runs in soft mode and it recurses into
 * predecessor blocks.
 *
 * @block: Block whose instruction list is searched, from its end backwards.
 * @instr: The consuming instruction.
 * @mergedregs: True if mergedregs is enabled.
 */
unsigned
ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
                     bool mergedregs)
{
   const unsigned initial_distance = 0;
   const bool soft = false;
   const bool search_predecessors = true;

   return delay_calc_postra(block, NULL, instr, initial_distance, soft,
                            search_predecessors, mergedregs);
}
4077ec681f3Smrg
4087ec681f3Smrg/**
4097ec681f3Smrg * Remove nop instructions.  The scheduler can insert placeholder nop's
4107ec681f3Smrg * so that ir3_delay_calc() can account for nop's that won't be needed
4117ec681f3Smrg * due to nop's triggered by a previous instruction.  However, before
4127ec681f3Smrg * legalize, we want to remove these.  The legalize pass can insert
4137ec681f3Smrg * some nop's if needed to hold (for example) sync flags.  This final
4147ec681f3Smrg * remaining nops are inserted by legalize after this.
4157ec681f3Smrg */
4167ec681f3Smrgvoid
4177ec681f3Smrgir3_remove_nops(struct ir3 *ir)
4187ec681f3Smrg{
4197ec681f3Smrg   foreach_block (block, &ir->block_list) {
4207ec681f3Smrg      foreach_instr_safe (instr, &block->instr_list) {
4217ec681f3Smrg         if (instr->opc == OPC_NOP) {
4227ec681f3Smrg            list_del(&instr->node);
4237ec681f3Smrg         }
4247ec681f3Smrg      }
4257ec681f3Smrg   }
4267ec681f3Smrg}
427