17e102996Smaya/*
27e102996Smaya * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
37e102996Smaya *
47e102996Smaya * Permission is hereby granted, free of charge, to any person obtaining a
57e102996Smaya * copy of this software and associated documentation files (the "Software"),
67e102996Smaya * to deal in the Software without restriction, including without limitation
77e102996Smaya * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87e102996Smaya * and/or sell copies of the Software, and to permit persons to whom the
97e102996Smaya * Software is furnished to do so, subject to the following conditions:
107e102996Smaya *
117e102996Smaya * The above copyright notice and this permission notice (including the next
127e102996Smaya * paragraph) shall be included in all copies or substantial portions of the
137e102996Smaya * Software.
147e102996Smaya *
157e102996Smaya * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167e102996Smaya * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177e102996Smaya * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187e102996Smaya * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197e102996Smaya * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
207e102996Smaya * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
217e102996Smaya * SOFTWARE.
227e102996Smaya *
237e102996Smaya * Authors:
247e102996Smaya *    Rob Clark <robclark@freedesktop.org>
257e102996Smaya */
267e102996Smaya
277e102996Smaya#include "util/ralloc.h"
287e102996Smaya#include "util/u_math.h"
297e102996Smaya
307e102996Smaya#include "ir3.h"
317ec681f3Smrg#include "ir3_shader.h"
327e102996Smaya
337e102996Smaya/*
347e102996Smaya * Legalize:
357e102996Smaya *
367ec681f3Smrg * The legalize pass handles ensuring sufficient nop's and sync flags for
377ec681f3Smrg * correct execution.
387ec681f3Smrg *
397ec681f3Smrg * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
407ec681f3Smrg *    based on state flowing out of predecessor blocks until there is
417ec681f3Smrg *    no further change.  In some cases this requires inserting nops.
427ec681f3Smrg * 2) Mark (ei) on last varying input, and (ul) on last use of a0.x
437ec681f3Smrg * 3) Final nop scheduling for instruction latency
447ec681f3Smrg * 4) Resolve jumps and schedule blocks, marking potential convergence
457ec681f3Smrg *    points with (jp)
467e102996Smaya */
477e102996Smaya
/* Per-shader context shared by all phases of the legalize pass. */
struct ir3_legalize_ctx {
   struct ir3_compiler *compiler;       /* for gen checks / hw workarounds (e.g. samgq) */
   struct ir3_shader_variant *so;       /* variant being legalized; output flags land here */
   gl_shader_stage type;                /* shader stage of the variant */
   int max_bary;                        /* highest varying input loc seen (inloc immed) */
   bool early_input_release;            /* release varying storage ((ei)) after last input
                                         * rather than at end of program */
};
557e102996Smaya
/* Dataflow state propagated across blocks: regmasks of registers whose
 * next access must carry a sync flag.
 */
struct ir3_legalize_state {
   regmask_t needs_ss;     /* regs written by sfu/ldl*-class instrs; next read needs (ss) */
   regmask_t needs_ss_war; /* write after read: regs read by tex/sfu/mem; next write needs (ss) */
   regmask_t needs_sy;     /* regs written by tex/ldg-class instrs; next read needs (sy) */
};
617e102996Smaya
/* Per-block data (hung off ir3_block::data) for the iterate-to-fixpoint
 * sync-flag legalization.
 */
struct ir3_legalize_block_data {
   bool valid;                       /* false if a predecessor's output state changed and
                                      * this block must be (re)legalized */
   struct ir3_legalize_state state;  /* output state flowing out of this block */
};
667e102996Smaya
677e102996Smaya/* We want to evaluate each block from the position of any other
687e102996Smaya * predecessor block, in order that the flags set are the union of
697e102996Smaya * all possible program paths.
707e102996Smaya *
717e102996Smaya * To do this, we need to know the output state (needs_ss/ss_war/sy)
727e102996Smaya * of all predecessor blocks.  The tricky thing is loops, which mean
737e102996Smaya * that we can't simply recursively process each predecessor block
747e102996Smaya * before legalizing the current block.
757e102996Smaya *
767e102996Smaya * How we handle that is by looping over all the blocks until the
777e102996Smaya * results converge.  If the output state of a given block changes
787e102996Smaya * in a given pass, this means that all successor blocks are not
797e102996Smaya * yet fully legalized.
807e102996Smaya */
817e102996Smaya
/* Legalize a single block: compute its input sync state as the union of all
 * predecessors' output state, then walk the instructions inserting (ss)/(sy)
 * flags (and nops to carry them where an instruction category cannot), mark
 * (ei) on the last input and (ul) on the last relative access, and expand the
 * samgq workaround.
 *
 * Returns false if the block was already valid (nothing to do), true if it
 * was (re)processed.  If the block's output state changed, all successors are
 * invalidated so the caller's fixpoint loop revisits them.
 */
static bool
legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
   struct ir3_legalize_block_data *bd = block->data;

   if (bd->valid)
      return false;

   /* last instruction seen with a relative (a0.x-indexed) src/dst; gets (ul)
    * once a0.x is overwritten (or at end of block):
    */
   struct ir3_instruction *last_rel = NULL;
   /* previous instruction processed (for barrier/predt special-casing): */
   struct ir3_instruction *last_n = NULL;
   struct list_head instr_list;
   /* snapshot of output state to detect convergence at the end: */
   struct ir3_legalize_state prev_state = bd->state;
   struct ir3_legalize_state *state = &bd->state;
   bool last_input_needs_ss = false;
   bool has_tex_prefetch = false;
   bool mergedregs = ctx->so->mergedregs;

   /* our input state is the OR of all predecessor blocks' state: */
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *predecessor = block->predecessors[i];
      struct ir3_legalize_block_data *pbd = predecessor->data;
      struct ir3_legalize_state *pstate = &pbd->state;

      /* Our input (ss)/(sy) state is based on OR'ing the output
       * state of all our predecessor blocks
       */
      regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
      regmask_or(&state->needs_ss_war, &state->needs_ss_war,
                 &pstate->needs_ss_war);
      regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
   }

   /* count inputs so we know which one is last (for (ei) marking): */
   unsigned input_count = 0;

   foreach_instr (n, &block->instr_list) {
      if (is_input(n)) {
         input_count++;
      }
   }

   unsigned inputs_remaining = input_count;

   /* Either inputs are in the first block or we expect inputs to be released
    * with the end of the program.
    */
   assert(input_count == 0 || !ctx->early_input_release ||
          block == ir3_start_block(block->shader));

   /* remove all the instructions from the list, we'll be adding
    * them back in as we go
    */
   list_replace(&block->instr_list, &instr_list);
   list_inithead(&block->instr_list);

   foreach_instr_safe (n, &instr_list) {
      unsigned i;

      /* flags are recomputed from scratch on each (re)visit: */
      n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);

      /* _meta::tex_prefetch instructions removed later in
       * collect_tex_prefetches()
       */
      if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
         continue;

      if (is_input(n)) {
         struct ir3_register *inloc = n->srcs[0];
         assert(inloc->flags & IR3_REG_IMMED);
         ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
      }

      /* a barrier forces a full sync, so all pending state is cleared: */
      if (last_n && is_barrier(last_n)) {
         n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
         last_input_needs_ss = false;
         regmask_init(&state->needs_ss_war, mergedregs);
         regmask_init(&state->needs_ss, mergedregs);
         regmask_init(&state->needs_sy, mergedregs);
      }

      /* first instruction after predt needs (ss): */
      if (last_n && (last_n->opc == OPC_PREDT)) {
         n->flags |= IR3_INSTR_SS;
         regmask_init(&state->needs_ss_war, mergedregs);
         regmask_init(&state->needs_ss, mergedregs);
      }

      /* NOTE: consider dst register too.. it could happen that
       * texture sample instruction (for example) writes some
       * components which are unused.  A subsequent instruction
       * that writes the same register can race w/ the sam instr
       * resulting in undefined results:
       */
      for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
         struct ir3_register *reg;
         if (i < n->dsts_count)
            reg = n->dsts[i];
         else
            reg = n->srcs[i - n->dsts_count];

         if (reg_gpr(reg)) {

            /* TODO: we probably only need (ss) for alu
             * instr consuming sfu result.. need to make
             * some tests for both this and (sy)..
             */
            if (regmask_get(&state->needs_ss, reg)) {
               n->flags |= IR3_INSTR_SS;
               last_input_needs_ss = false;
               regmask_init(&state->needs_ss_war, mergedregs);
               regmask_init(&state->needs_ss, mergedregs);
            }

            if (regmask_get(&state->needs_sy, reg)) {
               n->flags |= IR3_INSTR_SY;
               regmask_init(&state->needs_sy, mergedregs);
            }
         }

         /* TODO: is it valid to have address reg loaded from a
          * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
          * last_rel check below should be moved ahead of this:
          */
         if (reg->flags & IR3_REG_RELATIV)
            last_rel = n;
      }

      foreach_dst (reg, n) {
         /* write-after-read hazard: writing a reg still pending read by
          * tex/sfu/mem needs (ss):
          */
         if (regmask_get(&state->needs_ss_war, reg)) {
            n->flags |= IR3_INSTR_SS;
            last_input_needs_ss = false;
            regmask_init(&state->needs_ss_war, mergedregs);
            regmask_init(&state->needs_ss, mergedregs);
         }

         /* overwriting a0.x ends the live range of the last relative
          * access, so mark it (ul) (last use of address):
          */
         if (last_rel && (reg->num == regid(REG_A0, 0))) {
            last_rel->flags |= IR3_INSTR_UL;
            last_rel = NULL;
         }
      }

      /* cat5+ does not have an (ss) bit, if needed we need to
       * insert a nop to carry the sync flag.  Would be kinda
       * clever if we were aware of this during scheduling, but
       * this should be a pretty rare case:
       */
      if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
         struct ir3_instruction *nop;
         nop = ir3_NOP(block);
         nop->flags |= IR3_INSTR_SS;
         n->flags &= ~IR3_INSTR_SS;
      }

      /* need to be able to set (ss) on first instruction: */
      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
         ir3_NOP(block);

      if (ctx->compiler->samgq_workaround &&
          ctx->type != MESA_SHADER_FRAGMENT &&
          ctx->type != MESA_SHADER_COMPUTE && n->opc == OPC_SAMGQ) {
         struct ir3_instruction *samgp;

         /* expand samgq into four samgp0..samgp3 clones: */
         list_delinit(&n->node);

         for (i = 0; i < 4; i++) {
            samgp = ir3_instr_clone(n);
            samgp->opc = OPC_SAMGP0 + i;
            if (i > 1)
               samgp->flags |= IR3_INSTR_SY;
         }
      } else {
         list_delinit(&n->node);
         list_addtail(&n->node, &block->instr_list);
      }

      /* record what sync state this instruction's result will require: */
      if (is_sfu(n))
         regmask_set(&state->needs_ss, n->dsts[0]);

      if (is_tex_or_prefetch(n)) {
         regmask_set(&state->needs_sy, n->dsts[0]);
         if (n->opc == OPC_META_TEX_PREFETCH)
            has_tex_prefetch = true;
      } else if (n->opc == OPC_RESINFO) {
         regmask_set(&state->needs_ss, n->dsts[0]);
         ir3_NOP(block)->flags |= IR3_INSTR_SS;
         last_input_needs_ss = false;
      } else if (is_load(n)) {
         /* seems like ldlv needs (ss) bit instead??  which is odd but
          * makes a bunch of flat-varying tests start working on a4xx.
          */
         if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) ||
             (n->opc == OPC_LDLW))
            regmask_set(&state->needs_ss, n->dsts[0]);
         else
            regmask_set(&state->needs_sy, n->dsts[0]);
      } else if (is_atomic(n->opc)) {
         if (n->flags & IR3_INSTR_G) {
            if (ctx->compiler->gen >= 6) {
               /* New encoding, returns  result via second src: */
               regmask_set(&state->needs_sy, n->srcs[2]);
            } else {
               regmask_set(&state->needs_sy, n->dsts[0]);
            }
         } else {
            regmask_set(&state->needs_ss, n->dsts[0]);
         }
      }

      if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
         ctx->so->has_ssbo = true;

      /* both tex/sfu appear to not always immediately consume
       * their src register(s):
       */
      if (is_tex(n) || is_sfu(n) || is_mem(n)) {
         foreach_src (reg, n) {
            regmask_set(&state->needs_ss_war, reg);
         }
      }

      if (ctx->early_input_release && is_input(n)) {
         last_input_needs_ss |= (n->opc == OPC_LDLV);

         assert(inputs_remaining > 0);
         inputs_remaining--;
         if (inputs_remaining == 0) {
            /* This is the last input. We add the (ei) flag to release
             * varying memory after this executes. If it's an ldlv,
             * however, we need to insert a dummy bary.f on which we can
             * set the (ei) flag. We may also need to insert an (ss) to
             * guarantee that all ldlv's have finished fetching their
             * results before releasing the varying memory.
             */
            struct ir3_instruction *last_input = n;
            if (n->opc == OPC_LDLV) {
               struct ir3_instruction *baryf;

               /* (ss)bary.f (ei)r63.x, 0, r0.x */
               baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
               ir3_dst_create(baryf, regid(63, 0), 0);
               ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
               ir3_src_create(baryf, regid(0, 0), 0);

               last_input = baryf;
            }

            last_input->dsts[0]->flags |= IR3_REG_EI;
            if (last_input_needs_ss) {
               last_input->flags |= IR3_INSTR_SS;
               regmask_init(&state->needs_ss_war, mergedregs);
               regmask_init(&state->needs_ss, mergedregs);
            }
         }
      }

      last_n = n;
   }

   assert(inputs_remaining == 0 || !ctx->early_input_release);

   if (has_tex_prefetch && input_count == 0) {
      /* texture prefetch, but *no* inputs.. we need to insert a
       * dummy bary.f at the top of the shader to unblock varying
       * storage:
       */
      struct ir3_instruction *baryf;

      /* (ss)bary.f (ei)r63.x, 0, r0.x */
      baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
      ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
      ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
      ir3_src_create(baryf, regid(0, 0), 0);

      /* insert the dummy bary.f at head: */
      list_delinit(&baryf->node);
      list_add(&baryf->node, &block->instr_list);
   }

   /* a0.x still live at end of block — mark its last relative use (ul): */
   if (last_rel)
      last_rel->flags |= IR3_INSTR_UL;

   bd->valid = true;

   if (memcmp(&prev_state, state, sizeof(*state))) {
      /* our output state changed, this invalidates all of our
       * successors:
       */
      for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
         if (!block->successors[i])
            break;
         struct ir3_legalize_block_data *pbd = block->successors[i]->data;
         pbd->valid = false;
      }
   }

   return true;
}
3777ec681f3Smrg
3787ec681f3Smrg/* Expands dsxpp and dsypp macros to:
3797ec681f3Smrg *
3807ec681f3Smrg * dsxpp.1 dst, src
3817ec681f3Smrg * dsxpp.1.p dst, src
3827ec681f3Smrg *
3837ec681f3Smrg * We apply this after flags syncing, as we don't want to sync in between the
3847ec681f3Smrg * two (which might happen if dst == src).  We do it before nop scheduling
3857ec681f3Smrg * because that needs to count actual instructions.
3867ec681f3Smrg */
3877ec681f3Smrgstatic bool
3887ec681f3Smrgapply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
3897ec681f3Smrg{
3907ec681f3Smrg   struct list_head instr_list;
3917ec681f3Smrg
3927ec681f3Smrg   /* remove all the instructions from the list, we'll be adding
3937ec681f3Smrg    * them back in as we go
3947ec681f3Smrg    */
3957ec681f3Smrg   list_replace(&block->instr_list, &instr_list);
3967ec681f3Smrg   list_inithead(&block->instr_list);
3977ec681f3Smrg
3987ec681f3Smrg   foreach_instr_safe (n, &instr_list) {
3997ec681f3Smrg      list_addtail(&n->node, &block->instr_list);
4007ec681f3Smrg
4017ec681f3Smrg      if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
4027ec681f3Smrg         n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;
4037ec681f3Smrg
4047ec681f3Smrg         struct ir3_instruction *op_p = ir3_instr_clone(n);
4057ec681f3Smrg         op_p->flags = IR3_INSTR_P;
4067ec681f3Smrg
4077ec681f3Smrg         ctx->so->need_fine_derivatives = true;
4087ec681f3Smrg      }
4097ec681f3Smrg   }
4107ec681f3Smrg
4117ec681f3Smrg   return true;
4127e102996Smaya}
4137e102996Smaya
4147e102996Smaya/* NOTE: branch instructions are always the last instruction(s)
4157e102996Smaya * in the block.  We take advantage of this as we resolve the
4167e102996Smaya * branches, since "if (foo) break;" constructs turn into
4177e102996Smaya * something like:
4187e102996Smaya *
4197e102996Smaya *   block3 {
4207e102996Smaya *   	...
4217e102996Smaya *   	0029:021: mov.s32s32 r62.x, r1.y
4227e102996Smaya *   	0082:022: br !p0.x, target=block5
4237e102996Smaya *   	0083:023: br p0.x, target=block4
4247e102996Smaya *   	// succs: if _[0029:021: mov.s32s32] block4; else block5;
4257e102996Smaya *   }
4267e102996Smaya *   block4 {
4277e102996Smaya *   	0084:024: jump, target=block6
4287e102996Smaya *   	// succs: block6;
4297e102996Smaya *   }
4307e102996Smaya *   block5 {
4317e102996Smaya *   	0085:025: jump, target=block7
4327e102996Smaya *   	// succs: block7;
4337e102996Smaya *   }
4347e102996Smaya *
4357e102996Smaya * ie. only instruction in block4/block5 is a jump, so when
4367e102996Smaya * resolving branches we can easily detect this by checking
4377e102996Smaya * that the first instruction in the target block is itself
4387e102996Smaya * a jump, and setup the br directly to the jump's target
4397e102996Smaya * (and strip back out the now unreached jump)
4407e102996Smaya *
4417e102996Smaya * TODO sometimes we end up with things like:
4427e102996Smaya *
4437e102996Smaya *    br !p0.x, #2
4447e102996Smaya *    br p0.x, #12
4457e102996Smaya *    add.u r0.y, r0.y, 1
4467e102996Smaya *
4477e102996Smaya * If we swapped the order of the branches, we could drop one.
4487e102996Smaya */
4497e102996Smayastatic struct ir3_block *
4507e102996Smayaresolve_dest_block(struct ir3_block *block)
4517e102996Smaya{
4527ec681f3Smrg   /* special case for last block: */
4537ec681f3Smrg   if (!block->successors[0])
4547ec681f3Smrg      return block;
4557ec681f3Smrg
4567ec681f3Smrg   /* NOTE that we may or may not have inserted the jump
4577ec681f3Smrg    * in the target block yet, so conditions to resolve
4587ec681f3Smrg    * the dest to the dest block's successor are:
4597ec681f3Smrg    *
4607ec681f3Smrg    *   (1) successor[1] == NULL &&
4617ec681f3Smrg    *   (2) (block-is-empty || only-instr-is-jump)
4627ec681f3Smrg    */
4637ec681f3Smrg   if (block->successors[1] == NULL) {
4647ec681f3Smrg      if (list_is_empty(&block->instr_list)) {
4657ec681f3Smrg         return block->successors[0];
4667ec681f3Smrg      } else if (list_length(&block->instr_list) == 1) {
4677ec681f3Smrg         struct ir3_instruction *instr =
4687ec681f3Smrg            list_first_entry(&block->instr_list, struct ir3_instruction, node);
4697ec681f3Smrg         if (instr->opc == OPC_JUMP) {
4707ec681f3Smrg            /* If this jump is backwards, then we will probably convert
4717ec681f3Smrg             * the jump being resolved to a backwards jump, which will
4727ec681f3Smrg             * change a loop-with-continue or loop-with-if into a
4737ec681f3Smrg             * doubly-nested loop and change the convergence behavior.
4747ec681f3Smrg             * Disallow this here.
4757ec681f3Smrg             */
4767ec681f3Smrg            if (block->successors[0]->index <= block->index)
4777ec681f3Smrg               return block;
4787ec681f3Smrg            return block->successors[0];
4797ec681f3Smrg         }
4807ec681f3Smrg      }
4817ec681f3Smrg   }
4827ec681f3Smrg   return block;
4837ec681f3Smrg}
4847ec681f3Smrg
4857ec681f3Smrgstatic void
4867ec681f3Smrgremove_unused_block(struct ir3_block *old_target)
4877ec681f3Smrg{
4887ec681f3Smrg   list_delinit(&old_target->node);
4897ec681f3Smrg
4907ec681f3Smrg   /* cleanup dangling predecessors: */
4917ec681f3Smrg   for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
4927ec681f3Smrg      if (old_target->successors[i]) {
4937ec681f3Smrg         struct ir3_block *succ = old_target->successors[i];
4947ec681f3Smrg         ir3_block_remove_predecessor(succ, old_target);
4957ec681f3Smrg      }
4967ec681f3Smrg   }
4977e102996Smaya}
4987e102996Smaya
4997e102996Smayastatic bool
5007ec681f3Smrgretarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
5017e102996Smaya{
5027ec681f3Smrg   struct ir3_block *old_target = instr->cat0.target;
5037ec681f3Smrg   struct ir3_block *cur_block = instr->block;
5047ec681f3Smrg
5057ec681f3Smrg   /* update current blocks successors to reflect the retargetting: */
5067ec681f3Smrg   if (cur_block->successors[0] == old_target) {
5077ec681f3Smrg      cur_block->successors[0] = new_target;
5087ec681f3Smrg   } else {
5097ec681f3Smrg      debug_assert(cur_block->successors[1] == old_target);
5107ec681f3Smrg      cur_block->successors[1] = new_target;
5117ec681f3Smrg   }
5127ec681f3Smrg
5137ec681f3Smrg   /* also update physical_successors.. we don't really need them at
5147ec681f3Smrg    * this stage, but it keeps ir3_validate happy:
5157ec681f3Smrg    */
5167ec681f3Smrg   if (cur_block->physical_successors[0] == old_target) {
5177ec681f3Smrg      cur_block->physical_successors[0] = new_target;
5187ec681f3Smrg   } else {
5197ec681f3Smrg      debug_assert(cur_block->physical_successors[1] == old_target);
5207ec681f3Smrg      cur_block->physical_successors[1] = new_target;
5217ec681f3Smrg   }
5227ec681f3Smrg
5237ec681f3Smrg   /* update new target's predecessors: */
5247ec681f3Smrg   ir3_block_add_predecessor(new_target, cur_block);
5257ec681f3Smrg
5267ec681f3Smrg   /* and remove old_target's predecessor: */
5277ec681f3Smrg   ir3_block_remove_predecessor(old_target, cur_block);
5287ec681f3Smrg
5297ec681f3Smrg   instr->cat0.target = new_target;
5307ec681f3Smrg
5317ec681f3Smrg   if (old_target->predecessors_count == 0) {
5327ec681f3Smrg      remove_unused_block(old_target);
5337ec681f3Smrg      return true;
5347ec681f3Smrg   }
5357ec681f3Smrg
5367ec681f3Smrg   return false;
5377e102996Smaya}
5387e102996Smaya
/* Jump optimization: retarget jumps that land on empty/jump-only blocks
 * directly at the final destination, and drop branches that merely fall
 * through to the next block.  Returns true if any change was made (caller
 * should iterate until no further progress).
 */
static bool
opt_jump(struct ir3 *ir)
{
   bool progress = false;

   /* assign block indices so backwards jumps can be detected in
    * resolve_dest_block():
    */
   unsigned index = 0;
   foreach_block (block, &ir->block_list)
      block->index = index++;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (!is_flow(instr) || !instr->cat0.target)
            continue;

         struct ir3_block *tblock = resolve_dest_block(instr->cat0.target);
         if (tblock != instr->cat0.target) {
            progress = true;

            /* Exit early if we deleted a block to avoid iterator
             * weirdness/assert fails
             */
            if (retarget_jump(instr, tblock))
               return true;
         }
      }

      /* Detect the case where the block ends either with:
       * - A single unconditional jump to the next block.
       * - Two jump instructions with opposite conditions, and one of the
       *   them jumps to the next block.
       * We can remove the one that jumps to the next block in either case.
       */
      if (list_is_empty(&block->instr_list))
         continue;

      /* jumps[0] is the last instruction, jumps[1] (if any) the one
       * before it:
       */
      struct ir3_instruction *jumps[2] = {NULL, NULL};
      jumps[0] =
         list_last_entry(&block->instr_list, struct ir3_instruction, node);
      if (!list_is_singular(&block->instr_list))
         jumps[1] =
            list_last_entry(&jumps[0]->node, struct ir3_instruction, node);

      /* only handle "jump" alone, or a "br"/"br" pair: */
      if (jumps[0]->opc == OPC_JUMP)
         jumps[1] = NULL;
      else if (jumps[0]->opc != OPC_B || !jumps[1] || jumps[1]->opc != OPC_B)
         continue;

      for (unsigned i = 0; i < 2; i++) {
         if (!jumps[i])
            continue;

         /* a jump targeting the textually-next block is a no-op: */
         struct ir3_block *tblock = jumps[i]->cat0.target;
         if (&tblock->node == block->node.next) {
            list_delinit(&jumps[i]->node);
            progress = true;
            break;
         }
      }
   }

   return progress;
}
6017ec681f3Smrg
6027ec681f3Smrgstatic void
6037e102996Smayaresolve_jumps(struct ir3 *ir)
6047e102996Smaya{
6057ec681f3Smrg   foreach_block (block, &ir->block_list)
6067ec681f3Smrg      foreach_instr (instr, &block->instr_list)
6077ec681f3Smrg         if (is_flow(instr) && instr->cat0.target) {
6087ec681f3Smrg            struct ir3_instruction *target = list_first_entry(
6097ec681f3Smrg               &instr->cat0.target->instr_list, struct ir3_instruction, node);
6107ec681f3Smrg
6117ec681f3Smrg            instr->cat0.immed = (int)target->ip - (int)instr->ip;
6127ec681f3Smrg         }
6137ec681f3Smrg}
6147e102996Smaya
6157ec681f3Smrgstatic void
6167ec681f3Smrgmark_jp(struct ir3_block *block)
6177ec681f3Smrg{
6187ec681f3Smrg   struct ir3_instruction *target =
6197ec681f3Smrg      list_first_entry(&block->instr_list, struct ir3_instruction, node);
6207ec681f3Smrg   target->flags |= IR3_INSTR_JP;
6217e102996Smaya}
6227e102996Smaya
6237ec681f3Smrg/* Mark points where control flow converges or diverges.
6247e102996Smaya *
6257ec681f3Smrg * Divergence points could actually be re-convergence points where
6267ec681f3Smrg * "parked" threads are recoverged with threads that took the opposite
6277ec681f3Smrg * path last time around.  Possibly it is easier to think of (jp) as
6287ec681f3Smrg * "the execution mask might have changed".
6297e102996Smaya */
6307e102996Smayastatic void
6317ec681f3Smrgmark_xvergence_points(struct ir3 *ir)
6327e102996Smaya{
6337ec681f3Smrg   foreach_block (block, &ir->block_list) {
6347ec681f3Smrg      if (block->predecessors_count > 1) {
6357ec681f3Smrg         /* if a block has more than one possible predecessor, then
6367ec681f3Smrg          * the first instruction is a convergence point.
6377ec681f3Smrg          */
6387ec681f3Smrg         mark_jp(block);
6397ec681f3Smrg      } else if (block->predecessors_count == 1) {
6407ec681f3Smrg         /* If a block has one predecessor, which has multiple possible
6417ec681f3Smrg          * successors, it is a divergence point.
6427ec681f3Smrg          */
6437ec681f3Smrg         for (unsigned i = 0; i < block->predecessors_count; i++) {
6447ec681f3Smrg            struct ir3_block *predecessor = block->predecessors[i];
6457ec681f3Smrg            if (predecessor->successors[1]) {
6467ec681f3Smrg               mark_jp(block);
6477ec681f3Smrg            }
6487ec681f3Smrg         }
6497ec681f3Smrg      }
6507ec681f3Smrg   }
6517e102996Smaya}
6527e102996Smaya
6537ec681f3Smrg/* Insert the branch/jump instructions for flow control between blocks.
6547ec681f3Smrg * Initially this is done naively, without considering if the successor
6557ec681f3Smrg * block immediately follows the current block (ie. so no jump required),
6567ec681f3Smrg * but that is cleaned up in opt_jump().
6577ec681f3Smrg *
6587ec681f3Smrg * TODO what ensures that the last write to p0.x in a block is the
6597ec681f3Smrg * branch condition?  Have we been getting lucky all this time?
6607ec681f3Smrg */
static void
block_sched(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      if (block->successors[1]) {
         /* if/else, conditional branches to "then" or "else": */
         struct ir3_instruction *br1, *br2;

         if (block->brtype == IR3_BRANCH_GETONE) {
            /* getone can't be inverted, and it wouldn't even make sense
             * to follow it with an inverted branch, so follow it by an
             * unconditional branch.
             */
            debug_assert(!block->condition);
            br1 = ir3_GETONE(block);
            br1->cat0.target = block->successors[1];

            br2 = ir3_JUMP(block);
            br2->cat0.target = block->successors[0];
         } else {
            debug_assert(block->condition);

            /* create "else" branch first (since "then" block should
             * frequently/always end up being a fall-thru):
             */
            /* Both branches are predicated on the p0.x component written
             * by block->condition; (inv1) inverts the sense for the
             * "else" branch:
             */
            br1 = ir3_instr_create(block, OPC_B, 0, 1);
            ir3_src_create(br1, regid(REG_P0, 0), 0)->def =
               block->condition->dsts[0];
            br1->cat0.inv1 = true;
            br1->cat0.target = block->successors[1];

            /* "then" branch: */
            br2 = ir3_instr_create(block, OPC_B, 0, 1);
            ir3_src_create(br2, regid(REG_P0, 0), 0)->def =
               block->condition->dsts[0];
            br2->cat0.target = block->successors[0];

            /* Note: for the all/any subgroup branches, the inverted
             * "else" branch takes the opposite any/all sense (the
             * negation of "all threads true" is "any thread false"):
             */
            switch (block->brtype) {
            case IR3_BRANCH_COND:
               br1->cat0.brtype = br2->cat0.brtype = BRANCH_PLAIN;
               break;
            case IR3_BRANCH_ALL:
               br1->cat0.brtype = BRANCH_ANY;
               br2->cat0.brtype = BRANCH_ALL;
               break;
            case IR3_BRANCH_ANY:
               br1->cat0.brtype = BRANCH_ALL;
               br2->cat0.brtype = BRANCH_ANY;
               break;
            case IR3_BRANCH_GETONE:
               unreachable("can't get here");
            }
         }
      } else if (block->successors[0]) {
         /* otherwise unconditional jump to next block: */
         /* cat0.target holds a block pointer at this stage; presumably
          * it is fixed up to an instruction offset later (see
          * resolve_jumps()) -- NOTE(review): confirm.
          */
         struct ir3_instruction *jmp;

         jmp = ir3_JUMP(block);
         jmp->cat0.target = block->successors[0];
      }
   }
}
7237e102996Smaya
7247ec681f3Smrg/* Here we workaround the fact that kill doesn't actually kill the thread as
7257ec681f3Smrg * GL expects. The last instruction always needs to be an end instruction,
7267ec681f3Smrg * which means that if we're stuck in a loop where kill is the only way out,
7277ec681f3Smrg * then we may have to jump out to the end. kill may also have the d3d
7287ec681f3Smrg * semantics of converting the thread to a helper thread, rather than setting
7297ec681f3Smrg * the exec mask to 0, in which case the helper thread could get stuck in an
7307ec681f3Smrg * infinite loop.
7317ec681f3Smrg *
7327ec681f3Smrg * We do this late, both to give the scheduler the opportunity to reschedule
7337ec681f3Smrg * kill instructions earlier and to avoid having to create a separate basic
7347ec681f3Smrg * block.
7357ec681f3Smrg *
7367ec681f3Smrg * TODO: Assuming that the wavefront doesn't stop as soon as all threads are
7377ec681f3Smrg * killed, we might benefit by doing this more aggressively when the remaining
7387ec681f3Smrg * part of the program after the kill is large, since that would let us
7397ec681f3Smrg * skip over the instructions when there are no non-killed threads left.
7407ec681f3Smrg */
static void
kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
{
   /* True if we know that this block will always eventually lead to the end
    * block:
    */
   bool always_ends = true;
   bool added = false;
   struct ir3_block *last_block =
      list_last_entry(&ir->block_list, struct ir3_block, node);

   /* Walk blocks in reverse program order.  A successor whose start_ip
    * is at or before this block's end_ip is a backward edge (a loop),
    * after which we can no longer be sure execution falls through to
    * the end.  Note always_ends is never reset to true, so once a loop
    * is seen, every earlier block in the walk gets the kill->branch
    * treatment as well.
    */
   foreach_block_rev (block, &ir->block_list) {
      for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
         if (block->successors[i]->start_ip <= block->end_ip)
            always_ends = false;
      }

      if (always_ends)
         continue;

      foreach_instr_safe (instr, &block->instr_list) {
         if (instr->opc != OPC_KILL)
            continue;

         /* Insert a branch to the end block, predicated on the same
          * source register as the kill itself:
          */
         struct ir3_instruction *br = ir3_instr_create(block, OPC_B, 0, 1);
         ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask =
            1;
         br->cat0.target =
            list_last_entry(&ir->block_list, struct ir3_block, node);

         /* ir3_instr_create() appended br at the block's tail; move it
          * so it immediately follows the kill:
          */
         list_del(&br->node);
         list_add(&br->node, &instr->node);

         added = true;
      }
   }

   if (added) {
      /* I'm not entirely sure how the branchstack works, but we probably
       * need to add at least one entry for the divergence which is resolved
       * at the end:
       */
      so->branchstack++;

      /* We don't update predecessors/successors, so we have to do this
       * manually:
       */
      mark_jp(last_block);
   }
}
7917e102996Smaya
7927ec681f3Smrg/* Insert nop's required to make this a legal/valid shader program: */
7937ec681f3Smrgstatic void
7947ec681f3Smrgnop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
7957ec681f3Smrg{
7967ec681f3Smrg   foreach_block (block, &ir->block_list) {
7977ec681f3Smrg      struct ir3_instruction *last = NULL;
7987ec681f3Smrg      struct list_head instr_list;
7997ec681f3Smrg
8007ec681f3Smrg      /* remove all the instructions from the list, we'll be adding
8017ec681f3Smrg       * them back in as we go
8027ec681f3Smrg       */
8037ec681f3Smrg      list_replace(&block->instr_list, &instr_list);
8047ec681f3Smrg      list_inithead(&block->instr_list);
8057ec681f3Smrg
8067ec681f3Smrg      foreach_instr_safe (instr, &instr_list) {
8077ec681f3Smrg         unsigned delay = ir3_delay_calc_exact(block, instr, so->mergedregs);
8087ec681f3Smrg
8097ec681f3Smrg         /* NOTE: I think the nopN encoding works for a5xx and
8107ec681f3Smrg          * probably a4xx, but not a3xx.  So far only tested on
8117ec681f3Smrg          * a6xx.
8127ec681f3Smrg          */
8137ec681f3Smrg
8147ec681f3Smrg         if ((delay > 0) && (ir->compiler->gen >= 6) && last &&
8157ec681f3Smrg             ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) &&
8167ec681f3Smrg             (last->repeat == 0)) {
8177ec681f3Smrg            /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
8187ec681f3Smrg            unsigned transfer = MIN2(delay, 3 - last->nop);
8197ec681f3Smrg            last->nop += transfer;
8207ec681f3Smrg            delay -= transfer;
8217ec681f3Smrg         }
8227ec681f3Smrg
8237ec681f3Smrg         if ((delay > 0) && last && (last->opc == OPC_NOP)) {
8247ec681f3Smrg            /* the previous nop can encode at most 5 repeats: */
8257ec681f3Smrg            unsigned transfer = MIN2(delay, 5 - last->repeat);
8267ec681f3Smrg            last->repeat += transfer;
8277ec681f3Smrg            delay -= transfer;
8287ec681f3Smrg         }
8297ec681f3Smrg
8307ec681f3Smrg         if (delay > 0) {
8317ec681f3Smrg            debug_assert(delay <= 6);
8327ec681f3Smrg            ir3_NOP(block)->repeat = delay - 1;
8337ec681f3Smrg         }
8347ec681f3Smrg
8357ec681f3Smrg         list_addtail(&instr->node, &block->instr_list);
8367ec681f3Smrg         last = instr;
8377ec681f3Smrg      }
8387ec681f3Smrg   }
8397ec681f3Smrg}
8407e102996Smaya
8417ec681f3Smrgbool
8427ec681f3Smrgir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
8437ec681f3Smrg{
8447ec681f3Smrg   struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
8457ec681f3Smrg   bool mergedregs = so->mergedregs;
8467ec681f3Smrg   bool progress;
8477ec681f3Smrg
8487ec681f3Smrg   ctx->so = so;
8497ec681f3Smrg   ctx->max_bary = -1;
8507ec681f3Smrg   ctx->compiler = ir->compiler;
8517ec681f3Smrg   ctx->type = ir->type;
8527ec681f3Smrg
8537ec681f3Smrg   /* allocate per-block data: */
8547ec681f3Smrg   foreach_block (block, &ir->block_list) {
8557ec681f3Smrg      struct ir3_legalize_block_data *bd =
8567ec681f3Smrg         rzalloc(ctx, struct ir3_legalize_block_data);
8577ec681f3Smrg
8587ec681f3Smrg      regmask_init(&bd->state.needs_ss_war, mergedregs);
8597ec681f3Smrg      regmask_init(&bd->state.needs_ss, mergedregs);
8607ec681f3Smrg      regmask_init(&bd->state.needs_sy, mergedregs);
8617ec681f3Smrg
8627ec681f3Smrg      block->data = bd;
8637ec681f3Smrg   }
8647ec681f3Smrg
8657ec681f3Smrg   ir3_remove_nops(ir);
8667ec681f3Smrg
8677ec681f3Smrg   /* We may have failed to pull all input loads into the first block.
8687ec681f3Smrg    * In such case at the moment we aren't able to find a better place
8697ec681f3Smrg    * to for (ei) than the end of the program.
8707ec681f3Smrg    * a5xx and a6xx do automatically release varying storage at the end.
8717ec681f3Smrg    */
8727ec681f3Smrg   ctx->early_input_release = true;
8737ec681f3Smrg   struct ir3_block *start_block = ir3_start_block(ir);
8747ec681f3Smrg   foreach_block (block, &ir->block_list) {
8757ec681f3Smrg      foreach_instr (instr, &block->instr_list) {
8767ec681f3Smrg         if (is_input(instr) && block != start_block) {
8777ec681f3Smrg            ctx->early_input_release = false;
8787ec681f3Smrg            break;
8797ec681f3Smrg         }
8807ec681f3Smrg      }
8817ec681f3Smrg   }
8827ec681f3Smrg
8837ec681f3Smrg   assert(ctx->early_input_release || ctx->compiler->gen >= 5);
8847ec681f3Smrg
8857ec681f3Smrg   /* process each block: */
8867ec681f3Smrg   do {
8877ec681f3Smrg      progress = false;
8887ec681f3Smrg      foreach_block (block, &ir->block_list) {
8897ec681f3Smrg         progress |= legalize_block(ctx, block);
8907ec681f3Smrg      }
8917ec681f3Smrg   } while (progress);
8927ec681f3Smrg
8937ec681f3Smrg   *max_bary = ctx->max_bary;
8947ec681f3Smrg
8957ec681f3Smrg   block_sched(ir);
8967ec681f3Smrg   if (so->type == MESA_SHADER_FRAGMENT)
8977ec681f3Smrg      kill_sched(ir, so);
8987ec681f3Smrg
8997ec681f3Smrg   foreach_block (block, &ir->block_list) {
9007ec681f3Smrg      progress |= apply_fine_deriv_macro(ctx, block);
9017ec681f3Smrg   }
9027ec681f3Smrg
9037ec681f3Smrg   nop_sched(ir, so);
9047ec681f3Smrg
9057ec681f3Smrg   while (opt_jump(ir))
9067ec681f3Smrg      ;
9077ec681f3Smrg
9087ec681f3Smrg   ir3_count_instructions(ir);
9097ec681f3Smrg   resolve_jumps(ir);
9107ec681f3Smrg
9117ec681f3Smrg   mark_xvergence_points(ir);
9127ec681f3Smrg
9137ec681f3Smrg   ralloc_free(ctx);
9147ec681f3Smrg
9157ec681f3Smrg   return true;
9167e102996Smaya}
917