17ec681f3Smrg/*
27ec681f3Smrg * Copyright (C) 2019 Google, Inc.
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
217ec681f3Smrg * SOFTWARE.
227ec681f3Smrg *
237ec681f3Smrg * Authors:
247ec681f3Smrg *    Rob Clark <robclark@freedesktop.org>
257ec681f3Smrg */
267ec681f3Smrg
277ec681f3Smrg#include "util/dag.h"
287ec681f3Smrg#include "util/u_math.h"
297ec681f3Smrg
307ec681f3Smrg#include "ir3.h"
317ec681f3Smrg#include "ir3_compiler.h"
327ec681f3Smrg#include "ir3_context.h"
337ec681f3Smrg
347ec681f3Smrg#ifdef DEBUG
357ec681f3Smrg#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
367ec681f3Smrg#else
377ec681f3Smrg#define SCHED_DEBUG 0
387ec681f3Smrg#endif
397ec681f3Smrg#define d(fmt, ...)                                                            \
407ec681f3Smrg   do {                                                                        \
417ec681f3Smrg      if (SCHED_DEBUG) {                                                       \
427ec681f3Smrg         mesa_logi("PSCHED: " fmt, ##__VA_ARGS__);                             \
437ec681f3Smrg      }                                                                        \
447ec681f3Smrg   } while (0)
457ec681f3Smrg
467ec681f3Smrg#define di(instr, fmt, ...)                                                    \
477ec681f3Smrg   do {                                                                        \
487ec681f3Smrg      if (SCHED_DEBUG) {                                                       \
497ec681f3Smrg         struct log_stream *stream = mesa_log_streami();                       \
507ec681f3Smrg         mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__);   \
517ec681f3Smrg         ir3_print_instr_stream(stream, instr);                                \
527ec681f3Smrg         mesa_log_stream_destroy(stream);                                      \
537ec681f3Smrg      }                                                                        \
547ec681f3Smrg   } while (0)
557ec681f3Smrg
567ec681f3Smrg/*
577ec681f3Smrg * Post RA Instruction Scheduling
587ec681f3Smrg */
597ec681f3Smrg
/* Per-shader scheduling context; block-local state is reset by sched_block(). */
struct ir3_postsched_ctx {
   struct ir3 *ir;

   struct ir3_shader_variant *v;

   void *mem_ctx;           /* ralloc parent for DAG + sched nodes, freed per block */
   struct ir3_block *block; /* the current block */
   struct dag *dag;

   struct list_head unscheduled_list; /* unscheduled instructions */

   /* Countdown of instructions since the last SFU (resp. tex) producer.
    * While non-zero, consuming that producer's result would require an
    * (ss)/(sy) sync.  Maintained in schedule(), consulted by would_sync().
    */
   int sfu_delay;
   int tex_delay;
};
747ec681f3Smrg
/* Scheduler DAG node, one per instruction (linked via instr->data). */
struct ir3_postsched_node {
   struct dag_node dag; /* must be first for util_dynarray_foreach */
   struct ir3_instruction *instr;
   bool partially_evaluated_path;

   /* Set during forward dep calculation if some src is produced by a
    * tex/prefetch (resp. SFU) instruction, ie. consuming it needs a sync:
    */
   bool has_tex_src, has_sfu_src;

   unsigned delay;     /* max delayslots required from any src producer */
   unsigned max_delay; /* accumulated delay along the longest path to a leaf */
};
857ec681f3Smrg
867ec681f3Smrg#define foreach_sched_node(__n, __list)                                        \
877ec681f3Smrg   list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link)
887ec681f3Smrg
897ec681f3Smrgstatic bool
907ec681f3Smrghas_tex_src(struct ir3_instruction *instr)
917ec681f3Smrg{
927ec681f3Smrg   struct ir3_postsched_node *node = instr->data;
937ec681f3Smrg   return node->has_tex_src;
947ec681f3Smrg}
957ec681f3Smrg
967ec681f3Smrgstatic bool
977ec681f3Smrghas_sfu_src(struct ir3_instruction *instr)
987ec681f3Smrg{
997ec681f3Smrg   struct ir3_postsched_node *node = instr->data;
1007ec681f3Smrg   return node->has_sfu_src;
1017ec681f3Smrg}
1027ec681f3Smrg
/* Commit one instruction: move it from the unscheduled list to the end of
 * the block, prune it from the DAG, and update the sfu/tex hazard counters
 * that would_sync() consults.
 */
static void
schedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
{
   debug_assert(ctx->block == instr->block);

   /* remove from unscheduled_list:
    */
   list_delinit(&instr->node);

   di(instr, "schedule");

   list_addtail(&instr->node, &instr->block->instr_list);

   struct ir3_postsched_node *n = instr->data;
   dag_prune_head(ctx->dag, &n->dag);

   /* Meta instructions don't consume real cycles, except tex-prefetch
    * which acts like a tex for hazard-tracking purposes:
    */
   if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH))
      return;

   if (is_sfu(instr)) {
      /* fresh SFU result: it won't be ready without (ss) for a while */
      ctx->sfu_delay = 8;
   } else if (has_sfu_src(instr)) {
      /* this consumer forces the sync anyway, hazard resolved */
      ctx->sfu_delay = 0;
   } else if (ctx->sfu_delay > 0) {
      ctx->sfu_delay--;
   }

   /* same scheme for tex results and the (sy) bit: */
   if (is_tex_or_prefetch(instr)) {
      ctx->tex_delay = 10;
   } else if (has_tex_src(instr)) {
      ctx->tex_delay = 0;
   } else if (ctx->tex_delay > 0) {
      ctx->tex_delay--;
   }
}
1387ec681f3Smrg
1397ec681f3Smrgstatic void
1407ec681f3Smrgdump_state(struct ir3_postsched_ctx *ctx)
1417ec681f3Smrg{
1427ec681f3Smrg   if (!SCHED_DEBUG)
1437ec681f3Smrg      return;
1447ec681f3Smrg
1457ec681f3Smrg   foreach_sched_node (n, &ctx->dag->heads) {
1467ec681f3Smrg      di(n->instr, "maxdel=%3d    ", n->max_delay);
1477ec681f3Smrg
1487ec681f3Smrg      util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
1497ec681f3Smrg         struct ir3_postsched_node *child =
1507ec681f3Smrg            (struct ir3_postsched_node *)edge->child;
1517ec681f3Smrg
1527ec681f3Smrg         di(child->instr, " -> (%d parents) ", child->dag.parent_count);
1537ec681f3Smrg      }
1547ec681f3Smrg   }
1557ec681f3Smrg}
1567ec681f3Smrg
1577ec681f3Smrg/* Determine if this is an instruction that we'd prefer not to schedule
1587ec681f3Smrg * yet, in order to avoid an (ss) sync.  This is limited by the sfu_delay
1597ec681f3Smrg * counter, ie. the more cycles it has been since the last SFU, the less
1607ec681f3Smrg * costly a sync would be.
1617ec681f3Smrg */
1627ec681f3Smrgstatic bool
1637ec681f3Smrgwould_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
1647ec681f3Smrg{
1657ec681f3Smrg   if (ctx->sfu_delay) {
1667ec681f3Smrg      if (has_sfu_src(instr))
1677ec681f3Smrg         return true;
1687ec681f3Smrg   }
1697ec681f3Smrg
1707ec681f3Smrg   if (ctx->tex_delay) {
1717ec681f3Smrg      if (has_tex_src(instr))
1727ec681f3Smrg         return true;
1737ec681f3Smrg   }
1747ec681f3Smrg
1757ec681f3Smrg   return false;
1767ec681f3Smrg}
1777ec681f3Smrg
/* find instruction to schedule:
 *
 * Walks a fixed cascade of priorities over the current DAG heads; within
 * each phase, ties are broken by the larger max_delay (longest critical
 * path first).  Returns NULL only when the DAG is empty.
 */
static struct ir3_instruction *
choose_instr(struct ir3_postsched_ctx *ctx)
{
   struct ir3_postsched_node *chosen = NULL;

   dump_state(ctx);

   /* Phase 1: meta instructions are free, schedule them first: */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_meta(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (meta)");
      return chosen->instr;
   }

   /* Try to schedule inputs with a higher priority, if possible, as
    * the last bary.f unlocks varying storage to unblock more VS
    * warps.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!is_input(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "prio: chose (input)");
      return chosen->instr;
   }

   /* Next prioritize discards: */
   foreach_sched_node (n, &ctx->dag->heads) {
      /* "hard ready": zero delay even without relying on sync bits */
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!is_kill_or_demote(n->instr))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (kill, hard ready)");
      return chosen->instr;
   }

   /* Next prioritize expensive instructions: */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!(is_sfu(n->instr) || is_tex(n->instr)))
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (sfu/tex, hard ready)");
      return chosen->instr;
   }

   /*
    * Sometimes be better to take a nop, rather than scheduling an
    * instruction that would require an (ss) shortly after another
    * SFU..  ie. if last SFU was just one or two instr ago, and we
    * could choose between taking a nop and then scheduling
    * something else, vs scheduling the immed avail instruction that
    * would require (ss), we are better with the nop.
    */
   for (unsigned delay = 0; delay < 4; delay++) {
      foreach_sched_node (n, &ctx->dag->heads) {
         if (would_sync(ctx, n->instr))
            continue;

         /* "soft" delay: counts sync-able (tex/sfu) latency too */
         unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true,
                                            ctx->v->mergedregs);

         if (d > delay)
            continue;

         if (!chosen || (chosen->max_delay < n->max_delay))
            chosen = n;
      }

      if (chosen) {
         di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay);
         return chosen->instr;
      }
   }

   /* Next try to find a ready leader w/ soft delay (ie. including extra
    * delay for things like tex fetch which can be synchronized w/ sync
    * bit (but we probably do want to schedule some other instructions
    * while we wait)
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (soft ready)");
      return chosen->instr;
   }

   /* Next try to find a ready leader that can be scheduled without nop's,
    * which in the case of things that need (sy)/(ss) could result in
    * stalls.. but we've already decided there is not a better option.
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      unsigned d =
         ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);

      if (d > 0)
         continue;

      if (!chosen || (chosen->max_delay < n->max_delay))
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (hard ready)");
      return chosen->instr;
   }

   /* Otherwise choose leader with maximum cost:
    *
    * TODO should we try to balance cost and delays?  I guess it is
    * a balance between now-nop's and future-nop's?
    */
   foreach_sched_node (n, &ctx->dag->heads) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (chosen) {
      di(chosen->instr, "csp: chose (leader)");
      return chosen->instr;
   }

   return NULL;
}
3437ec681f3Smrg
/* State for one dependency-calculation pass over a block. */
struct ir3_postsched_deps_state {
   struct ir3_postsched_ctx *ctx;

   /* Whether the block is being walked forward (F) or in reverse (R);
    * add_dep() flips edge direction accordingly.
    */
   enum { F, R } direction;

   bool merged; /* half/full register files merged (conflict), ie. a6xx+ */

   /* Track the mapping between sched node (instruction) that last
    * wrote a given register (in whichever direction we are iterating
    * the block)
    *
    * Note, this table is twice as big as the # of regs, to deal with
    * half-precision regs.  The approach differs depending on whether
    * the half and full precision register files are "merged" (conflict,
    * ie. a6xx+) in which case we consider each full precision dep
    * as two half-precision dependencies, vs older separate (non-
    * conflicting) in which case the first half of the table is used
    * for full precision and 2nd half for half-precision.
    */
   struct ir3_postsched_node *regs[2 * 256];
};
3657ec681f3Smrg
3667ec681f3Smrg/* bounds checking read/write accessors, since OoB access to stuff on
3677ec681f3Smrg * the stack is gonna cause a bad day.
3687ec681f3Smrg */
3697ec681f3Smrg#define dep_reg(state, idx)                                                    \
3707ec681f3Smrg   *({                                                                         \
3717ec681f3Smrg      assert((idx) < ARRAY_SIZE((state)->regs));                               \
3727ec681f3Smrg      &(state)->regs[(idx)];                                                   \
3737ec681f3Smrg   })
3747ec681f3Smrg
3757ec681f3Smrgstatic void
3767ec681f3Smrgadd_dep(struct ir3_postsched_deps_state *state,
3777ec681f3Smrg        struct ir3_postsched_node *before, struct ir3_postsched_node *after)
3787ec681f3Smrg{
3797ec681f3Smrg   if (!before || !after)
3807ec681f3Smrg      return;
3817ec681f3Smrg
3827ec681f3Smrg   assert(before != after);
3837ec681f3Smrg
3847ec681f3Smrg   if (state->direction == F) {
3857ec681f3Smrg      dag_add_edge(&before->dag, &after->dag, NULL);
3867ec681f3Smrg   } else {
3877ec681f3Smrg      dag_add_edge(&after->dag, &before->dag, NULL);
3887ec681f3Smrg   }
3897ec681f3Smrg}
3907ec681f3Smrg
3917ec681f3Smrgstatic void
3927ec681f3Smrgadd_single_reg_dep(struct ir3_postsched_deps_state *state,
3937ec681f3Smrg                   struct ir3_postsched_node *node, unsigned num, int src_n)
3947ec681f3Smrg{
3957ec681f3Smrg   struct ir3_postsched_node *dep = dep_reg(state, num);
3967ec681f3Smrg
3977ec681f3Smrg   if (src_n >= 0 && dep && state->direction == F) {
3987ec681f3Smrg      unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true);
3997ec681f3Smrg      node->delay = MAX2(node->delay, d);
4007ec681f3Smrg      if (is_tex_or_prefetch(dep->instr))
4017ec681f3Smrg         node->has_tex_src = true;
4027ec681f3Smrg      if (is_tex_or_prefetch(dep->instr))
4037ec681f3Smrg         node->has_sfu_src = true;
4047ec681f3Smrg   }
4057ec681f3Smrg
4067ec681f3Smrg   add_dep(state, dep, node);
4077ec681f3Smrg   if (src_n < 0) {
4087ec681f3Smrg      dep_reg(state, num) = node;
4097ec681f3Smrg   }
4107ec681f3Smrg}
4117ec681f3Smrg
4127ec681f3Smrg/* This is where we handled full vs half-precision, and potential conflicts
4137ec681f3Smrg * between half and full precision that result in additional dependencies.
4147ec681f3Smrg * The 'reg' arg is really just to know half vs full precision.
4157ec681f3Smrg *
4167ec681f3Smrg * If non-negative, then this adds a dependency on a source register, and
4177ec681f3Smrg * src_n is the index passed into ir3_delayslots() for calculating the delay:
4187ec681f3Smrg * If positive, corresponds to node->instr->regs[src_n]. If negative, then
4197ec681f3Smrg * this is for a destination register.
4207ec681f3Smrg */
4217ec681f3Smrgstatic void
4227ec681f3Smrgadd_reg_dep(struct ir3_postsched_deps_state *state,
4237ec681f3Smrg            struct ir3_postsched_node *node, const struct ir3_register *reg,
4247ec681f3Smrg            unsigned num, int src_n)
4257ec681f3Smrg{
4267ec681f3Smrg   if (state->merged) {
4277ec681f3Smrg      /* Make sure that special registers like a0.x that are written as
4287ec681f3Smrg       * half-registers don't alias random full registers by pretending that
4297ec681f3Smrg       * they're full registers:
4307ec681f3Smrg       */
4317ec681f3Smrg      if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) {
4327ec681f3Smrg         /* single conflict in half-reg space: */
4337ec681f3Smrg         add_single_reg_dep(state, node, num, src_n);
4347ec681f3Smrg      } else {
4357ec681f3Smrg         /* two conflicts in half-reg space: */
4367ec681f3Smrg         add_single_reg_dep(state, node, 2 * num + 0, src_n);
4377ec681f3Smrg         add_single_reg_dep(state, node, 2 * num + 1, src_n);
4387ec681f3Smrg      }
4397ec681f3Smrg   } else {
4407ec681f3Smrg      if (reg->flags & IR3_REG_HALF)
4417ec681f3Smrg         num += ARRAY_SIZE(state->regs) / 2;
4427ec681f3Smrg      add_single_reg_dep(state, node, num, src_n);
4437ec681f3Smrg   }
4447ec681f3Smrg}
4457ec681f3Smrg
/* Add register dependencies for one instruction: first against the last
 * writers of its sources, then record its destinations as the new last
 * writers.  Works in either direction depending on state->direction.
 */
static void
calculate_deps(struct ir3_postsched_deps_state *state,
               struct ir3_postsched_node *node)
{
   /* Add dependencies on instructions that previously (or next,
    * in the reverse direction) wrote any of our src registers:
    */
   foreach_src_n (reg, i, node->instr) {
      /* consts and immediates are not register-file reads: */
      if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
         continue;

      if (reg->flags & IR3_REG_RELATIV) {
         /* mark entire array as read: */
         for (unsigned j = 0; j < reg->size; j++) {
            add_reg_dep(state, node, reg, reg->array.base + j, i);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, i);
         }
      }
   }

   /* And then after we update the state for what this instruction
    * wrote:
    */
   foreach_dst (reg, node->instr) {
      if (reg->wrmask == 0)
         continue;
      if (reg->flags & IR3_REG_RELATIV) {
         /* mark the entire array as written: */
         for (unsigned i = 0; i < reg->size; i++) {
            add_reg_dep(state, node, reg, reg->array.base + i, -1);
         }
      } else {
         assert(reg->wrmask >= 1);
         u_foreach_bit (b, reg->wrmask) {
            add_reg_dep(state, node, reg, reg->num + b, -1);
         }
      }
   }
}
4897ec681f3Smrg
4907ec681f3Smrgstatic void
4917ec681f3Smrgcalculate_forward_deps(struct ir3_postsched_ctx *ctx)
4927ec681f3Smrg{
4937ec681f3Smrg   struct ir3_postsched_deps_state state = {
4947ec681f3Smrg      .ctx = ctx,
4957ec681f3Smrg      .direction = F,
4967ec681f3Smrg      .merged = ctx->v->mergedregs,
4977ec681f3Smrg   };
4987ec681f3Smrg
4997ec681f3Smrg   foreach_instr (instr, &ctx->unscheduled_list) {
5007ec681f3Smrg      calculate_deps(&state, instr->data);
5017ec681f3Smrg   }
5027ec681f3Smrg}
5037ec681f3Smrg
5047ec681f3Smrgstatic void
5057ec681f3Smrgcalculate_reverse_deps(struct ir3_postsched_ctx *ctx)
5067ec681f3Smrg{
5077ec681f3Smrg   struct ir3_postsched_deps_state state = {
5087ec681f3Smrg      .ctx = ctx,
5097ec681f3Smrg      .direction = R,
5107ec681f3Smrg      .merged = ctx->v->mergedregs,
5117ec681f3Smrg   };
5127ec681f3Smrg
5137ec681f3Smrg   foreach_instr_rev (instr, &ctx->unscheduled_list) {
5147ec681f3Smrg      calculate_deps(&state, instr->data);
5157ec681f3Smrg   }
5167ec681f3Smrg}
5177ec681f3Smrg
5187ec681f3Smrgstatic void
5197ec681f3Smrgsched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr)
5207ec681f3Smrg{
5217ec681f3Smrg   struct ir3_postsched_node *n =
5227ec681f3Smrg      rzalloc(ctx->mem_ctx, struct ir3_postsched_node);
5237ec681f3Smrg
5247ec681f3Smrg   dag_init_node(ctx->dag, &n->dag);
5257ec681f3Smrg
5267ec681f3Smrg   n->instr = instr;
5277ec681f3Smrg   instr->data = n;
5287ec681f3Smrg}
5297ec681f3Smrg
5307ec681f3Smrgstatic void
5317ec681f3Smrgsched_dag_max_delay_cb(struct dag_node *node, void *state)
5327ec681f3Smrg{
5337ec681f3Smrg   struct ir3_postsched_node *n = (struct ir3_postsched_node *)node;
5347ec681f3Smrg   uint32_t max_delay = 0;
5357ec681f3Smrg
5367ec681f3Smrg   util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
5377ec681f3Smrg      struct ir3_postsched_node *child =
5387ec681f3Smrg         (struct ir3_postsched_node *)edge->child;
5397ec681f3Smrg      max_delay = MAX2(child->max_delay, max_delay);
5407ec681f3Smrg   }
5417ec681f3Smrg
5427ec681f3Smrg   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
5437ec681f3Smrg}
5447ec681f3Smrg
5457ec681f3Smrgstatic void
5467ec681f3Smrgsched_dag_init(struct ir3_postsched_ctx *ctx)
5477ec681f3Smrg{
5487ec681f3Smrg   ctx->mem_ctx = ralloc_context(NULL);
5497ec681f3Smrg
5507ec681f3Smrg   ctx->dag = dag_create(ctx->mem_ctx);
5517ec681f3Smrg
5527ec681f3Smrg   foreach_instr (instr, &ctx->unscheduled_list)
5537ec681f3Smrg      sched_node_init(ctx, instr);
5547ec681f3Smrg
5557ec681f3Smrg   calculate_forward_deps(ctx);
5567ec681f3Smrg   calculate_reverse_deps(ctx);
5577ec681f3Smrg
5587ec681f3Smrg   /*
5597ec681f3Smrg    * To avoid expensive texture fetches, etc, from being moved ahead
5607ec681f3Smrg    * of kills, track the kills we've seen so far, so we can add an
5617ec681f3Smrg    * extra dependency on them for tex/mem instructions
5627ec681f3Smrg    */
5637ec681f3Smrg   struct util_dynarray kills;
5647ec681f3Smrg   util_dynarray_init(&kills, ctx->mem_ctx);
5657ec681f3Smrg
5667ec681f3Smrg   /* The last bary.f with the (ei) flag must be scheduled before any kills,
5677ec681f3Smrg    * or the hw gets angry. Keep track of inputs here so we can add the
5687ec681f3Smrg    * false dep on the kill instruction.
5697ec681f3Smrg    */
5707ec681f3Smrg   struct util_dynarray inputs;
5717ec681f3Smrg   util_dynarray_init(&inputs, ctx->mem_ctx);
5727ec681f3Smrg
5737ec681f3Smrg   /*
5747ec681f3Smrg    * Normal srcs won't be in SSA at this point, those are dealt with in
5757ec681f3Smrg    * calculate_forward_deps() and calculate_reverse_deps().  But we still
5767ec681f3Smrg    * have the false-dep information in SSA form, so go ahead and add
5777ec681f3Smrg    * dependencies for that here:
5787ec681f3Smrg    */
5797ec681f3Smrg   foreach_instr (instr, &ctx->unscheduled_list) {
5807ec681f3Smrg      struct ir3_postsched_node *n = instr->data;
5817ec681f3Smrg
5827ec681f3Smrg      foreach_ssa_src_n (src, i, instr) {
5837ec681f3Smrg         if (src->block != instr->block)
5847ec681f3Smrg            continue;
5857ec681f3Smrg
5867ec681f3Smrg         /* we can end up with unused false-deps.. just skip them: */
5877ec681f3Smrg         if (src->flags & IR3_INSTR_UNUSED)
5887ec681f3Smrg            continue;
5897ec681f3Smrg
5907ec681f3Smrg         struct ir3_postsched_node *sn = src->data;
5917ec681f3Smrg
5927ec681f3Smrg         /* don't consider dependencies in other blocks: */
5937ec681f3Smrg         if (src->block != instr->block)
5947ec681f3Smrg            continue;
5957ec681f3Smrg
5967ec681f3Smrg         dag_add_edge(&sn->dag, &n->dag, NULL);
5977ec681f3Smrg      }
5987ec681f3Smrg
5997ec681f3Smrg      if (is_input(instr)) {
6007ec681f3Smrg         util_dynarray_append(&inputs, struct ir3_instruction *, instr);
6017ec681f3Smrg      } else if (is_kill_or_demote(instr)) {
6027ec681f3Smrg         util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) {
6037ec681f3Smrg            struct ir3_instruction *input = *instrp;
6047ec681f3Smrg            struct ir3_postsched_node *in = input->data;
6057ec681f3Smrg            dag_add_edge(&in->dag, &n->dag, NULL);
6067ec681f3Smrg         }
6077ec681f3Smrg         util_dynarray_append(&kills, struct ir3_instruction *, instr);
6087ec681f3Smrg      } else if (is_tex(instr) || is_mem(instr)) {
6097ec681f3Smrg         util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) {
6107ec681f3Smrg            struct ir3_instruction *kill = *instrp;
6117ec681f3Smrg            struct ir3_postsched_node *kn = kill->data;
6127ec681f3Smrg            dag_add_edge(&kn->dag, &n->dag, NULL);
6137ec681f3Smrg         }
6147ec681f3Smrg      }
6157ec681f3Smrg   }
6167ec681f3Smrg
6177ec681f3Smrg   // TODO do we want to do this after reverse-dependencies?
6187ec681f3Smrg   dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
6197ec681f3Smrg}
6207ec681f3Smrg
6217ec681f3Smrgstatic void
6227ec681f3Smrgsched_dag_destroy(struct ir3_postsched_ctx *ctx)
6237ec681f3Smrg{
6247ec681f3Smrg   ralloc_free(ctx->mem_ctx);
6257ec681f3Smrg   ctx->mem_ctx = NULL;
6267ec681f3Smrg   ctx->dag = NULL;
6277ec681f3Smrg}
6287ec681f3Smrg
/* Schedule a single block: drain its instruction list, build the dep DAG,
 * then repeatedly pick the best ready instruction (inserting nop's for any
 * remaining hard delay) until everything is scheduled.
 */
static void
sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
{
   ctx->block = block;
   ctx->tex_delay = 0;
   ctx->sfu_delay = 0;

   /* move all instructions to the unscheduled list, and
    * empty the block's instruction list (to which we will
    * be inserting).
    */
   list_replace(&block->instr_list, &ctx->unscheduled_list);
   list_inithead(&block->instr_list);

   // TODO once we are using post-sched for everything we can
   // just not stick in NOP's prior to post-sched, and drop this.
   // for now keep this, since it makes post-sched optional:
   foreach_instr_safe (instr, &ctx->unscheduled_list) {
      switch (instr->opc) {
      case OPC_NOP:
      case OPC_B:
      case OPC_JUMP:
         list_delinit(&instr->node);
         break;
      default:
         break;
      }
   }

   sched_dag_init(ctx);

   /* First schedule all meta:input instructions, followed by
    * tex-prefetch.  We want all of the instructions that load
    * values into registers before the shader starts to go
    * before any other instructions.  But in particular we
    * want inputs to come before prefetches.  This is because
    * a FS's bary_ij input may not actually be live in the
    * shader, but it should not be scheduled on top of any
    * other input (but can be overwritten by a tex prefetch)
    */
   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_INPUT)
         schedule(ctx, instr);

   foreach_instr_safe (instr, &ctx->unscheduled_list)
      if (instr->opc == OPC_META_TEX_PREFETCH)
         schedule(ctx, instr);

   while (!list_is_empty(&ctx->unscheduled_list)) {
      struct ir3_instruction *instr = choose_instr(ctx);

      /* hard delay (sync-bit latency excluded) still outstanding: */
      unsigned delay =
         ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
      d("delay=%u", delay);

      /* and if we run out of instructions that can be scheduled,
       * then it is time for nop's:
       */
      debug_assert(delay <= 6);
      while (delay > 0) {
         ir3_NOP(block);
         delay--;
      }

      schedule(ctx, instr);
   }

   sched_dag_destroy(ctx);
}
6987ec681f3Smrg
6997ec681f3Smrgstatic bool
7007ec681f3Smrgis_self_mov(struct ir3_instruction *instr)
7017ec681f3Smrg{
7027ec681f3Smrg   if (!is_same_type_mov(instr))
7037ec681f3Smrg      return false;
7047ec681f3Smrg
7057ec681f3Smrg   if (instr->dsts[0]->num != instr->srcs[0]->num)
7067ec681f3Smrg      return false;
7077ec681f3Smrg
7087ec681f3Smrg   if (instr->dsts[0]->flags & IR3_REG_RELATIV)
7097ec681f3Smrg      return false;
7107ec681f3Smrg
7117ec681f3Smrg   if (instr->cat1.round != ROUND_ZERO)
7127ec681f3Smrg      return false;
7137ec681f3Smrg
7147ec681f3Smrg   if (instr->srcs[0]->flags &
7157ec681f3Smrg       (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG |
7167ec681f3Smrg        IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT))
7177ec681f3Smrg      return false;
7187ec681f3Smrg
7197ec681f3Smrg   return true;
7207ec681f3Smrg}
7217ec681f3Smrg
7227ec681f3Smrg/* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y
7237ec681f3Smrg * as a result of places were before RA we are not sure that it is
7247ec681f3Smrg * safe to eliminate.  We could eliminate these earlier, but sometimes
7257ec681f3Smrg * they are tangled up in false-dep's, etc, so it is easier just to
7267ec681f3Smrg * let them exist until after RA
7277ec681f3Smrg */
7287ec681f3Smrgstatic void
7297ec681f3Smrgcleanup_self_movs(struct ir3 *ir)
7307ec681f3Smrg{
7317ec681f3Smrg   foreach_block (block, &ir->block_list) {
7327ec681f3Smrg      foreach_instr_safe (instr, &block->instr_list) {
7337ec681f3Smrg         for (unsigned i = 0; i < instr->deps_count; i++) {
7347ec681f3Smrg            if (instr->deps[i] && is_self_mov(instr->deps[i])) {
7357ec681f3Smrg               instr->deps[i] = NULL;
7367ec681f3Smrg            }
7377ec681f3Smrg         }
7387ec681f3Smrg
7397ec681f3Smrg         if (is_self_mov(instr))
7407ec681f3Smrg            list_delinit(&instr->node);
7417ec681f3Smrg      }
7427ec681f3Smrg   }
7437ec681f3Smrg}
7447ec681f3Smrg
7457ec681f3Smrgbool
7467ec681f3Smrgir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v)
7477ec681f3Smrg{
7487ec681f3Smrg   struct ir3_postsched_ctx ctx = {
7497ec681f3Smrg      .ir = ir,
7507ec681f3Smrg      .v = v,
7517ec681f3Smrg   };
7527ec681f3Smrg
7537ec681f3Smrg   ir3_remove_nops(ir);
7547ec681f3Smrg   cleanup_self_movs(ir);
7557ec681f3Smrg
7567ec681f3Smrg   foreach_block (block, &ir->block_list) {
7577ec681f3Smrg      sched_block(&ctx, block);
7587ec681f3Smrg   }
7597ec681f3Smrg
7607ec681f3Smrg   return true;
7617ec681f3Smrg}
762