17ec681f3Smrg/* 27ec681f3Smrg * Copyright (C) 2019 Google, Inc. 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217ec681f3Smrg * SOFTWARE. 227ec681f3Smrg * 237ec681f3Smrg * Authors: 247ec681f3Smrg * Rob Clark <robclark@freedesktop.org> 257ec681f3Smrg */ 267ec681f3Smrg 277ec681f3Smrg#include "util/dag.h" 287ec681f3Smrg#include "util/u_math.h" 297ec681f3Smrg 307ec681f3Smrg#include "ir3.h" 317ec681f3Smrg#include "ir3_compiler.h" 327ec681f3Smrg#include "ir3_context.h" 337ec681f3Smrg 347ec681f3Smrg#ifdef DEBUG 357ec681f3Smrg#define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS) 367ec681f3Smrg#else 377ec681f3Smrg#define SCHED_DEBUG 0 387ec681f3Smrg#endif 397ec681f3Smrg#define d(fmt, ...) \ 407ec681f3Smrg do { \ 417ec681f3Smrg if (SCHED_DEBUG) { \ 427ec681f3Smrg mesa_logi("PSCHED: " fmt, ##__VA_ARGS__); \ 437ec681f3Smrg } \ 447ec681f3Smrg } while (0) 457ec681f3Smrg 467ec681f3Smrg#define di(instr, fmt, ...) \ 477ec681f3Smrg do { \ 487ec681f3Smrg if (SCHED_DEBUG) { \ 497ec681f3Smrg struct log_stream *stream = mesa_log_streami(); \ 507ec681f3Smrg mesa_log_stream_printf(stream, "PSCHED: " fmt ": ", ##__VA_ARGS__); \ 517ec681f3Smrg ir3_print_instr_stream(stream, instr); \ 527ec681f3Smrg mesa_log_stream_destroy(stream); \ 537ec681f3Smrg } \ 547ec681f3Smrg } while (0) 557ec681f3Smrg 567ec681f3Smrg/* 577ec681f3Smrg * Post RA Instruction Scheduling 587ec681f3Smrg */ 597ec681f3Smrg 607ec681f3Smrgstruct ir3_postsched_ctx { 617ec681f3Smrg struct ir3 *ir; 627ec681f3Smrg 637ec681f3Smrg struct ir3_shader_variant *v; 647ec681f3Smrg 657ec681f3Smrg void *mem_ctx; 667ec681f3Smrg struct ir3_block *block; /* the current block */ 677ec681f3Smrg struct dag *dag; 687ec681f3Smrg 697ec681f3Smrg struct list_head unscheduled_list; /* unscheduled instructions */ 707ec681f3Smrg 717ec681f3Smrg int sfu_delay; 727ec681f3Smrg int tex_delay; 737ec681f3Smrg}; 747ec681f3Smrg 757ec681f3Smrgstruct ir3_postsched_node { 767ec681f3Smrg struct dag_node dag; /* must be first for util_dynarray_foreach */ 777ec681f3Smrg struct ir3_instruction *instr; 787ec681f3Smrg bool partially_evaluated_path; 797ec681f3Smrg 807ec681f3Smrg bool has_tex_src, has_sfu_src; 817ec681f3Smrg 827ec681f3Smrg unsigned delay; 837ec681f3Smrg unsigned max_delay; 847ec681f3Smrg}; 857ec681f3Smrg 867ec681f3Smrg#define foreach_sched_node(__n, __list) \ 877ec681f3Smrg list_for_each_entry (struct ir3_postsched_node, __n, __list, dag.link) 887ec681f3Smrg 897ec681f3Smrgstatic bool 907ec681f3Smrghas_tex_src(struct ir3_instruction *instr) 917ec681f3Smrg{ 927ec681f3Smrg struct ir3_postsched_node *node = instr->data; 937ec681f3Smrg return node->has_tex_src; 947ec681f3Smrg} 957ec681f3Smrg 967ec681f3Smrgstatic bool 977ec681f3Smrghas_sfu_src(struct ir3_instruction *instr) 987ec681f3Smrg{ 997ec681f3Smrg struct ir3_postsched_node *node = instr->data; 1007ec681f3Smrg return node->has_sfu_src; 1017ec681f3Smrg} 1027ec681f3Smrg 1037ec681f3Smrgstatic void 1047ec681f3Smrgschedule(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr) 1057ec681f3Smrg{ 1067ec681f3Smrg debug_assert(ctx->block == instr->block); 1077ec681f3Smrg 1087ec681f3Smrg /* remove from unscheduled_list: 1097ec681f3Smrg */ 1107ec681f3Smrg list_delinit(&instr->node); 1117ec681f3Smrg 1127ec681f3Smrg di(instr, "schedule"); 1137ec681f3Smrg 1147ec681f3Smrg list_addtail(&instr->node, &instr->block->instr_list); 1157ec681f3Smrg 1167ec681f3Smrg struct ir3_postsched_node *n = instr->data; 1177ec681f3Smrg dag_prune_head(ctx->dag, &n->dag); 1187ec681f3Smrg 1197ec681f3Smrg if (is_meta(instr) && (instr->opc != OPC_META_TEX_PREFETCH)) 1207ec681f3Smrg return; 1217ec681f3Smrg 1227ec681f3Smrg if (is_sfu(instr)) { 1237ec681f3Smrg ctx->sfu_delay = 8; 1247ec681f3Smrg } else if (has_sfu_src(instr)) { 1257ec681f3Smrg ctx->sfu_delay = 0; 1267ec681f3Smrg } else if (ctx->sfu_delay > 0) { 1277ec681f3Smrg ctx->sfu_delay--; 1287ec681f3Smrg } 1297ec681f3Smrg 1307ec681f3Smrg if (is_tex_or_prefetch(instr)) { 1317ec681f3Smrg ctx->tex_delay = 10; 1327ec681f3Smrg } else if (has_tex_src(instr)) { 1337ec681f3Smrg ctx->tex_delay = 0; 1347ec681f3Smrg } else if (ctx->tex_delay > 0) { 1357ec681f3Smrg ctx->tex_delay--; 1367ec681f3Smrg } 1377ec681f3Smrg} 1387ec681f3Smrg 1397ec681f3Smrgstatic void 1407ec681f3Smrgdump_state(struct ir3_postsched_ctx *ctx) 1417ec681f3Smrg{ 1427ec681f3Smrg if (!SCHED_DEBUG) 1437ec681f3Smrg return; 1447ec681f3Smrg 1457ec681f3Smrg foreach_sched_node (n, &ctx->dag->heads) { 1467ec681f3Smrg di(n->instr, "maxdel=%3d ", n->max_delay); 1477ec681f3Smrg 1487ec681f3Smrg util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) { 1497ec681f3Smrg struct ir3_postsched_node *child = 1507ec681f3Smrg (struct ir3_postsched_node *)edge->child; 1517ec681f3Smrg 1527ec681f3Smrg di(child->instr, " -> (%d parents) ", child->dag.parent_count); 1537ec681f3Smrg } 1547ec681f3Smrg } 1557ec681f3Smrg} 1567ec681f3Smrg 1577ec681f3Smrg/* Determine if this is an instruction that we'd prefer not to schedule 1587ec681f3Smrg * yet, in order to avoid an (ss) sync. This is limited by the sfu_delay 1597ec681f3Smrg * counter, ie. the more cycles it has been since the last SFU, the less 1607ec681f3Smrg * costly a sync would be. 1617ec681f3Smrg */ 1627ec681f3Smrgstatic bool 1637ec681f3Smrgwould_sync(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr) 1647ec681f3Smrg{ 1657ec681f3Smrg if (ctx->sfu_delay) { 1667ec681f3Smrg if (has_sfu_src(instr)) 1677ec681f3Smrg return true; 1687ec681f3Smrg } 1697ec681f3Smrg 1707ec681f3Smrg if (ctx->tex_delay) { 1717ec681f3Smrg if (has_tex_src(instr)) 1727ec681f3Smrg return true; 1737ec681f3Smrg } 1747ec681f3Smrg 1757ec681f3Smrg return false; 1767ec681f3Smrg} 1777ec681f3Smrg 1787ec681f3Smrg/* find instruction to schedule: */ 1797ec681f3Smrgstatic struct ir3_instruction * 1807ec681f3Smrgchoose_instr(struct ir3_postsched_ctx *ctx) 1817ec681f3Smrg{ 1827ec681f3Smrg struct ir3_postsched_node *chosen = NULL; 1837ec681f3Smrg 1847ec681f3Smrg dump_state(ctx); 1857ec681f3Smrg 1867ec681f3Smrg foreach_sched_node (n, &ctx->dag->heads) { 1877ec681f3Smrg if (!is_meta(n->instr)) 1887ec681f3Smrg continue; 1897ec681f3Smrg 1907ec681f3Smrg if (!chosen || (chosen->max_delay < n->max_delay)) 1917ec681f3Smrg chosen = n; 1927ec681f3Smrg } 1937ec681f3Smrg 1947ec681f3Smrg if (chosen) { 1957ec681f3Smrg di(chosen->instr, "prio: chose (meta)"); 1967ec681f3Smrg return chosen->instr; 1977ec681f3Smrg } 1987ec681f3Smrg 1997ec681f3Smrg /* Try to schedule inputs with a higher priority, if possible, as 2007ec681f3Smrg * the last bary.f unlocks varying storage to unblock more VS 2017ec681f3Smrg * warps. 2027ec681f3Smrg */ 2037ec681f3Smrg foreach_sched_node (n, &ctx->dag->heads) { 2047ec681f3Smrg if (!is_input(n->instr)) 2057ec681f3Smrg continue; 2067ec681f3Smrg 2077ec681f3Smrg if (!chosen || (chosen->max_delay < n->max_delay)) 2087ec681f3Smrg chosen = n; 2097ec681f3Smrg } 2107ec681f3Smrg 2117ec681f3Smrg if (chosen) { 2127ec681f3Smrg di(chosen->instr, "prio: chose (input)"); 2137ec681f3Smrg return chosen->instr; 2147ec681f3Smrg } 2157ec681f3Smrg 2167ec681f3Smrg /* Next prioritize discards: */ 2177ec681f3Smrg foreach_sched_node (n, &ctx->dag->heads) { 2187ec681f3Smrg unsigned d = 2197ec681f3Smrg ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs); 2207ec681f3Smrg 2217ec681f3Smrg if (d > 0) 2227ec681f3Smrg continue; 2237ec681f3Smrg 2247ec681f3Smrg if (!is_kill_or_demote(n->instr)) 2257ec681f3Smrg continue; 2267ec681f3Smrg 2277ec681f3Smrg if (!chosen || (chosen->max_delay < n->max_delay)) 2287ec681f3Smrg chosen = n; 2297ec681f3Smrg } 2307ec681f3Smrg 2317ec681f3Smrg if (chosen) { 2327ec681f3Smrg di(chosen->instr, "csp: chose (kill, hard ready)"); 2337ec681f3Smrg return chosen->instr; 2347ec681f3Smrg } 2357ec681f3Smrg 2367ec681f3Smrg /* Next prioritize expensive instructions: */ 2377ec681f3Smrg foreach_sched_node (n, &ctx->dag->heads) { 2387ec681f3Smrg unsigned d = 2397ec681f3Smrg ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs); 2407ec681f3Smrg 2417ec681f3Smrg if (d > 0) 2427ec681f3Smrg continue; 2437ec681f3Smrg 2447ec681f3Smrg if (!(is_sfu(n->instr) || is_tex(n->instr))) 2457ec681f3Smrg continue; 2467ec681f3Smrg 2477ec681f3Smrg if (!chosen || (chosen->max_delay < n->max_delay)) 2487ec681f3Smrg chosen = n; 2497ec681f3Smrg } 2507ec681f3Smrg 2517ec681f3Smrg if (chosen) { 2527ec681f3Smrg di(chosen->instr, "csp: chose (sfu/tex, hard ready)"); 2537ec681f3Smrg return chosen->instr; 2547ec681f3Smrg } 2557ec681f3Smrg 2567ec681f3Smrg /* 2577ec681f3Smrg * Sometimes be better to take a nop, rather than scheduling an 2587ec681f3Smrg * instruction that would require an (ss) shortly after another 2597ec681f3Smrg * SFU.. ie. if last SFU was just one or two instr ago, and we 2607ec681f3Smrg * could choose between taking a nop and then scheduling 2617ec681f3Smrg * something else, vs scheduling the immed avail instruction that 2627ec681f3Smrg * would require (ss), we are better with the nop. 2637ec681f3Smrg */ 2647ec681f3Smrg for (unsigned delay = 0; delay < 4; delay++) { 2657ec681f3Smrg foreach_sched_node (n, &ctx->dag->heads) { 2667ec681f3Smrg if (would_sync(ctx, n->instr)) 2677ec681f3Smrg continue; 2687ec681f3Smrg 2697ec681f3Smrg unsigned d = ir3_delay_calc_postra(ctx->block, n->instr, true, 2707ec681f3Smrg ctx->v->mergedregs); 2717ec681f3Smrg 2727ec681f3Smrg if (d > delay) 2737ec681f3Smrg continue; 2747ec681f3Smrg 2757ec681f3Smrg if (!chosen || (chosen->max_delay < n->max_delay)) 2767ec681f3Smrg chosen = n; 2777ec681f3Smrg } 2787ec681f3Smrg 2797ec681f3Smrg if (chosen) { 2807ec681f3Smrg di(chosen->instr, "csp: chose (soft ready, delay=%u)", delay); 2817ec681f3Smrg return chosen->instr; 2827ec681f3Smrg } 2837ec681f3Smrg } 2847ec681f3Smrg 2857ec681f3Smrg /* Next try to find a ready leader w/ soft delay (ie. including extra 2867ec681f3Smrg * delay for things like tex fetch which can be synchronized w/ sync 2877ec681f3Smrg * bit (but we probably do want to schedule some other instructions 2887ec681f3Smrg * while we wait) 2897ec681f3Smrg */ 2907ec681f3Smrg foreach_sched_node (n, &ctx->dag->heads) { 2917ec681f3Smrg unsigned d = 2927ec681f3Smrg ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs); 2937ec681f3Smrg 2947ec681f3Smrg if (d > 0) 2957ec681f3Smrg continue; 2967ec681f3Smrg 2977ec681f3Smrg if (!chosen || (chosen->max_delay < n->max_delay)) 2987ec681f3Smrg chosen = n; 2997ec681f3Smrg } 3007ec681f3Smrg 3017ec681f3Smrg if (chosen) { 3027ec681f3Smrg di(chosen->instr, "csp: chose (soft ready)"); 3037ec681f3Smrg return chosen->instr; 3047ec681f3Smrg } 3057ec681f3Smrg 3067ec681f3Smrg /* Next try to find a ready leader that can be scheduled without nop's, 3077ec681f3Smrg * which in the case of things that need (sy)/(ss) could result in 3087ec681f3Smrg * stalls.. but we've already decided there is not a better option. 3097ec681f3Smrg */ 3107ec681f3Smrg foreach_sched_node (n, &ctx->dag->heads) { 3117ec681f3Smrg unsigned d = 3127ec681f3Smrg ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs); 3137ec681f3Smrg 3147ec681f3Smrg if (d > 0) 3157ec681f3Smrg continue; 3167ec681f3Smrg 3177ec681f3Smrg if (!chosen || (chosen->max_delay < n->max_delay)) 3187ec681f3Smrg chosen = n; 3197ec681f3Smrg } 3207ec681f3Smrg 3217ec681f3Smrg if (chosen) { 3227ec681f3Smrg di(chosen->instr, "csp: chose (hard ready)"); 3237ec681f3Smrg return chosen->instr; 3247ec681f3Smrg } 3257ec681f3Smrg 3267ec681f3Smrg /* Otherwise choose leader with maximum cost: 3277ec681f3Smrg * 3287ec681f3Smrg * TODO should we try to balance cost and delays? I guess it is 3297ec681f3Smrg * a balance between now-nop's and future-nop's? 3307ec681f3Smrg */ 3317ec681f3Smrg foreach_sched_node (n, &ctx->dag->heads) { 3327ec681f3Smrg if (!chosen || chosen->max_delay < n->max_delay) 3337ec681f3Smrg chosen = n; 3347ec681f3Smrg } 3357ec681f3Smrg 3367ec681f3Smrg if (chosen) { 3377ec681f3Smrg di(chosen->instr, "csp: chose (leader)"); 3387ec681f3Smrg return chosen->instr; 3397ec681f3Smrg } 3407ec681f3Smrg 3417ec681f3Smrg return NULL; 3427ec681f3Smrg} 3437ec681f3Smrg 3447ec681f3Smrgstruct ir3_postsched_deps_state { 3457ec681f3Smrg struct ir3_postsched_ctx *ctx; 3467ec681f3Smrg 3477ec681f3Smrg enum { F, R } direction; 3487ec681f3Smrg 3497ec681f3Smrg bool merged; 3507ec681f3Smrg 3517ec681f3Smrg /* Track the mapping between sched node (instruction) that last 3527ec681f3Smrg * wrote a given register (in whichever direction we are iterating 3537ec681f3Smrg * the block) 3547ec681f3Smrg * 3557ec681f3Smrg * Note, this table is twice as big as the # of regs, to deal with 3567ec681f3Smrg * half-precision regs. The approach differs depending on whether 3577ec681f3Smrg * the half and full precision register files are "merged" (conflict, 3587ec681f3Smrg * ie. a6xx+) in which case we consider each full precision dep 3597ec681f3Smrg * as two half-precision dependencies, vs older separate (non- 3607ec681f3Smrg * conflicting) in which case the first half of the table is used 3617ec681f3Smrg * for full precision and 2nd half for half-precision. 3627ec681f3Smrg */ 3637ec681f3Smrg struct ir3_postsched_node *regs[2 * 256]; 3647ec681f3Smrg}; 3657ec681f3Smrg 3667ec681f3Smrg/* bounds checking read/write accessors, since OoB access to stuff on 3677ec681f3Smrg * the stack is gonna cause a bad day. 3687ec681f3Smrg */ 3697ec681f3Smrg#define dep_reg(state, idx) \ 3707ec681f3Smrg *({ \ 3717ec681f3Smrg assert((idx) < ARRAY_SIZE((state)->regs)); \ 3727ec681f3Smrg &(state)->regs[(idx)]; \ 3737ec681f3Smrg }) 3747ec681f3Smrg 3757ec681f3Smrgstatic void 3767ec681f3Smrgadd_dep(struct ir3_postsched_deps_state *state, 3777ec681f3Smrg struct ir3_postsched_node *before, struct ir3_postsched_node *after) 3787ec681f3Smrg{ 3797ec681f3Smrg if (!before || !after) 3807ec681f3Smrg return; 3817ec681f3Smrg 3827ec681f3Smrg assert(before != after); 3837ec681f3Smrg 3847ec681f3Smrg if (state->direction == F) { 3857ec681f3Smrg dag_add_edge(&before->dag, &after->dag, NULL); 3867ec681f3Smrg } else { 3877ec681f3Smrg dag_add_edge(&after->dag, &before->dag, NULL); 3887ec681f3Smrg } 3897ec681f3Smrg} 3907ec681f3Smrg 3917ec681f3Smrgstatic void 3927ec681f3Smrgadd_single_reg_dep(struct ir3_postsched_deps_state *state, 3937ec681f3Smrg struct ir3_postsched_node *node, unsigned num, int src_n) 3947ec681f3Smrg{ 3957ec681f3Smrg struct ir3_postsched_node *dep = dep_reg(state, num); 3967ec681f3Smrg 3977ec681f3Smrg if (src_n >= 0 && dep && state->direction == F) { 3987ec681f3Smrg unsigned d = ir3_delayslots(dep->instr, node->instr, src_n, true); 3997ec681f3Smrg node->delay = MAX2(node->delay, d); 4007ec681f3Smrg if (is_tex_or_prefetch(dep->instr)) 4017ec681f3Smrg node->has_tex_src = true; 4027ec681f3Smrg if (is_tex_or_prefetch(dep->instr)) 4037ec681f3Smrg node->has_sfu_src = true; 4047ec681f3Smrg } 4057ec681f3Smrg 4067ec681f3Smrg add_dep(state, dep, node); 4077ec681f3Smrg if (src_n < 0) { 4087ec681f3Smrg dep_reg(state, num) = node; 4097ec681f3Smrg } 4107ec681f3Smrg} 4117ec681f3Smrg 4127ec681f3Smrg/* This is where we handled full vs half-precision, and potential conflicts 4137ec681f3Smrg * between half and full precision that result in additional dependencies. 4147ec681f3Smrg * The 'reg' arg is really just to know half vs full precision. 4157ec681f3Smrg * 4167ec681f3Smrg * If non-negative, then this adds a dependency on a source register, and 4177ec681f3Smrg * src_n is the index passed into ir3_delayslots() for calculating the delay: 4187ec681f3Smrg * If positive, corresponds to node->instr->regs[src_n]. If negative, then 4197ec681f3Smrg * this is for a destination register. 4207ec681f3Smrg */ 4217ec681f3Smrgstatic void 4227ec681f3Smrgadd_reg_dep(struct ir3_postsched_deps_state *state, 4237ec681f3Smrg struct ir3_postsched_node *node, const struct ir3_register *reg, 4247ec681f3Smrg unsigned num, int src_n) 4257ec681f3Smrg{ 4267ec681f3Smrg if (state->merged) { 4277ec681f3Smrg /* Make sure that special registers like a0.x that are written as 4287ec681f3Smrg * half-registers don't alias random full registers by pretending that 4297ec681f3Smrg * they're full registers: 4307ec681f3Smrg */ 4317ec681f3Smrg if ((reg->flags & IR3_REG_HALF) && !is_reg_special(reg)) { 4327ec681f3Smrg /* single conflict in half-reg space: */ 4337ec681f3Smrg add_single_reg_dep(state, node, num, src_n); 4347ec681f3Smrg } else { 4357ec681f3Smrg /* two conflicts in half-reg space: */ 4367ec681f3Smrg add_single_reg_dep(state, node, 2 * num + 0, src_n); 4377ec681f3Smrg add_single_reg_dep(state, node, 2 * num + 1, src_n); 4387ec681f3Smrg } 4397ec681f3Smrg } else { 4407ec681f3Smrg if (reg->flags & IR3_REG_HALF) 4417ec681f3Smrg num += ARRAY_SIZE(state->regs) / 2; 4427ec681f3Smrg add_single_reg_dep(state, node, num, src_n); 4437ec681f3Smrg } 4447ec681f3Smrg} 4457ec681f3Smrg 4467ec681f3Smrgstatic void 4477ec681f3Smrgcalculate_deps(struct ir3_postsched_deps_state *state, 4487ec681f3Smrg struct ir3_postsched_node *node) 4497ec681f3Smrg{ 4507ec681f3Smrg /* Add dependencies on instructions that previously (or next, 4517ec681f3Smrg * in the reverse direction) wrote any of our src registers: 4527ec681f3Smrg */ 4537ec681f3Smrg foreach_src_n (reg, i, node->instr) { 4547ec681f3Smrg if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED)) 4557ec681f3Smrg continue; 4567ec681f3Smrg 4577ec681f3Smrg if (reg->flags & IR3_REG_RELATIV) { 4587ec681f3Smrg /* mark entire array as read: */ 4597ec681f3Smrg for (unsigned j = 0; j < reg->size; j++) { 4607ec681f3Smrg add_reg_dep(state, node, reg, reg->array.base + j, i); 4617ec681f3Smrg } 4627ec681f3Smrg } else { 4637ec681f3Smrg assert(reg->wrmask >= 1); 4647ec681f3Smrg u_foreach_bit (b, reg->wrmask) { 4657ec681f3Smrg add_reg_dep(state, node, reg, reg->num + b, i); 4667ec681f3Smrg } 4677ec681f3Smrg } 4687ec681f3Smrg } 4697ec681f3Smrg 4707ec681f3Smrg /* And then after we update the state for what this instruction 4717ec681f3Smrg * wrote: 4727ec681f3Smrg */ 4737ec681f3Smrg foreach_dst (reg, node->instr) { 4747ec681f3Smrg if (reg->wrmask == 0) 4757ec681f3Smrg continue; 4767ec681f3Smrg if (reg->flags & IR3_REG_RELATIV) { 4777ec681f3Smrg /* mark the entire array as written: */ 4787ec681f3Smrg for (unsigned i = 0; i < reg->size; i++) { 4797ec681f3Smrg add_reg_dep(state, node, reg, reg->array.base + i, -1); 4807ec681f3Smrg } 4817ec681f3Smrg } else { 4827ec681f3Smrg assert(reg->wrmask >= 1); 4837ec681f3Smrg u_foreach_bit (b, reg->wrmask) { 4847ec681f3Smrg add_reg_dep(state, node, reg, reg->num + b, -1); 4857ec681f3Smrg } 4867ec681f3Smrg } 4877ec681f3Smrg } 4887ec681f3Smrg} 4897ec681f3Smrg 4907ec681f3Smrgstatic void 4917ec681f3Smrgcalculate_forward_deps(struct ir3_postsched_ctx *ctx) 4927ec681f3Smrg{ 4937ec681f3Smrg struct ir3_postsched_deps_state state = { 4947ec681f3Smrg .ctx = ctx, 4957ec681f3Smrg .direction = F, 4967ec681f3Smrg .merged = ctx->v->mergedregs, 4977ec681f3Smrg }; 4987ec681f3Smrg 4997ec681f3Smrg foreach_instr (instr, &ctx->unscheduled_list) { 5007ec681f3Smrg calculate_deps(&state, instr->data); 5017ec681f3Smrg } 5027ec681f3Smrg} 5037ec681f3Smrg 5047ec681f3Smrgstatic void 5057ec681f3Smrgcalculate_reverse_deps(struct ir3_postsched_ctx *ctx) 5067ec681f3Smrg{ 5077ec681f3Smrg struct ir3_postsched_deps_state state = { 5087ec681f3Smrg .ctx = ctx, 5097ec681f3Smrg .direction = R, 5107ec681f3Smrg .merged = ctx->v->mergedregs, 5117ec681f3Smrg }; 5127ec681f3Smrg 5137ec681f3Smrg foreach_instr_rev (instr, &ctx->unscheduled_list) { 5147ec681f3Smrg calculate_deps(&state, instr->data); 5157ec681f3Smrg } 5167ec681f3Smrg} 5177ec681f3Smrg 5187ec681f3Smrgstatic void 5197ec681f3Smrgsched_node_init(struct ir3_postsched_ctx *ctx, struct ir3_instruction *instr) 5207ec681f3Smrg{ 5217ec681f3Smrg struct ir3_postsched_node *n = 5227ec681f3Smrg rzalloc(ctx->mem_ctx, struct ir3_postsched_node); 5237ec681f3Smrg 5247ec681f3Smrg dag_init_node(ctx->dag, &n->dag); 5257ec681f3Smrg 5267ec681f3Smrg n->instr = instr; 5277ec681f3Smrg instr->data = n; 5287ec681f3Smrg} 5297ec681f3Smrg 5307ec681f3Smrgstatic void 5317ec681f3Smrgsched_dag_max_delay_cb(struct dag_node *node, void *state) 5327ec681f3Smrg{ 5337ec681f3Smrg struct ir3_postsched_node *n = (struct ir3_postsched_node *)node; 5347ec681f3Smrg uint32_t max_delay = 0; 5357ec681f3Smrg 5367ec681f3Smrg util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) { 5377ec681f3Smrg struct ir3_postsched_node *child = 5387ec681f3Smrg (struct ir3_postsched_node *)edge->child; 5397ec681f3Smrg max_delay = MAX2(child->max_delay, max_delay); 5407ec681f3Smrg } 5417ec681f3Smrg 5427ec681f3Smrg n->max_delay = MAX2(n->max_delay, max_delay + n->delay); 5437ec681f3Smrg} 5447ec681f3Smrg 5457ec681f3Smrgstatic void 5467ec681f3Smrgsched_dag_init(struct ir3_postsched_ctx *ctx) 5477ec681f3Smrg{ 5487ec681f3Smrg ctx->mem_ctx = ralloc_context(NULL); 5497ec681f3Smrg 5507ec681f3Smrg ctx->dag = dag_create(ctx->mem_ctx); 5517ec681f3Smrg 5527ec681f3Smrg foreach_instr (instr, &ctx->unscheduled_list) 5537ec681f3Smrg sched_node_init(ctx, instr); 5547ec681f3Smrg 5557ec681f3Smrg calculate_forward_deps(ctx); 5567ec681f3Smrg calculate_reverse_deps(ctx); 5577ec681f3Smrg 5587ec681f3Smrg /* 5597ec681f3Smrg * To avoid expensive texture fetches, etc, from being moved ahead 5607ec681f3Smrg * of kills, track the kills we've seen so far, so we can add an 5617ec681f3Smrg * extra dependency on them for tex/mem instructions 5627ec681f3Smrg */ 5637ec681f3Smrg struct util_dynarray kills; 5647ec681f3Smrg util_dynarray_init(&kills, ctx->mem_ctx); 5657ec681f3Smrg 5667ec681f3Smrg /* The last bary.f with the (ei) flag must be scheduled before any kills, 5677ec681f3Smrg * or the hw gets angry. Keep track of inputs here so we can add the 5687ec681f3Smrg * false dep on the kill instruction. 5697ec681f3Smrg */ 5707ec681f3Smrg struct util_dynarray inputs; 5717ec681f3Smrg util_dynarray_init(&inputs, ctx->mem_ctx); 5727ec681f3Smrg 5737ec681f3Smrg /* 5747ec681f3Smrg * Normal srcs won't be in SSA at this point, those are dealt with in 5757ec681f3Smrg * calculate_forward_deps() and calculate_reverse_deps(). But we still 5767ec681f3Smrg * have the false-dep information in SSA form, so go ahead and add 5777ec681f3Smrg * dependencies for that here: 5787ec681f3Smrg */ 5797ec681f3Smrg foreach_instr (instr, &ctx->unscheduled_list) { 5807ec681f3Smrg struct ir3_postsched_node *n = instr->data; 5817ec681f3Smrg 5827ec681f3Smrg foreach_ssa_src_n (src, i, instr) { 5837ec681f3Smrg if (src->block != instr->block) 5847ec681f3Smrg continue; 5857ec681f3Smrg 5867ec681f3Smrg /* we can end up with unused false-deps.. just skip them: */ 5877ec681f3Smrg if (src->flags & IR3_INSTR_UNUSED) 5887ec681f3Smrg continue; 5897ec681f3Smrg 5907ec681f3Smrg struct ir3_postsched_node *sn = src->data; 5917ec681f3Smrg 5927ec681f3Smrg /* don't consider dependencies in other blocks: */ 5937ec681f3Smrg if (src->block != instr->block) 5947ec681f3Smrg continue; 5957ec681f3Smrg 5967ec681f3Smrg dag_add_edge(&sn->dag, &n->dag, NULL); 5977ec681f3Smrg } 5987ec681f3Smrg 5997ec681f3Smrg if (is_input(instr)) { 6007ec681f3Smrg util_dynarray_append(&inputs, struct ir3_instruction *, instr); 6017ec681f3Smrg } else if (is_kill_or_demote(instr)) { 6027ec681f3Smrg util_dynarray_foreach (&inputs, struct ir3_instruction *, instrp) { 6037ec681f3Smrg struct ir3_instruction *input = *instrp; 6047ec681f3Smrg struct ir3_postsched_node *in = input->data; 6057ec681f3Smrg dag_add_edge(&in->dag, &n->dag, NULL); 6067ec681f3Smrg } 6077ec681f3Smrg util_dynarray_append(&kills, struct ir3_instruction *, instr); 6087ec681f3Smrg } else if (is_tex(instr) || is_mem(instr)) { 6097ec681f3Smrg util_dynarray_foreach (&kills, struct ir3_instruction *, instrp) { 6107ec681f3Smrg struct ir3_instruction *kill = *instrp; 6117ec681f3Smrg struct ir3_postsched_node *kn = kill->data; 6127ec681f3Smrg dag_add_edge(&kn->dag, &n->dag, NULL); 6137ec681f3Smrg } 6147ec681f3Smrg } 6157ec681f3Smrg } 6167ec681f3Smrg 6177ec681f3Smrg // TODO do we want to do this after reverse-dependencies? 6187ec681f3Smrg dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL); 6197ec681f3Smrg} 6207ec681f3Smrg 6217ec681f3Smrgstatic void 6227ec681f3Smrgsched_dag_destroy(struct ir3_postsched_ctx *ctx) 6237ec681f3Smrg{ 6247ec681f3Smrg ralloc_free(ctx->mem_ctx); 6257ec681f3Smrg ctx->mem_ctx = NULL; 6267ec681f3Smrg ctx->dag = NULL; 6277ec681f3Smrg} 6287ec681f3Smrg 6297ec681f3Smrgstatic void 6307ec681f3Smrgsched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block) 6317ec681f3Smrg{ 6327ec681f3Smrg ctx->block = block; 6337ec681f3Smrg ctx->tex_delay = 0; 6347ec681f3Smrg ctx->sfu_delay = 0; 6357ec681f3Smrg 6367ec681f3Smrg /* move all instructions to the unscheduled list, and 6377ec681f3Smrg * empty the block's instruction list (to which we will 6387ec681f3Smrg * be inserting). 6397ec681f3Smrg */ 6407ec681f3Smrg list_replace(&block->instr_list, &ctx->unscheduled_list); 6417ec681f3Smrg list_inithead(&block->instr_list); 6427ec681f3Smrg 6437ec681f3Smrg // TODO once we are using post-sched for everything we can 6447ec681f3Smrg // just not stick in NOP's prior to post-sched, and drop this. 6457ec681f3Smrg // for now keep this, since it makes post-sched optional: 6467ec681f3Smrg foreach_instr_safe (instr, &ctx->unscheduled_list) { 6477ec681f3Smrg switch (instr->opc) { 6487ec681f3Smrg case OPC_NOP: 6497ec681f3Smrg case OPC_B: 6507ec681f3Smrg case OPC_JUMP: 6517ec681f3Smrg list_delinit(&instr->node); 6527ec681f3Smrg break; 6537ec681f3Smrg default: 6547ec681f3Smrg break; 6557ec681f3Smrg } 6567ec681f3Smrg } 6577ec681f3Smrg 6587ec681f3Smrg sched_dag_init(ctx); 6597ec681f3Smrg 6607ec681f3Smrg /* First schedule all meta:input instructions, followed by 6617ec681f3Smrg * tex-prefetch. We want all of the instructions that load 6627ec681f3Smrg * values into registers before the shader starts to go 6637ec681f3Smrg * before any other instructions. But in particular we 6647ec681f3Smrg * want inputs to come before prefetches. This is because 6657ec681f3Smrg * a FS's bary_ij input may not actually be live in the 6667ec681f3Smrg * shader, but it should not be scheduled on top of any 6677ec681f3Smrg * other input (but can be overwritten by a tex prefetch) 6687ec681f3Smrg */ 6697ec681f3Smrg foreach_instr_safe (instr, &ctx->unscheduled_list) 6707ec681f3Smrg if (instr->opc == OPC_META_INPUT) 6717ec681f3Smrg schedule(ctx, instr); 6727ec681f3Smrg 6737ec681f3Smrg foreach_instr_safe (instr, &ctx->unscheduled_list) 6747ec681f3Smrg if (instr->opc == OPC_META_TEX_PREFETCH) 6757ec681f3Smrg schedule(ctx, instr); 6767ec681f3Smrg 6777ec681f3Smrg while (!list_is_empty(&ctx->unscheduled_list)) { 6787ec681f3Smrg struct ir3_instruction *instr = choose_instr(ctx); 6797ec681f3Smrg 6807ec681f3Smrg unsigned delay = 6817ec681f3Smrg ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs); 6827ec681f3Smrg d("delay=%u", delay); 6837ec681f3Smrg 6847ec681f3Smrg /* and if we run out of instructions that can be scheduled, 6857ec681f3Smrg * then it is time for nop's: 6867ec681f3Smrg */ 6877ec681f3Smrg debug_assert(delay <= 6); 6887ec681f3Smrg while (delay > 0) { 6897ec681f3Smrg ir3_NOP(block); 6907ec681f3Smrg delay--; 6917ec681f3Smrg } 6927ec681f3Smrg 6937ec681f3Smrg schedule(ctx, instr); 6947ec681f3Smrg } 6957ec681f3Smrg 6967ec681f3Smrg sched_dag_destroy(ctx); 6977ec681f3Smrg} 6987ec681f3Smrg 6997ec681f3Smrgstatic bool 7007ec681f3Smrgis_self_mov(struct ir3_instruction *instr) 7017ec681f3Smrg{ 7027ec681f3Smrg if (!is_same_type_mov(instr)) 7037ec681f3Smrg return false; 7047ec681f3Smrg 7057ec681f3Smrg if (instr->dsts[0]->num != instr->srcs[0]->num) 7067ec681f3Smrg return false; 7077ec681f3Smrg 7087ec681f3Smrg if (instr->dsts[0]->flags & IR3_REG_RELATIV) 7097ec681f3Smrg return false; 7107ec681f3Smrg 7117ec681f3Smrg if (instr->cat1.round != ROUND_ZERO) 7127ec681f3Smrg return false; 7137ec681f3Smrg 7147ec681f3Smrg if (instr->srcs[0]->flags & 7157ec681f3Smrg (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_RELATIV | IR3_REG_FNEG | 7167ec681f3Smrg IR3_REG_FABS | IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT)) 7177ec681f3Smrg return false; 7187ec681f3Smrg 7197ec681f3Smrg return true; 7207ec681f3Smrg} 7217ec681f3Smrg 7227ec681f3Smrg/* sometimes we end up w/ in-place mov's, ie. mov.u32u32 r1.y, r1.y 7237ec681f3Smrg * as a result of places were before RA we are not sure that it is 7247ec681f3Smrg * safe to eliminate. We could eliminate these earlier, but sometimes 7257ec681f3Smrg * they are tangled up in false-dep's, etc, so it is easier just to 7267ec681f3Smrg * let them exist until after RA 7277ec681f3Smrg */ 7287ec681f3Smrgstatic void 7297ec681f3Smrgcleanup_self_movs(struct ir3 *ir) 7307ec681f3Smrg{ 7317ec681f3Smrg foreach_block (block, &ir->block_list) { 7327ec681f3Smrg foreach_instr_safe (instr, &block->instr_list) { 7337ec681f3Smrg for (unsigned i = 0; i < instr->deps_count; i++) { 7347ec681f3Smrg if (instr->deps[i] && is_self_mov(instr->deps[i])) { 7357ec681f3Smrg instr->deps[i] = NULL; 7367ec681f3Smrg } 7377ec681f3Smrg } 7387ec681f3Smrg 7397ec681f3Smrg if (is_self_mov(instr)) 7407ec681f3Smrg list_delinit(&instr->node); 7417ec681f3Smrg } 7427ec681f3Smrg } 7437ec681f3Smrg} 7447ec681f3Smrg 7457ec681f3Smrgbool 7467ec681f3Smrgir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v) 7477ec681f3Smrg{ 7487ec681f3Smrg struct ir3_postsched_ctx ctx = { 7497ec681f3Smrg .ir = ir, 7507ec681f3Smrg .v = v, 7517ec681f3Smrg }; 7527ec681f3Smrg 7537ec681f3Smrg ir3_remove_nops(ir); 7547ec681f3Smrg cleanup_self_movs(ir); 7557ec681f3Smrg 7567ec681f3Smrg foreach_block (block, &ir->block_list) { 7577ec681f3Smrg sched_block(&ctx, block); 7587ec681f3Smrg } 7597ec681f3Smrg 7607ec681f3Smrg return true; 7617ec681f3Smrg} 762