/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_shader.h"

/*
 * Legalize:
 *
 * The legalize pass handles ensuring sufficient nop's and sync flags for
 * correct execution.
 *
 * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
 *    based on state flowing out of predecessor blocks until there is
 *    no further change.  In some cases this requires inserting nops.
 * 2) Mark (ei) on last varying input, and (ul) on last use of a0.x
 * 3) Final nop scheduling for instruction latency
 * 4) Resolve jumps and schedule blocks, marking potential convergence
 *    points with (jp)
 */

/* Pass-wide state shared across all blocks while legalizing one shader: */
struct ir3_legalize_ctx {
   struct ir3_compiler *compiler;
   struct ir3_shader_variant *so;
   gl_shader_stage type;
   /* highest input "inloc" immediate seen (updated in legalize_block): */
   int max_bary;
   bool early_input_release;
};

/* Sync-flag dataflow state at a program point: which registers have
 * results pending that require an (ss) or (sy) before the next access.
 */
struct ir3_legalize_state {
   regmask_t needs_ss;
   regmask_t needs_ss_war; /* write after read */
   regmask_t needs_sy;
};

/* Per-block data for the iterate-until-converged legalize loop: */
struct ir3_legalize_block_data {
   /* cleared whenever a predecessor's output state changes, forcing
    * this block to be re-legalized on the next pass:
    */
   bool valid;
   struct ir3_legalize_state state;
};

/* We want to evaluate each block from the position of any other
 * predecessor block, in order that the flags set are the union of
 * all possible program paths.
 *
 * To do this, we need to know the output state (needs_ss/ss_war/sy)
 * of all predecessor blocks.  The tricky thing is loops, which mean
 * that we can't simply recursively process each predecessor block
 * before legalizing the current block.
 *
 * How we handle that is by looping over all the blocks until the
 * results converge.
   If the output state of a given block changes
 * in a given pass, this means that all successor blocks are not
 * yet fully legalized.
 */

/* Legalize one block: OR together all predecessors' output sync state,
 * then walk the instructions setting (ss)/(sy) flags where a pending
 * result is consumed (inserting nops where a flag cannot be carried),
 * while also tracking (ei) on the last input and (ul) on the last
 * relative access.  Returns false if the block's cached state is still
 * valid (nothing to do), true once the block has been (re)processed.
 */
static bool
legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
   struct ir3_legalize_block_data *bd = block->data;

   if (bd->valid)
      return false;

   struct ir3_instruction *last_rel = NULL;
   struct ir3_instruction *last_n = NULL;
   struct list_head instr_list;
   /* snapshot of the state before this pass, to detect changes below: */
   struct ir3_legalize_state prev_state = bd->state;
   struct ir3_legalize_state *state = &bd->state;
   bool last_input_needs_ss = false;
   bool has_tex_prefetch = false;
   bool mergedregs = ctx->so->mergedregs;

   /* our input state is the OR of all predecessor blocks' state: */
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *predecessor = block->predecessors[i];
      struct ir3_legalize_block_data *pbd = predecessor->data;
      struct ir3_legalize_state *pstate = &pbd->state;

      /* Our input (ss)/(sy) state is based on OR'ing the output
       * state of all our predecessor blocks
       */
      regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
      regmask_or(&state->needs_ss_war, &state->needs_ss_war,
                 &pstate->needs_ss_war);
      regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
   }

   unsigned input_count = 0;

   foreach_instr (n, &block->instr_list) {
      if (is_input(n)) {
         input_count++;
      }
   }

   unsigned inputs_remaining = input_count;

   /* Either inputs are in the first block or we expect inputs to be released
    * with the end of the program.
    */
   assert(input_count == 0 || !ctx->early_input_release ||
          block == ir3_start_block(block->shader));

   /* remove all the instructions from the list, we'll be adding
    * them back in as we go
    */
   list_replace(&block->instr_list, &instr_list);
   list_inithead(&block->instr_list);

   foreach_instr_safe (n, &instr_list) {
      unsigned i;

      /* flags are recomputed from scratch each pass: */
      n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);

      /* _meta::tex_prefetch instructions removed later in
       * collect_tex_prefetches()
       */
      if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
         continue;

      if (is_input(n)) {
         struct ir3_register *inloc = n->srcs[0];
         assert(inloc->flags & IR3_REG_IMMED);
         ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
      }

      /* a barrier synchronizes everything, so the tracked state resets: */
      if (last_n && is_barrier(last_n)) {
         n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
         last_input_needs_ss = false;
         regmask_init(&state->needs_ss_war, mergedregs);
         regmask_init(&state->needs_ss, mergedregs);
         regmask_init(&state->needs_sy, mergedregs);
      }

      if (last_n && (last_n->opc == OPC_PREDT)) {
         n->flags |= IR3_INSTR_SS;
         regmask_init(&state->needs_ss_war, mergedregs);
         regmask_init(&state->needs_ss, mergedregs);
      }

      /* NOTE: consider dst register too.. it could happen that
       * texture sample instruction (for example) writes some
       * components which are unused.  A subsequent instruction
       * that writes the same register can race w/ the sam instr
       * resulting in undefined results:
       */
      for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
         struct ir3_register *reg;
         if (i < n->dsts_count)
            reg = n->dsts[i];
         else
            reg = n->srcs[i - n->dsts_count];

         if (reg_gpr(reg)) {

            /* TODO: we probably only need (ss) for alu
             * instr consuming sfu result.. need to make
             * some tests for both this and (sy)..
             */
            if (regmask_get(&state->needs_ss, reg)) {
               n->flags |= IR3_INSTR_SS;
               last_input_needs_ss = false;
               regmask_init(&state->needs_ss_war, mergedregs);
               regmask_init(&state->needs_ss, mergedregs);
            }

            if (regmask_get(&state->needs_sy, reg)) {
               n->flags |= IR3_INSTR_SY;
               regmask_init(&state->needs_sy, mergedregs);
            }
         }

         /* TODO: is it valid to have address reg loaded from a
          * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
          * last_rel check below should be moved ahead of this:
          */
         if (reg->flags & IR3_REG_RELATIV)
            last_rel = n;
      }

      foreach_dst (reg, n) {
         if (regmask_get(&state->needs_ss_war, reg)) {
            n->flags |= IR3_INSTR_SS;
            last_input_needs_ss = false;
            regmask_init(&state->needs_ss_war, mergedregs);
            regmask_init(&state->needs_ss, mergedregs);
         }

         /* overwriting a0.x ends the liveness of the preceding relative
          * access, so mark it as the (ul) "last use":
          */
         if (last_rel && (reg->num == regid(REG_A0, 0))) {
            last_rel->flags |= IR3_INSTR_UL;
            last_rel = NULL;
         }
      }

      /* cat5+ does not have an (ss) bit, if needed we need to
       * insert a nop to carry the sync flag.  Would be kinda
       * clever if we were aware of this during scheduling, but
       * this should be a pretty rare case:
       */
      if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
         struct ir3_instruction *nop;
         nop = ir3_NOP(block);
         nop->flags |= IR3_INSTR_SS;
         n->flags &= ~IR3_INSTR_SS;
      }

      /* need to be able to set (ss) on first instruction: */
      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
         ir3_NOP(block);

      if (ctx->compiler->samgq_workaround &&
          ctx->type != MESA_SHADER_FRAGMENT &&
          ctx->type != MESA_SHADER_COMPUTE && n->opc == OPC_SAMGQ) {
         struct ir3_instruction *samgp;

         /* expand samgq into four samgp0..samgp3 clones (the original
          * samgq is dropped from the list):
          */
         list_delinit(&n->node);

         for (i = 0; i < 4; i++) {
            samgp = ir3_instr_clone(n);
            samgp->opc = OPC_SAMGP0 + i;
            if (i > 1)
               samgp->flags |= IR3_INSTR_SY;
         }
      } else {
         list_delinit(&n->node);
         list_addtail(&n->node, &block->instr_list);
      }

      /* record what this instruction leaves pending, for later consumers: */
      if (is_sfu(n))
         regmask_set(&state->needs_ss, n->dsts[0]);

      if (is_tex_or_prefetch(n)) {
         regmask_set(&state->needs_sy, n->dsts[0]);
         if (n->opc == OPC_META_TEX_PREFETCH)
            has_tex_prefetch = true;
      } else if (n->opc == OPC_RESINFO) {
         regmask_set(&state->needs_ss, n->dsts[0]);
         ir3_NOP(block)->flags |= IR3_INSTR_SS;
         last_input_needs_ss = false;
      } else if (is_load(n)) {
         /* seems like ldlv needs (ss) bit instead??  which is odd but
          * makes a bunch of flat-varying tests start working on a4xx.
          */
         if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) ||
             (n->opc == OPC_LDLW))
            regmask_set(&state->needs_ss, n->dsts[0]);
         else
            regmask_set(&state->needs_sy, n->dsts[0]);
      } else if (is_atomic(n->opc)) {
         if (n->flags & IR3_INSTR_G) {
            if (ctx->compiler->gen >= 6) {
               /* New encoding, returns result via second src: */
               regmask_set(&state->needs_sy, n->srcs[2]);
            } else {
               regmask_set(&state->needs_sy, n->dsts[0]);
            }
         } else {
            regmask_set(&state->needs_ss, n->dsts[0]);
         }
      }

      if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
         ctx->so->has_ssbo = true;

      /* both tex/sfu appear to not always immediately consume
       * their src register(s):
       */
      if (is_tex(n) || is_sfu(n) || is_mem(n)) {
         foreach_src (reg, n) {
            regmask_set(&state->needs_ss_war, reg);
         }
      }

      if (ctx->early_input_release && is_input(n)) {
         last_input_needs_ss |= (n->opc == OPC_LDLV);

         assert(inputs_remaining > 0);
         inputs_remaining--;
         if (inputs_remaining == 0) {
            /* This is the last input.  We add the (ei) flag to release
             * varying memory after this executes.  If it's an ldlv,
             * however, we need to insert a dummy bary.f on which we can
             * set the (ei) flag.  We may also need to insert an (ss) to
             * guarantee that all ldlv's have finished fetching their
             * results before releasing the varying memory.
             */
            struct ir3_instruction *last_input = n;
            if (n->opc == OPC_LDLV) {
               struct ir3_instruction *baryf;

               /* (ss)bary.f (ei)r63.x, 0, r0.x */
               baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
               ir3_dst_create(baryf, regid(63, 0), 0);
               ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
               ir3_src_create(baryf, regid(0, 0), 0);

               last_input = baryf;
            }

            last_input->dsts[0]->flags |= IR3_REG_EI;
            if (last_input_needs_ss) {
               last_input->flags |= IR3_INSTR_SS;
               regmask_init(&state->needs_ss_war, mergedregs);
               regmask_init(&state->needs_ss, mergedregs);
            }
         }
      }

      last_n = n;
   }

   assert(inputs_remaining == 0 || !ctx->early_input_release);

   if (has_tex_prefetch && input_count == 0) {
      /* texture prefetch, but *no* inputs.. we need to insert a
       * dummy bary.f at the top of the shader to unblock varying
       * storage:
       */
      struct ir3_instruction *baryf;

      /* (ss)bary.f (ei)r63.x, 0, r0.x */
      baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
      ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
      ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
      ir3_src_create(baryf, regid(0, 0), 0);

      /* insert the dummy bary.f at head: */
      list_delinit(&baryf->node);
      list_add(&baryf->node, &block->instr_list);
   }

   /* any relative access not followed by an a0.x rewrite is the last use: */
   if (last_rel)
      last_rel->flags |= IR3_INSTR_UL;

   bd->valid = true;

   if (memcmp(&prev_state, state, sizeof(*state))) {
      /* our output state changed, this invalidates all of our
       * successors:
       */
      for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
         if (!block->successors[i])
            break;
         struct ir3_legalize_block_data *pbd = block->successors[i]->data;
         pbd->valid = false;
      }
   }

   return true;
}

/* Expands dsxpp and dsypp macros to:
 *
 * dsxpp.1 dst, src
 * dsxpp.1.p dst, src
 *
 * We apply this after flags syncing, as we don't want to sync in between the
 * two (which might happen if dst == src).  We do it before nop scheduling
 * because that needs to count actual instructions.
 */
static bool
apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
   struct list_head instr_list;

   /* remove all the instructions from the list, we'll be adding
    * them back in as we go
    */
   list_replace(&block->instr_list, &instr_list);
   list_inithead(&block->instr_list);

   foreach_instr_safe (n, &instr_list) {
      list_addtail(&n->node, &block->instr_list);

      if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
         n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;

         struct ir3_instruction *op_p = ir3_instr_clone(n);
         /* note: assignment (not OR) — the clone carries only the .p flag: */
         op_p->flags = IR3_INSTR_P;

         ctx->so->need_fine_derivatives = true;
      }
   }

   return true;
}

/* NOTE: branch instructions are always the last instruction(s)
 * in the block.  We take advantage of this as we resolve the
 * branches, since "if (foo) break;" constructs turn into
 * something like:
 *
 *   block3 {
 *      ...
 *      0029:021: mov.s32s32 r62.x, r1.y
 *      0082:022: br !p0.x, target=block5
 *      0083:023: br p0.x, target=block4
 *      // succs: if _[0029:021: mov.s32s32] block4; else block5;
 *   }
 *   block4 {
 *      0084:024: jump, target=block6
 *      // succs: block6;
 *   }
 *   block5 {
 *      0085:025: jump, target=block7
 *      // succs: block7;
 *   }
 *
 * ie. only instruction in block4/block5 is a jump, so when
 * resolving branches we can easily detect this by checking
 * that the first instruction in the target block is itself
 * a jump, and setup the br directly to the jump's target
 * (and strip back out the now unreached jump)
 *
 * TODO sometimes we end up with things like:
 *
 *    br !p0.x, #2
 *    br p0.x, #12
 *    add.u r0.y, r0.y, 1
 *
 * If we swapped the order of the branches, we could drop one.
 */

/* Resolve a branch destination: if the target block is empty or holds
 * only a forward jump, return the block it ultimately falls through to,
 * otherwise return the block itself.
 */
static struct ir3_block *
resolve_dest_block(struct ir3_block *block)
{
   /* special case for last block: */
   if (!block->successors[0])
      return block;

   /* NOTE that we may or may not have inserted the jump
    * in the target block yet, so conditions to resolve
    * the dest to the dest block's successor are:
    *
    *  (1) successor[1] == NULL &&
    *  (2) (block-is-empty || only-instr-is-jump)
    */
   if (block->successors[1] == NULL) {
      if (list_is_empty(&block->instr_list)) {
         return block->successors[0];
      } else if (list_length(&block->instr_list) == 1) {
         struct ir3_instruction *instr =
            list_first_entry(&block->instr_list, struct ir3_instruction, node);
         if (instr->opc == OPC_JUMP) {
            /* If this jump is backwards, then we will probably convert
             * the jump being resolved to a backwards jump, which will
             * change a loop-with-continue or loop-with-if into a
             * doubly-nested loop and change the convergence behavior.
             * Disallow this here.
             */
            if (block->successors[0]->index <= block->index)
               return block;
            return block->successors[0];
         }
      }
   }
   return block;
}

/* Unlink a now-unreachable block from the block list, and remove it
 * from its successors' predecessor lists:
 */
static void
remove_unused_block(struct ir3_block *old_target)
{
   list_delinit(&old_target->node);

   /* cleanup dangling predecessors: */
   for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
      if (old_target->successors[i]) {
         struct ir3_block *succ = old_target->successors[i];
         ir3_block_remove_predecessor(succ, old_target);
      }
   }
}

/* Point a branch/jump at new_target, fixing up successor/predecessor
 * links on both ends.  Returns true if the old target block became
 * unreachable and was deleted (caller must restart iteration).
 */
static bool
retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
{
   struct ir3_block *old_target = instr->cat0.target;
   struct ir3_block *cur_block = instr->block;

   /* update current blocks successors to reflect the retargetting: */
   if (cur_block->successors[0] == old_target) {
      cur_block->successors[0] = new_target;
   } else {
      debug_assert(cur_block->successors[1] == old_target);
      cur_block->successors[1] = new_target;
   }

   /* also update physical_successors..
we don't really need them at
    * this stage, but it keeps ir3_validate happy:
    */
   if (cur_block->physical_successors[0] == old_target) {
      cur_block->physical_successors[0] = new_target;
   } else {
      debug_assert(cur_block->physical_successors[1] == old_target);
      cur_block->physical_successors[1] = new_target;
   }

   /* update new target's predecessors: */
   ir3_block_add_predecessor(new_target, cur_block);

   /* and remove old_target's predecessor: */
   ir3_block_remove_predecessor(old_target, cur_block);

   instr->cat0.target = new_target;

   /* if nothing branches to old_target anymore, it is dead: */
   if (old_target->predecessors_count == 0) {
      remove_unused_block(old_target);
      return true;
   }

   return false;
}

/* Branch optimization: retarget branches whose destination is a trivial
 * "only a jump" block directly to the jump's destination, and delete
 * jumps that merely fall through to the next block.  Returns true if
 * any change was made (caller re-runs until fixed point).
 */
static bool
opt_jump(struct ir3 *ir)
{
   bool progress = false;

   /* assign block indices so backwards jumps can be detected in
    * resolve_dest_block():
    */
   unsigned index = 0;
   foreach_block (block, &ir->block_list)
      block->index = index++;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (!is_flow(instr) || !instr->cat0.target)
            continue;

         struct ir3_block *tblock = resolve_dest_block(instr->cat0.target);
         if (tblock != instr->cat0.target) {
            progress = true;

            /* Exit early if we deleted a block to avoid iterator
             * weirdness/assert fails
             */
            if (retarget_jump(instr, tblock))
               return true;
         }
      }

      /* Detect the case where the block ends either with:
       * - A single unconditional jump to the next block.
       * - Two jump instructions with opposite conditions, and one of the
       *   them jumps to the next block.
       * We can remove the one that jumps to the next block in either case.
       */
      if (list_is_empty(&block->instr_list))
         continue;

      struct ir3_instruction *jumps[2] = {NULL, NULL};
      jumps[0] =
         list_last_entry(&block->instr_list, struct ir3_instruction, node);
      if (!list_is_singular(&block->instr_list))
         jumps[1] =
            list_last_entry(&jumps[0]->node, struct ir3_instruction, node);

      if (jumps[0]->opc == OPC_JUMP)
         jumps[1] = NULL;
      else if (jumps[0]->opc != OPC_B || !jumps[1] || jumps[1]->opc != OPC_B)
         continue;

      for (unsigned i = 0; i < 2; i++) {
         if (!jumps[i])
            continue;

         struct ir3_block *tblock = jumps[i]->cat0.target;
         if (&tblock->node == block->node.next) {
            /* jumping to the immediately-following block is a no-op: */
            list_delinit(&jumps[i]->node);
            progress = true;
            break;
         }
      }
   }

   return progress;
}

/* Lower branch targets to instruction-relative immediate offsets
 * (requires instruction ip's to have been assigned):
 */
static void
resolve_jumps(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         if (is_flow(instr) && instr->cat0.target) {
            struct ir3_instruction *target = list_first_entry(
               &instr->cat0.target->instr_list, struct ir3_instruction, node);

            instr->cat0.immed = (int)target->ip -
(int)instr->ip;
         }
}

/* Set (jp) on a block's first instruction, marking it as a point where
 * the execution mask may change:
 */
static void
mark_jp(struct ir3_block *block)
{
   struct ir3_instruction *target =
      list_first_entry(&block->instr_list, struct ir3_instruction, node);
   target->flags |= IR3_INSTR_JP;
}

/* Mark points where control flow converges or diverges.
 *
 * Divergence points could actually be re-convergence points where
 * "parked" threads are recoverged with threads that took the opposite
 * path last time around.  Possibly it is easier to think of (jp) as
 * "the execution mask might have changed".
 */
static void
mark_xvergence_points(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      if (block->predecessors_count > 1) {
         /* if a block has more than one possible predecessor, then
          * the first instruction is a convergence point.
          */
         mark_jp(block);
      } else if (block->predecessors_count == 1) {
         /* If a block has one predecessor, which has multiple possible
          * successors, it is a divergence point.
          */
         for (unsigned i = 0; i < block->predecessors_count; i++) {
            struct ir3_block *predecessor = block->predecessors[i];
            if (predecessor->successors[1]) {
               mark_jp(block);
            }
         }
      }
   }
}

/* Insert the branch/jump instructions for flow control between blocks.
 * Initially this is done naively, without considering if the successor
 * block immediately follows the current block (ie. so no jump required),
 * but that is cleaned up in opt_jump().
 *
 * TODO what ensures that the last write to p0.x in a block is the
 * branch condition?  Have we been getting lucky all this time?
 */
static void
block_sched(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      if (block->successors[1]) {
         /* if/else, conditional branches to "then" or "else": */
         struct ir3_instruction *br1, *br2;

         if (block->brtype == IR3_BRANCH_GETONE) {
            /* getone can't be inverted, and it wouldn't even make sense
             * to follow it with an inverted branch, so follow it by an
             * unconditional branch.
             */
            debug_assert(!block->condition);
            br1 = ir3_GETONE(block);
            br1->cat0.target = block->successors[1];

            br2 = ir3_JUMP(block);
            br2->cat0.target = block->successors[0];
         } else {
            debug_assert(block->condition);

            /* create "else" branch first (since "then" block should
             * frequently/always end up being a fall-thru):
             */
            br1 = ir3_instr_create(block, OPC_B, 0, 1);
            ir3_src_create(br1, regid(REG_P0, 0), 0)->def =
               block->condition->dsts[0];
            br1->cat0.inv1 = true;
            br1->cat0.target = block->successors[1];

            /* "then" branch: */
            br2 = ir3_instr_create(block, OPC_B, 0, 1);
            ir3_src_create(br2, regid(REG_P0, 0), 0)->def =
               block->condition->dsts[0];
            br2->cat0.target = block->successors[0];

            /* the inverted ("else") branch br1 takes the opposite
             * any/all mode of the "then" branch br2:
             */
            switch (block->brtype) {
            case IR3_BRANCH_COND:
               br1->cat0.brtype = br2->cat0.brtype = BRANCH_PLAIN;
               break;
            case IR3_BRANCH_ALL:
               br1->cat0.brtype = BRANCH_ANY;
               br2->cat0.brtype = BRANCH_ALL;
               break;
            case IR3_BRANCH_ANY:
               br1->cat0.brtype = BRANCH_ALL;
               br2->cat0.brtype = BRANCH_ANY;
               break;
            case IR3_BRANCH_GETONE:
               unreachable("can't get here");
            }
         }
      } else if (block->successors[0]) {
         /* otherwise unconditional jump to next block: */
         struct ir3_instruction *jmp;

         jmp = ir3_JUMP(block);
         jmp->cat0.target = block->successors[0];
      }
   }
}

/* Here we workaround the fact that kill doesn't actually kill the thread as
 * GL expects.  The last instruction always needs to be an end instruction,
 * which means that if we're stuck in a loop where kill is the only way out,
 * then we may have to jump out to the end.  kill may also have the d3d
 * semantics of converting the thread to a helper thread, rather than setting
 * the exec mask to 0, in which case the helper thread could get stuck in an
 * infinite loop.
 *
 * We do this late, both to give the scheduler the opportunity to reschedule
 * kill instructions earlier and to avoid having to create a separate basic
 * block.
7357ec681f3Smrg * 7367ec681f3Smrg * TODO: Assuming that the wavefront doesn't stop as soon as all threads are 7377ec681f3Smrg * killed, we might benefit by doing this more aggressively when the remaining 7387ec681f3Smrg * part of the program after the kill is large, since that would let us 7397ec681f3Smrg * skip over the instructions when there are no non-killed threads left. 7407ec681f3Smrg */ 7417ec681f3Smrgstatic void 7427ec681f3Smrgkill_sched(struct ir3 *ir, struct ir3_shader_variant *so) 7437ec681f3Smrg{ 7447ec681f3Smrg /* True if we know that this block will always eventually lead to the end 7457ec681f3Smrg * block: 7467ec681f3Smrg */ 7477ec681f3Smrg bool always_ends = true; 7487ec681f3Smrg bool added = false; 7497ec681f3Smrg struct ir3_block *last_block = 7507ec681f3Smrg list_last_entry(&ir->block_list, struct ir3_block, node); 7517ec681f3Smrg 7527ec681f3Smrg foreach_block_rev (block, &ir->block_list) { 7537ec681f3Smrg for (unsigned i = 0; i < 2 && block->successors[i]; i++) { 7547ec681f3Smrg if (block->successors[i]->start_ip <= block->end_ip) 7557ec681f3Smrg always_ends = false; 7567ec681f3Smrg } 7577ec681f3Smrg 7587ec681f3Smrg if (always_ends) 7597ec681f3Smrg continue; 7607ec681f3Smrg 7617ec681f3Smrg foreach_instr_safe (instr, &block->instr_list) { 7627ec681f3Smrg if (instr->opc != OPC_KILL) 7637ec681f3Smrg continue; 7647ec681f3Smrg 7657ec681f3Smrg struct ir3_instruction *br = ir3_instr_create(block, OPC_B, 0, 1); 7667ec681f3Smrg ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask = 7677ec681f3Smrg 1; 7687ec681f3Smrg br->cat0.target = 7697ec681f3Smrg list_last_entry(&ir->block_list, struct ir3_block, node); 7707ec681f3Smrg 7717ec681f3Smrg list_del(&br->node); 7727ec681f3Smrg list_add(&br->node, &instr->node); 7737ec681f3Smrg 7747ec681f3Smrg added = true; 7757ec681f3Smrg } 7767ec681f3Smrg } 7777ec681f3Smrg 7787ec681f3Smrg if (added) { 7797ec681f3Smrg /* I'm not entirely sure how the branchstack works, but we probably 7807ec681f3Smrg * 
need to add at least one entry for the divergence which is resolved 7817ec681f3Smrg * at the end: 7827ec681f3Smrg */ 7837ec681f3Smrg so->branchstack++; 7847ec681f3Smrg 7857ec681f3Smrg /* We don't update predecessors/successors, so we have to do this 7867ec681f3Smrg * manually: 7877ec681f3Smrg */ 7887ec681f3Smrg mark_jp(last_block); 7897ec681f3Smrg } 7907ec681f3Smrg} 7917e102996Smaya 7927ec681f3Smrg/* Insert nop's required to make this a legal/valid shader program: */ 7937ec681f3Smrgstatic void 7947ec681f3Smrgnop_sched(struct ir3 *ir, struct ir3_shader_variant *so) 7957ec681f3Smrg{ 7967ec681f3Smrg foreach_block (block, &ir->block_list) { 7977ec681f3Smrg struct ir3_instruction *last = NULL; 7987ec681f3Smrg struct list_head instr_list; 7997ec681f3Smrg 8007ec681f3Smrg /* remove all the instructions from the list, we'll be adding 8017ec681f3Smrg * them back in as we go 8027ec681f3Smrg */ 8037ec681f3Smrg list_replace(&block->instr_list, &instr_list); 8047ec681f3Smrg list_inithead(&block->instr_list); 8057ec681f3Smrg 8067ec681f3Smrg foreach_instr_safe (instr, &instr_list) { 8077ec681f3Smrg unsigned delay = ir3_delay_calc_exact(block, instr, so->mergedregs); 8087ec681f3Smrg 8097ec681f3Smrg /* NOTE: I think the nopN encoding works for a5xx and 8107ec681f3Smrg * probably a4xx, but not a3xx. So far only tested on 8117ec681f3Smrg * a6xx. 
8127ec681f3Smrg */ 8137ec681f3Smrg 8147ec681f3Smrg if ((delay > 0) && (ir->compiler->gen >= 6) && last && 8157ec681f3Smrg ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3)) && 8167ec681f3Smrg (last->repeat == 0)) { 8177ec681f3Smrg /* the previous cat2/cat3 instruction can encode at most 3 nop's: */ 8187ec681f3Smrg unsigned transfer = MIN2(delay, 3 - last->nop); 8197ec681f3Smrg last->nop += transfer; 8207ec681f3Smrg delay -= transfer; 8217ec681f3Smrg } 8227ec681f3Smrg 8237ec681f3Smrg if ((delay > 0) && last && (last->opc == OPC_NOP)) { 8247ec681f3Smrg /* the previous nop can encode at most 5 repeats: */ 8257ec681f3Smrg unsigned transfer = MIN2(delay, 5 - last->repeat); 8267ec681f3Smrg last->repeat += transfer; 8277ec681f3Smrg delay -= transfer; 8287ec681f3Smrg } 8297ec681f3Smrg 8307ec681f3Smrg if (delay > 0) { 8317ec681f3Smrg debug_assert(delay <= 6); 8327ec681f3Smrg ir3_NOP(block)->repeat = delay - 1; 8337ec681f3Smrg } 8347ec681f3Smrg 8357ec681f3Smrg list_addtail(&instr->node, &block->instr_list); 8367ec681f3Smrg last = instr; 8377ec681f3Smrg } 8387ec681f3Smrg } 8397ec681f3Smrg} 8407e102996Smaya 8417ec681f3Smrgbool 8427ec681f3Smrgir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary) 8437ec681f3Smrg{ 8447ec681f3Smrg struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx); 8457ec681f3Smrg bool mergedregs = so->mergedregs; 8467ec681f3Smrg bool progress; 8477ec681f3Smrg 8487ec681f3Smrg ctx->so = so; 8497ec681f3Smrg ctx->max_bary = -1; 8507ec681f3Smrg ctx->compiler = ir->compiler; 8517ec681f3Smrg ctx->type = ir->type; 8527ec681f3Smrg 8537ec681f3Smrg /* allocate per-block data: */ 8547ec681f3Smrg foreach_block (block, &ir->block_list) { 8557ec681f3Smrg struct ir3_legalize_block_data *bd = 8567ec681f3Smrg rzalloc(ctx, struct ir3_legalize_block_data); 8577ec681f3Smrg 8587ec681f3Smrg regmask_init(&bd->state.needs_ss_war, mergedregs); 8597ec681f3Smrg regmask_init(&bd->state.needs_ss, mergedregs); 8607ec681f3Smrg 
regmask_init(&bd->state.needs_sy, mergedregs); 8617ec681f3Smrg 8627ec681f3Smrg block->data = bd; 8637ec681f3Smrg } 8647ec681f3Smrg 8657ec681f3Smrg ir3_remove_nops(ir); 8667ec681f3Smrg 8677ec681f3Smrg /* We may have failed to pull all input loads into the first block. 8687ec681f3Smrg * In such case at the moment we aren't able to find a better place 8697ec681f3Smrg * to for (ei) than the end of the program. 8707ec681f3Smrg * a5xx and a6xx do automatically release varying storage at the end. 8717ec681f3Smrg */ 8727ec681f3Smrg ctx->early_input_release = true; 8737ec681f3Smrg struct ir3_block *start_block = ir3_start_block(ir); 8747ec681f3Smrg foreach_block (block, &ir->block_list) { 8757ec681f3Smrg foreach_instr (instr, &block->instr_list) { 8767ec681f3Smrg if (is_input(instr) && block != start_block) { 8777ec681f3Smrg ctx->early_input_release = false; 8787ec681f3Smrg break; 8797ec681f3Smrg } 8807ec681f3Smrg } 8817ec681f3Smrg } 8827ec681f3Smrg 8837ec681f3Smrg assert(ctx->early_input_release || ctx->compiler->gen >= 5); 8847ec681f3Smrg 8857ec681f3Smrg /* process each block: */ 8867ec681f3Smrg do { 8877ec681f3Smrg progress = false; 8887ec681f3Smrg foreach_block (block, &ir->block_list) { 8897ec681f3Smrg progress |= legalize_block(ctx, block); 8907ec681f3Smrg } 8917ec681f3Smrg } while (progress); 8927ec681f3Smrg 8937ec681f3Smrg *max_bary = ctx->max_bary; 8947ec681f3Smrg 8957ec681f3Smrg block_sched(ir); 8967ec681f3Smrg if (so->type == MESA_SHADER_FRAGMENT) 8977ec681f3Smrg kill_sched(ir, so); 8987ec681f3Smrg 8997ec681f3Smrg foreach_block (block, &ir->block_list) { 9007ec681f3Smrg progress |= apply_fine_deriv_macro(ctx, block); 9017ec681f3Smrg } 9027ec681f3Smrg 9037ec681f3Smrg nop_sched(ir, so); 9047ec681f3Smrg 9057ec681f3Smrg while (opt_jump(ir)) 9067ec681f3Smrg ; 9077ec681f3Smrg 9087ec681f3Smrg ir3_count_instructions(ir); 9097ec681f3Smrg resolve_jumps(ir); 9107ec681f3Smrg 9117ec681f3Smrg mark_xvergence_points(ir); 9127ec681f3Smrg 9137ec681f3Smrg ralloc_free(ctx); 
9147ec681f3Smrg 9157ec681f3Smrg return true; 9167e102996Smaya} 917