/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"

/*
 * Legalize:
 *
 * We currently require that scheduling ensures that we have enough nop's
 * in all the right places.  The legalize step mostly handles fixing up
 * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nop's
 * into fewer nop's w/ rpt flag.
 */

struct ir3_legalize_ctx {
	struct ir3_compiler *compiler;
	gl_shader_stage type;
	bool has_ssbo;
	bool need_pixlod;
	int max_bary;
};

struct ir3_legalize_state {
	regmask_t needs_ss;
	regmask_t needs_ss_war;       /* write after read */
	regmask_t needs_sy;
};

struct ir3_legalize_block_data {
	bool valid;
	struct ir3_legalize_state state;
};

/* We want to evaluate each block from the position of any other
 * predecessor block, in order that the flags set are the union of
 * all possible program paths.
 *
 * To do this, we need to know the output state (needs_ss/ss_war/sy)
 * of all predecessor blocks.  The tricky thing is loops, which mean
 * that we can't simply recursively process each predecessor block
 * before legalizing the current block.
 *
 * How we handle that is by looping over all the blocks until the
 * results converge.  If the output state of a given block changes
 * in a given pass, this means that all successor blocks are not
 * yet fully legalized.
 */
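
/* Legalize a single block.  Returns true if the block was (re)processed,
 * or false if its previously computed state was still valid, so the
 * caller can keep iterating over all blocks until it reaches a fixpoint:
 */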
static bool
legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
	struct ir3_legalize_block_data *bd = block->data;

	if (bd->valid)
		return false;

	struct ir3_instruction *last_input = NULL;
	struct ir3_instruction *last_rel = NULL;
	struct ir3_instruction *last_n = NULL;
	struct list_head instr_list;
	struct ir3_legalize_state prev_state = bd->state;
	struct ir3_legalize_state *state = &bd->state;

	/* our input state is the OR of all predecessor blocks' state: */
	for (unsigned i = 0; i < block->predecessors_count; i++) {
		struct ir3_legalize_block_data *pbd = block->predecessors[i]->data;
		struct ir3_legalize_state *pstate = &pbd->state;

		/* Our input (ss)/(sy) state is based on OR'ing the output
		 * state of all our predecessor blocks
		 */
		regmask_or(&state->needs_ss,
				&state->needs_ss, &pstate->needs_ss);
		regmask_or(&state->needs_ss_war,
				&state->needs_ss_war, &pstate->needs_ss_war);
		regmask_or(&state->needs_sy,
				&state->needs_sy, &pstate->needs_sy);
	}

	/* remove all the instructions from the list, we'll be adding
	 * them back in as we go
	 */
	list_replace(&block->instr_list, &instr_list);
	list_inithead(&block->instr_list);

	list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) {
		struct ir3_register *reg;
		unsigned i;

		n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);

		if (is_meta(n))
			continue;

		if (is_input(n)) {
			struct ir3_register *inloc = n->regs[1];
			assert(inloc->flags & IR3_REG_IMMED);
			ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
		}

		if (last_n && is_barrier(last_n))
			n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;

		/* NOTE: consider dst register too.. it could happen that
		 * texture sample instruction (for example) writes some
		 * components which are unused.  A subsequent instruction
		 * that writes the same register can race w/ the sam instr
		 * resulting in undefined results:
		 */
		for (i = 0; i < n->regs_count; i++) {
			reg = n->regs[i];

			if (reg_gpr(reg)) {

				/* TODO: we probably only need (ss) for alu
				 * instr consuming sfu result.. need to make
				 * some tests for both this and (sy)..
				 */
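				/* The (ss)/(sy) bits seem to wait on *all* outstanding
				 * producers of that type, not just this register, which
				 * is why the whole pending mask is cleared once the flag
				 * is set (and why (ss) also clears any pending WAR state):
				 */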
				if (regmask_get(&state->needs_ss, reg)) {
					n->flags |= IR3_INSTR_SS;
					regmask_init(&state->needs_ss_war);
					regmask_init(&state->needs_ss);
				}

				if (regmask_get(&state->needs_sy, reg)) {
					n->flags |= IR3_INSTR_SY;
					regmask_init(&state->needs_sy);
				}
			}

			/* TODO: is it valid to have address reg loaded from a
			 * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
			 * last_rel check below should be moved ahead of this:
			 */
			if (reg->flags & IR3_REG_RELATIV)
				last_rel = n;
		}

		if (n->regs_count > 0) {
			reg = n->regs[0];
			if (regmask_get(&state->needs_ss_war, reg)) {
				n->flags |= IR3_INSTR_SS;
				regmask_init(&state->needs_ss_war);
				regmask_init(&state->needs_ss);
			}

			if (last_rel && (reg->num == regid(REG_A0, 0))) {
				last_rel->flags |= IR3_INSTR_UL;
				last_rel = NULL;
			}
		}

		/* cat5+ does not have an (ss) bit, if needed we need to
		 * insert a nop to carry the sync flag.  Would be kinda
		 * clever if we were aware of this during scheduling, but
		 * this should be a pretty rare case:
		 */
		if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
			struct ir3_instruction *nop;
			nop = ir3_NOP(block);
			nop->flags |= IR3_INSTR_SS;
			n->flags &= ~IR3_INSTR_SS;
		}

		/* need to be able to set (ss) on first instruction: */
		if (list_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
			ir3_NOP(block);
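
		/* Try to fold this nop into the previous instruction: either
		 * bump the repeat count of a preceding nop (up to (rpt5)), or
		 * on a6xx encode the delay in the nopN field of a preceding
		 * cat2/cat3 alu instruction:
		 */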
		if (is_nop(n) && !list_empty(&block->instr_list)) {
			struct ir3_instruction *last = list_last_entry(&block->instr_list,
					struct ir3_instruction, node);
			if (is_nop(last) && (last->repeat < 5)) {
				last->repeat++;
				last->flags |= n->flags;
				continue;
			}

			/* NOTE: I think the nopN encoding works for a5xx and
			 * probably a4xx, but not a3xx.  So far only tested on
			 * a6xx.
			 */
			if ((ctx->compiler->gpu_id >= 600) && !n->flags && (last->nop < 3) &&
					((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
				last->nop++;
				continue;
			}
		}

		if (ctx->compiler->samgq_workaround &&
			ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) {
			struct ir3_instruction *samgp;

			for (i = 0; i < 4; i++) {
				samgp = ir3_instr_clone(n);
				samgp->opc = OPC_SAMGP0 + i;
				if (i > 1)
					samgp->flags |= IR3_INSTR_SY;
			}
			list_delinit(&n->node);
		} else {
			list_addtail(&n->node, &block->instr_list);
		}

		if (is_sfu(n))
			regmask_set(&state->needs_ss, n->regs[0]);

		if (is_tex(n)) {
			regmask_set(&state->needs_sy, n->regs[0]);
			ctx->need_pixlod = true;
		} else if (n->opc == OPC_RESINFO) {
			regmask_set(&state->needs_ss, n->regs[0]);
			ir3_NOP(block)->flags |= IR3_INSTR_SS;
		} else if (is_load(n)) {
			/* seems like ldlv needs (ss) bit instead??  which is odd but
			 * makes a bunch of flat-varying tests start working on a4xx.
			 */
			if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL))
				regmask_set(&state->needs_ss, n->regs[0]);
			else
				regmask_set(&state->needs_sy, n->regs[0]);
		} else if (is_atomic(n->opc)) {
			if (n->flags & IR3_INSTR_G) {
				if (ctx->compiler->gpu_id >= 600) {
					/* New encoding, returns result via second src: */
					regmask_set(&state->needs_sy, n->regs[3]);
				} else {
					regmask_set(&state->needs_sy, n->regs[0]);
				}
			} else {
				regmask_set(&state->needs_ss, n->regs[0]);
			}
		}

		if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
			ctx->has_ssbo = true;

		/* both tex/sfu appear to not always immediately consume
		 * their src register(s):
		 */
		if (is_tex(n) || is_sfu(n) || is_mem(n)) {
			foreach_src(reg, n) {
				if (reg_gpr(reg))
					regmask_set(&state->needs_ss_war, reg);
			}
		}

		if (is_input(n))
			last_input = n;

		last_n = n;
	}
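
	/* The (ei) (end-input) flag goes on the last instruction consuming
	 * the varying inputs, presumably letting the hw know it can release
	 * the varying storage:
	 */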
	if (last_input) {
		/* special hack.. if using ldlv to bypass interpolation,
		 * we need to insert a dummy bary.f on which we can set
		 * the (ei) flag:
		 */
		if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
			struct ir3_instruction *baryf;

			/* (ss)bary.f (ei)r63.x, 0, r0.x */
			baryf = ir3_instr_create(block, OPC_BARY_F);
			baryf->flags |= IR3_INSTR_SS;
			ir3_reg_create(baryf, regid(63, 0), 0);
			ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
			ir3_reg_create(baryf, regid(0, 0), 0);

			/* insert the dummy bary.f after last_input: */
			list_delinit(&baryf->node);
			list_add(&baryf->node, &last_input->node);

			last_input = baryf;
		}
		last_input->regs[0]->flags |= IR3_REG_EI;
	}

	if (last_rel)
		last_rel->flags |= IR3_INSTR_UL;

	bd->valid = true;

	if (memcmp(&prev_state, state, sizeof(*state))) {
		/* our output state changed, this invalidates all of our
		 * successors:
		 */
		for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
			if (!block->successors[i])
				break;
			struct ir3_legalize_block_data *pbd = block->successors[i]->data;
			pbd->valid = false;
		}
	}

	return true;
}

/* NOTE: branch instructions are always the last instruction(s)
 * in the block.  We take advantage of this as we resolve the
 * branches, since "if (foo) break;" constructs turn into
 * something like:
 *
 *   block3 {
 *     ...
 *     0029:021: mov.s32s32 r62.x, r1.y
 *     0082:022: br !p0.x, target=block5
 *     0083:023: br p0.x, target=block4
 *     // succs: if _[0029:021: mov.s32s32] block4; else block5;
 *   }
 *   block4 {
 *     0084:024: jump, target=block6
 *     // succs: block6;
 *   }
 *   block5 {
 *     0085:025: jump, target=block7
 *     // succs: block7;
 *   }
 *
 * ie. only instruction in block4/block5 is a jump, so when
 * resolving branches we can easily detect this by checking
 * that the first instruction in the target block is itself
 * a jump, and setup the br directly to the jump's target
 * (and strip back out the now unreached jump)
 *
 * TODO sometimes we end up with things like:
 *
 *    br !p0.x, #2
 *    br p0.x, #12
 *    add.u r0.y, r0.y, 1
 *
 * If we swapped the order of the branches, we could drop one.
 */
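
/* Follow a branch target through blocks which are empty or contain
 * only a jump, returning the block the branch will eventually land
 * in (or the block itself if it can't be skipped):
 */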
static struct ir3_block *
resolve_dest_block(struct ir3_block *block)
{
	/* special case for last block: */
	if (!block->successors[0])
		return block;

	/* NOTE that we may or may not have inserted the jump
	 * in the target block yet, so conditions to resolve
	 * the dest to the dest block's successor are:
	 *
	 *   (1) successor[1] == NULL &&
	 *   (2) (block-is-empty || only-instr-is-jump)
	 */
	if (block->successors[1] == NULL) {
		if (list_empty(&block->instr_list)) {
			return block->successors[0];
		} else if (list_length(&block->instr_list) == 1) {
			struct ir3_instruction *instr = list_first_entry(
					&block->instr_list, struct ir3_instruction, node);
			if (instr->opc == OPC_JUMP)
				return block->successors[0];
		}
	}
	return block;
}

static bool
resolve_jump(struct ir3_instruction *instr)
{
	struct ir3_block *tblock =
		resolve_dest_block(instr->cat0.target);
	struct ir3_instruction *target;

	if (tblock != instr->cat0.target) {
		list_delinit(&instr->cat0.target->node);
		instr->cat0.target = tblock;
		return true;
	}

	target = list_first_entry(&tblock->instr_list,
			struct ir3_instruction, node);

	/* TODO maybe a less fragile way to do this.  But we are expecting
	 * a pattern from sched_block() that looks like:
	 *
	 *   br !p0.x, #else-block
	 *   br p0.x, #if-block
	 *
	 * if the first branch target is +2, or if 2nd branch target is +1
	 * then we can just drop the jump.
	 */
	unsigned next_block;
	if (instr->cat0.inv == true)
		next_block = 2;
	else
		next_block = 1;

	if ((!target) || (target->ip == (instr->ip + next_block))) {
		list_delinit(&instr->node);
		return true;
	} else {
		instr->cat0.immed =
			(int)target->ip - (int)instr->ip;
	}
	return false;
}

/* resolve jumps, removing jumps/branches to immediately following
 * instruction which we end up with from earlier stages.  Since
 * removing an instruction can invalidate earlier instruction's
 * branch offsets, we need to do this iteratively until no more
 * branches are removed.
 */
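
/* Note that a single resolved jump shifts the ip of every following
 * instruction, so we bail after the first change and rely on the
 * caller to re-run ir3_count_instructions() before the next pass:
 */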
static bool
resolve_jumps(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
			if (is_flow(instr) && instr->cat0.target)
				if (resolve_jump(instr))
					return true;

	return false;
}

/* we want to mark points where divergent flow control re-converges
 * with (jp) flags.  For now, since we don't do any optimization for
 * things that start out as a 'do {} while()', re-convergence points
 * will always be a branch or jump target.  Note that this is overly
 * conservative, since unconditional jump targets are not convergence
 * points, we are just assuming that the other path to reach the jump
 * target was divergent.  If we were clever enough to optimize the
 * jump at end of a loop back to a conditional branch into a single
 * conditional branch, ie. like:
 *
 *    add.f r1.w, r0.x, (neg)(r)c2.x   <= loop start
 *    mul.f r1.z, r1.z, r0.x
 *    mul.f r1.y, r1.y, r0.x
 *    mul.f r0.z, r1.x, r0.x
 *    mul.f r0.w, r0.y, r0.x
 *    cmps.f.ge r0.x, (r)c2.y, (r)r1.w
 *    add.s r0.x, (r)r0.x, (r)-1
 *    sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
 *    cmps.f.eq p0.x, r0.x, c3.y
 *    mov.f32f32 r0.x, r1.w
 *    mov.f32f32 r0.y, r0.w
 *    mov.f32f32 r1.x, r0.z
 *    (rpt2)nop
 *    br !p0.x, #-13
 *    (jp)mul.f r0.x, c263.y, r1.y
 *
 * Then we'd have to be more clever, as the convergence point is no
 * longer a branch or jump target.
 */
static void
mark_convergence_points(struct ir3 *ir)
{
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if (is_flow(instr) && instr->cat0.target) {
				struct ir3_instruction *target =
					list_first_entry(&instr->cat0.target->instr_list,
							struct ir3_instruction, node);
				target->flags |= IR3_INSTR_JP;
			}
		}
	}
}

void
ir3_legalize(struct ir3 *ir, bool *has_ssbo, bool *need_pixlod, int *max_bary)
{
	struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
	bool progress;

	ctx->max_bary = -1;
	ctx->compiler = ir->compiler;
	ctx->type = ir->type;

	/* allocate per-block data: */
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		block->data = rzalloc(ctx, struct ir3_legalize_block_data);
	}

	/* process each block: */
	do {
		progress = false;
		list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
			progress |= legalize_block(ctx, block);
		}
	} while (progress);

	*has_ssbo = ctx->has_ssbo;
	*need_pixlod = ctx->need_pixlod;
	*max_bary = ctx->max_bary;

	do {
		ir3_count_instructions(ir);
	} while (resolve_jumps(ir));

	mark_convergence_points(ir);

	ralloc_free(ctx);
}