1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2016 Intel Corporation 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 22b8e80941Smrg */ 23b8e80941Smrg 24b8e80941Smrg#include "nir.h" 25b8e80941Smrg#include "nir/nir_builder.h" 26b8e80941Smrg#include "nir_constant_expressions.h" 27b8e80941Smrg#include "nir_control_flow.h" 28b8e80941Smrg#include "nir_loop_analyze.h" 29b8e80941Smrg 30b8e80941Smrgstatic nir_ssa_def *clone_alu_and_replace_src_defs(nir_builder *b, 31b8e80941Smrg const nir_alu_instr *alu, 32b8e80941Smrg nir_ssa_def **src_defs); 33b8e80941Smrg 34b8e80941Smrg/** 35b8e80941Smrg * Gets the single block that jumps back to the loop header. Already assumes 36b8e80941Smrg * there is exactly one such block. 37b8e80941Smrg */ 38b8e80941Smrgstatic nir_block* 39b8e80941Smrgfind_continue_block(nir_loop *loop) 40b8e80941Smrg{ 41b8e80941Smrg nir_block *header_block = nir_loop_first_block(loop); 42b8e80941Smrg nir_block *prev_block = 43b8e80941Smrg nir_cf_node_as_block(nir_cf_node_prev(&loop->cf_node)); 44b8e80941Smrg 45b8e80941Smrg assert(header_block->predecessors->entries == 2); 46b8e80941Smrg 47b8e80941Smrg set_foreach(header_block->predecessors, pred_entry) { 48b8e80941Smrg if (pred_entry->key != prev_block) 49b8e80941Smrg return (nir_block*)pred_entry->key; 50b8e80941Smrg } 51b8e80941Smrg 52b8e80941Smrg unreachable("Continue block not found!"); 53b8e80941Smrg} 54b8e80941Smrg 55b8e80941Smrg/** 56b8e80941Smrg * Does a phi have one constant value from outside a loop and one from inside? 57b8e80941Smrg */ 58b8e80941Smrgstatic bool 59b8e80941Smrgphi_has_constant_from_outside_and_one_from_inside_loop(nir_phi_instr *phi, 60b8e80941Smrg const nir_block *entry_block, 61b8e80941Smrg uint32_t *entry_val, 62b8e80941Smrg uint32_t *continue_val) 63b8e80941Smrg{ 64b8e80941Smrg /* We already know we have exactly one continue */ 65b8e80941Smrg assert(exec_list_length(&phi->srcs) == 2); 66b8e80941Smrg 67b8e80941Smrg *entry_val = 0; 68b8e80941Smrg *continue_val = 0; 69b8e80941Smrg 70b8e80941Smrg nir_foreach_phi_src(src, phi) { 71b8e80941Smrg assert(src->src.is_ssa); 72b8e80941Smrg nir_const_value *const_src = nir_src_as_const_value(src->src); 73b8e80941Smrg if (!const_src) 74b8e80941Smrg return false; 75b8e80941Smrg 76b8e80941Smrg if (src->pred != entry_block) { 77b8e80941Smrg *continue_val = const_src[0].u32; 78b8e80941Smrg } else { 79b8e80941Smrg *entry_val = const_src[0].u32; 80b8e80941Smrg } 81b8e80941Smrg } 82b8e80941Smrg 83b8e80941Smrg return true; 84b8e80941Smrg} 85b8e80941Smrg 86b8e80941Smrg/** 87b8e80941Smrg * This optimization detects if statements at the tops of loops where the 88b8e80941Smrg * condition is a phi node of two constants and moves half of the if to above 89b8e80941Smrg * the loop and the other half of the if to the end of the loop. A simple for 90b8e80941Smrg * loop "for (int i = 0; i < 4; i++)", when run through the SPIR-V front-end, 91b8e80941Smrg * ends up looking something like this: 92b8e80941Smrg * 93b8e80941Smrg * vec1 32 ssa_0 = load_const (0x00000000) 94b8e80941Smrg * vec1 32 ssa_1 = load_const (0xffffffff) 95b8e80941Smrg * loop { 96b8e80941Smrg * block block_1: 97b8e80941Smrg * vec1 32 ssa_2 = phi block_0: ssa_0, block_7: ssa_5 98b8e80941Smrg * vec1 32 ssa_3 = phi block_0: ssa_0, block_7: ssa_1 99b8e80941Smrg * if ssa_3 { 100b8e80941Smrg * block block_2: 101b8e80941Smrg * vec1 32 ssa_4 = load_const (0x00000001) 102b8e80941Smrg * vec1 32 ssa_5 = iadd ssa_2, ssa_4 103b8e80941Smrg * } else { 104b8e80941Smrg * block block_3: 105b8e80941Smrg * } 106b8e80941Smrg * block block_4: 107b8e80941Smrg * vec1 32 ssa_6 = load_const (0x00000004) 108b8e80941Smrg * vec1 32 ssa_7 = ilt ssa_5, ssa_6 109b8e80941Smrg * if ssa_7 { 110b8e80941Smrg * block block_5: 111b8e80941Smrg * } else { 112b8e80941Smrg * block block_6: 113b8e80941Smrg * break 114b8e80941Smrg * } 115b8e80941Smrg * block block_7: 116b8e80941Smrg * } 117b8e80941Smrg * 118b8e80941Smrg * This turns it into something like this: 119b8e80941Smrg * 120b8e80941Smrg * // Stuff from block 1 121b8e80941Smrg * // Stuff from block 3 122b8e80941Smrg * loop { 123b8e80941Smrg * block block_1: 124b8e80941Smrg * vec1 32 ssa_2 = phi block_0: ssa_0, block_7: ssa_5 125b8e80941Smrg * vec1 32 ssa_6 = load_const (0x00000004) 126b8e80941Smrg * vec1 32 ssa_7 = ilt ssa_2, ssa_6 127b8e80941Smrg * if ssa_7 { 128b8e80941Smrg * block block_5: 129b8e80941Smrg * } else { 130b8e80941Smrg * block block_6: 131b8e80941Smrg * break 132b8e80941Smrg * } 133b8e80941Smrg * block block_7: 134b8e80941Smrg * // Stuff from block 1 135b8e80941Smrg * // Stuff from block 2 136b8e80941Smrg * vec1 32 ssa_4 = load_const (0x00000001) 137b8e80941Smrg * vec1 32 ssa_5 = iadd ssa_2, ssa_4 138b8e80941Smrg * } 139b8e80941Smrg */ 140b8e80941Smrgstatic bool 141b8e80941Smrgopt_peel_loop_initial_if(nir_loop *loop) 142b8e80941Smrg{ 143b8e80941Smrg nir_block *header_block = nir_loop_first_block(loop); 144b8e80941Smrg nir_block *const prev_block = 145b8e80941Smrg nir_cf_node_as_block(nir_cf_node_prev(&loop->cf_node)); 146b8e80941Smrg 147b8e80941Smrg /* It would be insane if this were not true */ 148b8e80941Smrg assert(_mesa_set_search(header_block->predecessors, prev_block)); 149b8e80941Smrg 150b8e80941Smrg /* The loop must have exactly one continue block which could be a block 151b8e80941Smrg * ending in a continue instruction or the "natural" continue from the 152b8e80941Smrg * last block in the loop back to the top. 153b8e80941Smrg */ 154b8e80941Smrg if (header_block->predecessors->entries != 2) 155b8e80941Smrg return false; 156b8e80941Smrg 157b8e80941Smrg nir_cf_node *if_node = nir_cf_node_next(&header_block->cf_node); 158b8e80941Smrg if (!if_node || if_node->type != nir_cf_node_if) 159b8e80941Smrg return false; 160b8e80941Smrg 161b8e80941Smrg nir_if *nif = nir_cf_node_as_if(if_node); 162b8e80941Smrg assert(nif->condition.is_ssa); 163b8e80941Smrg 164b8e80941Smrg nir_ssa_def *cond = nif->condition.ssa; 165b8e80941Smrg if (cond->parent_instr->type != nir_instr_type_phi) 166b8e80941Smrg return false; 167b8e80941Smrg 168b8e80941Smrg nir_phi_instr *cond_phi = nir_instr_as_phi(cond->parent_instr); 169b8e80941Smrg if (cond->parent_instr->block != header_block) 170b8e80941Smrg return false; 171b8e80941Smrg 172b8e80941Smrg uint32_t entry_val = 0, continue_val = 0; 173b8e80941Smrg if (!phi_has_constant_from_outside_and_one_from_inside_loop(cond_phi, 174b8e80941Smrg prev_block, 175b8e80941Smrg &entry_val, 176b8e80941Smrg &continue_val)) 177b8e80941Smrg return false; 178b8e80941Smrg 179b8e80941Smrg /* If they both execute or both don't execute, this is a job for 180b8e80941Smrg * nir_dead_cf, not this pass. 181b8e80941Smrg */ 182b8e80941Smrg if ((entry_val && continue_val) || (!entry_val && !continue_val)) 183b8e80941Smrg return false; 184b8e80941Smrg 185b8e80941Smrg struct exec_list *continue_list, *entry_list; 186b8e80941Smrg if (continue_val) { 187b8e80941Smrg continue_list = &nif->then_list; 188b8e80941Smrg entry_list = &nif->else_list; 189b8e80941Smrg } else { 190b8e80941Smrg continue_list = &nif->else_list; 191b8e80941Smrg entry_list = &nif->then_list; 192b8e80941Smrg } 193b8e80941Smrg 194b8e80941Smrg /* We want to be moving the contents of entry_list to above the loop so it 195b8e80941Smrg * can't contain any break or continue instructions. 196b8e80941Smrg */ 197b8e80941Smrg foreach_list_typed(nir_cf_node, cf_node, node, entry_list) { 198b8e80941Smrg nir_foreach_block_in_cf_node(block, cf_node) { 199b8e80941Smrg nir_instr *last_instr = nir_block_last_instr(block); 200b8e80941Smrg if (last_instr && last_instr->type == nir_instr_type_jump) 201b8e80941Smrg return false; 202b8e80941Smrg } 203b8e80941Smrg } 204b8e80941Smrg 205b8e80941Smrg /* We're about to re-arrange a bunch of blocks so make sure that we don't 206b8e80941Smrg * have deref uses which cross block boundaries. We don't want a deref 207b8e80941Smrg * accidentally ending up in a phi. 208b8e80941Smrg */ 209b8e80941Smrg nir_rematerialize_derefs_in_use_blocks_impl( 210b8e80941Smrg nir_cf_node_get_function(&loop->cf_node)); 211b8e80941Smrg 212b8e80941Smrg /* Before we do anything, convert the loop to LCSSA. We're about to 213b8e80941Smrg * replace a bunch of SSA defs with registers and this will prevent any of 214b8e80941Smrg * it from leaking outside the loop. 215b8e80941Smrg */ 216b8e80941Smrg nir_convert_loop_to_lcssa(loop); 217b8e80941Smrg 218b8e80941Smrg nir_block *after_if_block = 219b8e80941Smrg nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)); 220b8e80941Smrg 221b8e80941Smrg /* Get rid of phis in the header block since we will be duplicating it */ 222b8e80941Smrg nir_lower_phis_to_regs_block(header_block); 223b8e80941Smrg /* Get rid of phis after the if since dominance will change */ 224b8e80941Smrg nir_lower_phis_to_regs_block(after_if_block); 225b8e80941Smrg 226b8e80941Smrg /* Get rid of SSA defs in the pieces we're about to move around */ 227b8e80941Smrg nir_lower_ssa_defs_to_regs_block(header_block); 228b8e80941Smrg nir_foreach_block_in_cf_node(block, &nif->cf_node) 229b8e80941Smrg nir_lower_ssa_defs_to_regs_block(block); 230b8e80941Smrg 231b8e80941Smrg nir_cf_list header, tmp; 232b8e80941Smrg nir_cf_extract(&header, nir_before_block(header_block), 233b8e80941Smrg nir_after_block(header_block)); 234b8e80941Smrg 235b8e80941Smrg nir_cf_list_clone(&tmp, &header, &loop->cf_node, NULL); 236b8e80941Smrg nir_cf_reinsert(&tmp, nir_before_cf_node(&loop->cf_node)); 237b8e80941Smrg nir_cf_extract(&tmp, nir_before_cf_list(entry_list), 238b8e80941Smrg nir_after_cf_list(entry_list)); 239b8e80941Smrg nir_cf_reinsert(&tmp, nir_before_cf_node(&loop->cf_node)); 240b8e80941Smrg 241b8e80941Smrg nir_cf_reinsert(&header, 242b8e80941Smrg nir_after_block_before_jump(find_continue_block(loop))); 243b8e80941Smrg 244b8e80941Smrg bool continue_list_jumps = 245b8e80941Smrg nir_block_ends_in_jump(exec_node_data(nir_block, 246b8e80941Smrg exec_list_get_tail(continue_list), 247b8e80941Smrg cf_node.node)); 248b8e80941Smrg 249b8e80941Smrg nir_cf_extract(&tmp, nir_before_cf_list(continue_list), 250b8e80941Smrg nir_after_cf_list(continue_list)); 251b8e80941Smrg 252b8e80941Smrg /* Get continue block again as the previous reinsert might have removed the 253b8e80941Smrg * block. Also, if both the continue list and the continue block ends in 254b8e80941Smrg * jump instructions, removes the jump from the latter, as it will not be 255b8e80941Smrg * executed if we insert the continue list before it. */ 256b8e80941Smrg 257b8e80941Smrg nir_block *continue_block = find_continue_block(loop); 258b8e80941Smrg 259b8e80941Smrg if (continue_list_jumps) { 260b8e80941Smrg nir_instr *last_instr = nir_block_last_instr(continue_block); 261b8e80941Smrg if (last_instr && last_instr->type == nir_instr_type_jump) 262b8e80941Smrg nir_instr_remove(last_instr); 263b8e80941Smrg } 264b8e80941Smrg 265b8e80941Smrg nir_cf_reinsert(&tmp, 266b8e80941Smrg nir_after_block_before_jump(continue_block)); 267b8e80941Smrg 268b8e80941Smrg nir_cf_node_remove(&nif->cf_node); 269b8e80941Smrg 270b8e80941Smrg return true; 271b8e80941Smrg} 272b8e80941Smrg 273b8e80941Smrgstatic bool 274b8e80941Smrgalu_instr_is_comparison(const nir_alu_instr *alu) 275b8e80941Smrg{ 276b8e80941Smrg switch (alu->op) { 277b8e80941Smrg case nir_op_flt32: 278b8e80941Smrg case nir_op_fge32: 279b8e80941Smrg case nir_op_feq32: 280b8e80941Smrg case nir_op_fne32: 281b8e80941Smrg case nir_op_ilt32: 282b8e80941Smrg case nir_op_ult32: 283b8e80941Smrg case nir_op_ige32: 284b8e80941Smrg case nir_op_uge32: 285b8e80941Smrg case nir_op_ieq32: 286b8e80941Smrg case nir_op_ine32: 287b8e80941Smrg return true; 288b8e80941Smrg default: 289b8e80941Smrg return nir_alu_instr_is_comparison(alu); 290b8e80941Smrg } 291b8e80941Smrg} 292b8e80941Smrg 293b8e80941Smrgstatic bool 294b8e80941Smrgalu_instr_is_type_conversion(const nir_alu_instr *alu) 295b8e80941Smrg{ 296b8e80941Smrg return nir_op_infos[alu->op].num_inputs == 1 && 297b8e80941Smrg nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) != 298b8e80941Smrg nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[0]); 299b8e80941Smrg} 300b8e80941Smrg 301b8e80941Smrg/** 302b8e80941Smrg * Splits ALU instructions that have a source that is a phi node 303b8e80941Smrg * 304b8e80941Smrg * ALU instructions in the header block of a loop that meet the following 305b8e80941Smrg * criteria can be split. 306b8e80941Smrg * 307b8e80941Smrg * - The loop has no continue instructions other than the "natural" continue 308b8e80941Smrg * at the bottom of the loop. 309b8e80941Smrg * 310b8e80941Smrg * - At least one source of the instruction is a phi node from the header block. 311b8e80941Smrg * 312b8e80941Smrg * and either this rule 313b8e80941Smrg * 314b8e80941Smrg * - The phi node selects undef from the block before the loop and a value 315b8e80941Smrg * from the continue block of the loop. 316b8e80941Smrg * 317b8e80941Smrg * or these two rules 318b8e80941Smrg * 319b8e80941Smrg * - The phi node selects a constant from the block before the loop. 320b8e80941Smrg * 321b8e80941Smrg * - The non-phi source of the ALU instruction comes from a block that 322b8e80941Smrg * dominates the block before the loop. The most common failure mode for 323b8e80941Smrg * this check is sources that are generated in the loop header block. 324b8e80941Smrg * 325b8e80941Smrg * The split process moves the original ALU instruction to the bottom of the 326b8e80941Smrg * loop. The phi node source is replaced with the value from the phi node 327b8e80941Smrg * selected from the continue block (i.e., the non-undef value). A new phi 328b8e80941Smrg * node is added to the header block that selects either undef from the block 329b8e80941Smrg * before the loop or the result of the (moved) ALU instruction. 330b8e80941Smrg * 331b8e80941Smrg * The splitting transforms a loop like: 332b8e80941Smrg * 333b8e80941Smrg * vec1 32 ssa_7 = undefined 334b8e80941Smrg * vec1 32 ssa_8 = load_const (0x00000001) 335b8e80941Smrg * vec1 32 ssa_10 = load_const (0x00000000) 336b8e80941Smrg * // succs: block_1 337b8e80941Smrg * loop { 338b8e80941Smrg * block block_1: 339b8e80941Smrg * // preds: block_0 block_4 340b8e80941Smrg * vec1 32 ssa_11 = phi block_0: ssa_7, block_4: ssa_15 341b8e80941Smrg * vec1 32 ssa_12 = phi block_0: ssa_1, block_4: ssa_15 342b8e80941Smrg * vec1 32 ssa_13 = phi block_0: ssa_10, block_4: ssa_16 343b8e80941Smrg * vec1 32 ssa_14 = iadd ssa_11, ssa_8 344b8e80941Smrg * vec1 32 ssa_15 = b32csel ssa_13, ssa_14, ssa_12 345b8e80941Smrg * ... 346b8e80941Smrg * // succs: block_1 347b8e80941Smrg * } 348b8e80941Smrg * 349b8e80941Smrg * into: 350b8e80941Smrg * 351b8e80941Smrg * vec1 32 ssa_7 = undefined 352b8e80941Smrg * vec1 32 ssa_8 = load_const (0x00000001) 353b8e80941Smrg * vec1 32 ssa_10 = load_const (0x00000000) 354b8e80941Smrg * // succs: block_1 355b8e80941Smrg * loop { 356b8e80941Smrg * block block_1: 357b8e80941Smrg * // preds: block_0 block_4 358b8e80941Smrg * vec1 32 ssa_11 = phi block_0: ssa_7, block_4: ssa_15 359b8e80941Smrg * vec1 32 ssa_12 = phi block_0: ssa_1, block_4: ssa_15 360b8e80941Smrg * vec1 32 ssa_13 = phi block_0: ssa_10, block_4: ssa_16 361b8e80941Smrg * vec1 32 ssa_21 = phi block_0: sss_7, block_4: ssa_20 362b8e80941Smrg * vec1 32 ssa_15 = b32csel ssa_13, ssa_21, ssa_12 363b8e80941Smrg * ... 364b8e80941Smrg * vec1 32 ssa_20 = iadd ssa_15, ssa_8 365b8e80941Smrg * // succs: block_1 366b8e80941Smrg * } 367b8e80941Smrg * 368b8e80941Smrg * If the phi does not select an undef, the instruction is duplicated in the 369b8e80941Smrg * loop continue block (as in the undef case) and in the previous block. When 370b8e80941Smrg * the ALU instruction is duplicated in the previous block, the correct source 371b8e80941Smrg * must be selected from the phi node. 372b8e80941Smrg */ 373b8e80941Smrgstatic bool 374b8e80941Smrgopt_split_alu_of_phi(nir_builder *b, nir_loop *loop) 375b8e80941Smrg{ 376b8e80941Smrg bool progress = false; 377b8e80941Smrg nir_block *header_block = nir_loop_first_block(loop); 378b8e80941Smrg nir_block *const prev_block = 379b8e80941Smrg nir_cf_node_as_block(nir_cf_node_prev(&loop->cf_node)); 380b8e80941Smrg 381b8e80941Smrg /* It would be insane if this were not true */ 382b8e80941Smrg assert(_mesa_set_search(header_block->predecessors, prev_block)); 383b8e80941Smrg 384b8e80941Smrg /* The loop must have exactly one continue block which could be a block 385b8e80941Smrg * ending in a continue instruction or the "natural" continue from the 386b8e80941Smrg * last block in the loop back to the top. 387b8e80941Smrg */ 388b8e80941Smrg if (header_block->predecessors->entries != 2) 389b8e80941Smrg return false; 390b8e80941Smrg 391b8e80941Smrg nir_foreach_instr_safe(instr, header_block) { 392b8e80941Smrg if (instr->type != nir_instr_type_alu) 393b8e80941Smrg continue; 394b8e80941Smrg 395b8e80941Smrg nir_alu_instr *const alu = nir_instr_as_alu(instr); 396b8e80941Smrg 397b8e80941Smrg /* Most ALU ops produce an undefined result if any source is undef. 398b8e80941Smrg * However, operations like bcsel only produce undefined results of the 399b8e80941Smrg * first operand is undef. Even in the undefined case, the result 400b8e80941Smrg * should be one of the other two operands, so the result of the bcsel 401b8e80941Smrg * should never be replaced with undef. 402b8e80941Smrg * 403b8e80941Smrg * nir_op_vec{2,3,4}, nir_op_imov, and nir_op_fmov are excluded because 404b8e80941Smrg * they can easily lead to infinite optimization loops. 405b8e80941Smrg */ 406b8e80941Smrg if (alu->op == nir_op_bcsel || 407b8e80941Smrg alu->op == nir_op_b32csel || 408b8e80941Smrg alu->op == nir_op_fcsel || 409b8e80941Smrg alu->op == nir_op_vec2 || 410b8e80941Smrg alu->op == nir_op_vec3 || 411b8e80941Smrg alu->op == nir_op_vec4 || 412b8e80941Smrg alu->op == nir_op_imov || 413b8e80941Smrg alu->op == nir_op_fmov || 414b8e80941Smrg alu_instr_is_comparison(alu) || 415b8e80941Smrg alu_instr_is_type_conversion(alu)) 416b8e80941Smrg continue; 417b8e80941Smrg 418b8e80941Smrg bool has_phi_src_from_prev_block = false; 419b8e80941Smrg bool all_non_phi_exist_in_prev_block = true; 420b8e80941Smrg bool is_prev_result_undef = true; 421b8e80941Smrg bool is_prev_result_const = true; 422b8e80941Smrg nir_ssa_def *prev_srcs[8]; // FINISHME: Array size? 423b8e80941Smrg nir_ssa_def *continue_srcs[8]; // FINISHME: Array size? 424b8e80941Smrg 425b8e80941Smrg for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { 426b8e80941Smrg nir_instr *const src_instr = alu->src[i].src.ssa->parent_instr; 427b8e80941Smrg 428b8e80941Smrg /* If the source is a phi in the loop header block, then the 429b8e80941Smrg * prev_srcs and continue_srcs will come from the different sources 430b8e80941Smrg * of the phi. 431b8e80941Smrg */ 432b8e80941Smrg if (src_instr->type == nir_instr_type_phi && 433b8e80941Smrg src_instr->block == header_block) { 434b8e80941Smrg nir_phi_instr *const phi = nir_instr_as_phi(src_instr); 435b8e80941Smrg 436b8e80941Smrg /* Only strictly need to NULL out the pointers when the assertions 437b8e80941Smrg * (below) are compiled in. Debugging a NULL pointer deref in the 438b8e80941Smrg * wild is easier than debugging a random pointer deref, so set 439b8e80941Smrg * NULL unconditionally just to be safe. 440b8e80941Smrg */ 441b8e80941Smrg prev_srcs[i] = NULL; 442b8e80941Smrg continue_srcs[i] = NULL; 443b8e80941Smrg 444b8e80941Smrg nir_foreach_phi_src(src_of_phi, phi) { 445b8e80941Smrg if (src_of_phi->pred == prev_block) { 446b8e80941Smrg if (src_of_phi->src.ssa->parent_instr->type != 447b8e80941Smrg nir_instr_type_ssa_undef) { 448b8e80941Smrg is_prev_result_undef = false; 449b8e80941Smrg } 450b8e80941Smrg 451b8e80941Smrg if (src_of_phi->src.ssa->parent_instr->type != 452b8e80941Smrg nir_instr_type_load_const) { 453b8e80941Smrg is_prev_result_const = false; 454b8e80941Smrg } 455b8e80941Smrg 456b8e80941Smrg prev_srcs[i] = src_of_phi->src.ssa; 457b8e80941Smrg has_phi_src_from_prev_block = true; 458b8e80941Smrg } else 459b8e80941Smrg continue_srcs[i] = src_of_phi->src.ssa; 460b8e80941Smrg } 461b8e80941Smrg 462b8e80941Smrg assert(prev_srcs[i] != NULL); 463b8e80941Smrg assert(continue_srcs[i] != NULL); 464b8e80941Smrg } else { 465b8e80941Smrg /* If the source is not a phi (or a phi in a block other than the 466b8e80941Smrg * loop header), then the value must exist in prev_block. 467b8e80941Smrg */ 468b8e80941Smrg if (!nir_block_dominates(src_instr->block, prev_block)) { 469b8e80941Smrg all_non_phi_exist_in_prev_block = false; 470b8e80941Smrg break; 471b8e80941Smrg } 472b8e80941Smrg 473b8e80941Smrg prev_srcs[i] = alu->src[i].src.ssa; 474b8e80941Smrg continue_srcs[i] = alu->src[i].src.ssa; 475b8e80941Smrg } 476b8e80941Smrg } 477b8e80941Smrg 478b8e80941Smrg if (has_phi_src_from_prev_block && all_non_phi_exist_in_prev_block && 479b8e80941Smrg (is_prev_result_undef || is_prev_result_const)) { 480b8e80941Smrg nir_block *const continue_block = find_continue_block(loop); 481b8e80941Smrg nir_ssa_def *prev_value; 482b8e80941Smrg 483b8e80941Smrg if (!is_prev_result_undef) { 484b8e80941Smrg b->cursor = nir_after_block(prev_block); 485b8e80941Smrg prev_value = clone_alu_and_replace_src_defs(b, alu, prev_srcs); 486b8e80941Smrg } else { 487b8e80941Smrg /* Since the undef used as the source of the original ALU 488b8e80941Smrg * instruction may have different number of components or 489b8e80941Smrg * bit size than the result of that instruction, a new 490b8e80941Smrg * undef must be created. 491b8e80941Smrg */ 492b8e80941Smrg nir_ssa_undef_instr *undef = 493b8e80941Smrg nir_ssa_undef_instr_create(b->shader, 494b8e80941Smrg alu->dest.dest.ssa.num_components, 495b8e80941Smrg alu->dest.dest.ssa.bit_size); 496b8e80941Smrg 497b8e80941Smrg nir_instr_insert_after_block(prev_block, &undef->instr); 498b8e80941Smrg 499b8e80941Smrg prev_value = &undef->def; 500b8e80941Smrg } 501b8e80941Smrg 502b8e80941Smrg /* Make a copy of the original ALU instruction. Replace the sources 503b8e80941Smrg * of the new instruction that read a phi with an undef source from 504b8e80941Smrg * prev_block with the non-undef source of that phi. 505b8e80941Smrg * 506b8e80941Smrg * Insert the new instruction at the end of the continue block. 507b8e80941Smrg */ 508b8e80941Smrg b->cursor = nir_after_block_before_jump(continue_block); 509b8e80941Smrg 510b8e80941Smrg nir_ssa_def *const alu_copy = 511b8e80941Smrg clone_alu_and_replace_src_defs(b, alu, continue_srcs); 512b8e80941Smrg 513b8e80941Smrg /* Make a new phi node that selects a value from prev_block and the 514b8e80941Smrg * result of the new instruction from continue_block. 515b8e80941Smrg */ 516b8e80941Smrg nir_phi_instr *const phi = nir_phi_instr_create(b->shader); 517b8e80941Smrg nir_phi_src *phi_src; 518b8e80941Smrg 519b8e80941Smrg phi_src = ralloc(phi, nir_phi_src); 520b8e80941Smrg phi_src->pred = prev_block; 521b8e80941Smrg phi_src->src = nir_src_for_ssa(prev_value); 522b8e80941Smrg exec_list_push_tail(&phi->srcs, &phi_src->node); 523b8e80941Smrg 524b8e80941Smrg phi_src = ralloc(phi, nir_phi_src); 525b8e80941Smrg phi_src->pred = continue_block; 526b8e80941Smrg phi_src->src = nir_src_for_ssa(alu_copy); 527b8e80941Smrg exec_list_push_tail(&phi->srcs, &phi_src->node); 528b8e80941Smrg 529b8e80941Smrg nir_ssa_dest_init(&phi->instr, &phi->dest, 530b8e80941Smrg alu_copy->num_components, alu_copy->bit_size, NULL); 531b8e80941Smrg 532b8e80941Smrg b->cursor = nir_after_phis(header_block); 533b8e80941Smrg nir_builder_instr_insert(b, &phi->instr); 534b8e80941Smrg 535b8e80941Smrg /* Modify all readers of the original ALU instruction to read the 536b8e80941Smrg * result of the phi. 537b8e80941Smrg */ 538b8e80941Smrg nir_foreach_use_safe(use_src, &alu->dest.dest.ssa) { 539b8e80941Smrg nir_instr_rewrite_src(use_src->parent_instr, 540b8e80941Smrg use_src, 541b8e80941Smrg nir_src_for_ssa(&phi->dest.ssa)); 542b8e80941Smrg } 543b8e80941Smrg 544b8e80941Smrg nir_foreach_if_use_safe(use_src, &alu->dest.dest.ssa) { 545b8e80941Smrg nir_if_rewrite_condition(use_src->parent_if, 546b8e80941Smrg nir_src_for_ssa(&phi->dest.ssa)); 547b8e80941Smrg } 548b8e80941Smrg 549b8e80941Smrg /* Since the original ALU instruction no longer has any readers, just 550b8e80941Smrg * remove it. 551b8e80941Smrg */ 552b8e80941Smrg nir_instr_remove_v(&alu->instr); 553b8e80941Smrg ralloc_free(alu); 554b8e80941Smrg 555b8e80941Smrg progress = true; 556b8e80941Smrg } 557b8e80941Smrg } 558b8e80941Smrg 559b8e80941Smrg return progress; 560b8e80941Smrg} 561b8e80941Smrg 562b8e80941Smrg/** 563b8e80941Smrg * Get the SSA value from a phi node that corresponds to a specific block 564b8e80941Smrg */ 565b8e80941Smrgstatic nir_ssa_def * 566b8e80941Smrgssa_for_phi_from_block(nir_phi_instr *phi, nir_block *block) 567b8e80941Smrg{ 568b8e80941Smrg nir_foreach_phi_src(src, phi) { 569b8e80941Smrg if (src->pred == block) 570b8e80941Smrg return src->src.ssa; 571b8e80941Smrg } 572b8e80941Smrg 573b8e80941Smrg assert(!"Block is not a predecessor of phi."); 574b8e80941Smrg return NULL; 575b8e80941Smrg} 576b8e80941Smrg 577b8e80941Smrg/** 578b8e80941Smrg * Simplify a bcsel whose sources are all phi nodes from the loop header block 579b8e80941Smrg * 580b8e80941Smrg * bcsel instructions in a loop that meet the following criteria can be 581b8e80941Smrg * converted to phi nodes: 582b8e80941Smrg * 583b8e80941Smrg * - The loop has no continue instructions other than the "natural" continue 584b8e80941Smrg * at the bottom of the loop. 585b8e80941Smrg * 586b8e80941Smrg * - All of the sources of the bcsel are phi nodes in the header block of the 587b8e80941Smrg * loop. 588b8e80941Smrg * 589b8e80941Smrg * - The phi node representing the condition of the bcsel instruction chooses 590b8e80941Smrg * only constant values. 591b8e80941Smrg * 592b8e80941Smrg * The contant value from the condition will select one of the other sources 593b8e80941Smrg * when entered from outside the loop and the remaining source when entered 594b8e80941Smrg * from the continue block. Since each of these sources is also a phi node in 595b8e80941Smrg * the header block, the value of the phi node can be "evaluated." These 596b8e80941Smrg * evaluated phi nodes provide the sources for a new phi node. All users of 597b8e80941Smrg * the bcsel result are updated to use the phi node result. 598b8e80941Smrg * 599b8e80941Smrg * The replacement transforms loops like: 600b8e80941Smrg * 601b8e80941Smrg * vec1 32 ssa_7 = undefined 602b8e80941Smrg * vec1 32 ssa_8 = load_const (0x00000001) 603b8e80941Smrg * vec1 32 ssa_9 = load_const (0x000000c8) 604b8e80941Smrg * vec1 32 ssa_10 = load_const (0x00000000) 605b8e80941Smrg * // succs: block_1 606b8e80941Smrg * loop { 607b8e80941Smrg * block block_1: 608b8e80941Smrg * // preds: block_0 block_4 609b8e80941Smrg * vec1 32 ssa_11 = phi block_0: ssa_1, block_4: ssa_14 610b8e80941Smrg * vec1 32 ssa_12 = phi block_0: ssa_10, block_4: ssa_15 611b8e80941Smrg * vec1 32 ssa_13 = phi block_0: ssa_7, block_4: ssa_25 612b8e80941Smrg * vec1 32 ssa_14 = b32csel ssa_12, ssa_13, ssa_11 613b8e80941Smrg * vec1 32 ssa_16 = ige32 ssa_14, ssa_9 614b8e80941Smrg * ... 615b8e80941Smrg * vec1 32 ssa_15 = load_const (0xffffffff) 616b8e80941Smrg * ... 617b8e80941Smrg * vec1 32 ssa_25 = iadd ssa_14, ssa_8 618b8e80941Smrg * // succs: block_1 619b8e80941Smrg * } 620b8e80941Smrg * 621b8e80941Smrg * into: 622b8e80941Smrg * 623b8e80941Smrg * vec1 32 ssa_7 = undefined 624b8e80941Smrg * vec1 32 ssa_8 = load_const (0x00000001) 625b8e80941Smrg * vec1 32 ssa_9 = load_const (0x000000c8) 626b8e80941Smrg * vec1 32 ssa_10 = load_const (0x00000000) 627b8e80941Smrg * // succs: block_1 628b8e80941Smrg * loop { 629b8e80941Smrg * block block_1: 630b8e80941Smrg * // preds: block_0 block_4 631b8e80941Smrg * vec1 32 ssa_11 = phi block_0: ssa_1, block_4: ssa_14 632b8e80941Smrg * vec1 32 ssa_12 = phi block_0: ssa_10, block_4: ssa_15 633b8e80941Smrg * vec1 32 ssa_13 = phi block_0: ssa_7, block_4: ssa_25 634b8e80941Smrg * vec1 32 sss_26 = phi block_0: ssa_1, block_4: ssa_25 635b8e80941Smrg * vec1 32 ssa_16 = ige32 ssa_26, ssa_9 636b8e80941Smrg * ... 637b8e80941Smrg * vec1 32 ssa_15 = load_const (0xffffffff) 638b8e80941Smrg * ... 639b8e80941Smrg * vec1 32 ssa_25 = iadd ssa_26, ssa_8 640b8e80941Smrg * // succs: block_1 641b8e80941Smrg * } 642b8e80941Smrg * 643b8e80941Smrg * \note 644b8e80941Smrg * It may be possible modify this function to not require a phi node as the 645b8e80941Smrg * source of the bcsel that is selected when entering from outside the loop. 646b8e80941Smrg * The only restriction is that the source must be geneated outside the loop 647b8e80941Smrg * (since it will become the source of a phi node in the header block of the 648b8e80941Smrg * loop). 649b8e80941Smrg */ 650b8e80941Smrgstatic bool 651b8e80941Smrgopt_simplify_bcsel_of_phi(nir_builder *b, nir_loop *loop) 652b8e80941Smrg{ 653b8e80941Smrg bool progress = false; 654b8e80941Smrg nir_block *header_block = nir_loop_first_block(loop); 655b8e80941Smrg nir_block *const prev_block = 656b8e80941Smrg nir_cf_node_as_block(nir_cf_node_prev(&loop->cf_node)); 657b8e80941Smrg 658b8e80941Smrg /* It would be insane if this were not true */ 659b8e80941Smrg assert(_mesa_set_search(header_block->predecessors, prev_block)); 660b8e80941Smrg 661b8e80941Smrg /* The loop must have exactly one continue block which could be a block 662b8e80941Smrg * ending in a continue instruction or the "natural" continue from the 663b8e80941Smrg * last block in the loop back to the top. 664b8e80941Smrg */ 665b8e80941Smrg if (header_block->predecessors->entries != 2) 666b8e80941Smrg return false; 667b8e80941Smrg 668b8e80941Smrg /* We can move any bcsel that can guaranteed to execut on every iteration 669b8e80941Smrg * of a loop. For now this is accomplished by only taking bcsels from the 670b8e80941Smrg * header_block. In the future, this could be expanced to include any 671b8e80941Smrg * bcsel that must come before any break. 672b8e80941Smrg * 673b8e80941Smrg * For more details, see 674b8e80941Smrg * https://gitlab.freedesktop.org/mesa/mesa/merge_requests/170#note_110305 675b8e80941Smrg */ 676b8e80941Smrg nir_foreach_instr_safe(instr, header_block) { 677b8e80941Smrg if (instr->type != nir_instr_type_alu) 678b8e80941Smrg continue; 679b8e80941Smrg 680b8e80941Smrg nir_alu_instr *const bcsel = nir_instr_as_alu(instr); 681b8e80941Smrg if (bcsel->op != nir_op_bcsel && 682b8e80941Smrg bcsel->op != nir_op_b32csel && 683b8e80941Smrg bcsel->op != nir_op_fcsel) 684b8e80941Smrg continue; 685b8e80941Smrg 686b8e80941Smrg bool match = true; 687b8e80941Smrg for (unsigned i = 0; i < 3; i++) { 688b8e80941Smrg /* FINISHME: The abs and negate cases could be handled by adding 689b8e80941Smrg * move instructions at the bottom of the continue block and more 690b8e80941Smrg * phi nodes in the header_block. 691b8e80941Smrg */ 692b8e80941Smrg if (!bcsel->src[i].src.is_ssa || 693b8e80941Smrg bcsel->src[i].src.ssa->parent_instr->type != nir_instr_type_phi || 694b8e80941Smrg bcsel->src[i].src.ssa->parent_instr->block != header_block || 695b8e80941Smrg bcsel->src[i].negate || bcsel->src[i].abs) { 696b8e80941Smrg match = false; 697b8e80941Smrg break; 698b8e80941Smrg } 699b8e80941Smrg } 700b8e80941Smrg 701b8e80941Smrg if (!match) 702b8e80941Smrg continue; 703b8e80941Smrg 704b8e80941Smrg nir_phi_instr *const cond_phi = 705b8e80941Smrg nir_instr_as_phi(bcsel->src[0].src.ssa->parent_instr); 706b8e80941Smrg 707b8e80941Smrg uint32_t entry_val = 0, continue_val = 0; 708b8e80941Smrg if (!phi_has_constant_from_outside_and_one_from_inside_loop(cond_phi, 709b8e80941Smrg prev_block, 710b8e80941Smrg &entry_val, 711b8e80941Smrg &continue_val)) 712b8e80941Smrg continue; 713b8e80941Smrg 714b8e80941Smrg /* If they both execute or both don't execute, this is a job for 715b8e80941Smrg * nir_dead_cf, not this pass. 716b8e80941Smrg */ 717b8e80941Smrg if ((entry_val && continue_val) || (!entry_val && !continue_val)) 718b8e80941Smrg continue; 719b8e80941Smrg 720b8e80941Smrg const unsigned entry_src = entry_val ? 1 : 2; 721b8e80941Smrg const unsigned continue_src = entry_val ? 2 : 1; 722b8e80941Smrg 723b8e80941Smrg /* Create a new phi node that selects the value for prev_block from 724b8e80941Smrg * the bcsel source that is selected by entry_val and the value for 725b8e80941Smrg * continue_block from the other bcsel source. Both sources have 726b8e80941Smrg * already been verified to be phi nodes. 727b8e80941Smrg */ 728b8e80941Smrg nir_block *const continue_block = find_continue_block(loop); 729b8e80941Smrg nir_phi_instr *const phi = nir_phi_instr_create(b->shader); 730b8e80941Smrg nir_phi_src *phi_src; 731b8e80941Smrg 732b8e80941Smrg phi_src = ralloc(phi, nir_phi_src); 733b8e80941Smrg phi_src->pred = prev_block; 734b8e80941Smrg phi_src->src = 735b8e80941Smrg nir_src_for_ssa(ssa_for_phi_from_block(nir_instr_as_phi(bcsel->src[entry_src].src.ssa->parent_instr), 736b8e80941Smrg prev_block)); 737b8e80941Smrg exec_list_push_tail(&phi->srcs, &phi_src->node); 738b8e80941Smrg 739b8e80941Smrg phi_src = ralloc(phi, nir_phi_src); 740b8e80941Smrg phi_src->pred = continue_block; 741b8e80941Smrg phi_src->src = 742b8e80941Smrg nir_src_for_ssa(ssa_for_phi_from_block(nir_instr_as_phi(bcsel->src[continue_src].src.ssa->parent_instr), 743b8e80941Smrg continue_block)); 744b8e80941Smrg exec_list_push_tail(&phi->srcs, &phi_src->node); 745b8e80941Smrg 746b8e80941Smrg nir_ssa_dest_init(&phi->instr, 747b8e80941Smrg &phi->dest, 748b8e80941Smrg nir_dest_num_components(bcsel->dest.dest), 749b8e80941Smrg nir_dest_bit_size(bcsel->dest.dest), 750b8e80941Smrg NULL); 751b8e80941Smrg 752b8e80941Smrg b->cursor = nir_after_phis(header_block); 753b8e80941Smrg nir_builder_instr_insert(b, &phi->instr); 754b8e80941Smrg 755b8e80941Smrg /* Modify all readers of the bcsel instruction to read the result of 756b8e80941Smrg * the phi. 757b8e80941Smrg */ 758b8e80941Smrg nir_foreach_use_safe(use_src, &bcsel->dest.dest.ssa) { 759b8e80941Smrg nir_instr_rewrite_src(use_src->parent_instr, 760b8e80941Smrg use_src, 761b8e80941Smrg nir_src_for_ssa(&phi->dest.ssa)); 762b8e80941Smrg } 763b8e80941Smrg 764b8e80941Smrg nir_foreach_if_use_safe(use_src, &bcsel->dest.dest.ssa) { 765b8e80941Smrg nir_if_rewrite_condition(use_src->parent_if, 766b8e80941Smrg nir_src_for_ssa(&phi->dest.ssa)); 767b8e80941Smrg } 768b8e80941Smrg 769b8e80941Smrg /* Since the original bcsel instruction no longer has any readers, 770b8e80941Smrg * just remove it. 771b8e80941Smrg */ 772b8e80941Smrg nir_instr_remove_v(&bcsel->instr); 773b8e80941Smrg ralloc_free(bcsel); 774b8e80941Smrg 775b8e80941Smrg progress = true; 776b8e80941Smrg } 777b8e80941Smrg 778b8e80941Smrg return progress; 779b8e80941Smrg} 780b8e80941Smrg 781b8e80941Smrgstatic bool 782b8e80941Smrgis_block_empty(nir_block *block) 783b8e80941Smrg{ 784b8e80941Smrg return nir_cf_node_is_last(&block->cf_node) && 785b8e80941Smrg exec_list_is_empty(&block->instr_list); 786b8e80941Smrg} 787b8e80941Smrg 788b8e80941Smrgstatic bool 789b8e80941Smrgnir_block_ends_in_continue(nir_block *block) 790b8e80941Smrg{ 791b8e80941Smrg if (exec_list_is_empty(&block->instr_list)) 792b8e80941Smrg return false; 793b8e80941Smrg 794b8e80941Smrg nir_instr *instr = nir_block_last_instr(block); 795b8e80941Smrg return instr->type == nir_instr_type_jump && 796b8e80941Smrg nir_instr_as_jump(instr)->type == nir_jump_continue; 797b8e80941Smrg} 798b8e80941Smrg 799b8e80941Smrg/** 800b8e80941Smrg * This optimization turns: 801b8e80941Smrg * 802b8e80941Smrg * loop { 803b8e80941Smrg * ... 804b8e80941Smrg * if (cond) { 805b8e80941Smrg * do_work_1(); 806b8e80941Smrg * continue; 807b8e80941Smrg * } else { 808b8e80941Smrg * } 809b8e80941Smrg * do_work_2(); 810b8e80941Smrg * } 811b8e80941Smrg * 812b8e80941Smrg * into: 813b8e80941Smrg * 814b8e80941Smrg * loop { 815b8e80941Smrg * ... 816b8e80941Smrg * if (cond) { 817b8e80941Smrg * do_work_1(); 818b8e80941Smrg * continue; 819b8e80941Smrg * } else { 820b8e80941Smrg * do_work_2(); 821b8e80941Smrg * } 822b8e80941Smrg * } 823b8e80941Smrg * 824b8e80941Smrg * The continue should then be removed by nir_opt_trivial_continues() and the 825b8e80941Smrg * loop can potentially be unrolled. 826b8e80941Smrg * 827b8e80941Smrg * Note: Unless the function param aggressive_last_continue==true do_work_2() 828b8e80941Smrg * is only ever blocks and nested loops. We avoid nesting other if-statments 829b8e80941Smrg * in the branch as this can result in increased register pressure, and in 830b8e80941Smrg * the i965 driver it causes a large amount of spilling in shader-db. 831b8e80941Smrg * For RADV however nesting these if-statements allows further continues to be 832b8e80941Smrg * remove and provides a significant FPS boost in Doom, which is why we have 833b8e80941Smrg * opted for this special bool to enable more aggresive optimisations. 834b8e80941Smrg * TODO: The GCM pass solves most of the spilling regressions in i965, if it 835b8e80941Smrg * is ever enabled we should consider removing the aggressive_last_continue 836b8e80941Smrg * param. 837b8e80941Smrg */ 838b8e80941Smrgstatic bool 839b8e80941Smrgopt_if_loop_last_continue(nir_loop *loop, bool aggressive_last_continue) 840b8e80941Smrg{ 841b8e80941Smrg nir_if *nif; 842b8e80941Smrg bool then_ends_in_continue = false; 843b8e80941Smrg bool else_ends_in_continue = false; 844b8e80941Smrg 845b8e80941Smrg /* Scan the control flow of the loop from the last to the first node 846b8e80941Smrg * looking for an if-statement we can optimise. 847b8e80941Smrg */ 848b8e80941Smrg nir_block *last_block = nir_loop_last_block(loop); 849b8e80941Smrg nir_cf_node *if_node = nir_cf_node_prev(&last_block->cf_node); 850b8e80941Smrg while (if_node) { 851b8e80941Smrg if (if_node->type == nir_cf_node_if) { 852b8e80941Smrg nif = nir_cf_node_as_if(if_node); 853b8e80941Smrg nir_block *then_block = nir_if_last_then_block(nif); 854b8e80941Smrg nir_block *else_block = nir_if_last_else_block(nif); 855b8e80941Smrg 856b8e80941Smrg then_ends_in_continue = nir_block_ends_in_continue(then_block); 857b8e80941Smrg else_ends_in_continue = nir_block_ends_in_continue(else_block); 858b8e80941Smrg 859b8e80941Smrg /* If both branches end in a jump do nothing, this should be handled 860b8e80941Smrg * by nir_opt_dead_cf(). 861b8e80941Smrg */ 862b8e80941Smrg if ((then_ends_in_continue || nir_block_ends_in_break(then_block)) && 863b8e80941Smrg (else_ends_in_continue || nir_block_ends_in_break(else_block))) 864b8e80941Smrg return false; 865b8e80941Smrg 866b8e80941Smrg /* If continue found stop scanning and attempt optimisation, or 867b8e80941Smrg */ 868b8e80941Smrg if (then_ends_in_continue || else_ends_in_continue || 869b8e80941Smrg !aggressive_last_continue) 870b8e80941Smrg break; 871b8e80941Smrg } 872b8e80941Smrg 873b8e80941Smrg if_node = nir_cf_node_prev(if_node); 874b8e80941Smrg } 875b8e80941Smrg 876b8e80941Smrg /* If we didn't find an if to optimise return */ 877b8e80941Smrg if (!then_ends_in_continue && !else_ends_in_continue) 878b8e80941Smrg return false; 879b8e80941Smrg 880b8e80941Smrg /* If there is nothing after the if-statement we bail */ 881b8e80941Smrg if (&nif->cf_node == nir_cf_node_prev(&last_block->cf_node) && 882b8e80941Smrg exec_list_is_empty(&last_block->instr_list)) 883b8e80941Smrg return false; 884b8e80941Smrg 885b8e80941Smrg /* Move the last block of the loop inside the last if-statement */ 886b8e80941Smrg nir_cf_list tmp; 887b8e80941Smrg nir_cf_extract(&tmp, nir_after_cf_node(if_node), 888b8e80941Smrg nir_after_block(last_block)); 889b8e80941Smrg if (then_ends_in_continue) 890b8e80941Smrg nir_cf_reinsert(&tmp, nir_after_cf_list(&nif->else_list)); 891b8e80941Smrg else 892b8e80941Smrg nir_cf_reinsert(&tmp, nir_after_cf_list(&nif->then_list)); 893b8e80941Smrg 894b8e80941Smrg /* In order to avoid running nir_lower_regs_to_ssa_impl() every time an if 895b8e80941Smrg * opt makes progress we leave nir_opt_trivial_continues() to remove the 896b8e80941Smrg * continue now that the end of the loop has been simplified. 897b8e80941Smrg */ 898b8e80941Smrg 899b8e80941Smrg return true; 900b8e80941Smrg} 901b8e80941Smrg 902b8e80941Smrg/* Walk all the phis in the block immediately following the if statement and 903b8e80941Smrg * swap the blocks. 904b8e80941Smrg */ 905b8e80941Smrgstatic void 906b8e80941Smrgrewrite_phi_predecessor_blocks(nir_if *nif, 907b8e80941Smrg nir_block *old_then_block, 908b8e80941Smrg nir_block *old_else_block, 909b8e80941Smrg nir_block *new_then_block, 910b8e80941Smrg nir_block *new_else_block) 911b8e80941Smrg{ 912b8e80941Smrg nir_block *after_if_block = 913b8e80941Smrg nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node)); 914b8e80941Smrg 915b8e80941Smrg nir_foreach_instr(instr, after_if_block) { 916b8e80941Smrg if (instr->type != nir_instr_type_phi) 917b8e80941Smrg continue; 918b8e80941Smrg 919b8e80941Smrg nir_phi_instr *phi = nir_instr_as_phi(instr); 920b8e80941Smrg 921b8e80941Smrg foreach_list_typed(nir_phi_src, src, node, &phi->srcs) { 922b8e80941Smrg if (src->pred == old_then_block) { 923b8e80941Smrg src->pred = new_then_block; 924b8e80941Smrg } else if (src->pred == old_else_block) { 925b8e80941Smrg src->pred = new_else_block; 926b8e80941Smrg } 927b8e80941Smrg } 928b8e80941Smrg } 929b8e80941Smrg} 930b8e80941Smrg 931b8e80941Smrg/** 932b8e80941Smrg * This optimization turns: 933b8e80941Smrg * 934b8e80941Smrg * if (cond) { 935b8e80941Smrg * } else { 936b8e80941Smrg * do_work(); 937b8e80941Smrg * } 938b8e80941Smrg * 939b8e80941Smrg * into: 940b8e80941Smrg * 941b8e80941Smrg * if (!cond) { 942b8e80941Smrg * do_work(); 943b8e80941Smrg * } else { 944b8e80941Smrg * } 945b8e80941Smrg */ 946b8e80941Smrgstatic bool 947b8e80941Smrgopt_if_simplification(nir_builder *b, nir_if *nif) 948b8e80941Smrg{ 949b8e80941Smrg /* Only simplify if the then block is empty and the else block is not. */ 950b8e80941Smrg if (!is_block_empty(nir_if_first_then_block(nif)) || 951b8e80941Smrg is_block_empty(nir_if_first_else_block(nif))) 952b8e80941Smrg return false; 953b8e80941Smrg 954b8e80941Smrg /* Make sure the condition is a comparison operation. */ 955b8e80941Smrg nir_instr *src_instr = nif->condition.ssa->parent_instr; 956b8e80941Smrg if (src_instr->type != nir_instr_type_alu) 957b8e80941Smrg return false; 958b8e80941Smrg 959b8e80941Smrg nir_alu_instr *alu_instr = nir_instr_as_alu(src_instr); 960b8e80941Smrg if (!nir_alu_instr_is_comparison(alu_instr)) 961b8e80941Smrg return false; 962b8e80941Smrg 963b8e80941Smrg /* Insert the inverted instruction and rewrite the condition. */ 964b8e80941Smrg b->cursor = nir_after_instr(&alu_instr->instr); 965b8e80941Smrg 966b8e80941Smrg nir_ssa_def *new_condition = 967b8e80941Smrg nir_inot(b, &alu_instr->dest.dest.ssa); 968b8e80941Smrg 969b8e80941Smrg nir_if_rewrite_condition(nif, nir_src_for_ssa(new_condition)); 970b8e80941Smrg 971b8e80941Smrg /* Grab pointers to the last then/else blocks for fixing up the phis. */ 972b8e80941Smrg nir_block *then_block = nir_if_last_then_block(nif); 973b8e80941Smrg nir_block *else_block = nir_if_last_else_block(nif); 974b8e80941Smrg 975b8e80941Smrg rewrite_phi_predecessor_blocks(nif, then_block, else_block, else_block, 976b8e80941Smrg then_block); 977b8e80941Smrg 978b8e80941Smrg /* Finally, move the else block to the then block. */ 979b8e80941Smrg nir_cf_list tmp; 980b8e80941Smrg nir_cf_extract(&tmp, nir_before_cf_list(&nif->else_list), 981b8e80941Smrg nir_after_cf_list(&nif->else_list)); 982b8e80941Smrg nir_cf_reinsert(&tmp, nir_before_cf_list(&nif->then_list)); 983b8e80941Smrg 984b8e80941Smrg return true; 985b8e80941Smrg} 986b8e80941Smrg 987b8e80941Smrg/** 988b8e80941Smrg * This optimization simplifies potential loop terminators which then allows 989b8e80941Smrg * other passes such as opt_if_simplification() and loop unrolling to progress 990b8e80941Smrg * further: 991b8e80941Smrg * 992b8e80941Smrg * if (cond) { 993b8e80941Smrg * ... then block instructions ... 994b8e80941Smrg * } else { 995b8e80941Smrg * ... 996b8e80941Smrg * break; 997b8e80941Smrg * } 998b8e80941Smrg * 999b8e80941Smrg * into: 1000b8e80941Smrg * 1001b8e80941Smrg * if (cond) { 1002b8e80941Smrg * } else { 1003b8e80941Smrg * ... 1004b8e80941Smrg * break; 1005b8e80941Smrg * } 1006b8e80941Smrg * ... then block instructions ... 1007b8e80941Smrg */ 1008b8e80941Smrgstatic bool 1009b8e80941Smrgopt_if_loop_terminator(nir_if *nif) 1010b8e80941Smrg{ 1011b8e80941Smrg nir_block *break_blk = NULL; 1012b8e80941Smrg nir_block *continue_from_blk = NULL; 1013b8e80941Smrg bool continue_from_then = true; 1014b8e80941Smrg 1015b8e80941Smrg nir_block *last_then = nir_if_last_then_block(nif); 1016b8e80941Smrg nir_block *last_else = nir_if_last_else_block(nif); 1017b8e80941Smrg 1018b8e80941Smrg if (nir_block_ends_in_break(last_then)) { 1019b8e80941Smrg break_blk = last_then; 1020b8e80941Smrg continue_from_blk = last_else; 1021b8e80941Smrg continue_from_then = false; 1022b8e80941Smrg } else if (nir_block_ends_in_break(last_else)) { 1023b8e80941Smrg break_blk = last_else; 1024b8e80941Smrg continue_from_blk = last_then; 1025b8e80941Smrg } 1026b8e80941Smrg 1027b8e80941Smrg /* Continue if the if-statement contained no jumps at all */ 1028b8e80941Smrg if (!break_blk) 1029b8e80941Smrg return false; 1030b8e80941Smrg 1031b8e80941Smrg /* If the continue from block is empty then return as there is nothing to 1032b8e80941Smrg * move. 1033b8e80941Smrg */ 1034b8e80941Smrg nir_block *first_continue_from_blk = continue_from_then ? 1035b8e80941Smrg nir_if_first_then_block(nif) : 1036b8e80941Smrg nir_if_first_else_block(nif); 1037b8e80941Smrg if (is_block_empty(first_continue_from_blk)) 1038b8e80941Smrg return false; 1039b8e80941Smrg 1040b8e80941Smrg if (!nir_is_trivial_loop_if(nif, break_blk)) 1041b8e80941Smrg return false; 1042b8e80941Smrg 1043b8e80941Smrg /* Even though this if statement has a jump on one side, we may still have 1044b8e80941Smrg * phis afterwards. Single-source phis can be produced by loop unrolling 1045b8e80941Smrg * or dead control-flow passes and are perfectly legal. Run a quick phi 1046b8e80941Smrg * removal on the block after the if to clean up any such phis. 1047b8e80941Smrg */ 1048b8e80941Smrg nir_opt_remove_phis_block(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node))); 1049b8e80941Smrg 1050b8e80941Smrg /* Finally, move the continue from branch after the if-statement. */ 1051b8e80941Smrg nir_cf_list tmp; 1052b8e80941Smrg nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk), 1053b8e80941Smrg nir_after_block(continue_from_blk)); 1054b8e80941Smrg nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node)); 1055b8e80941Smrg 1056b8e80941Smrg return true; 1057b8e80941Smrg} 1058b8e80941Smrg 1059b8e80941Smrgstatic bool 1060b8e80941Smrgevaluate_if_condition(nir_if *nif, nir_cursor cursor, bool *value) 1061b8e80941Smrg{ 1062b8e80941Smrg nir_block *use_block = nir_cursor_current_block(cursor); 1063b8e80941Smrg if (nir_block_dominates(nir_if_first_then_block(nif), use_block)) { 1064b8e80941Smrg *value = true; 1065b8e80941Smrg return true; 1066b8e80941Smrg } else if (nir_block_dominates(nir_if_first_else_block(nif), use_block)) { 1067b8e80941Smrg *value = false; 1068b8e80941Smrg return true; 1069b8e80941Smrg } else { 1070b8e80941Smrg return false; 1071b8e80941Smrg } 1072b8e80941Smrg} 1073b8e80941Smrg 1074b8e80941Smrgstatic nir_ssa_def * 1075b8e80941Smrgclone_alu_and_replace_src_defs(nir_builder *b, const nir_alu_instr *alu, 1076b8e80941Smrg nir_ssa_def **src_defs) 1077b8e80941Smrg{ 1078b8e80941Smrg nir_alu_instr *nalu = nir_alu_instr_create(b->shader, alu->op); 1079b8e80941Smrg nalu->exact = alu->exact; 1080b8e80941Smrg 1081b8e80941Smrg nir_ssa_dest_init(&nalu->instr, &nalu->dest.dest, 1082b8e80941Smrg alu->dest.dest.ssa.num_components, 1083b8e80941Smrg alu->dest.dest.ssa.bit_size, alu->dest.dest.ssa.name); 1084b8e80941Smrg 1085b8e80941Smrg nalu->dest.saturate = alu->dest.saturate; 1086b8e80941Smrg nalu->dest.write_mask = alu->dest.write_mask; 1087b8e80941Smrg 1088b8e80941Smrg for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { 1089b8e80941Smrg assert(alu->src[i].src.is_ssa); 1090b8e80941Smrg nalu->src[i].src = nir_src_for_ssa(src_defs[i]); 1091b8e80941Smrg nalu->src[i].negate = alu->src[i].negate; 1092b8e80941Smrg nalu->src[i].abs = alu->src[i].abs; 1093b8e80941Smrg memcpy(nalu->src[i].swizzle, alu->src[i].swizzle, 1094b8e80941Smrg sizeof(nalu->src[i].swizzle)); 1095b8e80941Smrg } 1096b8e80941Smrg 1097b8e80941Smrg nir_builder_instr_insert(b, &nalu->instr); 1098b8e80941Smrg 1099b8e80941Smrg return &nalu->dest.dest.ssa;; 1100b8e80941Smrg} 1101b8e80941Smrg 1102b8e80941Smrg/* 1103b8e80941Smrg * This propagates if condition evaluation down the chain of some alu 1104b8e80941Smrg * instructions. For example by checking the use of some of the following alu 1105b8e80941Smrg * instruction we can eventually replace ssa_107 with NIR_TRUE. 1106b8e80941Smrg * 1107b8e80941Smrg * loop { 1108b8e80941Smrg * block block_1: 1109b8e80941Smrg * vec1 32 ssa_85 = load_const (0x00000002) 1110b8e80941Smrg * vec1 32 ssa_86 = ieq ssa_48, ssa_85 1111b8e80941Smrg * vec1 32 ssa_87 = load_const (0x00000001) 1112b8e80941Smrg * vec1 32 ssa_88 = ieq ssa_48, ssa_87 1113b8e80941Smrg * vec1 32 ssa_89 = ior ssa_86, ssa_88 1114b8e80941Smrg * vec1 32 ssa_90 = ieq ssa_48, ssa_0 1115b8e80941Smrg * vec1 32 ssa_91 = ior ssa_89, ssa_90 1116b8e80941Smrg * if ssa_86 { 1117b8e80941Smrg * block block_2: 1118b8e80941Smrg * ... 1119b8e80941Smrg * break 1120b8e80941Smrg * } else { 1121b8e80941Smrg * block block_3: 1122b8e80941Smrg * } 1123b8e80941Smrg * block block_4: 1124b8e80941Smrg * if ssa_88 { 1125b8e80941Smrg * block block_5: 1126b8e80941Smrg * ... 1127b8e80941Smrg * break 1128b8e80941Smrg * } else { 1129b8e80941Smrg * block block_6: 1130b8e80941Smrg * } 1131b8e80941Smrg * block block_7: 1132b8e80941Smrg * if ssa_90 { 1133b8e80941Smrg * block block_8: 1134b8e80941Smrg * ... 1135b8e80941Smrg * break 1136b8e80941Smrg * } else { 1137b8e80941Smrg * block block_9: 1138b8e80941Smrg * } 1139b8e80941Smrg * block block_10: 1140b8e80941Smrg * vec1 32 ssa_107 = inot ssa_91 1141b8e80941Smrg * if ssa_107 { 1142b8e80941Smrg * block block_11: 1143b8e80941Smrg * break 1144b8e80941Smrg * } else { 1145b8e80941Smrg * block block_12: 1146b8e80941Smrg * } 1147b8e80941Smrg * } 1148b8e80941Smrg */ 1149b8e80941Smrgstatic bool 1150b8e80941Smrgpropagate_condition_eval(nir_builder *b, nir_if *nif, nir_src *use_src, 1151b8e80941Smrg nir_src *alu_use, nir_alu_instr *alu, 1152b8e80941Smrg bool is_if_condition) 1153b8e80941Smrg{ 1154b8e80941Smrg bool bool_value; 1155b8e80941Smrg b->cursor = nir_before_src(alu_use, is_if_condition); 1156b8e80941Smrg if (!evaluate_if_condition(nif, b->cursor, &bool_value)) 1157b8e80941Smrg return false; 1158b8e80941Smrg 1159b8e80941Smrg nir_ssa_def *def[4] = {0}; 1160b8e80941Smrg for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { 1161b8e80941Smrg if (alu->src[i].src.ssa == use_src->ssa) { 1162b8e80941Smrg def[i] = nir_imm_bool(b, bool_value); 1163b8e80941Smrg } else { 1164b8e80941Smrg def[i] = alu->src[i].src.ssa; 1165b8e80941Smrg } 1166b8e80941Smrg } 1167b8e80941Smrg 1168b8e80941Smrg nir_ssa_def *nalu = clone_alu_and_replace_src_defs(b, alu, def); 1169b8e80941Smrg 1170b8e80941Smrg /* Rewrite use to use new alu instruction */ 1171b8e80941Smrg nir_src new_src = nir_src_for_ssa(nalu); 1172b8e80941Smrg 1173b8e80941Smrg if (is_if_condition) 1174b8e80941Smrg nir_if_rewrite_condition(alu_use->parent_if, new_src); 1175b8e80941Smrg else 1176b8e80941Smrg nir_instr_rewrite_src(alu_use->parent_instr, alu_use, new_src); 1177b8e80941Smrg 1178b8e80941Smrg return true; 1179b8e80941Smrg} 1180b8e80941Smrg 1181b8e80941Smrgstatic bool 1182b8e80941Smrgcan_propagate_through_alu(nir_src *src) 1183b8e80941Smrg{ 1184b8e80941Smrg if (src->parent_instr->type != nir_instr_type_alu) 1185b8e80941Smrg return false; 1186b8e80941Smrg 1187b8e80941Smrg nir_alu_instr *alu = nir_instr_as_alu(src->parent_instr); 1188b8e80941Smrg switch (alu->op) { 1189b8e80941Smrg case nir_op_ior: 1190b8e80941Smrg case nir_op_iand: 1191b8e80941Smrg case nir_op_inot: 1192b8e80941Smrg case nir_op_b2i32: 1193b8e80941Smrg return true; 1194b8e80941Smrg case nir_op_bcsel: 1195b8e80941Smrg return src == &alu->src[0].src; 1196b8e80941Smrg default: 1197b8e80941Smrg return false; 1198b8e80941Smrg } 1199b8e80941Smrg} 1200b8e80941Smrg 1201b8e80941Smrgstatic bool 1202b8e80941Smrgevaluate_condition_use(nir_builder *b, nir_if *nif, nir_src *use_src, 1203b8e80941Smrg bool is_if_condition) 1204b8e80941Smrg{ 1205b8e80941Smrg bool progress = false; 1206b8e80941Smrg 1207b8e80941Smrg b->cursor = nir_before_src(use_src, is_if_condition); 1208b8e80941Smrg 1209b8e80941Smrg bool bool_value; 1210b8e80941Smrg if (evaluate_if_condition(nif, b->cursor, &bool_value)) { 1211b8e80941Smrg /* Rewrite use to use const */ 1212b8e80941Smrg nir_src imm_src = nir_src_for_ssa(nir_imm_bool(b, bool_value)); 1213b8e80941Smrg if (is_if_condition) 1214b8e80941Smrg nir_if_rewrite_condition(use_src->parent_if, imm_src); 1215b8e80941Smrg else 1216b8e80941Smrg nir_instr_rewrite_src(use_src->parent_instr, use_src, imm_src); 1217b8e80941Smrg 1218b8e80941Smrg progress = true; 1219b8e80941Smrg } 1220b8e80941Smrg 1221b8e80941Smrg if (!is_if_condition && can_propagate_through_alu(use_src)) { 1222b8e80941Smrg nir_alu_instr *alu = nir_instr_as_alu(use_src->parent_instr); 1223b8e80941Smrg 1224b8e80941Smrg nir_foreach_use_safe(alu_use, &alu->dest.dest.ssa) { 1225b8e80941Smrg progress |= propagate_condition_eval(b, nif, use_src, alu_use, alu, 1226b8e80941Smrg false); 1227b8e80941Smrg } 1228b8e80941Smrg 1229b8e80941Smrg nir_foreach_if_use_safe(alu_use, &alu->dest.dest.ssa) { 1230b8e80941Smrg progress |= propagate_condition_eval(b, nif, use_src, alu_use, alu, 1231b8e80941Smrg true); 1232b8e80941Smrg } 1233b8e80941Smrg } 1234b8e80941Smrg 1235b8e80941Smrg return progress; 1236b8e80941Smrg} 1237b8e80941Smrg 1238b8e80941Smrgstatic bool 1239b8e80941Smrgopt_if_evaluate_condition_use(nir_builder *b, nir_if *nif) 1240b8e80941Smrg{ 1241b8e80941Smrg bool progress = false; 1242b8e80941Smrg 1243b8e80941Smrg /* Evaluate any uses of the if condition inside the if branches */ 1244b8e80941Smrg assert(nif->condition.is_ssa); 1245b8e80941Smrg nir_foreach_use_safe(use_src, nif->condition.ssa) { 1246b8e80941Smrg progress |= evaluate_condition_use(b, nif, use_src, false); 1247b8e80941Smrg } 1248b8e80941Smrg 1249b8e80941Smrg nir_foreach_if_use_safe(use_src, nif->condition.ssa) { 1250b8e80941Smrg if (use_src->parent_if != nif) 1251b8e80941Smrg progress |= evaluate_condition_use(b, nif, use_src, true); 1252b8e80941Smrg } 1253b8e80941Smrg 1254b8e80941Smrg return progress; 1255b8e80941Smrg} 1256b8e80941Smrg 1257b8e80941Smrgstatic void 1258b8e80941Smrgsimple_merge_if(nir_if *dest_if, nir_if *src_if, bool dest_if_then, 1259b8e80941Smrg bool src_if_then) 1260b8e80941Smrg{ 1261b8e80941Smrg /* Now merge the if branch */ 1262b8e80941Smrg nir_block *dest_blk = dest_if_then ? nir_if_last_then_block(dest_if) 1263b8e80941Smrg : nir_if_last_else_block(dest_if); 1264b8e80941Smrg 1265b8e80941Smrg struct exec_list *list = src_if_then ? &src_if->then_list 1266b8e80941Smrg : &src_if->else_list; 1267b8e80941Smrg 1268b8e80941Smrg nir_cf_list if_cf_list; 1269b8e80941Smrg nir_cf_extract(&if_cf_list, nir_before_cf_list(list), 1270b8e80941Smrg nir_after_cf_list(list)); 1271b8e80941Smrg nir_cf_reinsert(&if_cf_list, nir_after_block(dest_blk)); 1272b8e80941Smrg} 1273b8e80941Smrg 1274b8e80941Smrgstatic bool 1275b8e80941Smrgopt_if_merge(nir_if *nif) 1276b8e80941Smrg{ 1277b8e80941Smrg bool progress = false; 1278b8e80941Smrg 1279b8e80941Smrg nir_block *next_blk = nir_cf_node_cf_tree_next(&nif->cf_node); 1280b8e80941Smrg if (next_blk && nif->condition.is_ssa) { 1281b8e80941Smrg nir_if *next_if = nir_block_get_following_if(next_blk); 1282b8e80941Smrg if (next_if && next_if->condition.is_ssa) { 1283b8e80941Smrg 1284b8e80941Smrg /* Here we merge two consecutive ifs that have the same 1285b8e80941Smrg * condition e.g: 1286b8e80941Smrg * 1287b8e80941Smrg * if ssa_12 { 1288b8e80941Smrg * ... 1289b8e80941Smrg * } else { 1290b8e80941Smrg * ... 1291b8e80941Smrg * } 1292b8e80941Smrg * if ssa_12 { 1293b8e80941Smrg * ... 1294b8e80941Smrg * } else { 1295b8e80941Smrg * ... 1296b8e80941Smrg * } 1297b8e80941Smrg * 1298b8e80941Smrg * Note: This only merges if-statements when the block between them 1299b8e80941Smrg * is empty. The reason we don't try to merge ifs that just have phis 1300b8e80941Smrg * between them is because this can results in increased register 1301b8e80941Smrg * pressure. For example when merging if ladders created by indirect 1302b8e80941Smrg * indexing. 1303b8e80941Smrg */ 1304b8e80941Smrg if (nif->condition.ssa == next_if->condition.ssa && 1305b8e80941Smrg exec_list_is_empty(&next_blk->instr_list)) { 1306b8e80941Smrg 1307b8e80941Smrg simple_merge_if(nif, next_if, true, true); 1308b8e80941Smrg simple_merge_if(nif, next_if, false, false); 1309b8e80941Smrg 1310b8e80941Smrg nir_block *new_then_block = nir_if_last_then_block(nif); 1311b8e80941Smrg nir_block *new_else_block = nir_if_last_else_block(nif); 1312b8e80941Smrg 1313b8e80941Smrg nir_block *old_then_block = nir_if_last_then_block(next_if); 1314b8e80941Smrg nir_block *old_else_block = nir_if_last_else_block(next_if); 1315b8e80941Smrg 1316b8e80941Smrg /* Rewrite the predecessor block for any phis following the second 1317b8e80941Smrg * if-statement. 1318b8e80941Smrg */ 1319b8e80941Smrg rewrite_phi_predecessor_blocks(next_if, old_then_block, 1320b8e80941Smrg old_else_block, 1321b8e80941Smrg new_then_block, 1322b8e80941Smrg new_else_block); 1323b8e80941Smrg 1324b8e80941Smrg /* Move phis after merged if to avoid them being deleted when we 1325b8e80941Smrg * remove the merged if-statement. 1326b8e80941Smrg */ 1327b8e80941Smrg nir_block *after_next_if_block = 1328b8e80941Smrg nir_cf_node_as_block(nir_cf_node_next(&next_if->cf_node)); 1329b8e80941Smrg 1330b8e80941Smrg nir_foreach_instr_safe(instr, after_next_if_block) { 1331b8e80941Smrg if (instr->type != nir_instr_type_phi) 1332b8e80941Smrg break; 1333b8e80941Smrg 1334b8e80941Smrg exec_node_remove(&instr->node); 1335b8e80941Smrg exec_list_push_tail(&next_blk->instr_list, &instr->node); 1336b8e80941Smrg instr->block = next_blk; 1337b8e80941Smrg } 1338b8e80941Smrg 1339b8e80941Smrg nir_cf_node_remove(&next_if->cf_node); 1340b8e80941Smrg 1341b8e80941Smrg progress = true; 1342b8e80941Smrg } 1343b8e80941Smrg } 1344b8e80941Smrg } 1345b8e80941Smrg 1346b8e80941Smrg return progress; 1347b8e80941Smrg} 1348b8e80941Smrg 1349b8e80941Smrgstatic bool 1350b8e80941Smrgopt_if_cf_list(nir_builder *b, struct exec_list *cf_list, 1351b8e80941Smrg bool aggressive_last_continue) 1352b8e80941Smrg{ 1353b8e80941Smrg bool progress = false; 1354b8e80941Smrg foreach_list_typed(nir_cf_node, cf_node, node, cf_list) { 1355b8e80941Smrg switch (cf_node->type) { 1356b8e80941Smrg case nir_cf_node_block: 1357b8e80941Smrg break; 1358b8e80941Smrg 1359b8e80941Smrg case nir_cf_node_if: { 1360b8e80941Smrg nir_if *nif = nir_cf_node_as_if(cf_node); 1361b8e80941Smrg progress |= opt_if_cf_list(b, &nif->then_list, 1362b8e80941Smrg aggressive_last_continue); 1363b8e80941Smrg progress |= opt_if_cf_list(b, &nif->else_list, 1364b8e80941Smrg aggressive_last_continue); 1365b8e80941Smrg progress |= opt_if_loop_terminator(nif); 1366b8e80941Smrg progress |= opt_if_merge(nif); 1367b8e80941Smrg progress |= opt_if_simplification(b, nif); 1368b8e80941Smrg break; 1369b8e80941Smrg } 1370b8e80941Smrg 1371b8e80941Smrg case nir_cf_node_loop: { 1372b8e80941Smrg nir_loop *loop = nir_cf_node_as_loop(cf_node); 1373b8e80941Smrg progress |= opt_if_cf_list(b, &loop->body, 1374b8e80941Smrg aggressive_last_continue); 1375b8e80941Smrg progress |= opt_simplify_bcsel_of_phi(b, loop); 1376b8e80941Smrg progress |= opt_peel_loop_initial_if(loop); 1377b8e80941Smrg progress |= opt_if_loop_last_continue(loop, 1378b8e80941Smrg aggressive_last_continue); 1379b8e80941Smrg break; 1380b8e80941Smrg } 1381b8e80941Smrg 1382b8e80941Smrg case nir_cf_node_function: 1383b8e80941Smrg unreachable("Invalid cf type"); 1384b8e80941Smrg } 1385b8e80941Smrg } 1386b8e80941Smrg 1387b8e80941Smrg return progress; 1388b8e80941Smrg} 1389b8e80941Smrg 1390b8e80941Smrg/** 1391b8e80941Smrg * These optimisations depend on nir_metadata_block_index and therefore must 1392b8e80941Smrg * not do anything to cause the metadata to become invalid. 1393b8e80941Smrg */ 1394b8e80941Smrgstatic bool 1395b8e80941Smrgopt_if_safe_cf_list(nir_builder *b, struct exec_list *cf_list) 1396b8e80941Smrg{ 1397b8e80941Smrg bool progress = false; 1398b8e80941Smrg foreach_list_typed(nir_cf_node, cf_node, node, cf_list) { 1399b8e80941Smrg switch (cf_node->type) { 1400b8e80941Smrg case nir_cf_node_block: 1401b8e80941Smrg break; 1402b8e80941Smrg 1403b8e80941Smrg case nir_cf_node_if: { 1404b8e80941Smrg nir_if *nif = nir_cf_node_as_if(cf_node); 1405b8e80941Smrg progress |= opt_if_safe_cf_list(b, &nif->then_list); 1406b8e80941Smrg progress |= opt_if_safe_cf_list(b, &nif->else_list); 1407b8e80941Smrg progress |= opt_if_evaluate_condition_use(b, nif); 1408b8e80941Smrg break; 1409b8e80941Smrg } 1410b8e80941Smrg 1411b8e80941Smrg case nir_cf_node_loop: { 1412b8e80941Smrg nir_loop *loop = nir_cf_node_as_loop(cf_node); 1413b8e80941Smrg progress |= opt_if_safe_cf_list(b, &loop->body); 1414b8e80941Smrg progress |= opt_split_alu_of_phi(b, loop); 1415b8e80941Smrg break; 1416b8e80941Smrg } 1417b8e80941Smrg 1418b8e80941Smrg case nir_cf_node_function: 1419b8e80941Smrg unreachable("Invalid cf type"); 1420b8e80941Smrg } 1421b8e80941Smrg } 1422b8e80941Smrg 1423b8e80941Smrg return progress; 1424b8e80941Smrg} 1425b8e80941Smrg 1426b8e80941Smrgbool 1427b8e80941Smrgnir_opt_if(nir_shader *shader, bool aggressive_last_continue) 1428b8e80941Smrg{ 1429b8e80941Smrg bool progress = false; 1430b8e80941Smrg 1431b8e80941Smrg nir_foreach_function(function, shader) { 1432b8e80941Smrg if (function->impl == NULL) 1433b8e80941Smrg continue; 1434b8e80941Smrg 1435b8e80941Smrg nir_builder b; 1436b8e80941Smrg nir_builder_init(&b, function->impl); 1437b8e80941Smrg 1438b8e80941Smrg nir_metadata_require(function->impl, nir_metadata_block_index | 1439b8e80941Smrg nir_metadata_dominance); 1440b8e80941Smrg progress = opt_if_safe_cf_list(&b, &function->impl->body); 1441b8e80941Smrg nir_metadata_preserve(function->impl, nir_metadata_block_index | 1442b8e80941Smrg nir_metadata_dominance); 1443b8e80941Smrg 1444b8e80941Smrg if (opt_if_cf_list(&b, &function->impl->body, 1445b8e80941Smrg aggressive_last_continue)) { 1446b8e80941Smrg nir_metadata_preserve(function->impl, nir_metadata_none); 1447b8e80941Smrg 1448b8e80941Smrg /* If that made progress, we're no longer really in SSA form. We 1449b8e80941Smrg * need to convert registers back into SSA defs and clean up SSA defs 1450b8e80941Smrg * that don't dominate their uses. 1451b8e80941Smrg */ 1452b8e80941Smrg nir_lower_regs_to_ssa_impl(function->impl); 1453b8e80941Smrg 1454b8e80941Smrg progress = true; 1455b8e80941Smrg } else { 1456b8e80941Smrg #ifndef NDEBUG 1457b8e80941Smrg function->impl->valid_metadata &= ~nir_metadata_not_properly_reset; 1458b8e80941Smrg #endif 1459b8e80941Smrg } 1460b8e80941Smrg } 1461b8e80941Smrg 1462b8e80941Smrg return progress; 1463b8e80941Smrg} 1464