/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Jason Ekstrand (jason@jlekstrand.net)
 *
 */

#include "nir.h"
#include "nir_instr_set.h"

/*
 * Implements Global Code Motion. A description of GCM can be found in
 * "Global Code Motion; Global Value Numbering" by Cliff Click.
 * Unfortunately, the algorithm presented in the paper is broken in a
 * number of ways. The algorithm used here differs substantially from the
 * one in the paper but it is, in my opinion, much easier to read and
 * verify correctness.
 */

/* This is used to stop GCM moving instructions out of a loop if the loop
 * contains too many instructions and moving them would create excess spilling.
 *
 * TODO: Figure out a better way to decide if we should remove instructions from
 * a loop.
 */
#define MAX_LOOP_INSTRUCTIONS 100

/* Per-basic-block information gathered before scheduling. */
struct gcm_block_info {
   /* Number of loops this block is inside */
   unsigned loop_depth;

   /* Instruction count of the innermost loop containing this block, or ~0u
    * when the count is unknown/not applicable (blocks outside any tracked
    * loop body).  Used by the "don't hoist out of big loops" heuristic.
    */
   unsigned loop_instr_count;

   /* The loop the block is nested inside or NULL */
   nir_loop *loop;

   /* The last instruction inserted into this block. This is used as we
    * traverse the instructions and insert them back into the program to
    * put them in the right order.
    */
   nir_instr *last_instr;
};

/* Per-instruction scheduling information, indexed by instr->index. */
struct gcm_instr_info {
   /* Earliest block this instruction may legally be placed in, i.e. the
    * deepest block dominated by all of its sources.
    */
   nir_block *early_block;
};

/* Flags used in the instr->pass_flags field for various instruction states */
enum {
   GCM_INSTR_PINNED = (1 << 0),
   GCM_INSTR_SCHEDULE_EARLIER_ONLY = (1 << 1),
   GCM_INSTR_SCHEDULED_EARLY = (1 << 2),
   GCM_INSTR_SCHEDULED_LATE = (1 << 3),
   GCM_INSTR_PLACED = (1 << 4),
};

/* Pass-wide state threaded through all of the scheduling helpers. */
struct gcm_state {
   nir_function_impl *impl;

   /* The instruction currently being scheduled; used by the per-source
    * callback gcm_schedule_early_src to find its consumer.
    */
   nir_instr *instr;

   bool progress;

   /* The list of non-pinned instructions. As we do the late scheduling,
    * we pull non-pinned instructions out of their blocks and place them in
    * this list. This saves us from having linked-list problems when we go
    * to put instructions back in their blocks.
    */
   struct exec_list instrs;

   /* Array indexed by block->index; see gcm_block_info. */
   struct gcm_block_info *blocks;

   /* Number of instructions in the impl; also the size of instr_infos. */
   unsigned num_instrs;

   /* Array indexed by instr->index; see gcm_instr_info. */
   struct gcm_instr_info *instr_infos;
};

/* Recursively counts the instructions contained in a CF list, including
 * those inside nested ifs and loops.
 */
static unsigned
get_loop_instr_count(struct exec_list *cf_list)
{
   unsigned loop_instr_count = 0;
   foreach_list_typed(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         nir_foreach_instr(instr, block) {
            loop_instr_count++;
         }
         break;
      }
      case nir_cf_node_if: {
         nir_if *if_stmt = nir_cf_node_as_if(node);
         loop_instr_count += get_loop_instr_count(&if_stmt->then_list);
         loop_instr_count += get_loop_instr_count(&if_stmt->else_list);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         loop_instr_count += get_loop_instr_count(&loop->body);
         break;
      }
      default:
         unreachable("Invalid CF node type");
      }
   }

   return loop_instr_count;
}

/* Recursively walks the CFG and builds the block_info structure.
 *
 * loop/loop_depth describe the innermost loop the blocks in cf_list belong
 * to; loop_instr_count is that loop's instruction count, or ~0u outside any
 * counted loop (so the MAX_LOOP_INSTRUCTIONS check always fails there).
 */
static void
gcm_build_block_info(struct exec_list *cf_list, struct gcm_state *state,
                     nir_loop *loop, unsigned loop_depth,
                     unsigned loop_instr_count)
{
   foreach_list_typed(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         state->blocks[block->index].loop_depth = loop_depth;
         state->blocks[block->index].loop_instr_count = loop_instr_count;
         state->blocks[block->index].loop = loop;
         break;
      }
      case nir_cf_node_if: {
         nir_if *if_stmt = nir_cf_node_as_if(node);
         gcm_build_block_info(&if_stmt->then_list, state, loop, loop_depth, ~0u);
         gcm_build_block_info(&if_stmt->else_list, state, loop, loop_depth, ~0u);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         gcm_build_block_info(&loop->body, state, loop, loop_depth + 1,
                              get_loop_instr_count(&loop->body));
         break;
      }
      default:
         unreachable("Invalid CF node type");
      }
   }
}

/* Returns true if a mov of this source could later be scalarized without
 * preventing the producer from being scalarized/copy-propagated as well.
 */
static bool
is_src_scalarizable(nir_src *src)
{
   assert(src->is_ssa);

   nir_instr *src_instr = src->ssa->parent_instr;
   switch (src_instr->type) {
   case nir_instr_type_alu: {
      nir_alu_instr *src_alu = nir_instr_as_alu(src_instr);

      /* ALU operations with output_size == 0 should be scalarized. We
       * will also see a bunch of vecN operations from scalarizing ALU
       * operations and, since they can easily be copy-propagated, they
       * are ok too.
       */
      return nir_op_infos[src_alu->op].output_size == 0 ||
             src_alu->op == nir_op_vec2 ||
             src_alu->op == nir_op_vec3 ||
             src_alu->op == nir_op_vec4;
   }

   case nir_instr_type_load_const:
      /* These are trivially scalarizable */
      return true;

   case nir_instr_type_ssa_undef:
      return true;

   case nir_instr_type_intrinsic: {
      nir_intrinsic_instr *src_intrin = nir_instr_as_intrinsic(src_instr);

      switch (src_intrin->intrinsic) {
      case nir_intrinsic_load_deref: {
         /* Don't scalarize if we see a load of a local variable because it
          * might turn into one of the things we can't scalarize.
          */
         nir_deref_instr *deref = nir_src_as_deref(src_intrin->src[0]);
         return !nir_deref_mode_may_be(deref, (nir_var_function_temp |
                                               nir_var_shader_temp));
      }

      case nir_intrinsic_interp_deref_at_centroid:
      case nir_intrinsic_interp_deref_at_sample:
      case nir_intrinsic_interp_deref_at_offset:
      case nir_intrinsic_load_uniform:
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_global:
      case nir_intrinsic_load_global_constant:
      case nir_intrinsic_load_input:
         return true;
      default:
         break;
      }

      return false;
   }

   default:
      /* We can't scalarize this type of instruction */
      return false;
   }
}

/* Returns true if the resource binding chased from src exists and every
 * index in the binding chain is dynamically uniform.
 */
static bool
is_binding_dynamically_uniform(nir_src src)
{
   nir_binding binding = nir_chase_binding(src);
   if (!binding.success)
      return false;

   for (unsigned i = 0; i < binding.num_indices; i++) {
      if (!nir_src_is_dynamically_uniform(binding.indices[i]))
         return false;
   }

   return true;
}

/* Sets intrin's pass_flags to either GCM_INSTR_PINNED (immovable) or 0
 * (free to move).  Non-reorderable intrinsics are always pinned; resource
 * loads are additionally pinned when their binding index is not dynamically
 * uniform, since moving them across non-uniform control flow would be
 * unsafe.
 */
static void
pin_intrinsic(nir_intrinsic_instr *intrin)
{
   nir_instr *instr = &intrin->instr;

   if (!nir_intrinsic_can_reorder(intrin)) {
      instr->pass_flags = GCM_INSTR_PINNED;
      return;
   }

   instr->pass_flags = 0;

   /* If the intrinsic requires a uniform source, we can't safely move it across non-uniform
    * control flow if it's not uniform at the point it's defined.
    * Stores and atomics can never be re-ordered, so we don't have to consider them here.
    */
   bool non_uniform = nir_intrinsic_has_access(intrin) &&
                      (nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM);
   if (!non_uniform &&
       (intrin->intrinsic == nir_intrinsic_load_ubo ||
        intrin->intrinsic == nir_intrinsic_load_ssbo ||
        intrin->intrinsic == nir_intrinsic_get_ubo_size ||
        intrin->intrinsic == nir_intrinsic_get_ssbo_size ||
        nir_intrinsic_has_image_dim(intrin) ||
        ((intrin->intrinsic == nir_intrinsic_load_deref ||
          intrin->intrinsic == nir_intrinsic_deref_buffer_array_length) &&
         nir_deref_mode_may_be(nir_src_as_deref(intrin->src[0]),
                               nir_var_mem_ubo | nir_var_mem_ssbo)))) {
      if (!is_binding_dynamically_uniform(intrin->src[0]))
         instr->pass_flags = GCM_INSTR_PINNED;
   } else if (intrin->intrinsic == nir_intrinsic_load_push_constant) {
      if (!nir_src_is_dynamically_uniform(intrin->src[0]))
         instr->pass_flags = GCM_INSTR_PINNED;
   } else if (intrin->intrinsic == nir_intrinsic_load_deref &&
              nir_deref_mode_is(nir_src_as_deref(intrin->src[0]),
                                nir_var_mem_push_const)) {
      /* Walk the deref chain toward the variable; any non-uniform array
       * index along the way makes the load immovable.
       */
      nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
      while (deref->deref_type != nir_deref_type_var) {
         if ((deref->deref_type == nir_deref_type_array ||
              deref->deref_type == nir_deref_type_ptr_as_array) &&
             !nir_src_is_dynamically_uniform(deref->arr.index)) {
            instr->pass_flags = GCM_INSTR_PINNED;
            return;
         }
         deref = nir_deref_instr_parent(deref);
         if (!deref) {
            /* Broken/unchased chain: be conservative and pin. */
            instr->pass_flags = GCM_INSTR_PINNED;
            return;
         }
      }
   }
}

/* Walks the instruction list and marks immovable instructions as pinned or
 * placed.
 *
 * This function also serves to initialize the instr->pass_flags field.
 * After this is completed, all instructions' pass_flags fields will be set
 * to either GCM_INSTR_PINNED, GCM_INSTR_PLACED or 0.
 */
static void
gcm_pin_instructions(nir_function_impl *impl, struct gcm_state *state)
{
   state->num_instrs = 0;

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         /* Index the instructions for use in gcm_state::instrs */
         instr->index = state->num_instrs++;

         switch (instr->type) {
         case nir_instr_type_alu:
            switch (nir_instr_as_alu(instr)->op) {
            case nir_op_fddx:
            case nir_op_fddy:
            case nir_op_fddx_fine:
            case nir_op_fddy_fine:
            case nir_op_fddx_coarse:
            case nir_op_fddy_coarse:
               /* These can only go in uniform control flow */
               instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;
               break;

            case nir_op_mov:
               /* Pin movs whose source couldn't be scalarized; moving them
                * could block later scalarization/copy-propagation.
                */
               if (!is_src_scalarizable(&(nir_instr_as_alu(instr)->src[0].src))) {
                  instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               }
               FALLTHROUGH;

            default:
               instr->pass_flags = 0;
               break;
            }
            break;

         case nir_instr_type_tex: {
            nir_tex_instr *tex = nir_instr_as_tex(instr);
            /* Implicit-derivative texture ops need uniform control flow,
             * like the fddx/fddy ALU ops above.
             */
            if (nir_tex_instr_has_implicit_derivative(tex))
               instr->pass_flags = GCM_INSTR_SCHEDULE_EARLIER_ONLY;

            /* Pin texture ops whose texture/sampler binding is not
             * dynamically uniform (unless explicitly marked non-uniform).
             */
            for (unsigned i = 0; i < tex->num_srcs; i++) {
               nir_tex_src *src = &tex->src[i];
               switch (src->src_type) {
               case nir_tex_src_texture_deref:
                  if (!tex->texture_non_uniform && !is_binding_dynamically_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_sampler_deref:
                  if (!tex->sampler_non_uniform && !is_binding_dynamically_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_texture_offset:
               case nir_tex_src_texture_handle:
                  if (!tex->texture_non_uniform && !nir_src_is_dynamically_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               case nir_tex_src_sampler_offset:
               case nir_tex_src_sampler_handle:
                  if (!tex->sampler_non_uniform && !nir_src_is_dynamically_uniform(src->src))
                     instr->pass_flags = GCM_INSTR_PINNED;
                  break;
               default:
                  break;
               }
            }
            break;
         }

         case nir_instr_type_deref:
         case nir_instr_type_load_const:
            instr->pass_flags = 0;
            break;

         case nir_instr_type_intrinsic:
            pin_intrinsic(nir_instr_as_intrinsic(instr));
            break;

         case nir_instr_type_jump:
         case nir_instr_type_ssa_undef:
         case nir_instr_type_phi:
            instr->pass_flags = GCM_INSTR_PLACED;
            break;

         default:
            unreachable("Invalid instruction type in GCM");
         }

         if (!(instr->pass_flags & GCM_INSTR_PLACED)) {
            /* If this is an unplaced instruction, go ahead and pull it out of
             * the program and put it on the instrs list. This has a couple
             * of benefits. First, it makes the scheduling algorithm more
             * efficient because we can avoid walking over basic blocks.
             * Second, it keeps us from causing linked list confusion when
             * we're trying to put everything in its proper place at the end
             * of the pass.
             *
             * Note that we don't use nir_instr_remove here because that also
             * cleans up uses and defs and we want to keep that information.
             */
            exec_node_remove(&instr->node);
            exec_list_push_tail(&state->instrs, &instr->node);
         }
      }
   }
}

static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state);

/** Update an instruction's schedule for the given source
 *
 * This function is called iteratively as we walk the sources of an
 * instruction. It ensures that the given source instruction has been
 * scheduled and then updates this instruction's block if the source
 * instruction is lower down the tree.
416 */ 417static bool 418gcm_schedule_early_src(nir_src *src, void *void_state) 419{ 420 struct gcm_state *state = void_state; 421 nir_instr *instr = state->instr; 422 423 assert(src->is_ssa); 424 425 gcm_schedule_early_instr(src->ssa->parent_instr, void_state); 426 427 /* While the index isn't a proper dominance depth, it does have the 428 * property that if A dominates B then A->index <= B->index. Since we 429 * know that this instruction must have been dominated by all of its 430 * sources at some point (even if it's gone through value-numbering), 431 * all of the sources must lie on the same branch of the dominance tree. 432 * Therefore, we can just go ahead and just compare indices. 433 */ 434 struct gcm_instr_info *src_info = 435 &state->instr_infos[src->ssa->parent_instr->index]; 436 struct gcm_instr_info *info = &state->instr_infos[instr->index]; 437 if (info->early_block->index < src_info->early_block->index) 438 info->early_block = src_info->early_block; 439 440 /* We need to restore the state instruction because it may have been 441 * changed through the gcm_schedule_early_instr call above. Since we 442 * may still be iterating through sources and future calls to 443 * gcm_schedule_early_src for the same instruction will still need it. 444 */ 445 state->instr = instr; 446 447 return true; 448} 449 450/** Schedules an instruction early 451 * 452 * This function performs a recursive depth-first search starting at the 453 * given instruction and proceeding through the sources to schedule 454 * instructions as early as they can possibly go in the dominance tree. 455 * The instructions are "scheduled" by updating the early_block field of 456 * the corresponding gcm_instr_state entry. 
 */
static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_EARLY)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_EARLY;

   /* Pinned/placed instructions always get scheduled in their original block so
    * we don't need to do anything. Also, bailing here keeps us from ever
    * following the sources of phi nodes which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PINNED ||
       instr->pass_flags & GCM_INSTR_PLACED) {
      state->instr_infos[instr->index].early_block = instr->block;
      return;
   }

   /* Start with the instruction at the top. As we iterate over the
    * sources, it will get moved down as needed.
    */
   state->instr_infos[instr->index].early_block = nir_start_block(state->impl);
   state->instr = instr;

   nir_foreach_src(instr, gcm_schedule_early_src, state);
}

/* Heuristic: returns true if it is acceptable to place instr in block,
 * where block is at a shallower loop depth than instr's original block.
 */
static bool
set_block_for_loop_instr(struct gcm_state *state, nir_instr *instr,
                         nir_block *block)
{
   /* If the instruction wasn't in a loop to begin with we don't want to push
    * it down into one.
    */
   nir_loop *loop = state->blocks[instr->block->index].loop;
   if (loop == NULL)
      return true;

   /* Hoisting within the region the original block dominates is always ok. */
   if (nir_block_dominates(instr->block, block))
      return true;

   /* If the loop only executes a single time, i.e. it's wrapped in a:
    *    do { ... break; } while (true)
    * don't move the instruction as it will not help anything.
    */
   if (loop->info->limiting_terminator == NULL && !loop->info->complex_loop &&
       nir_block_ends_in_break(nir_loop_last_block(loop)))
      return false;

   /* Being too aggressive with how we pull instructions out of loops can
    * result in extra register pressure and spilling. For example it's fairly
    * common for loops in compute shaders to calculate SSBO offsets using
    * the workgroup id, subgroup id and subgroup invocation, pulling all
    * these calculations outside the loop causes register pressure.
    *
    * To work around these issues for now we only allow constant and texture
    * instructions to be moved outside their original loops, or instructions
    * where the total loop instruction count is less than
    * MAX_LOOP_INSTRUCTIONS.
    *
    * TODO: figure out some more heuristics to allow more to be moved out of
    * loops.
    */
   if (state->blocks[instr->block->index].loop_instr_count < MAX_LOOP_INSTRUCTIONS)
      return true;

   if (instr->type == nir_instr_type_load_const ||
       instr->type == nir_instr_type_tex)
      return true;

   return false;
}

/* Walks up the dominance tree from late_block to early_block and picks the
 * lowest block with the smallest loop depth that the heuristics allow.
 */
static nir_block *
gcm_choose_block_for_instr(nir_instr *instr, nir_block *early_block,
                           nir_block *late_block, struct gcm_state *state)
{
   assert(nir_block_dominates(early_block, late_block));

   nir_block *best = late_block;
   for (nir_block *block = late_block; block != NULL; block = block->imm_dom) {
      /* Prefer a shallower loop depth if the loop heuristics permit it; the
       * instruction's original block is always an acceptable choice.
       */
      if (state->blocks[block->index].loop_depth <
          state->blocks[best->index].loop_depth &&
          set_block_for_loop_instr(state, instr, block))
         best = block;
      else if (block == instr->block)
         best = block;

      if (block == early_block)
         break;
   }

   return best;
}

static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state);

/** Schedules the instruction associated with the given SSA def late
 *
 * This function works by first walking all of the uses of the given SSA
 * definition, ensuring that they are scheduled, and then computing the LCA
 * (least common ancestor) of its uses. It then schedules this instruction
 * as close to the LCA as possible while trying to stay out of loops.
 */
static bool
gcm_schedule_late_def(nir_ssa_def *def, void *void_state)
{
   struct gcm_state *state = void_state;

   nir_block *lca = NULL;

   nir_foreach_use(use_src, def) {
      nir_instr *use_instr = use_src->parent_instr;

      gcm_schedule_late_instr(use_instr, state);

      /* Phi instructions are a bit special. SSA definitions don't have to
       * dominate the sources of the phi nodes that use them; instead, they
       * have to dominate the predecessor block corresponding to the phi
       * source. We handle this by looking through the sources, finding
       * any that are using this SSA def, and using those blocks instead
       * of the one the phi lives in.
       */
      if (use_instr->type == nir_instr_type_phi) {
         nir_phi_instr *phi = nir_instr_as_phi(use_instr);

         nir_foreach_phi_src(phi_src, phi) {
            if (phi_src->src.ssa == def)
               lca = nir_dominance_lca(lca, phi_src->pred);
         }
      } else {
         lca = nir_dominance_lca(lca, use_instr->block);
      }
   }

   nir_foreach_if_use(use_src, def) {
      nir_if *if_stmt = use_src->parent_if;

      /* For if statements, we consider the block to be the one immediately
       * preceding the if CF node.
       */
      nir_block *pred_block =
         nir_cf_node_as_block(nir_cf_node_prev(&if_stmt->cf_node));

      lca = nir_dominance_lca(lca, pred_block);
   }

   nir_block *early_block =
      state->instr_infos[def->parent_instr->index].early_block;

   /* Some instructions may never be used. Flag them and the instruction
    * placement code will get rid of them for us.
    */
   if (lca == NULL) {
      def->parent_instr->block = NULL;
      return true;
   }

   /* SCHEDULE_EARLIER_ONLY instructions (derivatives etc.) may move up the
    * dominance tree but never down, so clamp the LCA back to the original
    * block when it would move the instruction later.
    */
   if (def->parent_instr->pass_flags & GCM_INSTR_SCHEDULE_EARLIER_ONLY &&
       lca != def->parent_instr->block &&
       nir_block_dominates(def->parent_instr->block, lca)) {
      lca = def->parent_instr->block;
   }

   /* We now have the LCA of all of the uses. If our invariants hold,
    * this is dominated by the block that we chose when scheduling early.
    * We now walk up the dominance tree and pick the lowest block that is
    * as far outside loops as we can get.
    */
   nir_block *best_block =
      gcm_choose_block_for_instr(def->parent_instr, early_block, lca, state);

   if (def->parent_instr->block != best_block)
      state->progress = true;

   def->parent_instr->block = best_block;

   return true;
}

/** Schedules an instruction late
 *
 * This function performs a depth-first search starting at the given
 * instruction and proceeding through its uses to schedule instructions as
 * late as they can reasonably go in the dominance tree. The instructions
 * are "scheduled" by updating their instr->block field.
 *
 * The name of this function is actually a bit of a misnomer as it doesn't
 * schedule them "as late as possible" as the paper implies. Instead, it
 * first finds the latest possible place it can schedule the instruction and
 * then possibly schedules it earlier than that. The actual location is as
 * far down the tree as we can go while trying to stay out of loops.
 */
static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_LATE)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_LATE;

   /* Pinned/placed instructions are already scheduled so we don't need to do
    * anything. Also, bailing here keeps us from ever following phi nodes
    * which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PLACED ||
       instr->pass_flags & GCM_INSTR_PINNED)
      return;

   nir_foreach_ssa_def(instr, gcm_schedule_late_def, state);
}

/* Rewrites all remaining uses of def (an instruction being deleted as dead)
 * to a fresh ssa_undef inserted at the top of the impl.
 */
static bool
gcm_replace_def_with_undef(nir_ssa_def *def, void *void_state)
{
   struct gcm_state *state = void_state;

   if (nir_ssa_def_is_unused(def))
      return true;

   nir_ssa_undef_instr *undef =
      nir_ssa_undef_instr_create(state->impl->function->shader,
                                 def->num_components, def->bit_size);
   nir_instr_insert(nir_before_cf_list(&state->impl->body), &undef->instr);
   nir_ssa_def_rewrite_uses(def, &undef->def);

   return true;
}

/** Places an instruction back into the program
 *
 * The earlier passes of GCM simply choose blocks for each instruction and
 * otherwise leave them alone. This pass actually places the instructions
 * into their chosen blocks.
 *
 * To do so, we simply insert instructions in the reverse order they were
 * extracted. This will simply place instructions that were scheduled earlier
 * onto the end of their new block and instructions that were scheduled later to
 * the start of their new block.
 */
static void
gcm_place_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_PLACED)
      return;

   instr->pass_flags |= GCM_INSTR_PLACED;

   /* block == NULL was set by gcm_schedule_late_def for unused
    * instructions: replace their defs with undefs and delete them.
    */
   if (instr->block == NULL) {
      nir_foreach_ssa_def(instr, gcm_replace_def_with_undef, state);
      nir_instr_remove(instr);
      return;
   }

   struct gcm_block_info *block_info = &state->blocks[instr->block->index];
   exec_node_remove(&instr->node);

   if (block_info->last_instr) {
      /* Insert before the instruction we placed here previously; since we
       * place in reverse extraction order this preserves program order.
       */
      exec_node_insert_node_before(&block_info->last_instr->node,
                                   &instr->node);
   } else {
      /* Schedule it at the end of the block */
      nir_instr *jump_instr = nir_block_last_instr(instr->block);
      if (jump_instr && jump_instr->type == nir_instr_type_jump) {
         /* A jump must stay the last instruction in its block. */
         exec_node_insert_node_before(&jump_instr->node, &instr->node);
      } else {
         exec_list_push_tail(&instr->block->instr_list, &instr->node);
      }
   }

   block_info->last_instr = instr;
}

/* Runs GCM (optionally preceded by global value numbering) on a single
 * function implementation.  Returns true if any instruction moved or was
 * value-numbered away.
 */
static bool
opt_gcm_impl(nir_shader *shader, nir_function_impl *impl, bool value_number)
{
   nir_metadata_require(impl, nir_metadata_block_index |
                              nir_metadata_dominance);
   nir_metadata_require(impl, nir_metadata_loop_analysis,
                        shader->options->force_indirect_unrolling);

   /* A previous pass may have left pass_flags dirty, so clear it all out. */
   nir_foreach_block(block, impl)
      nir_foreach_instr(instr, block)
         instr->pass_flags = 0;

   struct gcm_state state;

   state.impl = impl;
   state.instr = NULL;
   state.progress = false;
   exec_list_make_empty(&state.instrs);
   state.blocks = rzalloc_array(NULL, struct gcm_block_info, impl->num_blocks);

   gcm_build_block_info(&impl->body, &state, NULL, 0, ~0u);

   gcm_pin_instructions(impl, &state);

   state.instr_infos =
      rzalloc_array(NULL, struct gcm_instr_info, state.num_instrs);

   if (value_number) {
      /* GVN pass: merge equivalent non-pinned instructions before motion. */
      struct set *gvn_set = nir_instr_set_create(NULL);
      foreach_list_typed_safe(nir_instr, instr, node, &state.instrs) {
         if (instr->pass_flags & GCM_INSTR_PINNED)
            continue;

         if (nir_instr_set_add_or_rewrite(gvn_set, instr, NULL))
            state.progress = true;
      }
      nir_instr_set_destroy(gvn_set);
   }

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_early_instr(instr, &state);

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_late_instr(instr, &state);

   /* Place from the tail so instructions go back in reverse extraction
    * order; see the comment on gcm_place_instr.
    */
   while (!exec_list_is_empty(&state.instrs)) {
      nir_instr *instr = exec_node_data(nir_instr,
                                        state.instrs.tail_sentinel.prev, node);
      gcm_place_instr(instr, &state);
   }

   ralloc_free(state.blocks);
   ralloc_free(state.instr_infos);

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance |
                               nir_metadata_loop_analysis);

   return state.progress;
}

/* Entry point: runs GCM (and optionally GVN) over every function in the
 * shader.  Returns true if anything changed.
 */
bool
nir_opt_gcm(nir_shader *shader, bool value_number)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= opt_gcm_impl(shader, function->impl, value_number);
   }

   return progress;
}