1/* 2 * Copyright © 2012 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs_copy_propagation.cpp 25 * 26 * Support for global copy propagation in two passes: A local pass that does 27 * intra-block copy (and constant) propagation, and a global pass that uses 28 * dataflow analysis on the copies available at the end of each block to re-do 29 * local copy propagation with more copies available. 30 * 31 * See Muchnick's Advanced Compiler Design and Implementation, section 32 * 12.5 (p356). 33 */ 34 35#define ACP_HASH_SIZE 16 36 37#include "util/bitset.h" 38#include "brw_fs.h" 39#include "brw_fs_live_variables.h" 40#include "brw_cfg.h" 41#include "brw_eu.h" 42 43using namespace brw; 44 45namespace { /* avoid conflict with opt_copy_propagation_elements */ 46struct acp_entry : public exec_node { 47 fs_reg dst; 48 fs_reg src; 49 uint8_t size_written; 50 uint8_t size_read; 51 enum opcode opcode; 52 bool saturate; 53}; 54 55struct block_data { 56 /** 57 * Which entries in the fs_copy_prop_dataflow acp table are live at the 58 * start of this block. This is the useful output of the analysis, since 59 * it lets us plug those into the local copy propagation on the second 60 * pass. 61 */ 62 BITSET_WORD *livein; 63 64 /** 65 * Which entries in the fs_copy_prop_dataflow acp table are live at the end 66 * of this block. This is done in initial setup from the per-block acps 67 * returned by the first local copy prop pass. 68 */ 69 BITSET_WORD *liveout; 70 71 /** 72 * Which entries in the fs_copy_prop_dataflow acp table are generated by 73 * instructions in this block which reach the end of the block without 74 * being killed. 75 */ 76 BITSET_WORD *copy; 77 78 /** 79 * Which entries in the fs_copy_prop_dataflow acp table are killed over the 80 * course of this block. 81 */ 82 BITSET_WORD *kill; 83 84 /** 85 * Which entries in the fs_copy_prop_dataflow acp table are guaranteed to 86 * have a fully uninitialized destination at the end of this block. 87 */ 88 BITSET_WORD *undef; 89}; 90 91class fs_copy_prop_dataflow 92{ 93public: 94 fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, 95 const fs_live_variables *live, 96 exec_list *out_acp[ACP_HASH_SIZE]); 97 98 void setup_initial_values(); 99 void run(); 100 101 void dump_block_data() const UNUSED; 102 103 void *mem_ctx; 104 cfg_t *cfg; 105 const fs_live_variables *live; 106 107 acp_entry **acp; 108 int num_acp; 109 int bitset_words; 110 111 struct block_data *bd; 112}; 113} /* anonymous namespace */ 114 115fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, 116 const fs_live_variables *live, 117 exec_list *out_acp[ACP_HASH_SIZE]) 118 : mem_ctx(mem_ctx), cfg(cfg), live(live) 119{ 120 bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks); 121 122 num_acp = 0; 123 foreach_block (block, cfg) { 124 for (int i = 0; i < ACP_HASH_SIZE; i++) { 125 num_acp += out_acp[block->num][i].length(); 126 } 127 } 128 129 acp = rzalloc_array(mem_ctx, struct acp_entry *, num_acp); 130 131 bitset_words = BITSET_WORDS(num_acp); 132 133 int next_acp = 0; 134 foreach_block (block, cfg) { 135 bd[block->num].livein = rzalloc_array(bd, BITSET_WORD, bitset_words); 136 bd[block->num].liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); 137 bd[block->num].copy = rzalloc_array(bd, BITSET_WORD, bitset_words); 138 bd[block->num].kill = rzalloc_array(bd, BITSET_WORD, bitset_words); 139 bd[block->num].undef = rzalloc_array(bd, BITSET_WORD, bitset_words); 140 141 for (int i = 0; i < ACP_HASH_SIZE; i++) { 142 foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) { 143 acp[next_acp] = entry; 144 145 /* opt_copy_propagation_local populates out_acp with copies created 146 * in a block which are still live at the end of the block. This 147 * is exactly what we want in the COPY set. 148 */ 149 BITSET_SET(bd[block->num].copy, next_acp); 150 151 next_acp++; 152 } 153 } 154 } 155 156 assert(next_acp == num_acp); 157 158 setup_initial_values(); 159 run(); 160} 161 162/** 163 * Set up initial values for each of the data flow sets, prior to running 164 * the fixed-point algorithm. 165 */ 166void 167fs_copy_prop_dataflow::setup_initial_values() 168{ 169 /* Initialize the COPY and KILL sets. */ 170 foreach_block (block, cfg) { 171 foreach_inst_in_block(fs_inst, inst, block) { 172 if (inst->dst.file != VGRF) 173 continue; 174 175 /* Mark ACP entries which are killed by this instruction. */ 176 for (int i = 0; i < num_acp; i++) { 177 if (regions_overlap(inst->dst, inst->size_written, 178 acp[i]->dst, acp[i]->size_written) || 179 regions_overlap(inst->dst, inst->size_written, 180 acp[i]->src, acp[i]->size_read)) { 181 BITSET_SET(bd[block->num].kill, i); 182 } 183 } 184 } 185 } 186 187 /* Populate the initial values for the livein and liveout sets. For the 188 * block at the start of the program, livein = 0 and liveout = copy. 189 * For the others, set liveout and livein to ~0 (the universal set). 190 */ 191 foreach_block (block, cfg) { 192 if (block->parents.is_empty()) { 193 for (int i = 0; i < bitset_words; i++) { 194 bd[block->num].livein[i] = 0u; 195 bd[block->num].liveout[i] = bd[block->num].copy[i]; 196 } 197 } else { 198 for (int i = 0; i < bitset_words; i++) { 199 bd[block->num].liveout[i] = ~0u; 200 bd[block->num].livein[i] = ~0u; 201 } 202 } 203 } 204 205 /* Initialize the undef set. */ 206 foreach_block (block, cfg) { 207 for (int i = 0; i < num_acp; i++) { 208 BITSET_SET(bd[block->num].undef, i); 209 for (unsigned off = 0; off < acp[i]->size_written; off += REG_SIZE) { 210 if (BITSET_TEST(live->block_data[block->num].defout, 211 live->var_from_reg(byte_offset(acp[i]->dst, off)))) 212 BITSET_CLEAR(bd[block->num].undef, i); 213 } 214 } 215 } 216} 217 218/** 219 * Walk the set of instructions in the block, marking which entries in the acp 220 * are killed by the block. 221 */ 222void 223fs_copy_prop_dataflow::run() 224{ 225 bool progress; 226 227 do { 228 progress = false; 229 230 foreach_block (block, cfg) { 231 if (block->parents.is_empty()) 232 continue; 233 234 for (int i = 0; i < bitset_words; i++) { 235 const BITSET_WORD old_liveout = bd[block->num].liveout[i]; 236 BITSET_WORD livein_from_any_block = 0; 237 238 /* Update livein for this block. If a copy is live out of all 239 * parent blocks, it's live coming in to this block. 240 */ 241 bd[block->num].livein[i] = ~0u; 242 foreach_list_typed(bblock_link, parent_link, link, &block->parents) { 243 bblock_t *parent = parent_link->block; 244 /* Consider ACP entries with a known-undefined destination to 245 * be available from the parent. This is valid because we're 246 * free to set the undefined variable equal to the source of 247 * the ACP entry without breaking the application's 248 * expectations, since the variable is undefined. 249 */ 250 bd[block->num].livein[i] &= (bd[parent->num].liveout[i] | 251 bd[parent->num].undef[i]); 252 livein_from_any_block |= bd[parent->num].liveout[i]; 253 } 254 255 /* Limit to the set of ACP entries that can possibly be available 256 * at the start of the block, since propagating from a variable 257 * which is guaranteed to be undefined (rather than potentially 258 * undefined for some dynamic control-flow paths) doesn't seem 259 * particularly useful. 260 */ 261 bd[block->num].livein[i] &= livein_from_any_block; 262 263 /* Update liveout for this block. */ 264 bd[block->num].liveout[i] = 265 bd[block->num].copy[i] | (bd[block->num].livein[i] & 266 ~bd[block->num].kill[i]); 267 268 if (old_liveout != bd[block->num].liveout[i]) 269 progress = true; 270 } 271 } 272 } while (progress); 273} 274 275void 276fs_copy_prop_dataflow::dump_block_data() const 277{ 278 foreach_block (block, cfg) { 279 fprintf(stderr, "Block %d [%d, %d] (parents ", block->num, 280 block->start_ip, block->end_ip); 281 foreach_list_typed(bblock_link, link, link, &block->parents) { 282 bblock_t *parent = link->block; 283 fprintf(stderr, "%d ", parent->num); 284 } 285 fprintf(stderr, "):\n"); 286 fprintf(stderr, " livein = 0x"); 287 for (int i = 0; i < bitset_words; i++) 288 fprintf(stderr, "%08x", bd[block->num].livein[i]); 289 fprintf(stderr, ", liveout = 0x"); 290 for (int i = 0; i < bitset_words; i++) 291 fprintf(stderr, "%08x", bd[block->num].liveout[i]); 292 fprintf(stderr, ",\n copy = 0x"); 293 for (int i = 0; i < bitset_words; i++) 294 fprintf(stderr, "%08x", bd[block->num].copy[i]); 295 fprintf(stderr, ", kill = 0x"); 296 for (int i = 0; i < bitset_words; i++) 297 fprintf(stderr, "%08x", bd[block->num].kill[i]); 298 fprintf(stderr, "\n"); 299 } 300} 301 302static bool 303is_logic_op(enum opcode opcode) 304{ 305 return (opcode == BRW_OPCODE_AND || 306 opcode == BRW_OPCODE_OR || 307 opcode == BRW_OPCODE_XOR || 308 opcode == BRW_OPCODE_NOT); 309} 310 311static bool 312can_take_stride(fs_inst *inst, unsigned arg, unsigned stride, 313 const gen_device_info *devinfo) 314{ 315 if (stride > 4) 316 return false; 317 318 /* Bail if the channels of the source need to be aligned to the byte offset 319 * of the corresponding channel of the destination, and the provided stride 320 * would break this restriction. 321 */ 322 if (has_dst_aligned_region_restriction(devinfo, inst) && 323 !(type_sz(inst->src[arg].type) * stride == 324 type_sz(inst->dst.type) * inst->dst.stride || 325 stride == 0)) 326 return false; 327 328 /* 3-source instructions can only be Align16, which restricts what strides 329 * they can take. They can only take a stride of 1 (the usual case), or 0 330 * with a special "repctrl" bit. But the repctrl bit doesn't work for 331 * 64-bit datatypes, so if the source type is 64-bit then only a stride of 332 * 1 is allowed. From the Broadwell PRM, Volume 7 "3D Media GPGPU", page 333 * 944: 334 * 335 * This is applicable to 32b datatypes and 16b datatype. 64b datatypes 336 * cannot use the replicate control. 337 */ 338 if (inst->is_3src(devinfo)) { 339 if (type_sz(inst->src[arg].type) > 4) 340 return stride == 1; 341 else 342 return stride == 1 || stride == 0; 343 } 344 345 /* From the Broadwell PRM, Volume 2a "Command Reference - Instructions", 346 * page 391 ("Extended Math Function"): 347 * 348 * The following restrictions apply for align1 mode: Scalar source is 349 * supported. Source and destination horizontal stride must be the 350 * same. 351 * 352 * From the Haswell PRM Volume 2b "Command Reference - Instructions", page 353 * 134 ("Extended Math Function"): 354 * 355 * Scalar source is supported. Source and destination horizontal stride 356 * must be 1. 357 * 358 * and similar language exists for IVB and SNB. Pre-SNB, math instructions 359 * are sends, so the sources are moved to MRF's and there are no 360 * restrictions. 361 */ 362 if (inst->is_math()) { 363 if (devinfo->gen == 6 || devinfo->gen == 7) { 364 assert(inst->dst.stride == 1); 365 return stride == 1 || stride == 0; 366 } else if (devinfo->gen >= 8) { 367 return stride == inst->dst.stride || stride == 0; 368 } 369 } 370 371 return true; 372} 373 374static bool 375instruction_requires_packed_data(fs_inst *inst) 376{ 377 switch (inst->opcode) { 378 case FS_OPCODE_DDX_FINE: 379 case FS_OPCODE_DDX_COARSE: 380 case FS_OPCODE_DDY_FINE: 381 case FS_OPCODE_DDY_COARSE: 382 return true; 383 default: 384 return false; 385 } 386} 387 388bool 389fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) 390{ 391 if (inst->src[arg].file != VGRF) 392 return false; 393 394 if (entry->src.file == IMM) 395 return false; 396 assert(entry->src.file == VGRF || entry->src.file == UNIFORM || 397 entry->src.file == ATTR); 398 399 if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD && 400 inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) 401 return false; 402 403 assert(entry->dst.file == VGRF); 404 if (inst->src[arg].nr != entry->dst.nr) 405 return false; 406 407 /* Bail if inst is reading a range that isn't contained in the range 408 * that entry is writing. 409 */ 410 if (!region_contained_in(inst->src[arg], inst->size_read(arg), 411 entry->dst, entry->size_written)) 412 return false; 413 414 /* we can't generally copy-propagate UD negations because we 415 * can end up accessing the resulting values as signed integers 416 * instead. See also resolve_ud_negate() and comment in 417 * fs_generator::generate_code. 418 */ 419 if (entry->src.type == BRW_REGISTER_TYPE_UD && 420 entry->src.negate) 421 return false; 422 423 bool has_source_modifiers = entry->src.abs || entry->src.negate; 424 425 if ((has_source_modifiers || entry->src.file == UNIFORM || 426 !entry->src.is_contiguous()) && 427 !inst->can_do_source_mods(devinfo)) 428 return false; 429 430 if (has_source_modifiers && 431 inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE) 432 return false; 433 434 /* Some instructions implemented in the generator backend, such as 435 * derivatives, assume that their operands are packed so we can't 436 * generally propagate strided regions to them. 437 */ 438 if (instruction_requires_packed_data(inst) && entry->src.stride > 1) 439 return false; 440 441 /* Bail if the result of composing both strides would exceed the 442 * hardware limit. 443 */ 444 if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride, 445 devinfo)) 446 return false; 447 448 /* Bail if the instruction type is larger than the execution type of the 449 * copy, what implies that each channel is reading multiple channels of the 450 * destination of the copy, and simply replacing the sources would give a 451 * program with different semantics. 452 */ 453 if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type)) 454 return false; 455 456 /* Bail if the result of composing both strides cannot be expressed 457 * as another stride. This avoids, for example, trying to transform 458 * this: 459 * 460 * MOV (8) rX<1>UD rY<0;1,0>UD 461 * FOO (8) ... rX<8;8,1>UW 462 * 463 * into this: 464 * 465 * FOO (8) ... rY<0;1,0>UW 466 * 467 * Which would have different semantics. 468 */ 469 if (entry->src.stride != 1 && 470 (inst->src[arg].stride * 471 type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0) 472 return false; 473 474 /* Since semantics of source modifiers are type-dependent we need to 475 * ensure that the meaning of the instruction remains the same if we 476 * change the type. If the sizes of the types are different the new 477 * instruction will read a different amount of data than the original 478 * and the semantics will always be different. 479 */ 480 if (has_source_modifiers && 481 entry->dst.type != inst->src[arg].type && 482 (!inst->can_change_types() || 483 type_sz(entry->dst.type) != type_sz(inst->src[arg].type))) 484 return false; 485 486 if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) && 487 is_logic_op(inst->opcode)) { 488 return false; 489 } 490 491 if (entry->saturate) { 492 switch(inst->opcode) { 493 case BRW_OPCODE_SEL: 494 if ((inst->conditional_mod != BRW_CONDITIONAL_GE && 495 inst->conditional_mod != BRW_CONDITIONAL_L) || 496 inst->src[1].file != IMM || 497 inst->src[1].f < 0.0 || 498 inst->src[1].f > 1.0) { 499 return false; 500 } 501 break; 502 default: 503 return false; 504 } 505 } 506 507 inst->src[arg].file = entry->src.file; 508 inst->src[arg].nr = entry->src.nr; 509 inst->src[arg].stride *= entry->src.stride; 510 inst->saturate = inst->saturate || entry->saturate; 511 512 /* Compute the offset of inst->src[arg] relative to entry->dst */ 513 const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset; 514 515 /* Compute the first component of the copy that the instruction is 516 * reading, and the base byte offset within that component. 517 */ 518 assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1); 519 const unsigned component = rel_offset / type_sz(entry->dst.type); 520 const unsigned suboffset = rel_offset % type_sz(entry->dst.type); 521 522 /* Calculate the byte offset at the origin of the copy of the given 523 * component and suboffset. 524 */ 525 inst->src[arg].offset = suboffset + 526 component * entry->src.stride * type_sz(entry->src.type) + 527 entry->src.offset; 528 529 if (has_source_modifiers) { 530 if (entry->dst.type != inst->src[arg].type) { 531 /* We are propagating source modifiers from a MOV with a different 532 * type. If we got here, then we can just change the source and 533 * destination types of the instruction and keep going. 534 */ 535 assert(inst->can_change_types()); 536 for (int i = 0; i < inst->sources; i++) { 537 inst->src[i].type = entry->dst.type; 538 } 539 inst->dst.type = entry->dst.type; 540 } 541 542 if (!inst->src[arg].abs) { 543 inst->src[arg].abs = entry->src.abs; 544 inst->src[arg].negate ^= entry->src.negate; 545 } 546 } 547 548 return true; 549} 550 551 552bool 553fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) 554{ 555 bool progress = false; 556 557 if (entry->src.file != IMM) 558 return false; 559 if (type_sz(entry->src.type) > 4) 560 return false; 561 if (entry->saturate) 562 return false; 563 564 for (int i = inst->sources - 1; i >= 0; i--) { 565 if (inst->src[i].file != VGRF) 566 continue; 567 568 assert(entry->dst.file == VGRF); 569 if (inst->src[i].nr != entry->dst.nr) 570 continue; 571 572 /* Bail if inst is reading a range that isn't contained in the range 573 * that entry is writing. 574 */ 575 if (!region_contained_in(inst->src[i], inst->size_read(i), 576 entry->dst, entry->size_written)) 577 continue; 578 579 /* If the type sizes don't match each channel of the instruction is 580 * either extracting a portion of the constant (which could be handled 581 * with some effort but the code below doesn't) or reading multiple 582 * channels of the source at once. 583 */ 584 if (type_sz(inst->src[i].type) != type_sz(entry->dst.type)) 585 continue; 586 587 fs_reg val = entry->src; 588 val.type = inst->src[i].type; 589 590 if (inst->src[i].abs) { 591 if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || 592 !brw_abs_immediate(val.type, &val.as_brw_reg())) { 593 continue; 594 } 595 } 596 597 if (inst->src[i].negate) { 598 if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || 599 !brw_negate_immediate(val.type, &val.as_brw_reg())) { 600 continue; 601 } 602 } 603 604 switch (inst->opcode) { 605 case BRW_OPCODE_MOV: 606 case SHADER_OPCODE_LOAD_PAYLOAD: 607 case FS_OPCODE_PACK: 608 inst->src[i] = val; 609 progress = true; 610 break; 611 612 case SHADER_OPCODE_INT_QUOTIENT: 613 case SHADER_OPCODE_INT_REMAINDER: 614 /* FINISHME: Promote non-float constants and remove this. */ 615 if (devinfo->gen < 8) 616 break; 617 /* fallthrough */ 618 case SHADER_OPCODE_POW: 619 /* Allow constant propagation into src1 (except on Gen 6 which 620 * doesn't support scalar source math), and let constant combining 621 * promote the constant on Gen < 8. 622 */ 623 if (devinfo->gen == 6) 624 break; 625 /* fallthrough */ 626 case BRW_OPCODE_BFI1: 627 case BRW_OPCODE_ASR: 628 case BRW_OPCODE_SHL: 629 case BRW_OPCODE_SHR: 630 case BRW_OPCODE_SUBB: 631 if (i == 1) { 632 inst->src[i] = val; 633 progress = true; 634 } 635 break; 636 637 case BRW_OPCODE_MACH: 638 case BRW_OPCODE_MUL: 639 case SHADER_OPCODE_MULH: 640 case BRW_OPCODE_ADD: 641 case BRW_OPCODE_OR: 642 case BRW_OPCODE_AND: 643 case BRW_OPCODE_XOR: 644 case BRW_OPCODE_ADDC: 645 if (i == 1) { 646 inst->src[i] = val; 647 progress = true; 648 } else if (i == 0 && inst->src[1].file != IMM) { 649 /* Fit this constant in by commuting the operands. 650 * Exception: we can't do this for 32-bit integer MUL/MACH 651 * because it's asymmetric. 652 * 653 * The BSpec says for Broadwell that 654 * 655 * "When multiplying DW x DW, the dst cannot be accumulator." 656 * 657 * Integer MUL with a non-accumulator destination will be lowered 658 * by lower_integer_multiplication(), so don't restrict it. 659 */ 660 if (((inst->opcode == BRW_OPCODE_MUL && 661 inst->dst.is_accumulator()) || 662 inst->opcode == BRW_OPCODE_MACH) && 663 (inst->src[1].type == BRW_REGISTER_TYPE_D || 664 inst->src[1].type == BRW_REGISTER_TYPE_UD)) 665 break; 666 inst->src[0] = inst->src[1]; 667 inst->src[1] = val; 668 progress = true; 669 } 670 break; 671 672 case BRW_OPCODE_CMP: 673 case BRW_OPCODE_IF: 674 if (i == 1) { 675 inst->src[i] = val; 676 progress = true; 677 } else if (i == 0 && inst->src[1].file != IMM) { 678 enum brw_conditional_mod new_cmod; 679 680 new_cmod = brw_swap_cmod(inst->conditional_mod); 681 if (new_cmod != BRW_CONDITIONAL_NONE) { 682 /* Fit this constant in by swapping the operands and 683 * flipping the test 684 */ 685 inst->src[0] = inst->src[1]; 686 inst->src[1] = val; 687 inst->conditional_mod = new_cmod; 688 progress = true; 689 } 690 } 691 break; 692 693 case BRW_OPCODE_SEL: 694 if (i == 1) { 695 inst->src[i] = val; 696 progress = true; 697 } else if (i == 0 && inst->src[1].file != IMM) { 698 inst->src[0] = inst->src[1]; 699 inst->src[1] = val; 700 701 /* If this was predicated, flipping operands means 702 * we also need to flip the predicate. 703 */ 704 if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { 705 inst->predicate_inverse = 706 !inst->predicate_inverse; 707 } 708 progress = true; 709 } 710 break; 711 712 case FS_OPCODE_FB_WRITE_LOGICAL: 713 /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are 714 * bit-cast using a strided region so they cannot be immediates. 715 */ 716 if (i != FB_WRITE_LOGICAL_SRC_SRC_STENCIL && 717 i != FB_WRITE_LOGICAL_SRC_OMASK) { 718 inst->src[i] = val; 719 progress = true; 720 } 721 break; 722 723 case SHADER_OPCODE_TEX_LOGICAL: 724 case SHADER_OPCODE_TXD_LOGICAL: 725 case SHADER_OPCODE_TXF_LOGICAL: 726 case SHADER_OPCODE_TXL_LOGICAL: 727 case SHADER_OPCODE_TXS_LOGICAL: 728 case FS_OPCODE_TXB_LOGICAL: 729 case SHADER_OPCODE_TXF_CMS_LOGICAL: 730 case SHADER_OPCODE_TXF_CMS_W_LOGICAL: 731 case SHADER_OPCODE_TXF_UMS_LOGICAL: 732 case SHADER_OPCODE_TXF_MCS_LOGICAL: 733 case SHADER_OPCODE_LOD_LOGICAL: 734 case SHADER_OPCODE_TG4_LOGICAL: 735 case SHADER_OPCODE_TG4_OFFSET_LOGICAL: 736 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: 737 case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: 738 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: 739 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: 740 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: 741 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: 742 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: 743 case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: 744 case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: 745 inst->src[i] = val; 746 progress = true; 747 break; 748 749 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 750 case SHADER_OPCODE_BROADCAST: 751 inst->src[i] = val; 752 progress = true; 753 break; 754 755 case BRW_OPCODE_MAD: 756 case BRW_OPCODE_LRP: 757 inst->src[i] = val; 758 progress = true; 759 break; 760 761 default: 762 break; 763 } 764 } 765 766 return progress; 767} 768 769static bool 770can_propagate_from(fs_inst *inst) 771{ 772 return (inst->opcode == BRW_OPCODE_MOV && 773 inst->dst.file == VGRF && 774 ((inst->src[0].file == VGRF && 775 !regions_overlap(inst->dst, inst->size_written, 776 inst->src[0], inst->size_read(0))) || 777 inst->src[0].file == ATTR || 778 inst->src[0].file == UNIFORM || 779 inst->src[0].file == IMM) && 780 inst->src[0].type == inst->dst.type && 781 !inst->is_partial_write()); 782} 783 784/* Walks a basic block and does copy propagation on it using the acp 785 * list. 786 */ 787bool 788fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block, 789 exec_list *acp) 790{ 791 bool progress = false; 792 793 foreach_inst_in_block(fs_inst, inst, block) { 794 /* Try propagating into this instruction. */ 795 for (int i = 0; i < inst->sources; i++) { 796 if (inst->src[i].file != VGRF) 797 continue; 798 799 foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) { 800 if (try_constant_propagate(inst, entry)) 801 progress = true; 802 else if (try_copy_propagate(inst, i, entry)) 803 progress = true; 804 } 805 } 806 807 /* kill the destination from the ACP */ 808 if (inst->dst.file == VGRF) { 809 foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) { 810 if (regions_overlap(entry->dst, entry->size_written, 811 inst->dst, inst->size_written)) 812 entry->remove(); 813 } 814 815 /* Oops, we only have the chaining hash based on the destination, not 816 * the source, so walk across the entire table. 817 */ 818 for (int i = 0; i < ACP_HASH_SIZE; i++) { 819 foreach_in_list_safe(acp_entry, entry, &acp[i]) { 820 /* Make sure we kill the entry if this instruction overwrites 821 * _any_ of the registers that it reads 822 */ 823 if (regions_overlap(entry->src, entry->size_read, 824 inst->dst, inst->size_written)) 825 entry->remove(); 826 } 827 } 828 } 829 830 /* If this instruction's source could potentially be folded into the 831 * operand of another instruction, add it to the ACP. 832 */ 833 if (can_propagate_from(inst)) { 834 acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); 835 entry->dst = inst->dst; 836 entry->src = inst->src[0]; 837 entry->size_written = inst->size_written; 838 entry->size_read = inst->size_read(0); 839 entry->opcode = inst->opcode; 840 entry->saturate = inst->saturate; 841 acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); 842 } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD && 843 inst->dst.file == VGRF) { 844 int offset = 0; 845 for (int i = 0; i < inst->sources; i++) { 846 int effective_width = i < inst->header_size ? 8 : inst->exec_size; 847 assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0); 848 const unsigned size_written = effective_width * 849 type_sz(inst->src[i].type); 850 if (inst->src[i].file == VGRF) { 851 acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry); 852 entry->dst = byte_offset(inst->dst, offset); 853 entry->src = inst->src[i]; 854 entry->size_written = size_written; 855 entry->size_read = inst->size_read(i); 856 entry->opcode = inst->opcode; 857 if (!entry->dst.equals(inst->src[i])) { 858 acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); 859 } else { 860 ralloc_free(entry); 861 } 862 } 863 offset += size_written; 864 } 865 } 866 } 867 868 return progress; 869} 870 871bool 872fs_visitor::opt_copy_propagation() 873{ 874 bool progress = false; 875 void *copy_prop_ctx = ralloc_context(NULL); 876 exec_list *out_acp[cfg->num_blocks]; 877 878 for (int i = 0; i < cfg->num_blocks; i++) 879 out_acp[i] = new exec_list [ACP_HASH_SIZE]; 880 881 calculate_live_intervals(); 882 883 /* First, walk through each block doing local copy propagation and getting 884 * the set of copies available at the end of the block. 885 */ 886 foreach_block (block, cfg) { 887 progress = opt_copy_propagation_local(copy_prop_ctx, block, 888 out_acp[block->num]) || progress; 889 } 890 891 /* Do dataflow analysis for those available copies. */ 892 fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, live_intervals, out_acp); 893 894 /* Next, re-run local copy propagation, this time with the set of copies 895 * provided by the dataflow analysis available at the start of a block. 896 */ 897 foreach_block (block, cfg) { 898 exec_list in_acp[ACP_HASH_SIZE]; 899 900 for (int i = 0; i < dataflow.num_acp; i++) { 901 if (BITSET_TEST(dataflow.bd[block->num].livein, i)) { 902 struct acp_entry *entry = dataflow.acp[i]; 903 in_acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); 904 } 905 } 906 907 progress = opt_copy_propagation_local(copy_prop_ctx, block, in_acp) || 908 progress; 909 } 910 911 for (int i = 0; i < cfg->num_blocks; i++) 912 delete [] out_acp[i]; 913 ralloc_free(copy_prop_ctx); 914 915 if (progress) 916 invalidate_live_intervals(); 917 918 return progress; 919} 920