1/* 2 * Copyright © 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** 25 * @file brw_vec4_copy_propagation.cpp 26 * 27 * Implements tracking of values copied between registers, and 28 * optimizations based on that: copy propagation and constant 29 * propagation. 30 */ 31 32#include "brw_vec4.h" 33#include "brw_cfg.h" 34#include "brw_eu.h" 35 36namespace brw { 37 38struct copy_entry { 39 src_reg *value[4]; 40 int saturatemask; 41}; 42 43static bool 44is_direct_copy(vec4_instruction *inst) 45{ 46 return (inst->opcode == BRW_OPCODE_MOV && 47 !inst->predicate && 48 inst->dst.file == VGRF && 49 inst->dst.offset % REG_SIZE == 0 && 50 !inst->dst.reladdr && 51 !inst->src[0].reladdr && 52 (inst->dst.type == inst->src[0].type || 53 (inst->dst.type == BRW_REGISTER_TYPE_F && 54 inst->src[0].type == BRW_REGISTER_TYPE_VF))); 55} 56 57static bool 58is_dominated_by_previous_instruction(vec4_instruction *inst) 59{ 60 return (inst->opcode != BRW_OPCODE_DO && 61 inst->opcode != BRW_OPCODE_WHILE && 62 inst->opcode != BRW_OPCODE_ELSE && 63 inst->opcode != BRW_OPCODE_ENDIF); 64} 65 66static bool 67is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch) 68{ 69 const src_reg *src = values[ch]; 70 71 /* consider GRF only */ 72 assert(inst->dst.file == VGRF); 73 if (!src || src->file != VGRF) 74 return false; 75 76 return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) && 77 (inst->dst.offset != src->offset || 78 inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch))); 79} 80 81static bool 82is_logic_op(enum opcode opcode) 83{ 84 return (opcode == BRW_OPCODE_AND || 85 opcode == BRW_OPCODE_OR || 86 opcode == BRW_OPCODE_XOR || 87 opcode == BRW_OPCODE_NOT); 88} 89 90/** 91 * Get the origin of a copy as a single register if all components present in 92 * the given readmask originate from the same register and have compatible 93 * regions, otherwise return a BAD_FILE register. 94 */ 95static src_reg 96get_copy_value(const copy_entry &entry, unsigned readmask) 97{ 98 unsigned swz[4] = {}; 99 src_reg value; 100 101 for (unsigned i = 0; i < 4; i++) { 102 if (readmask & (1 << i)) { 103 if (entry.value[i]) { 104 src_reg src = *entry.value[i]; 105 106 if (src.file == IMM) { 107 swz[i] = i; 108 } else { 109 swz[i] = BRW_GET_SWZ(src.swizzle, i); 110 /* Overwrite the original swizzle so the src_reg::equals call 111 * below doesn't care about it, the correct swizzle will be 112 * calculated once the swizzles of all components are known. 113 */ 114 src.swizzle = BRW_SWIZZLE_XYZW; 115 } 116 117 if (value.file == BAD_FILE) { 118 value = src; 119 } else if (!value.equals(src)) { 120 return src_reg(); 121 } 122 } else { 123 return src_reg(); 124 } 125 } 126 } 127 128 return swizzle(value, 129 brw_compose_swizzle(brw_swizzle_for_mask(readmask), 130 BRW_SWIZZLE4(swz[0], swz[1], 131 swz[2], swz[3]))); 132} 133 134static bool 135try_constant_propagate(const struct gen_device_info *devinfo, 136 vec4_instruction *inst, 137 int arg, const copy_entry *entry) 138{ 139 /* For constant propagation, we only handle the same constant 140 * across all 4 channels. Some day, we should handle the 8-bit 141 * float vector format, which would let us constant propagate 142 * vectors better. 143 * We could be more aggressive here -- some channels might not get used 144 * based on the destination writemask. 145 */ 146 src_reg value = 147 get_copy_value(*entry, 148 brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, 149 WRITEMASK_XYZW)); 150 151 if (value.file != IMM) 152 return false; 153 154 /* 64-bit types can't be used except for one-source instructions, which 155 * higher levels should have constant folded away, so there's no point in 156 * propagating immediates here. 157 */ 158 if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8) 159 return false; 160 161 if (value.type == BRW_REGISTER_TYPE_VF) { 162 /* The result of bit-casting the component values of a vector float 163 * cannot in general be represented as an immediate. 164 */ 165 if (inst->src[arg].type != BRW_REGISTER_TYPE_F) 166 return false; 167 } else { 168 value.type = inst->src[arg].type; 169 } 170 171 if (inst->src[arg].abs) { 172 if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || 173 !brw_abs_immediate(value.type, &value.as_brw_reg())) { 174 return false; 175 } 176 } 177 178 if (inst->src[arg].negate) { 179 if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || 180 !brw_negate_immediate(value.type, &value.as_brw_reg())) { 181 return false; 182 } 183 } 184 185 value = swizzle(value, inst->src[arg].swizzle); 186 187 switch (inst->opcode) { 188 case BRW_OPCODE_MOV: 189 case SHADER_OPCODE_BROADCAST: 190 inst->src[arg] = value; 191 return true; 192 193 case VEC4_OPCODE_UNTYPED_ATOMIC: 194 if (arg == 1) { 195 inst->src[arg] = value; 196 return true; 197 } 198 break; 199 200 case SHADER_OPCODE_POW: 201 case SHADER_OPCODE_INT_QUOTIENT: 202 case SHADER_OPCODE_INT_REMAINDER: 203 if (devinfo->gen < 8) 204 break; 205 /* fallthrough */ 206 case BRW_OPCODE_DP2: 207 case BRW_OPCODE_DP3: 208 case BRW_OPCODE_DP4: 209 case BRW_OPCODE_DPH: 210 case BRW_OPCODE_BFI1: 211 case BRW_OPCODE_ASR: 212 case BRW_OPCODE_SHL: 213 case BRW_OPCODE_SHR: 214 case BRW_OPCODE_SUBB: 215 if (arg == 1) { 216 inst->src[arg] = value; 217 return true; 218 } 219 break; 220 221 case BRW_OPCODE_MACH: 222 case BRW_OPCODE_MUL: 223 case SHADER_OPCODE_MULH: 224 case BRW_OPCODE_ADD: 225 case BRW_OPCODE_OR: 226 case BRW_OPCODE_AND: 227 case BRW_OPCODE_XOR: 228 case BRW_OPCODE_ADDC: 229 if (arg == 1) { 230 inst->src[arg] = value; 231 return true; 232 } else if (arg == 0 && inst->src[1].file != IMM) { 233 /* Fit this constant in by commuting the operands. Exception: we 234 * can't do this for 32-bit integer MUL/MACH because it's asymmetric. 235 */ 236 if ((inst->opcode == BRW_OPCODE_MUL || 237 inst->opcode == BRW_OPCODE_MACH) && 238 (inst->src[1].type == BRW_REGISTER_TYPE_D || 239 inst->src[1].type == BRW_REGISTER_TYPE_UD)) 240 break; 241 inst->src[0] = inst->src[1]; 242 inst->src[1] = value; 243 return true; 244 } 245 break; 246 case GS_OPCODE_SET_WRITE_OFFSET: 247 /* This is just a multiply by a constant with special strides. 248 * The generator will handle immediates in both arguments (generating 249 * a single MOV of the product). So feel free to propagate in src0. 250 */ 251 inst->src[arg] = value; 252 return true; 253 254 case BRW_OPCODE_CMP: 255 if (arg == 1) { 256 inst->src[arg] = value; 257 return true; 258 } else if (arg == 0 && inst->src[1].file != IMM) { 259 enum brw_conditional_mod new_cmod; 260 261 new_cmod = brw_swap_cmod(inst->conditional_mod); 262 if (new_cmod != BRW_CONDITIONAL_NONE) { 263 /* Fit this constant in by swapping the operands and 264 * flipping the test. 265 */ 266 inst->src[0] = inst->src[1]; 267 inst->src[1] = value; 268 inst->conditional_mod = new_cmod; 269 return true; 270 } 271 } 272 break; 273 274 case BRW_OPCODE_SEL: 275 if (arg == 1) { 276 inst->src[arg] = value; 277 return true; 278 } else if (arg == 0 && inst->src[1].file != IMM) { 279 inst->src[0] = inst->src[1]; 280 inst->src[1] = value; 281 282 /* If this was predicated, flipping operands means 283 * we also need to flip the predicate. 284 */ 285 if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { 286 inst->predicate_inverse = !inst->predicate_inverse; 287 } 288 return true; 289 } 290 break; 291 292 default: 293 break; 294 } 295 296 return false; 297} 298 299static bool 300is_align1_opcode(unsigned opcode) 301{ 302 switch (opcode) { 303 case VEC4_OPCODE_DOUBLE_TO_F32: 304 case VEC4_OPCODE_DOUBLE_TO_D32: 305 case VEC4_OPCODE_DOUBLE_TO_U32: 306 case VEC4_OPCODE_TO_DOUBLE: 307 case VEC4_OPCODE_PICK_LOW_32BIT: 308 case VEC4_OPCODE_PICK_HIGH_32BIT: 309 case VEC4_OPCODE_SET_LOW_32BIT: 310 case VEC4_OPCODE_SET_HIGH_32BIT: 311 return true; 312 default: 313 return false; 314 } 315} 316 317static bool 318try_copy_propagate(const struct gen_device_info *devinfo, 319 vec4_instruction *inst, int arg, 320 const copy_entry *entry, int attributes_per_reg) 321{ 322 /* Build up the value we are propagating as if it were the source of a 323 * single MOV 324 */ 325 src_reg value = 326 get_copy_value(*entry, 327 brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, 328 WRITEMASK_XYZW)); 329 330 /* Check that we can propagate that value */ 331 if (value.file != UNIFORM && 332 value.file != VGRF && 333 value.file != ATTR) 334 return false; 335 336 /* In gen < 8 instructions that write 2 registers also need to read 2 337 * registers. Make sure we don't break that restriction by copy 338 * propagating from a uniform. 339 */ 340 if (devinfo->gen < 8 && inst->size_written > REG_SIZE && is_uniform(value)) 341 return false; 342 343 /* There is a regioning restriction such that if execsize == width 344 * and hstride != 0 then the vstride can't be 0. When we split instrutions 345 * that take a single-precision source (like F->DF conversions) we end up 346 * with a 4-wide source on an instruction with an execution size of 4. 347 * If we then copy-propagate the source from a uniform we also end up with a 348 * vstride of 0 and we violate the restriction. 349 */ 350 if (inst->exec_size == 4 && value.file == UNIFORM && 351 type_sz(value.type) == 4) 352 return false; 353 354 /* If the type of the copy value is different from the type of the 355 * instruction then the swizzles and writemasks involved don't have the same 356 * meaning and simply replacing the source would produce different semantics. 357 */ 358 if (type_sz(value.type) != type_sz(inst->src[arg].type)) 359 return false; 360 361 if (devinfo->gen >= 8 && (value.negate || value.abs) && 362 is_logic_op(inst->opcode)) { 363 return false; 364 } 365 366 if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE) 367 return false; 368 369 bool has_source_modifiers = value.negate || value.abs; 370 371 /* gen6 math and gen7+ SENDs from GRFs ignore source modifiers on 372 * instructions. 373 */ 374 if ((has_source_modifiers || value.file == UNIFORM || 375 value.swizzle != BRW_SWIZZLE_XYZW) && !inst->can_do_source_mods(devinfo)) 376 return false; 377 378 if (has_source_modifiers && 379 value.type != inst->src[arg].type && 380 !inst->can_change_types()) 381 return false; 382 383 if (has_source_modifiers && 384 inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE) 385 return false; 386 387 unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle, 388 value.swizzle); 389 390 /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles 391 * so copy-propagation won't be safe if the composed swizzle is anything 392 * other than the identity. 393 */ 394 if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW) 395 return false; 396 397 if (inst->is_3src(devinfo) && 398 (value.file == UNIFORM || 399 (value.file == ATTR && attributes_per_reg != 1)) && 400 !brw_is_single_value_swizzle(composed_swizzle)) 401 return false; 402 403 if (inst->is_send_from_grf()) 404 return false; 405 406 /* we can't generally copy-propagate UD negations becuse we 407 * end up accessing the resulting values as signed integers 408 * instead. See also resolve_ud_negate(). 409 */ 410 if (value.negate && 411 value.type == BRW_REGISTER_TYPE_UD) 412 return false; 413 414 /* Don't report progress if this is a noop. */ 415 if (value.equals(inst->src[arg])) 416 return false; 417 418 const unsigned dst_saturate_mask = inst->dst.writemask & 419 brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask); 420 421 if (dst_saturate_mask) { 422 /* We either saturate all or nothing. */ 423 if (dst_saturate_mask != inst->dst.writemask) 424 return false; 425 426 /* Limit saturate propagation only to SEL with src1 bounded within 0.0 427 * and 1.0, otherwise skip copy propagate altogether. 428 */ 429 switch(inst->opcode) { 430 case BRW_OPCODE_SEL: 431 if (arg != 0 || 432 inst->src[0].type != BRW_REGISTER_TYPE_F || 433 inst->src[1].file != IMM || 434 inst->src[1].type != BRW_REGISTER_TYPE_F || 435 inst->src[1].f < 0.0 || 436 inst->src[1].f > 1.0) { 437 return false; 438 } 439 if (!inst->saturate) 440 inst->saturate = true; 441 break; 442 default: 443 return false; 444 } 445 } 446 447 /* Build the final value */ 448 if (inst->src[arg].abs) { 449 value.negate = false; 450 value.abs = true; 451 } 452 if (inst->src[arg].negate) 453 value.negate = !value.negate; 454 455 value.swizzle = composed_swizzle; 456 if (has_source_modifiers && 457 value.type != inst->src[arg].type) { 458 assert(inst->can_change_types()); 459 for (int i = 0; i < 3; i++) { 460 inst->src[i].type = value.type; 461 } 462 inst->dst.type = value.type; 463 } else { 464 value.type = inst->src[arg].type; 465 } 466 467 inst->src[arg] = value; 468 return true; 469} 470 471bool 472vec4_visitor::opt_copy_propagation(bool do_constant_prop) 473{ 474 /* If we are in dual instanced or single mode, then attributes are going 475 * to be interleaved, so one register contains two attribute slots. 476 */ 477 const int attributes_per_reg = 478 prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2; 479 bool progress = false; 480 struct copy_entry entries[alloc.total_size]; 481 482 memset(&entries, 0, sizeof(entries)); 483 484 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 485 /* This pass only works on basic blocks. If there's flow 486 * control, throw out all our information and start from 487 * scratch. 488 * 489 * This should really be fixed by using a structure like in 490 * src/glsl/opt_copy_propagation.cpp to track available copies. 491 */ 492 if (!is_dominated_by_previous_instruction(inst)) { 493 memset(&entries, 0, sizeof(entries)); 494 continue; 495 } 496 497 /* For each source arg, see if each component comes from a copy 498 * from the same type file (IMM, VGRF, UNIFORM), and try 499 * optimizing out access to the copy result 500 */ 501 for (int i = 2; i >= 0; i--) { 502 /* Copied values end up in GRFs, and we don't track reladdr 503 * accesses. 504 */ 505 if (inst->src[i].file != VGRF || 506 inst->src[i].reladdr) 507 continue; 508 509 /* We only handle register-aligned single GRF copies. */ 510 if (inst->size_read(i) != REG_SIZE || 511 inst->src[i].offset % REG_SIZE) 512 continue; 513 514 const unsigned reg = (alloc.offsets[inst->src[i].nr] + 515 inst->src[i].offset / REG_SIZE); 516 const copy_entry &entry = entries[reg]; 517 518 if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry)) 519 progress = true; 520 else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg)) 521 progress = true; 522 } 523 524 /* Track available source registers. */ 525 if (inst->dst.file == VGRF) { 526 const int reg = 527 alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE; 528 529 /* Update our destination's current channel values. For a direct copy, 530 * the value is the newly propagated source. Otherwise, we don't know 531 * the new value, so clear it. 532 */ 533 bool direct_copy = is_direct_copy(inst); 534 entries[reg].saturatemask &= ~inst->dst.writemask; 535 for (int i = 0; i < 4; i++) { 536 if (inst->dst.writemask & (1 << i)) { 537 entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL; 538 entries[reg].saturatemask |= 539 inst->saturate && direct_copy ? 1 << i : 0; 540 } 541 } 542 543 /* Clear the records for any registers whose current value came from 544 * our destination's updated channels, as the two are no longer equal. 545 */ 546 if (inst->dst.reladdr) 547 memset(&entries, 0, sizeof(entries)); 548 else { 549 for (unsigned i = 0; i < alloc.total_size; i++) { 550 for (int j = 0; j < 4; j++) { 551 if (is_channel_updated(inst, entries[i].value, j)) { 552 entries[i].value[j] = NULL; 553 entries[i].saturatemask &= ~(1 << j); 554 } 555 } 556 } 557 } 558 } 559 } 560 561 if (progress) 562 invalidate_live_intervals(); 563 564 return progress; 565} 566 567} /* namespace brw */ 568