1/* 2 * Copyright © 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** 25 * @file brw_vec4_copy_propagation.cpp 26 * 27 * Implements tracking of values copied between registers, and 28 * optimizations based on that: copy propagation and constant 29 * propagation. 30 */ 31 32#include "brw_vec4.h" 33#include "brw_cfg.h" 34#include "brw_eu.h" 35 36namespace brw { 37 38struct copy_entry { 39 src_reg *value[4]; 40 int saturatemask; 41}; 42 43static bool 44is_direct_copy(vec4_instruction *inst) 45{ 46 return (inst->opcode == BRW_OPCODE_MOV && 47 !inst->predicate && 48 inst->dst.file == VGRF && 49 inst->dst.offset % REG_SIZE == 0 && 50 !inst->dst.reladdr && 51 !inst->src[0].reladdr && 52 (inst->dst.type == inst->src[0].type || 53 (inst->dst.type == BRW_REGISTER_TYPE_F && 54 inst->src[0].type == BRW_REGISTER_TYPE_VF))); 55} 56 57static bool 58is_dominated_by_previous_instruction(vec4_instruction *inst) 59{ 60 return (inst->opcode != BRW_OPCODE_DO && 61 inst->opcode != BRW_OPCODE_WHILE && 62 inst->opcode != BRW_OPCODE_ELSE && 63 inst->opcode != BRW_OPCODE_ENDIF); 64} 65 66static bool 67is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch) 68{ 69 const src_reg *src = values[ch]; 70 71 /* consider GRF only */ 72 assert(inst->dst.file == VGRF); 73 if (!src || src->file != VGRF) 74 return false; 75 76 return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) && 77 (inst->dst.offset != src->offset || 78 inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch))); 79} 80 81/** 82 * Get the origin of a copy as a single register if all components present in 83 * the given readmask originate from the same register and have compatible 84 * regions, otherwise return a BAD_FILE register. 85 */ 86static src_reg 87get_copy_value(const copy_entry &entry, unsigned readmask) 88{ 89 unsigned swz[4] = {}; 90 src_reg value; 91 92 for (unsigned i = 0; i < 4; i++) { 93 if (readmask & (1 << i)) { 94 if (entry.value[i]) { 95 src_reg src = *entry.value[i]; 96 97 if (src.file == IMM) { 98 swz[i] = i; 99 } else { 100 swz[i] = BRW_GET_SWZ(src.swizzle, i); 101 /* Overwrite the original swizzle so the src_reg::equals call 102 * below doesn't care about it, the correct swizzle will be 103 * calculated once the swizzles of all components are known. 104 */ 105 src.swizzle = BRW_SWIZZLE_XYZW; 106 } 107 108 if (value.file == BAD_FILE) { 109 value = src; 110 } else if (!value.equals(src)) { 111 return src_reg(); 112 } 113 } else { 114 return src_reg(); 115 } 116 } 117 } 118 119 return swizzle(value, 120 brw_compose_swizzle(brw_swizzle_for_mask(readmask), 121 BRW_SWIZZLE4(swz[0], swz[1], 122 swz[2], swz[3]))); 123} 124 125static bool 126try_constant_propagate(vec4_instruction *inst, 127 int arg, const copy_entry *entry) 128{ 129 /* For constant propagation, we only handle the same constant 130 * across all 4 channels. Some day, we should handle the 8-bit 131 * float vector format, which would let us constant propagate 132 * vectors better. 133 * We could be more aggressive here -- some channels might not get used 134 * based on the destination writemask. 135 */ 136 src_reg value = 137 get_copy_value(*entry, 138 brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, 139 WRITEMASK_XYZW)); 140 141 if (value.file != IMM) 142 return false; 143 144 /* 64-bit types can't be used except for one-source instructions, which 145 * higher levels should have constant folded away, so there's no point in 146 * propagating immediates here. 147 */ 148 if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8) 149 return false; 150 151 if (value.type == BRW_REGISTER_TYPE_VF) { 152 /* The result of bit-casting the component values of a vector float 153 * cannot in general be represented as an immediate. 154 */ 155 if (inst->src[arg].type != BRW_REGISTER_TYPE_F) 156 return false; 157 } else { 158 value.type = inst->src[arg].type; 159 } 160 161 if (inst->src[arg].abs) { 162 if (!brw_abs_immediate(value.type, &value.as_brw_reg())) 163 return false; 164 } 165 166 if (inst->src[arg].negate) { 167 if (!brw_negate_immediate(value.type, &value.as_brw_reg())) 168 return false; 169 } 170 171 value = swizzle(value, inst->src[arg].swizzle); 172 173 switch (inst->opcode) { 174 case BRW_OPCODE_MOV: 175 case SHADER_OPCODE_BROADCAST: 176 inst->src[arg] = value; 177 return true; 178 179 case VEC4_OPCODE_UNTYPED_ATOMIC: 180 if (arg == 1) { 181 inst->src[arg] = value; 182 return true; 183 } 184 break; 185 186 case SHADER_OPCODE_POW: 187 case SHADER_OPCODE_INT_QUOTIENT: 188 case SHADER_OPCODE_INT_REMAINDER: 189 break; 190 case BRW_OPCODE_DP2: 191 case BRW_OPCODE_DP3: 192 case BRW_OPCODE_DP4: 193 case BRW_OPCODE_DPH: 194 case BRW_OPCODE_BFI1: 195 case BRW_OPCODE_ASR: 196 case BRW_OPCODE_SHL: 197 case BRW_OPCODE_SHR: 198 case BRW_OPCODE_SUBB: 199 if (arg == 1) { 200 inst->src[arg] = value; 201 return true; 202 } 203 break; 204 205 case BRW_OPCODE_MACH: 206 case BRW_OPCODE_MUL: 207 case SHADER_OPCODE_MULH: 208 case BRW_OPCODE_ADD: 209 case BRW_OPCODE_OR: 210 case BRW_OPCODE_AND: 211 case BRW_OPCODE_XOR: 212 case BRW_OPCODE_ADDC: 213 if (arg == 1) { 214 inst->src[arg] = value; 215 return true; 216 } else if (arg == 0 && inst->src[1].file != IMM) { 217 /* Fit this constant in by commuting the operands. Exception: we 218 * can't do this for 32-bit integer MUL/MACH because it's asymmetric. 219 */ 220 if ((inst->opcode == BRW_OPCODE_MUL || 221 inst->opcode == BRW_OPCODE_MACH) && 222 (inst->src[1].type == BRW_REGISTER_TYPE_D || 223 inst->src[1].type == BRW_REGISTER_TYPE_UD)) 224 break; 225 inst->src[0] = inst->src[1]; 226 inst->src[1] = value; 227 return true; 228 } 229 break; 230 case GS_OPCODE_SET_WRITE_OFFSET: 231 /* This is just a multiply by a constant with special strides. 232 * The generator will handle immediates in both arguments (generating 233 * a single MOV of the product). So feel free to propagate in src0. 234 */ 235 inst->src[arg] = value; 236 return true; 237 238 case BRW_OPCODE_CMP: 239 if (arg == 1) { 240 inst->src[arg] = value; 241 return true; 242 } else if (arg == 0 && inst->src[1].file != IMM) { 243 enum brw_conditional_mod new_cmod; 244 245 new_cmod = brw_swap_cmod(inst->conditional_mod); 246 if (new_cmod != BRW_CONDITIONAL_NONE) { 247 /* Fit this constant in by swapping the operands and 248 * flipping the test. 249 */ 250 inst->src[0] = inst->src[1]; 251 inst->src[1] = value; 252 inst->conditional_mod = new_cmod; 253 return true; 254 } 255 } 256 break; 257 258 case BRW_OPCODE_SEL: 259 if (arg == 1) { 260 inst->src[arg] = value; 261 return true; 262 } else if (arg == 0 && inst->src[1].file != IMM) { 263 inst->src[0] = inst->src[1]; 264 inst->src[1] = value; 265 266 /* If this was predicated, flipping operands means 267 * we also need to flip the predicate. 268 */ 269 if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { 270 inst->predicate_inverse = !inst->predicate_inverse; 271 } 272 return true; 273 } 274 break; 275 276 default: 277 break; 278 } 279 280 return false; 281} 282 283static bool 284is_align1_opcode(unsigned opcode) 285{ 286 switch (opcode) { 287 case VEC4_OPCODE_DOUBLE_TO_F32: 288 case VEC4_OPCODE_DOUBLE_TO_D32: 289 case VEC4_OPCODE_DOUBLE_TO_U32: 290 case VEC4_OPCODE_TO_DOUBLE: 291 case VEC4_OPCODE_PICK_LOW_32BIT: 292 case VEC4_OPCODE_PICK_HIGH_32BIT: 293 case VEC4_OPCODE_SET_LOW_32BIT: 294 case VEC4_OPCODE_SET_HIGH_32BIT: 295 return true; 296 default: 297 return false; 298 } 299} 300 301static bool 302try_copy_propagate(const struct intel_device_info *devinfo, 303 vec4_instruction *inst, int arg, 304 const copy_entry *entry, int attributes_per_reg) 305{ 306 /* Build up the value we are propagating as if it were the source of a 307 * single MOV 308 */ 309 src_reg value = 310 get_copy_value(*entry, 311 brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, 312 WRITEMASK_XYZW)); 313 314 /* Check that we can propagate that value */ 315 if (value.file != UNIFORM && 316 value.file != VGRF && 317 value.file != ATTR) 318 return false; 319 320 /* Instructions that write 2 registers also need to read 2 registers. Make 321 * sure we don't break that restriction by copy propagating from a uniform. 322 */ 323 if (inst->size_written > REG_SIZE && is_uniform(value)) 324 return false; 325 326 /* There is a regioning restriction such that if execsize == width 327 * and hstride != 0 then the vstride can't be 0. When we split instrutions 328 * that take a single-precision source (like F->DF conversions) we end up 329 * with a 4-wide source on an instruction with an execution size of 4. 330 * If we then copy-propagate the source from a uniform we also end up with a 331 * vstride of 0 and we violate the restriction. 332 */ 333 if (inst->exec_size == 4 && value.file == UNIFORM && 334 type_sz(value.type) == 4) 335 return false; 336 337 /* If the type of the copy value is different from the type of the 338 * instruction then the swizzles and writemasks involved don't have the same 339 * meaning and simply replacing the source would produce different semantics. 340 */ 341 if (type_sz(value.type) != type_sz(inst->src[arg].type)) 342 return false; 343 344 if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE) 345 return false; 346 347 bool has_source_modifiers = value.negate || value.abs; 348 349 /* gfx6 math and gfx7+ SENDs from GRFs ignore source modifiers on 350 * instructions. 351 */ 352 if (has_source_modifiers && !inst->can_do_source_mods(devinfo)) 353 return false; 354 355 /* Reject cases that would violate register regioning restrictions. */ 356 if ((value.file == UNIFORM || value.swizzle != BRW_SWIZZLE_XYZW) && 357 ((devinfo->ver == 6 && inst->is_math()) || 358 inst->is_send_from_grf() || 359 inst->uses_indirect_addressing())) { 360 return false; 361 } 362 363 if (has_source_modifiers && 364 value.type != inst->src[arg].type && 365 !inst->can_change_types()) 366 return false; 367 368 if (has_source_modifiers && 369 (inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE || 370 inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)) 371 return false; 372 373 unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle, 374 value.swizzle); 375 376 /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles 377 * so copy-propagation won't be safe if the composed swizzle is anything 378 * other than the identity. 379 */ 380 if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW) 381 return false; 382 383 if (inst->is_3src(devinfo) && 384 (value.file == UNIFORM || 385 (value.file == ATTR && attributes_per_reg != 1)) && 386 !brw_is_single_value_swizzle(composed_swizzle)) 387 return false; 388 389 if (inst->is_send_from_grf()) 390 return false; 391 392 /* we can't generally copy-propagate UD negations becuse we 393 * end up accessing the resulting values as signed integers 394 * instead. See also resolve_ud_negate(). 395 */ 396 if (value.negate && 397 value.type == BRW_REGISTER_TYPE_UD) 398 return false; 399 400 /* Don't report progress if this is a noop. */ 401 if (value.equals(inst->src[arg])) 402 return false; 403 404 const unsigned dst_saturate_mask = inst->dst.writemask & 405 brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask); 406 407 if (dst_saturate_mask) { 408 /* We either saturate all or nothing. */ 409 if (dst_saturate_mask != inst->dst.writemask) 410 return false; 411 412 /* Limit saturate propagation only to SEL with src1 bounded within 0.0 413 * and 1.0, otherwise skip copy propagate altogether. 414 */ 415 switch(inst->opcode) { 416 case BRW_OPCODE_SEL: 417 if (arg != 0 || 418 inst->src[0].type != BRW_REGISTER_TYPE_F || 419 inst->src[1].file != IMM || 420 inst->src[1].type != BRW_REGISTER_TYPE_F || 421 inst->src[1].f < 0.0 || 422 inst->src[1].f > 1.0) { 423 return false; 424 } 425 if (!inst->saturate) 426 inst->saturate = true; 427 break; 428 default: 429 return false; 430 } 431 } 432 433 /* Build the final value */ 434 if (inst->src[arg].abs) { 435 value.negate = false; 436 value.abs = true; 437 } 438 if (inst->src[arg].negate) 439 value.negate = !value.negate; 440 441 value.swizzle = composed_swizzle; 442 if (has_source_modifiers && 443 value.type != inst->src[arg].type) { 444 assert(inst->can_change_types()); 445 for (int i = 0; i < 3; i++) { 446 inst->src[i].type = value.type; 447 } 448 inst->dst.type = value.type; 449 } else { 450 value.type = inst->src[arg].type; 451 } 452 453 inst->src[arg] = value; 454 return true; 455} 456 457bool 458vec4_visitor::opt_copy_propagation(bool do_constant_prop) 459{ 460 /* If we are in dual instanced or single mode, then attributes are going 461 * to be interleaved, so one register contains two attribute slots. 462 */ 463 const int attributes_per_reg = 464 prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2; 465 bool progress = false; 466 struct copy_entry entries[alloc.total_size]; 467 468 memset(&entries, 0, sizeof(entries)); 469 470 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 471 /* This pass only works on basic blocks. If there's flow 472 * control, throw out all our information and start from 473 * scratch. 474 * 475 * This should really be fixed by using a structure like in 476 * src/glsl/opt_copy_propagation.cpp to track available copies. 477 */ 478 if (!is_dominated_by_previous_instruction(inst)) { 479 memset(&entries, 0, sizeof(entries)); 480 continue; 481 } 482 483 /* For each source arg, see if each component comes from a copy 484 * from the same type file (IMM, VGRF, UNIFORM), and try 485 * optimizing out access to the copy result 486 */ 487 for (int i = 2; i >= 0; i--) { 488 /* Copied values end up in GRFs, and we don't track reladdr 489 * accesses. 490 */ 491 if (inst->src[i].file != VGRF || 492 inst->src[i].reladdr) 493 continue; 494 495 /* We only handle register-aligned single GRF copies. */ 496 if (inst->size_read(i) != REG_SIZE || 497 inst->src[i].offset % REG_SIZE) 498 continue; 499 500 const unsigned reg = (alloc.offsets[inst->src[i].nr] + 501 inst->src[i].offset / REG_SIZE); 502 const copy_entry &entry = entries[reg]; 503 504 if (do_constant_prop && try_constant_propagate(inst, i, &entry)) 505 progress = true; 506 else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg)) 507 progress = true; 508 } 509 510 /* Track available source registers. */ 511 if (inst->dst.file == VGRF) { 512 const int reg = 513 alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE; 514 515 /* Update our destination's current channel values. For a direct copy, 516 * the value is the newly propagated source. Otherwise, we don't know 517 * the new value, so clear it. 518 */ 519 bool direct_copy = is_direct_copy(inst); 520 entries[reg].saturatemask &= ~inst->dst.writemask; 521 for (int i = 0; i < 4; i++) { 522 if (inst->dst.writemask & (1 << i)) { 523 entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL; 524 entries[reg].saturatemask |= 525 inst->saturate && direct_copy ? 1 << i : 0; 526 } 527 } 528 529 /* Clear the records for any registers whose current value came from 530 * our destination's updated channels, as the two are no longer equal. 531 */ 532 if (inst->dst.reladdr) 533 memset(&entries, 0, sizeof(entries)); 534 else { 535 for (unsigned i = 0; i < alloc.total_size; i++) { 536 for (int j = 0; j < 4; j++) { 537 if (is_channel_updated(inst, entries[i].value, j)) { 538 entries[i].value[j] = NULL; 539 entries[i].saturatemask &= ~(1 << j); 540 } 541 } 542 } 543 } 544 } 545 } 546 547 if (progress) 548 invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | 549 DEPENDENCY_INSTRUCTION_DETAIL); 550 551 return progress; 552} 553 554} /* namespace brw */ 555