1/* 2 * Copyright © 2018 Valve Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25#include "aco_ir.h" 26 27#include <map> 28#include <unordered_map> 29#include <vector> 30 31/* 32 * Implements the algorithm for dominator-tree value numbering 33 * from "Value Numbering" by Briggs, Cooper, and Simpson. 34 */ 35 36namespace aco { 37namespace { 38 39inline uint32_t 40murmur_32_scramble(uint32_t h, uint32_t k) 41{ 42 k *= 0xcc9e2d51; 43 k = (k << 15) | (k >> 17); 44 h ^= k * 0x1b873593; 45 h = (h << 13) | (h >> 19); 46 h = h * 5 + 0xe6546b64; 47 return h; 48} 49 50template <typename T> 51uint32_t 52hash_murmur_32(Instruction* instr) 53{ 54 uint32_t hash = uint32_t(instr->format) << 16 | uint32_t(instr->opcode); 55 56 for (const Operand& op : instr->operands) 57 hash = murmur_32_scramble(hash, op.constantValue()); 58 59 /* skip format, opcode and pass_flags */ 60 for (unsigned i = 2; i < (sizeof(T) >> 2); i++) { 61 uint32_t u; 62 /* Accesses it though a byte array, so doesn't violate the strict aliasing rule */ 63 memcpy(&u, reinterpret_cast<uint8_t*>(instr) + i * 4, 4); 64 hash = murmur_32_scramble(hash, u); 65 } 66 67 /* Finalize. */ 68 uint32_t len = instr->operands.size() + instr->definitions.size() + sizeof(T); 69 hash ^= len; 70 hash ^= hash >> 16; 71 hash *= 0x85ebca6b; 72 hash ^= hash >> 13; 73 hash *= 0xc2b2ae35; 74 hash ^= hash >> 16; 75 return hash; 76} 77 78struct InstrHash { 79 /* This hash function uses the Murmur3 algorithm written by Austin Appleby 80 * https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp 81 * 82 * In order to calculate the expression set, only the right-hand-side of an 83 * instruction is used for the hash, i.e. everything except the definitions. 84 */ 85 std::size_t operator()(Instruction* instr) const 86 { 87 if (instr->isVOP3()) 88 return hash_murmur_32<VOP3_instruction>(instr); 89 90 if (instr->isDPP()) 91 return hash_murmur_32<DPP_instruction>(instr); 92 93 if (instr->isSDWA()) 94 return hash_murmur_32<SDWA_instruction>(instr); 95 96 switch (instr->format) { 97 case Format::SMEM: return hash_murmur_32<SMEM_instruction>(instr); 98 case Format::VINTRP: return hash_murmur_32<Interp_instruction>(instr); 99 case Format::DS: return hash_murmur_32<DS_instruction>(instr); 100 case Format::SOPP: return hash_murmur_32<SOPP_instruction>(instr); 101 case Format::SOPK: return hash_murmur_32<SOPK_instruction>(instr); 102 case Format::EXP: return hash_murmur_32<Export_instruction>(instr); 103 case Format::MUBUF: return hash_murmur_32<MUBUF_instruction>(instr); 104 case Format::MIMG: return hash_murmur_32<MIMG_instruction>(instr); 105 case Format::MTBUF: return hash_murmur_32<MTBUF_instruction>(instr); 106 case Format::FLAT: return hash_murmur_32<FLAT_instruction>(instr); 107 case Format::PSEUDO_BRANCH: return hash_murmur_32<Pseudo_branch_instruction>(instr); 108 case Format::PSEUDO_REDUCTION: return hash_murmur_32<Pseudo_reduction_instruction>(instr); 109 default: return hash_murmur_32<Instruction>(instr); 110 } 111 } 112}; 113 114struct InstrPred { 115 bool operator()(Instruction* a, Instruction* b) const 116 { 117 if (a->format != b->format) 118 return false; 119 if (a->opcode != b->opcode) 120 return false; 121 if (a->operands.size() != b->operands.size() || 122 a->definitions.size() != b->definitions.size()) 123 return false; /* possible with pseudo-instructions */ 124 for (unsigned i = 0; i < a->operands.size(); i++) { 125 if (a->operands[i].isConstant()) { 126 if (!b->operands[i].isConstant()) 127 return false; 128 if (a->operands[i].constantValue() != b->operands[i].constantValue()) 129 return false; 130 } else if (a->operands[i].isTemp()) { 131 if (!b->operands[i].isTemp()) 132 return false; 133 if (a->operands[i].tempId() != b->operands[i].tempId()) 134 return false; 135 } else if (a->operands[i].isUndefined() ^ b->operands[i].isUndefined()) 136 return false; 137 if (a->operands[i].isFixed()) { 138 if (!b->operands[i].isFixed()) 139 return false; 140 if (a->operands[i].physReg() != b->operands[i].physReg()) 141 return false; 142 if (a->operands[i].physReg() == exec && a->pass_flags != b->pass_flags) 143 return false; 144 } 145 } 146 for (unsigned i = 0; i < a->definitions.size(); i++) { 147 if (a->definitions[i].isTemp()) { 148 if (!b->definitions[i].isTemp()) 149 return false; 150 if (a->definitions[i].regClass() != b->definitions[i].regClass()) 151 return false; 152 } 153 if (a->definitions[i].isFixed()) { 154 if (!b->definitions[i].isFixed()) 155 return false; 156 if (a->definitions[i].physReg() != b->definitions[i].physReg()) 157 return false; 158 if (a->definitions[i].physReg() == exec) 159 return false; 160 } 161 } 162 163 if (a->opcode == aco_opcode::v_readfirstlane_b32) 164 return a->pass_flags == b->pass_flags; 165 166 if (a->isVOP3()) { 167 VOP3_instruction& a3 = a->vop3(); 168 VOP3_instruction& b3 = b->vop3(); 169 for (unsigned i = 0; i < 3; i++) { 170 if (a3.abs[i] != b3.abs[i] || a3.neg[i] != b3.neg[i]) 171 return false; 172 } 173 return a3.clamp == b3.clamp && a3.omod == b3.omod && a3.opsel == b3.opsel; 174 } 175 if (a->isDPP()) { 176 DPP_instruction& aDPP = a->dpp(); 177 DPP_instruction& bDPP = b->dpp(); 178 return aDPP.pass_flags == bDPP.pass_flags && aDPP.dpp_ctrl == bDPP.dpp_ctrl && 179 aDPP.bank_mask == bDPP.bank_mask && aDPP.row_mask == bDPP.row_mask && 180 aDPP.bound_ctrl == bDPP.bound_ctrl && aDPP.abs[0] == bDPP.abs[0] && 181 aDPP.abs[1] == bDPP.abs[1] && aDPP.neg[0] == bDPP.neg[0] && 182 aDPP.neg[1] == bDPP.neg[1]; 183 } 184 if (a->isSDWA()) { 185 SDWA_instruction& aSDWA = a->sdwa(); 186 SDWA_instruction& bSDWA = b->sdwa(); 187 return aSDWA.sel[0] == bSDWA.sel[0] && aSDWA.sel[1] == bSDWA.sel[1] && 188 aSDWA.dst_sel == bSDWA.dst_sel && aSDWA.abs[0] == bSDWA.abs[0] && 189 aSDWA.abs[1] == bSDWA.abs[1] && aSDWA.neg[0] == bSDWA.neg[0] && 190 aSDWA.neg[1] == bSDWA.neg[1] && aSDWA.clamp == bSDWA.clamp && 191 aSDWA.omod == bSDWA.omod; 192 } 193 194 switch (a->format) { 195 case Format::SOPK: { 196 if (a->opcode == aco_opcode::s_getreg_b32) 197 return false; 198 SOPK_instruction& aK = a->sopk(); 199 SOPK_instruction& bK = b->sopk(); 200 return aK.imm == bK.imm; 201 } 202 case Format::SMEM: { 203 SMEM_instruction& aS = a->smem(); 204 SMEM_instruction& bS = b->smem(); 205 /* isel shouldn't be creating situations where this assertion fails */ 206 assert(aS.prevent_overflow == bS.prevent_overflow); 207 return aS.sync == bS.sync && aS.glc == bS.glc && aS.dlc == bS.dlc && aS.nv == bS.nv && 208 aS.disable_wqm == bS.disable_wqm && aS.prevent_overflow == bS.prevent_overflow; 209 } 210 case Format::VINTRP: { 211 Interp_instruction& aI = a->vintrp(); 212 Interp_instruction& bI = b->vintrp(); 213 if (aI.attribute != bI.attribute) 214 return false; 215 if (aI.component != bI.component) 216 return false; 217 return true; 218 } 219 case Format::VOP3P: { 220 VOP3P_instruction& a3P = a->vop3p(); 221 VOP3P_instruction& b3P = b->vop3p(); 222 for (unsigned i = 0; i < 3; i++) { 223 if (a3P.neg_lo[i] != b3P.neg_lo[i] || a3P.neg_hi[i] != b3P.neg_hi[i]) 224 return false; 225 } 226 return a3P.opsel_lo == b3P.opsel_lo && a3P.opsel_hi == b3P.opsel_hi && 227 a3P.clamp == b3P.clamp; 228 } 229 case Format::PSEUDO_REDUCTION: { 230 Pseudo_reduction_instruction& aR = a->reduction(); 231 Pseudo_reduction_instruction& bR = b->reduction(); 232 return aR.pass_flags == bR.pass_flags && aR.reduce_op == bR.reduce_op && 233 aR.cluster_size == bR.cluster_size; 234 } 235 case Format::DS: { 236 assert(a->opcode == aco_opcode::ds_bpermute_b32 || 237 a->opcode == aco_opcode::ds_permute_b32 || a->opcode == aco_opcode::ds_swizzle_b32); 238 DS_instruction& aD = a->ds(); 239 DS_instruction& bD = b->ds(); 240 return aD.sync == bD.sync && aD.pass_flags == bD.pass_flags && aD.gds == bD.gds && 241 aD.offset0 == bD.offset0 && aD.offset1 == bD.offset1; 242 } 243 case Format::MTBUF: { 244 MTBUF_instruction& aM = a->mtbuf(); 245 MTBUF_instruction& bM = b->mtbuf(); 246 return aM.sync == bM.sync && aM.dfmt == bM.dfmt && aM.nfmt == bM.nfmt && 247 aM.offset == bM.offset && aM.offen == bM.offen && aM.idxen == bM.idxen && 248 aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && aM.tfe == bM.tfe && 249 aM.disable_wqm == bM.disable_wqm; 250 } 251 case Format::MUBUF: { 252 MUBUF_instruction& aM = a->mubuf(); 253 MUBUF_instruction& bM = b->mubuf(); 254 return aM.sync == bM.sync && aM.offset == bM.offset && aM.offen == bM.offen && 255 aM.idxen == bM.idxen && aM.glc == bM.glc && aM.dlc == bM.dlc && aM.slc == bM.slc && 256 aM.tfe == bM.tfe && aM.lds == bM.lds && aM.disable_wqm == bM.disable_wqm; 257 } 258 case Format::MIMG: { 259 MIMG_instruction& aM = a->mimg(); 260 MIMG_instruction& bM = b->mimg(); 261 return aM.sync == bM.sync && aM.dmask == bM.dmask && aM.unrm == bM.unrm && 262 aM.glc == bM.glc && aM.slc == bM.slc && aM.tfe == bM.tfe && aM.da == bM.da && 263 aM.lwe == bM.lwe && aM.r128 == bM.r128 && aM.a16 == bM.a16 && aM.d16 == bM.d16 && 264 aM.disable_wqm == bM.disable_wqm; 265 } 266 case Format::FLAT: 267 case Format::GLOBAL: 268 case Format::SCRATCH: 269 case Format::EXP: 270 case Format::SOPP: 271 case Format::PSEUDO_BRANCH: 272 case Format::PSEUDO_BARRIER: assert(false); 273 default: return true; 274 } 275 } 276}; 277 278using expr_set = std::unordered_map<Instruction*, uint32_t, InstrHash, InstrPred>; 279 280struct vn_ctx { 281 Program* program; 282 expr_set expr_values; 283 std::map<uint32_t, Temp> renames; 284 285 /* The exec id should be the same on the same level of control flow depth. 286 * Together with the check for dominator relations, it is safe to assume 287 * that the same exec_id also means the same execution mask. 288 * Discards increment the exec_id, so that it won't return to the previous value. 289 */ 290 uint32_t exec_id = 1; 291 292 vn_ctx(Program* program_) : program(program_) 293 { 294 static_assert(sizeof(Temp) == 4, "Temp must fit in 32bits"); 295 unsigned size = 0; 296 for (Block& block : program->blocks) 297 size += block.instructions.size(); 298 expr_values.reserve(size); 299 } 300}; 301 302/* dominates() returns true if the parent block dominates the child block and 303 * if the parent block is part of the same loop or has a smaller loop nest depth. 304 */ 305bool 306dominates(vn_ctx& ctx, uint32_t parent, uint32_t child) 307{ 308 unsigned parent_loop_nest_depth = ctx.program->blocks[parent].loop_nest_depth; 309 while (parent < child && parent_loop_nest_depth <= ctx.program->blocks[child].loop_nest_depth) 310 child = ctx.program->blocks[child].logical_idom; 311 312 return parent == child; 313} 314 315/** Returns whether this instruction can safely be removed 316 * and replaced by an equal expression. 317 * This is in particular true for ALU instructions and 318 * read-only memory instructions. 319 * 320 * Note that expr_set must not be used with instructions 321 * which cannot be eliminated. 322 */ 323bool 324can_eliminate(aco_ptr<Instruction>& instr) 325{ 326 switch (instr->format) { 327 case Format::FLAT: 328 case Format::GLOBAL: 329 case Format::SCRATCH: 330 case Format::EXP: 331 case Format::SOPP: 332 case Format::PSEUDO_BRANCH: 333 case Format::PSEUDO_BARRIER: return false; 334 case Format::DS: 335 return instr->opcode == aco_opcode::ds_bpermute_b32 || 336 instr->opcode == aco_opcode::ds_permute_b32 || 337 instr->opcode == aco_opcode::ds_swizzle_b32; 338 case Format::SMEM: 339 case Format::MUBUF: 340 case Format::MIMG: 341 case Format::MTBUF: 342 if (!get_sync_info(instr.get()).can_reorder()) 343 return false; 344 break; 345 default: break; 346 } 347 348 if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi || 349 instr->opcode == aco_opcode::p_linear_phi || instr->definitions[0].isNoCSE()) 350 return false; 351 352 return true; 353} 354 355void 356process_block(vn_ctx& ctx, Block& block) 357{ 358 std::vector<aco_ptr<Instruction>> new_instructions; 359 new_instructions.reserve(block.instructions.size()); 360 361 for (aco_ptr<Instruction>& instr : block.instructions) { 362 /* first, rename operands */ 363 for (Operand& op : instr->operands) { 364 if (!op.isTemp()) 365 continue; 366 auto it = ctx.renames.find(op.tempId()); 367 if (it != ctx.renames.end()) 368 op.setTemp(it->second); 369 } 370 371 if (instr->opcode == aco_opcode::p_discard_if || 372 instr->opcode == aco_opcode::p_demote_to_helper) 373 ctx.exec_id++; 374 375 if (!can_eliminate(instr)) { 376 new_instructions.emplace_back(std::move(instr)); 377 continue; 378 } 379 380 /* simple copy-propagation through renaming */ 381 bool copy_instr = 382 instr->opcode == aco_opcode::p_parallelcopy || 383 (instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1); 384 if (copy_instr && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() && 385 instr->operands[0].regClass() == instr->definitions[0].regClass()) { 386 ctx.renames[instr->definitions[0].tempId()] = instr->operands[0].getTemp(); 387 continue; 388 } 389 390 instr->pass_flags = ctx.exec_id; 391 std::pair<expr_set::iterator, bool> res = ctx.expr_values.emplace(instr.get(), block.index); 392 393 /* if there was already an expression with the same value number */ 394 if (!res.second) { 395 Instruction* orig_instr = res.first->first; 396 assert(instr->definitions.size() == orig_instr->definitions.size()); 397 /* check if the original instruction dominates the current one */ 398 if (dominates(ctx, res.first->second, block.index) && 399 ctx.program->blocks[res.first->second].fp_mode.canReplace(block.fp_mode)) { 400 for (unsigned i = 0; i < instr->definitions.size(); i++) { 401 assert(instr->definitions[i].regClass() == orig_instr->definitions[i].regClass()); 402 assert(instr->definitions[i].isTemp()); 403 ctx.renames[instr->definitions[i].tempId()] = orig_instr->definitions[i].getTemp(); 404 if (instr->definitions[i].isPrecise()) 405 orig_instr->definitions[i].setPrecise(true); 406 /* SPIR_V spec says that an instruction marked with NUW wrapping 407 * around is undefined behaviour, so we can break additions in 408 * other contexts. 409 */ 410 if (instr->definitions[i].isNUW()) 411 orig_instr->definitions[i].setNUW(true); 412 } 413 } else { 414 ctx.expr_values.erase(res.first); 415 ctx.expr_values.emplace(instr.get(), block.index); 416 new_instructions.emplace_back(std::move(instr)); 417 } 418 } else { 419 new_instructions.emplace_back(std::move(instr)); 420 } 421 } 422 423 block.instructions = std::move(new_instructions); 424} 425 426void 427rename_phi_operands(Block& block, std::map<uint32_t, Temp>& renames) 428{ 429 for (aco_ptr<Instruction>& phi : block.instructions) { 430 if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) 431 break; 432 433 for (Operand& op : phi->operands) { 434 if (!op.isTemp()) 435 continue; 436 auto it = renames.find(op.tempId()); 437 if (it != renames.end()) 438 op.setTemp(it->second); 439 } 440 } 441} 442} /* end namespace */ 443 444void 445value_numbering(Program* program) 446{ 447 vn_ctx ctx(program); 448 std::vector<unsigned> loop_headers; 449 450 for (Block& block : program->blocks) { 451 assert(ctx.exec_id > 0); 452 /* decrement exec_id when leaving nested control flow */ 453 if (block.kind & block_kind_loop_header) 454 loop_headers.push_back(block.index); 455 if (block.kind & block_kind_merge) { 456 ctx.exec_id--; 457 } else if (block.kind & block_kind_loop_exit) { 458 ctx.exec_id -= program->blocks[loop_headers.back()].linear_preds.size(); 459 ctx.exec_id -= block.linear_preds.size(); 460 loop_headers.pop_back(); 461 } 462 463 if (block.logical_idom != -1) 464 process_block(ctx, block); 465 else 466 rename_phi_operands(block, ctx.renames); 467 468 /* increment exec_id when entering nested control flow */ 469 if (block.kind & block_kind_branch || block.kind & block_kind_loop_preheader || 470 block.kind & block_kind_break || block.kind & block_kind_continue || 471 block.kind & block_kind_discard) 472 ctx.exec_id++; 473 else if (block.kind & block_kind_continue_or_break) 474 ctx.exec_id += 2; 475 } 476 477 /* rename loop header phi operands */ 478 for (Block& block : program->blocks) { 479 if (block.kind & block_kind_loop_header) 480 rename_phi_operands(block, ctx.renames); 481 } 482} 483 484} // namespace aco 485