//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank.
/// A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for most
/// instructions). This is one unique SGPR, so the same SGPR may be used for
/// multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
/// complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation.
/// By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//

#include "AMDGPURegisterBankInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"

// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"

using namespace llvm;
using namespace MIPatternMatch;

namespace {

// Observer to apply a register bank to new registers created by LegalizerHelper.
//
// Every instruction reported through createdInstr() is recorded, and the bank
// assignment is deferred to the destructor, because a freshly created
// instruction has no operands yet at notification time.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
  const AMDGPURegisterBankInfo &RBI;
  MachineRegisterInfo &MRI;
  // Bank given to registers that have no class or bank yet.
  const RegisterBank *NewBank;
  // Instructions seen by createdInstr(), processed on destruction.
  SmallVector<MachineInstr *, 4> NewInsts;

public:
  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
                      MachineRegisterInfo &MRI_, const RegisterBank *RB)
    : RBI(RBI_), MRI(MRI_), NewBank(RB) {}

  // Apply the bank to every instruction recorded while observing.
  ~ApplyRegBankMapping() {
    for (MachineInstr *MI : NewInsts)
      applyBank(*MI);
  }

  /// Set any registers that don't have a set register class or bank to
  /// NewBank. Unassigned s1 operands get the VCC bank instead, and an
  /// extension whose source is in the VCC bank is rewritten as a select.
  void applyBank(MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
        Opc == AMDGPU::G_SEXT) {
      // LegalizerHelper wants to use the basic legalization artifacts when
      // widening etc. We don't handle selection with vcc in artifact sources,
      // so we need to use a select instead to handle these properly.
      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
      if (SrcBank == &AMDGPU::VCCRegBank) {
        const LLT S32 = LLT::scalar(32);
        assert(MRI.getType(SrcReg) == LLT::scalar(1));
        assert(MRI.getType(DstReg) == S32);
        assert(NewBank == &AMDGPU::VGPRRegBank);

        // Replace the extension with a select, which really uses the boolean
        // source.
        MachineIRBuilder B(MI);
        // Sign extension of a true bit is all ones; zero/any extension is 1.
        auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
        auto False = B.buildConstant(S32, 0);
        B.buildSelect(DstReg, SrcReg, True, False);
        MRI.setRegBank(True.getReg(0), *NewBank);
        MRI.setRegBank(False.getReg(0), *NewBank);
        MI.eraseFromParent();
      }

      // DstReg was captured before the erase above, so it is still usable.
      assert(!MRI.getRegClassOrRegBank(DstReg));
      MRI.setRegBank(DstReg, *NewBank);
      return;
    }

#ifndef NDEBUG
    if (Opc == AMDGPU::G_TRUNC) {
      Register DstReg = MI.getOperand(0).getReg();
      const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
      assert(DstBank != &AMDGPU::VCCRegBank);
    }
#endif

    for (MachineOperand &Op : MI.operands()) {
      if (!Op.isReg())
        continue;

      // We may see physical registers if building a real MI
      Register Reg = Op.getReg();
      if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
        continue;

      const RegisterBank *RB = NewBank;
      if (MRI.getType(Reg) == LLT::scalar(1)) {
        assert(NewBank == &AMDGPU::VGPRRegBank &&
               "s1 operands should only be used for vector bools");
        assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
                MI.getOpcode() != AMDGPU::G_ANYEXT) &&
               "not expecting legalization artifacts here");
        RB = &AMDGPU::VCCRegBank;
      }

      MRI.setRegBank(Reg, *RB);
    }
  }

  void erasingInstr(MachineInstr &MI) override {}

  void createdInstr(MachineInstr &MI) override {
    // At this point, the instruction was just inserted and has no operands.
    NewInsts.push_back(&MI);
  }

  void changingInstr(MachineInstr &MI) override {}
  void changedInstr(MachineInstr &MI) override {
    // FIXME: In principle we should probably add the instruction to NewInsts,
    // but the way the LegalizerHelper uses the observer, we will always see the
    // registers we need to set the regbank on also referenced in a new
    // instruction.
  }
};

} // end anonymous namespace

AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  // Debug-only sanity check that the bank IDs resolve to the expected bank
  // objects. It runs once; the closure is a function-local static, so it
  // captures `this` from the first constructed instance only.
  static auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}

/// Return true if \p Bank is the VGPR or the AGPR bank.
static bool isVectorRegisterBank(const RegisterBank &Bank) {
  unsigned BankID = Bank.getID();
  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}

/// Cost of copying a \p Size bit value from bank \p Src to bank \p Dst.
///
/// Returns the maximum unsigned value for copies into the SGPR bank from the
/// VGPR, AGPR or VCC banks, and for 1-bit copies into the SGPR bank from the
/// VGPR, AGPR, SGPR or VCC banks. AGPR to AGPR copies cost 4. Everything else
/// is deferred to RegisterBankInfo::copyCost.
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
      (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
    return std::numeric_limits<unsigned>::max();
  }

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced by
  // a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy (lowered
  // as a compare with 0) needs to be inserted.
  if (Size == 1 &&
      (Dst.getID() == AMDGPU::SGPRRegBankID) &&
      (isVectorRegisterBank(Src) ||
       Src.getID() == AMDGPU::SGPRRegBankID ||
       Src.getID() == AMDGPU::VCCRegBankID))
    return std::numeric_limits<unsigned>::max();

  // There is no direct copy between AGPRs.
  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
      Src.getID() == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}

/// Cost of splitting a value according to \p ValMapping. \p CurBank is not
/// used by this implementation.
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  // NOTE(review): this point is only reached when NumBreakDowns < 2, which
  // contradicts the NumBreakDowns == 2 condition asserted below, so the assert
  // would fire if it were ever reached. Presumably callers only pass mappings
  // with at least two breakdowns -- confirm, and consider whether the guard
  // above was meant to be stricter.
  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}

/// Map register class \p RC, used with type \p Ty, back to a register bank.
///
/// SReg_1 always maps to VCC. Other SGPR classes map to VCC for an s1 type
/// and to SGPR otherwise (including when \p Ty is invalid). Remaining classes
/// map to AGPR or VGPR depending on TRI->isAGPRClass.
const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  if (TRI->isSGPRClass(&RC)) {
    // FIXME: This probably came from a copy from a physical register, which
    // should be inferable from the copied to-type. We don't have many boolean
    // physical register constraints so just assume a normal SGPR for now.
    if (!Ty.isValid())
      return AMDGPU::SGPRRegBank;

    return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
  }

  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}

/// Build one alternative instruction mapping per row of \p Table.
///
/// \p RegSrcOpIdx lists the operand indices of \p MI that the bank columns of
/// each table row apply to, in column order. Explicit defs are first given a
/// VGPR mapping; a def that is also listed in \p RegSrcOpIdx is then
/// overwritten by the table entry. Mapping IDs are assigned sequentially
/// starting at 2, and each mapping uses the row's cost.
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {

  InstructionMappings AltMappings;

  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  // Sizes of the table-controlled operands; these are the same for every row.
  unsigned Sizes[NumOps];
  for (unsigned I = 0; I < NumOps; ++I) {
    Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
    Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
  }

  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
    unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
    Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
  }

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const auto &Entry : Table) {
    for (unsigned I = 0; I < NumOps; ++I) {
      int OpIdx = RegSrcOpIdx[I];
      Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
    }

    AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
                                                 getOperandsMapping(Operands),
                                                 Operands.size()));
  }

  return AltMappings;
}

/// Alternative mappings for G_INTRINSIC. readlane and writelane get table
/// driven mappings; anything else falls back to the generic implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_readlane: {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    // Operand 1 is not listed -- presumably the intrinsic ID; TODO confirm.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_writelane: {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // Result plus the three register sources. NOTE(review): this comment
    // previously read "rsrc, voffset, offset", which looks copied from a
    // buffer intrinsic and does not describe writelane.
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

/// Alternative mappings for G_INTRINSIC_W_SIDE_EFFECTS. Handles
/// s_buffer_load, ds_ordered_add/swap and s_sendmsg/s_sendmsghalt; anything
/// else falls back to the generic implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {

  switch (MI.getIntrinsicID()) {
  case Intrinsic::amdgcn_s_buffer_load: {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  case Intrinsic::amdgcn_s_sendmsg:
  case Intrinsic::amdgcn_s_sendmsghalt: {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }
  default:
    return RegisterBankInfo::getInstrAlternativeMappings(MI);
  }
}

/// Return true if the IR value behind \p MMO is an Instruction that carries
/// "amdgpu.noclobber" metadata. A missing or non-instruction value yields
/// false.
static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
  return I && I->getMetadata("amdgpu.noclobber");
}

// Return true if the load \p MI may be mapped to the scalar (SGPR) form. The
// instruction must have exactly one memory operand, and that operand must pass
// every check in the return expression below.
//
// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned AS = MMO->getAddrSpace();
  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
                       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  // Require 4-byte alignment.
  return MMO->getAlign() >= Align(4) &&
         // Can't do a scalar atomic load.
         !MMO->isAtomic() &&
         // Don't use scalar loads for volatile accesses to non-constant address
         // spaces.
         (IsConst || !MMO->isVolatile()) &&
         // Memory must be known constant, or not written before this load.
         (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
         AMDGPUInstrInfo::isUniformMMO(MMO);
}

/// Enumerate the alternative register bank mappings for \p MI.
///
/// Opcodes without a case here, and G_AND/G_OR/G_XOR of sizes other than 1 or
/// 64, fall through to RegisterBankInfo::getInstrAlternativeMappings.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      // A 1-bit constant may additionally live in the VCC bank.
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    // Only 64-bit operations get explicit alternatives below.
    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    // The scalar mapping is never offered for local, region or private
    // pointers, and otherwise only when isScalarLoadLegal accepts the load.
    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;

  }
  case TargetOpcode::G_SELECT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    // NOTE(review): this alternative reuses mapping ID 1, whereas every other
    // case in this function gives its second alternative ID 2 -- confirm this
    // is intentional.
    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}

/// Split \p Reg into two \p HalfTy (32-bit) pieces with a G_UNMERGE_VALUES
/// built through \p B. The two new registers are given the same bank as
/// \p Reg and appended to \p Regs in low, high order.
void AMDGPURegisterBankInfo::split64BitValueForMapping(
  MachineIRBuilder &B,
  SmallVector<Register, 2> &Regs,
  LLT HalfTy,
  Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);
  MachineRegisterInfo *MRI = B.getMRI();
  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
  MRI->setRegBank(LoLHS, *Bank);
  MRI->setRegBank(HiLHS, *Bank);

  Regs.push_back(LoLHS);
  Regs.push_back(HiLHS);

  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
    .addDef(LoLHS)
    .addDef(HiLHS)
    .addUse(Reg);
}

/// Replace the current type each register in \p Regs has with \p NewTy. The
/// old and new types must have the same size in bits (asserted).
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (Register Reg : Regs) {
    assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Reg, NewTy);
  }
}

/// Return a type half the size of \p Ty: a vector keeps its element type and
/// gets half the elements, a scalar gets half the bits. The element count or
/// bit width must be even (asserted).
static LLT getHalfSizedType(LLT Ty) {
  if (Ty.isVector()) {
    assert(Ty.getNumElements() % 2 == 0);
    return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
  }

  assert(Ty.getSizeInBits() % 2 == 0);
  return LLT::scalar(Ty.getSizeInBits() / 2);
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
/// execute the instruction for each unique combination of values in all lanes
/// in the wave. The block will be split such that rest of the instructions are
/// moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
///   Enable Lane, Disable all other lanes
///   SGPR = read SGPR value for current lane from VGPR
///   VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity to try for compare values to identify the
/// unique values used.
704 bool AMDGPURegisterBankInfo::executeInWaterfallLoop( 705 MachineIRBuilder &B, 706 iterator_range<MachineBasicBlock::iterator> Range, 707 SmallSet<Register, 4> &SGPROperandRegs, 708 MachineRegisterInfo &MRI) const { 709 SmallVector<Register, 4> ResultRegs; 710 SmallVector<Register, 4> InitResultRegs; 711 SmallVector<Register, 4> PhiRegs; 712 713 // Track use registers which have already been expanded with a readfirstlane 714 // sequence. This may have multiple uses if moving a sequence. 715 DenseMap<Register, Register> WaterfalledRegMap; 716 717 MachineBasicBlock &MBB = B.getMBB(); 718 MachineFunction *MF = &B.getMF(); 719 720 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); 721 const unsigned WaveAndOpc = Subtarget.isWave32() ? 722 AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; 723 const unsigned MovTermOpc = Subtarget.isWave32() ? 724 AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; 725 const unsigned XorTermOpc = Subtarget.isWave32() ? 726 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; 727 const unsigned AndSaveExecOpc = Subtarget.isWave32() ? 728 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64; 729 const unsigned ExecReg = Subtarget.isWave32() ? 
730 AMDGPU::EXEC_LO : AMDGPU::EXEC; 731 732 #ifndef NDEBUG 733 const int OrigRangeSize = std::distance(Range.begin(), Range.end()); 734 #endif 735 736 for (MachineInstr &MI : Range) { 737 for (MachineOperand &Def : MI.defs()) { 738 if (MRI.use_nodbg_empty(Def.getReg())) 739 continue; 740 741 LLT ResTy = MRI.getType(Def.getReg()); 742 const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); 743 ResultRegs.push_back(Def.getReg()); 744 Register InitReg = B.buildUndef(ResTy).getReg(0); 745 Register PhiReg = MRI.createGenericVirtualRegister(ResTy); 746 InitResultRegs.push_back(InitReg); 747 PhiRegs.push_back(PhiReg); 748 MRI.setRegBank(PhiReg, *DefBank); 749 MRI.setRegBank(InitReg, *DefBank); 750 } 751 } 752 753 Register SaveExecReg = MRI.createVirtualRegister(WaveRC); 754 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC); 755 756 // Don't bother using generic instructions/registers for the exec mask. 757 B.buildInstr(TargetOpcode::IMPLICIT_DEF) 758 .addDef(InitSaveExecReg); 759 760 Register PhiExec = MRI.createVirtualRegister(WaveRC); 761 Register NewExec = MRI.createVirtualRegister(WaveRC); 762 763 // To insert the loop we need to split the block. Move everything before this 764 // point to a new block, and insert a new empty block before this instruction. 765 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); 766 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); 767 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); 768 MachineFunction::iterator MBBI(MBB); 769 ++MBBI; 770 MF->insert(MBBI, LoopBB); 771 MF->insert(MBBI, RestoreExecBB); 772 MF->insert(MBBI, RemainderBB); 773 774 LoopBB->addSuccessor(RestoreExecBB); 775 LoopBB->addSuccessor(LoopBB); 776 777 // Move the rest of the block into a new block. 
778 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); 779 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); 780 781 MBB.addSuccessor(LoopBB); 782 RestoreExecBB->addSuccessor(RemainderBB); 783 784 B.setInsertPt(*LoopBB, LoopBB->end()); 785 786 B.buildInstr(TargetOpcode::PHI) 787 .addDef(PhiExec) 788 .addReg(InitSaveExecReg) 789 .addMBB(&MBB) 790 .addReg(NewExec) 791 .addMBB(LoopBB); 792 793 for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) { 794 B.buildInstr(TargetOpcode::G_PHI) 795 .addDef(std::get<2>(Result)) 796 .addReg(std::get<0>(Result)) // Initial value / implicit_def 797 .addMBB(&MBB) 798 .addReg(std::get<1>(Result)) // Mid-loop value. 799 .addMBB(LoopBB); 800 } 801 802 const DebugLoc &DL = B.getDL(); 803 804 MachineInstr &FirstInst = *Range.begin(); 805 806 // Move the instruction into the loop. Note we moved everything after 807 // Range.end() already into a new block, so Range.end() is no longer valid. 808 LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); 809 810 // Figure out the iterator range after splicing the instructions. 811 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator(); 812 auto NewEnd = LoopBB->end(); 813 814 MachineBasicBlock::iterator I = Range.begin(); 815 B.setInsertPt(*LoopBB, I); 816 817 Register CondReg; 818 819 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize); 820 821 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { 822 for (MachineOperand &Op : MI.uses()) { 823 if (!Op.isReg() || Op.isDef()) 824 continue; 825 826 Register OldReg = Op.getReg(); 827 if (!SGPROperandRegs.count(OldReg)) 828 continue; 829 830 // See if we already processed this register in another instruction in the 831 // sequence. 
832 auto OldVal = WaterfalledRegMap.find(OldReg); 833 if (OldVal != WaterfalledRegMap.end()) { 834 Op.setReg(OldVal->second); 835 continue; 836 } 837 838 Register OpReg = Op.getReg(); 839 LLT OpTy = MRI.getType(OpReg); 840 841 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI); 842 if (OpBank != &AMDGPU::VGPRRegBank) { 843 // Insert copy from AGPR to VGPR before the loop. 844 B.setMBB(MBB); 845 OpReg = B.buildCopy(OpTy, OpReg).getReg(0); 846 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank); 847 B.setInstr(*I); 848 } 849 850 unsigned OpSize = OpTy.getSizeInBits(); 851 852 // Can only do a readlane of 32-bit pieces. 853 if (OpSize == 32) { 854 // Avoid extra copies in the simple case of one 32-bit register. 855 Register CurrentLaneOpReg 856 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); 857 MRI.setType(CurrentLaneOpReg, OpTy); 858 859 constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI); 860 // Read the next variant <- also loop target. 861 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 862 CurrentLaneOpReg) 863 .addReg(OpReg); 864 865 Register NewCondReg = MRI.createVirtualRegister(WaveRC); 866 bool First = CondReg == AMDGPU::NoRegister; 867 if (First) 868 CondReg = NewCondReg; 869 870 // Compare the just read M0 value to all possible Idx values. 871 B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) 872 .addDef(NewCondReg) 873 .addReg(CurrentLaneOpReg) 874 .addReg(OpReg); 875 Op.setReg(CurrentLaneOpReg); 876 877 if (!First) { 878 Register AndReg = MRI.createVirtualRegister(WaveRC); 879 880 // If there are multiple operands to consider, and the conditions. 881 B.buildInstr(WaveAndOpc) 882 .addDef(AndReg) 883 .addReg(NewCondReg) 884 .addReg(CondReg); 885 CondReg = AndReg; 886 } 887 } else { 888 LLT S32 = LLT::scalar(32); 889 SmallVector<Register, 8> ReadlanePieces; 890 891 // The compares can be done as 64-bit, but the extract needs to be done 892 // in 32-bit pieces. 
893 894 bool Is64 = OpSize % 64 == 0; 895 896 LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); 897 unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 898 : AMDGPU::V_CMP_EQ_U32_e64; 899 900 // The compares can be done as 64-bit, but the extract needs to be done 901 // in 32-bit pieces. 902 903 // Insert the unmerge before the loop. 904 905 B.setMBB(MBB); 906 auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg); 907 B.setInstr(*I); 908 909 unsigned NumPieces = Unmerge->getNumOperands() - 1; 910 for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { 911 Register UnmergePiece = Unmerge.getReg(PieceIdx); 912 913 Register CurrentLaneOpReg; 914 if (Is64) { 915 Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); 916 Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); 917 918 MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); 919 MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); 920 MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); 921 922 // Read the next variant <- also loop target. 923 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 924 CurrentLaneOpRegLo) 925 .addReg(UnmergePiece, 0, AMDGPU::sub0); 926 927 // Read the next variant <- also loop target. 928 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 929 CurrentLaneOpRegHi) 930 .addReg(UnmergePiece, 0, AMDGPU::sub1); 931 932 CurrentLaneOpReg = 933 B.buildMerge(LLT::scalar(64), 934 {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) 935 .getReg(0); 936 937 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); 938 939 if (OpTy.getScalarSizeInBits() == 64) { 940 // If we need to produce a 64-bit element vector, so use the 941 // merged pieces 942 ReadlanePieces.push_back(CurrentLaneOpReg); 943 } else { 944 // 32-bit element type. 
945 ReadlanePieces.push_back(CurrentLaneOpRegLo); 946 ReadlanePieces.push_back(CurrentLaneOpRegHi); 947 } 948 } else { 949 CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); 950 MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); 951 MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); 952 953 // Read the next variant <- also loop target. 954 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), 955 CurrentLaneOpReg) 956 .addReg(UnmergePiece); 957 ReadlanePieces.push_back(CurrentLaneOpReg); 958 } 959 960 Register NewCondReg = MRI.createVirtualRegister(WaveRC); 961 bool First = CondReg == AMDGPU::NoRegister; 962 if (First) 963 CondReg = NewCondReg; 964 965 B.buildInstr(CmpOp) 966 .addDef(NewCondReg) 967 .addReg(CurrentLaneOpReg) 968 .addReg(UnmergePiece); 969 970 if (!First) { 971 Register AndReg = MRI.createVirtualRegister(WaveRC); 972 973 // If there are multiple operands to consider, and the conditions. 974 B.buildInstr(WaveAndOpc) 975 .addDef(AndReg) 976 .addReg(NewCondReg) 977 .addReg(CondReg); 978 CondReg = AndReg; 979 } 980 } 981 982 // FIXME: Build merge seems to switch to CONCAT_VECTORS but not 983 // BUILD_VECTOR 984 if (OpTy.isVector()) { 985 auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); 986 Op.setReg(Merge.getReg(0)); 987 } else { 988 auto Merge = B.buildMerge(OpTy, ReadlanePieces); 989 Op.setReg(Merge.getReg(0)); 990 } 991 992 MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank); 993 } 994 995 // Make sure we don't re-process this register again. 996 WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg())); 997 } 998 } 999 1000 B.setInsertPt(*LoopBB, LoopBB->end()); 1001 1002 // Update EXEC, save the original EXEC value to VCC. 1003 B.buildInstr(AndSaveExecOpc) 1004 .addDef(NewExec) 1005 .addReg(CondReg, RegState::Kill); 1006 1007 MRI.setSimpleHint(NewExec, CondReg); 1008 1009 // Update EXEC, switch all done bits to 0 and all todo bits to 1. 
B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
    .addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}

// Return any unique registers used by \p MI at \p OpIndices that need to be
// handled in a waterfall loop. Returns these registers in \p
// SGPROperandRegs. Returns true if there are any operands to handle and a
// waterfall loop is necessary.
//
// An operand needs handling when its register is not currently assigned to the
// SGPR bank. Registers are accumulated into \p SGPROperandRegs, so the result
// also reflects anything the caller placed in the set beforehand.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned Op : OpIndices) {
    // Only use operands are expected to be listed in OpIndices.
    assert(MI.getOperand(Op).isUse());
    Register Reg = MI.getOperand(Op).getReg();
    const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
    if (OpBank->getID() != AMDGPU::SGPRRegBankID)
      SGPROperandRegs.insert(Reg);
  }

  // If no operand needs to be replaced, there is no need to loop.
  return !SGPROperandRegs.empty();
}

// Execute \p MI in a waterfall loop for the operands at \p OpIndices, using the
// caller-provided builder \p B.
//
// Returns false without changing anything if every listed operand is already
// in the SGPR bank. Otherwise forwards to the range-based overload with the
// single-instruction range [MI, next(MI)) and returns its result.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  // Use a set to avoid extra readfirstlanes in the case where multiple operands
  // are the same register.
  SmallSet<Register, 4> SGPROperandRegs;

  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
    return false;

  MachineBasicBlock::iterator I = MI.getIterator();
  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
                                SGPROperandRegs, MRI);
}

// Convenience overload that constructs a builder positioned at \p MI and
// forwards to the overload taking an explicit MachineIRBuilder.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder B(MI);
  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}

// Legalize an operand that must be an SGPR by inserting a readfirstlane.
//
// Does nothing if operand \p OpIdx of \p MI is already in the SGPR bank.
// Otherwise a V_READFIRSTLANE_B32 of the value is built at \p MI and the
// operand is rewritten to use its result.
// NOTE(review): the source is constrained to VGPR_32 and the result is an
// SReg_32, so this presumably expects a 32-bit value -- confirm with callers.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
  MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register Reg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
  if (Bank == &AMDGPU::SGPRRegBank)
    return;

  LLT Ty = MRI.getType(Reg);
  MachineIRBuilder B(MI);

  if (Bank != &AMDGPU::VGPRRegBank) {
    // We need to copy from AGPR to VGPR
    Reg = B.buildCopy(Ty, Reg).getReg(0);
    MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
  }

  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
    .addDef(SGPR)
    .addReg(Reg);

  // The new register was created with a register class only; give it the
  // original value's low-level type as well.
  MRI.setType(SGPR, Ty);

  const TargetRegisterClass *Constrained =
      constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(SGPR);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the 1113 /// rest will be in the remainder. 1114 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) { 1115 unsigned TotalSize = Ty.getSizeInBits(); 1116 if (!Ty.isVector()) 1117 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; 1118 1119 LLT EltTy = Ty.getElementType(); 1120 unsigned EltSize = EltTy.getSizeInBits(); 1121 assert(FirstSize % EltSize == 0); 1122 1123 unsigned FirstPartNumElts = FirstSize / EltSize; 1124 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; 1125 1126 return {LLT::scalarOrVector(FirstPartNumElts, EltTy), 1127 LLT::scalarOrVector(RemainderElts, EltTy)}; 1128 } 1129 1130 static LLT widen96To128(LLT Ty) { 1131 if (!Ty.isVector()) 1132 return LLT::scalar(128); 1133 1134 LLT EltTy = Ty.getElementType(); 1135 assert(128 % EltTy.getSizeInBits() == 0); 1136 return LLT::vector(128 / EltTy.getSizeInBits(), EltTy); 1137 } 1138 1139 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, 1140 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1141 MachineRegisterInfo &MRI) const { 1142 Register DstReg = MI.getOperand(0).getReg(); 1143 const LLT LoadTy = MRI.getType(DstReg); 1144 unsigned LoadSize = LoadTy.getSizeInBits(); 1145 const unsigned MaxNonSmrdLoadSize = 128; 1146 1147 const RegisterBank *DstBank = 1148 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1149 if (DstBank == &AMDGPU::SGPRRegBank) { 1150 // There are some special cases that we need to look at for 32 bit and 96 1151 // bit SGPR loads otherwise we have nothing to do. 1152 if (LoadSize != 32 && LoadSize != 96) 1153 return false; 1154 1155 MachineMemOperand *MMO = *MI.memoperands_begin(); 1156 const unsigned MemSize = 8 * MMO->getSize(); 1157 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to 1158 // 32 bit. 
Check to see if we need to widen the memory access, 8 or 16 bit 1159 // scalar loads should have a load size of 32 but memory access size of less 1160 // than 32. 1161 if (LoadSize == 32 && 1162 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI))) 1163 return false; 1164 1165 Register PtrReg = MI.getOperand(1).getReg(); 1166 1167 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank); 1168 MachineIRBuilder B(MI, O); 1169 1170 if (LoadSize == 32) { 1171 // This is an extending load from a sub-dword size. Widen the memory 1172 // access size to 4 bytes and clear the extra high bits appropriately 1173 const LLT S32 = LLT::scalar(32); 1174 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) { 1175 // Must extend the sign bit into higher bits for a G_SEXTLOAD 1176 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1177 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize); 1178 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) { 1179 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD 1180 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0); 1181 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize); 1182 } else 1183 // We do not need to touch the higher bits for regular loads. 1184 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0); 1185 } else { 1186 // 96-bit loads are only available for vector loads. We need to split this 1187 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). 
1188 if (MMO->getAlign() < Align(16)) { 1189 LLT Part64, Part32; 1190 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); 1191 auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); 1192 auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); 1193 1194 auto Undef = B.buildUndef(LoadTy); 1195 auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); 1196 B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); 1197 } else { 1198 LLT WiderTy = widen96To128(LoadTy); 1199 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); 1200 B.buildExtract(MI.getOperand(0), WideLoad, 0); 1201 } 1202 } 1203 1204 MI.eraseFromParent(); 1205 return true; 1206 } 1207 1208 // 128-bit loads are supported for all instruction types. 1209 if (LoadSize <= MaxNonSmrdLoadSize) 1210 return false; 1211 1212 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0)); 1213 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1)); 1214 1215 if (SrcRegs.empty()) 1216 SrcRegs.push_back(MI.getOperand(1).getReg()); 1217 1218 assert(LoadSize % MaxNonSmrdLoadSize == 0); 1219 1220 // RegBankSelect only emits scalar types, so we need to reset the pointer 1221 // operand to a pointer type. 
1222 Register BasePtrReg = SrcRegs[0]; 1223 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg()); 1224 MRI.setType(BasePtrReg, PtrTy); 1225 1226 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; 1227 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts); 1228 ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank); 1229 MachineIRBuilder B(MI, Observer); 1230 LegalizerHelper Helper(B.getMF(), Observer, B); 1231 1232 if (LoadTy.isVector()) { 1233 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1234 return false; 1235 } else { 1236 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) 1237 return false; 1238 } 1239 1240 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 1241 return true; 1242 } 1243 1244 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc( 1245 MachineInstr &MI, 1246 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1247 MachineRegisterInfo &MRI) const { 1248 const MachineFunction &MF = *MI.getMF(); 1249 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1250 const auto &TFI = *ST.getFrameLowering(); 1251 1252 // Guard in case the stack growth direction ever changes with scratch 1253 // instructions. 1254 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown) 1255 return false; 1256 1257 Register Dst = MI.getOperand(0).getReg(); 1258 Register AllocSize = MI.getOperand(1).getReg(); 1259 Align Alignment = assumeAligned(MI.getOperand(2).getImm()); 1260 1261 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI); 1262 1263 // TODO: Need to emit a wave reduction to get the maximum size. 
1264 if (SizeBank != &AMDGPU::SGPRRegBank) 1265 return false; 1266 1267 LLT PtrTy = MRI.getType(Dst); 1268 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits()); 1269 1270 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1271 Register SPReg = Info->getStackPtrOffsetReg(); 1272 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); 1273 MachineIRBuilder B(MI, ApplyBank); 1274 1275 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2()); 1276 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize); 1277 1278 auto SPCopy = B.buildCopy(PtrTy, SPReg); 1279 if (Alignment > TFI.getStackAlign()) { 1280 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize); 1281 B.buildMaskLowPtrBits(Dst, PtrAdd, 1282 Log2(Alignment) + ST.getWavefrontSizeLog2()); 1283 } else { 1284 B.buildPtrAdd(Dst, SPCopy, ScaledSize); 1285 } 1286 1287 MI.eraseFromParent(); 1288 return true; 1289 } 1290 1291 bool AMDGPURegisterBankInfo::applyMappingImage( 1292 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, 1293 MachineRegisterInfo &MRI, int RsrcIdx) const { 1294 const int NumDefs = MI.getNumExplicitDefs(); 1295 1296 // The reported argument index is relative to the IR intrinsic call arguments, 1297 // so we need to shift by the number of defs and the intrinsic ID. 1298 RsrcIdx += NumDefs + 1; 1299 1300 // Insert copies to VGPR arguments. 1301 applyDefaultMapping(OpdMapper); 1302 1303 // Fixup any SGPR arguments. 1304 SmallVector<unsigned, 4> SGPRIndexes; 1305 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) { 1306 if (!MI.getOperand(I).isReg()) 1307 continue; 1308 1309 // If this intrinsic has a sampler, it immediately follows rsrc. 
1310 if (I == RsrcIdx || I == RsrcIdx + 1) 1311 SGPRIndexes.push_back(I); 1312 } 1313 1314 executeInWaterfallLoop(MI, MRI, SGPRIndexes); 1315 return true; 1316 } 1317 1318 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI, 1319 Register Reg) { 1320 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI); 1321 if (!Def) 1322 return Reg; 1323 1324 // TODO: Guard against this being an implicit def 1325 return Def->getOperand(0).getReg(); 1326 } 1327 1328 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store 1329 // the three offsets (voffset, soffset and instoffset) 1330 static unsigned setBufferOffsets(MachineIRBuilder &B, 1331 const AMDGPURegisterBankInfo &RBI, 1332 Register CombinedOffset, Register &VOffsetReg, 1333 Register &SOffsetReg, int64_t &InstOffsetVal, 1334 Align Alignment) { 1335 const LLT S32 = LLT::scalar(32); 1336 MachineRegisterInfo *MRI = B.getMRI(); 1337 1338 if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) { 1339 uint32_t SOffset, ImmOffset; 1340 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget, 1341 Alignment)) { 1342 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1343 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1344 InstOffsetVal = ImmOffset; 1345 1346 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1347 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1348 return SOffset + ImmOffset; 1349 } 1350 } 1351 1352 Register Base; 1353 unsigned Offset; 1354 1355 std::tie(Base, Offset) = 1356 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset); 1357 1358 uint32_t SOffset, ImmOffset; 1359 if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, 1360 &RBI.Subtarget, Alignment)) { 1361 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { 1362 VOffsetReg = Base; 1363 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0); 1364 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1365 InstOffsetVal = 
ImmOffset; 1366 return 0; // XXX - Why is this 0? 1367 } 1368 1369 // If we have SGPR base, we can use it for soffset. 1370 if (SOffset == 0) { 1371 VOffsetReg = B.buildConstant(S32, 0).getReg(0); 1372 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1373 SOffsetReg = Base; 1374 InstOffsetVal = ImmOffset; 1375 return 0; // XXX - Why is this 0? 1376 } 1377 } 1378 1379 // Handle the variable sgpr + vgpr case. 1380 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI); 1381 if (Add && (int)Offset >= 0) { 1382 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg()); 1383 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg()); 1384 1385 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI); 1386 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI); 1387 1388 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) { 1389 VOffsetReg = Src0; 1390 SOffsetReg = Src1; 1391 return 0; 1392 } 1393 1394 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) { 1395 VOffsetReg = Src1; 1396 SOffsetReg = Src0; 1397 return 0; 1398 } 1399 } 1400 1401 // Ensure we have a VGPR for the combined offset. This could be an issue if we 1402 // have an SGPR offset and a VGPR resource. 
1403 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) { 1404 VOffsetReg = CombinedOffset; 1405 } else { 1406 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0); 1407 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); 1408 } 1409 1410 SOffsetReg = B.buildConstant(S32, 0).getReg(0); 1411 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); 1412 return 0; 1413 } 1414 1415 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad( 1416 const OperandsMapper &OpdMapper) const { 1417 MachineInstr &MI = OpdMapper.getMI(); 1418 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1419 1420 const LLT S32 = LLT::scalar(32); 1421 Register Dst = MI.getOperand(0).getReg(); 1422 LLT Ty = MRI.getType(Dst); 1423 1424 const RegisterBank *RSrcBank = 1425 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1426 const RegisterBank *OffsetBank = 1427 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1428 if (RSrcBank == &AMDGPU::SGPRRegBank && 1429 OffsetBank == &AMDGPU::SGPRRegBank) 1430 return true; // Legal mapping 1431 1432 // FIXME: 96-bit case was widened during legalize. We neeed to narrow it back 1433 // here but don't have an MMO. 1434 1435 unsigned LoadSize = Ty.getSizeInBits(); 1436 int NumLoads = 1; 1437 if (LoadSize == 256 || LoadSize == 512) { 1438 NumLoads = LoadSize / 128; 1439 Ty = Ty.divide(NumLoads); 1440 } 1441 1442 // Use the alignment to ensure that the required offsets will fit into the 1443 // immediate offsets. 1444 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1); 1445 1446 MachineIRBuilder B(MI); 1447 MachineFunction &MF = B.getMF(); 1448 1449 Register SOffset; 1450 Register VOffset; 1451 int64_t ImmOffset = 0; 1452 1453 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(), 1454 VOffset, SOffset, ImmOffset, Alignment); 1455 1456 // TODO: 96-bit loads were widened to 128-bit results. 
Shrink the result if we 1457 // can, but we neeed to track an MMO for that. 1458 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8; 1459 const Align MemAlign(4); // FIXME: ABI type alignment? 1460 MachineMemOperand *BaseMMO = MF.getMachineMemOperand( 1461 MachinePointerInfo(), 1462 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1463 MachineMemOperand::MOInvariant, 1464 MemSize, MemAlign); 1465 if (MMOOffset != 0) 1466 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize); 1467 1468 // If only the offset is divergent, emit a MUBUF buffer load instead. We can 1469 // assume that the buffer is unswizzled. 1470 1471 Register RSrc = MI.getOperand(1).getReg(); 1472 Register VIndex = B.buildConstant(S32, 0).getReg(0); 1473 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); 1474 1475 SmallVector<Register, 4> LoadParts(NumLoads); 1476 1477 MachineBasicBlock::iterator MII = MI.getIterator(); 1478 MachineInstrSpan Span(MII, &B.getMBB()); 1479 1480 for (int i = 0; i < NumLoads; ++i) { 1481 if (NumLoads == 1) { 1482 LoadParts[i] = Dst; 1483 } else { 1484 LoadParts[i] = MRI.createGenericVirtualRegister(Ty); 1485 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank); 1486 } 1487 1488 MachineMemOperand *MMO = BaseMMO; 1489 if (i != 0) 1490 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize); 1491 1492 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD) 1493 .addDef(LoadParts[i]) // vdata 1494 .addUse(RSrc) // rsrc 1495 .addUse(VIndex) // vindex 1496 .addUse(VOffset) // voffset 1497 .addUse(SOffset) // soffset 1498 .addImm(ImmOffset + 16 * i) // offset(imm) 1499 .addImm(0) // cachepolicy, swizzled buffer(imm) 1500 .addImm(0) // idxen(imm) 1501 .addMemOperand(MMO); 1502 } 1503 1504 // TODO: If only the resource is a VGPR, it may be better to execute the 1505 // scalar load in the waterfall loop if the resource is expected to frequently 1506 // be dynamically uniform. 
1507 if (RSrcBank != &AMDGPU::SGPRRegBank) { 1508 // Remove the original instruction to avoid potentially confusing the 1509 // waterfall loop logic. 1510 B.setInstr(*Span.begin()); 1511 MI.eraseFromParent(); 1512 1513 SmallSet<Register, 4> OpsToWaterfall; 1514 1515 OpsToWaterfall.insert(RSrc); 1516 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 1517 OpsToWaterfall, MRI); 1518 } 1519 1520 if (NumLoads != 1) { 1521 if (Ty.isVector()) 1522 B.buildConcatVectors(Dst, LoadParts); 1523 else 1524 B.buildMerge(Dst, LoadParts); 1525 } 1526 1527 // We removed the instruction earlier with a waterfall loop. 1528 if (RSrcBank == &AMDGPU::SGPRRegBank) 1529 MI.eraseFromParent(); 1530 1531 return true; 1532 } 1533 1534 bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic( 1535 const OperandsMapper &OpdMapper, bool Signed) const { 1536 MachineInstr &MI = OpdMapper.getMI(); 1537 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 1538 1539 // Insert basic copies 1540 applyDefaultMapping(OpdMapper); 1541 1542 Register DstReg = MI.getOperand(0).getReg(); 1543 LLT Ty = MRI.getType(DstReg); 1544 1545 const LLT S32 = LLT::scalar(32); 1546 1547 const RegisterBank *DstBank = 1548 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1549 if (DstBank == &AMDGPU::VGPRRegBank) { 1550 if (Ty == S32) 1551 return true; 1552 1553 // TODO: 64-bit version is scalar only, so we need to expand this. 1554 return false; 1555 } 1556 1557 Register SrcReg = MI.getOperand(2).getReg(); 1558 Register OffsetReg = MI.getOperand(3).getReg(); 1559 Register WidthReg = MI.getOperand(4).getReg(); 1560 1561 // The scalar form packs the offset and width in a single operand. 1562 1563 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank); 1564 MachineIRBuilder B(MI, ApplyBank); 1565 1566 // Ensure the high bits are clear to insert the offset. 
1567 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6)); 1568 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask); 1569 1570 // Zeros out the low bits, so don't bother clamping the input value. 1571 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16)); 1572 1573 // Transformation function, pack the offset and width of a BFE into 1574 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second 1575 // source, bits [5:0] contain the offset and bits [22:16] the width. 1576 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth); 1577 1578 // TODO: It might be worth using a pseudo here to avoid scc clobber and 1579 // register class constraints. 1580 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) : 1581 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64); 1582 1583 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs}); 1584 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1585 llvm_unreachable("failed to constrain BFE"); 1586 1587 MI.eraseFromParent(); 1588 return true; 1589 } 1590 1591 // Return a suitable opcode for extending the operands of Opc when widening. 1592 static unsigned getExtendOp(unsigned Opc) { 1593 switch (Opc) { 1594 case TargetOpcode::G_ASHR: 1595 case TargetOpcode::G_SMIN: 1596 case TargetOpcode::G_SMAX: 1597 return TargetOpcode::G_SEXT; 1598 case TargetOpcode::G_LSHR: 1599 case TargetOpcode::G_UMIN: 1600 case TargetOpcode::G_UMAX: 1601 return TargetOpcode::G_ZEXT; 1602 default: 1603 return TargetOpcode::G_ANYEXT; 1604 } 1605 } 1606 1607 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding 1608 // any illegal vector extend or unmerge operations. 
1609 static std::pair<Register, Register> 1610 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) { 1611 const LLT S32 = LLT::scalar(32); 1612 auto Bitcast = B.buildBitcast(S32, Src); 1613 1614 if (ExtOpcode == TargetOpcode::G_SEXT) { 1615 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16); 1616 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16)); 1617 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1618 } 1619 1620 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16)); 1621 if (ExtOpcode == TargetOpcode::G_ZEXT) { 1622 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff)); 1623 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0)); 1624 } 1625 1626 assert(ExtOpcode == TargetOpcode::G_ANYEXT); 1627 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0)); 1628 } 1629 1630 // For cases where only a single copy is inserted for matching register banks. 1631 // Replace the register in the instruction operand 1632 static bool substituteSimpleCopyRegs( 1633 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) { 1634 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx)); 1635 if (!SrcReg.empty()) { 1636 assert(SrcReg.size() == 1); 1637 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]); 1638 return true; 1639 } 1640 1641 return false; 1642 } 1643 1644 /// Handle register layout difference for f16 images for some subtargets. 
1645 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B, 1646 MachineRegisterInfo &MRI, 1647 Register Reg) const { 1648 if (!Subtarget.hasUnpackedD16VMem()) 1649 return Reg; 1650 1651 const LLT S16 = LLT::scalar(16); 1652 LLT StoreVT = MRI.getType(Reg); 1653 if (!StoreVT.isVector() || StoreVT.getElementType() != S16) 1654 return Reg; 1655 1656 auto Unmerge = B.buildUnmerge(S16, Reg); 1657 1658 1659 SmallVector<Register, 4> WideRegs; 1660 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1661 WideRegs.push_back(Unmerge.getReg(I)); 1662 1663 const LLT S32 = LLT::scalar(32); 1664 int NumElts = StoreVT.getNumElements(); 1665 1666 return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0); 1667 } 1668 1669 static std::pair<Register, unsigned> 1670 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) { 1671 int64_t Const; 1672 if (mi_match(Reg, MRI, m_ICst(Const))) 1673 return std::make_pair(Register(), Const); 1674 1675 Register Base; 1676 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const)))) 1677 return std::make_pair(Base, Const); 1678 1679 // TODO: Handle G_OR used for add case 1680 return std::make_pair(Reg, 0); 1681 } 1682 1683 std::pair<Register, unsigned> 1684 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B, 1685 Register OrigOffset) const { 1686 const unsigned MaxImm = 4095; 1687 Register BaseReg; 1688 unsigned ImmOffset; 1689 const LLT S32 = LLT::scalar(32); 1690 1691 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(), 1692 OrigOffset); 1693 1694 unsigned C1 = 0; 1695 if (ImmOffset != 0) { 1696 // If the immediate value is too big for the immoffset field, put the value 1697 // and -4096 into the immoffset field so that the value that is copied/added 1698 // for the voffset field is a multiple of 4096, and it stands more chance 1699 // of being CSEd with the copy/add for another similar load/store. 
1700 // However, do not do that rounding down to a multiple of 4096 if that is a 1701 // negative number, as it appears to be illegal to have a negative offset 1702 // in the vgpr, even if adding the immediate offset makes it positive. 1703 unsigned Overflow = ImmOffset & ~MaxImm; 1704 ImmOffset -= Overflow; 1705 if ((int32_t)Overflow < 0) { 1706 Overflow += ImmOffset; 1707 ImmOffset = 0; 1708 } 1709 1710 C1 = ImmOffset; 1711 if (Overflow != 0) { 1712 if (!BaseReg) 1713 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 1714 else { 1715 auto OverflowVal = B.buildConstant(S32, Overflow); 1716 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 1717 } 1718 } 1719 } 1720 1721 if (!BaseReg) 1722 BaseReg = B.buildConstant(S32, 0).getReg(0); 1723 1724 return {BaseReg, C1}; 1725 } 1726 1727 static bool isZero(Register Reg, MachineRegisterInfo &MRI) { 1728 int64_t C; 1729 return mi_match(Reg, MRI, m_ICst(C)) && C == 0; 1730 } 1731 1732 static unsigned extractCPol(unsigned CachePolicy) { 1733 return CachePolicy & AMDGPU::CPol::ALL; 1734 } 1735 1736 static unsigned extractSWZ(unsigned CachePolicy) { 1737 return (CachePolicy >> 3) & 1; 1738 } 1739 1740 1741 MachineInstr * 1742 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B, 1743 MachineInstr &MI) const { 1744 MachineRegisterInfo &MRI = *B.getMRI(); 1745 executeInWaterfallLoop(B, MI, MRI, {2, 4}); 1746 1747 // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer. 1748 1749 Register VData = MI.getOperand(1).getReg(); 1750 LLT Ty = MRI.getType(VData); 1751 1752 int EltSize = Ty.getScalarSizeInBits(); 1753 int Size = Ty.getSizeInBits(); 1754 1755 // FIXME: Broken integer truncstore. 1756 if (EltSize != 32) 1757 report_fatal_error("unhandled intrinsic store"); 1758 1759 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
1760 const int MemSize = (*MI.memoperands_begin())->getSize(); 1761 1762 1763 Register RSrc = MI.getOperand(2).getReg(); 1764 Register VOffset = MI.getOperand(3).getReg(); 1765 Register SOffset = MI.getOperand(4).getReg(); 1766 unsigned CachePolicy = MI.getOperand(5).getImm(); 1767 1768 unsigned ImmOffset; 1769 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 1770 1771 const bool Offen = !isZero(VOffset, MRI); 1772 1773 unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact; 1774 switch (8 * MemSize) { 1775 case 8: 1776 Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact : 1777 AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact; 1778 break; 1779 case 16: 1780 Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact : 1781 AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact; 1782 break; 1783 default: 1784 Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact : 1785 AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact; 1786 if (Size > 32) 1787 Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32); 1788 break; 1789 } 1790 1791 1792 // Set the insertion point back to the instruction in case it was moved into a 1793 // loop. 1794 B.setInstr(MI); 1795 1796 MachineInstrBuilder MIB = B.buildInstr(Opc) 1797 .addUse(VData); 1798 1799 if (Offen) 1800 MIB.addUse(VOffset); 1801 1802 MIB.addUse(RSrc) 1803 .addUse(SOffset) 1804 .addImm(ImmOffset) 1805 .addImm(extractCPol(CachePolicy)) 1806 .addImm(0) // tfe: FIXME: Remove from inst 1807 .addImm(extractSWZ(CachePolicy)) 1808 .cloneMemRefs(MI); 1809 1810 // FIXME: We need a way to report failure from applyMappingImpl. 1811 // Insert constrain copies before inserting the loop. 
1812 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this)) 1813 report_fatal_error("failed to constrain selected store intrinsic"); 1814 1815 return MIB; 1816 } 1817 1818 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg, 1819 Register SrcReg) const { 1820 MachineRegisterInfo &MRI = *B.getMRI(); 1821 LLT SrcTy = MRI.getType(SrcReg); 1822 if (SrcTy.getSizeInBits() == 32) { 1823 // Use a v_mov_b32 here to make the exec dependency explicit. 1824 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1825 .addDef(DstReg) 1826 .addUse(SrcReg); 1827 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) && 1828 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI); 1829 } 1830 1831 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1832 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1833 1834 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1835 .addDef(TmpReg0) 1836 .addUse(SrcReg, 0, AMDGPU::sub0); 1837 B.buildInstr(AMDGPU::V_MOV_B32_e32) 1838 .addDef(TmpReg1) 1839 .addUse(SrcReg, 0, AMDGPU::sub1); 1840 B.buildInstr(AMDGPU::REG_SEQUENCE) 1841 .addDef(DstReg) 1842 .addUse(TmpReg0) 1843 .addImm(AMDGPU::sub0) 1844 .addUse(TmpReg1) 1845 .addImm(AMDGPU::sub1); 1846 1847 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) && 1848 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); 1849 } 1850 1851 /// Utility function for pushing dynamic vector indexes with a constant offset 1852 /// into waterwall loops. 
1853 static void reinsertVectorIndexAdd(MachineIRBuilder &B, 1854 MachineInstr &IdxUseInstr, 1855 unsigned OpIdx, 1856 unsigned ConstOffset) { 1857 MachineRegisterInfo &MRI = *B.getMRI(); 1858 const LLT S32 = LLT::scalar(32); 1859 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); 1860 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); 1861 1862 auto MaterializedOffset = B.buildConstant(S32, ConstOffset); 1863 1864 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); 1865 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); 1866 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); 1867 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); 1868 } 1869 1870 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the 1871 /// original 32-bit source value (to be inserted in the low part of the combined 1872 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit 1873 /// value. 1874 static void extendLow32IntoHigh32(MachineIRBuilder &B, 1875 Register Hi32Reg, Register Lo32Reg, 1876 unsigned ExtOpc, 1877 const RegisterBank &RegBank, 1878 bool IsBooleanSrc = false) { 1879 if (ExtOpc == AMDGPU::G_ZEXT) { 1880 B.buildConstant(Hi32Reg, 0); 1881 } else if (ExtOpc == AMDGPU::G_SEXT) { 1882 if (IsBooleanSrc) { 1883 // If we know the original source was an s1, the high half is the same as 1884 // the low. 1885 B.buildCopy(Hi32Reg, Lo32Reg); 1886 } else { 1887 // Replicate sign bit from 32-bit extended part. 
1888 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31); 1889 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); 1890 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt); 1891 } 1892 } else { 1893 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension"); 1894 B.buildUndef(Hi32Reg); 1895 } 1896 } 1897 1898 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect( 1899 MachineInstr &MI, MachineRegisterInfo &MRI, 1900 const OperandsMapper &OpdMapper) const { 1901 1902 Register VecReg = MI.getOperand(1).getReg(); 1903 Register Idx = MI.getOperand(2).getReg(); 1904 1905 const RegisterBank &IdxBank = 1906 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 1907 1908 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1909 1910 LLT VecTy = MRI.getType(VecReg); 1911 unsigned EltSize = VecTy.getScalarSizeInBits(); 1912 unsigned NumElem = VecTy.getNumElements(); 1913 1914 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1915 IsDivergentIdx)) 1916 return false; 1917 1918 MachineIRBuilder B(MI); 1919 LLT S32 = LLT::scalar(32); 1920 1921 const RegisterBank &DstBank = 1922 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 1923 const RegisterBank &SrcBank = 1924 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 1925 1926 const RegisterBank &CCBank = 1927 (DstBank == AMDGPU::SGPRRegBank && 1928 SrcBank == AMDGPU::SGPRRegBank && 1929 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 1930 : AMDGPU::VCCRegBank; 1931 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); 1932 1933 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 1934 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 1935 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 1936 } 1937 1938 LLT EltTy = VecTy.getScalarType(); 1939 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 1940 unsigned NumLanes = DstRegs.size(); 1941 if (!NumLanes) 1942 NumLanes = 1; 1943 else 1944 EltTy = MRI.getType(DstRegs[0]); 1945 1946 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 1947 SmallVector<Register, 2> Res(NumLanes); 1948 for (unsigned L = 0; L < NumLanes; ++L) 1949 Res[L] = UnmergeToEltTy.getReg(L); 1950 1951 for (unsigned I = 1; I < NumElem; ++I) { 1952 auto IC = B.buildConstant(S32, I); 1953 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 1954 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 1955 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 1956 1957 for (unsigned L = 0; L < NumLanes; ++L) { 1958 auto S = B.buildSelect(EltTy, Cmp, 1959 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]); 1960 1961 for (unsigned N : { 0, 2, 3 }) 1962 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 1963 1964 Res[L] = S->getOperand(0).getReg(); 1965 } 1966 } 1967 1968 for (unsigned L = 0; L < NumLanes; ++L) { 1969 Register DstReg = (NumLanes == 1) ? 
MI.getOperand(0).getReg() : DstRegs[L]; 1970 B.buildCopy(DstReg, Res[L]); 1971 MRI.setRegBank(DstReg, DstBank); 1972 } 1973 1974 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 1975 MI.eraseFromParent(); 1976 1977 return true; 1978 } 1979 1980 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect( 1981 MachineInstr &MI, MachineRegisterInfo &MRI, 1982 const OperandsMapper &OpdMapper) const { 1983 1984 Register VecReg = MI.getOperand(1).getReg(); 1985 Register Idx = MI.getOperand(3).getReg(); 1986 1987 const RegisterBank &IdxBank = 1988 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 1989 1990 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank; 1991 1992 LLT VecTy = MRI.getType(VecReg); 1993 unsigned EltSize = VecTy.getScalarSizeInBits(); 1994 unsigned NumElem = VecTy.getNumElements(); 1995 1996 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, 1997 IsDivergentIdx)) 1998 return false; 1999 2000 MachineIRBuilder B(MI); 2001 LLT S32 = LLT::scalar(32); 2002 2003 const RegisterBank &DstBank = 2004 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2005 const RegisterBank &SrcBank = 2006 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2007 const RegisterBank &InsBank = 2008 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2009 2010 const RegisterBank &CCBank = 2011 (DstBank == AMDGPU::SGPRRegBank && 2012 SrcBank == AMDGPU::SGPRRegBank && 2013 InsBank == AMDGPU::SGPRRegBank && 2014 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank 2015 : AMDGPU::VCCRegBank; 2016 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? 
S32 : LLT::scalar(1); 2017 2018 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) { 2019 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); 2020 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank); 2021 } 2022 2023 LLT EltTy = VecTy.getScalarType(); 2024 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2025 unsigned NumLanes = InsRegs.size(); 2026 if (!NumLanes) { 2027 NumLanes = 1; 2028 InsRegs.push_back(MI.getOperand(2).getReg()); 2029 } else { 2030 EltTy = MRI.getType(InsRegs[0]); 2031 } 2032 2033 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg); 2034 SmallVector<Register, 16> Ops(NumElem * NumLanes); 2035 2036 for (unsigned I = 0; I < NumElem; ++I) { 2037 auto IC = B.buildConstant(S32, I); 2038 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); 2039 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC); 2040 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); 2041 2042 for (unsigned L = 0; L < NumLanes; ++L) { 2043 auto S = B.buildSelect(EltTy, Cmp, InsRegs[L], 2044 UnmergeToEltTy.getReg(I * NumLanes + L)); 2045 2046 for (unsigned N : { 0, 2, 3 }) 2047 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); 2048 2049 Ops[I * NumLanes + L] = S->getOperand(0).getReg(); 2050 } 2051 } 2052 2053 LLT MergeTy = LLT::vector(Ops.size(), EltTy); 2054 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) { 2055 B.buildBuildVector(MI.getOperand(0), Ops); 2056 } else { 2057 auto Vec = B.buildBuildVector(MergeTy, Ops); 2058 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); 2059 B.buildBitcast(MI.getOperand(0).getReg(), Vec); 2060 } 2061 2062 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank); 2063 MI.eraseFromParent(); 2064 2065 return true; 2066 } 2067 2068 void AMDGPURegisterBankInfo::applyMappingImpl( 2069 const OperandsMapper &OpdMapper) const { 2070 MachineInstr &MI = OpdMapper.getMI(); 2071 unsigned Opc = MI.getOpcode(); 2072 MachineRegisterInfo &MRI = OpdMapper.getMRI(); 2073 switch (Opc) { 2074 case AMDGPU::G_PHI: { 2075 Register 
DstReg = MI.getOperand(0).getReg(); 2076 LLT DstTy = MRI.getType(DstReg); 2077 if (DstTy != LLT::scalar(1)) 2078 break; 2079 2080 const LLT S32 = LLT::scalar(32); 2081 const RegisterBank *DstBank = 2082 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2083 if (DstBank == &AMDGPU::VCCRegBank) { 2084 applyDefaultMapping(OpdMapper); 2085 // The standard handling only considers the result register bank for 2086 // phis. For VCC, blindly inserting a copy when the phi is lowered will 2087 // produce an invalid copy. We can only copy with some kind of compare to 2088 // get a vector boolean result. Insert a regitser bank copy that will be 2089 // correctly lowered to a compare. 2090 MachineIRBuilder B(*MI.getParent()->getParent()); 2091 2092 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 2093 Register SrcReg = MI.getOperand(I).getReg(); 2094 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); 2095 2096 if (SrcBank != &AMDGPU::VCCRegBank) { 2097 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB(); 2098 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); 2099 2100 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg); 2101 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank); 2102 MI.getOperand(I).setReg(Copy.getReg(0)); 2103 } 2104 } 2105 2106 return; 2107 } 2108 2109 // Phi handling is strange and only considers the bank of the destination. 
2110 substituteSimpleCopyRegs(OpdMapper, 0); 2111 2112 // Promote SGPR/VGPR booleans to s32 2113 MachineFunction *MF = MI.getParent()->getParent(); 2114 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2115 MachineIRBuilder B(MI, ApplyBank); 2116 LegalizerHelper Helper(*MF, ApplyBank, B); 2117 2118 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2119 llvm_unreachable("widen scalar should have succeeded"); 2120 2121 return; 2122 } 2123 case AMDGPU::G_ICMP: 2124 case AMDGPU::G_UADDO: 2125 case AMDGPU::G_USUBO: 2126 case AMDGPU::G_UADDE: 2127 case AMDGPU::G_SADDE: 2128 case AMDGPU::G_USUBE: 2129 case AMDGPU::G_SSUBE: { 2130 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1; 2131 Register DstReg = MI.getOperand(BoolDstOp).getReg(); 2132 2133 const RegisterBank *DstBank = 2134 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2135 if (DstBank != &AMDGPU::SGPRRegBank) 2136 break; 2137 2138 const bool HasCarryIn = MI.getNumOperands() == 5; 2139 2140 // If this is a scalar compare, promote the result to s32, as the selection 2141 // will end up using a copy to a 32-bit vreg. 2142 const LLT S32 = LLT::scalar(32); 2143 Register NewDstReg = MRI.createGenericVirtualRegister(S32); 2144 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank); 2145 MI.getOperand(BoolDstOp).setReg(NewDstReg); 2146 MachineIRBuilder B(MI); 2147 2148 if (HasCarryIn) { 2149 Register NewSrcReg = MRI.createGenericVirtualRegister(S32); 2150 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank); 2151 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg()); 2152 MI.getOperand(4).setReg(NewSrcReg); 2153 } 2154 2155 MachineBasicBlock *MBB = MI.getParent(); 2156 B.setInsertPt(*MBB, std::next(MI.getIterator())); 2157 2158 // If we had a constrained VCC result register, a copy was inserted to VCC 2159 // from SGPR. 
2160 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0)); 2161 if (DefRegs.empty()) 2162 DefRegs.push_back(DstReg); 2163 B.buildTrunc(DefRegs[0], NewDstReg); 2164 return; 2165 } 2166 case AMDGPU::G_SELECT: { 2167 Register DstReg = MI.getOperand(0).getReg(); 2168 LLT DstTy = MRI.getType(DstReg); 2169 2170 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1)); 2171 if (CondRegs.empty()) 2172 CondRegs.push_back(MI.getOperand(1).getReg()); 2173 else { 2174 assert(CondRegs.size() == 1); 2175 } 2176 2177 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI); 2178 if (CondBank == &AMDGPU::SGPRRegBank) { 2179 MachineIRBuilder B(MI); 2180 const LLT S32 = LLT::scalar(32); 2181 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2182 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2183 2184 MI.getOperand(1).setReg(NewCondReg); 2185 B.buildZExt(NewCondReg, CondRegs[0]); 2186 } 2187 2188 if (DstTy.getSizeInBits() != 64) 2189 break; 2190 2191 MachineIRBuilder B(MI); 2192 LLT HalfTy = getHalfSizedType(DstTy); 2193 2194 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2195 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2196 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3)); 2197 2198 // All inputs are SGPRs, nothing special to do. 
2199 if (DefRegs.empty()) { 2200 assert(Src1Regs.empty() && Src2Regs.empty()); 2201 break; 2202 } 2203 2204 if (Src1Regs.empty()) 2205 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2206 else { 2207 setRegsToType(MRI, Src1Regs, HalfTy); 2208 } 2209 2210 if (Src2Regs.empty()) 2211 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg()); 2212 else 2213 setRegsToType(MRI, Src2Regs, HalfTy); 2214 2215 setRegsToType(MRI, DefRegs, HalfTy); 2216 2217 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]); 2218 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]); 2219 2220 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2221 MI.eraseFromParent(); 2222 return; 2223 } 2224 case AMDGPU::G_BRCOND: { 2225 Register CondReg = MI.getOperand(0).getReg(); 2226 // FIXME: Should use legalizer helper, but should change bool ext type. 2227 const RegisterBank *CondBank = 2228 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2229 2230 if (CondBank == &AMDGPU::SGPRRegBank) { 2231 MachineIRBuilder B(MI); 2232 const LLT S32 = LLT::scalar(32); 2233 Register NewCondReg = MRI.createGenericVirtualRegister(S32); 2234 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank); 2235 2236 MI.getOperand(0).setReg(NewCondReg); 2237 B.buildZExt(NewCondReg, CondReg); 2238 return; 2239 } 2240 2241 break; 2242 } 2243 case AMDGPU::G_AND: 2244 case AMDGPU::G_OR: 2245 case AMDGPU::G_XOR: { 2246 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if 2247 // there is a VGPR input. 
2248 Register DstReg = MI.getOperand(0).getReg(); 2249 LLT DstTy = MRI.getType(DstReg); 2250 2251 if (DstTy.getSizeInBits() == 1) { 2252 const RegisterBank *DstBank = 2253 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2254 if (DstBank == &AMDGPU::VCCRegBank) 2255 break; 2256 2257 MachineFunction *MF = MI.getParent()->getParent(); 2258 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); 2259 MachineIRBuilder B(MI, ApplyBank); 2260 LegalizerHelper Helper(*MF, ApplyBank, B); 2261 2262 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) != 2263 LegalizerHelper::Legalized) 2264 llvm_unreachable("widen scalar should have succeeded"); 2265 return; 2266 } 2267 2268 if (DstTy.getSizeInBits() != 64) 2269 break; 2270 2271 LLT HalfTy = getHalfSizedType(DstTy); 2272 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2273 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1)); 2274 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2)); 2275 2276 // All inputs are SGPRs, nothing special to do. 2277 if (DefRegs.empty()) { 2278 assert(Src0Regs.empty() && Src1Regs.empty()); 2279 break; 2280 } 2281 2282 assert(DefRegs.size() == 2); 2283 assert(Src0Regs.size() == Src1Regs.size() && 2284 (Src0Regs.empty() || Src0Regs.size() == 2)); 2285 2286 // Depending on where the source registers came from, the generic code may 2287 // have decided to split the inputs already or not. If not, we still need to 2288 // extract the values. 
2289 MachineIRBuilder B(MI); 2290 2291 if (Src0Regs.empty()) 2292 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg()); 2293 else 2294 setRegsToType(MRI, Src0Regs, HalfTy); 2295 2296 if (Src1Regs.empty()) 2297 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg()); 2298 else 2299 setRegsToType(MRI, Src1Regs, HalfTy); 2300 2301 setRegsToType(MRI, DefRegs, HalfTy); 2302 2303 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]}); 2304 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]}); 2305 2306 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2307 MI.eraseFromParent(); 2308 return; 2309 } 2310 case AMDGPU::G_ADD: 2311 case AMDGPU::G_SUB: 2312 case AMDGPU::G_MUL: 2313 case AMDGPU::G_SHL: 2314 case AMDGPU::G_LSHR: 2315 case AMDGPU::G_ASHR: 2316 case AMDGPU::G_SMIN: 2317 case AMDGPU::G_SMAX: 2318 case AMDGPU::G_UMIN: 2319 case AMDGPU::G_UMAX: { 2320 Register DstReg = MI.getOperand(0).getReg(); 2321 LLT DstTy = MRI.getType(DstReg); 2322 2323 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. 2324 // Packed 16-bit operations need to be scalarized and promoted. 
2325 if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16)) 2326 break; 2327 2328 const RegisterBank *DstBank = 2329 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2330 if (DstBank == &AMDGPU::VGPRRegBank) 2331 break; 2332 2333 const LLT S32 = LLT::scalar(32); 2334 MachineBasicBlock *MBB = MI.getParent(); 2335 MachineFunction *MF = MBB->getParent(); 2336 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank); 2337 MachineIRBuilder B(MI, ApplySALU); 2338 2339 if (DstTy.isVector()) { 2340 Register WideSrc0Lo, WideSrc0Hi; 2341 Register WideSrc1Lo, WideSrc1Hi; 2342 2343 unsigned ExtendOp = getExtendOp(MI.getOpcode()); 2344 std::tie(WideSrc0Lo, WideSrc0Hi) 2345 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp); 2346 std::tie(WideSrc1Lo, WideSrc1Hi) 2347 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp); 2348 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo}); 2349 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi}); 2350 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)}); 2351 MI.eraseFromParent(); 2352 } else { 2353 LegalizerHelper Helper(*MF, ApplySALU, B); 2354 2355 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized) 2356 llvm_unreachable("widen scalar should have succeeded"); 2357 2358 // FIXME: s16 shift amounts should be legal. 
2359 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR || 2360 Opc == AMDGPU::G_ASHR) { 2361 B.setInsertPt(*MBB, MI.getIterator()); 2362 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2363 llvm_unreachable("widen scalar should have succeeded"); 2364 } 2365 } 2366 2367 return; 2368 } 2369 case AMDGPU::G_SEXT_INREG: { 2370 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1)); 2371 if (SrcRegs.empty()) 2372 break; // Nothing to repair 2373 2374 const LLT S32 = LLT::scalar(32); 2375 MachineIRBuilder B(MI); 2376 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); 2377 GISelObserverWrapper Observer(&O); 2378 B.setChangeObserver(Observer); 2379 2380 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs 2381 // we would need to further expand, and doesn't let us directly set the 2382 // result registers. 2383 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2384 2385 int Amt = MI.getOperand(2).getImm(); 2386 if (Amt <= 32) { 2387 if (Amt == 32) { 2388 // The low bits are unchanged. 2389 B.buildCopy(DstRegs[0], SrcRegs[0]); 2390 } else { 2391 // Extend in the low bits and propagate the sign bit to the high half. 2392 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt); 2393 } 2394 2395 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31)); 2396 } else { 2397 // The low bits are unchanged, and extend in the high bits. 
2398 B.buildCopy(DstRegs[0], SrcRegs[0]); 2399 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); 2400 } 2401 2402 Register DstReg = MI.getOperand(0).getReg(); 2403 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); 2404 MI.eraseFromParent(); 2405 return; 2406 } 2407 case AMDGPU::G_CTPOP: 2408 case AMDGPU::G_BITREVERSE: 2409 case AMDGPU::G_CTLZ_ZERO_UNDEF: 2410 case AMDGPU::G_CTTZ_ZERO_UNDEF: { 2411 const RegisterBank *DstBank = 2412 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2413 if (DstBank == &AMDGPU::SGPRRegBank) 2414 break; 2415 2416 Register SrcReg = MI.getOperand(1).getReg(); 2417 const LLT S32 = LLT::scalar(32); 2418 LLT Ty = MRI.getType(SrcReg); 2419 if (Ty == S32) 2420 break; 2421 2422 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank); 2423 MachineIRBuilder B(MI, ApplyVALU); 2424 2425 MachineFunction &MF = B.getMF(); 2426 LegalizerHelper Helper(MF, ApplyVALU, B); 2427 2428 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized) 2429 llvm_unreachable("narrowScalar should have succeeded"); 2430 return; 2431 } 2432 case AMDGPU::G_SEXT: 2433 case AMDGPU::G_ZEXT: 2434 case AMDGPU::G_ANYEXT: { 2435 Register SrcReg = MI.getOperand(1).getReg(); 2436 LLT SrcTy = MRI.getType(SrcReg); 2437 const bool Signed = Opc == AMDGPU::G_SEXT; 2438 2439 assert(empty(OpdMapper.getVRegs(1))); 2440 2441 MachineIRBuilder B(MI); 2442 const RegisterBank *SrcBank = 2443 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2444 2445 Register DstReg = MI.getOperand(0).getReg(); 2446 LLT DstTy = MRI.getType(DstReg); 2447 if (DstTy.isScalar() && 2448 SrcBank != &AMDGPU::SGPRRegBank && 2449 SrcBank != &AMDGPU::VCCRegBank && 2450 // FIXME: Should handle any type that round to s64 when irregular 2451 // breakdowns supported. 2452 DstTy.getSizeInBits() == 64 && 2453 SrcTy.getSizeInBits() <= 32) { 2454 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2455 2456 // Extend to 32-bit, and then extend the low half. 
2457 if (Signed) { 2458 // TODO: Should really be buildSExtOrCopy 2459 B.buildSExtOrTrunc(DefRegs[0], SrcReg); 2460 } else if (Opc == AMDGPU::G_ZEXT) { 2461 B.buildZExtOrTrunc(DefRegs[0], SrcReg); 2462 } else { 2463 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg); 2464 } 2465 2466 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank); 2467 MRI.setRegBank(DstReg, *SrcBank); 2468 MI.eraseFromParent(); 2469 return; 2470 } 2471 2472 if (SrcTy != LLT::scalar(1)) 2473 return; 2474 2475 // It is not legal to have a legalization artifact with a VCC source. Rather 2476 // than introducing a copy, insert the select we would have to select the 2477 // copy to. 2478 if (SrcBank == &AMDGPU::VCCRegBank) { 2479 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0)); 2480 2481 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank; 2482 2483 unsigned DstSize = DstTy.getSizeInBits(); 2484 // 64-bit select is SGPR only 2485 const bool UseSel64 = DstSize > 32 && 2486 SrcBank->getID() == AMDGPU::SGPRRegBankID; 2487 2488 // TODO: Should s16 select be legal? 2489 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32); 2490 auto True = B.buildConstant(SelType, Signed ? 
-1 : 1); 2491 auto False = B.buildConstant(SelType, 0); 2492 2493 MRI.setRegBank(True.getReg(0), *DstBank); 2494 MRI.setRegBank(False.getReg(0), *DstBank); 2495 MRI.setRegBank(DstReg, *DstBank); 2496 2497 if (DstSize > 32) { 2498 B.buildSelect(DefRegs[0], SrcReg, True, False); 2499 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true); 2500 } else if (DstSize < 32) { 2501 auto Sel = B.buildSelect(SelType, SrcReg, True, False); 2502 MRI.setRegBank(Sel.getReg(0), *DstBank); 2503 B.buildTrunc(DstReg, Sel); 2504 } else { 2505 B.buildSelect(DstReg, SrcReg, True, False); 2506 } 2507 2508 MI.eraseFromParent(); 2509 return; 2510 } 2511 2512 break; 2513 } 2514 case AMDGPU::G_BUILD_VECTOR: 2515 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 2516 Register DstReg = MI.getOperand(0).getReg(); 2517 LLT DstTy = MRI.getType(DstReg); 2518 if (DstTy != LLT::vector(2, 16)) 2519 break; 2520 2521 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty()); 2522 substituteSimpleCopyRegs(OpdMapper, 1); 2523 substituteSimpleCopyRegs(OpdMapper, 2); 2524 2525 const RegisterBank *DstBank = 2526 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2527 if (DstBank == &AMDGPU::SGPRRegBank) 2528 break; // Can use S_PACK_* instructions. 
2529 2530 MachineIRBuilder B(MI); 2531 2532 Register Lo = MI.getOperand(1).getReg(); 2533 Register Hi = MI.getOperand(2).getReg(); 2534 const LLT S32 = LLT::scalar(32); 2535 2536 const RegisterBank *BankLo = 2537 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2538 const RegisterBank *BankHi = 2539 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2540 2541 Register ZextLo; 2542 Register ShiftHi; 2543 2544 if (Opc == AMDGPU::G_BUILD_VECTOR) { 2545 ZextLo = B.buildZExt(S32, Lo).getReg(0); 2546 MRI.setRegBank(ZextLo, *BankLo); 2547 2548 Register ZextHi = B.buildZExt(S32, Hi).getReg(0); 2549 MRI.setRegBank(ZextHi, *BankHi); 2550 2551 auto ShiftAmt = B.buildConstant(S32, 16); 2552 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2553 2554 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0); 2555 MRI.setRegBank(ShiftHi, *BankHi); 2556 } else { 2557 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0); 2558 MRI.setRegBank(MaskLo, *BankLo); 2559 2560 auto ShiftAmt = B.buildConstant(S32, 16); 2561 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi); 2562 2563 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0); 2564 MRI.setRegBank(ShiftHi, *BankHi); 2565 2566 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0); 2567 MRI.setRegBank(ZextLo, *BankLo); 2568 } 2569 2570 auto Or = B.buildOr(S32, ZextLo, ShiftHi); 2571 MRI.setRegBank(Or.getReg(0), *DstBank); 2572 2573 B.buildBitcast(DstReg, Or); 2574 MI.eraseFromParent(); 2575 return; 2576 } 2577 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 2578 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0)); 2579 2580 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty()); 2581 2582 Register DstReg = MI.getOperand(0).getReg(); 2583 Register SrcReg = MI.getOperand(1).getReg(); 2584 2585 const LLT S32 = LLT::scalar(32); 2586 LLT DstTy = MRI.getType(DstReg); 2587 LLT SrcTy = MRI.getType(SrcReg); 2588 2589 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper)) 2590 return; 2591 2592 MachineIRBuilder B(MI); 
2593 2594 const ValueMapping &DstMapping 2595 = OpdMapper.getInstrMapping().getOperandMapping(0); 2596 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank; 2597 const RegisterBank *SrcBank = 2598 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2599 const RegisterBank *IdxBank = 2600 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2601 2602 Register BaseIdxReg; 2603 unsigned ConstOffset; 2604 std::tie(BaseIdxReg, ConstOffset) = 2605 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg()); 2606 2607 // See if the index is an add of a constant which will be foldable by moving 2608 // the base register of the index later if this is going to be executed in a 2609 // waterfall loop. This is essentially to reassociate the add of a constant 2610 // with the readfirstlane. 2611 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2612 ConstOffset > 0 && 2613 ConstOffset < SrcTy.getNumElements(); 2614 2615 // Move the base register. We'll re-insert the add later. 2616 if (ShouldMoveIndexIntoLoop) 2617 MI.getOperand(2).setReg(BaseIdxReg); 2618 2619 // If this is a VGPR result only because the index was a VGPR result, the 2620 // actual indexing will be done on the SGPR source vector, which will 2621 // produce a scalar result. We need to copy to the VGPR result inside the 2622 // waterfall loop. 2623 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank && 2624 SrcBank == &AMDGPU::SGPRRegBank; 2625 if (DstRegs.empty()) { 2626 applyDefaultMapping(OpdMapper); 2627 2628 executeInWaterfallLoop(MI, MRI, { 2 }); 2629 2630 if (NeedCopyToVGPR) { 2631 // We don't want a phi for this temporary reg. 2632 Register TmpReg = MRI.createGenericVirtualRegister(DstTy); 2633 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank); 2634 MI.getOperand(0).setReg(TmpReg); 2635 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 2636 2637 // Use a v_mov_b32 here to make the exec dependency explicit. 
2638 buildVCopy(B, DstReg, TmpReg); 2639 } 2640 2641 // Re-insert the constant offset add inside the waterfall loop. 2642 if (ShouldMoveIndexIntoLoop) 2643 reinsertVectorIndexAdd(B, MI, 2, ConstOffset); 2644 2645 return; 2646 } 2647 2648 assert(DstTy.getSizeInBits() == 64); 2649 2650 LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); 2651 2652 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2653 auto One = B.buildConstant(S32, 1); 2654 2655 MachineBasicBlock::iterator MII = MI.getIterator(); 2656 2657 // Split the vector index into 32-bit pieces. Prepare to move all of the 2658 // new instructions into a waterfall loop if necessary. 2659 // 2660 // Don't put the bitcast or constant in the loop. 2661 MachineInstrSpan Span(MII, &B.getMBB()); 2662 2663 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 2664 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2665 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2666 2667 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); 2668 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); 2669 2670 MRI.setRegBank(DstReg, *DstBank); 2671 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2672 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2673 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2674 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2675 2676 SmallSet<Register, 4> OpsToWaterfall; 2677 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { 2678 MI.eraseFromParent(); 2679 return; 2680 } 2681 2682 // Remove the original instruction to avoid potentially confusing the 2683 // waterfall loop logic. 
2684 B.setInstr(*Span.begin()); 2685 MI.eraseFromParent(); 2686 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2687 OpsToWaterfall, MRI); 2688 2689 if (NeedCopyToVGPR) { 2690 MachineBasicBlock *LoopBB = Extract1->getParent(); 2691 Register TmpReg0 = MRI.createGenericVirtualRegister(S32); 2692 Register TmpReg1 = MRI.createGenericVirtualRegister(S32); 2693 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank); 2694 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank); 2695 2696 Extract0->getOperand(0).setReg(TmpReg0); 2697 Extract1->getOperand(0).setReg(TmpReg1); 2698 2699 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); 2700 2701 buildVCopy(B, DstRegs[0], TmpReg0); 2702 buildVCopy(B, DstRegs[1], TmpReg1); 2703 } 2704 2705 if (ShouldMoveIndexIntoLoop) 2706 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2707 2708 return; 2709 } 2710 case AMDGPU::G_INSERT_VECTOR_ELT: { 2711 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2)); 2712 2713 Register DstReg = MI.getOperand(0).getReg(); 2714 LLT VecTy = MRI.getType(DstReg); 2715 2716 assert(OpdMapper.getVRegs(0).empty()); 2717 assert(OpdMapper.getVRegs(3).empty()); 2718 2719 if (substituteSimpleCopyRegs(OpdMapper, 1)) 2720 MRI.setType(MI.getOperand(1).getReg(), VecTy); 2721 2722 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper)) 2723 return; 2724 2725 const RegisterBank *IdxBank = 2726 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; 2727 2728 Register SrcReg = MI.getOperand(1).getReg(); 2729 Register InsReg = MI.getOperand(2).getReg(); 2730 LLT InsTy = MRI.getType(InsReg); 2731 (void)InsTy; 2732 2733 Register BaseIdxReg; 2734 unsigned ConstOffset; 2735 std::tie(BaseIdxReg, ConstOffset) = 2736 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); 2737 2738 // See if the index is an add of a constant which will be foldable by moving 2739 // the base register of the index later if this is going to be executed in a 2740 // waterfall loop. 
This is essentially to reassociate the add of a constant 2741 // with the readfirstlane. 2742 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && 2743 ConstOffset > 0 && 2744 ConstOffset < VecTy.getNumElements(); 2745 2746 // Move the base register. We'll re-insert the add later. 2747 if (ShouldMoveIndexIntoLoop) 2748 MI.getOperand(3).setReg(BaseIdxReg); 2749 2750 2751 if (InsRegs.empty()) { 2752 executeInWaterfallLoop(MI, MRI, { 3 }); 2753 2754 // Re-insert the constant offset add inside the waterfall loop. 2755 if (ShouldMoveIndexIntoLoop) { 2756 MachineIRBuilder B(MI); 2757 reinsertVectorIndexAdd(B, MI, 3, ConstOffset); 2758 } 2759 2760 return; 2761 } 2762 2763 2764 assert(InsTy.getSizeInBits() == 64); 2765 2766 const LLT S32 = LLT::scalar(32); 2767 LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32); 2768 2769 MachineIRBuilder B(MI); 2770 auto CastSrc = B.buildBitcast(Vec32, SrcReg); 2771 auto One = B.buildConstant(S32, 1); 2772 2773 // Split the vector index into 32-bit pieces. Prepare to move all of the 2774 // new instructions into a waterfall loop if necessary. 2775 // 2776 // Don't put the bitcast or constant in the loop. 2777 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); 2778 2779 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
2780 auto IdxLo = B.buildShl(S32, BaseIdxReg, One); 2781 auto IdxHi = B.buildAdd(S32, IdxLo, One); 2782 2783 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); 2784 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); 2785 2786 const RegisterBank *DstBank = 2787 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank; 2788 const RegisterBank *SrcBank = 2789 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank; 2790 const RegisterBank *InsSrcBank = 2791 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank; 2792 2793 MRI.setRegBank(InsReg, *InsSrcBank); 2794 MRI.setRegBank(CastSrc.getReg(0), *SrcBank); 2795 MRI.setRegBank(InsLo.getReg(0), *DstBank); 2796 MRI.setRegBank(InsHi.getReg(0), *DstBank); 2797 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); 2798 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); 2799 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); 2800 2801 2802 SmallSet<Register, 4> OpsToWaterfall; 2803 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { 2804 B.setInsertPt(B.getMBB(), MI); 2805 B.buildBitcast(DstReg, InsHi); 2806 MI.eraseFromParent(); 2807 return; 2808 } 2809 2810 B.setInstr(*Span.begin()); 2811 MI.eraseFromParent(); 2812 2813 // Figure out the point after the waterfall loop before mangling the control 2814 // flow. 2815 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), 2816 OpsToWaterfall, MRI); 2817 2818 // The insertion point is now right after the original instruction. 2819 // 2820 // Keep the bitcast to the original vector type out of the loop. Doing this 2821 // saved an extra phi we don't need inside the loop. 2822 B.buildBitcast(DstReg, InsHi); 2823 2824 // Re-insert the constant offset add inside the waterfall loop. 
2825 if (ShouldMoveIndexIntoLoop) 2826 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); 2827 2828 return; 2829 } 2830 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 2831 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 2832 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 2833 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 2834 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 2835 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 2836 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 2837 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 2838 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 2839 case AMDGPU::G_AMDGPU_BUFFER_STORE: 2840 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 2841 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 2842 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 2843 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: 2844 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 2845 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: { 2846 applyDefaultMapping(OpdMapper); 2847 executeInWaterfallLoop(MI, MRI, {1, 4}); 2848 return; 2849 } 2850 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 2851 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 2852 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 2853 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 2854 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 2855 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 2856 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 2857 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 2858 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 2859 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 2860 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 2861 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: { 2862 applyDefaultMapping(OpdMapper); 2863 executeInWaterfallLoop(MI, MRI, {2, 5}); 2864 return; 2865 } 2866 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 2867 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 2868 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 2869 applyDefaultMapping(OpdMapper); 2870 executeInWaterfallLoop(MI, MRI, {2, 5}); 2871 return; 2872 } 2873 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 2874 applyDefaultMapping(OpdMapper); 2875 
executeInWaterfallLoop(MI, MRI, {3, 6}); 2876 return; 2877 } 2878 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 2879 applyMappingSBufferLoad(OpdMapper); 2880 return; 2881 } 2882 case AMDGPU::G_INTRINSIC: { 2883 switch (MI.getIntrinsicID()) { 2884 case Intrinsic::amdgcn_readlane: { 2885 substituteSimpleCopyRegs(OpdMapper, 2); 2886 2887 assert(OpdMapper.getVRegs(0).empty()); 2888 assert(OpdMapper.getVRegs(3).empty()); 2889 2890 // Make sure the index is an SGPR. It doesn't make sense to run this in a 2891 // waterfall loop, so assume it's a uniform value. 2892 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2893 return; 2894 } 2895 case Intrinsic::amdgcn_writelane: { 2896 assert(OpdMapper.getVRegs(0).empty()); 2897 assert(OpdMapper.getVRegs(2).empty()); 2898 assert(OpdMapper.getVRegs(3).empty()); 2899 2900 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val 2901 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value 2902 constrainOpWithReadfirstlane(MI, MRI, 3); // Index 2903 return; 2904 } 2905 case Intrinsic::amdgcn_interp_p1: 2906 case Intrinsic::amdgcn_interp_p2: 2907 case Intrinsic::amdgcn_interp_mov: 2908 case Intrinsic::amdgcn_interp_p1_f16: 2909 case Intrinsic::amdgcn_interp_p2_f16: { 2910 applyDefaultMapping(OpdMapper); 2911 2912 // Readlane for m0 value, which is always the last operand. 2913 // FIXME: Should this be a waterfall loop instead? 2914 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index 2915 return; 2916 } 2917 case Intrinsic::amdgcn_permlane16: 2918 case Intrinsic::amdgcn_permlanex16: { 2919 // Doing a waterfall loop over these wouldn't make any sense. 
2920 substituteSimpleCopyRegs(OpdMapper, 2); 2921 substituteSimpleCopyRegs(OpdMapper, 3); 2922 constrainOpWithReadfirstlane(MI, MRI, 4); 2923 constrainOpWithReadfirstlane(MI, MRI, 5); 2924 return; 2925 } 2926 case Intrinsic::amdgcn_sbfe: 2927 applyMappingBFEIntrinsic(OpdMapper, true); 2928 return; 2929 case Intrinsic::amdgcn_ubfe: 2930 applyMappingBFEIntrinsic(OpdMapper, false); 2931 return; 2932 case Intrinsic::amdgcn_ballot: 2933 // Use default handling and insert copy to vcc source. 2934 break; 2935 } 2936 break; 2937 } 2938 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 2939 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 2940 const AMDGPU::RsrcIntrinsic *RSrcIntrin 2941 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID()); 2942 assert(RSrcIntrin && RSrcIntrin->IsImage); 2943 // Non-images can have complications from operands that allow both SGPR 2944 // and VGPR. For now it's too complicated to figure out the final opcode 2945 // to derive the register bank from the MCInstrDesc. 2946 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); 2947 return; 2948 } 2949 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 2950 unsigned N = MI.getNumExplicitOperands() - 2; 2951 executeInWaterfallLoop(MI, MRI, { N }); 2952 return; 2953 } 2954 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 2955 auto IntrID = MI.getIntrinsicID(); 2956 switch (IntrID) { 2957 case Intrinsic::amdgcn_ds_ordered_add: 2958 case Intrinsic::amdgcn_ds_ordered_swap: { 2959 // This is only allowed to execute with 1 lane, so readfirstlane is safe. 2960 assert(OpdMapper.getVRegs(0).empty()); 2961 substituteSimpleCopyRegs(OpdMapper, 3); 2962 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 2963 return; 2964 } 2965 case Intrinsic::amdgcn_ds_gws_init: 2966 case Intrinsic::amdgcn_ds_gws_barrier: 2967 case Intrinsic::amdgcn_ds_gws_sema_br: { 2968 // Only the first lane is executes, so readfirstlane is safe. 
2969 substituteSimpleCopyRegs(OpdMapper, 1); 2970 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 2971 return; 2972 } 2973 case Intrinsic::amdgcn_ds_gws_sema_v: 2974 case Intrinsic::amdgcn_ds_gws_sema_p: 2975 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 2976 // Only the first lane is executes, so readfirstlane is safe. 2977 constrainOpWithReadfirstlane(MI, MRI, 1); // M0 2978 return; 2979 } 2980 case Intrinsic::amdgcn_ds_append: 2981 case Intrinsic::amdgcn_ds_consume: { 2982 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 2983 return; 2984 } 2985 case Intrinsic::amdgcn_s_sendmsg: 2986 case Intrinsic::amdgcn_s_sendmsghalt: { 2987 // FIXME: Should this use a waterfall loop? 2988 constrainOpWithReadfirstlane(MI, MRI, 2); // M0 2989 return; 2990 } 2991 case Intrinsic::amdgcn_s_setreg: { 2992 constrainOpWithReadfirstlane(MI, MRI, 2); 2993 return; 2994 } 2995 default: { 2996 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin = 2997 AMDGPU::lookupRsrcIntrinsic(IntrID)) { 2998 // Non-images can have complications from operands that allow both SGPR 2999 // and VGPR. For now it's too complicated to figure out the final opcode 3000 // to derive the register bank from the MCInstrDesc. 
3001 if (RSrcIntrin->IsImage) { 3002 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg); 3003 return; 3004 } 3005 } 3006 3007 break; 3008 } 3009 } 3010 break; 3011 } 3012 case AMDGPU::G_LOAD: 3013 case AMDGPU::G_ZEXTLOAD: 3014 case AMDGPU::G_SEXTLOAD: { 3015 if (applyMappingLoad(MI, OpdMapper, MRI)) 3016 return; 3017 break; 3018 } 3019 case AMDGPU::G_DYN_STACKALLOC: 3020 applyMappingDynStackAlloc(MI, OpdMapper, MRI); 3021 return; 3022 default: 3023 break; 3024 } 3025 3026 return applyDefaultMapping(OpdMapper); 3027 } 3028 3029 // vgpr, sgpr -> vgpr 3030 // vgpr, agpr -> vgpr 3031 // agpr, agpr -> agpr 3032 // agpr, sgpr -> vgpr 3033 static unsigned regBankUnion(unsigned RB0, unsigned RB1) { 3034 if (RB0 == AMDGPU::InvalidRegBankID) 3035 return RB1; 3036 if (RB1 == AMDGPU::InvalidRegBankID) 3037 return RB0; 3038 3039 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) 3040 return AMDGPU::SGPRRegBankID; 3041 3042 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID) 3043 return AMDGPU::AGPRRegBankID; 3044 3045 return AMDGPU::VGPRRegBankID; 3046 } 3047 3048 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) { 3049 if (RB0 == AMDGPU::InvalidRegBankID) 3050 return RB1; 3051 if (RB1 == AMDGPU::InvalidRegBankID) 3052 return RB0; 3053 3054 // vcc, vcc -> vcc 3055 // vcc, sgpr -> vcc 3056 // vcc, vgpr -> vcc 3057 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID) 3058 return AMDGPU::VCCRegBankID; 3059 3060 // vcc, vgpr -> vgpr 3061 return regBankUnion(RB0, RB1); 3062 } 3063 3064 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI, 3065 const MachineInstr &MI) const { 3066 unsigned RegBank = AMDGPU::InvalidRegBankID; 3067 3068 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3069 if (!MI.getOperand(i).isReg()) 3070 continue; 3071 Register Reg = MI.getOperand(i).getReg(); 3072 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3073 RegBank = regBankUnion(RegBank, 
Bank->getID()); 3074 if (RegBank == AMDGPU::VGPRRegBankID) 3075 break; 3076 } 3077 } 3078 3079 return RegBank; 3080 } 3081 3082 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { 3083 const MachineFunction &MF = *MI.getParent()->getParent(); 3084 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3085 for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) { 3086 if (!MI.getOperand(i).isReg()) 3087 continue; 3088 Register Reg = MI.getOperand(i).getReg(); 3089 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { 3090 if (Bank->getID() != AMDGPU::SGPRRegBankID) 3091 return false; 3092 } 3093 } 3094 return true; 3095 } 3096 3097 const RegisterBankInfo::InstructionMapping & 3098 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const { 3099 const MachineFunction &MF = *MI.getParent()->getParent(); 3100 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3101 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3102 3103 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3104 const MachineOperand &SrcOp = MI.getOperand(i); 3105 if (!SrcOp.isReg()) 3106 continue; 3107 3108 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI); 3109 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3110 } 3111 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3112 MI.getNumOperands()); 3113 } 3114 3115 const RegisterBankInfo::InstructionMapping & 3116 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const { 3117 const MachineFunction &MF = *MI.getParent()->getParent(); 3118 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3119 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3120 3121 // Even though we technically could use SGPRs, this would require knowledge of 3122 // the constant bus restriction. Force all sources to VGPR (except for VCC). 3123 // 3124 // TODO: Unary ops are trivially OK, so accept SGPRs? 
3125 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3126 const MachineOperand &Src = MI.getOperand(i); 3127 if (!Src.isReg()) 3128 continue; 3129 3130 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI); 3131 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID; 3132 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size); 3133 } 3134 3135 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3136 MI.getNumOperands()); 3137 } 3138 3139 const RegisterBankInfo::InstructionMapping & 3140 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const { 3141 const MachineFunction &MF = *MI.getParent()->getParent(); 3142 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3143 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3144 3145 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { 3146 const MachineOperand &Op = MI.getOperand(I); 3147 if (!Op.isReg()) 3148 continue; 3149 3150 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI); 3151 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3152 } 3153 3154 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), 3155 MI.getNumOperands()); 3156 } 3157 3158 const RegisterBankInfo::InstructionMapping & 3159 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI, 3160 const MachineInstr &MI, 3161 int RsrcIdx) const { 3162 // The reported argument index is relative to the IR intrinsic call arguments, 3163 // so we need to shift by the number of defs and the intrinsic ID. 3164 RsrcIdx += MI.getNumExplicitDefs() + 1; 3165 3166 const int NumOps = MI.getNumOperands(); 3167 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps); 3168 3169 // TODO: Should packed/unpacked D16 difference be reported here as part of 3170 // the value mapping? 
3171 for (int I = 0; I != NumOps; ++I) { 3172 if (!MI.getOperand(I).isReg()) 3173 continue; 3174 3175 Register OpReg = MI.getOperand(I).getReg(); 3176 // We replace some dead address operands with $noreg 3177 if (!OpReg) 3178 continue; 3179 3180 unsigned Size = getSizeInBits(OpReg, MRI, *TRI); 3181 3182 // FIXME: Probably need a new intrinsic register bank searchable table to 3183 // handle arbitrary intrinsics easily. 3184 // 3185 // If this has a sampler, it immediately follows rsrc. 3186 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1; 3187 3188 if (MustBeSGPR) { 3189 // If this must be an SGPR, so we must report whatever it is as legal. 3190 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID); 3191 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size); 3192 } else { 3193 // Some operands must be VGPR, and these are easy to copy to. 3194 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3195 } 3196 } 3197 3198 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps); 3199 } 3200 3201 /// Return the mapping for a pointer arugment. 3202 const RegisterBankInfo::ValueMapping * 3203 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI, 3204 Register PtrReg) const { 3205 LLT PtrTy = MRI.getType(PtrReg); 3206 unsigned Size = PtrTy.getSizeInBits(); 3207 if (Subtarget.useFlatForGlobal() || 3208 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace())) 3209 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3210 3211 // If we're using MUBUF instructions for global memory, an SGPR base register 3212 // is possible. Otherwise this needs to be a VGPR. 
3213 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3214 return AMDGPU::getValueMapping(PtrBank->getID(), Size); 3215 } 3216 3217 const RegisterBankInfo::InstructionMapping & 3218 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const { 3219 3220 const MachineFunction &MF = *MI.getParent()->getParent(); 3221 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3222 SmallVector<const ValueMapping*, 2> OpdsMapping(2); 3223 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3224 Register PtrReg = MI.getOperand(1).getReg(); 3225 LLT PtrTy = MRI.getType(PtrReg); 3226 unsigned AS = PtrTy.getAddressSpace(); 3227 unsigned PtrSize = PtrTy.getSizeInBits(); 3228 3229 const ValueMapping *ValMapping; 3230 const ValueMapping *PtrMapping; 3231 3232 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI); 3233 3234 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) { 3235 if (isScalarLoadLegal(MI)) { 3236 // We have a uniform instruction so we want to use an SMRD load 3237 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3238 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize); 3239 } else { 3240 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3241 3242 // If we're using MUBUF instructions for global memory, an SGPR base 3243 // register is possible. Otherwise this needs to be a VGPR. 3244 unsigned PtrBankID = Subtarget.useFlatForGlobal() ? 
3245 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID; 3246 3247 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize); 3248 } 3249 } else { 3250 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3251 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize); 3252 } 3253 3254 OpdsMapping[0] = ValMapping; 3255 OpdsMapping[1] = PtrMapping; 3256 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping( 3257 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands()); 3258 return Mapping; 3259 3260 // FIXME: Do we want to add a mapping for FLAT load, or should we just 3261 // handle that during instruction selection? 3262 } 3263 3264 unsigned 3265 AMDGPURegisterBankInfo::getRegBankID(Register Reg, 3266 const MachineRegisterInfo &MRI, 3267 unsigned Default) const { 3268 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3269 return Bank ? Bank->getID() : Default; 3270 } 3271 3272 const RegisterBankInfo::ValueMapping * 3273 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg, 3274 const MachineRegisterInfo &MRI, 3275 const TargetRegisterInfo &TRI) const { 3276 // Lie and claim anything is legal, even though this needs to be an SGPR 3277 // applyMapping will have to deal with it as a waterfall loop. 
3278 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID); 3279 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3280 return AMDGPU::getValueMapping(Bank, Size); 3281 } 3282 3283 const RegisterBankInfo::ValueMapping * 3284 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg, 3285 const MachineRegisterInfo &MRI, 3286 const TargetRegisterInfo &TRI) const { 3287 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3288 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3289 } 3290 3291 const RegisterBankInfo::ValueMapping * 3292 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg, 3293 const MachineRegisterInfo &MRI, 3294 const TargetRegisterInfo &TRI) const { 3295 unsigned Size = getSizeInBits(Reg, MRI, TRI); 3296 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size); 3297 } 3298 3299 /// 3300 /// This function must return a legal mapping, because 3301 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called 3302 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a 3303 /// VGPR to SGPR generated is illegal. 3304 /// 3305 // Operands that must be SGPRs must accept potentially divergent VGPRs as 3306 // legal. These will be dealt with in applyMappingImpl. 3307 // 3308 const RegisterBankInfo::InstructionMapping & 3309 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { 3310 const MachineFunction &MF = *MI.getParent()->getParent(); 3311 const MachineRegisterInfo &MRI = MF.getRegInfo(); 3312 3313 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) { 3314 // The default logic bothers to analyze impossible alternative mappings. We 3315 // want the most straightforward mapping, so just directly handle this. 
3316 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI, 3317 *TRI); 3318 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI, 3319 *TRI); 3320 assert(SrcBank && "src bank should have been assigned already"); 3321 if (!DstBank) 3322 DstBank = SrcBank; 3323 3324 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3325 if (cannotCopy(*DstBank, *SrcBank, Size)) 3326 return getInvalidInstructionMapping(); 3327 3328 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank); 3329 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2; 3330 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize); 3331 OpdsMapping[0] = &ValMap; 3332 if (MI.getOpcode() == AMDGPU::G_FREEZE) 3333 OpdsMapping[1] = &ValMap; 3334 3335 return getInstructionMapping( 3336 1, /*Cost*/ 1, 3337 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize); 3338 } 3339 3340 if (MI.isRegSequence()) { 3341 // If any input is a VGPR, the result must be a VGPR. The default handling 3342 // assumes any copy between banks is legal. 3343 unsigned BankID = AMDGPU::SGPRRegBankID; 3344 3345 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3346 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI); 3347 // It doesn't make sense to use vcc or scc banks here, so just ignore 3348 // them. 3349 if (OpBank != AMDGPU::SGPRRegBankID) { 3350 BankID = AMDGPU::VGPRRegBankID; 3351 break; 3352 } 3353 } 3354 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3355 3356 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID)); 3357 return getInstructionMapping( 3358 1, /*Cost*/ 1, 3359 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3360 } 3361 3362 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies 3363 // properly. 3364 // 3365 // TODO: There are additional exec masking dependencies to analyze. 
3366 if (MI.getOpcode() == TargetOpcode::G_PHI) { 3367 unsigned ResultBank = AMDGPU::InvalidRegBankID; 3368 Register DstReg = MI.getOperand(0).getReg(); 3369 3370 // Sometimes the result may have already been assigned a bank. 3371 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI)) 3372 ResultBank = DstBank->getID(); 3373 3374 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { 3375 Register Reg = MI.getOperand(I).getReg(); 3376 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI); 3377 3378 // FIXME: Assuming VGPR for any undetermined inputs. 3379 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { 3380 ResultBank = AMDGPU::VGPRRegBankID; 3381 break; 3382 } 3383 3384 // FIXME: Need to promote SGPR case to s32 3385 unsigned OpBank = Bank->getID(); 3386 ResultBank = regBankBoolUnion(ResultBank, OpBank); 3387 } 3388 3389 assert(ResultBank != AMDGPU::InvalidRegBankID); 3390 3391 unsigned Size = MRI.getType(DstReg).getSizeInBits(); 3392 3393 const ValueMapping &ValMap = 3394 getValueMapping(0, Size, getRegBank(ResultBank)); 3395 return getInstructionMapping( 3396 1, /*Cost*/ 1, 3397 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1); 3398 } 3399 3400 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI); 3401 if (Mapping.isValid()) 3402 return Mapping; 3403 3404 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands()); 3405 3406 switch (MI.getOpcode()) { 3407 default: 3408 return getInvalidInstructionMapping(); 3409 3410 case AMDGPU::G_AND: 3411 case AMDGPU::G_OR: 3412 case AMDGPU::G_XOR: { 3413 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3414 if (Size == 1) { 3415 const RegisterBank *DstBank 3416 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI); 3417 3418 unsigned TargetBankID = AMDGPU::InvalidRegBankID; 3419 unsigned BankLHS = AMDGPU::InvalidRegBankID; 3420 unsigned BankRHS = AMDGPU::InvalidRegBankID; 3421 if (DstBank) { 3422 TargetBankID = DstBank->getID(); 3423 if 
(DstBank == &AMDGPU::VCCRegBank) { 3424 TargetBankID = AMDGPU::VCCRegBankID; 3425 BankLHS = AMDGPU::VCCRegBankID; 3426 BankRHS = AMDGPU::VCCRegBankID; 3427 } else { 3428 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3429 AMDGPU::SGPRRegBankID); 3430 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3431 AMDGPU::SGPRRegBankID); 3432 } 3433 } else { 3434 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, 3435 AMDGPU::VCCRegBankID); 3436 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, 3437 AMDGPU::VCCRegBankID); 3438 3439 // Both inputs should be true booleans to produce a boolean result. 3440 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) { 3441 TargetBankID = AMDGPU::VGPRRegBankID; 3442 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) { 3443 TargetBankID = AMDGPU::VCCRegBankID; 3444 BankLHS = AMDGPU::VCCRegBankID; 3445 BankRHS = AMDGPU::VCCRegBankID; 3446 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) { 3447 TargetBankID = AMDGPU::SGPRRegBankID; 3448 } 3449 } 3450 3451 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size); 3452 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size); 3453 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size); 3454 break; 3455 } 3456 3457 if (Size == 64) { 3458 3459 if (isSALUMapping(MI)) { 3460 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size); 3461 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0]; 3462 } else { 3463 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size); 3464 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/); 3465 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size); 3466 3467 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/); 3468 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size); 3469 } 3470 3471 break; 3472 } 3473 3474 LLVM_FALLTHROUGH; 3475 } 3476 case AMDGPU::G_PTR_ADD: 3477 case 
AMDGPU::G_PTRMASK: 3478 case AMDGPU::G_ADD: 3479 case AMDGPU::G_SUB: 3480 case AMDGPU::G_MUL: 3481 case AMDGPU::G_SHL: 3482 case AMDGPU::G_LSHR: 3483 case AMDGPU::G_ASHR: 3484 case AMDGPU::G_UADDO: 3485 case AMDGPU::G_USUBO: 3486 case AMDGPU::G_UADDE: 3487 case AMDGPU::G_SADDE: 3488 case AMDGPU::G_USUBE: 3489 case AMDGPU::G_SSUBE: 3490 case AMDGPU::G_SMIN: 3491 case AMDGPU::G_SMAX: 3492 case AMDGPU::G_UMIN: 3493 case AMDGPU::G_UMAX: 3494 case AMDGPU::G_SHUFFLE_VECTOR: 3495 if (isSALUMapping(MI)) 3496 return getDefaultMappingSOP(MI); 3497 LLVM_FALLTHROUGH; 3498 3499 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU 3500 case AMDGPU::G_SSUBSAT: 3501 case AMDGPU::G_UADDSAT: 3502 case AMDGPU::G_USUBSAT: 3503 case AMDGPU::G_FADD: 3504 case AMDGPU::G_FSUB: 3505 case AMDGPU::G_FPTOSI: 3506 case AMDGPU::G_FPTOUI: 3507 case AMDGPU::G_FMUL: 3508 case AMDGPU::G_FMA: 3509 case AMDGPU::G_FMAD: 3510 case AMDGPU::G_FSQRT: 3511 case AMDGPU::G_FFLOOR: 3512 case AMDGPU::G_FCEIL: 3513 case AMDGPU::G_FRINT: 3514 case AMDGPU::G_SITOFP: 3515 case AMDGPU::G_UITOFP: 3516 case AMDGPU::G_FPTRUNC: 3517 case AMDGPU::G_FPEXT: 3518 case AMDGPU::G_FEXP2: 3519 case AMDGPU::G_FLOG2: 3520 case AMDGPU::G_FMINNUM: 3521 case AMDGPU::G_FMAXNUM: 3522 case AMDGPU::G_FMINNUM_IEEE: 3523 case AMDGPU::G_FMAXNUM_IEEE: 3524 case AMDGPU::G_FCANONICALIZE: 3525 case AMDGPU::G_INTRINSIC_TRUNC: 3526 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar? 
3527 case AMDGPU::G_FSHR: // TODO: Expand for scalar 3528 case AMDGPU::G_AMDGPU_FFBH_U32: 3529 case AMDGPU::G_AMDGPU_FMIN_LEGACY: 3530 case AMDGPU::G_AMDGPU_FMAX_LEGACY: 3531 case AMDGPU::G_AMDGPU_RCP_IFLAG: 3532 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0: 3533 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1: 3534 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: 3535 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: 3536 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: 3537 case AMDGPU::G_AMDGPU_SMED3: 3538 return getDefaultMappingVOP(MI); 3539 case AMDGPU::G_UMULH: 3540 case AMDGPU::G_SMULH: { 3541 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) 3542 return getDefaultMappingSOP(MI); 3543 return getDefaultMappingVOP(MI); 3544 } 3545 case AMDGPU::G_IMPLICIT_DEF: { 3546 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3547 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3548 break; 3549 } 3550 case AMDGPU::G_FCONSTANT: 3551 case AMDGPU::G_CONSTANT: 3552 case AMDGPU::G_GLOBAL_VALUE: 3553 case AMDGPU::G_BLOCK_ADDR: 3554 case AMDGPU::G_READCYCLECOUNTER: { 3555 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3556 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3557 break; 3558 } 3559 case AMDGPU::G_FRAME_INDEX: { 3560 // TODO: This should be the same as other constants, but eliminateFrameIndex 3561 // currently assumes VALU uses. 3562 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3563 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3564 break; 3565 } 3566 case AMDGPU::G_DYN_STACKALLOC: { 3567 // Result is always uniform, and a wave reduction is needed for the source. 
3568 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); 3569 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3570 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32); 3571 break; 3572 } 3573 case AMDGPU::G_INSERT: { 3574 unsigned BankID = getMappingType(MRI, MI); 3575 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3576 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3577 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI); 3578 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3579 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3580 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize); 3581 OpdsMapping[3] = nullptr; 3582 break; 3583 } 3584 case AMDGPU::G_EXTRACT: { 3585 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3586 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 3587 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 3588 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize); 3589 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize); 3590 OpdsMapping[2] = nullptr; 3591 break; 3592 } 3593 case AMDGPU::G_BUILD_VECTOR: 3594 case AMDGPU::G_BUILD_VECTOR_TRUNC: { 3595 LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); 3596 if (DstTy == LLT::vector(2, 16)) { 3597 unsigned DstSize = DstTy.getSizeInBits(); 3598 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3599 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3600 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3601 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID); 3602 3603 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize); 3604 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize); 3605 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize); 3606 break; 3607 } 3608 3609 LLVM_FALLTHROUGH; 3610 } 3611 case 
AMDGPU::G_MERGE_VALUES: 3612 case AMDGPU::G_CONCAT_VECTORS: { 3613 unsigned Bank = getMappingType(MRI, MI); 3614 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3615 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3616 3617 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3618 // Op1 and Dst should use the same register bank. 3619 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) 3620 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize); 3621 break; 3622 } 3623 case AMDGPU::G_BITREVERSE: 3624 case AMDGPU::G_BITCAST: 3625 case AMDGPU::G_INTTOPTR: 3626 case AMDGPU::G_PTRTOINT: 3627 case AMDGPU::G_FABS: 3628 case AMDGPU::G_FNEG: { 3629 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3630 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3631 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3632 break; 3633 } 3634 case AMDGPU::G_CTLZ_ZERO_UNDEF: 3635 case AMDGPU::G_CTTZ_ZERO_UNDEF: 3636 case AMDGPU::G_CTPOP: { 3637 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3638 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3639 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32); 3640 3641 // This should really be getValueMappingSGPR64Only, but allowing the generic 3642 // code to handle the register split just makes using LegalizerHelper more 3643 // difficult. 
3644 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size); 3645 break; 3646 } 3647 case AMDGPU::G_TRUNC: { 3648 Register Dst = MI.getOperand(0).getReg(); 3649 Register Src = MI.getOperand(1).getReg(); 3650 unsigned Bank = getRegBankID(Src, MRI); 3651 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3652 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3653 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize); 3654 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize); 3655 break; 3656 } 3657 case AMDGPU::G_ZEXT: 3658 case AMDGPU::G_SEXT: 3659 case AMDGPU::G_ANYEXT: 3660 case AMDGPU::G_SEXT_INREG: { 3661 Register Dst = MI.getOperand(0).getReg(); 3662 Register Src = MI.getOperand(1).getReg(); 3663 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI); 3664 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI); 3665 3666 unsigned DstBank; 3667 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI); 3668 assert(SrcBank); 3669 switch (SrcBank->getID()) { 3670 case AMDGPU::SGPRRegBankID: 3671 DstBank = AMDGPU::SGPRRegBankID; 3672 break; 3673 default: 3674 DstBank = AMDGPU::VGPRRegBankID; 3675 break; 3676 } 3677 3678 // Scalar extend can use 64-bit BFE, but VGPRs require extending to 3679 // 32-bits, and then to 64. 3680 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize); 3681 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), 3682 SrcSize); 3683 break; 3684 } 3685 case AMDGPU::G_FCMP: { 3686 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3687 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3688 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3689 OpdsMapping[1] = nullptr; // Predicate Operand. 
3690 OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size); 3691 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3692 break; 3693 } 3694 case AMDGPU::G_STORE: { 3695 assert(MI.getOperand(0).isReg()); 3696 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3697 3698 // FIXME: We need to specify a different reg bank once scalar stores are 3699 // supported. 3700 const ValueMapping *ValMapping = 3701 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 3702 OpdsMapping[0] = ValMapping; 3703 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 3704 break; 3705 } 3706 case AMDGPU::G_ICMP: { 3707 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); 3708 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3709 3710 // See if the result register has already been constrained to vcc, which may 3711 // happen due to control flow intrinsic lowering. 3712 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, 3713 AMDGPU::SGPRRegBankID); 3714 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3715 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI); 3716 3717 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID && 3718 Op2Bank == AMDGPU::SGPRRegBankID && 3719 Op3Bank == AMDGPU::SGPRRegBankID && 3720 (Size == 32 || (Size == 64 && 3721 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && 3722 Subtarget.hasScalarCompareEq64())); 3723 3724 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 3725 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 3726 3727 // TODO: Use 32-bit for scalar output size. 3728 // SCC results will need to be copied to a 32-bit SGPR virtual register. 
3729 const unsigned ResultSize = 1; 3730 3731 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize); 3732 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size); 3733 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size); 3734 break; 3735 } 3736 case AMDGPU::G_EXTRACT_VECTOR_ELT: { 3737 // VGPR index can be used for waterfall when indexing a SGPR vector. 3738 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI); 3739 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3740 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 3741 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3742 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI); 3743 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); 3744 3745 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); 3746 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); 3747 3748 // The index can be either if the source vector is VGPR. 3749 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); 3750 break; 3751 } 3752 case AMDGPU::G_INSERT_VECTOR_ELT: { 3753 unsigned OutputBankID = isSALUMapping(MI) ? 3754 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 3755 3756 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3757 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 3758 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 3759 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI); 3760 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI); 3761 3762 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); 3763 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); 3764 3765 // This is a weird case, because we need to break down the mapping based on 3766 // the register bank of a different operand. 
3767 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) { 3768 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID, 3769 InsertSize); 3770 } else { 3771 assert(InsertSize == 32 || InsertSize == 64); 3772 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize); 3773 } 3774 3775 // The index can be either if the source vector is VGPR. 3776 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); 3777 break; 3778 } 3779 case AMDGPU::G_UNMERGE_VALUES: { 3780 unsigned Bank = getMappingType(MRI, MI); 3781 3782 // Op1 and Dst should use the same register bank. 3783 // FIXME: Shouldn't this be the default? Why do we need to handle this? 3784 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { 3785 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI); 3786 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size); 3787 } 3788 break; 3789 } 3790 case AMDGPU::G_AMDGPU_BUFFER_LOAD: 3791 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: 3792 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: 3793 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: 3794 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: 3795 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT: 3796 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16: 3797 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT: 3798 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16: 3799 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT: 3800 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: 3801 case AMDGPU::G_AMDGPU_BUFFER_STORE: 3802 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE: 3803 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT: 3804 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT: 3805 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: { 3806 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3807 3808 // rsrc 3809 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3810 3811 // vindex 3812 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3813 3814 // voffset 3815 OpdsMapping[3] = 
getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3816 3817 // soffset 3818 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3819 3820 // Any remaining operands are immediates and were correctly null 3821 // initialized. 3822 break; 3823 } 3824 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP: 3825 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD: 3826 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB: 3827 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN: 3828 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN: 3829 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX: 3830 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX: 3831 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND: 3832 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR: 3833 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR: 3834 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC: 3835 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: 3836 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD: 3837 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN: 3838 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: { 3839 // vdata_out 3840 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3841 3842 // vdata_in 3843 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3844 3845 // rsrc 3846 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3847 3848 // vindex 3849 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3850 3851 // voffset 3852 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3853 3854 // soffset 3855 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 3856 3857 // Any remaining operands are immediates and were correctly null 3858 // initialized. 
3859 break; 3860 } 3861 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: { 3862 // vdata_out 3863 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 3864 3865 // vdata_in 3866 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3867 3868 // cmp 3869 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3870 3871 // rsrc 3872 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 3873 3874 // vindex 3875 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 3876 3877 // voffset 3878 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 3879 3880 // soffset 3881 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI); 3882 3883 // Any remaining operands are immediates and were correctly null 3884 // initialized. 3885 break; 3886 } 3887 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: { 3888 // Lie and claim everything is legal, even though some need to be 3889 // SGPRs. applyMapping will have to deal with it as a waterfall loop. 3890 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 3891 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 3892 3893 // We need to convert this to a MUBUF if either the resource of offset is 3894 // VGPR. 
3895 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); 3896 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); 3897 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank); 3898 3899 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3900 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0); 3901 break; 3902 } 3903 case AMDGPU::G_INTRINSIC: { 3904 switch (MI.getIntrinsicID()) { 3905 default: 3906 return getInvalidInstructionMapping(); 3907 case Intrinsic::amdgcn_div_fmas: 3908 case Intrinsic::amdgcn_div_fixup: 3909 case Intrinsic::amdgcn_trig_preop: 3910 case Intrinsic::amdgcn_sin: 3911 case Intrinsic::amdgcn_cos: 3912 case Intrinsic::amdgcn_log_clamp: 3913 case Intrinsic::amdgcn_rcp: 3914 case Intrinsic::amdgcn_rcp_legacy: 3915 case Intrinsic::amdgcn_sqrt: 3916 case Intrinsic::amdgcn_rsq: 3917 case Intrinsic::amdgcn_rsq_legacy: 3918 case Intrinsic::amdgcn_rsq_clamp: 3919 case Intrinsic::amdgcn_fmul_legacy: 3920 case Intrinsic::amdgcn_fma_legacy: 3921 case Intrinsic::amdgcn_ldexp: 3922 case Intrinsic::amdgcn_frexp_mant: 3923 case Intrinsic::amdgcn_frexp_exp: 3924 case Intrinsic::amdgcn_fract: 3925 case Intrinsic::amdgcn_cvt_pkrtz: 3926 case Intrinsic::amdgcn_cvt_pknorm_i16: 3927 case Intrinsic::amdgcn_cvt_pknorm_u16: 3928 case Intrinsic::amdgcn_cvt_pk_i16: 3929 case Intrinsic::amdgcn_cvt_pk_u16: 3930 case Intrinsic::amdgcn_fmed3: 3931 case Intrinsic::amdgcn_cubeid: 3932 case Intrinsic::amdgcn_cubema: 3933 case Intrinsic::amdgcn_cubesc: 3934 case Intrinsic::amdgcn_cubetc: 3935 case Intrinsic::amdgcn_sffbh: 3936 case Intrinsic::amdgcn_fmad_ftz: 3937 case Intrinsic::amdgcn_mbcnt_lo: 3938 case Intrinsic::amdgcn_mbcnt_hi: 3939 case Intrinsic::amdgcn_mul_u24: 3940 case Intrinsic::amdgcn_mul_i24: 3941 case Intrinsic::amdgcn_lerp: 3942 case Intrinsic::amdgcn_sad_u8: 3943 case Intrinsic::amdgcn_msad_u8: 3944 case Intrinsic::amdgcn_sad_hi_u8: 3945 case Intrinsic::amdgcn_sad_u16: 3946 case 
Intrinsic::amdgcn_qsad_pk_u16_u8: 3947 case Intrinsic::amdgcn_mqsad_pk_u16_u8: 3948 case Intrinsic::amdgcn_mqsad_u32_u8: 3949 case Intrinsic::amdgcn_cvt_pk_u8_f32: 3950 case Intrinsic::amdgcn_alignbit: 3951 case Intrinsic::amdgcn_alignbyte: 3952 case Intrinsic::amdgcn_perm: 3953 case Intrinsic::amdgcn_fdot2: 3954 case Intrinsic::amdgcn_sdot2: 3955 case Intrinsic::amdgcn_udot2: 3956 case Intrinsic::amdgcn_sdot4: 3957 case Intrinsic::amdgcn_udot4: 3958 case Intrinsic::amdgcn_sdot8: 3959 case Intrinsic::amdgcn_udot8: 3960 return getDefaultMappingVOP(MI); 3961 case Intrinsic::amdgcn_sbfe: 3962 case Intrinsic::amdgcn_ubfe: 3963 if (isSALUMapping(MI)) 3964 return getDefaultMappingSOP(MI); 3965 return getDefaultMappingVOP(MI); 3966 case Intrinsic::amdgcn_ds_swizzle: 3967 case Intrinsic::amdgcn_ds_permute: 3968 case Intrinsic::amdgcn_ds_bpermute: 3969 case Intrinsic::amdgcn_update_dpp: 3970 case Intrinsic::amdgcn_mov_dpp8: 3971 case Intrinsic::amdgcn_mov_dpp: 3972 case Intrinsic::amdgcn_strict_wwm: 3973 case Intrinsic::amdgcn_wwm: 3974 case Intrinsic::amdgcn_strict_wqm: 3975 case Intrinsic::amdgcn_wqm: 3976 case Intrinsic::amdgcn_softwqm: 3977 case Intrinsic::amdgcn_set_inactive: 3978 return getDefaultMappingAllVGPR(MI); 3979 case Intrinsic::amdgcn_kernarg_segment_ptr: 3980 case Intrinsic::amdgcn_s_getpc: 3981 case Intrinsic::amdgcn_groupstaticsize: 3982 case Intrinsic::amdgcn_reloc_constant: 3983 case Intrinsic::returnaddress: { 3984 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3985 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 3986 break; 3987 } 3988 case Intrinsic::amdgcn_wqm_vote: { 3989 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 3990 OpdsMapping[0] = OpdsMapping[2] 3991 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size); 3992 break; 3993 } 3994 case Intrinsic::amdgcn_ps_live: { 3995 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 3996 break; 3997 } 3998 case 
Intrinsic::amdgcn_div_scale: { 3999 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4000 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); 4001 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size); 4002 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size); 4003 4004 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); 4005 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4006 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4007 break; 4008 } 4009 case Intrinsic::amdgcn_class: { 4010 Register Src0Reg = MI.getOperand(2).getReg(); 4011 Register Src1Reg = MI.getOperand(3).getReg(); 4012 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits(); 4013 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits(); 4014 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4015 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize); 4016 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size); 4017 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size); 4018 break; 4019 } 4020 case Intrinsic::amdgcn_icmp: 4021 case Intrinsic::amdgcn_fcmp: { 4022 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4023 // This is not VCCRegBank because this is not used in boolean contexts. 4024 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4025 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4026 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4027 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); 4028 break; 4029 } 4030 case Intrinsic::amdgcn_readlane: { 4031 // This must be an SGPR, but accept a VGPR. 
4032 Register IdxReg = MI.getOperand(3).getReg(); 4033 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4034 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4035 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4036 LLVM_FALLTHROUGH; 4037 } 4038 case Intrinsic::amdgcn_readfirstlane: { 4039 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4040 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4041 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4042 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4043 break; 4044 } 4045 case Intrinsic::amdgcn_writelane: { 4046 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4047 Register SrcReg = MI.getOperand(2).getReg(); 4048 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 4049 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID); 4050 Register IdxReg = MI.getOperand(3).getReg(); 4051 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits(); 4052 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID); 4053 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4054 4055 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted 4056 // to legalize. 
4057 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize); 4058 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); 4059 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize); 4060 break; 4061 } 4062 case Intrinsic::amdgcn_if_break: { 4063 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4064 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4065 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4066 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4067 break; 4068 } 4069 case Intrinsic::amdgcn_permlane16: 4070 case Intrinsic::amdgcn_permlanex16: { 4071 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); 4072 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4073 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4074 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size); 4075 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4076 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4077 break; 4078 } 4079 case Intrinsic::amdgcn_mfma_f32_4x4x1f32: 4080 case Intrinsic::amdgcn_mfma_f32_4x4x4f16: 4081 case Intrinsic::amdgcn_mfma_i32_4x4x4i8: 4082 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16: 4083 case Intrinsic::amdgcn_mfma_f32_16x16x1f32: 4084 case Intrinsic::amdgcn_mfma_f32_16x16x4f32: 4085 case Intrinsic::amdgcn_mfma_f32_16x16x4f16: 4086 case Intrinsic::amdgcn_mfma_f32_16x16x16f16: 4087 case Intrinsic::amdgcn_mfma_i32_16x16x4i8: 4088 case Intrinsic::amdgcn_mfma_i32_16x16x16i8: 4089 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16: 4090 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16: 4091 case Intrinsic::amdgcn_mfma_f32_32x32x1f32: 4092 case Intrinsic::amdgcn_mfma_f32_32x32x2f32: 4093 case Intrinsic::amdgcn_mfma_f32_32x32x4f16: 4094 case Intrinsic::amdgcn_mfma_f32_32x32x8f16: 4095 case Intrinsic::amdgcn_mfma_i32_32x32x4i8: 4096 case 
Intrinsic::amdgcn_mfma_i32_32x32x8i8: 4097 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16: 4098 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: 4099 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k: 4100 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k: 4101 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k: 4102 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k: 4103 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k: 4104 case Intrinsic::amdgcn_mfma_f64_16x16x4f64: 4105 case Intrinsic::amdgcn_mfma_f64_4x4x4f64: { 4106 // Default for MAI intrinsics. 4107 // srcC can also be an immediate which can be folded later. 4108 // FIXME: Should we eventually add an alternative mapping with AGPR src 4109 // for srcA/srcB? 4110 // 4111 // vdst, srcA, srcB, srcC 4112 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4113 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4114 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4115 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4116 break; 4117 } 4118 case Intrinsic::amdgcn_interp_p1: 4119 case Intrinsic::amdgcn_interp_p2: 4120 case Intrinsic::amdgcn_interp_mov: 4121 case Intrinsic::amdgcn_interp_p1_f16: 4122 case Intrinsic::amdgcn_interp_p2_f16: { 4123 const int M0Idx = MI.getNumOperands() - 1; 4124 Register M0Reg = MI.getOperand(M0Idx).getReg(); 4125 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID); 4126 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4127 4128 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4129 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I) 4130 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4131 4132 // Must be SGPR, but we must take whatever the original bank is and fix it 4133 // later. 
4134 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); 4135 break; 4136 } 4137 case Intrinsic::amdgcn_ballot: { 4138 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4139 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); 4140 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); 4141 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); 4142 break; 4143 } 4144 } 4145 break; 4146 } 4147 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD: 4148 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: { 4149 auto IntrID = MI.getIntrinsicID(); 4150 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID); 4151 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic"); 4152 // Non-images can have complications from operands that allow both SGPR 4153 // and VGPR. For now it's too complicated to figure out the final opcode 4154 // to derive the register bank from the MCInstrDesc. 4155 assert(RSrcIntrin->IsImage); 4156 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); 4157 } 4158 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: { 4159 unsigned N = MI.getNumExplicitOperands() - 2; 4160 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128); 4161 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI); 4162 for (unsigned I = 2; I < N; ++I) 4163 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4164 break; 4165 } 4166 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { 4167 auto IntrID = MI.getIntrinsicID(); 4168 switch (IntrID) { 4169 case Intrinsic::amdgcn_s_getreg: 4170 case Intrinsic::amdgcn_s_memtime: 4171 case Intrinsic::amdgcn_s_memrealtime: 4172 case Intrinsic::amdgcn_s_get_waveid_in_workgroup: { 4173 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4174 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4175 break; 4176 } 4177 case Intrinsic::amdgcn_global_atomic_fadd: 4178 case 
Intrinsic::amdgcn_global_atomic_csub: 4179 case Intrinsic::amdgcn_global_atomic_fmin: 4180 case Intrinsic::amdgcn_global_atomic_fmax: 4181 case Intrinsic::amdgcn_flat_atomic_fadd: 4182 case Intrinsic::amdgcn_flat_atomic_fmin: 4183 case Intrinsic::amdgcn_flat_atomic_fmax: 4184 return getDefaultMappingAllVGPR(MI); 4185 case Intrinsic::amdgcn_ds_ordered_add: 4186 case Intrinsic::amdgcn_ds_ordered_swap: { 4187 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4188 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4189 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4190 AMDGPU::SGPRRegBankID); 4191 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32); 4192 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4193 break; 4194 } 4195 case Intrinsic::amdgcn_ds_append: 4196 case Intrinsic::amdgcn_ds_consume: { 4197 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4198 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); 4199 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4200 break; 4201 } 4202 case Intrinsic::amdgcn_exp_compr: 4203 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4204 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4205 break; 4206 case Intrinsic::amdgcn_exp: 4207 // FIXME: Could we support packed types here? 4208 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4209 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4210 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4211 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4212 break; 4213 case Intrinsic::amdgcn_s_sendmsg: 4214 case Intrinsic::amdgcn_s_sendmsghalt: { 4215 // This must be an SGPR, but accept a VGPR. 
4216 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4217 AMDGPU::SGPRRegBankID); 4218 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4219 break; 4220 } 4221 case Intrinsic::amdgcn_s_setreg: { 4222 // This must be an SGPR, but accept a VGPR. 4223 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4224 AMDGPU::SGPRRegBankID); 4225 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4226 break; 4227 } 4228 case Intrinsic::amdgcn_end_cf: { 4229 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4230 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4231 break; 4232 } 4233 case Intrinsic::amdgcn_else: { 4234 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4235 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4236 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4237 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize); 4238 break; 4239 } 4240 case Intrinsic::amdgcn_live_mask: { 4241 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4242 break; 4243 } 4244 case Intrinsic::amdgcn_wqm_demote: 4245 case Intrinsic::amdgcn_kill: { 4246 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1); 4247 break; 4248 } 4249 case Intrinsic::amdgcn_raw_buffer_load: 4250 case Intrinsic::amdgcn_raw_tbuffer_load: { 4251 // FIXME: Should make intrinsic ID the last operand of the instruction, 4252 // then this would be the same as store 4253 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4254 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4255 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4256 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4257 break; 4258 } 4259 case Intrinsic::amdgcn_raw_buffer_store: 4260 case Intrinsic::amdgcn_raw_buffer_store_format: 4261 case Intrinsic::amdgcn_raw_tbuffer_store: { 
4262 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4263 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4264 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4265 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4266 break; 4267 } 4268 case Intrinsic::amdgcn_struct_buffer_load: 4269 case Intrinsic::amdgcn_struct_tbuffer_load: { 4270 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4271 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4272 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4273 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4274 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4275 break; 4276 } 4277 case Intrinsic::amdgcn_struct_buffer_store: 4278 case Intrinsic::amdgcn_struct_tbuffer_store: { 4279 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); 4280 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4281 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4282 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); 4283 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); 4284 break; 4285 } 4286 case Intrinsic::amdgcn_init_exec_from_input: { 4287 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI); 4288 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); 4289 break; 4290 } 4291 case Intrinsic::amdgcn_ds_gws_init: 4292 case Intrinsic::amdgcn_ds_gws_barrier: 4293 case Intrinsic::amdgcn_ds_gws_sema_br: { 4294 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); 4295 4296 // This must be an SGPR, but accept a VGPR. 
4297 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4298 AMDGPU::SGPRRegBankID); 4299 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32); 4300 break; 4301 } 4302 case Intrinsic::amdgcn_ds_gws_sema_v: 4303 case Intrinsic::amdgcn_ds_gws_sema_p: 4304 case Intrinsic::amdgcn_ds_gws_sema_release_all: { 4305 // This must be an SGPR, but accept a VGPR. 4306 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, 4307 AMDGPU::SGPRRegBankID); 4308 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32); 4309 break; 4310 } 4311 default: 4312 return getInvalidInstructionMapping(); 4313 } 4314 break; 4315 } 4316 case AMDGPU::G_SELECT: { 4317 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); 4318 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, 4319 AMDGPU::SGPRRegBankID); 4320 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, 4321 AMDGPU::SGPRRegBankID); 4322 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID && 4323 Op3Bank == AMDGPU::SGPRRegBankID; 4324 4325 unsigned CondBankDefault = SGPRSrcs ? 4326 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4327 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, 4328 CondBankDefault); 4329 if (CondBank == AMDGPU::SGPRRegBankID) 4330 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID; 4331 else if (CondBank == AMDGPU::VGPRRegBankID) 4332 CondBank = AMDGPU::VCCRegBankID; 4333 4334 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ? 4335 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; 4336 4337 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID); 4338 4339 // TODO: Should report 32-bit for scalar condition type. 
4340 if (Size == 64) { 4341 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4342 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4343 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4344 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size); 4345 } else { 4346 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size); 4347 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1); 4348 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size); 4349 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size); 4350 } 4351 4352 break; 4353 } 4354 4355 case AMDGPU::G_LOAD: 4356 case AMDGPU::G_ZEXTLOAD: 4357 case AMDGPU::G_SEXTLOAD: 4358 return getInstrMappingForLoad(MI); 4359 4360 case AMDGPU::G_ATOMICRMW_XCHG: 4361 case AMDGPU::G_ATOMICRMW_ADD: 4362 case AMDGPU::G_ATOMICRMW_SUB: 4363 case AMDGPU::G_ATOMICRMW_AND: 4364 case AMDGPU::G_ATOMICRMW_OR: 4365 case AMDGPU::G_ATOMICRMW_XOR: 4366 case AMDGPU::G_ATOMICRMW_MAX: 4367 case AMDGPU::G_ATOMICRMW_MIN: 4368 case AMDGPU::G_ATOMICRMW_UMAX: 4369 case AMDGPU::G_ATOMICRMW_UMIN: 4370 case AMDGPU::G_ATOMICRMW_FADD: 4371 case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: 4372 case AMDGPU::G_AMDGPU_ATOMIC_INC: 4373 case AMDGPU::G_AMDGPU_ATOMIC_DEC: 4374 case AMDGPU::G_AMDGPU_ATOMIC_FMIN: 4375 case AMDGPU::G_AMDGPU_ATOMIC_FMAX: { 4376 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4377 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4378 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4379 break; 4380 } 4381 case AMDGPU::G_ATOMIC_CMPXCHG: { 4382 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); 4383 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg()); 4384 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI); 4385 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); 4386 break; 4387 } 4388 case AMDGPU::G_BRCOND: { 4389 unsigned Bank = 
getRegBankID(MI.getOperand(0).getReg(), MRI, 4390 AMDGPU::SGPRRegBankID); 4391 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1); 4392 if (Bank != AMDGPU::SGPRRegBankID) 4393 Bank = AMDGPU::VCCRegBankID; 4394 4395 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1); 4396 break; 4397 } 4398 } 4399 4400 return getInstructionMapping(/*ID*/1, /*Cost*/1, 4401 getOperandsMapping(OpdsMapping), 4402 MI.getNumOperands()); 4403 } 4404