Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 ///
      9 /// \file
     10 /// This file implements the lowering of LLVM calls to machine code calls for
     11 /// GlobalISel.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "AMDGPUCallLowering.h"
     16 #include "AMDGPU.h"
     17 #include "AMDGPULegalizerInfo.h"
     18 #include "AMDGPUTargetMachine.h"
     19 #include "SIMachineFunctionInfo.h"
     20 #include "SIRegisterInfo.h"
     21 #include "llvm/CodeGen/Analysis.h"
     22 #include "llvm/CodeGen/FunctionLoweringInfo.h"
     23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
     24 #include "llvm/IR/IntrinsicsAMDGPU.h"
     25 
     26 #define DEBUG_TYPE "amdgpu-call-lowering"
     27 
     28 using namespace llvm;
     29 
     30 namespace {
     31 
     32 /// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
     33 static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
     34                                     Register ValVReg, CCValAssign &VA) {
     35   if (VA.getLocVT().getSizeInBits() < 32) {
     36     // 16-bit types are reported as legal for 32-bit registers. We need to
     37     // extend and do a 32-bit copy to avoid the verifier complaining about it.
     38     return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
     39   }
     40 
     41   return Handler.extendRegister(ValVReg, VA);
     42 }
     43 
     44 struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
     45   AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
     46                              MachineInstrBuilder MIB)
     47       : OutgoingValueHandler(B, MRI), MIB(MIB) {}
     48 
     49   MachineInstrBuilder MIB;
     50 
     51   Register getStackAddress(uint64_t Size, int64_t Offset,
     52                            MachinePointerInfo &MPO,
     53                            ISD::ArgFlagsTy Flags) override {
     54     llvm_unreachable("not implemented");
     55   }
     56 
     57   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
     58                             MachinePointerInfo &MPO, CCValAssign &VA) override {
     59     llvm_unreachable("not implemented");
     60   }
     61 
     62   void assignValueToReg(Register ValVReg, Register PhysReg,
     63                         CCValAssign &VA) override {
     64     Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
     65 
     66     // If this is a scalar return, insert a readfirstlane just in case the value
     67     // ends up in a VGPR.
     68     // FIXME: Assert this is a shader return.
     69     const SIRegisterInfo *TRI
     70       = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
     71     if (TRI->isSGPRReg(MRI, PhysReg)) {
     72       auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
     73                                               {MRI.getType(ExtReg)}, false)
     74         .addReg(ExtReg);
     75       ExtReg = ToSGPR.getReg(0);
     76     }
     77 
     78     MIRBuilder.buildCopy(PhysReg, ExtReg);
     79     MIB.addUse(PhysReg, RegState::Implicit);
     80   }
     81 };
     82 
     83 struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
     84   uint64_t StackUsed = 0;
     85 
     86   AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
     87       : IncomingValueHandler(B, MRI) {}
     88 
     89   Register getStackAddress(uint64_t Size, int64_t Offset,
     90                            MachinePointerInfo &MPO,
     91                            ISD::ArgFlagsTy Flags) override {
     92     auto &MFI = MIRBuilder.getMF().getFrameInfo();
     93 
     94     // Byval is assumed to be writable memory, but other stack passed arguments
     95     // are not.
     96     const bool IsImmutable = !Flags.isByVal();
     97     int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
     98     MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
     99     auto AddrReg = MIRBuilder.buildFrameIndex(
    100         LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    101     StackUsed = std::max(StackUsed, Size + Offset);
    102     return AddrReg.getReg(0);
    103   }
    104 
    105   void assignValueToReg(Register ValVReg, Register PhysReg,
    106                         CCValAssign &VA) override {
    107     markPhysRegUsed(PhysReg);
    108 
    109     if (VA.getLocVT().getSizeInBits() < 32) {
    110       // 16-bit types are reported as legal for 32-bit registers. We need to do
    111       // a 32-bit copy, and truncate to avoid the verifier complaining about it.
    112       auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
    113 
    114       // If we have signext/zeroext, it applies to the whole 32-bit register
    115       // before truncation.
    116       auto Extended =
    117           buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
    118       MIRBuilder.buildTrunc(ValVReg, Extended);
    119       return;
    120     }
    121 
    122     IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
    123   }
    124 
    125   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
    126                             MachinePointerInfo &MPO, CCValAssign &VA) override {
    127     MachineFunction &MF = MIRBuilder.getMF();
    128 
    129     // The reported memory location may be wider than the value.
    130     const LLT RegTy = MRI.getType(ValVReg);
    131     MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);
    132 
    133     // FIXME: Get alignment
    134     auto MMO = MF.getMachineMemOperand(
    135         MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
    136         inferAlignFromPtrInfo(MF, MPO));
    137     MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
    138   }
    139 
    140   /// How the physical register gets marked varies between formal
    141   /// parameters (it's a basic-block live-in), and a call instruction
    142   /// (it's an implicit-def of the BL).
    143   virtual void markPhysRegUsed(unsigned PhysReg) = 0;
    144 };
    145 
    146 struct FormalArgHandler : public AMDGPUIncomingArgHandler {
    147   FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
    148       : AMDGPUIncomingArgHandler(B, MRI) {}
    149 
    150   void markPhysRegUsed(unsigned PhysReg) override {
    151     MIRBuilder.getMBB().addLiveIn(PhysReg);
    152   }
    153 };
    154 
    155 struct CallReturnHandler : public AMDGPUIncomingArgHandler {
    156   CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
    157                     MachineInstrBuilder MIB)
    158       : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
    159 
    160   void markPhysRegUsed(unsigned PhysReg) override {
    161     MIB.addDef(PhysReg, RegState::Implicit);
    162   }
    163 
    164   MachineInstrBuilder MIB;
    165 };
    166 
    167 struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
    168   /// For tail calls, the byte offset of the call's argument area from the
    169   /// callee's. Unused elsewhere.
    170   int FPDiff;
    171 
    172   // Cache the SP register vreg if we need it more than once in this call site.
    173   Register SPReg;
    174 
    175   bool IsTailCall;
    176 
    177   AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
    178                            MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
    179                            bool IsTailCall = false, int FPDiff = 0)
    180       : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
    181         IsTailCall(IsTailCall) {}
    182 
    183   Register getStackAddress(uint64_t Size, int64_t Offset,
    184                            MachinePointerInfo &MPO,
    185                            ISD::ArgFlagsTy Flags) override {
    186     MachineFunction &MF = MIRBuilder.getMF();
    187     const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    188     const LLT S32 = LLT::scalar(32);
    189 
    190     if (IsTailCall) {
    191       Offset += FPDiff;
    192       int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
    193       auto FIReg = MIRBuilder.buildFrameIndex(PtrTy, FI);
    194       MPO = MachinePointerInfo::getFixedStack(MF, FI);
    195       return FIReg.getReg(0);
    196     }
    197 
    198     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    199 
    200     if (!SPReg)
    201       SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);
    202 
    203     auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
    204 
    205     auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    206     MPO = MachinePointerInfo::getStack(MF, Offset);
    207     return AddrReg.getReg(0);
    208   }
    209 
    210   void assignValueToReg(Register ValVReg, Register PhysReg,
    211                         CCValAssign &VA) override {
    212     MIB.addUse(PhysReg, RegState::Implicit);
    213     Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
    214     MIRBuilder.buildCopy(PhysReg, ExtReg);
    215   }
    216 
    217   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
    218                             MachinePointerInfo &MPO, CCValAssign &VA) override {
    219     MachineFunction &MF = MIRBuilder.getMF();
    220     uint64_t LocMemOffset = VA.getLocMemOffset();
    221     const auto &ST = MF.getSubtarget<GCNSubtarget>();
    222 
    223     auto MMO = MF.getMachineMemOperand(
    224       MPO, MachineMemOperand::MOStore, Size,
    225       commonAlignment(ST.getStackAlignment(), LocMemOffset));
    226     MIRBuilder.buildStore(ValVReg, Addr, *MMO);
    227   }
    228 
    229   void assignValueToAddress(const CallLowering::ArgInfo &Arg,
    230                             unsigned ValRegIndex, Register Addr,
    231                             uint64_t MemSize, MachinePointerInfo &MPO,
    232                             CCValAssign &VA) override {
    233     Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
    234                            ? extendRegister(Arg.Regs[ValRegIndex], VA)
    235                            : Arg.Regs[ValRegIndex];
    236 
    237     // If we extended the value type we might need to adjust the MMO's
    238     // Size. This happens if ComputeValueVTs widened a small type value to a
    239     // legal register type (e.g. s8->s16)
    240     const LLT RegTy = MRI.getType(ValVReg);
    241     MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
    242     assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
    243   }
    244 };
    245 }
    246 
    247 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
    248   : CallLowering(&TLI) {
    249 }
    250 
    251 // FIXME: Compatability shim
    252 static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
    253   switch (MIOpc) {
    254   case TargetOpcode::G_SEXT:
    255     return ISD::SIGN_EXTEND;
    256   case TargetOpcode::G_ZEXT:
    257     return ISD::ZERO_EXTEND;
    258   case TargetOpcode::G_ANYEXT:
    259     return ISD::ANY_EXTEND;
    260   default:
    261     llvm_unreachable("not an extend opcode");
    262   }
    263 }
    264 
    265 bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
    266                                         CallingConv::ID CallConv,
    267                                         SmallVectorImpl<BaseArgInfo> &Outs,
    268                                         bool IsVarArg) const {
    269   // For shaders. Vector types should be explicitly handled by CC.
    270   if (AMDGPU::isEntryFunctionCC(CallConv))
    271     return true;
    272 
    273   SmallVector<CCValAssign, 16> ArgLocs;
    274   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
    275   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
    276                  MF.getFunction().getContext());
    277 
    278   return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
    279 }
    280 
    281 /// Lower the return value for the already existing \p Ret. This assumes that
    282 /// \p B's insertion point is correct.
    283 bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
    284                                         const Value *Val, ArrayRef<Register> VRegs,
    285                                         MachineInstrBuilder &Ret) const {
    286   if (!Val)
    287     return true;
    288 
    289   auto &MF = B.getMF();
    290   const auto &F = MF.getFunction();
    291   const DataLayout &DL = MF.getDataLayout();
    292   MachineRegisterInfo *MRI = B.getMRI();
    293   LLVMContext &Ctx = F.getContext();
    294 
    295   CallingConv::ID CC = F.getCallingConv();
    296   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
    297 
    298   SmallVector<EVT, 8> SplitEVTs;
    299   ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
    300   assert(VRegs.size() == SplitEVTs.size() &&
    301          "For each split Type there should be exactly one VReg.");
    302 
    303   SmallVector<ArgInfo, 8> SplitRetInfos;
    304 
    305   for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
    306     EVT VT = SplitEVTs[i];
    307     Register Reg = VRegs[i];
    308     ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
    309     setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
    310 
    311     if (VT.isScalarInteger()) {
    312       unsigned ExtendOp = TargetOpcode::G_ANYEXT;
    313       if (RetInfo.Flags[0].isSExt()) {
    314         assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
    315         ExtendOp = TargetOpcode::G_SEXT;
    316       } else if (RetInfo.Flags[0].isZExt()) {
    317         assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
    318         ExtendOp = TargetOpcode::G_ZEXT;
    319       }
    320 
    321       EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
    322                                           extOpcodeToISDExtOpcode(ExtendOp));
    323       if (ExtVT != VT) {
    324         RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
    325         LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
    326         Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
    327       }
    328     }
    329 
    330     if (Reg != RetInfo.Regs[0]) {
    331       RetInfo.Regs[0] = Reg;
    332       // Reset the arg flags after modifying Reg.
    333       setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
    334     }
    335 
    336     splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
    337   }
    338 
    339   CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
    340 
    341   OutgoingValueAssigner Assigner(AssignFn);
    342   AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
    343   return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
    344                                        CC, F.isVarArg());
    345 }
    346 
    347 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
    348                                      ArrayRef<Register> VRegs,
    349                                      FunctionLoweringInfo &FLI) const {
    350 
    351   MachineFunction &MF = B.getMF();
    352   MachineRegisterInfo &MRI = MF.getRegInfo();
    353   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    354   MFI->setIfReturnsVoid(!Val);
    355 
    356   assert(!Val == VRegs.empty() && "Return value without a vreg");
    357 
    358   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
    359   const bool IsShader = AMDGPU::isShader(CC);
    360   const bool IsWaveEnd =
    361       (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
    362   if (IsWaveEnd) {
    363     B.buildInstr(AMDGPU::S_ENDPGM)
    364       .addImm(0);
    365     return true;
    366   }
    367 
    368   auto const &ST = MF.getSubtarget<GCNSubtarget>();
    369 
    370   unsigned ReturnOpc =
    371       IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
    372 
    373   auto Ret = B.buildInstrNoInsert(ReturnOpc);
    374   Register ReturnAddrVReg;
    375   if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    376     ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    377     Ret.addUse(ReturnAddrVReg);
    378   }
    379 
    380   if (!FLI.CanLowerReturn)
    381     insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
    382   else if (!lowerReturnVal(B, Val, VRegs, Ret))
    383     return false;
    384 
    385   if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    386     const SIRegisterInfo *TRI = ST.getRegisterInfo();
    387     Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
    388                                          &AMDGPU::SGPR_64RegClass);
    389     B.buildCopy(ReturnAddrVReg, LiveInReturn);
    390   }
    391 
    392   // TODO: Handle CalleeSavedRegsViaCopy.
    393 
    394   B.insertInstr(Ret);
    395   return true;
    396 }
    397 
    398 void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
    399                                            Type *ParamTy,
    400                                            uint64_t Offset) const {
    401   MachineFunction &MF = B.getMF();
    402   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    403   MachineRegisterInfo &MRI = MF.getRegInfo();
    404   Register KernArgSegmentPtr =
    405     MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
    406   Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
    407 
    408   auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
    409 
    410   B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
    411 }
    412 
    413 void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
    414                                         uint64_t Offset, Align Alignment,
    415                                         Register DstReg) const {
    416   MachineFunction &MF = B.getMF();
    417   const Function &F = MF.getFunction();
    418   const DataLayout &DL = F.getParent()->getDataLayout();
    419   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    420   unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
    421 
    422   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    423   Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
    424   lowerParameterPtr(PtrReg, B, ParamTy, Offset);
    425 
    426   MachineMemOperand *MMO = MF.getMachineMemOperand(
    427       PtrInfo,
    428       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    429           MachineMemOperand::MOInvariant,
    430       TypeSize, Alignment);
    431 
    432   B.buildLoad(DstReg, PtrReg, *MMO);
    433 }
    434 
    435 // Allocate special inputs passed in user SGPRs.
    436 static void allocateHSAUserSGPRs(CCState &CCInfo,
    437                                  MachineIRBuilder &B,
    438                                  MachineFunction &MF,
    439                                  const SIRegisterInfo &TRI,
    440                                  SIMachineFunctionInfo &Info) {
    441   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
    442   if (Info.hasPrivateSegmentBuffer()) {
    443     Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    444     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    445     CCInfo.AllocateReg(PrivateSegmentBufferReg);
    446   }
    447 
    448   if (Info.hasDispatchPtr()) {
    449     Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    450     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    451     CCInfo.AllocateReg(DispatchPtrReg);
    452   }
    453 
    454   if (Info.hasQueuePtr()) {
    455     Register QueuePtrReg = Info.addQueuePtr(TRI);
    456     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    457     CCInfo.AllocateReg(QueuePtrReg);
    458   }
    459 
    460   if (Info.hasKernargSegmentPtr()) {
    461     MachineRegisterInfo &MRI = MF.getRegInfo();
    462     Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    463     const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    464     Register VReg = MRI.createGenericVirtualRegister(P4);
    465     MRI.addLiveIn(InputPtrReg, VReg);
    466     B.getMBB().addLiveIn(InputPtrReg);
    467     B.buildCopy(VReg, InputPtrReg);
    468     CCInfo.AllocateReg(InputPtrReg);
    469   }
    470 
    471   if (Info.hasDispatchID()) {
    472     Register DispatchIDReg = Info.addDispatchID(TRI);
    473     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    474     CCInfo.AllocateReg(DispatchIDReg);
    475   }
    476 
    477   if (Info.hasFlatScratchInit()) {
    478     Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    479     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    480     CCInfo.AllocateReg(FlatScratchInitReg);
    481   }
    482 
    483   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
    484   // these from the dispatch pointer.
    485 }
    486 
    487 bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    488     MachineIRBuilder &B, const Function &F,
    489     ArrayRef<ArrayRef<Register>> VRegs) const {
    490   MachineFunction &MF = B.getMF();
    491   const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
    492   MachineRegisterInfo &MRI = MF.getRegInfo();
    493   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    494   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    495   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
    496   const DataLayout &DL = F.getParent()->getDataLayout();
    497 
    498   Info->allocateModuleLDSGlobal(F.getParent());
    499 
    500   SmallVector<CCValAssign, 16> ArgLocs;
    501   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
    502 
    503   allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
    504 
    505   unsigned i = 0;
    506   const Align KernArgBaseAlign(16);
    507   const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
    508   uint64_t ExplicitArgOffset = 0;
    509 
    510   // TODO: Align down to dword alignment and extract bits for extending loads.
    511   for (auto &Arg : F.args()) {
    512     const bool IsByRef = Arg.hasByRefAttr();
    513     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    514     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    515     if (AllocSize == 0)
    516       continue;
    517 
    518     MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
    519     if (!ABIAlign)
    520       ABIAlign = DL.getABITypeAlign(ArgTy);
    521 
    522     uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    523     ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
    524 
    525     if (Arg.use_empty()) {
    526       ++i;
    527       continue;
    528     }
    529 
    530     Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
    531 
    532     if (IsByRef) {
    533       unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
    534 
    535       assert(VRegs[i].size() == 1 &&
    536              "expected only one register for byval pointers");
    537       if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
    538         lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
    539       } else {
    540         const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    541         Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
    542         lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
    543 
    544         B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
    545       }
    546     } else {
    547       ArrayRef<Register> OrigArgRegs = VRegs[i];
    548       Register ArgReg =
    549         OrigArgRegs.size() == 1
    550         ? OrigArgRegs[0]
    551         : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    552 
    553       lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
    554       if (OrigArgRegs.size() > 1)
    555         unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    556     }
    557 
    558     ++i;
    559   }
    560 
    561   TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
    562   TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
    563   return true;
    564 }
    565 
    566 bool AMDGPUCallLowering::lowerFormalArguments(
    567     MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
    568     FunctionLoweringInfo &FLI) const {
    569   CallingConv::ID CC = F.getCallingConv();
    570 
    571   // The infrastructure for normal calling convention lowering is essentially
    572   // useless for kernels. We want to avoid any kind of legalization or argument
    573   // splitting.
    574   if (CC == CallingConv::AMDGPU_KERNEL)
    575     return lowerFormalArgumentsKernel(B, F, VRegs);
    576 
    577   const bool IsGraphics = AMDGPU::isGraphics(CC);
    578   const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
    579 
    580   MachineFunction &MF = B.getMF();
    581   MachineBasicBlock &MBB = B.getMBB();
    582   MachineRegisterInfo &MRI = MF.getRegInfo();
    583   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    584   const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    585   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
    586   const DataLayout &DL = F.getParent()->getDataLayout();
    587 
    588   Info->allocateModuleLDSGlobal(F.getParent());
    589 
    590   SmallVector<CCValAssign, 16> ArgLocs;
    591   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
    592 
    593   if (!IsEntryFunc) {
    594     Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    595     Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
    596                                          &AMDGPU::SGPR_64RegClass);
    597     MBB.addLiveIn(ReturnAddrReg);
    598     B.buildCopy(LiveInReturn, ReturnAddrReg);
    599   }
    600 
    601   if (Info->hasImplicitBufferPtr()) {
    602     Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    603     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    604     CCInfo.AllocateReg(ImplicitBufferPtrReg);
    605   }
    606 
    607   SmallVector<ArgInfo, 32> SplitArgs;
    608   unsigned Idx = 0;
    609   unsigned PSInputNum = 0;
    610 
    611   // Insert the hidden sret parameter if the return value won't fit in the
    612   // return registers.
    613   if (!FLI.CanLowerReturn)
    614     insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);
    615 
    616   for (auto &Arg : F.args()) {
    617     if (DL.getTypeStoreSize(Arg.getType()) == 0)
    618       continue;
    619 
    620     const bool InReg = Arg.hasAttribute(Attribute::InReg);
    621 
    622     // SGPR arguments to functions not implemented.
    623     if (!IsGraphics && InReg)
    624       return false;
    625 
    626     if (Arg.hasAttribute(Attribute::SwiftSelf) ||
    627         Arg.hasAttribute(Attribute::SwiftError) ||
    628         Arg.hasAttribute(Attribute::Nest))
    629       return false;
    630 
    631     if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
    632       const bool ArgUsed = !Arg.use_empty();
    633       bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
    634 
    635       if (!SkipArg) {
    636         Info->markPSInputAllocated(PSInputNum);
    637         if (ArgUsed)
    638           Info->markPSInputEnabled(PSInputNum);
    639       }
    640 
    641       ++PSInputNum;
    642 
    643       if (SkipArg) {
    644         for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
    645           B.buildUndef(VRegs[Idx][I]);
    646 
    647         ++Idx;
    648         continue;
    649       }
    650     }
    651 
    652     ArgInfo OrigArg(VRegs[Idx], Arg);
    653     const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    654     setArgFlags(OrigArg, OrigArgIdx, DL, F);
    655 
    656     splitToValueTypes(OrigArg, SplitArgs, DL, CC);
    657     ++Idx;
    658   }
    659 
    660   // At least one interpolation mode must be enabled or else the GPU will
    661   // hang.
    662   //
    663   // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
    664   // set PSInputAddr, the user wants to enable some bits after the compilation
    665   // based on run-time states. Since we can't know what the final PSInputEna
    666   // will look like, so we shouldn't do anything here and the user should take
    667   // responsibility for the correct programming.
    668   //
    669   // Otherwise, the following restrictions apply:
    670   // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
    671   // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
    672   //   enabled too.
    673   if (CC == CallingConv::AMDGPU_PS) {
    674     if ((Info->getPSInputAddr() & 0x7F) == 0 ||
    675         ((Info->getPSInputAddr() & 0xF) == 0 &&
    676          Info->isPSInputAllocated(11))) {
    677       CCInfo.AllocateReg(AMDGPU::VGPR0);
    678       CCInfo.AllocateReg(AMDGPU::VGPR1);
    679       Info->markPSInputAllocated(0);
    680       Info->markPSInputEnabled(0);
    681     }
    682 
    683     if (Subtarget.isAmdPalOS()) {
    684       // For isAmdPalOS, the user does not enable some bits after compilation
    685       // based on run-time states; the register values being generated here are
    686       // the final ones set in hardware. Therefore we need to apply the
    687       // workaround to PSInputAddr and PSInputEnable together.  (The case where
    688       // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
    689       // set up an input arg for a particular interpolation mode, but nothing
    690       // uses that input arg. Really we should have an earlier pass that removes
    691       // such an arg.)
    692       unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    693       if ((PsInputBits & 0x7F) == 0 ||
    694           ((PsInputBits & 0xF) == 0 &&
    695            (PsInputBits >> 11 & 1)))
    696         Info->markPSInputEnabled(
    697           countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    698     }
    699   }
    700 
    701   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
    702   CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
    703 
    704   if (!MBB.empty())
    705     B.setInstr(*MBB.begin());
    706 
    707   if (!IsEntryFunc) {
    708     // For the fixed ABI, pass workitem IDs in the last argument register.
    709     if (AMDGPUTargetMachine::EnableFixedFunctionABI)
    710       TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
    711   }
    712 
    713   IncomingValueAssigner Assigner(AssignFn);
    714   if (!determineAssignments(Assigner, SplitArgs, CCInfo))
    715     return false;
    716 
    717   FormalArgHandler Handler(B, MRI);
    718   if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
    719     return false;
    720 
    721   uint64_t StackOffset = Assigner.StackOffset;
    722 
    723   if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    724     // Special inputs come after user arguments.
    725     TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
    726   }
    727 
    728   // Start adding system SGPRs.
    729   if (IsEntryFunc) {
    730     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
    731   } else {
    732     if (!Subtarget.enableFlatScratch())
    733       CCInfo.AllocateReg(Info->getScratchRSrcReg());
    734     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
    735   }
    736 
    737   // When we tail call, we need to check if the callee's arguments will fit on
    738   // the caller's stack. So, whenever we lower formal arguments, we should keep
    739   // track of this information, since we might lower a tail call in this
    740   // function later.
    741   Info->setBytesInStackArgArea(StackOffset);
    742 
    743   // Move back to the end of the basic block.
    744   B.setMBB(MBB);
    745 
    746   return true;
    747 }
    748 
    749 bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
    750                                            CCState &CCInfo,
    751                                            SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
    752                                            CallLoweringInfo &Info) const {
    753   MachineFunction &MF = MIRBuilder.getMF();
    754 
    755   const AMDGPUFunctionArgInfo *CalleeArgInfo
    756     = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
    757 
    758   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    759   const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
    760 
    761 
    762   // TODO: Unify with private memory register handling. This is complicated by
    763   // the fact that at least in kernels, the input argument is not necessarily
    764   // in the same location as the input.
    765   AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    766     AMDGPUFunctionArgInfo::DISPATCH_PTR,
    767     AMDGPUFunctionArgInfo::QUEUE_PTR,
    768     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    769     AMDGPUFunctionArgInfo::DISPATCH_ID,
    770     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    771     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    772     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
    773   };
    774 
    775   MachineRegisterInfo &MRI = MF.getRegInfo();
    776 
    777   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    778   const AMDGPULegalizerInfo *LI
    779     = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
    780 
    781   for (auto InputID : InputRegs) {
    782     const ArgDescriptor *OutgoingArg;
    783     const TargetRegisterClass *ArgRC;
    784     LLT ArgTy;
    785 
    786     std::tie(OutgoingArg, ArgRC, ArgTy) =
    787         CalleeArgInfo->getPreloadedValue(InputID);
    788     if (!OutgoingArg)
    789       continue;
    790 
    791     const ArgDescriptor *IncomingArg;
    792     const TargetRegisterClass *IncomingArgRC;
    793     std::tie(IncomingArg, IncomingArgRC, ArgTy) =
    794         CallerArgInfo.getPreloadedValue(InputID);
    795     assert(IncomingArgRC == ArgRC);
    796 
    797     Register InputReg = MRI.createGenericVirtualRegister(ArgTy);
    798 
    799     if (IncomingArg) {
    800       LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    801     } else {
    802       assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
    803       LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    804     }
    805 
    806     if (OutgoingArg->isRegister()) {
    807       ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    808       if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
    809         report_fatal_error("failed to allocate implicit input argument");
    810     } else {
    811       LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    812       return false;
    813     }
    814   }
    815 
    816   // Pack workitem IDs into a single register or pass it as is if already
    817   // packed.
    818   const ArgDescriptor *OutgoingArg;
    819   const TargetRegisterClass *ArgRC;
    820   LLT ArgTy;
    821 
    822   std::tie(OutgoingArg, ArgRC, ArgTy) =
    823       CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
    824   if (!OutgoingArg)
    825     std::tie(OutgoingArg, ArgRC, ArgTy) =
    826         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
    827   if (!OutgoingArg)
    828     std::tie(OutgoingArg, ArgRC, ArgTy) =
    829         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
    830   if (!OutgoingArg)
    831     return false;
    832 
    833   auto WorkitemIDX =
    834       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
    835   auto WorkitemIDY =
    836       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
    837   auto WorkitemIDZ =
    838       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
    839 
    840   const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
    841   const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
    842   const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
    843   const LLT S32 = LLT::scalar(32);
    844 
    845   // If incoming ids are not packed we need to pack them.
    846   // FIXME: Should consider known workgroup size to eliminate known 0 cases.
    847   Register InputReg;
    848   if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    849     InputReg = MRI.createGenericVirtualRegister(S32);
    850     LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
    851                        std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
    852   }
    853 
    854   if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    855     Register Y = MRI.createGenericVirtualRegister(S32);
    856     LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
    857                        std::get<2>(WorkitemIDY));
    858 
    859     Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    860     InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
    861   }
    862 
    863   if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    864     Register Z = MRI.createGenericVirtualRegister(S32);
    865     LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
    866                        std::get<2>(WorkitemIDZ));
    867 
    868     Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    869     InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
    870   }
    871 
    872   if (!InputReg) {
    873     InputReg = MRI.createGenericVirtualRegister(S32);
    874 
    875     // Workitem ids are already packed, any of present incoming arguments will
    876     // carry all required fields.
    877     ArgDescriptor IncomingArg = ArgDescriptor::createArg(
    878       IncomingArgX ? *IncomingArgX :
    879         IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    880     LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
    881                        &AMDGPU::VGPR_32RegClass, S32);
    882   }
    883 
    884   if (OutgoingArg->isRegister()) {
    885     ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    886     if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
    887       report_fatal_error("failed to allocate implicit input argument");
    888   } else {
    889     LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    890     return false;
    891   }
    892 
    893   return true;
    894 }
    895 
    896 /// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
    897 /// CC.
    898 static std::pair<CCAssignFn *, CCAssignFn *>
    899 getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
    900   return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
    901 }
    902 
    903 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
    904                               bool IsTailCall) {
    905   return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::SI_CALL;
    906 }
    907 
    908 // Add operands to call instruction to track the callee.
    909 static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
    910                                   MachineIRBuilder &MIRBuilder,
    911                                   AMDGPUCallLowering::CallLoweringInfo &Info) {
    912   if (Info.Callee.isReg()) {
    913     CallInst.addReg(Info.Callee.getReg());
    914     CallInst.addImm(0);
    915   } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    916     // The call lowering lightly assumed we can directly encode a call target in
    917     // the instruction, which is not the case. Materialize the address here.
    918     const GlobalValue *GV = Info.Callee.getGlobal();
    919     auto Ptr = MIRBuilder.buildGlobalValue(
    920       LLT::pointer(GV->getAddressSpace(), 64), GV);
    921     CallInst.addReg(Ptr.getReg(0));
    922     CallInst.add(Info.Callee);
    923   } else
    924     return false;
    925 
    926   return true;
    927 }
    928 
    929 bool AMDGPUCallLowering::doCallerAndCalleePassArgsTheSameWay(
    930     CallLoweringInfo &Info, MachineFunction &MF,
    931     SmallVectorImpl<ArgInfo> &InArgs) const {
    932   const Function &CallerF = MF.getFunction();
    933   CallingConv::ID CalleeCC = Info.CallConv;
    934   CallingConv::ID CallerCC = CallerF.getCallingConv();
    935 
    936   // If the calling conventions match, then everything must be the same.
    937   if (CalleeCC == CallerCC)
    938     return true;
    939 
    940   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    941 
    942   // Make sure that the caller and callee preserve all of the same registers.
    943   auto TRI = ST.getRegisterInfo();
    944 
    945   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
    946   const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    947   if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
    948     return false;
    949 
    950   // Check if the caller and callee will handle arguments in the same way.
    951   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
    952   CCAssignFn *CalleeAssignFnFixed;
    953   CCAssignFn *CalleeAssignFnVarArg;
    954   std::tie(CalleeAssignFnFixed, CalleeAssignFnVarArg) =
    955       getAssignFnsForCC(CalleeCC, TLI);
    956 
    957   CCAssignFn *CallerAssignFnFixed;
    958   CCAssignFn *CallerAssignFnVarArg;
    959   std::tie(CallerAssignFnFixed, CallerAssignFnVarArg) =
    960       getAssignFnsForCC(CallerCC, TLI);
    961 
    962   // FIXME: We are not accounting for potential differences in implicitly passed
    963   // inputs, but only the fixed ABI is supported now anyway.
    964   IncomingValueAssigner CalleeAssigner(CalleeAssignFnFixed,
    965                                        CalleeAssignFnVarArg);
    966   IncomingValueAssigner CallerAssigner(CallerAssignFnFixed,
    967                                        CallerAssignFnVarArg);
    968   return resultsCompatible(Info, MF, InArgs, CalleeAssigner, CallerAssigner);
    969 }
    970 
    971 bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
    972     CallLoweringInfo &Info, MachineFunction &MF,
    973     SmallVectorImpl<ArgInfo> &OutArgs) const {
    974   // If there are no outgoing arguments, then we are done.
    975   if (OutArgs.empty())
    976     return true;
    977 
    978   const Function &CallerF = MF.getFunction();
    979   CallingConv::ID CalleeCC = Info.CallConv;
    980   CallingConv::ID CallerCC = CallerF.getCallingConv();
    981   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
    982 
    983   CCAssignFn *AssignFnFixed;
    984   CCAssignFn *AssignFnVarArg;
    985   std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
    986 
    987   // We have outgoing arguments. Make sure that we can tail call with them.
    988   SmallVector<CCValAssign, 16> OutLocs;
    989   CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
    990   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
    991 
    992   if (!determineAssignments(Assigner, OutArgs, OutInfo)) {
    993     LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
    994     return false;
    995   }
    996 
    997   // Make sure that they can fit on the caller's stack.
    998   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
    999   if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
   1000     LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
   1001     return false;
   1002   }
   1003 
   1004   // Verify that the parameters in callee-saved registers match.
   1005   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   1006   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   1007   const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
   1008   MachineRegisterInfo &MRI = MF.getRegInfo();
   1009   return parametersInCSRMatch(MRI, CallerPreservedMask, OutLocs, OutArgs);
   1010 }
   1011 
   1012 /// Return true if the calling convention is one that we can guarantee TCO for.
   1013 static bool canGuaranteeTCO(CallingConv::ID CC) {
   1014   return CC == CallingConv::Fast;
   1015 }
   1016 
   1017 /// Return true if we might ever do TCO for calls with this calling convention.
   1018 static bool mayTailCallThisCC(CallingConv::ID CC) {
   1019   switch (CC) {
   1020   case CallingConv::C:
   1021   case CallingConv::AMDGPU_Gfx:
   1022     return true;
   1023   default:
   1024     return canGuaranteeTCO(CC);
   1025   }
   1026 }
   1027 
   1028 bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
   1029     MachineIRBuilder &B, CallLoweringInfo &Info,
   1030     SmallVectorImpl<ArgInfo> &InArgs, SmallVectorImpl<ArgInfo> &OutArgs) const {
   1031   // Must pass all target-independent checks in order to tail call optimize.
   1032   if (!Info.IsTailCall)
   1033     return false;
   1034 
   1035   MachineFunction &MF = B.getMF();
   1036   const Function &CallerF = MF.getFunction();
   1037   CallingConv::ID CalleeCC = Info.CallConv;
   1038   CallingConv::ID CallerCC = CallerF.getCallingConv();
   1039 
   1040   const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
   1041   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
   1042   // Kernels aren't callable, and don't have a live in return address so it
   1043   // doesn't make sense to do a tail call with entry functions.
   1044   if (!CallerPreserved)
   1045     return false;
   1046 
   1047   if (!mayTailCallThisCC(CalleeCC)) {
   1048     LLVM_DEBUG(dbgs() << "... Calling convention cannot be tail called.\n");
   1049     return false;
   1050   }
   1051 
   1052   if (any_of(CallerF.args(), [](const Argument &A) {
   1053         return A.hasByValAttr() || A.hasSwiftErrorAttr();
   1054       })) {
   1055     LLVM_DEBUG(dbgs() << "... Cannot tail call from callers with byval "
   1056                          "or swifterror arguments\n");
   1057     return false;
   1058   }
   1059 
   1060   // If we have -tailcallopt, then we're done.
   1061   if (MF.getTarget().Options.GuaranteedTailCallOpt)
   1062     return canGuaranteeTCO(CalleeCC) && CalleeCC == CallerF.getCallingConv();
   1063 
   1064   // Verify that the incoming and outgoing arguments from the callee are
   1065   // safe to tail call.
   1066   if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
   1067     LLVM_DEBUG(
   1068         dbgs()
   1069         << "... Caller and callee have incompatible calling conventions.\n");
   1070     return false;
   1071   }
   1072 
   1073   if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
   1074     return false;
   1075 
   1076   LLVM_DEBUG(dbgs() << "... Call is eligible for tail call optimization.\n");
   1077   return true;
   1078 }
   1079 
   1080 // Insert outgoing implicit arguments for a call, by inserting copies to the
   1081 // implicit argument registers and adding the necessary implicit uses to the
   1082 // call instruction.
   1083 void AMDGPUCallLowering::handleImplicitCallArguments(
   1084     MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
   1085     const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
   1086     ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
   1087   if (!ST.enableFlatScratch()) {
   1088     // Insert copies for the SRD. In the HSA case, this should be an identity
   1089     // copy.
   1090     auto ScratchRSrcReg =
   1091         MIRBuilder.buildCopy(LLT::vector(4, 32), FuncInfo.getScratchRSrcReg());
   1092     MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
   1093     CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
   1094   }
   1095 
   1096   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
   1097     MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
   1098     CallInst.addReg(ArgReg.first, RegState::Implicit);
   1099   }
   1100 }
   1101 
   1102 bool AMDGPUCallLowering::lowerTailCall(
   1103     MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
   1104     SmallVectorImpl<ArgInfo> &OutArgs) const {
   1105   MachineFunction &MF = MIRBuilder.getMF();
   1106   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   1107   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
   1108   const Function &F = MF.getFunction();
   1109   MachineRegisterInfo &MRI = MF.getRegInfo();
   1110   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   1111 
   1112   // True when we're tail calling, but without -tailcallopt.
   1113   bool IsSibCall = !MF.getTarget().Options.GuaranteedTailCallOpt;
   1114 
   1115   // Find out which ABI gets to decide where things go.
   1116   CallingConv::ID CalleeCC = Info.CallConv;
   1117   CCAssignFn *AssignFnFixed;
   1118   CCAssignFn *AssignFnVarArg;
   1119   std::tie(AssignFnFixed, AssignFnVarArg) = getAssignFnsForCC(CalleeCC, TLI);
   1120 
   1121   MachineInstrBuilder CallSeqStart;
   1122   if (!IsSibCall)
   1123     CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
   1124 
   1125   unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
   1126   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
   1127   if (!addCallTargetOperands(MIB, MIRBuilder, Info))
   1128     return false;
   1129 
   1130   // Byte offset for the tail call. When we are sibcalling, this will always
   1131   // be 0.
   1132   MIB.addImm(0);
   1133 
   1134   // Tell the call which registers are clobbered.
   1135   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   1136   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
   1137   MIB.addRegMask(Mask);
   1138 
   1139   // FPDiff is the byte offset of the call's argument area from the callee's.
   1140   // Stores to callee stack arguments will be placed in FixedStackSlots offset
   1141   // by this amount for a tail call. In a sibling call it must be 0 because the
   1142   // caller will deallocate the entire stack and the callee still expects its
   1143   // arguments to begin at SP+0.
   1144   int FPDiff = 0;
   1145 
   1146   // This will be 0 for sibcalls, potentially nonzero for tail calls produced
   1147   // by -tailcallopt. For sibcalls, the memory operands for the call are
   1148   // already available in the caller's incoming argument space.
   1149   unsigned NumBytes = 0;
   1150   if (!IsSibCall) {
   1151     // We aren't sibcalling, so we need to compute FPDiff. We need to do this
   1152     // before handling assignments, because FPDiff must be known for memory
   1153     // arguments.
   1154     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
   1155     SmallVector<CCValAssign, 16> OutLocs;
   1156     CCState OutInfo(CalleeCC, false, MF, OutLocs, F.getContext());
   1157 
   1158     // FIXME: Not accounting for callee implicit inputs
   1159     OutgoingValueAssigner CalleeAssigner(AssignFnFixed, AssignFnVarArg);
   1160     if (!determineAssignments(CalleeAssigner, OutArgs, OutInfo))
   1161       return false;
   1162 
   1163     // The callee will pop the argument stack as a tail call. Thus, we must
   1164     // keep it 16-byte aligned.
   1165     NumBytes = alignTo(OutInfo.getNextStackOffset(), ST.getStackAlignment());
   1166 
   1167     // FPDiff will be negative if this tail call requires more space than we
   1168     // would automatically have in our incoming argument space. Positive if we
   1169     // actually shrink the stack.
   1170     FPDiff = NumReusableBytes - NumBytes;
   1171 
   1172     // The stack pointer must be 16-byte aligned at all times it's used for a
   1173     // memory operation, which in practice means at *all* times and in
   1174     // particular across call boundaries. Therefore our own arguments started at
   1175     // a 16-byte aligned SP and the delta applied for the tail call should
   1176     // satisfy the same constraint.
   1177     assert(isAligned(ST.getStackAlignment(), FPDiff) &&
   1178            "unaligned stack on tail call");
   1179   }
   1180 
   1181   SmallVector<CCValAssign, 16> ArgLocs;
   1182   CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
   1183 
   1184   // We could pass MIB and directly add the implicit uses to the call
   1185   // now. However, as an aesthetic choice, place implicit argument operands
   1186   // after the ordinary user argument registers.
   1187   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
   1188 
   1189   if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
   1190       Info.CallConv != CallingConv::AMDGPU_Gfx) {
   1191     // With a fixed ABI, allocate fixed registers before user arguments.
   1192     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
   1193       return false;
   1194   }
   1195 
   1196   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
   1197 
   1198   if (!determineAssignments(Assigner, OutArgs, CCInfo))
   1199     return false;
   1200 
   1201   // Do the actual argument marshalling.
   1202   AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, true, FPDiff);
   1203   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
   1204     return false;
   1205 
   1206   handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
   1207 
   1208   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
   1209   // sequence start and end here.
   1210   if (!IsSibCall) {
   1211     MIB->getOperand(1).setImm(FPDiff);
   1212     CallSeqStart.addImm(NumBytes).addImm(0);
   1213     // End the call sequence *before* emitting the call. Normally, we would
   1214     // tidy the frame up after the call. However, here, we've laid out the
   1215     // parameters so that when SP is reset, they will be in the correct
   1216     // location.
   1217     MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN).addImm(NumBytes).addImm(0);
   1218   }
   1219 
   1220   // Now we can add the actual call instruction to the correct basic block.
   1221   MIRBuilder.insertInstr(MIB);
   1222 
   1223   // If Callee is a reg, since it is used by a target specific
   1224   // instruction, it must have a register class matching the
   1225   // constraint of that instruction.
   1226 
   1227   // FIXME: We should define regbankselectable call instructions to handle
   1228   // divergent call targets.
   1229   if (MIB->getOperand(0).isReg()) {
   1230     MIB->getOperand(0).setReg(constrainOperandRegClass(
   1231         MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
   1232         MIB->getDesc(), MIB->getOperand(0), 0));
   1233   }
   1234 
   1235   MF.getFrameInfo().setHasTailCall();
   1236   Info.LoweredTailCall = true;
   1237   return true;
   1238 }
   1239 
   1240 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   1241                                    CallLoweringInfo &Info) const {
   1242   if (Info.IsVarArg) {
   1243     LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
   1244     return false;
   1245   }
   1246 
   1247   MachineFunction &MF = MIRBuilder.getMF();
   1248   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   1249   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   1250 
   1251   const Function &F = MF.getFunction();
   1252   MachineRegisterInfo &MRI = MF.getRegInfo();
   1253   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   1254   const DataLayout &DL = F.getParent()->getDataLayout();
   1255   CallingConv::ID CallConv = F.getCallingConv();
   1256 
   1257   if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
   1258       CallConv != CallingConv::AMDGPU_Gfx) {
   1259     LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
   1260     return false;
   1261   }
   1262 
   1263   if (AMDGPU::isShader(CallConv)) {
   1264     LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
   1265     return false;
   1266   }
   1267 
   1268   SmallVector<ArgInfo, 8> OutArgs;
   1269   for (auto &OrigArg : Info.OrigArgs)
   1270     splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);
   1271 
   1272   SmallVector<ArgInfo, 8> InArgs;
   1273   if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
   1274     splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
   1275 
   1276   // If we can lower as a tail call, do that instead.
   1277   bool CanTailCallOpt =
   1278       isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs, OutArgs);
   1279 
   1280   // We must emit a tail call if we have musttail.
   1281   if (Info.IsMustTailCall && !CanTailCallOpt) {
   1282     LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
   1283     return false;
   1284   }
   1285 
   1286   if (CanTailCallOpt)
   1287     return lowerTailCall(MIRBuilder, Info, OutArgs);
   1288 
   1289   // Find out which ABI gets to decide where things go.
   1290   CCAssignFn *AssignFnFixed;
   1291   CCAssignFn *AssignFnVarArg;
   1292   std::tie(AssignFnFixed, AssignFnVarArg) =
   1293       getAssignFnsForCC(Info.CallConv, TLI);
   1294 
   1295   MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
   1296     .addImm(0)
   1297     .addImm(0);
   1298 
   1299   // Create a temporarily-floating call instruction so we can add the implicit
   1300   // uses of arg registers.
   1301   unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
   1302 
   1303   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
   1304   MIB.addDef(TRI->getReturnAddressReg(MF));
   1305 
   1306   if (!addCallTargetOperands(MIB, MIRBuilder, Info))
   1307     return false;
   1308 
   1309   // Tell the call which registers are clobbered.
   1310   const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
   1311   MIB.addRegMask(Mask);
   1312 
   1313   SmallVector<CCValAssign, 16> ArgLocs;
   1314   CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
   1315 
   1316   // We could pass MIB and directly add the implicit uses to the call
   1317   // now. However, as an aesthetic choice, place implicit argument operands
   1318   // after the ordinary user argument registers.
   1319   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
   1320 
   1321   if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
   1322       Info.CallConv != CallingConv::AMDGPU_Gfx) {
   1323     // With a fixed ABI, allocate fixed registers before user arguments.
   1324     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
   1325       return false;
   1326   }
   1327 
   1328   // Do the actual argument marshalling.
   1329   SmallVector<Register, 8> PhysRegs;
   1330 
   1331   OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
   1332   if (!determineAssignments(Assigner, OutArgs, CCInfo))
   1333     return false;
   1334 
   1335   AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
   1336   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
   1337     return false;
   1338 
   1339   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   1340 
   1341   handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
   1342 
   1343   // Get a count of how many bytes are to be pushed on the stack.
   1344   unsigned NumBytes = CCInfo.getNextStackOffset();
   1345 
   1346   // If Callee is a reg, since it is used by a target specific
   1347   // instruction, it must have a register class matching the
   1348   // constraint of that instruction.
   1349 
   1350   // FIXME: We should define regbankselectable call instructions to handle
   1351   // divergent call targets.
   1352   if (MIB->getOperand(1).isReg()) {
   1353     MIB->getOperand(1).setReg(constrainOperandRegClass(
   1354         MF, *TRI, MRI, *ST.getInstrInfo(),
   1355         *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
   1356         1));
   1357   }
   1358 
   1359   // Now we can add the actual call instruction to the correct position.
   1360   MIRBuilder.insertInstr(MIB);
   1361 
   1362   // Finally we can copy the returned value back into its virtual-register. In
   1363   // symmetry with the arguments, the physical register must be an
   1364   // implicit-define of the call instruction.
   1365   if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
   1366     CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
   1367                                                       Info.IsVarArg);
   1368     OutgoingValueAssigner Assigner(RetAssignFn);
   1369     CallReturnHandler Handler(MIRBuilder, MRI, MIB);
   1370     if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
   1371                                        Info.CallConv, Info.IsVarArg))
   1372       return false;
   1373   }
   1374 
   1375   uint64_t CalleePopBytes = NumBytes;
   1376 
   1377   MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
   1378             .addImm(0)
   1379             .addImm(CalleePopBytes);
   1380 
   1381   if (!Info.CanLowerReturn) {
   1382     insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
   1383                     Info.DemoteRegister, Info.DemoteStackIndex);
   1384   }
   1385 
   1386   return true;
   1387 }
   1388