Home | History | Annotate | Line # | Download | only in AMDGPU
      1 //===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 /// \file This pass adds target attributes to functions which use intrinsics
     10 /// which will impact calling convention lowering.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "AMDGPU.h"
     15 #include "GCNSubtarget.h"
     16 #include "llvm/Analysis/CallGraph.h"
     17 #include "llvm/Analysis/CallGraphSCCPass.h"
     18 #include "llvm/CodeGen/TargetPassConfig.h"
     19 #include "llvm/IR/IntrinsicsAMDGPU.h"
     20 #include "llvm/IR/IntrinsicsR600.h"
     21 #include "llvm/Target/TargetMachine.h"
     22 
     23 #define DEBUG_TYPE "amdgpu-annotate-kernel-features"
     24 
     25 using namespace llvm;
     26 
     27 namespace {
     28 static constexpr StringLiteral ImplicitAttrNames[] = {
     29     // X ids unnecessarily propagated to kernels.
     30     "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
     31     "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
     32     "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
     33     "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
     34     "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};
     35 
     36 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
     37 private:
     38   const TargetMachine *TM = nullptr;
     39   SmallVector<CallGraphNode*, 8> NodeList;
     40 
     41   bool addFeatureAttributes(Function &F);
     42   bool processUniformWorkGroupAttribute();
     43   bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
     44 
     45 public:
     46   static char ID;
     47 
     48   AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
     49 
     50   bool doInitialization(CallGraph &CG) override;
     51   bool runOnSCC(CallGraphSCC &SCC) override;
     52 
     53   StringRef getPassName() const override {
     54     return "AMDGPU Annotate Kernel Features";
     55   }
     56 
     57   void getAnalysisUsage(AnalysisUsage &AU) const override {
     58     AU.setPreservesAll();
     59     CallGraphSCCPass::getAnalysisUsage(AU);
     60   }
     61 
     62   static bool visitConstantExpr(const ConstantExpr *CE);
     63   static bool visitConstantExprsRecursively(
     64     const Constant *EntryC,
     65     SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
     66     bool HasApertureRegs);
     67 };
     68 
     69 } // end anonymous namespace
     70 
     71 char AMDGPUAnnotateKernelFeatures::ID = 0;
     72 
     73 char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
     74 
     75 INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
     76                 "Add AMDGPU function attributes", false, false)
     77 
     78 
     79 // The queue ptr is only needed when casting to flat, not from it.
     80 static bool castRequiresQueuePtr(unsigned SrcAS) {
     81   return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
     82 }
     83 
     84 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
     85   return castRequiresQueuePtr(ASC->getSrcAddressSpace());
     86 }
     87 
     88 static bool isDSAddress(const Constant *C) {
     89   const GlobalValue *GV = dyn_cast<GlobalValue>(C);
     90   if (!GV)
     91     return false;
     92   unsigned AS = GV->getAddressSpace();
     93   return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
     94 }
     95 
     96 bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
     97   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
     98     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
     99     return castRequiresQueuePtr(SrcAS);
    100   }
    101 
    102   return false;
    103 }
    104 
    105 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
    106   const Constant *EntryC,
    107   SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
    108   bool IsFunc, bool HasApertureRegs) {
    109 
    110   if (!ConstantExprVisited.insert(EntryC).second)
    111     return false;
    112 
    113   SmallVector<const Constant *, 16> Stack;
    114   Stack.push_back(EntryC);
    115 
    116   while (!Stack.empty()) {
    117     const Constant *C = Stack.pop_back_val();
    118 
    119     // We need to trap on DS globals in non-entry functions.
    120     if (IsFunc && isDSAddress(C))
    121       return true;
    122 
    123     // Check this constant expression.
    124     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
    125       if (!HasApertureRegs && visitConstantExpr(CE))
    126         return true;
    127     }
    128 
    129     // Visit all sub-expressions.
    130     for (const Use &U : C->operands()) {
    131       const auto *OpC = dyn_cast<Constant>(U);
    132       if (!OpC)
    133         continue;
    134 
    135       if (!ConstantExprVisited.insert(OpC).second)
    136         continue;
    137 
    138       Stack.push_back(OpC);
    139     }
    140   }
    141 
    142   return false;
    143 }
    144 
    145 // We do not need to note the x workitem or workgroup id because they are always
    146 // initialized.
    147 //
    148 // TODO: We should not add the attributes if the known compile time workgroup
    149 // size is 1 for y/z.
    150 static StringRef intrinsicToAttrName(Intrinsic::ID ID,
    151                                      bool &NonKernelOnly,
    152                                      bool &IsQueuePtr) {
    153   switch (ID) {
    154   case Intrinsic::amdgcn_workitem_id_x:
    155     NonKernelOnly = true;
    156     return "amdgpu-work-item-id-x";
    157   case Intrinsic::amdgcn_workgroup_id_x:
    158     NonKernelOnly = true;
    159     return "amdgpu-work-group-id-x";
    160   case Intrinsic::amdgcn_workitem_id_y:
    161   case Intrinsic::r600_read_tidig_y:
    162     return "amdgpu-work-item-id-y";
    163   case Intrinsic::amdgcn_workitem_id_z:
    164   case Intrinsic::r600_read_tidig_z:
    165     return "amdgpu-work-item-id-z";
    166   case Intrinsic::amdgcn_workgroup_id_y:
    167   case Intrinsic::r600_read_tgid_y:
    168     return "amdgpu-work-group-id-y";
    169   case Intrinsic::amdgcn_workgroup_id_z:
    170   case Intrinsic::r600_read_tgid_z:
    171     return "amdgpu-work-group-id-z";
    172   case Intrinsic::amdgcn_dispatch_ptr:
    173     return "amdgpu-dispatch-ptr";
    174   case Intrinsic::amdgcn_dispatch_id:
    175     return "amdgpu-dispatch-id";
    176   case Intrinsic::amdgcn_kernarg_segment_ptr:
    177     return "amdgpu-kernarg-segment-ptr";
    178   case Intrinsic::amdgcn_implicitarg_ptr:
    179     return "amdgpu-implicitarg-ptr";
    180   case Intrinsic::amdgcn_queue_ptr:
    181   case Intrinsic::amdgcn_is_shared:
    182   case Intrinsic::amdgcn_is_private:
    183     // TODO: Does not require queue ptr on gfx9+
    184   case Intrinsic::trap:
    185   case Intrinsic::debugtrap:
    186     IsQueuePtr = true;
    187     return "amdgpu-queue-ptr";
    188   default:
    189     return "";
    190   }
    191 }
    192 
    193 static bool handleAttr(Function &Parent, const Function &Callee,
    194                        StringRef Name) {
    195   if (Callee.hasFnAttribute(Name)) {
    196     Parent.addFnAttr(Name);
    197     return true;
    198   }
    199   return false;
    200 }
    201 
    202 static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
    203                                    bool &NeedQueuePtr) {
    204   if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    205     NeedQueuePtr = true;
    206 
    207   for (StringRef AttrName : ImplicitAttrNames)
    208     handleAttr(Parent, Callee, AttrName);
    209 }
    210 
    211 bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
    212   bool Changed = false;
    213 
    214   for (auto *Node : reverse(NodeList)) {
    215     Function *Caller = Node->getFunction();
    216 
    217     for (auto I : *Node) {
    218       Function *Callee = std::get<1>(I)->getFunction();
    219       if (Callee)
    220         Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    221     }
    222   }
    223 
    224   return Changed;
    225 }
    226 
    227 bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    228        Function &Caller, Function &Callee) {
    229 
    230   // Check for externally defined function
    231   if (!Callee.hasExactDefinition()) {
    232     Callee.addFnAttr("uniform-work-group-size", "false");
    233     if (!Caller.hasFnAttribute("uniform-work-group-size"))
    234       Caller.addFnAttr("uniform-work-group-size", "false");
    235 
    236     return true;
    237   }
    238   // Check if the Caller has the attribute
    239   if (Caller.hasFnAttribute("uniform-work-group-size")) {
    240     // Check if the value of the attribute is true
    241     if (Caller.getFnAttribute("uniform-work-group-size")
    242         .getValueAsString().equals("true")) {
    243       // Propagate the attribute to the Callee, if it does not have it
    244       if (!Callee.hasFnAttribute("uniform-work-group-size")) {
    245         Callee.addFnAttr("uniform-work-group-size", "true");
    246         return true;
    247       }
    248     } else {
    249       Callee.addFnAttr("uniform-work-group-size", "false");
    250       return true;
    251     }
    252   } else {
    253     // If the attribute is absent, set it as false
    254     Caller.addFnAttr("uniform-work-group-size", "false");
    255     Callee.addFnAttr("uniform-work-group-size", "false");
    256     return true;
    257   }
    258   return false;
    259 }
    260 
    261 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
    262   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
    263   bool HasApertureRegs = ST.hasApertureRegs();
    264   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
    265 
    266   bool HaveStackObjects = false;
    267   bool Changed = false;
    268   bool NeedQueuePtr = false;
    269   bool HaveCall = false;
    270   bool HasIndirectCall = false;
    271   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
    272   CallingConv::ID CC = F.getCallingConv();
    273   bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
    274 
    275   // If this function hasAddressTaken() = true
    276   // then add all attributes corresponding to the implicit args.
    277   if (CallingConvSupportsAllImplicits &&
    278       F.hasAddressTaken(nullptr, true, true, true)) {
    279     for (StringRef AttrName : ImplicitAttrNames) {
    280       F.addFnAttr(AttrName);
    281     }
    282     Changed = true;
    283   }
    284 
    285   for (BasicBlock &BB : F) {
    286     for (Instruction &I : BB) {
    287       if (isa<AllocaInst>(I)) {
    288         HaveStackObjects = true;
    289         continue;
    290       }
    291 
    292       if (auto *CB = dyn_cast<CallBase>(&I)) {
    293         const Function *Callee =
    294             dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
    295 
    296         // Note the occurence of indirect call.
    297         if (!Callee) {
    298           if (!CB->isInlineAsm()) {
    299             HasIndirectCall = true;
    300             HaveCall = true;
    301           }
    302           continue;
    303         }
    304 
    305         Intrinsic::ID IID = Callee->getIntrinsicID();
    306         if (IID == Intrinsic::not_intrinsic) {
    307           HaveCall = true;
    308           copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
    309           Changed = true;
    310         } else {
    311           bool NonKernelOnly = false;
    312 
    313           if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
    314             F.addFnAttr("amdgpu-kernarg-segment-ptr");
    315           } else {
    316             StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
    317                                                      NeedQueuePtr);
    318             if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
    319               F.addFnAttr(AttrName);
    320               Changed = true;
    321             }
    322           }
    323         }
    324       }
    325 
    326       if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
    327         continue;
    328 
    329       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
    330         if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
    331           NeedQueuePtr = true;
    332           continue;
    333         }
    334       }
    335 
    336       for (const Use &U : I.operands()) {
    337         const auto *OpC = dyn_cast<Constant>(U);
    338         if (!OpC)
    339           continue;
    340 
    341         if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
    342                                           HasApertureRegs)) {
    343           NeedQueuePtr = true;
    344           break;
    345         }
    346       }
    347     }
    348   }
    349 
    350   if (NeedQueuePtr) {
    351     F.addFnAttr("amdgpu-queue-ptr");
    352     Changed = true;
    353   }
    354 
    355   // TODO: We could refine this to captured pointers that could possibly be
    356   // accessed by flat instructions. For now this is mostly a poor way of
    357   // estimating whether there are calls before argument lowering.
    358   if (!IsFunc && HaveCall) {
    359     F.addFnAttr("amdgpu-calls");
    360     Changed = true;
    361   }
    362 
    363   if (HaveStackObjects) {
    364     F.addFnAttr("amdgpu-stack-objects");
    365     Changed = true;
    366   }
    367 
    368   // This pass cannot copy attributes from callees to callers
    369   // if there is an indirect call and in thus such cases,
    370   // hasAddressTaken() would be false for kernels and functions
    371   // making an indirect call (if they are themselves not indirectly called).
    372   // We must tag all such kernels/functions with all implicits attributes
    373   // for correctness.
    374   // e.g.
    375   // 1. Kernel K1 makes an indirect call to function F1.
    376   //    Without detecting an indirect call in K1, this pass will not
    377   //    add all implicit args to K1 (which is incorrect).
    378   // 2. Kernel K1 makes direct call to F1 which makes indirect call to function
    379   // F2.
    380   //    Without detecting an indirect call in F1 (whose hasAddressTaken() is
    381   //    false), the pass will not add all implicit args to F1 (which is
    382   //    essential for correctness).
    383   if (CallingConvSupportsAllImplicits && HasIndirectCall) {
    384     for (StringRef AttrName : ImplicitAttrNames) {
    385       F.addFnAttr(AttrName);
    386     }
    387     Changed = true;
    388   }
    389 
    390   return Changed;
    391 }
    392 
    393 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
    394   bool Changed = false;
    395 
    396   for (CallGraphNode *I : SCC) {
    397     // Build a list of CallGraphNodes from most number of uses to least
    398     if (I->getNumReferences())
    399       NodeList.push_back(I);
    400     else {
    401       processUniformWorkGroupAttribute();
    402       NodeList.clear();
    403     }
    404 
    405     Function *F = I->getFunction();
    406     // Ignore functions with graphics calling conventions, these are currently
    407     // not allowed to have kernel arguments.
    408     if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
    409       continue;
    410     // Add feature attributes
    411     Changed |= addFeatureAttributes(*F);
    412   }
    413 
    414   return Changed;
    415 }
    416 
    417 bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
    418   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    419   if (!TPC)
    420     report_fatal_error("TargetMachine is required");
    421 
    422   TM = &TPC->getTM<TargetMachine>();
    423   return false;
    424 }
    425 
    426 Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
    427   return new AMDGPUAnnotateKernelFeatures();
    428 }
    429