Home | History | Annotate | Line # | Download | only in AArch64
      1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements the AArch64 specific subclass of TargetSubtarget.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "AArch64Subtarget.h"
     14 
     15 #include "AArch64.h"
     16 #include "AArch64InstrInfo.h"
     17 #include "AArch64PBQPRegAlloc.h"
     18 #include "AArch64TargetMachine.h"
     19 #include "GISel/AArch64CallLowering.h"
     20 #include "GISel/AArch64LegalizerInfo.h"
     21 #include "GISel/AArch64RegisterBankInfo.h"
     22 #include "MCTargetDesc/AArch64AddressingModes.h"
     23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
     24 #include "llvm/CodeGen/MachineScheduler.h"
     25 #include "llvm/IR/GlobalValue.h"
     26 #include "llvm/Support/TargetParser.h"
     27 
     28 using namespace llvm;
     29 
     30 #define DEBUG_TYPE "aarch64-subtarget"
     31 
     32 #define GET_SUBTARGETINFO_CTOR
     33 #define GET_SUBTARGETINFO_TARGET_DESC
     34 #include "AArch64GenSubtargetInfo.inc"
     35 
     36 static cl::opt<bool>
     37 EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
     38                      "converter pass"), cl::init(true), cl::Hidden);
     39 
     40 // If OS supports TBI, use this flag to enable it.
     41 static cl::opt<bool>
     42 UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
     43                          "an address is ignored"), cl::init(false), cl::Hidden);
     44 
     45 static cl::opt<bool>
     46     UseNonLazyBind("aarch64-enable-nonlazybind",
     47                    cl::desc("Call nonlazybind functions via direct GOT load"),
     48                    cl::init(false), cl::Hidden);
     49 
     50 static cl::opt<unsigned> SVEVectorBitsMax(
     51     "aarch64-sve-vector-bits-max",
     52     cl::desc("Assume SVE vector registers are at most this big, "
     53              "with zero meaning no maximum size is assumed."),
     54     cl::init(0), cl::Hidden);
     55 
     56 static cl::opt<unsigned> SVEVectorBitsMin(
     57     "aarch64-sve-vector-bits-min",
     58     cl::desc("Assume SVE vector registers are at least this big, "
     59              "with zero meaning no minimum size is assumed."),
     60     cl::init(0), cl::Hidden);
     61 
     62 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true),
     63                            cl::desc("Enable the use of AA during codegen."));
     64 
     65 AArch64Subtarget &
     66 AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
     67                                                   StringRef CPUString) {
     68   // Determine default and user-specified characteristics
     69 
     70   if (CPUString.empty())
     71     CPUString = "generic";
     72 
     73   ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS);
     74   initializeProperties();
     75 
     76   return *this;
     77 }
     78 
     79 void AArch64Subtarget::initializeProperties() {
     80   // Initialize CPU specific properties. We should add a tablegen feature for
     81   // this in the future so we can specify it together with the subtarget
     82   // features.
     83   switch (ARMProcFamily) {
     84   case Others:
     85     break;
     86   case Carmel:
     87     CacheLineSize = 64;
     88     break;
     89   case CortexA35:
     90     break;
     91   case CortexA53:
     92   case CortexA55:
     93     PrefFunctionLogAlignment = 4;
     94     break;
     95   case CortexA57:
     96     MaxInterleaveFactor = 4;
     97     PrefFunctionLogAlignment = 4;
     98     break;
     99   case CortexA65:
    100     PrefFunctionLogAlignment = 3;
    101     break;
    102   case CortexA72:
    103   case CortexA73:
    104   case CortexA75:
    105   case CortexA76:
    106   case CortexA77:
    107   case CortexA78:
    108   case CortexA78C:
    109   case CortexR82:
    110   case CortexX1:
    111     PrefFunctionLogAlignment = 4;
    112     break;
    113   case A64FX:
    114     CacheLineSize = 256;
    115     PrefFunctionLogAlignment = 3;
    116     PrefLoopLogAlignment = 2;
    117     MaxInterleaveFactor = 4;
    118     PrefetchDistance = 128;
    119     MinPrefetchStride = 1024;
    120     MaxPrefetchIterationsAhead = 4;
    121     break;
    122   case AppleA7:
    123   case AppleA10:
    124   case AppleA11:
    125   case AppleA12:
    126   case AppleA13:
    127   case AppleA14:
    128     CacheLineSize = 64;
    129     PrefetchDistance = 280;
    130     MinPrefetchStride = 2048;
    131     MaxPrefetchIterationsAhead = 3;
    132     break;
    133   case ExynosM3:
    134     MaxInterleaveFactor = 4;
    135     MaxJumpTableSize = 20;
    136     PrefFunctionLogAlignment = 5;
    137     PrefLoopLogAlignment = 4;
    138     break;
    139   case Falkor:
    140     MaxInterleaveFactor = 4;
    141     // FIXME: remove this to enable 64-bit SLP if performance looks good.
    142     MinVectorRegisterBitWidth = 128;
    143     CacheLineSize = 128;
    144     PrefetchDistance = 820;
    145     MinPrefetchStride = 2048;
    146     MaxPrefetchIterationsAhead = 8;
    147     break;
    148   case Kryo:
    149     MaxInterleaveFactor = 4;
    150     VectorInsertExtractBaseCost = 2;
    151     CacheLineSize = 128;
    152     PrefetchDistance = 740;
    153     MinPrefetchStride = 1024;
    154     MaxPrefetchIterationsAhead = 11;
    155     // FIXME: remove this to enable 64-bit SLP if performance looks good.
    156     MinVectorRegisterBitWidth = 128;
    157     break;
    158   case NeoverseE1:
    159     PrefFunctionLogAlignment = 3;
    160     break;
    161   case NeoverseN1:
    162   case NeoverseN2:
    163   case NeoverseV1:
    164     PrefFunctionLogAlignment = 4;
    165     break;
    166   case Saphira:
    167     MaxInterleaveFactor = 4;
    168     // FIXME: remove this to enable 64-bit SLP if performance looks good.
    169     MinVectorRegisterBitWidth = 128;
    170     break;
    171   case ThunderX2T99:
    172     CacheLineSize = 64;
    173     PrefFunctionLogAlignment = 3;
    174     PrefLoopLogAlignment = 2;
    175     MaxInterleaveFactor = 4;
    176     PrefetchDistance = 128;
    177     MinPrefetchStride = 1024;
    178     MaxPrefetchIterationsAhead = 4;
    179     // FIXME: remove this to enable 64-bit SLP if performance looks good.
    180     MinVectorRegisterBitWidth = 128;
    181     break;
    182   case ThunderX:
    183   case ThunderXT88:
    184   case ThunderXT81:
    185   case ThunderXT83:
    186     CacheLineSize = 128;
    187     PrefFunctionLogAlignment = 3;
    188     PrefLoopLogAlignment = 2;
    189     // FIXME: remove this to enable 64-bit SLP if performance looks good.
    190     MinVectorRegisterBitWidth = 128;
    191     break;
    192   case TSV110:
    193     CacheLineSize = 64;
    194     PrefFunctionLogAlignment = 4;
    195     PrefLoopLogAlignment = 2;
    196     break;
    197   case ThunderX3T110:
    198     CacheLineSize = 64;
    199     PrefFunctionLogAlignment = 4;
    200     PrefLoopLogAlignment = 2;
    201     MaxInterleaveFactor = 4;
    202     PrefetchDistance = 128;
    203     MinPrefetchStride = 1024;
    204     MaxPrefetchIterationsAhead = 4;
    205     // FIXME: remove this to enable 64-bit SLP if performance looks good.
    206     MinVectorRegisterBitWidth = 128;
    207     break;
    208   }
    209 }
    210 
    211 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
    212                                    const std::string &FS,
    213                                    const TargetMachine &TM, bool LittleEndian)
    214     : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
    215       ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()),
    216       CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()),
    217       IsLittle(LittleEndian),
    218       TargetTriple(TT), FrameLowering(),
    219       InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
    220       TLInfo(TM, *this) {
    221   if (AArch64::isX18ReservedByDefault(TT))
    222     ReserveXRegister.set(18);
    223 
    224   CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
    225   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
    226   Legalizer.reset(new AArch64LegalizerInfo(*this));
    227 
    228   auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
    229 
    230   // FIXME: At this point, we can't rely on Subtarget having RBI.
    231   // It's awkward to mix passing RBI and the Subtarget; should we pass
    232   // TII/TRI as well?
    233   InstSelector.reset(createAArch64InstructionSelector(
    234       *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));
    235 
    236   RegBankInfo.reset(RBI);
    237 }
    238 
/// Return the raw pointer held by CallLoweringInfo, which the constructor
/// points at an AArch64CallLowering instance.
const CallLowering *AArch64Subtarget::getCallLowering() const {
  return CallLoweringInfo.get();
}
    242 
/// Return the raw pointer held by InlineAsmLoweringInfo, which the constructor
/// points at an InlineAsmLowering instance.
const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
  return InlineAsmLoweringInfo.get();
}
    246 
/// Return the raw pointer held by InstSelector, which the constructor fills
/// with the result of createAArch64InstructionSelector(). Note the returned
/// pointer is to a non-const selector even though this accessor is const.
InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
  return InstSelector.get();
}
    250 
/// Return the raw pointer held by Legalizer, which the constructor points at
/// an AArch64LegalizerInfo instance.
const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const {
  return Legalizer.get();
}
    254 
/// Return the raw pointer held by RegBankInfo, which the constructor points at
/// an AArch64RegisterBankInfo instance as its last step.
const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const {
  return RegBankInfo.get();
}
    258 
    259 /// Find the target operand flags that describe how a global value should be
    260 /// referenced for the current subtarget.
    261 unsigned
    262 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
    263                                           const TargetMachine &TM) const {
    264   // MachO large model always goes via a GOT, simply to get a single 8-byte
    265   // absolute relocation on all global addresses.
    266   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
    267     return AArch64II::MO_GOT;
    268 
    269   if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) {
    270     if (GV->hasDLLImportStorageClass())
    271       return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT;
    272     if (getTargetTriple().isOSWindows())
    273       return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB;
    274     return AArch64II::MO_GOT;
    275   }
    276 
    277   // The small code model's direct accesses use ADRP, which cannot
    278   // necessarily produce the value 0 (if the code is above 4GB).
    279   // Same for the tiny code model, where we have a pc relative LDR.
    280   if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) &&
    281       GV->hasExternalWeakLinkage())
    282     return AArch64II::MO_GOT;
    283 
    284   // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate
    285   // that their nominal addresses are tagged and outside of the code model. In
    286   // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the
    287   // tag if necessary based on MO_TAGGED.
    288   if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType()))
    289     return AArch64II::MO_NC | AArch64II::MO_TAGGED;
    290 
    291   return AArch64II::MO_NO_FLAG;
    292 }
    293 
    294 unsigned AArch64Subtarget::classifyGlobalFunctionReference(
    295     const GlobalValue *GV, const TargetMachine &TM) const {
    296   // MachO large model always goes via a GOT, because we don't have the
    297   // relocations available to do anything else..
    298   if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
    299       !GV->hasInternalLinkage())
    300     return AArch64II::MO_GOT;
    301 
    302   // NonLazyBind goes via GOT unless we know it's available locally.
    303   auto *F = dyn_cast<Function>(GV);
    304   if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
    305       !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
    306     return AArch64II::MO_GOT;
    307 
    308   // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB.
    309   if (getTargetTriple().isOSWindows())
    310     return ClassifyGlobalReference(GV, TM);
    311 
    312   return AArch64II::MO_NO_FLAG;
    313 }
    314 
    315 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
    316                                            unsigned NumRegionInstrs) const {
    317   // LNT run (at least on Cyclone) showed reasonably significant gains for
    318   // bi-directional scheduling. 253.perlbmk.
    319   Policy.OnlyTopDown = false;
    320   Policy.OnlyBottomUp = false;
    321   // Enabling or Disabling the latency heuristic is a close call: It seems to
    322   // help nearly no benchmark on out-of-order architectures, on the other hand
    323   // it regresses register pressure on a few benchmarking.
    324   Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
    325 }
    326 
/// Report the current value of the -aarch64-early-ifcvt option (which
/// defaults to true).
bool AArch64Subtarget::enableEarlyIfConversion() const {
  return EnableEarlyIfConvert;
}
    330 
    331 bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
    332   if (!UseAddressTopByteIgnored)
    333     return false;
    334 
    335   if (TargetTriple.isiOS()) {
    336     unsigned Major, Minor, Micro;
    337     TargetTriple.getiOSVersion(Major, Minor, Micro);
    338     return Major >= 8;
    339   }
    340 
    341   return false;
    342 }
    343 
    344 std::unique_ptr<PBQPRAConstraint>
    345 AArch64Subtarget::getCustomPBQPConstraints() const {
    346   return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr;
    347 }
    348 
    349 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
    350   // We usually compute max call frame size after ISel. Do the computation now
    351   // if the .mir file didn't specify it. Note that this will probably give you
    352   // bogus values after PEI has eliminated the callframe setup/destroy pseudo
    353   // instructions, specify explicitly if you need it to be correct.
    354   MachineFrameInfo &MFI = MF.getFrameInfo();
    355   if (!MFI.isMaxCallFrameSizeComputed())
    356     MFI.computeMaxCallFrameSize(MF);
    357 }
    358 
    359 unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
    360   assert(HasSVE && "Tried to get SVE vector length without SVE support!");
    361   assert(SVEVectorBitsMax % 128 == 0 &&
    362          "SVE requires vector length in multiples of 128!");
    363   assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
    364          "Minimum SVE vector size should not be larger than its maximum!");
    365   if (SVEVectorBitsMax == 0)
    366     return 0;
    367   return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
    368 }
    369 
    370 unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
    371   assert(HasSVE && "Tried to get SVE vector length without SVE support!");
    372   assert(SVEVectorBitsMin % 128 == 0 &&
    373          "SVE requires vector length in multiples of 128!");
    374   assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
    375          "Minimum SVE vector size should not be larger than its maximum!");
    376   if (SVEVectorBitsMax == 0)
    377     return (SVEVectorBitsMin / 128) * 128;
    378   return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
    379 }
    380 
    381 bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
    382   // Prefer NEON unless larger SVE registers are available.
    383   return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
    384 }
    385 
/// Report the current value of the -aarch64-use-aa option (which defaults to
/// true).
bool AArch64Subtarget::useAA() const { return UseAA; }
    387