1 //===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the AArch64 specific subclass of TargetSubtarget. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64Subtarget.h" 14 15 #include "AArch64.h" 16 #include "AArch64InstrInfo.h" 17 #include "AArch64PBQPRegAlloc.h" 18 #include "AArch64TargetMachine.h" 19 #include "GISel/AArch64CallLowering.h" 20 #include "GISel/AArch64LegalizerInfo.h" 21 #include "GISel/AArch64RegisterBankInfo.h" 22 #include "MCTargetDesc/AArch64AddressingModes.h" 23 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" 24 #include "llvm/CodeGen/MachineScheduler.h" 25 #include "llvm/IR/GlobalValue.h" 26 #include "llvm/Support/TargetParser.h" 27 28 using namespace llvm; 29 30 #define DEBUG_TYPE "aarch64-subtarget" 31 32 #define GET_SUBTARGETINFO_CTOR 33 #define GET_SUBTARGETINFO_TARGET_DESC 34 #include "AArch64GenSubtargetInfo.inc" 35 36 static cl::opt<bool> 37 EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " 38 "converter pass"), cl::init(true), cl::Hidden); 39 40 // If OS supports TBI, use this flag to enable it. 41 static cl::opt<bool> 42 UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " 43 "an address is ignored"), cl::init(false), cl::Hidden); 44 45 static cl::opt<bool> 46 UseNonLazyBind("aarch64-enable-nonlazybind", 47 cl::desc("Call nonlazybind functions via direct GOT load"), 48 cl::init(false), cl::Hidden); 49 50 static cl::opt<unsigned> SVEVectorBitsMax( 51 "aarch64-sve-vector-bits-max", 52 cl::desc("Assume SVE vector registers are at most this big, " 53 "with zero meaning no maximum size is assumed."), 54 cl::init(0), cl::Hidden); 55 56 static cl::opt<unsigned> SVEVectorBitsMin( 57 "aarch64-sve-vector-bits-min", 58 cl::desc("Assume SVE vector registers are at least this big, " 59 "with zero meaning no minimum size is assumed."), 60 cl::init(0), cl::Hidden); 61 62 static cl::opt<bool> UseAA("aarch64-use-aa", cl::init(true), 63 cl::desc("Enable the use of AA during codegen.")); 64 65 AArch64Subtarget & 66 AArch64Subtarget::initializeSubtargetDependencies(StringRef FS, 67 StringRef CPUString) { 68 // Determine default and user-specified characteristics 69 70 if (CPUString.empty()) 71 CPUString = "generic"; 72 73 ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FS); 74 initializeProperties(); 75 76 return *this; 77 } 78 79 void AArch64Subtarget::initializeProperties() { 80 // Initialize CPU specific properties. We should add a tablegen feature for 81 // this in the future so we can specify it together with the subtarget 82 // features. 83 switch (ARMProcFamily) { 84 case Others: 85 break; 86 case Carmel: 87 CacheLineSize = 64; 88 break; 89 case CortexA35: 90 break; 91 case CortexA53: 92 case CortexA55: 93 PrefFunctionLogAlignment = 4; 94 break; 95 case CortexA57: 96 MaxInterleaveFactor = 4; 97 PrefFunctionLogAlignment = 4; 98 break; 99 case CortexA65: 100 PrefFunctionLogAlignment = 3; 101 break; 102 case CortexA72: 103 case CortexA73: 104 case CortexA75: 105 case CortexA76: 106 case CortexA77: 107 case CortexA78: 108 case CortexA78C: 109 case CortexR82: 110 case CortexX1: 111 PrefFunctionLogAlignment = 4; 112 break; 113 case A64FX: 114 CacheLineSize = 256; 115 PrefFunctionLogAlignment = 3; 116 PrefLoopLogAlignment = 2; 117 MaxInterleaveFactor = 4; 118 PrefetchDistance = 128; 119 MinPrefetchStride = 1024; 120 MaxPrefetchIterationsAhead = 4; 121 break; 122 case AppleA7: 123 case AppleA10: 124 case AppleA11: 125 case AppleA12: 126 case AppleA13: 127 case AppleA14: 128 CacheLineSize = 64; 129 PrefetchDistance = 280; 130 MinPrefetchStride = 2048; 131 MaxPrefetchIterationsAhead = 3; 132 break; 133 case ExynosM3: 134 MaxInterleaveFactor = 4; 135 MaxJumpTableSize = 20; 136 PrefFunctionLogAlignment = 5; 137 PrefLoopLogAlignment = 4; 138 break; 139 case Falkor: 140 MaxInterleaveFactor = 4; 141 // FIXME: remove this to enable 64-bit SLP if performance looks good. 142 MinVectorRegisterBitWidth = 128; 143 CacheLineSize = 128; 144 PrefetchDistance = 820; 145 MinPrefetchStride = 2048; 146 MaxPrefetchIterationsAhead = 8; 147 break; 148 case Kryo: 149 MaxInterleaveFactor = 4; 150 VectorInsertExtractBaseCost = 2; 151 CacheLineSize = 128; 152 PrefetchDistance = 740; 153 MinPrefetchStride = 1024; 154 MaxPrefetchIterationsAhead = 11; 155 // FIXME: remove this to enable 64-bit SLP if performance looks good. 156 MinVectorRegisterBitWidth = 128; 157 break; 158 case NeoverseE1: 159 PrefFunctionLogAlignment = 3; 160 break; 161 case NeoverseN1: 162 case NeoverseN2: 163 case NeoverseV1: 164 PrefFunctionLogAlignment = 4; 165 break; 166 case Saphira: 167 MaxInterleaveFactor = 4; 168 // FIXME: remove this to enable 64-bit SLP if performance looks good. 169 MinVectorRegisterBitWidth = 128; 170 break; 171 case ThunderX2T99: 172 CacheLineSize = 64; 173 PrefFunctionLogAlignment = 3; 174 PrefLoopLogAlignment = 2; 175 MaxInterleaveFactor = 4; 176 PrefetchDistance = 128; 177 MinPrefetchStride = 1024; 178 MaxPrefetchIterationsAhead = 4; 179 // FIXME: remove this to enable 64-bit SLP if performance looks good. 180 MinVectorRegisterBitWidth = 128; 181 break; 182 case ThunderX: 183 case ThunderXT88: 184 case ThunderXT81: 185 case ThunderXT83: 186 CacheLineSize = 128; 187 PrefFunctionLogAlignment = 3; 188 PrefLoopLogAlignment = 2; 189 // FIXME: remove this to enable 64-bit SLP if performance looks good. 190 MinVectorRegisterBitWidth = 128; 191 break; 192 case TSV110: 193 CacheLineSize = 64; 194 PrefFunctionLogAlignment = 4; 195 PrefLoopLogAlignment = 2; 196 break; 197 case ThunderX3T110: 198 CacheLineSize = 64; 199 PrefFunctionLogAlignment = 4; 200 PrefLoopLogAlignment = 2; 201 MaxInterleaveFactor = 4; 202 PrefetchDistance = 128; 203 MinPrefetchStride = 1024; 204 MaxPrefetchIterationsAhead = 4; 205 // FIXME: remove this to enable 64-bit SLP if performance looks good. 206 MinVectorRegisterBitWidth = 128; 207 break; 208 } 209 } 210 211 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, 212 const std::string &FS, 213 const TargetMachine &TM, bool LittleEndian) 214 : AArch64GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), 215 ReserveXRegister(AArch64::GPR64commonRegClass.getNumRegs()), 216 CustomCallSavedXRegs(AArch64::GPR64commonRegClass.getNumRegs()), 217 IsLittle(LittleEndian), 218 TargetTriple(TT), FrameLowering(), 219 InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(), 220 TLInfo(TM, *this) { 221 if (AArch64::isX18ReservedByDefault(TT)) 222 ReserveXRegister.set(18); 223 224 CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering())); 225 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering())); 226 Legalizer.reset(new AArch64LegalizerInfo(*this)); 227 228 auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo()); 229 230 // FIXME: At this point, we can't rely on Subtarget having RBI. 231 // It's awkward to mix passing RBI and the Subtarget; should we pass 232 // TII/TRI as well? 233 InstSelector.reset(createAArch64InstructionSelector( 234 *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI)); 235 236 RegBankInfo.reset(RBI); 237 } 238 239 const CallLowering *AArch64Subtarget::getCallLowering() const { 240 return CallLoweringInfo.get(); 241 } 242 243 const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const { 244 return InlineAsmLoweringInfo.get(); 245 } 246 247 InstructionSelector *AArch64Subtarget::getInstructionSelector() const { 248 return InstSelector.get(); 249 } 250 251 const LegalizerInfo *AArch64Subtarget::getLegalizerInfo() const { 252 return Legalizer.get(); 253 } 254 255 const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { 256 return RegBankInfo.get(); 257 } 258 259 /// Find the target operand flags that describe how a global value should be 260 /// referenced for the current subtarget. 261 unsigned 262 AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, 263 const TargetMachine &TM) const { 264 // MachO large model always goes via a GOT, simply to get a single 8-byte 265 // absolute relocation on all global addresses. 266 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) 267 return AArch64II::MO_GOT; 268 269 if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) { 270 if (GV->hasDLLImportStorageClass()) 271 return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT; 272 if (getTargetTriple().isOSWindows()) 273 return AArch64II::MO_GOT | AArch64II::MO_COFFSTUB; 274 return AArch64II::MO_GOT; 275 } 276 277 // The small code model's direct accesses use ADRP, which cannot 278 // necessarily produce the value 0 (if the code is above 4GB). 279 // Same for the tiny code model, where we have a pc relative LDR. 280 if ((useSmallAddressing() || TM.getCodeModel() == CodeModel::Tiny) && 281 GV->hasExternalWeakLinkage()) 282 return AArch64II::MO_GOT; 283 284 // References to tagged globals are marked with MO_NC | MO_TAGGED to indicate 285 // that their nominal addresses are tagged and outside of the code model. In 286 // AArch64ExpandPseudo::expandMI we emit an additional instruction to set the 287 // tag if necessary based on MO_TAGGED. 288 if (AllowTaggedGlobals && !isa<FunctionType>(GV->getValueType())) 289 return AArch64II::MO_NC | AArch64II::MO_TAGGED; 290 291 return AArch64II::MO_NO_FLAG; 292 } 293 294 unsigned AArch64Subtarget::classifyGlobalFunctionReference( 295 const GlobalValue *GV, const TargetMachine &TM) const { 296 // MachO large model always goes via a GOT, because we don't have the 297 // relocations available to do anything else.. 298 if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() && 299 !GV->hasInternalLinkage()) 300 return AArch64II::MO_GOT; 301 302 // NonLazyBind goes via GOT unless we know it's available locally. 303 auto *F = dyn_cast<Function>(GV); 304 if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) && 305 !TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 306 return AArch64II::MO_GOT; 307 308 // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB. 309 if (getTargetTriple().isOSWindows()) 310 return ClassifyGlobalReference(GV, TM); 311 312 return AArch64II::MO_NO_FLAG; 313 } 314 315 void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, 316 unsigned NumRegionInstrs) const { 317 // LNT run (at least on Cyclone) showed reasonably significant gains for 318 // bi-directional scheduling. 253.perlbmk. 319 Policy.OnlyTopDown = false; 320 Policy.OnlyBottomUp = false; 321 // Enabling or Disabling the latency heuristic is a close call: It seems to 322 // help nearly no benchmark on out-of-order architectures, on the other hand 323 // it regresses register pressure on a few benchmarking. 324 Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic; 325 } 326 327 bool AArch64Subtarget::enableEarlyIfConversion() const { 328 return EnableEarlyIfConvert; 329 } 330 331 bool AArch64Subtarget::supportsAddressTopByteIgnored() const { 332 if (!UseAddressTopByteIgnored) 333 return false; 334 335 if (TargetTriple.isiOS()) { 336 unsigned Major, Minor, Micro; 337 TargetTriple.getiOSVersion(Major, Minor, Micro); 338 return Major >= 8; 339 } 340 341 return false; 342 } 343 344 std::unique_ptr<PBQPRAConstraint> 345 AArch64Subtarget::getCustomPBQPConstraints() const { 346 return balanceFPOps() ? std::make_unique<A57ChainingConstraint>() : nullptr; 347 } 348 349 void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { 350 // We usually compute max call frame size after ISel. Do the computation now 351 // if the .mir file didn't specify it. Note that this will probably give you 352 // bogus values after PEI has eliminated the callframe setup/destroy pseudo 353 // instructions, specify explicitly if you need it to be correct. 354 MachineFrameInfo &MFI = MF.getFrameInfo(); 355 if (!MFI.isMaxCallFrameSizeComputed()) 356 MFI.computeMaxCallFrameSize(MF); 357 } 358 359 unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const { 360 assert(HasSVE && "Tried to get SVE vector length without SVE support!"); 361 assert(SVEVectorBitsMax % 128 == 0 && 362 "SVE requires vector length in multiples of 128!"); 363 assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) && 364 "Minimum SVE vector size should not be larger than its maximum!"); 365 if (SVEVectorBitsMax == 0) 366 return 0; 367 return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128; 368 } 369 370 unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const { 371 assert(HasSVE && "Tried to get SVE vector length without SVE support!"); 372 assert(SVEVectorBitsMin % 128 == 0 && 373 "SVE requires vector length in multiples of 128!"); 374 assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) && 375 "Minimum SVE vector size should not be larger than its maximum!"); 376 if (SVEVectorBitsMax == 0) 377 return (SVEVectorBitsMin / 128) * 128; 378 return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128; 379 } 380 381 bool AArch64Subtarget::useSVEForFixedLengthVectors() const { 382 // Prefer NEON unless larger SVE registers are available. 383 return hasSVE() && getMinSVEVectorSizeInBits() >= 256; 384 } 385 386 bool AArch64Subtarget::useAA() const { return UseAA; } 387