1 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file defines the interfaces that ARM uses to lower LLVM code into a 10 // selection DAG. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "ARMISelLowering.h" 15 #include "ARMBaseInstrInfo.h" 16 #include "ARMBaseRegisterInfo.h" 17 #include "ARMCallingConv.h" 18 #include "ARMConstantPoolValue.h" 19 #include "ARMMachineFunctionInfo.h" 20 #include "ARMPerfectShuffle.h" 21 #include "ARMRegisterInfo.h" 22 #include "ARMSelectionDAGInfo.h" 23 #include "ARMSubtarget.h" 24 #include "ARMTargetTransformInfo.h" 25 #include "MCTargetDesc/ARMAddressingModes.h" 26 #include "MCTargetDesc/ARMBaseInfo.h" 27 #include "Utils/ARMBaseInfo.h" 28 #include "llvm/ADT/APFloat.h" 29 #include "llvm/ADT/APInt.h" 30 #include "llvm/ADT/ArrayRef.h" 31 #include "llvm/ADT/BitVector.h" 32 #include "llvm/ADT/DenseMap.h" 33 #include "llvm/ADT/STLExtras.h" 34 #include "llvm/ADT/SmallPtrSet.h" 35 #include "llvm/ADT/SmallVector.h" 36 #include "llvm/ADT/Statistic.h" 37 #include "llvm/ADT/StringExtras.h" 38 #include "llvm/ADT/StringRef.h" 39 #include "llvm/ADT/StringSwitch.h" 40 #include "llvm/ADT/Triple.h" 41 #include "llvm/ADT/Twine.h" 42 #include "llvm/Analysis/VectorUtils.h" 43 #include "llvm/CodeGen/CallingConvLower.h" 44 #include "llvm/CodeGen/ISDOpcodes.h" 45 #include "llvm/CodeGen/IntrinsicLowering.h" 46 #include "llvm/CodeGen/MachineBasicBlock.h" 47 #include "llvm/CodeGen/MachineConstantPool.h" 48 #include "llvm/CodeGen/MachineFrameInfo.h" 49 #include "llvm/CodeGen/MachineFunction.h" 50 #include "llvm/CodeGen/MachineInstr.h" 51 #include 
"llvm/CodeGen/MachineInstrBuilder.h" 52 #include "llvm/CodeGen/MachineJumpTableInfo.h" 53 #include "llvm/CodeGen/MachineMemOperand.h" 54 #include "llvm/CodeGen/MachineOperand.h" 55 #include "llvm/CodeGen/MachineRegisterInfo.h" 56 #include "llvm/CodeGen/RuntimeLibcalls.h" 57 #include "llvm/CodeGen/SelectionDAG.h" 58 #include "llvm/CodeGen/SelectionDAGNodes.h" 59 #include "llvm/CodeGen/TargetInstrInfo.h" 60 #include "llvm/CodeGen/TargetLowering.h" 61 #include "llvm/CodeGen/TargetOpcodes.h" 62 #include "llvm/CodeGen/TargetRegisterInfo.h" 63 #include "llvm/CodeGen/TargetSubtargetInfo.h" 64 #include "llvm/CodeGen/ValueTypes.h" 65 #include "llvm/IR/Attributes.h" 66 #include "llvm/IR/CallingConv.h" 67 #include "llvm/IR/Constant.h" 68 #include "llvm/IR/Constants.h" 69 #include "llvm/IR/DataLayout.h" 70 #include "llvm/IR/DebugLoc.h" 71 #include "llvm/IR/DerivedTypes.h" 72 #include "llvm/IR/Function.h" 73 #include "llvm/IR/GlobalAlias.h" 74 #include "llvm/IR/GlobalValue.h" 75 #include "llvm/IR/GlobalVariable.h" 76 #include "llvm/IR/IRBuilder.h" 77 #include "llvm/IR/InlineAsm.h" 78 #include "llvm/IR/Instruction.h" 79 #include "llvm/IR/Instructions.h" 80 #include "llvm/IR/IntrinsicInst.h" 81 #include "llvm/IR/Intrinsics.h" 82 #include "llvm/IR/IntrinsicsARM.h" 83 #include "llvm/IR/Module.h" 84 #include "llvm/IR/PatternMatch.h" 85 #include "llvm/IR/Type.h" 86 #include "llvm/IR/User.h" 87 #include "llvm/IR/Value.h" 88 #include "llvm/MC/MCInstrDesc.h" 89 #include "llvm/MC/MCInstrItineraries.h" 90 #include "llvm/MC/MCRegisterInfo.h" 91 #include "llvm/MC/MCSchedule.h" 92 #include "llvm/Support/AtomicOrdering.h" 93 #include "llvm/Support/BranchProbability.h" 94 #include "llvm/Support/Casting.h" 95 #include "llvm/Support/CodeGen.h" 96 #include "llvm/Support/CommandLine.h" 97 #include "llvm/Support/Compiler.h" 98 #include "llvm/Support/Debug.h" 99 #include "llvm/Support/ErrorHandling.h" 100 #include "llvm/Support/KnownBits.h" 101 #include "llvm/Support/MachineValueType.h" 102 #include 
"llvm/Support/MathExtras.h" 103 #include "llvm/Support/raw_ostream.h" 104 #include "llvm/Target/TargetMachine.h" 105 #include "llvm/Target/TargetOptions.h" 106 #include <algorithm> 107 #include <cassert> 108 #include <cstdint> 109 #include <cstdlib> 110 #include <iterator> 111 #include <limits> 112 #include <string> 113 #include <tuple> 114 #include <utility> 115 #include <vector> 116 117 using namespace llvm; 118 using namespace llvm::PatternMatch; 119 120 #define DEBUG_TYPE "arm-isel" 121 122 STATISTIC(NumTailCalls, "Number of tail calls"); 123 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); 124 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); 125 STATISTIC(NumConstpoolPromoted, 126 "Number of constants with their storage promoted into constant pools"); 127 128 static cl::opt<bool> 129 ARMInterworking("arm-interworking", cl::Hidden, 130 cl::desc("Enable / disable ARM interworking (for debugging only)"), 131 cl::init(true)); 132 133 static cl::opt<bool> EnableConstpoolPromotion( 134 "arm-promote-constant", cl::Hidden, 135 cl::desc("Enable / disable promotion of unnamed_addr constants into " 136 "constant pools"), 137 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed 138 static cl::opt<unsigned> ConstpoolPromotionMaxSize( 139 "arm-promote-constant-max-size", cl::Hidden, 140 cl::desc("Maximum size of constant to promote into a constant pool"), 141 cl::init(64)); 142 static cl::opt<unsigned> ConstpoolPromotionMaxTotal( 143 "arm-promote-constant-max-total", cl::Hidden, 144 cl::desc("Maximum size of ALL constants to promote into a constant pool"), 145 cl::init(128)); 146 147 cl::opt<unsigned> 148 MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, 149 cl::desc("Maximum interleave factor for MVE VLDn to generate."), 150 cl::init(2)); 151 152 // The APCS parameter registers. 
153 static const MCPhysReg GPRArgRegs[] = { 154 ARM::R0, ARM::R1, ARM::R2, ARM::R3 155 }; 156 157 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, 158 MVT PromotedBitwiseVT) { 159 if (VT != PromotedLdStVT) { 160 setOperationAction(ISD::LOAD, VT, Promote); 161 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); 162 163 setOperationAction(ISD::STORE, VT, Promote); 164 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); 165 } 166 167 MVT ElemTy = VT.getVectorElementType(); 168 if (ElemTy != MVT::f64) 169 setOperationAction(ISD::SETCC, VT, Custom); 170 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 171 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 172 if (ElemTy == MVT::i32) { 173 setOperationAction(ISD::SINT_TO_FP, VT, Custom); 174 setOperationAction(ISD::UINT_TO_FP, VT, Custom); 175 setOperationAction(ISD::FP_TO_SINT, VT, Custom); 176 setOperationAction(ISD::FP_TO_UINT, VT, Custom); 177 } else { 178 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 179 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 180 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 181 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 182 } 183 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 184 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 185 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); 186 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); 187 setOperationAction(ISD::SELECT, VT, Expand); 188 setOperationAction(ISD::SELECT_CC, VT, Expand); 189 setOperationAction(ISD::VSELECT, VT, Expand); 190 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); 191 if (VT.isInteger()) { 192 setOperationAction(ISD::SHL, VT, Custom); 193 setOperationAction(ISD::SRA, VT, Custom); 194 setOperationAction(ISD::SRL, VT, Custom); 195 } 196 197 // Promote all bit-wise operations. 
198 if (VT.isInteger() && VT != PromotedBitwiseVT) { 199 setOperationAction(ISD::AND, VT, Promote); 200 AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); 201 setOperationAction(ISD::OR, VT, Promote); 202 AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); 203 setOperationAction(ISD::XOR, VT, Promote); 204 AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); 205 } 206 207 // Neon does not support vector divide/remainder operations. 208 setOperationAction(ISD::SDIV, VT, Expand); 209 setOperationAction(ISD::UDIV, VT, Expand); 210 setOperationAction(ISD::FDIV, VT, Expand); 211 setOperationAction(ISD::SREM, VT, Expand); 212 setOperationAction(ISD::UREM, VT, Expand); 213 setOperationAction(ISD::FREM, VT, Expand); 214 setOperationAction(ISD::SDIVREM, VT, Expand); 215 setOperationAction(ISD::UDIVREM, VT, Expand); 216 217 if (!VT.isFloatingPoint() && 218 VT != MVT::v2i64 && VT != MVT::v1i64) 219 for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) 220 setOperationAction(Opcode, VT, Legal); 221 if (!VT.isFloatingPoint()) 222 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}) 223 setOperationAction(Opcode, VT, Legal); 224 } 225 226 void ARMTargetLowering::addDRTypeForNEON(MVT VT) { 227 addRegisterClass(VT, &ARM::DPRRegClass); 228 addTypeForNEON(VT, MVT::f64, MVT::v2i32); 229 } 230 231 void ARMTargetLowering::addQRTypeForNEON(MVT VT) { 232 addRegisterClass(VT, &ARM::DPairRegClass); 233 addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); 234 } 235 236 void ARMTargetLowering::setAllExpand(MVT VT) { 237 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) 238 setOperationAction(Opc, VT, Expand); 239 240 // We support these really simple operations even on types where all 241 // the actual arithmetic has to be broken down into simpler 242 // operations or turned into library calls. 
243 setOperationAction(ISD::BITCAST, VT, Legal); 244 setOperationAction(ISD::LOAD, VT, Legal); 245 setOperationAction(ISD::STORE, VT, Legal); 246 setOperationAction(ISD::UNDEF, VT, Legal); 247 } 248 249 void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To, 250 LegalizeAction Action) { 251 setLoadExtAction(ISD::EXTLOAD, From, To, Action); 252 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action); 253 setLoadExtAction(ISD::SEXTLOAD, From, To, Action); 254 } 255 256 void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { 257 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; 258 259 for (auto VT : IntTypes) { 260 addRegisterClass(VT, &ARM::MQPRRegClass); 261 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 262 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 263 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 264 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 265 setOperationAction(ISD::SHL, VT, Custom); 266 setOperationAction(ISD::SRA, VT, Custom); 267 setOperationAction(ISD::SRL, VT, Custom); 268 setOperationAction(ISD::SMIN, VT, Legal); 269 setOperationAction(ISD::SMAX, VT, Legal); 270 setOperationAction(ISD::UMIN, VT, Legal); 271 setOperationAction(ISD::UMAX, VT, Legal); 272 setOperationAction(ISD::ABS, VT, Legal); 273 setOperationAction(ISD::SETCC, VT, Custom); 274 setOperationAction(ISD::MLOAD, VT, Custom); 275 setOperationAction(ISD::MSTORE, VT, Legal); 276 setOperationAction(ISD::CTLZ, VT, Legal); 277 setOperationAction(ISD::CTTZ, VT, Custom); 278 setOperationAction(ISD::BITREVERSE, VT, Legal); 279 setOperationAction(ISD::BSWAP, VT, Legal); 280 setOperationAction(ISD::SADDSAT, VT, Legal); 281 setOperationAction(ISD::UADDSAT, VT, Legal); 282 setOperationAction(ISD::SSUBSAT, VT, Legal); 283 setOperationAction(ISD::USUBSAT, VT, Legal); 284 285 // No native support for these. 
286 setOperationAction(ISD::UDIV, VT, Expand); 287 setOperationAction(ISD::SDIV, VT, Expand); 288 setOperationAction(ISD::UREM, VT, Expand); 289 setOperationAction(ISD::SREM, VT, Expand); 290 setOperationAction(ISD::UDIVREM, VT, Expand); 291 setOperationAction(ISD::SDIVREM, VT, Expand); 292 setOperationAction(ISD::CTPOP, VT, Expand); 293 setOperationAction(ISD::SELECT, VT, Expand); 294 setOperationAction(ISD::SELECT_CC, VT, Expand); 295 296 // Vector reductions 297 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal); 298 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal); 299 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal); 300 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal); 301 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal); 302 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom); 303 setOperationAction(ISD::VECREDUCE_AND, VT, Custom); 304 setOperationAction(ISD::VECREDUCE_OR, VT, Custom); 305 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); 306 307 if (!HasMVEFP) { 308 setOperationAction(ISD::SINT_TO_FP, VT, Expand); 309 setOperationAction(ISD::UINT_TO_FP, VT, Expand); 310 setOperationAction(ISD::FP_TO_SINT, VT, Expand); 311 setOperationAction(ISD::FP_TO_UINT, VT, Expand); 312 } 313 314 // Pre and Post inc are supported on loads and stores 315 for (unsigned im = (unsigned)ISD::PRE_INC; 316 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 317 setIndexedLoadAction(im, VT, Legal); 318 setIndexedStoreAction(im, VT, Legal); 319 setIndexedMaskedLoadAction(im, VT, Legal); 320 setIndexedMaskedStoreAction(im, VT, Legal); 321 } 322 } 323 324 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; 325 for (auto VT : FloatTypes) { 326 addRegisterClass(VT, &ARM::MQPRRegClass); 327 if (!HasMVEFP) 328 setAllExpand(VT); 329 330 // These are legal or custom whether we have MVE.fp or not 331 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 332 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 333 setOperationAction(ISD::INSERT_VECTOR_ELT, 
VT.getVectorElementType(), Custom); 334 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 335 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 336 setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom); 337 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); 338 setOperationAction(ISD::SETCC, VT, Custom); 339 setOperationAction(ISD::MLOAD, VT, Custom); 340 setOperationAction(ISD::MSTORE, VT, Legal); 341 setOperationAction(ISD::SELECT, VT, Expand); 342 setOperationAction(ISD::SELECT_CC, VT, Expand); 343 344 // Pre and Post inc are supported on loads and stores 345 for (unsigned im = (unsigned)ISD::PRE_INC; 346 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 347 setIndexedLoadAction(im, VT, Legal); 348 setIndexedStoreAction(im, VT, Legal); 349 setIndexedMaskedLoadAction(im, VT, Legal); 350 setIndexedMaskedStoreAction(im, VT, Legal); 351 } 352 353 if (HasMVEFP) { 354 setOperationAction(ISD::FMINNUM, VT, Legal); 355 setOperationAction(ISD::FMAXNUM, VT, Legal); 356 setOperationAction(ISD::FROUND, VT, Legal); 357 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom); 358 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom); 359 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); 360 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); 361 362 // No native support for these. 
363 setOperationAction(ISD::FDIV, VT, Expand); 364 setOperationAction(ISD::FREM, VT, Expand); 365 setOperationAction(ISD::FSQRT, VT, Expand); 366 setOperationAction(ISD::FSIN, VT, Expand); 367 setOperationAction(ISD::FCOS, VT, Expand); 368 setOperationAction(ISD::FPOW, VT, Expand); 369 setOperationAction(ISD::FLOG, VT, Expand); 370 setOperationAction(ISD::FLOG2, VT, Expand); 371 setOperationAction(ISD::FLOG10, VT, Expand); 372 setOperationAction(ISD::FEXP, VT, Expand); 373 setOperationAction(ISD::FEXP2, VT, Expand); 374 setOperationAction(ISD::FNEARBYINT, VT, Expand); 375 } 376 } 377 378 // Custom Expand smaller than legal vector reductions to prevent false zero 379 // items being added. 380 setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom); 381 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom); 382 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom); 383 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom); 384 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom); 385 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom); 386 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom); 387 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom); 388 389 // We 'support' these types up to bitcast/load/store level, regardless of 390 // MVE integer-only / float support. Only doing FP data processing on the FP 391 // vector types is inhibited at integer-only level. 
392 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; 393 for (auto VT : LongTypes) { 394 addRegisterClass(VT, &ARM::MQPRRegClass); 395 setAllExpand(VT); 396 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 397 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 398 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 399 } 400 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); 401 402 // We can do bitwise operations on v2i64 vectors 403 setOperationAction(ISD::AND, MVT::v2i64, Legal); 404 setOperationAction(ISD::OR, MVT::v2i64, Legal); 405 setOperationAction(ISD::XOR, MVT::v2i64, Legal); 406 407 // It is legal to extload from v4i8 to v4i16 or v4i32. 408 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal); 409 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal); 410 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal); 411 412 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16. 413 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal); 414 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal); 415 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal); 416 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Legal); 417 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Legal); 418 419 // Some truncating stores are legal too. 
420 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); 421 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); 422 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); 423 424 // Pre and Post inc on these are legal, given the correct extends 425 for (unsigned im = (unsigned)ISD::PRE_INC; 426 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 427 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) { 428 setIndexedLoadAction(im, VT, Legal); 429 setIndexedStoreAction(im, VT, Legal); 430 setIndexedMaskedLoadAction(im, VT, Legal); 431 setIndexedMaskedStoreAction(im, VT, Legal); 432 } 433 } 434 435 // Predicate types 436 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1}; 437 for (auto VT : pTypes) { 438 addRegisterClass(VT, &ARM::VCCRRegClass); 439 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 440 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 441 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); 442 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); 443 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); 444 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 445 setOperationAction(ISD::SETCC, VT, Custom); 446 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand); 447 setOperationAction(ISD::LOAD, VT, Custom); 448 setOperationAction(ISD::STORE, VT, Custom); 449 setOperationAction(ISD::TRUNCATE, VT, Custom); 450 setOperationAction(ISD::VSELECT, VT, Expand); 451 setOperationAction(ISD::SELECT, VT, Expand); 452 } 453 } 454 455 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, 456 const ARMSubtarget &STI) 457 : TargetLowering(TM), Subtarget(&STI) { 458 RegInfo = Subtarget->getRegisterInfo(); 459 Itins = Subtarget->getInstrItineraryData(); 460 461 setBooleanContents(ZeroOrOneBooleanContent); 462 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); 463 464 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && 465 !Subtarget->isTargetWatchOS()) { 466 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; 467 for 
(int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) 468 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID), 469 IsHFTarget ? CallingConv::ARM_AAPCS_VFP 470 : CallingConv::ARM_AAPCS); 471 } 472 473 if (Subtarget->isTargetMachO()) { 474 // Uses VFP for Thumb libfuncs if available. 475 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && 476 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { 477 static const struct { 478 const RTLIB::Libcall Op; 479 const char * const Name; 480 const ISD::CondCode Cond; 481 } LibraryCalls[] = { 482 // Single-precision floating-point arithmetic. 483 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, 484 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, 485 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, 486 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, 487 488 // Double-precision floating-point arithmetic. 489 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, 490 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, 491 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, 492 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, 493 494 // Single-precision comparisons. 495 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, 496 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, 497 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, 498 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, 499 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, 500 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, 501 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, 502 503 // Double-precision comparisons. 504 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, 505 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, 506 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, 507 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, 508 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, 509 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, 510 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, 511 512 // Floating-point to integer conversions. 
513 // i64 conversions are done via library routines even when generating VFP 514 // instructions, so use the same ones. 515 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, 516 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, 517 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, 518 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, 519 520 // Conversions between floating types. 521 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, 522 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, 523 524 // Integer to floating-point conversions. 525 // i64 conversions are done via library routines even when generating VFP 526 // instructions, so use the same ones. 527 // FIXME: There appears to be some naming inconsistency in ARM libgcc: 528 // e.g., __floatunsidf vs. __floatunssidfvfp. 529 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, 530 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, 531 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, 532 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, 533 }; 534 535 for (const auto &LC : LibraryCalls) { 536 setLibcallName(LC.Op, LC.Name); 537 if (LC.Cond != ISD::SETCC_INVALID) 538 setCmpLibcallCC(LC.Op, LC.Cond); 539 } 540 } 541 } 542 543 // These libcalls are not available in 32-bit. 
544 setLibcallName(RTLIB::SHL_I128, nullptr); 545 setLibcallName(RTLIB::SRL_I128, nullptr); 546 setLibcallName(RTLIB::SRA_I128, nullptr); 547 548 // RTLIB 549 if (Subtarget->isAAPCS_ABI() && 550 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || 551 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { 552 static const struct { 553 const RTLIB::Libcall Op; 554 const char * const Name; 555 const CallingConv::ID CC; 556 const ISD::CondCode Cond; 557 } LibraryCalls[] = { 558 // Double-precision floating-point arithmetic helper functions 559 // RTABI chapter 4.1.2, Table 2 560 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 561 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 562 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 563 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 564 565 // Double-precision floating-point comparison helper functions 566 // RTABI chapter 4.1.2, Table 3 567 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 568 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 569 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 570 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 571 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 572 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 573 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 574 575 // Single-precision floating-point arithmetic helper functions 576 // RTABI chapter 4.1.2, Table 4 577 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 578 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 579 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 580 { RTLIB::SUB_F32, "__aeabi_fsub", 
CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 581 582 // Single-precision floating-point comparison helper functions 583 // RTABI chapter 4.1.2, Table 5 584 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, 585 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, 586 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, 587 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, 588 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, 589 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, 590 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, 591 592 // Floating-point to integer conversions. 593 // RTABI chapter 4.1.2, Table 6 594 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 595 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 596 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 597 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 598 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 599 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 600 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 601 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 602 603 // Conversions between floating types. 604 // RTABI chapter 4.1.2, Table 7 605 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 606 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 607 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 608 609 // Integer to floating-point conversions. 
610 // RTABI chapter 4.1.2, Table 8 611 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 612 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 613 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 614 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 615 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 616 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 617 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 618 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 619 620 // Long long helper functions 621 // RTABI chapter 4.2, Table 9 622 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 623 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 624 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 625 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 626 627 // Integer division functions 628 // RTABI chapter 4.3.1 629 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 630 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 631 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 632 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 633 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 634 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 635 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 636 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 637 }; 638 639 for (const auto &LC : LibraryCalls) { 640 
setLibcallName(LC.Op, LC.Name); 641 setLibcallCallingConv(LC.Op, LC.CC); 642 if (LC.Cond != ISD::SETCC_INVALID) 643 setCmpLibcallCC(LC.Op, LC.Cond); 644 } 645 646 // EABI dependent RTLIB 647 if (TM.Options.EABIVersion == EABI::EABI4 || 648 TM.Options.EABIVersion == EABI::EABI5) { 649 static const struct { 650 const RTLIB::Libcall Op; 651 const char *const Name; 652 const CallingConv::ID CC; 653 const ISD::CondCode Cond; 654 } MemOpsLibraryCalls[] = { 655 // Memory operations 656 // RTABI chapter 4.3.4 657 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 658 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 659 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, 660 }; 661 662 for (const auto &LC : MemOpsLibraryCalls) { 663 setLibcallName(LC.Op, LC.Name); 664 setLibcallCallingConv(LC.Op, LC.CC); 665 if (LC.Cond != ISD::SETCC_INVALID) 666 setCmpLibcallCC(LC.Op, LC.Cond); 667 } 668 } 669 } 670 671 if (Subtarget->isTargetWindows()) { 672 static const struct { 673 const RTLIB::Libcall Op; 674 const char * const Name; 675 const CallingConv::ID CC; 676 } LibraryCalls[] = { 677 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, 678 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, 679 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, 680 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, 681 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, 682 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, 683 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, 684 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, 685 }; 686 687 for (const auto &LC : LibraryCalls) { 688 setLibcallName(LC.Op, LC.Name); 689 setLibcallCallingConv(LC.Op, LC.CC); 690 } 691 } 692 693 // Use divmod compiler-rt calls for iOS 5.0 and later. 
694 if (Subtarget->isTargetMachO() && 695 !(Subtarget->isTargetIOS() && 696 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { 697 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); 698 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); 699 } 700 701 // The half <-> float conversion functions are always soft-float on 702 // non-watchos platforms, but are needed for some targets which use a 703 // hard-float calling convention by default. 704 if (!Subtarget->isTargetWatchABI()) { 705 if (Subtarget->isAAPCS_ABI()) { 706 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); 707 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); 708 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); 709 } else { 710 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); 711 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); 712 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); 713 } 714 } 715 716 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have 717 // a __gnu_ prefix (which is the default). 
718 if (Subtarget->isTargetAEABI()) { 719 static const struct { 720 const RTLIB::Libcall Op; 721 const char * const Name; 722 const CallingConv::ID CC; 723 } LibraryCalls[] = { 724 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, 725 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, 726 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, 727 }; 728 729 for (const auto &LC : LibraryCalls) { 730 setLibcallName(LC.Op, LC.Name); 731 setLibcallCallingConv(LC.Op, LC.CC); 732 } 733 } 734 735 if (Subtarget->isThumb1Only()) 736 addRegisterClass(MVT::i32, &ARM::tGPRRegClass); 737 else 738 addRegisterClass(MVT::i32, &ARM::GPRRegClass); 739 740 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && 741 Subtarget->hasFPRegs()) { 742 addRegisterClass(MVT::f32, &ARM::SPRRegClass); 743 addRegisterClass(MVT::f64, &ARM::DPRRegClass); 744 if (!Subtarget->hasVFP2Base()) 745 setAllExpand(MVT::f32); 746 if (!Subtarget->hasFP64()) 747 setAllExpand(MVT::f64); 748 } 749 750 if (Subtarget->hasFullFP16()) { 751 addRegisterClass(MVT::f16, &ARM::HPRRegClass); 752 setOperationAction(ISD::BITCAST, MVT::i16, Custom); 753 setOperationAction(ISD::BITCAST, MVT::f16, Custom); 754 755 setOperationAction(ISD::FMINNUM, MVT::f16, Legal); 756 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); 757 } 758 759 if (Subtarget->hasBF16()) { 760 addRegisterClass(MVT::bf16, &ARM::HPRRegClass); 761 setAllExpand(MVT::bf16); 762 if (!Subtarget->hasFullFP16()) 763 setOperationAction(ISD::BITCAST, MVT::bf16, Custom); 764 } 765 766 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 767 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { 768 setTruncStoreAction(VT, InnerVT, Expand); 769 addAllExtLoads(VT, InnerVT, Expand); 770 } 771 772 setOperationAction(ISD::MULHS, VT, Expand); 773 setOperationAction(ISD::SMUL_LOHI, VT, Expand); 774 setOperationAction(ISD::MULHU, VT, Expand); 775 setOperationAction(ISD::UMUL_LOHI, VT, Expand); 776 777 
setOperationAction(ISD::BSWAP, VT, Expand); 778 } 779 780 setOperationAction(ISD::ConstantFP, MVT::f32, Custom); 781 setOperationAction(ISD::ConstantFP, MVT::f64, Custom); 782 783 setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); 784 setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); 785 786 if (Subtarget->hasMVEIntegerOps()) 787 addMVEVectorTypes(Subtarget->hasMVEFloatOps()); 788 789 // Combine low-overhead loop intrinsics so that we can lower i1 types. 790 if (Subtarget->hasLOB()) { 791 setTargetDAGCombine(ISD::BRCOND); 792 setTargetDAGCombine(ISD::BR_CC); 793 } 794 795 if (Subtarget->hasNEON()) { 796 addDRTypeForNEON(MVT::v2f32); 797 addDRTypeForNEON(MVT::v8i8); 798 addDRTypeForNEON(MVT::v4i16); 799 addDRTypeForNEON(MVT::v2i32); 800 addDRTypeForNEON(MVT::v1i64); 801 802 addQRTypeForNEON(MVT::v4f32); 803 addQRTypeForNEON(MVT::v2f64); 804 addQRTypeForNEON(MVT::v16i8); 805 addQRTypeForNEON(MVT::v8i16); 806 addQRTypeForNEON(MVT::v4i32); 807 addQRTypeForNEON(MVT::v2i64); 808 809 if (Subtarget->hasFullFP16()) { 810 addQRTypeForNEON(MVT::v8f16); 811 addDRTypeForNEON(MVT::v4f16); 812 } 813 814 if (Subtarget->hasBF16()) { 815 addQRTypeForNEON(MVT::v8bf16); 816 addDRTypeForNEON(MVT::v4bf16); 817 } 818 } 819 820 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { 821 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but 822 // none of Neon, MVE or VFP supports any arithmetic operations on it. 823 setOperationAction(ISD::FADD, MVT::v2f64, Expand); 824 setOperationAction(ISD::FSUB, MVT::v2f64, Expand); 825 setOperationAction(ISD::FMUL, MVT::v2f64, Expand); 826 // FIXME: Code duplication: FDIV and FREM are expanded always, see 827 // ARMTargetLowering::addTypeForNEON method for details. 828 setOperationAction(ISD::FDIV, MVT::v2f64, Expand); 829 setOperationAction(ISD::FREM, MVT::v2f64, Expand); 830 // FIXME: Create unittest. 831 // In another words, find a way when "copysign" appears in DAG with vector 832 // operands. 
833 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); 834 // FIXME: Code duplication: SETCC has custom operation action, see 835 // ARMTargetLowering::addTypeForNEON method for details. 836 setOperationAction(ISD::SETCC, MVT::v2f64, Expand); 837 // FIXME: Create unittest for FNEG and for FABS. 838 setOperationAction(ISD::FNEG, MVT::v2f64, Expand); 839 setOperationAction(ISD::FABS, MVT::v2f64, Expand); 840 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); 841 setOperationAction(ISD::FSIN, MVT::v2f64, Expand); 842 setOperationAction(ISD::FCOS, MVT::v2f64, Expand); 843 setOperationAction(ISD::FPOW, MVT::v2f64, Expand); 844 setOperationAction(ISD::FLOG, MVT::v2f64, Expand); 845 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); 846 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); 847 setOperationAction(ISD::FEXP, MVT::v2f64, Expand); 848 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); 849 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. 850 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); 851 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); 852 setOperationAction(ISD::FRINT, MVT::v2f64, Expand); 853 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); 854 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); 855 setOperationAction(ISD::FMA, MVT::v2f64, Expand); 856 } 857 858 if (Subtarget->hasNEON()) { 859 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively 860 // supported for v4f32. 
861 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); 862 setOperationAction(ISD::FSIN, MVT::v4f32, Expand); 863 setOperationAction(ISD::FCOS, MVT::v4f32, Expand); 864 setOperationAction(ISD::FPOW, MVT::v4f32, Expand); 865 setOperationAction(ISD::FLOG, MVT::v4f32, Expand); 866 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); 867 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); 868 setOperationAction(ISD::FEXP, MVT::v4f32, Expand); 869 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); 870 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); 871 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); 872 setOperationAction(ISD::FRINT, MVT::v4f32, Expand); 873 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); 874 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); 875 876 // Mark v2f32 intrinsics. 877 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); 878 setOperationAction(ISD::FSIN, MVT::v2f32, Expand); 879 setOperationAction(ISD::FCOS, MVT::v2f32, Expand); 880 setOperationAction(ISD::FPOW, MVT::v2f32, Expand); 881 setOperationAction(ISD::FLOG, MVT::v2f32, Expand); 882 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); 883 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); 884 setOperationAction(ISD::FEXP, MVT::v2f32, Expand); 885 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); 886 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); 887 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); 888 setOperationAction(ISD::FRINT, MVT::v2f32, Expand); 889 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); 890 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); 891 892 // Neon does not support some operations on v1i64 and v2i64 types. 893 setOperationAction(ISD::MUL, MVT::v1i64, Expand); 894 // Custom handling for some quad-vector types to detect VMULL. 
895 setOperationAction(ISD::MUL, MVT::v8i16, Custom); 896 setOperationAction(ISD::MUL, MVT::v4i32, Custom); 897 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 898 // Custom handling for some vector types to avoid expensive expansions 899 setOperationAction(ISD::SDIV, MVT::v4i16, Custom); 900 setOperationAction(ISD::SDIV, MVT::v8i8, Custom); 901 setOperationAction(ISD::UDIV, MVT::v4i16, Custom); 902 setOperationAction(ISD::UDIV, MVT::v8i8, Custom); 903 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with 904 // a destination type that is wider than the source, and nor does 905 // it have a FP_TO_[SU]INT instruction with a narrower destination than 906 // source. 907 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); 908 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom); 909 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); 910 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); 911 setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); 912 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); 913 setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); 914 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); 915 916 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); 917 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); 918 919 // NEON does not have single instruction CTPOP for vectors with element 920 // types wider than 8-bits. However, custom lowering can leverage the 921 // v8i8/v16i8 vcnt instruction. 
922 setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); 923 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); 924 setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); 925 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); 926 setOperationAction(ISD::CTPOP, MVT::v1i64, Custom); 927 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); 928 929 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); 930 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); 931 932 // NEON does not have single instruction CTTZ for vectors. 933 setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); 934 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); 935 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); 936 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); 937 938 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); 939 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); 940 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); 941 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); 942 943 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); 944 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); 945 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); 946 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); 947 948 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); 949 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); 950 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); 951 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); 952 953 // NEON only has FMA instructions as of VFP4. 
954 if (!Subtarget->hasVFP4Base()) { 955 setOperationAction(ISD::FMA, MVT::v2f32, Expand); 956 setOperationAction(ISD::FMA, MVT::v4f32, Expand); 957 } 958 959 setTargetDAGCombine(ISD::SHL); 960 setTargetDAGCombine(ISD::SRL); 961 setTargetDAGCombine(ISD::SRA); 962 setTargetDAGCombine(ISD::FP_TO_SINT); 963 setTargetDAGCombine(ISD::FP_TO_UINT); 964 setTargetDAGCombine(ISD::FDIV); 965 setTargetDAGCombine(ISD::LOAD); 966 967 // It is legal to extload from v4i8 to v4i16 or v4i32. 968 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, 969 MVT::v2i32}) { 970 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { 971 setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); 972 setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); 973 setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); 974 } 975 } 976 } 977 978 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 979 setTargetDAGCombine(ISD::BUILD_VECTOR); 980 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 981 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); 982 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 983 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); 984 setTargetDAGCombine(ISD::STORE); 985 setTargetDAGCombine(ISD::SIGN_EXTEND); 986 setTargetDAGCombine(ISD::ZERO_EXTEND); 987 setTargetDAGCombine(ISD::ANY_EXTEND); 988 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); 989 setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); 990 setTargetDAGCombine(ISD::INTRINSIC_VOID); 991 setTargetDAGCombine(ISD::VECREDUCE_ADD); 992 setTargetDAGCombine(ISD::ADD); 993 setTargetDAGCombine(ISD::BITCAST); 994 } 995 if (Subtarget->hasMVEIntegerOps()) { 996 setTargetDAGCombine(ISD::SMIN); 997 setTargetDAGCombine(ISD::UMIN); 998 setTargetDAGCombine(ISD::SMAX); 999 setTargetDAGCombine(ISD::UMAX); 1000 setTargetDAGCombine(ISD::FP_EXTEND); 1001 setTargetDAGCombine(ISD::SELECT); 1002 setTargetDAGCombine(ISD::SELECT_CC); 1003 } 1004 1005 if (!Subtarget->hasFP64()) { 1006 // When targeting a floating-point unit with only single-precision 1007 // operations, 
f64 is legal for the few double-precision instructions which 1008 // are present However, no double-precision operations other than moves, 1009 // loads and stores are provided by the hardware. 1010 setOperationAction(ISD::FADD, MVT::f64, Expand); 1011 setOperationAction(ISD::FSUB, MVT::f64, Expand); 1012 setOperationAction(ISD::FMUL, MVT::f64, Expand); 1013 setOperationAction(ISD::FMA, MVT::f64, Expand); 1014 setOperationAction(ISD::FDIV, MVT::f64, Expand); 1015 setOperationAction(ISD::FREM, MVT::f64, Expand); 1016 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 1017 setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); 1018 setOperationAction(ISD::FNEG, MVT::f64, Expand); 1019 setOperationAction(ISD::FABS, MVT::f64, Expand); 1020 setOperationAction(ISD::FSQRT, MVT::f64, Expand); 1021 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1022 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1023 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1024 setOperationAction(ISD::FLOG, MVT::f64, Expand); 1025 setOperationAction(ISD::FLOG2, MVT::f64, Expand); 1026 setOperationAction(ISD::FLOG10, MVT::f64, Expand); 1027 setOperationAction(ISD::FEXP, MVT::f64, Expand); 1028 setOperationAction(ISD::FEXP2, MVT::f64, Expand); 1029 setOperationAction(ISD::FCEIL, MVT::f64, Expand); 1030 setOperationAction(ISD::FTRUNC, MVT::f64, Expand); 1031 setOperationAction(ISD::FRINT, MVT::f64, Expand); 1032 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); 1033 setOperationAction(ISD::FFLOOR, MVT::f64, Expand); 1034 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); 1035 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); 1036 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); 1037 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); 1038 setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); 1039 setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); 1040 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); 1041 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); 
1042 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); 1043 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::f64, Custom); 1044 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::f64, Custom); 1045 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); 1046 } 1047 1048 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) { 1049 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); 1050 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); 1051 if (Subtarget->hasFullFP16()) { 1052 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); 1053 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); 1054 } 1055 } 1056 1057 if (!Subtarget->hasFP16()) { 1058 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); 1059 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); 1060 } 1061 1062 computeRegisterProperties(Subtarget->getRegisterInfo()); 1063 1064 // ARM does not have floating-point extending loads. 1065 for (MVT VT : MVT::fp_valuetypes()) { 1066 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); 1067 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); 1068 } 1069 1070 // ... or truncating stores 1071 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 1072 setTruncStoreAction(MVT::f32, MVT::f16, Expand); 1073 setTruncStoreAction(MVT::f64, MVT::f16, Expand); 1074 1075 // ARM does not have i1 sign extending load. 1076 for (MVT VT : MVT::integer_valuetypes()) 1077 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); 1078 1079 // ARM supports all 4 flavors of integer indexed load / store. 
1080 if (!Subtarget->isThumb1Only()) { 1081 for (unsigned im = (unsigned)ISD::PRE_INC; 1082 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { 1083 setIndexedLoadAction(im, MVT::i1, Legal); 1084 setIndexedLoadAction(im, MVT::i8, Legal); 1085 setIndexedLoadAction(im, MVT::i16, Legal); 1086 setIndexedLoadAction(im, MVT::i32, Legal); 1087 setIndexedStoreAction(im, MVT::i1, Legal); 1088 setIndexedStoreAction(im, MVT::i8, Legal); 1089 setIndexedStoreAction(im, MVT::i16, Legal); 1090 setIndexedStoreAction(im, MVT::i32, Legal); 1091 } 1092 } else { 1093 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. 1094 setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); 1095 setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); 1096 } 1097 1098 setOperationAction(ISD::SADDO, MVT::i32, Custom); 1099 setOperationAction(ISD::UADDO, MVT::i32, Custom); 1100 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 1101 setOperationAction(ISD::USUBO, MVT::i32, Custom); 1102 1103 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); 1104 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); 1105 if (Subtarget->hasDSP()) { 1106 setOperationAction(ISD::SADDSAT, MVT::i8, Custom); 1107 setOperationAction(ISD::SSUBSAT, MVT::i8, Custom); 1108 setOperationAction(ISD::SADDSAT, MVT::i16, Custom); 1109 setOperationAction(ISD::SSUBSAT, MVT::i16, Custom); 1110 } 1111 if (Subtarget->hasBaseDSP()) { 1112 setOperationAction(ISD::SADDSAT, MVT::i32, Legal); 1113 setOperationAction(ISD::SSUBSAT, MVT::i32, Legal); 1114 } 1115 1116 // i64 operation support. 
1117 setOperationAction(ISD::MUL, MVT::i64, Expand); 1118 setOperationAction(ISD::MULHU, MVT::i32, Expand); 1119 if (Subtarget->isThumb1Only()) { 1120 setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); 1121 setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); 1122 } 1123 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() 1124 || (Subtarget->isThumb2() && !Subtarget->hasDSP())) 1125 setOperationAction(ISD::MULHS, MVT::i32, Expand); 1126 1127 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); 1128 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); 1129 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); 1130 setOperationAction(ISD::SRL, MVT::i64, Custom); 1131 setOperationAction(ISD::SRA, MVT::i64, Custom); 1132 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); 1133 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); 1134 setOperationAction(ISD::LOAD, MVT::i64, Custom); 1135 setOperationAction(ISD::STORE, MVT::i64, Custom); 1136 1137 // MVE lowers 64 bit shifts to lsll and lsrl 1138 // assuming that ISD::SRL and SRA of i64 are already marked custom 1139 if (Subtarget->hasMVEIntegerOps()) 1140 setOperationAction(ISD::SHL, MVT::i64, Custom); 1141 1142 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. 1143 if (Subtarget->isThumb1Only()) { 1144 setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); 1145 setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); 1146 setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); 1147 } 1148 1149 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) 1150 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); 1151 1152 // ARM does not have ROTL. 
1153 setOperationAction(ISD::ROTL, MVT::i32, Expand); 1154 for (MVT VT : MVT::fixedlen_vector_valuetypes()) { 1155 setOperationAction(ISD::ROTL, VT, Expand); 1156 setOperationAction(ISD::ROTR, VT, Expand); 1157 } 1158 setOperationAction(ISD::CTTZ, MVT::i32, Custom); 1159 setOperationAction(ISD::CTPOP, MVT::i32, Expand); 1160 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) { 1161 setOperationAction(ISD::CTLZ, MVT::i32, Expand); 1162 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, LibCall); 1163 } 1164 1165 // @llvm.readcyclecounter requires the Performance Monitors extension. 1166 // Default to the 0 expansion on unsupported platforms. 1167 // FIXME: Technically there are older ARM CPUs that have 1168 // implementation-specific ways of obtaining this information. 1169 if (Subtarget->hasPerfMon()) 1170 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); 1171 1172 // Only ARMv6 has BSWAP. 1173 if (!Subtarget->hasV6Ops()) 1174 setOperationAction(ISD::BSWAP, MVT::i32, Expand); 1175 1176 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() 1177 : Subtarget->hasDivideInARMMode(); 1178 if (!hasDivide) { 1179 // These are expanded into libcalls if the cpu doesn't have HW divider. 
1180 setOperationAction(ISD::SDIV, MVT::i32, LibCall); 1181 setOperationAction(ISD::UDIV, MVT::i32, LibCall); 1182 } 1183 1184 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { 1185 setOperationAction(ISD::SDIV, MVT::i32, Custom); 1186 setOperationAction(ISD::UDIV, MVT::i32, Custom); 1187 1188 setOperationAction(ISD::SDIV, MVT::i64, Custom); 1189 setOperationAction(ISD::UDIV, MVT::i64, Custom); 1190 } 1191 1192 setOperationAction(ISD::SREM, MVT::i32, Expand); 1193 setOperationAction(ISD::UREM, MVT::i32, Expand); 1194 1195 // Register based DivRem for AEABI (RTABI 4.2) 1196 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 1197 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 1198 Subtarget->isTargetWindows()) { 1199 setOperationAction(ISD::SREM, MVT::i64, Custom); 1200 setOperationAction(ISD::UREM, MVT::i64, Custom); 1201 HasStandaloneRem = false; 1202 1203 if (Subtarget->isTargetWindows()) { 1204 const struct { 1205 const RTLIB::Libcall Op; 1206 const char * const Name; 1207 const CallingConv::ID CC; 1208 } LibraryCalls[] = { 1209 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1210 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1211 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, 1212 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, 1213 1214 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, 1215 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, 1216 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, 1217 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, 1218 }; 1219 1220 for (const auto &LC : LibraryCalls) { 1221 setLibcallName(LC.Op, LC.Name); 1222 setLibcallCallingConv(LC.Op, LC.CC); 1223 } 1224 } else { 1225 const struct { 1226 const RTLIB::Libcall Op; 1227 const char * const Name; 1228 const CallingConv::ID CC; 1229 } LibraryCalls[] = { 1230 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 
1231 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1232 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, 1233 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, 1234 1235 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1236 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1237 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, 1238 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, 1239 }; 1240 1241 for (const auto &LC : LibraryCalls) { 1242 setLibcallName(LC.Op, LC.Name); 1243 setLibcallCallingConv(LC.Op, LC.CC); 1244 } 1245 } 1246 1247 setOperationAction(ISD::SDIVREM, MVT::i32, Custom); 1248 setOperationAction(ISD::UDIVREM, MVT::i32, Custom); 1249 setOperationAction(ISD::SDIVREM, MVT::i64, Custom); 1250 setOperationAction(ISD::UDIVREM, MVT::i64, Custom); 1251 } else { 1252 setOperationAction(ISD::SDIVREM, MVT::i32, Expand); 1253 setOperationAction(ISD::UDIVREM, MVT::i32, Expand); 1254 } 1255 1256 if (Subtarget->getTargetTriple().isOSMSVCRT()) { 1257 // MSVCRT doesn't have powi; fall back to pow 1258 setLibcallName(RTLIB::POWI_F32, nullptr); 1259 setLibcallName(RTLIB::POWI_F64, nullptr); 1260 } 1261 1262 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); 1263 setOperationAction(ISD::ConstantPool, MVT::i32, Custom); 1264 setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); 1265 setOperationAction(ISD::BlockAddress, MVT::i32, Custom); 1266 1267 setOperationAction(ISD::TRAP, MVT::Other, Legal); 1268 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); 1269 1270 // Use the default implementation. 
1271 setOperationAction(ISD::VASTART, MVT::Other, Custom); 1272 setOperationAction(ISD::VAARG, MVT::Other, Expand); 1273 setOperationAction(ISD::VACOPY, MVT::Other, Expand); 1274 setOperationAction(ISD::VAEND, MVT::Other, Expand); 1275 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 1276 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 1277 1278 if (Subtarget->isTargetWindows()) 1279 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 1280 else 1281 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 1282 1283 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use 1284 // the default expansion. 1285 InsertFencesForAtomic = false; 1286 if (Subtarget->hasAnyDataBarrier() && 1287 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { 1288 // ATOMIC_FENCE needs custom lowering; the others should have been expanded 1289 // to ldrex/strex loops already. 1290 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); 1291 if (!Subtarget->isThumb() || !Subtarget->isMClass()) 1292 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 1293 1294 // On v8, we have particularly efficient implementations of atomic fences 1295 // if they can be combined with nearby atomic loads and stores. 1296 if (!Subtarget->hasAcquireRelease() || 1297 getTargetMachine().getOptLevel() == 0) { 1298 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. 1299 InsertFencesForAtomic = true; 1300 } 1301 } else { 1302 // If there's anything we can use as a barrier, go through custom lowering 1303 // for ATOMIC_FENCE. 1304 // If target has DMB in thumb, Fences can be inserted. 1305 if (Subtarget->hasDataBarrier()) 1306 InsertFencesForAtomic = true; 1307 1308 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, 1309 Subtarget->hasAnyDataBarrier() ? Custom : Expand); 1310 1311 // Set them all for expansion, which will force libcalls. 
1312 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); 1313 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); 1314 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); 1315 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); 1316 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); 1317 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); 1318 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); 1319 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); 1320 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); 1321 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); 1322 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); 1323 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); 1324 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the 1325 // Unordered/Monotonic case. 1326 if (!InsertFencesForAtomic) { 1327 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); 1328 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); 1329 } 1330 } 1331 1332 setOperationAction(ISD::PREFETCH, MVT::Other, Custom); 1333 1334 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. 1335 if (!Subtarget->hasV6Ops()) { 1336 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); 1337 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); 1338 } 1339 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); 1340 1341 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && 1342 !Subtarget->isThumb1Only()) { 1343 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR 1344 // iff target supports vfp2. 1345 setOperationAction(ISD::BITCAST, MVT::i64, Custom); 1346 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); 1347 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); 1348 } 1349 1350 // We want to custom lower some of our intrinsics. 
1351 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 1352 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); 1353 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); 1354 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); 1355 if (Subtarget->useSjLjEH()) 1356 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); 1357 1358 setOperationAction(ISD::SETCC, MVT::i32, Expand); 1359 setOperationAction(ISD::SETCC, MVT::f32, Expand); 1360 setOperationAction(ISD::SETCC, MVT::f64, Expand); 1361 setOperationAction(ISD::SELECT, MVT::i32, Custom); 1362 setOperationAction(ISD::SELECT, MVT::f32, Custom); 1363 setOperationAction(ISD::SELECT, MVT::f64, Custom); 1364 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); 1365 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); 1366 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); 1367 if (Subtarget->hasFullFP16()) { 1368 setOperationAction(ISD::SETCC, MVT::f16, Expand); 1369 setOperationAction(ISD::SELECT, MVT::f16, Custom); 1370 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); 1371 } 1372 1373 setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); 1374 1375 setOperationAction(ISD::BRCOND, MVT::Other, Custom); 1376 setOperationAction(ISD::BR_CC, MVT::i32, Custom); 1377 if (Subtarget->hasFullFP16()) 1378 setOperationAction(ISD::BR_CC, MVT::f16, Custom); 1379 setOperationAction(ISD::BR_CC, MVT::f32, Custom); 1380 setOperationAction(ISD::BR_CC, MVT::f64, Custom); 1381 setOperationAction(ISD::BR_JT, MVT::Other, Custom); 1382 1383 // We don't support sin/cos/fmod/copysign/pow 1384 setOperationAction(ISD::FSIN, MVT::f64, Expand); 1385 setOperationAction(ISD::FSIN, MVT::f32, Expand); 1386 setOperationAction(ISD::FCOS, MVT::f32, Expand); 1387 setOperationAction(ISD::FCOS, MVT::f64, Expand); 1388 setOperationAction(ISD::FSINCOS, MVT::f64, Expand); 1389 setOperationAction(ISD::FSINCOS, MVT::f32, Expand); 1390 setOperationAction(ISD::FREM, MVT::f64, Expand); 1391 
setOperationAction(ISD::FREM, MVT::f32, Expand); 1392 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && 1393 !Subtarget->isThumb1Only()) { 1394 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 1395 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 1396 } 1397 setOperationAction(ISD::FPOW, MVT::f64, Expand); 1398 setOperationAction(ISD::FPOW, MVT::f32, Expand); 1399 1400 if (!Subtarget->hasVFP4Base()) { 1401 setOperationAction(ISD::FMA, MVT::f64, Expand); 1402 setOperationAction(ISD::FMA, MVT::f32, Expand); 1403 } 1404 1405 // Various VFP goodness 1406 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { 1407 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. 1408 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { 1409 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); 1410 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); 1411 } 1412 1413 // fp16 is a special v7 extension that adds f16 <-> f32 conversions. 1414 if (!Subtarget->hasFP16()) { 1415 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); 1416 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); 1417 } 1418 1419 // Strict floating-point comparisons need custom lowering. 1420 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); 1421 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); 1422 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom); 1423 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom); 1424 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom); 1425 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); 1426 } 1427 1428 // Use __sincos_stret if available. 1429 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && 1430 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { 1431 setOperationAction(ISD::FSINCOS, MVT::f64, Custom); 1432 setOperationAction(ISD::FSINCOS, MVT::f32, Custom); 1433 } 1434 1435 // FP-ARMv8 implements a lot of rounding-like FP operations. 
1436 if (Subtarget->hasFPARMv8Base()) { 1437 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 1438 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 1439 setOperationAction(ISD::FROUND, MVT::f32, Legal); 1440 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 1441 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 1442 setOperationAction(ISD::FRINT, MVT::f32, Legal); 1443 setOperationAction(ISD::FMINNUM, MVT::f32, Legal); 1444 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); 1445 if (Subtarget->hasNEON()) { 1446 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); 1447 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); 1448 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); 1449 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); 1450 } 1451 1452 if (Subtarget->hasFP64()) { 1453 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 1454 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 1455 setOperationAction(ISD::FROUND, MVT::f64, Legal); 1456 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 1457 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 1458 setOperationAction(ISD::FRINT, MVT::f64, Legal); 1459 setOperationAction(ISD::FMINNUM, MVT::f64, Legal); 1460 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); 1461 } 1462 } 1463 1464 // FP16 often need to be promoted to call lib functions 1465 if (Subtarget->hasFullFP16()) { 1466 setOperationAction(ISD::FREM, MVT::f16, Promote); 1467 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); 1468 setOperationAction(ISD::FSIN, MVT::f16, Promote); 1469 setOperationAction(ISD::FCOS, MVT::f16, Promote); 1470 setOperationAction(ISD::FSINCOS, MVT::f16, Promote); 1471 setOperationAction(ISD::FPOWI, MVT::f16, Promote); 1472 setOperationAction(ISD::FPOW, MVT::f16, Promote); 1473 setOperationAction(ISD::FEXP, MVT::f16, Promote); 1474 setOperationAction(ISD::FEXP2, MVT::f16, Promote); 1475 setOperationAction(ISD::FLOG, MVT::f16, Promote); 1476 setOperationAction(ISD::FLOG10, MVT::f16, Promote); 1477 
setOperationAction(ISD::FLOG2, MVT::f16, Promote); 1478 1479 setOperationAction(ISD::FROUND, MVT::f16, Legal); 1480 } 1481 1482 if (Subtarget->hasNEON()) { 1483 // vmin and vmax aren't available in a scalar form, so we can use 1484 // a NEON instruction with an undef lane instead. This has a performance 1485 // penalty on some cores, so we don't do this unless we have been 1486 // asked to by the core tuning model. 1487 if (Subtarget->useNEONForSinglePrecisionFP()) { 1488 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal); 1489 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal); 1490 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal); 1491 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal); 1492 } 1493 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal); 1494 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal); 1495 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal); 1496 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal); 1497 1498 if (Subtarget->hasFullFP16()) { 1499 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal); 1500 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal); 1501 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal); 1502 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal); 1503 1504 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal); 1505 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal); 1506 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal); 1507 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal); 1508 } 1509 } 1510 1511 // We have target-specific dag combine patterns for the following nodes: 1512 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine 1513 setTargetDAGCombine(ISD::ADD); 1514 setTargetDAGCombine(ISD::SUB); 1515 setTargetDAGCombine(ISD::MUL); 1516 setTargetDAGCombine(ISD::AND); 1517 setTargetDAGCombine(ISD::OR); 1518 setTargetDAGCombine(ISD::XOR); 1519 1520 if (Subtarget->hasMVEIntegerOps()) 1521 setTargetDAGCombine(ISD::VSELECT); 1522 1523 if (Subtarget->hasV6Ops()) 1524 setTargetDAGCombine(ISD::SRL); 1525 if 
(Subtarget->isThumb1Only()) 1526 setTargetDAGCombine(ISD::SHL); 1527 1528 setStackPointerRegisterToSaveRestore(ARM::SP); 1529 1530 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || 1531 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) 1532 setSchedulingPreference(Sched::RegPressure); 1533 else 1534 setSchedulingPreference(Sched::Hybrid); 1535 1536 //// temporary - rewrite interface to use type 1537 MaxStoresPerMemset = 8; 1538 MaxStoresPerMemsetOptSize = 4; 1539 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores 1540 MaxStoresPerMemcpyOptSize = 2; 1541 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores 1542 MaxStoresPerMemmoveOptSize = 2; 1543 1544 // On ARM arguments smaller than 4 bytes are extended, so all arguments 1545 // are at least 4 bytes aligned. 1546 setMinStackArgumentAlignment(Align(4)); 1547 1548 // Prefer likely predicted branches to selects on out-of-order cores. 1549 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); 1550 1551 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment())); 1552 1553 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4)); 1554 1555 if (Subtarget->isThumb() || Subtarget->isThumb2()) 1556 setTargetDAGCombine(ISD::ABS); 1557 } 1558 1559 bool ARMTargetLowering::useSoftFloat() const { 1560 return Subtarget->useSoftFloat(); 1561 } 1562 1563 // FIXME: It might make sense to define the representative register class as the 1564 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is 1565 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, 1566 // SPR's representative would be DPR_VFP2. This should work well if register 1567 // pressure tracking were modified such that a register use would increment the 1568 // pressure of the register class's representative and all of it's super 1569 // classes' representatives transitively. 
We have not implemented this because 1570 // of the difficulty prior to coalescing of modeling operand register classes 1571 // due to the common occurrence of cross class copies and subregister insertions 1572 // and extractions. 1573 std::pair<const TargetRegisterClass *, uint8_t> 1574 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, 1575 MVT VT) const { 1576 const TargetRegisterClass *RRC = nullptr; 1577 uint8_t Cost = 1; 1578 switch (VT.SimpleTy) { 1579 default: 1580 return TargetLowering::findRepresentativeClass(TRI, VT); 1581 // Use DPR as representative register class for all floating point 1582 // and vector types. Since there are 32 SPR registers and 32 DPR registers so 1583 // the cost is 1 for both f32 and f64. 1584 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: 1585 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: 1586 RRC = &ARM::DPRRegClass; 1587 // When NEON is used for SP, only half of the register file is available 1588 // because operations that define both SP and DP results will be constrained 1589 // to the VFP2 class (D0-D15). We currently model this constraint prior to 1590 // coalescing by double-counting the SP regs. See the FIXME above. 
1591 if (Subtarget->useNEONForSinglePrecisionFP()) 1592 Cost = 2; 1593 break; 1594 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1595 case MVT::v4f32: case MVT::v2f64: 1596 RRC = &ARM::DPRRegClass; 1597 Cost = 2; 1598 break; 1599 case MVT::v4i64: 1600 RRC = &ARM::DPRRegClass; 1601 Cost = 4; 1602 break; 1603 case MVT::v8i64: 1604 RRC = &ARM::DPRRegClass; 1605 Cost = 8; 1606 break; 1607 } 1608 return std::make_pair(RRC, Cost); 1609 } 1610 1611 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { 1612 #define MAKE_CASE(V) \ 1613 case V: \ 1614 return #V; 1615 switch ((ARMISD::NodeType)Opcode) { 1616 case ARMISD::FIRST_NUMBER: 1617 break; 1618 MAKE_CASE(ARMISD::Wrapper) 1619 MAKE_CASE(ARMISD::WrapperPIC) 1620 MAKE_CASE(ARMISD::WrapperJT) 1621 MAKE_CASE(ARMISD::COPY_STRUCT_BYVAL) 1622 MAKE_CASE(ARMISD::CALL) 1623 MAKE_CASE(ARMISD::CALL_PRED) 1624 MAKE_CASE(ARMISD::CALL_NOLINK) 1625 MAKE_CASE(ARMISD::tSECALL) 1626 MAKE_CASE(ARMISD::BRCOND) 1627 MAKE_CASE(ARMISD::BR_JT) 1628 MAKE_CASE(ARMISD::BR2_JT) 1629 MAKE_CASE(ARMISD::RET_FLAG) 1630 MAKE_CASE(ARMISD::SERET_FLAG) 1631 MAKE_CASE(ARMISD::INTRET_FLAG) 1632 MAKE_CASE(ARMISD::PIC_ADD) 1633 MAKE_CASE(ARMISD::CMP) 1634 MAKE_CASE(ARMISD::CMN) 1635 MAKE_CASE(ARMISD::CMPZ) 1636 MAKE_CASE(ARMISD::CMPFP) 1637 MAKE_CASE(ARMISD::CMPFPE) 1638 MAKE_CASE(ARMISD::CMPFPw0) 1639 MAKE_CASE(ARMISD::CMPFPEw0) 1640 MAKE_CASE(ARMISD::BCC_i64) 1641 MAKE_CASE(ARMISD::FMSTAT) 1642 MAKE_CASE(ARMISD::CMOV) 1643 MAKE_CASE(ARMISD::SUBS) 1644 MAKE_CASE(ARMISD::SSAT) 1645 MAKE_CASE(ARMISD::USAT) 1646 MAKE_CASE(ARMISD::ASRL) 1647 MAKE_CASE(ARMISD::LSRL) 1648 MAKE_CASE(ARMISD::LSLL) 1649 MAKE_CASE(ARMISD::SRL_FLAG) 1650 MAKE_CASE(ARMISD::SRA_FLAG) 1651 MAKE_CASE(ARMISD::RRX) 1652 MAKE_CASE(ARMISD::ADDC) 1653 MAKE_CASE(ARMISD::ADDE) 1654 MAKE_CASE(ARMISD::SUBC) 1655 MAKE_CASE(ARMISD::SUBE) 1656 MAKE_CASE(ARMISD::LSLS) 1657 MAKE_CASE(ARMISD::VMOVRRD) 1658 MAKE_CASE(ARMISD::VMOVDRR) 1659 MAKE_CASE(ARMISD::VMOVhr) 
1660 MAKE_CASE(ARMISD::VMOVrh) 1661 MAKE_CASE(ARMISD::VMOVSR) 1662 MAKE_CASE(ARMISD::EH_SJLJ_SETJMP) 1663 MAKE_CASE(ARMISD::EH_SJLJ_LONGJMP) 1664 MAKE_CASE(ARMISD::EH_SJLJ_SETUP_DISPATCH) 1665 MAKE_CASE(ARMISD::TC_RETURN) 1666 MAKE_CASE(ARMISD::THREAD_POINTER) 1667 MAKE_CASE(ARMISD::DYN_ALLOC) 1668 MAKE_CASE(ARMISD::MEMBARRIER_MCR) 1669 MAKE_CASE(ARMISD::PRELOAD) 1670 MAKE_CASE(ARMISD::LDRD) 1671 MAKE_CASE(ARMISD::STRD) 1672 MAKE_CASE(ARMISD::WIN__CHKSTK) 1673 MAKE_CASE(ARMISD::WIN__DBZCHK) 1674 MAKE_CASE(ARMISD::PREDICATE_CAST) 1675 MAKE_CASE(ARMISD::VECTOR_REG_CAST) 1676 MAKE_CASE(ARMISD::VCMP) 1677 MAKE_CASE(ARMISD::VCMPZ) 1678 MAKE_CASE(ARMISD::VTST) 1679 MAKE_CASE(ARMISD::VSHLs) 1680 MAKE_CASE(ARMISD::VSHLu) 1681 MAKE_CASE(ARMISD::VSHLIMM) 1682 MAKE_CASE(ARMISD::VSHRsIMM) 1683 MAKE_CASE(ARMISD::VSHRuIMM) 1684 MAKE_CASE(ARMISD::VRSHRsIMM) 1685 MAKE_CASE(ARMISD::VRSHRuIMM) 1686 MAKE_CASE(ARMISD::VRSHRNIMM) 1687 MAKE_CASE(ARMISD::VQSHLsIMM) 1688 MAKE_CASE(ARMISD::VQSHLuIMM) 1689 MAKE_CASE(ARMISD::VQSHLsuIMM) 1690 MAKE_CASE(ARMISD::VQSHRNsIMM) 1691 MAKE_CASE(ARMISD::VQSHRNuIMM) 1692 MAKE_CASE(ARMISD::VQSHRNsuIMM) 1693 MAKE_CASE(ARMISD::VQRSHRNsIMM) 1694 MAKE_CASE(ARMISD::VQRSHRNuIMM) 1695 MAKE_CASE(ARMISD::VQRSHRNsuIMM) 1696 MAKE_CASE(ARMISD::VSLIIMM) 1697 MAKE_CASE(ARMISD::VSRIIMM) 1698 MAKE_CASE(ARMISD::VGETLANEu) 1699 MAKE_CASE(ARMISD::VGETLANEs) 1700 MAKE_CASE(ARMISD::VMOVIMM) 1701 MAKE_CASE(ARMISD::VMVNIMM) 1702 MAKE_CASE(ARMISD::VMOVFPIMM) 1703 MAKE_CASE(ARMISD::VDUP) 1704 MAKE_CASE(ARMISD::VDUPLANE) 1705 MAKE_CASE(ARMISD::VEXT) 1706 MAKE_CASE(ARMISD::VREV64) 1707 MAKE_CASE(ARMISD::VREV32) 1708 MAKE_CASE(ARMISD::VREV16) 1709 MAKE_CASE(ARMISD::VZIP) 1710 MAKE_CASE(ARMISD::VUZP) 1711 MAKE_CASE(ARMISD::VTRN) 1712 MAKE_CASE(ARMISD::VTBL1) 1713 MAKE_CASE(ARMISD::VTBL2) 1714 MAKE_CASE(ARMISD::VMOVN) 1715 MAKE_CASE(ARMISD::VQMOVNs) 1716 MAKE_CASE(ARMISD::VQMOVNu) 1717 MAKE_CASE(ARMISD::VCVTN) 1718 MAKE_CASE(ARMISD::VCVTL) 1719 MAKE_CASE(ARMISD::VIDUP) 1720 
MAKE_CASE(ARMISD::VMULLs) 1721 MAKE_CASE(ARMISD::VMULLu) 1722 MAKE_CASE(ARMISD::VQDMULH) 1723 MAKE_CASE(ARMISD::VADDVs) 1724 MAKE_CASE(ARMISD::VADDVu) 1725 MAKE_CASE(ARMISD::VADDVps) 1726 MAKE_CASE(ARMISD::VADDVpu) 1727 MAKE_CASE(ARMISD::VADDLVs) 1728 MAKE_CASE(ARMISD::VADDLVu) 1729 MAKE_CASE(ARMISD::VADDLVAs) 1730 MAKE_CASE(ARMISD::VADDLVAu) 1731 MAKE_CASE(ARMISD::VADDLVps) 1732 MAKE_CASE(ARMISD::VADDLVpu) 1733 MAKE_CASE(ARMISD::VADDLVAps) 1734 MAKE_CASE(ARMISD::VADDLVApu) 1735 MAKE_CASE(ARMISD::VMLAVs) 1736 MAKE_CASE(ARMISD::VMLAVu) 1737 MAKE_CASE(ARMISD::VMLAVps) 1738 MAKE_CASE(ARMISD::VMLAVpu) 1739 MAKE_CASE(ARMISD::VMLALVs) 1740 MAKE_CASE(ARMISD::VMLALVu) 1741 MAKE_CASE(ARMISD::VMLALVps) 1742 MAKE_CASE(ARMISD::VMLALVpu) 1743 MAKE_CASE(ARMISD::VMLALVAs) 1744 MAKE_CASE(ARMISD::VMLALVAu) 1745 MAKE_CASE(ARMISD::VMLALVAps) 1746 MAKE_CASE(ARMISD::VMLALVApu) 1747 MAKE_CASE(ARMISD::VMINVu) 1748 MAKE_CASE(ARMISD::VMINVs) 1749 MAKE_CASE(ARMISD::VMAXVu) 1750 MAKE_CASE(ARMISD::VMAXVs) 1751 MAKE_CASE(ARMISD::UMAAL) 1752 MAKE_CASE(ARMISD::UMLAL) 1753 MAKE_CASE(ARMISD::SMLAL) 1754 MAKE_CASE(ARMISD::SMLALBB) 1755 MAKE_CASE(ARMISD::SMLALBT) 1756 MAKE_CASE(ARMISD::SMLALTB) 1757 MAKE_CASE(ARMISD::SMLALTT) 1758 MAKE_CASE(ARMISD::SMULWB) 1759 MAKE_CASE(ARMISD::SMULWT) 1760 MAKE_CASE(ARMISD::SMLALD) 1761 MAKE_CASE(ARMISD::SMLALDX) 1762 MAKE_CASE(ARMISD::SMLSLD) 1763 MAKE_CASE(ARMISD::SMLSLDX) 1764 MAKE_CASE(ARMISD::SMMLAR) 1765 MAKE_CASE(ARMISD::SMMLSR) 1766 MAKE_CASE(ARMISD::QADD16b) 1767 MAKE_CASE(ARMISD::QSUB16b) 1768 MAKE_CASE(ARMISD::QADD8b) 1769 MAKE_CASE(ARMISD::QSUB8b) 1770 MAKE_CASE(ARMISD::BUILD_VECTOR) 1771 MAKE_CASE(ARMISD::BFI) 1772 MAKE_CASE(ARMISD::VORRIMM) 1773 MAKE_CASE(ARMISD::VBICIMM) 1774 MAKE_CASE(ARMISD::VBSP) 1775 MAKE_CASE(ARMISD::MEMCPY) 1776 MAKE_CASE(ARMISD::VLD1DUP) 1777 MAKE_CASE(ARMISD::VLD2DUP) 1778 MAKE_CASE(ARMISD::VLD3DUP) 1779 MAKE_CASE(ARMISD::VLD4DUP) 1780 MAKE_CASE(ARMISD::VLD1_UPD) 1781 MAKE_CASE(ARMISD::VLD2_UPD) 1782 
MAKE_CASE(ARMISD::VLD3_UPD) 1783 MAKE_CASE(ARMISD::VLD4_UPD) 1784 MAKE_CASE(ARMISD::VLD2LN_UPD) 1785 MAKE_CASE(ARMISD::VLD3LN_UPD) 1786 MAKE_CASE(ARMISD::VLD4LN_UPD) 1787 MAKE_CASE(ARMISD::VLD1DUP_UPD) 1788 MAKE_CASE(ARMISD::VLD2DUP_UPD) 1789 MAKE_CASE(ARMISD::VLD3DUP_UPD) 1790 MAKE_CASE(ARMISD::VLD4DUP_UPD) 1791 MAKE_CASE(ARMISD::VST1_UPD) 1792 MAKE_CASE(ARMISD::VST2_UPD) 1793 MAKE_CASE(ARMISD::VST3_UPD) 1794 MAKE_CASE(ARMISD::VST4_UPD) 1795 MAKE_CASE(ARMISD::VST1x2_UPD) 1796 MAKE_CASE(ARMISD::VST1x3_UPD) 1797 MAKE_CASE(ARMISD::VST1x4_UPD) 1798 MAKE_CASE(ARMISD::VST2LN_UPD) 1799 MAKE_CASE(ARMISD::VST3LN_UPD) 1800 MAKE_CASE(ARMISD::VST4LN_UPD) 1801 MAKE_CASE(ARMISD::WLS) 1802 MAKE_CASE(ARMISD::WLSSETUP) 1803 MAKE_CASE(ARMISD::LE) 1804 MAKE_CASE(ARMISD::LOOP_DEC) 1805 MAKE_CASE(ARMISD::CSINV) 1806 MAKE_CASE(ARMISD::CSNEG) 1807 MAKE_CASE(ARMISD::CSINC) 1808 MAKE_CASE(ARMISD::MEMCPYLOOP) 1809 MAKE_CASE(ARMISD::MEMSETLOOP) 1810 #undef MAKE_CASE 1811 } 1812 return nullptr; 1813 } 1814 1815 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, 1816 EVT VT) const { 1817 if (!VT.isVector()) 1818 return getPointerTy(DL); 1819 1820 // MVE has a predicate register. 1821 if (Subtarget->hasMVEIntegerOps() && 1822 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8)) 1823 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount()); 1824 return VT.changeVectorElementTypeToInteger(); 1825 } 1826 1827 /// getRegClassFor - Return the register class that should be used for the 1828 /// specified value type. 1829 const TargetRegisterClass * 1830 ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { 1831 (void)isDivergent; 1832 // Map v4i64 to QQ registers but do not make the type legal. Similarly map 1833 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to 1834 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive 1835 // MVE Q registers. 
1836 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { 1837 if (VT == MVT::v4i64) 1838 return &ARM::QQPRRegClass; 1839 if (VT == MVT::v8i64) 1840 return &ARM::QQQQPRRegClass; 1841 } 1842 return TargetLowering::getRegClassFor(VT); 1843 } 1844 1845 // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the 1846 // source/dest is aligned and the copy size is large enough. We therefore want 1847 // to align such objects passed to memory intrinsics. 1848 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, 1849 unsigned &PrefAlign) const { 1850 if (!isa<MemIntrinsic>(CI)) 1851 return false; 1852 MinSize = 8; 1853 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 1854 // cycle faster than 4-byte aligned LDM. 1855 PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); 1856 return true; 1857 } 1858 1859 // Create a fast isel object. 1860 FastISel * 1861 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, 1862 const TargetLibraryInfo *libInfo) const { 1863 return ARM::createFastISel(funcInfo, libInfo); 1864 } 1865 1866 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { 1867 unsigned NumVals = N->getNumValues(); 1868 if (!NumVals) 1869 return Sched::RegPressure; 1870 1871 for (unsigned i = 0; i != NumVals; ++i) { 1872 EVT VT = N->getValueType(i); 1873 if (VT == MVT::Glue || VT == MVT::Other) 1874 continue; 1875 if (VT.isFloatingPoint() || VT.isVector()) 1876 return Sched::ILP; 1877 } 1878 1879 if (!N->isMachineOpcode()) 1880 return Sched::RegPressure; 1881 1882 // Load are scheduled for latency even if there instruction itinerary 1883 // is not available. 
1884 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 1885 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); 1886 1887 if (MCID.getNumDefs() == 0) 1888 return Sched::RegPressure; 1889 if (!Itins->isEmpty() && 1890 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) 1891 return Sched::ILP; 1892 1893 return Sched::RegPressure; 1894 } 1895 1896 //===----------------------------------------------------------------------===// 1897 // Lowering Code 1898 //===----------------------------------------------------------------------===// 1899 1900 static bool isSRL16(const SDValue &Op) { 1901 if (Op.getOpcode() != ISD::SRL) 1902 return false; 1903 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1904 return Const->getZExtValue() == 16; 1905 return false; 1906 } 1907 1908 static bool isSRA16(const SDValue &Op) { 1909 if (Op.getOpcode() != ISD::SRA) 1910 return false; 1911 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1912 return Const->getZExtValue() == 16; 1913 return false; 1914 } 1915 1916 static bool isSHL16(const SDValue &Op) { 1917 if (Op.getOpcode() != ISD::SHL) 1918 return false; 1919 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1))) 1920 return Const->getZExtValue() == 16; 1921 return false; 1922 } 1923 1924 // Check for a signed 16-bit value. We special case SRA because it makes it 1925 // more simple when also looking for SRAs that aren't sign extending a 1926 // smaller value. Without the check, we'd need to take extra care with 1927 // checking order for some operations. 
1928 static bool isS16(const SDValue &Op, SelectionDAG &DAG) { 1929 if (isSRA16(Op)) 1930 return isSHL16(Op.getOperand(0)); 1931 return DAG.ComputeNumSignBits(Op) == 17; 1932 } 1933 1934 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC 1935 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { 1936 switch (CC) { 1937 default: llvm_unreachable("Unknown condition code!"); 1938 case ISD::SETNE: return ARMCC::NE; 1939 case ISD::SETEQ: return ARMCC::EQ; 1940 case ISD::SETGT: return ARMCC::GT; 1941 case ISD::SETGE: return ARMCC::GE; 1942 case ISD::SETLT: return ARMCC::LT; 1943 case ISD::SETLE: return ARMCC::LE; 1944 case ISD::SETUGT: return ARMCC::HI; 1945 case ISD::SETUGE: return ARMCC::HS; 1946 case ISD::SETULT: return ARMCC::LO; 1947 case ISD::SETULE: return ARMCC::LS; 1948 } 1949 } 1950 1951 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. 1952 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, 1953 ARMCC::CondCodes &CondCode2) { 1954 CondCode2 = ARMCC::AL; 1955 switch (CC) { 1956 default: llvm_unreachable("Unknown FP condition!"); 1957 case ISD::SETEQ: 1958 case ISD::SETOEQ: CondCode = ARMCC::EQ; break; 1959 case ISD::SETGT: 1960 case ISD::SETOGT: CondCode = ARMCC::GT; break; 1961 case ISD::SETGE: 1962 case ISD::SETOGE: CondCode = ARMCC::GE; break; 1963 case ISD::SETOLT: CondCode = ARMCC::MI; break; 1964 case ISD::SETOLE: CondCode = ARMCC::LS; break; 1965 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break; 1966 case ISD::SETO: CondCode = ARMCC::VC; break; 1967 case ISD::SETUO: CondCode = ARMCC::VS; break; 1968 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break; 1969 case ISD::SETUGT: CondCode = ARMCC::HI; break; 1970 case ISD::SETUGE: CondCode = ARMCC::PL; break; 1971 case ISD::SETLT: 1972 case ISD::SETULT: CondCode = ARMCC::LT; break; 1973 case ISD::SETLE: 1974 case ISD::SETULE: CondCode = ARMCC::LE; break; 1975 case ISD::SETNE: 1976 case ISD::SETUNE: CondCode = ARMCC::NE; 
break; 1977 } 1978 } 1979 1980 //===----------------------------------------------------------------------===// 1981 // Calling Convention Implementation 1982 //===----------------------------------------------------------------------===// 1983 1984 /// getEffectiveCallingConv - Get the effective calling convention, taking into 1985 /// account presence of floating point hardware and calling convention 1986 /// limitations, such as support for variadic functions. 1987 CallingConv::ID 1988 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, 1989 bool isVarArg) const { 1990 switch (CC) { 1991 default: 1992 report_fatal_error("Unsupported calling convention"); 1993 case CallingConv::ARM_AAPCS: 1994 case CallingConv::ARM_APCS: 1995 case CallingConv::GHC: 1996 case CallingConv::CFGuard_Check: 1997 return CC; 1998 case CallingConv::PreserveMost: 1999 return CallingConv::PreserveMost; 2000 case CallingConv::ARM_AAPCS_VFP: 2001 case CallingConv::Swift: 2002 case CallingConv::SwiftTail: 2003 return isVarArg ? 
CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; 2004 case CallingConv::C: 2005 if (!Subtarget->isAAPCS_ABI()) 2006 return CallingConv::ARM_APCS; 2007 else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && 2008 getTargetMachine().Options.FloatABIType == FloatABI::Hard && 2009 !isVarArg) 2010 return CallingConv::ARM_AAPCS_VFP; 2011 else 2012 return CallingConv::ARM_AAPCS; 2013 case CallingConv::Fast: 2014 case CallingConv::CXX_FAST_TLS: 2015 if (!Subtarget->isAAPCS_ABI()) { 2016 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) 2017 return CallingConv::Fast; 2018 return CallingConv::ARM_APCS; 2019 } else if (Subtarget->hasVFP2Base() && 2020 !Subtarget->isThumb1Only() && !isVarArg) 2021 return CallingConv::ARM_AAPCS_VFP; 2022 else 2023 return CallingConv::ARM_AAPCS; 2024 } 2025 } 2026 2027 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, 2028 bool isVarArg) const { 2029 return CCAssignFnForNode(CC, false, isVarArg); 2030 } 2031 2032 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, 2033 bool isVarArg) const { 2034 return CCAssignFnForNode(CC, true, isVarArg); 2035 } 2036 2037 /// CCAssignFnForNode - Selects the correct CCAssignFn for the given 2038 /// CallingConvention. 2039 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, 2040 bool Return, 2041 bool isVarArg) const { 2042 switch (getEffectiveCallingConv(CC, isVarArg)) { 2043 default: 2044 report_fatal_error("Unsupported calling convention"); 2045 case CallingConv::ARM_APCS: 2046 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); 2047 case CallingConv::ARM_AAPCS: 2048 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 2049 case CallingConv::ARM_AAPCS_VFP: 2050 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); 2051 case CallingConv::Fast: 2052 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); 2053 case CallingConv::GHC: 2054 return (Return ? 
RetCC_ARM_APCS : CC_ARM_APCS_GHC); 2055 case CallingConv::PreserveMost: 2056 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); 2057 case CallingConv::CFGuard_Check: 2058 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check); 2059 } 2060 } 2061 2062 SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, 2063 MVT LocVT, MVT ValVT, SDValue Val) const { 2064 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()), 2065 Val); 2066 if (Subtarget->hasFullFP16()) { 2067 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val); 2068 } else { 2069 Val = DAG.getNode(ISD::TRUNCATE, dl, 2070 MVT::getIntegerVT(ValVT.getSizeInBits()), Val); 2071 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val); 2072 } 2073 return Val; 2074 } 2075 2076 SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, 2077 MVT LocVT, MVT ValVT, 2078 SDValue Val) const { 2079 if (Subtarget->hasFullFP16()) { 2080 Val = DAG.getNode(ARMISD::VMOVrh, dl, 2081 MVT::getIntegerVT(LocVT.getSizeInBits()), Val); 2082 } else { 2083 Val = DAG.getNode(ISD::BITCAST, dl, 2084 MVT::getIntegerVT(ValVT.getSizeInBits()), Val); 2085 Val = DAG.getNode(ISD::ZERO_EXTEND, dl, 2086 MVT::getIntegerVT(LocVT.getSizeInBits()), Val); 2087 } 2088 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val); 2089 } 2090 2091 /// LowerCallResult - Lower the result values of a call into the 2092 /// appropriate copies out of appropriate physical registers. 2093 SDValue ARMTargetLowering::LowerCallResult( 2094 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, 2095 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 2096 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, 2097 SDValue ThisVal) const { 2098 // Assign locations to each value returned by this call. 
2099 SmallVector<CCValAssign, 16> RVLocs; 2100 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2101 *DAG.getContext()); 2102 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); 2103 2104 // Copy all of the result registers out of their specified physreg. 2105 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2106 CCValAssign VA = RVLocs[i]; 2107 2108 // Pass 'this' value directly from the argument to return value, to avoid 2109 // reg unit interference 2110 if (i == 0 && isThisReturn) { 2111 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && 2112 "unexpected return calling convention register assignment"); 2113 InVals.push_back(ThisVal); 2114 continue; 2115 } 2116 2117 SDValue Val; 2118 if (VA.needsCustom() && 2119 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) { 2120 // Handle f64 or half of a v2f64. 2121 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 2122 InFlag); 2123 Chain = Lo.getValue(1); 2124 InFlag = Lo.getValue(2); 2125 VA = RVLocs[++i]; // skip ahead to next loc 2126 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, 2127 InFlag); 2128 Chain = Hi.getValue(1); 2129 InFlag = Hi.getValue(2); 2130 if (!Subtarget->isLittle()) 2131 std::swap (Lo, Hi); 2132 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2133 2134 if (VA.getLocVT() == MVT::v2f64) { 2135 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 2136 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2137 DAG.getConstant(0, dl, MVT::i32)); 2138 2139 VA = RVLocs[++i]; // skip ahead to next loc 2140 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2141 Chain = Lo.getValue(1); 2142 InFlag = Lo.getValue(2); 2143 VA = RVLocs[++i]; // skip ahead to next loc 2144 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); 2145 Chain = Hi.getValue(1); 2146 InFlag = Hi.getValue(2); 2147 if (!Subtarget->isLittle()) 2148 std::swap (Lo, Hi); 2149 Val = 
DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 2150 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, 2151 DAG.getConstant(1, dl, MVT::i32)); 2152 } 2153 } else { 2154 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), 2155 InFlag); 2156 Chain = Val.getValue(1); 2157 InFlag = Val.getValue(2); 2158 } 2159 2160 switch (VA.getLocInfo()) { 2161 default: llvm_unreachable("Unknown loc info!"); 2162 case CCValAssign::Full: break; 2163 case CCValAssign::BCvt: 2164 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); 2165 break; 2166 } 2167 2168 // f16 arguments have their size extended to 4 bytes and passed as if they 2169 // had been copied to the LSBs of a 32-bit register. 2170 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 2171 if (VA.needsCustom() && 2172 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) 2173 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val); 2174 2175 InVals.push_back(Val); 2176 } 2177 2178 return Chain; 2179 } 2180 2181 /// LowerMemOpCallTo - Store the argument to the stack. 
2182 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, 2183 SDValue Arg, const SDLoc &dl, 2184 SelectionDAG &DAG, 2185 const CCValAssign &VA, 2186 ISD::ArgFlagsTy Flags) const { 2187 unsigned LocMemOffset = VA.getLocMemOffset(); 2188 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2189 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), 2190 StackPtr, PtrOff); 2191 return DAG.getStore( 2192 Chain, dl, Arg, PtrOff, 2193 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); 2194 } 2195 2196 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, 2197 SDValue Chain, SDValue &Arg, 2198 RegsToPassVector &RegsToPass, 2199 CCValAssign &VA, CCValAssign &NextVA, 2200 SDValue &StackPtr, 2201 SmallVectorImpl<SDValue> &MemOpChains, 2202 ISD::ArgFlagsTy Flags) const { 2203 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 2204 DAG.getVTList(MVT::i32, MVT::i32), Arg); 2205 unsigned id = Subtarget->isLittle() ? 0 : 1; 2206 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); 2207 2208 if (NextVA.isRegLoc()) 2209 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); 2210 else { 2211 assert(NextVA.isMemLoc()); 2212 if (!StackPtr.getNode()) 2213 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, 2214 getPointerTy(DAG.getDataLayout())); 2215 2216 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), 2217 dl, DAG, NextVA, 2218 Flags)); 2219 } 2220 } 2221 2222 /// LowerCall - Lowering a call into a callseq_start <- 2223 /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter 2224 /// nodes. 
2225 SDValue 2226 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, 2227 SmallVectorImpl<SDValue> &InVals) const { 2228 SelectionDAG &DAG = CLI.DAG; 2229 SDLoc &dl = CLI.DL; 2230 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; 2231 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; 2232 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; 2233 SDValue Chain = CLI.Chain; 2234 SDValue Callee = CLI.Callee; 2235 bool &isTailCall = CLI.IsTailCall; 2236 CallingConv::ID CallConv = CLI.CallConv; 2237 bool doesNotRet = CLI.DoesNotReturn; 2238 bool isVarArg = CLI.IsVarArg; 2239 2240 MachineFunction &MF = DAG.getMachineFunction(); 2241 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2242 MachineFunction::CallSiteInfo CSInfo; 2243 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); 2244 bool isThisReturn = false; 2245 bool isCmseNSCall = false; 2246 bool PreferIndirect = false; 2247 2248 // Determine whether this is a non-secure function call. 2249 if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call")) 2250 isCmseNSCall = true; 2251 2252 // Disable tail calls if they're not supported. 2253 if (!Subtarget->supportsTailCall()) 2254 isTailCall = false; 2255 2256 // For both the non-secure calls and the returns from a CMSE entry function, 2257 // the function needs to do some extra work afte r the call, or before the 2258 // return, respectively, thus it cannot end with atail call 2259 if (isCmseNSCall || AFI->isCmseNSEntryFunction()) 2260 isTailCall = false; 2261 2262 if (isa<GlobalAddressSDNode>(Callee)) { 2263 // If we're optimizing for minimum size and the function is called three or 2264 // more times in this block, we can improve codesize by calling indirectly 2265 // as BLXr has a 16-bit encoding. 
2266 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); 2267 if (CLI.CB) { 2268 auto *BB = CLI.CB->getParent(); 2269 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && 2270 count_if(GV->users(), [&BB](const User *U) { 2271 return isa<Instruction>(U) && 2272 cast<Instruction>(U)->getParent() == BB; 2273 }) > 2; 2274 } 2275 } 2276 if (isTailCall) { 2277 // Check if it's really possible to do a tail call. 2278 isTailCall = IsEligibleForTailCallOptimization( 2279 Callee, CallConv, isVarArg, isStructRet, 2280 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, 2281 PreferIndirect); 2282 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall()) 2283 report_fatal_error("failed to perform tail call elimination on a call " 2284 "site marked musttail"); 2285 // We don't support GuaranteedTailCallOpt for ARM, only automatically 2286 // detected sibcalls. 2287 if (isTailCall) 2288 ++NumTailCalls; 2289 } 2290 2291 // Analyze operands of the call, assigning locations to each operand. 2292 SmallVector<CCValAssign, 16> ArgLocs; 2293 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 2294 *DAG.getContext()); 2295 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); 2296 2297 // Get a count of how many bytes are to be pushed on the stack. 2298 unsigned NumBytes = CCInfo.getNextStackOffset(); 2299 2300 if (isTailCall) { 2301 // For tail calls, memory operands are available in our caller's stack. 2302 NumBytes = 0; 2303 } else { 2304 // Adjust the stack pointer for the new arguments... 2305 // These operations are automatically eliminated by the prolog/epilog pass 2306 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); 2307 } 2308 2309 SDValue StackPtr = 2310 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); 2311 2312 RegsToPassVector RegsToPass; 2313 SmallVector<SDValue, 8> MemOpChains; 2314 2315 // Walk the register/memloc assignments, inserting copies/loads. 
In the case 2316 // of tail call optimization, arguments are handled later. 2317 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2318 i != e; 2319 ++i, ++realArgIdx) { 2320 CCValAssign &VA = ArgLocs[i]; 2321 SDValue Arg = OutVals[realArgIdx]; 2322 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2323 bool isByVal = Flags.isByVal(); 2324 2325 // Promote the value if needed. 2326 switch (VA.getLocInfo()) { 2327 default: llvm_unreachable("Unknown loc info!"); 2328 case CCValAssign::Full: break; 2329 case CCValAssign::SExt: 2330 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); 2331 break; 2332 case CCValAssign::ZExt: 2333 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); 2334 break; 2335 case CCValAssign::AExt: 2336 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); 2337 break; 2338 case CCValAssign::BCvt: 2339 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2340 break; 2341 } 2342 2343 // f16 arguments have their size extended to 4 bytes and passed as if they 2344 // had been copied to the LSBs of a 32-bit register. 2345 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 2346 if (VA.needsCustom() && 2347 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) { 2348 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); 2349 } else { 2350 // f16 arguments could have been extended prior to argument lowering. 2351 // Mask them arguments if this is a CMSE nonsecure call. 
2352 auto ArgVT = Outs[realArgIdx].ArgVT; 2353 if (isCmseNSCall && (ArgVT == MVT::f16)) { 2354 auto LocBits = VA.getLocVT().getSizeInBits(); 2355 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits()); 2356 SDValue Mask = 2357 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); 2358 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); 2359 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); 2360 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 2361 } 2362 } 2363 2364 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces 2365 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { 2366 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2367 DAG.getConstant(0, dl, MVT::i32)); 2368 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 2369 DAG.getConstant(1, dl, MVT::i32)); 2370 2371 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], 2372 StackPtr, MemOpChains, Flags); 2373 2374 VA = ArgLocs[++i]; // skip ahead to next loc 2375 if (VA.isRegLoc()) { 2376 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], 2377 StackPtr, MemOpChains, Flags); 2378 } else { 2379 assert(VA.isMemLoc()); 2380 2381 MemOpChains.push_back( 2382 LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags)); 2383 } 2384 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { 2385 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], 2386 StackPtr, MemOpChains, Flags); 2387 } else if (VA.isRegLoc()) { 2388 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && 2389 Outs[0].VT == MVT::i32) { 2390 assert(VA.getLocVT() == MVT::i32 && 2391 "unexpected calling convention register assignment"); 2392 assert(!Ins.empty() && Ins[0].VT == MVT::i32 && 2393 "unexpected use of 'returned'"); 2394 isThisReturn = true; 2395 } 2396 const TargetOptions &Options = DAG.getTarget().Options; 2397 if (Options.EmitCallSiteInfo) 2398 
CSInfo.emplace_back(VA.getLocReg(), i); 2399 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2400 } else if (isByVal) { 2401 assert(VA.isMemLoc()); 2402 unsigned offset = 0; 2403 2404 // True if this byval aggregate will be split between registers 2405 // and memory. 2406 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); 2407 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); 2408 2409 if (CurByValIdx < ByValArgsCount) { 2410 2411 unsigned RegBegin, RegEnd; 2412 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); 2413 2414 EVT PtrVT = 2415 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 2416 unsigned int i, j; 2417 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { 2418 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); 2419 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); 2420 SDValue Load = 2421 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), 2422 DAG.InferPtrAlign(AddArg)); 2423 MemOpChains.push_back(Load.getValue(1)); 2424 RegsToPass.push_back(std::make_pair(j, Load)); 2425 } 2426 2427 // If parameter size outsides register area, "offset" value 2428 // helps us to calculate stack slot for remained part properly. 
2429 offset = RegEnd - RegBegin; 2430 2431 CCInfo.nextInRegsParam(); 2432 } 2433 2434 if (Flags.getByValSize() > 4*offset) { 2435 auto PtrVT = getPointerTy(DAG.getDataLayout()); 2436 unsigned LocMemOffset = VA.getLocMemOffset(); 2437 SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); 2438 SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); 2439 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); 2440 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); 2441 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, 2442 MVT::i32); 2443 SDValue AlignNode = 2444 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32); 2445 2446 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); 2447 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; 2448 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, 2449 Ops)); 2450 } 2451 } else if (!isTailCall) { 2452 assert(VA.isMemLoc()); 2453 2454 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2455 dl, DAG, VA, Flags)); 2456 } 2457 } 2458 2459 if (!MemOpChains.empty()) 2460 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); 2461 2462 // Build a sequence of copy-to-reg nodes chained together with token chain 2463 // and flag operands which copy the outgoing args into the appropriate regs. 2464 SDValue InFlag; 2465 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2466 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2467 RegsToPass[i].second, InFlag); 2468 InFlag = Chain.getValue(1); 2469 } 2470 2471 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every 2472 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 2473 // node so that legalize doesn't hack it. 
2474 bool isDirect = false; 2475 2476 const TargetMachine &TM = getTargetMachine(); 2477 const Module *Mod = MF.getFunction().getParent(); 2478 const GlobalValue *GV = nullptr; 2479 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 2480 GV = G->getGlobal(); 2481 bool isStub = 2482 !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); 2483 2484 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); 2485 bool isLocalARMFunc = false; 2486 auto PtrVt = getPointerTy(DAG.getDataLayout()); 2487 2488 if (Subtarget->genLongCalls()) { 2489 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && 2490 "long-calls codegen is not position independent!"); 2491 // Handle a global address or an external symbol. If it's not one of 2492 // those, the target's already in a register, so we don't need to do 2493 // anything extra. 2494 if (isa<GlobalAddressSDNode>(Callee)) { 2495 // Create a constant pool entry for the callee address 2496 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2497 ARMConstantPoolValue *CPV = 2498 ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); 2499 2500 // Get the address of the callee into a register 2501 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2502 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2503 Callee = DAG.getLoad( 2504 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2505 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2506 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { 2507 const char *Sym = S->getSymbol(); 2508 2509 // Create a constant pool entry for the callee address 2510 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2511 ARMConstantPoolValue *CPV = 2512 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2513 ARMPCLabelIndex, 0); 2514 // Get the address of the callee into a register 2515 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2516 CPAddr = 
DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2517 Callee = DAG.getLoad( 2518 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2519 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2520 } 2521 } else if (isa<GlobalAddressSDNode>(Callee)) { 2522 if (!PreferIndirect) { 2523 isDirect = true; 2524 bool isDef = GV->isStrongDefinitionForLinker(); 2525 2526 // ARM call to a local ARM function is predicable. 2527 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); 2528 // tBX takes a register source operand. 2529 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2530 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); 2531 Callee = DAG.getNode( 2532 ARMISD::WrapperPIC, dl, PtrVt, 2533 DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); 2534 Callee = DAG.getLoad( 2535 PtrVt, dl, DAG.getEntryNode(), Callee, 2536 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(), 2537 MachineMemOperand::MODereferenceable | 2538 MachineMemOperand::MOInvariant); 2539 } else if (Subtarget->isTargetCOFF()) { 2540 assert(Subtarget->isTargetWindows() && 2541 "Windows is the only supported COFF target"); 2542 unsigned TargetFlags = ARMII::MO_NO_FLAG; 2543 if (GV->hasDLLImportStorageClass()) 2544 TargetFlags = ARMII::MO_DLLIMPORT; 2545 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 2546 TargetFlags = ARMII::MO_COFFSTUB; 2547 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, 2548 TargetFlags); 2549 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 2550 Callee = 2551 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), 2552 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), 2553 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 2554 } else { 2555 Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); 2556 } 2557 } 2558 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2559 isDirect = true; 2560 // tBX takes a register source operand. 
2561 const char *Sym = S->getSymbol(); 2562 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { 2563 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 2564 ARMConstantPoolValue *CPV = 2565 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, 2566 ARMPCLabelIndex, 4); 2567 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4)); 2568 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 2569 Callee = DAG.getLoad( 2570 PtrVt, dl, DAG.getEntryNode(), CPAddr, 2571 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 2572 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 2573 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); 2574 } else { 2575 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); 2576 } 2577 } 2578 2579 if (isCmseNSCall) { 2580 assert(!isARMFunc && !isDirect && 2581 "Cannot handle call to ARM function or direct call"); 2582 if (NumBytes > 0) { 2583 DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(), 2584 "call to non-secure function would " 2585 "require passing arguments on stack", 2586 dl.getDebugLoc()); 2587 DAG.getContext()->diagnose(Diag); 2588 } 2589 if (isStructRet) { 2590 DiagnosticInfoUnsupported Diag( 2591 DAG.getMachineFunction().getFunction(), 2592 "call to non-secure function would return value through pointer", 2593 dl.getDebugLoc()); 2594 DAG.getContext()->diagnose(Diag); 2595 } 2596 } 2597 2598 // FIXME: handle tail calls differently. 
2599 unsigned CallOpc; 2600 if (Subtarget->isThumb()) { 2601 if (isCmseNSCall) 2602 CallOpc = ARMISD::tSECALL; 2603 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) 2604 CallOpc = ARMISD::CALL_NOLINK; 2605 else 2606 CallOpc = ARMISD::CALL; 2607 } else { 2608 if (!isDirect && !Subtarget->hasV5TOps()) 2609 CallOpc = ARMISD::CALL_NOLINK; 2610 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && 2611 // Emit regular call when code size is the priority 2612 !Subtarget->hasMinSize()) 2613 // "mov lr, pc; b _foo" to avoid confusing the RSP 2614 CallOpc = ARMISD::CALL_NOLINK; 2615 else 2616 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; 2617 } 2618 2619 std::vector<SDValue> Ops; 2620 Ops.push_back(Chain); 2621 Ops.push_back(Callee); 2622 2623 // Add argument registers to the end of the list so that they are known live 2624 // into the call. 2625 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2626 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2627 RegsToPass[i].second.getValueType())); 2628 2629 // Add a register mask operand representing the call-preserved registers. 
2630 if (!isTailCall) { 2631 const uint32_t *Mask; 2632 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 2633 if (isThisReturn) { 2634 // For 'this' returns, use the R0-preserving mask if applicable 2635 Mask = ARI->getThisReturnPreservedMask(MF, CallConv); 2636 if (!Mask) { 2637 // Set isThisReturn to false if the calling convention is not one that 2638 // allows 'returned' to be modeled in this way, so LowerCallResult does 2639 // not try to pass 'this' straight through 2640 isThisReturn = false; 2641 Mask = ARI->getCallPreservedMask(MF, CallConv); 2642 } 2643 } else 2644 Mask = ARI->getCallPreservedMask(MF, CallConv); 2645 2646 assert(Mask && "Missing call preserved mask for calling convention"); 2647 Ops.push_back(DAG.getRegisterMask(Mask)); 2648 } 2649 2650 if (InFlag.getNode()) 2651 Ops.push_back(InFlag); 2652 2653 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 2654 if (isTailCall) { 2655 MF.getFrameInfo().setHasTailCall(); 2656 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); 2657 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); 2658 return Ret; 2659 } 2660 2661 // Returns a chain and a flag for retval copy to use. 2662 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); 2663 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); 2664 InFlag = Chain.getValue(1); 2665 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); 2666 2667 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), 2668 DAG.getIntPtrConstant(0, dl, true), InFlag, dl); 2669 if (!Ins.empty()) 2670 InFlag = Chain.getValue(1); 2671 2672 // Handle result values, copying them out of physregs into vregs that we 2673 // return. 2674 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, 2675 InVals, isThisReturn, 2676 isThisReturn ? OutVals[0] : SDValue()); 2677 } 2678 2679 /// HandleByVal - Every parameter *after* a byval parameter is passed 2680 /// on the stack. 
Remember the next parameter register to allocate, 2681 /// and then confiscate the rest of the parameter registers to insure 2682 /// this. 2683 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, 2684 Align Alignment) const { 2685 // Byval (as with any stack) slots are always at least 4 byte aligned. 2686 Alignment = std::max(Alignment, Align(4)); 2687 2688 unsigned Reg = State->AllocateReg(GPRArgRegs); 2689 if (!Reg) 2690 return; 2691 2692 unsigned AlignInRegs = Alignment.value() / 4; 2693 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; 2694 for (unsigned i = 0; i < Waste; ++i) 2695 Reg = State->AllocateReg(GPRArgRegs); 2696 2697 if (!Reg) 2698 return; 2699 2700 unsigned Excess = 4 * (ARM::R4 - Reg); 2701 2702 // Special case when NSAA != SP and parameter size greater than size of 2703 // all remained GPR regs. In that case we can't split parameter, we must 2704 // send it to stack. We also must set NCRN to R4, so waste all 2705 // remained registers. 2706 const unsigned NSAAOffset = State->getNextStackOffset(); 2707 if (NSAAOffset != 0 && Size > Excess) { 2708 while (State->AllocateReg(GPRArgRegs)) 2709 ; 2710 return; 2711 } 2712 2713 // First register for byval parameter is the first register that wasn't 2714 // allocated before this method call, so it would be "reg". 2715 // If parameter is small enough to be saved in range [reg, r4), then 2716 // the end (first after last) register would be reg + param-size-in-regs, 2717 // else parameter would be splitted between registers and stack, 2718 // end register would be r4 in this case. 2719 unsigned ByValRegBegin = Reg; 2720 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4); 2721 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); 2722 // Note, first register is allocated in the beginning of function already, 2723 // allocate remained amount of registers we need. 
2724 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) 2725 State->AllocateReg(GPRArgRegs); 2726 // A byval parameter that is split between registers and memory needs its 2727 // size truncated here. 2728 // In the case where the entire structure fits in registers, we set the 2729 // size in memory to zero. 2730 Size = std::max<int>(Size - Excess, 0); 2731 } 2732 2733 /// MatchingStackOffset - Return true if the given stack call argument is 2734 /// already available in the same position (relatively) of the caller's 2735 /// incoming argument stack. 2736 static 2737 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2738 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, 2739 const TargetInstrInfo *TII) { 2740 unsigned Bytes = Arg.getValueSizeInBits() / 8; 2741 int FI = std::numeric_limits<int>::max(); 2742 if (Arg.getOpcode() == ISD::CopyFromReg) { 2743 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2744 if (!Register::isVirtualRegister(VR)) 2745 return false; 2746 MachineInstr *Def = MRI->getVRegDef(VR); 2747 if (!Def) 2748 return false; 2749 if (!Flags.isByVal()) { 2750 if (!TII->isLoadFromStackSlot(*Def, FI)) 2751 return false; 2752 } else { 2753 return false; 2754 } 2755 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2756 if (Flags.isByVal()) 2757 // ByVal argument is passed in as a pointer but it's now being 2758 // dereferenced. e.g. 
2759 // define @foo(%struct.X* %A) { 2760 // tail call @bar(%struct.X* byval %A) 2761 // } 2762 return false; 2763 SDValue Ptr = Ld->getBasePtr(); 2764 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2765 if (!FINode) 2766 return false; 2767 FI = FINode->getIndex(); 2768 } else 2769 return false; 2770 2771 assert(FI != std::numeric_limits<int>::max()); 2772 if (!MFI.isFixedObjectIndex(FI)) 2773 return false; 2774 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); 2775 } 2776 2777 /// IsEligibleForTailCallOptimization - Check whether the call is eligible 2778 /// for tail call optimization. Targets which want to do tail call 2779 /// optimization should implement this function. 2780 bool ARMTargetLowering::IsEligibleForTailCallOptimization( 2781 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, 2782 bool isCalleeStructRet, bool isCallerStructRet, 2783 const SmallVectorImpl<ISD::OutputArg> &Outs, 2784 const SmallVectorImpl<SDValue> &OutVals, 2785 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG, 2786 const bool isIndirect) const { 2787 MachineFunction &MF = DAG.getMachineFunction(); 2788 const Function &CallerF = MF.getFunction(); 2789 CallingConv::ID CallerCC = CallerF.getCallingConv(); 2790 2791 assert(Subtarget->supportsTailCall()); 2792 2793 // Indirect tail calls cannot be optimized for Thumb1 if the args 2794 // to the call take up r0-r3. The reason is that there are no legal registers 2795 // left to hold the pointer to the function to be called. 2796 if (Subtarget->isThumb1Only() && Outs.size() >= 4 && 2797 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) 2798 return false; 2799 2800 // Look for obvious safe cases to perform tail call optimization that do not 2801 // require ABI changes. This is what gcc calls sibcall. 2802 2803 // Exception-handling functions need a special set of instructions to indicate 2804 // a return to the hardware. 
Tail-calling another function would probably 2805 // break this. 2806 if (CallerF.hasFnAttribute("interrupt")) 2807 return false; 2808 2809 // Also avoid sibcall optimization if either caller or callee uses struct 2810 // return semantics. 2811 if (isCalleeStructRet || isCallerStructRet) 2812 return false; 2813 2814 // Externally-defined functions with weak linkage should not be 2815 // tail-called on ARM when the OS does not support dynamic 2816 // pre-emption of symbols, as the AAELF spec requires normal calls 2817 // to undefined weak functions to be replaced with a NOP or jump to the 2818 // next instruction. The behaviour of branch instructions in this 2819 // situation (as used for tail calls) is implementation-defined, so we 2820 // cannot rely on the linker replacing the tail call with a return. 2821 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2822 const GlobalValue *GV = G->getGlobal(); 2823 const Triple &TT = getTargetMachine().getTargetTriple(); 2824 if (GV->hasExternalWeakLinkage() && 2825 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) 2826 return false; 2827 } 2828 2829 // Check that the call results are passed in the same way. 2830 LLVMContext &C = *DAG.getContext(); 2831 if (!CCState::resultsCompatible( 2832 getEffectiveCallingConv(CalleeCC, isVarArg), 2833 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins, 2834 CCAssignFnForReturn(CalleeCC, isVarArg), 2835 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) 2836 return false; 2837 // The callee has to preserve all registers the caller needs to preserve. 
2838 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 2839 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); 2840 if (CalleeCC != CallerCC) { 2841 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); 2842 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) 2843 return false; 2844 } 2845 2846 // If Caller's vararg or byval argument has been split between registers and 2847 // stack, do not perform tail call, since part of the argument is in caller's 2848 // local frame. 2849 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>(); 2850 if (AFI_Caller->getArgRegsSaveSize()) 2851 return false; 2852 2853 // If the callee takes no arguments then go on to check the results of the 2854 // call. 2855 if (!Outs.empty()) { 2856 // Check if stack adjustment is needed. For now, do not do this if any 2857 // argument is passed on the stack. 2858 SmallVector<CCValAssign, 16> ArgLocs; 2859 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); 2860 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); 2861 if (CCInfo.getNextStackOffset()) { 2862 // Check if the arguments are already laid out in the right way as 2863 // the caller's fixed stack objects. 2864 MachineFrameInfo &MFI = MF.getFrameInfo(); 2865 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2866 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 2867 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); 2868 i != e; 2869 ++i, ++realArgIdx) { 2870 CCValAssign &VA = ArgLocs[i]; 2871 EVT RegVT = VA.getLocVT(); 2872 SDValue Arg = OutVals[realArgIdx]; 2873 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; 2874 if (VA.getLocInfo() == CCValAssign::Indirect) 2875 return false; 2876 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) { 2877 // f64 and vector types are split into multiple registers or 2878 // register/stack-slot combinations. 
The types will not match 2879 // the registers; give up on memory f64 refs until we figure 2880 // out what to do about this. 2881 if (!VA.isRegLoc()) 2882 return false; 2883 if (!ArgLocs[++i].isRegLoc()) 2884 return false; 2885 if (RegVT == MVT::v2f64) { 2886 if (!ArgLocs[++i].isRegLoc()) 2887 return false; 2888 if (!ArgLocs[++i].isRegLoc()) 2889 return false; 2890 } 2891 } else if (!VA.isRegLoc()) { 2892 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2893 MFI, MRI, TII)) 2894 return false; 2895 } 2896 } 2897 } 2898 2899 const MachineRegisterInfo &MRI = MF.getRegInfo(); 2900 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) 2901 return false; 2902 } 2903 2904 return true; 2905 } 2906 2907 bool 2908 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, 2909 MachineFunction &MF, bool isVarArg, 2910 const SmallVectorImpl<ISD::OutputArg> &Outs, 2911 LLVMContext &Context) const { 2912 SmallVector<CCValAssign, 16> RVLocs; 2913 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); 2914 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2915 } 2916 2917 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, 2918 const SDLoc &DL, SelectionDAG &DAG) { 2919 const MachineFunction &MF = DAG.getMachineFunction(); 2920 const Function &F = MF.getFunction(); 2921 2922 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); 2923 2924 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset 2925 // version of the "preferred return address". These offsets affect the return 2926 // instruction if this is a return from PL1 without hypervisor extensions. 2927 // IRQ/FIQ: +4 "subs pc, lr, #4" 2928 // SWI: 0 "subs pc, lr, #0" 2929 // ABORT: +4 "subs pc, lr, #4" 2930 // UNDEF: +4/+2 "subs pc, lr, #0" 2931 // UNDEF varies depending on where the exception came from ARM or Thumb 2932 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. 
2933 2934 int64_t LROffset; 2935 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || 2936 IntKind == "ABORT") 2937 LROffset = 4; 2938 else if (IntKind == "SWI" || IntKind == "UNDEF") 2939 LROffset = 0; 2940 else 2941 report_fatal_error("Unsupported interrupt attribute. If present, value " 2942 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); 2943 2944 RetOps.insert(RetOps.begin() + 1, 2945 DAG.getConstant(LROffset, DL, MVT::i32, false)); 2946 2947 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); 2948 } 2949 2950 SDValue 2951 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, 2952 bool isVarArg, 2953 const SmallVectorImpl<ISD::OutputArg> &Outs, 2954 const SmallVectorImpl<SDValue> &OutVals, 2955 const SDLoc &dl, SelectionDAG &DAG) const { 2956 // CCValAssign - represent the assignment of the return value to a location. 2957 SmallVector<CCValAssign, 16> RVLocs; 2958 2959 // CCState - Info about the registers and stack slots. 2960 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, 2961 *DAG.getContext()); 2962 2963 // Analyze outgoing return values. 2964 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); 2965 2966 SDValue Flag; 2967 SmallVector<SDValue, 4> RetOps; 2968 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 2969 bool isLittleEndian = Subtarget->isLittle(); 2970 2971 MachineFunction &MF = DAG.getMachineFunction(); 2972 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 2973 AFI->setReturnRegsCount(RVLocs.size()); 2974 2975 // Report error if cmse entry function returns structure through first ptr arg. 2976 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) { 2977 // Note: using an empty SDLoc(), as the first line of the function is a 2978 // better place to report than the last line. 
2979 DiagnosticInfoUnsupported Diag( 2980 DAG.getMachineFunction().getFunction(), 2981 "secure entry function would return value through pointer", 2982 SDLoc().getDebugLoc()); 2983 DAG.getContext()->diagnose(Diag); 2984 } 2985 2986 // Copy the result values into the output registers. 2987 for (unsigned i = 0, realRVLocIdx = 0; 2988 i != RVLocs.size(); 2989 ++i, ++realRVLocIdx) { 2990 CCValAssign &VA = RVLocs[i]; 2991 assert(VA.isRegLoc() && "Can only return in registers!"); 2992 2993 SDValue Arg = OutVals[realRVLocIdx]; 2994 bool ReturnF16 = false; 2995 2996 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { 2997 // Half-precision return values can be returned like this: 2998 // 2999 // t11 f16 = fadd ... 3000 // t12: i16 = bitcast t11 3001 // t13: i32 = zero_extend t12 3002 // t14: f32 = bitcast t13 <~~~~~~~ Arg 3003 // 3004 // to avoid code generation for bitcasts, we simply set Arg to the node 3005 // that produces the f16 value, t11 in this case. 3006 // 3007 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { 3008 SDValue ZE = Arg.getOperand(0); 3009 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { 3010 SDValue BC = ZE.getOperand(0); 3011 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { 3012 Arg = BC.getOperand(0); 3013 ReturnF16 = true; 3014 } 3015 } 3016 } 3017 } 3018 3019 switch (VA.getLocInfo()) { 3020 default: llvm_unreachable("Unknown loc info!"); 3021 case CCValAssign::Full: break; 3022 case CCValAssign::BCvt: 3023 if (!ReturnF16) 3024 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 3025 break; 3026 } 3027 3028 // Mask f16 arguments if this is a CMSE nonsecure entry. 
3029 auto RetVT = Outs[realRVLocIdx].ArgVT; 3030 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) { 3031 if (VA.needsCustom() && VA.getValVT() == MVT::f16) { 3032 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg); 3033 } else { 3034 auto LocBits = VA.getLocVT().getSizeInBits(); 3035 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits()); 3036 SDValue Mask = 3037 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits)); 3038 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg); 3039 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask); 3040 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); 3041 } 3042 } 3043 3044 if (VA.needsCustom() && 3045 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) { 3046 if (VA.getLocVT() == MVT::v2f64) { 3047 // Extract the first half and return it in two registers. 3048 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 3049 DAG.getConstant(0, dl, MVT::i32)); 3050 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, 3051 DAG.getVTList(MVT::i32, MVT::i32), Half); 3052 3053 Chain = 3054 DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3055 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag); 3056 Flag = Chain.getValue(1); 3057 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3058 VA = RVLocs[++i]; // skip ahead to next loc 3059 Chain = 3060 DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3061 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag); 3062 Flag = Chain.getValue(1); 3063 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3064 VA = RVLocs[++i]; // skip ahead to next loc 3065 3066 // Extract the 2nd half and fall through to handle it as an f64 value. 3067 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, 3068 DAG.getConstant(1, dl, MVT::i32)); 3069 } 3070 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is 3071 // available. 
3072 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, 3073 DAG.getVTList(MVT::i32, MVT::i32), Arg); 3074 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3075 fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); 3076 Flag = Chain.getValue(1); 3077 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); 3078 VA = RVLocs[++i]; // skip ahead to next loc 3079 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 3080 fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); 3081 } else 3082 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); 3083 3084 // Guarantee that all emitted copies are 3085 // stuck together, avoiding something bad. 3086 Flag = Chain.getValue(1); 3087 RetOps.push_back(DAG.getRegister( 3088 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT())); 3089 } 3090 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 3091 const MCPhysReg *I = 3092 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); 3093 if (I) { 3094 for (; *I; ++I) { 3095 if (ARM::GPRRegClass.contains(*I)) 3096 RetOps.push_back(DAG.getRegister(*I, MVT::i32)); 3097 else if (ARM::DPRRegClass.contains(*I)) 3098 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); 3099 else 3100 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 3101 } 3102 } 3103 3104 // Update chain and glue. 3105 RetOps[0] = Chain; 3106 if (Flag.getNode()) 3107 RetOps.push_back(Flag); 3108 3109 // CPUs which aren't M-class use a special sequence to return from 3110 // exceptions (roughly, any instruction setting pc and cpsr simultaneously, 3111 // though we use "subs pc, lr, #N"). 3112 // 3113 // M-class CPUs actually use a normal return sequence with a special 3114 // (hardware-provided) value in LR, so the normal code path works. 
3115 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && 3116 !Subtarget->isMClass()) { 3117 if (Subtarget->isThumb1Only()) 3118 report_fatal_error("interrupt attribute is not supported in Thumb1"); 3119 return LowerInterruptReturn(RetOps, dl, DAG); 3120 } 3121 3122 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG : 3123 ARMISD::RET_FLAG; 3124 return DAG.getNode(RetNode, dl, MVT::Other, RetOps); 3125 } 3126 3127 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { 3128 if (N->getNumValues() != 1) 3129 return false; 3130 if (!N->hasNUsesOfValue(1, 0)) 3131 return false; 3132 3133 SDValue TCChain = Chain; 3134 SDNode *Copy = *N->use_begin(); 3135 if (Copy->getOpcode() == ISD::CopyToReg) { 3136 // If the copy has a glue operand, we conservatively assume it isn't safe to 3137 // perform a tail call. 3138 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 3139 return false; 3140 TCChain = Copy->getOperand(0); 3141 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { 3142 SDNode *VMov = Copy; 3143 // f64 returned in a pair of GPRs. 3144 SmallPtrSet<SDNode*, 2> Copies; 3145 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 3146 UI != UE; ++UI) { 3147 if (UI->getOpcode() != ISD::CopyToReg) 3148 return false; 3149 Copies.insert(*UI); 3150 } 3151 if (Copies.size() > 2) 3152 return false; 3153 3154 for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); 3155 UI != UE; ++UI) { 3156 SDValue UseChain = UI->getOperand(0); 3157 if (Copies.count(UseChain.getNode())) 3158 // Second CopyToReg 3159 Copy = *UI; 3160 else { 3161 // We are at the top of this chain. 3162 // If the copy has a glue operand, we conservatively assume it 3163 // isn't safe to perform a tail call. 
3164 if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) 3165 return false; 3166 // First CopyToReg 3167 TCChain = UseChain; 3168 } 3169 } 3170 } else if (Copy->getOpcode() == ISD::BITCAST) { 3171 // f32 returned in a single GPR. 3172 if (!Copy->hasOneUse()) 3173 return false; 3174 Copy = *Copy->use_begin(); 3175 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) 3176 return false; 3177 // If the copy has a glue operand, we conservatively assume it isn't safe to 3178 // perform a tail call. 3179 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) 3180 return false; 3181 TCChain = Copy->getOperand(0); 3182 } else { 3183 return false; 3184 } 3185 3186 bool HasRet = false; 3187 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); 3188 UI != UE; ++UI) { 3189 if (UI->getOpcode() != ARMISD::RET_FLAG && 3190 UI->getOpcode() != ARMISD::INTRET_FLAG) 3191 return false; 3192 HasRet = true; 3193 } 3194 3195 if (!HasRet) 3196 return false; 3197 3198 Chain = TCChain; 3199 return true; 3200 } 3201 3202 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { 3203 if (!Subtarget->supportsTailCall()) 3204 return false; 3205 3206 if (!CI->isTailCall()) 3207 return false; 3208 3209 return true; 3210 } 3211 3212 // Trying to write a 64 bit value so need to split into two 32 bit values first, 3213 // and pass the lower and high parts through. 3214 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { 3215 SDLoc DL(Op); 3216 SDValue WriteValue = Op->getOperand(2); 3217 3218 // This function is only supposed to be called for i64 type argument. 
3219 assert(WriteValue.getValueType() == MVT::i64 3220 && "LowerWRITE_REGISTER called for non-i64 type argument."); 3221 3222 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 3223 DAG.getConstant(0, DL, MVT::i32)); 3224 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, 3225 DAG.getConstant(1, DL, MVT::i32)); 3226 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; 3227 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); 3228 } 3229 3230 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 3231 // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is 3232 // one of the above mentioned nodes. It has to be wrapped because otherwise 3233 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 3234 // be used to form addressing mode. These wrapped nodes will be selected 3235 // into MOVi. 3236 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, 3237 SelectionDAG &DAG) const { 3238 EVT PtrVT = Op.getValueType(); 3239 // FIXME there is no actual debug info here 3240 SDLoc dl(Op); 3241 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 3242 SDValue Res; 3243 3244 // When generating execute-only code Constant Pools must be promoted to the 3245 // global data section. It's a bit ugly that we can't share them across basic 3246 // blocks, but this way we guarantee that execute-only behaves correct with 3247 // position-independent addressing modes. 3248 if (Subtarget->genExecuteOnly()) { 3249 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); 3250 auto T = const_cast<Type*>(CP->getType()); 3251 auto C = const_cast<Constant*>(CP->getConstVal()); 3252 auto M = const_cast<Module*>(DAG.getMachineFunction(). 
3253 getFunction().getParent()); 3254 auto GV = new GlobalVariable( 3255 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, 3256 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + 3257 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + 3258 Twine(AFI->createPICLabelUId()) 3259 ); 3260 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV), 3261 dl, PtrVT); 3262 return LowerGlobalAddress(GA, DAG); 3263 } 3264 3265 if (CP->isMachineConstantPoolEntry()) 3266 Res = 3267 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign()); 3268 else 3269 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign()); 3270 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); 3271 } 3272 3273 unsigned ARMTargetLowering::getJumpTableEncoding() const { 3274 return MachineJumpTableInfo::EK_Inline; 3275 } 3276 3277 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, 3278 SelectionDAG &DAG) const { 3279 MachineFunction &MF = DAG.getMachineFunction(); 3280 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3281 unsigned ARMPCLabelIndex = 0; 3282 SDLoc DL(Op); 3283 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3284 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 3285 SDValue CPAddr; 3286 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); 3287 if (!IsPositionIndependent) { 3288 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4)); 3289 } else { 3290 unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; 3291 ARMPCLabelIndex = AFI->createPICLabelUId(); 3292 ARMConstantPoolValue *CPV = 3293 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, 3294 ARMCP::CPBlockAddress, PCAdj); 3295 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3296 } 3297 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); 3298 SDValue Result = DAG.getLoad( 3299 PtrVT, DL, DAG.getEntryNode(), CPAddr, 3300 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3301 if (!IsPositionIndependent) 3302 return Result; 3303 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); 3304 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); 3305 } 3306 3307 /// Convert a TLS address reference into the correct sequence of loads 3308 /// and calls to compute the variable's address for Darwin, and return an 3309 /// SDValue containing the final node. 3310 3311 /// Darwin only has one TLS scheme which must be capable of dealing with the 3312 /// fully general situation, in the worst case. This means: 3313 /// + "extern __thread" declaration. 3314 /// + Defined in a possibly unknown dynamic library. 3315 /// 3316 /// The general system is that each __thread variable has a [3 x i32] descriptor 3317 /// which contains information used by the runtime to calculate the address. The 3318 /// only part of this the compiler needs to know about is the first word, which 3319 /// contains a function pointer that must be called with the address of the 3320 /// entire descriptor in "r0". 3321 /// 3322 /// Since this descriptor may be in a different unit, in general access must 3323 /// proceed along the usual ARM rules. A common sequence to produce is: 3324 /// 3325 /// movw rT1, :lower16:_var$non_lazy_ptr 3326 /// movt rT1, :upper16:_var$non_lazy_ptr 3327 /// ldr r0, [rT1] 3328 /// ldr rT2, [r0] 3329 /// blx rT2 3330 /// [...address now in r0...] 
3331 SDValue 3332 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, 3333 SelectionDAG &DAG) const { 3334 assert(Subtarget->isTargetDarwin() && 3335 "This function expects a Darwin target"); 3336 SDLoc DL(Op); 3337 3338 // First step is to get the address of the actua global symbol. This is where 3339 // the TLS descriptor lives. 3340 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); 3341 3342 // The first entry in the descriptor is a function pointer that we must call 3343 // to obtain the address of the variable. 3344 SDValue Chain = DAG.getEntryNode(); 3345 SDValue FuncTLVGet = DAG.getLoad( 3346 MVT::i32, DL, Chain, DescAddr, 3347 MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4), 3348 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | 3349 MachineMemOperand::MOInvariant); 3350 Chain = FuncTLVGet.getValue(1); 3351 3352 MachineFunction &F = DAG.getMachineFunction(); 3353 MachineFrameInfo &MFI = F.getFrameInfo(); 3354 MFI.setAdjustsStack(true); 3355 3356 // TLS calls preserve all registers except those that absolutely must be 3357 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be 3358 // silly). 3359 auto TRI = 3360 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo(); 3361 auto ARI = static_cast<const ARMRegisterInfo *>(TRI); 3362 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); 3363 3364 // Finally, we can make the call. This is just a degenerate version of a 3365 // normal AArch64 call node: r0 takes the address of the descriptor, and 3366 // returns the address of the variable in this thread. 
3367 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); 3368 Chain = 3369 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), 3370 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), 3371 DAG.getRegisterMask(Mask), Chain.getValue(1)); 3372 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); 3373 } 3374 3375 SDValue 3376 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, 3377 SelectionDAG &DAG) const { 3378 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); 3379 3380 SDValue Chain = DAG.getEntryNode(); 3381 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3382 SDLoc DL(Op); 3383 3384 // Load the current TEB (thread environment block) 3385 SDValue Ops[] = {Chain, 3386 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 3387 DAG.getTargetConstant(15, DL, MVT::i32), 3388 DAG.getTargetConstant(0, DL, MVT::i32), 3389 DAG.getTargetConstant(13, DL, MVT::i32), 3390 DAG.getTargetConstant(0, DL, MVT::i32), 3391 DAG.getTargetConstant(2, DL, MVT::i32)}; 3392 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 3393 DAG.getVTList(MVT::i32, MVT::Other), Ops); 3394 3395 SDValue TEB = CurrentTEB.getValue(0); 3396 Chain = CurrentTEB.getValue(1); 3397 3398 // Load the ThreadLocalStoragePointer from the TEB 3399 // A pointer to the TLS array is located at offset 0x2c from the TEB. 3400 SDValue TLSArray = 3401 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); 3402 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); 3403 3404 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 3405 // offset into the TLSArray. 
3406 3407 // Load the TLS index from the C runtime 3408 SDValue TLSIndex = 3409 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); 3410 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); 3411 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); 3412 3413 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, 3414 DAG.getConstant(2, DL, MVT::i32)); 3415 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, 3416 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), 3417 MachinePointerInfo()); 3418 3419 // Get the offset of the start of the .tls section (section base) 3420 const auto *GA = cast<GlobalAddressSDNode>(Op); 3421 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); 3422 SDValue Offset = DAG.getLoad( 3423 PtrVT, DL, Chain, 3424 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, 3425 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))), 3426 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3427 3428 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); 3429 } 3430 3431 // Lower ISD::GlobalTLSAddress using the "general dynamic" model 3432 SDValue 3433 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, 3434 SelectionDAG &DAG) const { 3435 SDLoc dl(GA); 3436 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3437 unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; 3438 MachineFunction &MF = DAG.getMachineFunction(); 3439 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3440 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3441 ARMConstantPoolValue *CPV = 3442 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3443 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); 3444 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3445 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); 3446 Argument = DAG.getLoad( 3447 PtrVT, dl, DAG.getEntryNode(), Argument, 3448 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3449 SDValue Chain = Argument.getValue(1); 3450 3451 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3452 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); 3453 3454 // call __tls_get_addr. 3455 ArgListTy Args; 3456 ArgListEntry Entry; 3457 Entry.Node = Argument; 3458 Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); 3459 Args.push_back(Entry); 3460 3461 // FIXME: is there useful debug info available here? 3462 TargetLowering::CallLoweringInfo CLI(DAG); 3463 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( 3464 CallingConv::C, Type::getInt32Ty(*DAG.getContext()), 3465 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); 3466 3467 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 3468 return CallResult.first; 3469 } 3470 3471 // Lower ISD::GlobalTLSAddress using the "initial exec" or 3472 // "local exec" model. 
3473 SDValue 3474 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, 3475 SelectionDAG &DAG, 3476 TLSModel::Model model) const { 3477 const GlobalValue *GV = GA->getGlobal(); 3478 SDLoc dl(GA); 3479 SDValue Offset; 3480 SDValue Chain = DAG.getEntryNode(); 3481 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3482 // Get the Thread Pointer 3483 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3484 3485 if (model == TLSModel::InitialExec) { 3486 MachineFunction &MF = DAG.getMachineFunction(); 3487 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3488 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3489 // Initial exec model. 3490 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; 3491 ARMConstantPoolValue *CPV = 3492 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, 3493 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, 3494 true); 3495 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3496 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3497 Offset = DAG.getLoad( 3498 PtrVT, dl, Chain, Offset, 3499 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3500 Chain = Offset.getValue(1); 3501 3502 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3503 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); 3504 3505 Offset = DAG.getLoad( 3506 PtrVT, dl, Chain, Offset, 3507 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3508 } else { 3509 // local exec model 3510 assert(model == TLSModel::LocalExec); 3511 ARMConstantPoolValue *CPV = 3512 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 3513 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3514 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); 3515 Offset = DAG.getLoad( 3516 PtrVT, dl, Chain, Offset, 3517 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3518 } 3519 3520 // The address of the thread local variable is the add of the thread 3521 // pointer with 
the offset of the variable. 3522 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 3523 } 3524 3525 SDValue 3526 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 3527 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 3528 if (DAG.getTarget().useEmulatedTLS()) 3529 return LowerToTLSEmulatedModel(GA, DAG); 3530 3531 if (Subtarget->isTargetDarwin()) 3532 return LowerGlobalTLSAddressDarwin(Op, DAG); 3533 3534 if (Subtarget->isTargetWindows()) 3535 return LowerGlobalTLSAddressWindows(Op, DAG); 3536 3537 // TODO: implement the "local dynamic" model 3538 assert(Subtarget->isTargetELF() && "Only ELF implemented here"); 3539 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); 3540 3541 switch (model) { 3542 case TLSModel::GeneralDynamic: 3543 case TLSModel::LocalDynamic: 3544 return LowerToTLSGeneralDynamicModel(GA, DAG); 3545 case TLSModel::InitialExec: 3546 case TLSModel::LocalExec: 3547 return LowerToTLSExecModels(GA, DAG, model); 3548 } 3549 llvm_unreachable("bogus TLS model"); 3550 } 3551 3552 /// Return true if all users of V are within function F, looking through 3553 /// ConstantExprs. 3554 static bool allUsersAreInFunction(const Value *V, const Function *F) { 3555 SmallVector<const User*,4> Worklist(V->users()); 3556 while (!Worklist.empty()) { 3557 auto *U = Worklist.pop_back_val(); 3558 if (isa<ConstantExpr>(U)) { 3559 append_range(Worklist, U->users()); 3560 continue; 3561 } 3562 3563 auto *I = dyn_cast<Instruction>(U); 3564 if (!I || I->getParent()->getParent() != F) 3565 return false; 3566 } 3567 return true; 3568 } 3569 3570 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, 3571 const GlobalValue *GV, SelectionDAG &DAG, 3572 EVT PtrVT, const SDLoc &dl) { 3573 // If we're creating a pool entry for a constant global with unnamed address, 3574 // and the global is small enough, we can emit it inline into the constant pool 3575 // to save ourselves an indirection. 
3576 // 3577 // This is a win if the constant is only used in one function (so it doesn't 3578 // need to be duplicated) or duplicating the constant wouldn't increase code 3579 // size (implying the constant is no larger than 4 bytes). 3580 const Function &F = DAG.getMachineFunction().getFunction(); 3581 3582 // We rely on this decision to inline being idemopotent and unrelated to the 3583 // use-site. We know that if we inline a variable at one use site, we'll 3584 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel 3585 // doesn't know about this optimization, so bail out if it's enabled else 3586 // we could decide to inline here (and thus never emit the GV) but require 3587 // the GV from fast-isel generated code. 3588 if (!EnableConstpoolPromotion || 3589 DAG.getMachineFunction().getTarget().Options.EnableFastISel) 3590 return SDValue(); 3591 3592 auto *GVar = dyn_cast<GlobalVariable>(GV); 3593 if (!GVar || !GVar->hasInitializer() || 3594 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() || 3595 !GVar->hasLocalLinkage()) 3596 return SDValue(); 3597 3598 // If we inline a value that contains relocations, we move the relocations 3599 // from .data to .text. This is not allowed in position-independent code. 3600 auto *Init = GVar->getInitializer(); 3601 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) && 3602 Init->needsDynamicRelocation()) 3603 return SDValue(); 3604 3605 // The constant islands pass can only really deal with alignment requests 3606 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote 3607 // any type wanting greater alignment requirements than 4 bytes. We also 3608 // can only promote constants that are multiples of 4 bytes in size or 3609 // are paddable to a multiple of 4. Currently we only try and pad constants 3610 // that are strings for simplicity. 
3611 auto *CDAInit = dyn_cast<ConstantDataArray>(Init); 3612 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); 3613 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar); 3614 unsigned RequiredPadding = 4 - (Size % 4); 3615 bool PaddingPossible = 3616 RequiredPadding == 4 || (CDAInit && CDAInit->isString()); 3617 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize || 3618 Size == 0) 3619 return SDValue(); 3620 3621 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); 3622 MachineFunction &MF = DAG.getMachineFunction(); 3623 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3624 3625 // We can't bloat the constant pool too much, else the ConstantIslands pass 3626 // may fail to converge. If we haven't promoted this global yet (it may have 3627 // multiple uses), and promoting it would increase the constant pool size (Sz 3628 // > 4), ensure we have space to do so up to MaxTotal. 3629 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) 3630 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= 3631 ConstpoolPromotionMaxTotal) 3632 return SDValue(); 3633 3634 // This is only valid if all users are in a single function; we can't clone 3635 // the constant in general. The LLVM IR unnamed_addr allows merging 3636 // constants, but not cloning them. 3637 // 3638 // We could potentially allow cloning if we could prove all uses of the 3639 // constant in the current function don't care about the address, like 3640 // printf format strings. But that isn't implemented for now. 3641 if (!allUsersAreInFunction(GVar, &F)) 3642 return SDValue(); 3643 3644 // We're going to inline this global. Pad it out if needed. 
3645 if (RequiredPadding != 4) { 3646 StringRef S = CDAInit->getAsString(); 3647 3648 SmallVector<uint8_t,16> V(S.size()); 3649 std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); 3650 while (RequiredPadding--) 3651 V.push_back(0); 3652 Init = ConstantDataArray::get(*DAG.getContext(), V); 3653 } 3654 3655 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); 3656 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4)); 3657 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { 3658 AFI->markGlobalAsPromotedToConstantPool(GVar); 3659 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + 3660 PaddedSize - 4); 3661 } 3662 ++NumConstpoolPromoted; 3663 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3664 } 3665 3666 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { 3667 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 3668 if (!(GV = GA->getBaseObject())) 3669 return false; 3670 if (const auto *V = dyn_cast<GlobalVariable>(GV)) 3671 return V->isConstant(); 3672 return isa<Function>(GV); 3673 } 3674 3675 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, 3676 SelectionDAG &DAG) const { 3677 switch (Subtarget->getTargetTriple().getObjectFormat()) { 3678 default: llvm_unreachable("unknown object format"); 3679 case Triple::COFF: 3680 return LowerGlobalAddressWindows(Op, DAG); 3681 case Triple::ELF: 3682 return LowerGlobalAddressELF(Op, DAG); 3683 case Triple::MachO: 3684 return LowerGlobalAddressDarwin(Op, DAG); 3685 } 3686 } 3687 3688 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, 3689 SelectionDAG &DAG) const { 3690 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3691 SDLoc dl(Op); 3692 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3693 const TargetMachine &TM = getTargetMachine(); 3694 bool IsRO = isReadOnly(GV); 3695 3696 // promoteToConstantPool only if not generating XO text section 3697 if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && 
!Subtarget->genExecuteOnly()) 3698 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl)) 3699 return V; 3700 3701 if (isPositionIndependent()) { 3702 bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); 3703 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 3704 UseGOT_PREL ? ARMII::MO_GOT : 0); 3705 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3706 if (UseGOT_PREL) 3707 Result = 3708 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3709 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3710 return Result; 3711 } else if (Subtarget->isROPI() && IsRO) { 3712 // PC-relative. 3713 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); 3714 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); 3715 return Result; 3716 } else if (Subtarget->isRWPI() && !IsRO) { 3717 // SB-relative. 3718 SDValue RelAddr; 3719 if (Subtarget->useMovt()) { 3720 ++NumMovwMovt; 3721 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); 3722 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); 3723 } else { // use literal pool for address constant 3724 ARMConstantPoolValue *CPV = 3725 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); 3726 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3727 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3728 RelAddr = DAG.getLoad( 3729 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3730 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3731 } 3732 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); 3733 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); 3734 return Result; 3735 } 3736 3737 // If we have T2 ops, we can materialize the address directly via movt/movw 3738 // pair. This is always cheaper. 3739 if (Subtarget->useMovt()) { 3740 ++NumMovwMovt; 3741 // FIXME: Once remat is capable of dealing with instructions with register 3742 // operands, expand this into two nodes. 
3743 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, 3744 DAG.getTargetGlobalAddress(GV, dl, PtrVT)); 3745 } else { 3746 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4)); 3747 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3748 return DAG.getLoad( 3749 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3750 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3751 } 3752 } 3753 3754 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, 3755 SelectionDAG &DAG) const { 3756 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3757 "ROPI/RWPI not currently supported for Darwin"); 3758 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3759 SDLoc dl(Op); 3760 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3761 3762 if (Subtarget->useMovt()) 3763 ++NumMovwMovt; 3764 3765 // FIXME: Once remat is capable of dealing with instructions with register 3766 // operands, expand this into multiple nodes 3767 unsigned Wrapper = 3768 isPositionIndependent() ? 
ARMISD::WrapperPIC : ARMISD::Wrapper; 3769 3770 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); 3771 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); 3772 3773 if (Subtarget->isGVIndirectSymbol(GV)) 3774 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, 3775 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3776 return Result; 3777 } 3778 3779 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, 3780 SelectionDAG &DAG) const { 3781 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); 3782 assert(Subtarget->useMovt() && 3783 "Windows on ARM expects to use movw/movt"); 3784 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 3785 "ROPI/RWPI not currently supported for Windows"); 3786 3787 const TargetMachine &TM = getTargetMachine(); 3788 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 3789 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG; 3790 if (GV->hasDLLImportStorageClass()) 3791 TargetFlags = ARMII::MO_DLLIMPORT; 3792 else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) 3793 TargetFlags = ARMII::MO_COFFSTUB; 3794 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3795 SDValue Result; 3796 SDLoc DL(Op); 3797 3798 ++NumMovwMovt; 3799 3800 // FIXME: Once remat is capable of dealing with instructions with register 3801 // operands, expand this into two nodes. 
3802 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, 3803 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, 3804 TargetFlags)); 3805 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) 3806 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, 3807 MachinePointerInfo::getGOT(DAG.getMachineFunction())); 3808 return Result; 3809 } 3810 3811 SDValue 3812 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { 3813 SDLoc dl(Op); 3814 SDValue Val = DAG.getConstant(0, dl, MVT::i32); 3815 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, 3816 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), 3817 Op.getOperand(1), Val); 3818 } 3819 3820 SDValue 3821 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { 3822 SDLoc dl(Op); 3823 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), 3824 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); 3825 } 3826 3827 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, 3828 SelectionDAG &DAG) const { 3829 SDLoc dl(Op); 3830 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, 3831 Op.getOperand(0)); 3832 } 3833 3834 SDValue ARMTargetLowering::LowerINTRINSIC_VOID( 3835 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { 3836 unsigned IntNo = 3837 cast<ConstantSDNode>( 3838 Op.getOperand(Op.getOperand(0).getValueType() == MVT::Other)) 3839 ->getZExtValue(); 3840 switch (IntNo) { 3841 default: 3842 return SDValue(); // Don't custom lower most intrinsics. 
3843 case Intrinsic::arm_gnu_eabi_mcount: { 3844 MachineFunction &MF = DAG.getMachineFunction(); 3845 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3846 SDLoc dl(Op); 3847 SDValue Chain = Op.getOperand(0); 3848 // call "\01__gnu_mcount_nc" 3849 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); 3850 const uint32_t *Mask = 3851 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C); 3852 assert(Mask && "Missing call preserved mask for calling convention"); 3853 // Mark LR an implicit live-in. 3854 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 3855 SDValue ReturnAddress = 3856 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT); 3857 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue}; 3858 SDValue Callee = 3859 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0); 3860 SDValue RegisterMask = DAG.getRegisterMask(Mask); 3861 if (Subtarget->isThumb()) 3862 return SDValue( 3863 DAG.getMachineNode( 3864 ARM::tBL_PUSHLR, dl, ResultTys, 3865 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT), 3866 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}), 3867 0); 3868 return SDValue( 3869 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys, 3870 {ReturnAddress, Callee, RegisterMask, Chain}), 3871 0); 3872 } 3873 } 3874 } 3875 3876 SDValue 3877 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, 3878 const ARMSubtarget *Subtarget) const { 3879 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 3880 SDLoc dl(Op); 3881 switch (IntNo) { 3882 default: return SDValue(); // Don't custom lower most intrinsics. 
3883 case Intrinsic::thread_pointer: { 3884 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3885 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); 3886 } 3887 case Intrinsic::arm_cls: { 3888 const SDValue &Operand = Op.getOperand(1); 3889 const EVT VTy = Op.getValueType(); 3890 SDValue SRA = 3891 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy)); 3892 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand); 3893 SDValue SHL = 3894 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy)); 3895 SDValue OR = 3896 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy)); 3897 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR); 3898 return Result; 3899 } 3900 case Intrinsic::arm_cls64: { 3901 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x)) 3902 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x))) 3903 const SDValue &Operand = Op.getOperand(1); 3904 const EVT VTy = Op.getValueType(); 3905 3906 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3907 DAG.getConstant(1, dl, VTy)); 3908 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VTy, Operand, 3909 DAG.getConstant(0, dl, VTy)); 3910 SDValue Constant0 = DAG.getConstant(0, dl, VTy); 3911 SDValue Constant1 = DAG.getConstant(1, dl, VTy); 3912 SDValue Constant31 = DAG.getConstant(31, dl, VTy); 3913 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31); 3914 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi); 3915 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1); 3916 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1); 3917 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi); 3918 SDValue CheckLo = 3919 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ); 3920 SDValue HiIsZero = 3921 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ); 3922 SDValue AdjustedLo = 3923 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy)); 3924 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, 
AdjustedLo); 3925 SDValue Result = 3926 DAG.getSelect(dl, VTy, CheckLo, 3927 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi); 3928 return Result; 3929 } 3930 case Intrinsic::eh_sjlj_lsda: { 3931 MachineFunction &MF = DAG.getMachineFunction(); 3932 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 3933 unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); 3934 EVT PtrVT = getPointerTy(DAG.getDataLayout()); 3935 SDValue CPAddr; 3936 bool IsPositionIndependent = isPositionIndependent(); 3937 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; 3938 ARMConstantPoolValue *CPV = 3939 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, 3940 ARMCP::CPLSDA, PCAdj); 3941 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4)); 3942 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); 3943 SDValue Result = DAG.getLoad( 3944 PtrVT, dl, DAG.getEntryNode(), CPAddr, 3945 MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); 3946 3947 if (IsPositionIndependent) { 3948 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); 3949 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); 3950 } 3951 return Result; 3952 } 3953 case Intrinsic::arm_neon_vabs: 3954 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), 3955 Op.getOperand(1)); 3956 case Intrinsic::arm_neon_vmulls: 3957 case Intrinsic::arm_neon_vmullu: { 3958 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) 3959 ? ARMISD::VMULLs : ARMISD::VMULLu; 3960 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3961 Op.getOperand(1), Op.getOperand(2)); 3962 } 3963 case Intrinsic::arm_neon_vminnm: 3964 case Intrinsic::arm_neon_vmaxnm: { 3965 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) 3966 ? 
ISD::FMINNUM : ISD::FMAXNUM; 3967 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3968 Op.getOperand(1), Op.getOperand(2)); 3969 } 3970 case Intrinsic::arm_neon_vminu: 3971 case Intrinsic::arm_neon_vmaxu: { 3972 if (Op.getValueType().isFloatingPoint()) 3973 return SDValue(); 3974 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) 3975 ? ISD::UMIN : ISD::UMAX; 3976 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3977 Op.getOperand(1), Op.getOperand(2)); 3978 } 3979 case Intrinsic::arm_neon_vmins: 3980 case Intrinsic::arm_neon_vmaxs: { 3981 // v{min,max}s is overloaded between signed integers and floats. 3982 if (!Op.getValueType().isFloatingPoint()) { 3983 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3984 ? ISD::SMIN : ISD::SMAX; 3985 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3986 Op.getOperand(1), Op.getOperand(2)); 3987 } 3988 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) 3989 ? ISD::FMINIMUM : ISD::FMAXIMUM; 3990 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), 3991 Op.getOperand(1), Op.getOperand(2)); 3992 } 3993 case Intrinsic::arm_neon_vtbl1: 3994 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), 3995 Op.getOperand(1), Op.getOperand(2)); 3996 case Intrinsic::arm_neon_vtbl2: 3997 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), 3998 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 3999 case Intrinsic::arm_mve_pred_i2v: 4000 case Intrinsic::arm_mve_pred_v2i: 4001 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(), 4002 Op.getOperand(1)); 4003 case Intrinsic::arm_mve_vreinterpretq: 4004 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(), 4005 Op.getOperand(1)); 4006 case Intrinsic::arm_mve_lsll: 4007 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(), 4008 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4009 case Intrinsic::arm_mve_asrl: 4010 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(), 4011 
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); 4012 } 4013 } 4014 4015 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, 4016 const ARMSubtarget *Subtarget) { 4017 SDLoc dl(Op); 4018 ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); 4019 auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); 4020 if (SSID == SyncScope::SingleThread) 4021 return Op; 4022 4023 if (!Subtarget->hasDataBarrier()) { 4024 // Some ARMv6 cpus can support data barriers with an mcr instruction. 4025 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 4026 // here. 4027 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && 4028 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); 4029 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), 4030 DAG.getConstant(0, dl, MVT::i32)); 4031 } 4032 4033 ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1)); 4034 AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue()); 4035 ARM_MB::MemBOpt Domain = ARM_MB::ISH; 4036 if (Subtarget->isMClass()) { 4037 // Only a full system barrier exists in the M-class architectures. 4038 Domain = ARM_MB::SY; 4039 } else if (Subtarget->preferISHSTBarriers() && 4040 Ord == AtomicOrdering::Release) { 4041 // Swift happens to implement ISHST barriers in a way that's compatible with 4042 // Release semantics but weaker than ISH so we'd be fools not to use 4043 // it. Beware: other processors probably don't! 4044 Domain = ARM_MB::ISHST; 4045 } 4046 4047 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), 4048 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), 4049 DAG.getConstant(Domain, dl, MVT::i32)); 4050 } 4051 4052 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, 4053 const ARMSubtarget *Subtarget) { 4054 // ARM pre v5TE and Thumb1 does not have preload instructions. 
4055 if (!(Subtarget->isThumb2() || 4056 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) 4057 // Just preserve the chain. 4058 return Op.getOperand(0); 4059 4060 SDLoc dl(Op); 4061 unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1; 4062 if (!isRead && 4063 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) 4064 // ARMv7 with MP extension has PLDW. 4065 return Op.getOperand(0); 4066 4067 unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 4068 if (Subtarget->isThumb()) { 4069 // Invert the bits. 4070 isRead = ~isRead & 1; 4071 isData = ~isData & 1; 4072 } 4073 4074 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), 4075 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), 4076 DAG.getConstant(isData, dl, MVT::i32)); 4077 } 4078 4079 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { 4080 MachineFunction &MF = DAG.getMachineFunction(); 4081 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>(); 4082 4083 // vastart just stores the address of the VarArgsFrameIndex slot into the 4084 // memory location argument. 
4085 SDLoc dl(Op); 4086 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); 4087 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); 4088 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 4089 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), 4090 MachinePointerInfo(SV)); 4091 } 4092 4093 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, 4094 CCValAssign &NextVA, 4095 SDValue &Root, 4096 SelectionDAG &DAG, 4097 const SDLoc &dl) const { 4098 MachineFunction &MF = DAG.getMachineFunction(); 4099 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4100 4101 const TargetRegisterClass *RC; 4102 if (AFI->isThumb1OnlyFunction()) 4103 RC = &ARM::tGPRRegClass; 4104 else 4105 RC = &ARM::GPRRegClass; 4106 4107 // Transform the arguments stored in physical registers into virtual ones. 4108 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 4109 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 4110 4111 SDValue ArgValue2; 4112 if (NextVA.isMemLoc()) { 4113 MachineFrameInfo &MFI = MF.getFrameInfo(); 4114 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); 4115 4116 // Create load node to retrieve arguments from the stack. 4117 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); 4118 ArgValue2 = DAG.getLoad( 4119 MVT::i32, dl, Root, FIN, 4120 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 4121 } else { 4122 Reg = MF.addLiveIn(NextVA.getLocReg(), RC); 4123 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); 4124 } 4125 if (!Subtarget->isLittle()) 4126 std::swap (ArgValue, ArgValue2); 4127 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); 4128 } 4129 4130 // The remaining GPRs hold either the beginning of variable-argument 4131 // data, or the beginning of an aggregate passed by value (usually 4132 // byval). 
Either way, we allocate stack slots adjacent to the data 4133 // provided by our caller, and store the unallocated registers there. 4134 // If this is a variadic function, the va_list pointer will begin with 4135 // these values; otherwise, this reassembles a (byval) structure that 4136 // was split between registers and memory. 4137 // Return: The frame index registers were stored into. 4138 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, 4139 const SDLoc &dl, SDValue &Chain, 4140 const Value *OrigArg, 4141 unsigned InRegsParamRecordIdx, 4142 int ArgOffset, unsigned ArgSize) const { 4143 // Currently, two use-cases possible: 4144 // Case #1. Non-var-args function, and we meet first byval parameter. 4145 // Setup first unallocated register as first byval register; 4146 // eat all remained registers 4147 // (these two actions are performed by HandleByVal method). 4148 // Then, here, we initialize stack frame with 4149 // "store-reg" instructions. 4150 // Case #2. Var-args function, that doesn't contain byval parameters. 4151 // The same: eat all remained unallocated registers, 4152 // initialize stack frame. 4153 4154 MachineFunction &MF = DAG.getMachineFunction(); 4155 MachineFrameInfo &MFI = MF.getFrameInfo(); 4156 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4157 unsigned RBegin, REnd; 4158 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { 4159 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); 4160 } else { 4161 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4162 RBegin = RBeginIdx == 4 ? 
(unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; 4163 REnd = ARM::R4; 4164 } 4165 4166 if (REnd != RBegin) 4167 ArgOffset = -4 * (ARM::R4 - RBegin); 4168 4169 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4170 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); 4171 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); 4172 4173 SmallVector<SDValue, 4> MemOps; 4174 const TargetRegisterClass *RC = 4175 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 4176 4177 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { 4178 unsigned VReg = MF.addLiveIn(Reg, RC); 4179 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); 4180 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, 4181 MachinePointerInfo(OrigArg, 4 * i)); 4182 MemOps.push_back(Store); 4183 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); 4184 } 4185 4186 if (!MemOps.empty()) 4187 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); 4188 return FrameIndex; 4189 } 4190 4191 // Setup stack frame, the va_list pointer will start from. 4192 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, 4193 const SDLoc &dl, SDValue &Chain, 4194 unsigned ArgOffset, 4195 unsigned TotalArgRegsSaveSize, 4196 bool ForceMutable) const { 4197 MachineFunction &MF = DAG.getMachineFunction(); 4198 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4199 4200 // Try to store any remaining integer argument regs 4201 // to their spots on the stack so that they may be loaded by dereferencing 4202 // the result of va_next. 4203 // If there is no regs to be stored, just point address after last 4204 // argument passed via stack. 
4205 int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, 4206 CCInfo.getInRegsParamsCount(), 4207 CCInfo.getNextStackOffset(), 4208 std::max(4U, TotalArgRegsSaveSize)); 4209 AFI->setVarArgsFrameIndex(FrameIndex); 4210 } 4211 4212 bool ARMTargetLowering::splitValueIntoRegisterParts( 4213 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, 4214 unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const { 4215 bool IsABIRegCopy = CC.hasValue(); 4216 EVT ValueVT = Val.getValueType(); 4217 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && 4218 PartVT == MVT::f32) { 4219 unsigned ValueBits = ValueVT.getSizeInBits(); 4220 unsigned PartBits = PartVT.getSizeInBits(); 4221 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); 4222 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); 4223 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); 4224 Parts[0] = Val; 4225 return true; 4226 } 4227 return false; 4228 } 4229 4230 SDValue ARMTargetLowering::joinRegisterPartsIntoValue( 4231 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, 4232 MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const { 4233 bool IsABIRegCopy = CC.hasValue(); 4234 if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && 4235 PartVT == MVT::f32) { 4236 unsigned ValueBits = ValueVT.getSizeInBits(); 4237 unsigned PartBits = PartVT.getSizeInBits(); 4238 SDValue Val = Parts[0]; 4239 4240 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); 4241 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); 4242 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); 4243 return Val; 4244 } 4245 return SDValue(); 4246 } 4247 4248 SDValue ARMTargetLowering::LowerFormalArguments( 4249 SDValue Chain, CallingConv::ID CallConv, bool isVarArg, 4250 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, 4251 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) 
const { 4252 MachineFunction &MF = DAG.getMachineFunction(); 4253 MachineFrameInfo &MFI = MF.getFrameInfo(); 4254 4255 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); 4256 4257 // Assign locations to all of the incoming arguments. 4258 SmallVector<CCValAssign, 16> ArgLocs; 4259 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, 4260 *DAG.getContext()); 4261 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); 4262 4263 SmallVector<SDValue, 16> ArgValues; 4264 SDValue ArgValue; 4265 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); 4266 unsigned CurArgIdx = 0; 4267 4268 // Initially ArgRegsSaveSize is zero. 4269 // Then we increase this value each time we meet byval parameter. 4270 // We also increase this value in case of varargs function. 4271 AFI->setArgRegsSaveSize(0); 4272 4273 // Calculate the amount of stack space that we need to allocate to store 4274 // byval and variadic arguments that are passed in registers. 4275 // We need to know this before we allocate the first byval or variadic 4276 // argument, as they will be allocated a stack slot below the CFA (Canonical 4277 // Frame Address, the stack pointer at entry to the function). 
4278 unsigned ArgRegBegin = ARM::R4; 4279 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4280 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) 4281 break; 4282 4283 CCValAssign &VA = ArgLocs[i]; 4284 unsigned Index = VA.getValNo(); 4285 ISD::ArgFlagsTy Flags = Ins[Index].Flags; 4286 if (!Flags.isByVal()) 4287 continue; 4288 4289 assert(VA.isMemLoc() && "unexpected byval pointer in reg"); 4290 unsigned RBegin, REnd; 4291 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); 4292 ArgRegBegin = std::min(ArgRegBegin, RBegin); 4293 4294 CCInfo.nextInRegsParam(); 4295 } 4296 CCInfo.rewindByValRegsInfo(); 4297 4298 int lastInsIndex = -1; 4299 if (isVarArg && MFI.hasVAStart()) { 4300 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); 4301 if (RegIdx != array_lengthof(GPRArgRegs)) 4302 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); 4303 } 4304 4305 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); 4306 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); 4307 auto PtrVT = getPointerTy(DAG.getDataLayout()); 4308 4309 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 4310 CCValAssign &VA = ArgLocs[i]; 4311 if (Ins[VA.getValNo()].isOrigArg()) { 4312 std::advance(CurOrigArg, 4313 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); 4314 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); 4315 } 4316 // Arguments stored in registers. 4317 if (VA.isRegLoc()) { 4318 EVT RegVT = VA.getLocVT(); 4319 4320 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) { 4321 // f64 and vector types are split up into multiple registers or 4322 // combinations of registers and stack slots. 
4323 SDValue ArgValue1 = 4324 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4325 VA = ArgLocs[++i]; // skip ahead to next loc 4326 SDValue ArgValue2; 4327 if (VA.isMemLoc()) { 4328 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); 4329 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4330 ArgValue2 = DAG.getLoad( 4331 MVT::f64, dl, Chain, FIN, 4332 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); 4333 } else { 4334 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4335 } 4336 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); 4337 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, 4338 ArgValue1, DAG.getIntPtrConstant(0, dl)); 4339 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, 4340 ArgValue2, DAG.getIntPtrConstant(1, dl)); 4341 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) { 4342 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); 4343 } else { 4344 const TargetRegisterClass *RC; 4345 4346 if (RegVT == MVT::f16 || RegVT == MVT::bf16) 4347 RC = &ARM::HPRRegClass; 4348 else if (RegVT == MVT::f32) 4349 RC = &ARM::SPRRegClass; 4350 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 || 4351 RegVT == MVT::v4bf16) 4352 RC = &ARM::DPRRegClass; 4353 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 || 4354 RegVT == MVT::v8bf16) 4355 RC = &ARM::QPRRegClass; 4356 else if (RegVT == MVT::i32) 4357 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass 4358 : &ARM::GPRRegClass; 4359 else 4360 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); 4361 4362 // Transform the arguments in physical registers into virtual ones. 4363 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 4364 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 4365 4366 // If this value is passed in r0 and has the returned attribute (e.g. 4367 // C++ 'structors), record this fact for later use. 
4368 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) { 4369 AFI->setPreservesR0(); 4370 } 4371 } 4372 4373 // If this is an 8 or 16-bit value, it is really passed promoted 4374 // to 32 bits. Insert an assert[sz]ext to capture this, then 4375 // truncate to the right size. 4376 switch (VA.getLocInfo()) { 4377 default: llvm_unreachable("Unknown loc info!"); 4378 case CCValAssign::Full: break; 4379 case CCValAssign::BCvt: 4380 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); 4381 break; 4382 case CCValAssign::SExt: 4383 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 4384 DAG.getValueType(VA.getValVT())); 4385 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4386 break; 4387 case CCValAssign::ZExt: 4388 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 4389 DAG.getValueType(VA.getValVT())); 4390 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 4391 break; 4392 } 4393 4394 // f16 arguments have their size extended to 4 bytes and passed as if they 4395 // had been copied to the LSBs of a 32-bit register. 4396 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI) 4397 if (VA.needsCustom() && 4398 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) 4399 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue); 4400 4401 InVals.push_back(ArgValue); 4402 } else { // VA.isRegLoc() 4403 // sanity check 4404 assert(VA.isMemLoc()); 4405 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); 4406 4407 int index = VA.getValNo(); 4408 4409 // Some Ins[] entries become multiple ArgLoc[] entries. 4410 // Process them only once. 4411 if (index != lastInsIndex) 4412 { 4413 ISD::ArgFlagsTy Flags = Ins[index].Flags; 4414 // FIXME: For now, all byval parameter objects are marked mutable. 4415 // This can be changed with more analysis. 4416 // In case of tail call optimization mark all arguments mutable. 
4417 // Since they could be overwritten by lowering of arguments in case of 4418 // a tail call. 4419 if (Flags.isByVal()) { 4420 assert(Ins[index].isOrigArg() && 4421 "Byval arguments cannot be implicit"); 4422 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); 4423 4424 int FrameIndex = StoreByValRegs( 4425 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, 4426 VA.getLocMemOffset(), Flags.getByValSize()); 4427 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); 4428 CCInfo.nextInRegsParam(); 4429 } else { 4430 unsigned FIOffset = VA.getLocMemOffset(); 4431 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, 4432 FIOffset, true); 4433 4434 // Create load nodes to retrieve arguments from the stack. 4435 SDValue FIN = DAG.getFrameIndex(FI, PtrVT); 4436 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, 4437 MachinePointerInfo::getFixedStack( 4438 DAG.getMachineFunction(), FI))); 4439 } 4440 lastInsIndex = index; 4441 } 4442 } 4443 } 4444 4445 // varargs 4446 if (isVarArg && MFI.hasVAStart()) { 4447 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(), 4448 TotalArgRegsSaveSize); 4449 if (AFI->isCmseNSEntryFunction()) { 4450 DiagnosticInfoUnsupported Diag( 4451 DAG.getMachineFunction().getFunction(), 4452 "secure entry function must not be variadic", dl.getDebugLoc()); 4453 DAG.getContext()->diagnose(Diag); 4454 } 4455 } 4456 4457 AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); 4458 4459 if (CCInfo.getNextStackOffset() > 0 && AFI->isCmseNSEntryFunction()) { 4460 DiagnosticInfoUnsupported Diag( 4461 DAG.getMachineFunction().getFunction(), 4462 "secure entry function requires arguments on stack", dl.getDebugLoc()); 4463 DAG.getContext()->diagnose(Diag); 4464 } 4465 4466 return Chain; 4467 } 4468 4469 /// isFloatingPointZero - Return true if this is +0.0. 
4470 static bool isFloatingPointZero(SDValue Op) { 4471 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) 4472 return CFP->getValueAPF().isPosZero(); 4473 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { 4474 // Maybe this has already been legalized into the constant pool? 4475 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { 4476 SDValue WrapperOp = Op.getOperand(1).getOperand(0); 4477 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp)) 4478 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal())) 4479 return CFP->getValueAPF().isPosZero(); 4480 } 4481 } else if (Op->getOpcode() == ISD::BITCAST && 4482 Op->getValueType(0) == MVT::f64) { 4483 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) 4484 // created by LowerConstantFP(). 4485 SDValue BitcastOp = Op->getOperand(0); 4486 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && 4487 isNullConstant(BitcastOp->getOperand(0))) 4488 return true; 4489 } 4490 return false; 4491 } 4492 4493 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for 4494 /// the given operands. 4495 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, 4496 SDValue &ARMcc, SelectionDAG &DAG, 4497 const SDLoc &dl) const { 4498 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { 4499 unsigned C = RHSC->getZExtValue(); 4500 if (!isLegalICmpImmediate((int32_t)C)) { 4501 // Constant does not fit, try adjusting it by one. 4502 switch (CC) { 4503 default: break; 4504 case ISD::SETLT: 4505 case ISD::SETGE: 4506 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { 4507 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; 4508 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4509 } 4510 break; 4511 case ISD::SETULT: 4512 case ISD::SETUGE: 4513 if (C != 0 && isLegalICmpImmediate(C-1)) { 4514 CC = (CC == ISD::SETULT) ? 
ISD::SETULE : ISD::SETUGT; 4515 RHS = DAG.getConstant(C - 1, dl, MVT::i32); 4516 } 4517 break; 4518 case ISD::SETLE: 4519 case ISD::SETGT: 4520 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { 4521 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; 4522 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4523 } 4524 break; 4525 case ISD::SETULE: 4526 case ISD::SETUGT: 4527 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { 4528 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; 4529 RHS = DAG.getConstant(C + 1, dl, MVT::i32); 4530 } 4531 break; 4532 } 4533 } 4534 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && 4535 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { 4536 // In ARM and Thumb-2, the compare instructions can shift their second 4537 // operand. 4538 CC = ISD::getSetCCSwappedOperands(CC); 4539 std::swap(LHS, RHS); 4540 } 4541 4542 // Thumb1 has very limited immediate modes, so turning an "and" into a 4543 // shift can save multiple instructions. 4544 // 4545 // If we have (x & C1), and C1 is an appropriate mask, we can transform it 4546 // into "((x << n) >> n)". But that isn't necessarily profitable on its 4547 // own. If it's the operand to an unsigned comparison with an immediate, 4548 // we can eliminate one of the shifts: we transform 4549 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)". 4550 // 4551 // We avoid transforming cases which aren't profitable due to encoding 4552 // details: 4553 // 4554 // 1. C2 fits into the immediate field of a cmp, and the transformed version 4555 // would not; in that case, we're essentially trading one immediate load for 4556 // another. 4557 // 2. C1 is 255 or 65535, so we can use uxtb or uxth. 4558 // 3. C2 is zero; we have other code for this special case. 
4559 // 4560 // FIXME: Figure out profitability for Thumb2; we usually can't save an 4561 // instruction, since the AND is always one instruction anyway, but we could 4562 // use narrow instructions in some cases. 4563 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND && 4564 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) && 4565 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) && 4566 !isSignedIntSetCC(CC)) { 4567 unsigned Mask = cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue(); 4568 auto *RHSC = cast<ConstantSDNode>(RHS.getNode()); 4569 uint64_t RHSV = RHSC->getZExtValue(); 4570 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) { 4571 unsigned ShiftBits = countLeadingZeros(Mask); 4572 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) { 4573 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32); 4574 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt); 4575 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32); 4576 } 4577 } 4578 } 4579 4580 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a 4581 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same 4582 // way a cmp would. 4583 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and 4584 // some tweaks to the heuristics for the previous and->shift transform. 4585 // FIXME: Optimize cases where the LHS isn't a shift. 
4586 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL && 4587 isa<ConstantSDNode>(RHS) && 4588 cast<ConstantSDNode>(RHS)->getZExtValue() == 0x80000000U && 4589 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) && 4590 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() < 31) { 4591 unsigned ShiftAmt = 4592 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() + 1; 4593 SDValue Shift = DAG.getNode(ARMISD::LSLS, dl, 4594 DAG.getVTList(MVT::i32, MVT::i32), 4595 LHS.getOperand(0), 4596 DAG.getConstant(ShiftAmt, dl, MVT::i32)); 4597 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR, 4598 Shift.getValue(1), SDValue()); 4599 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32); 4600 return Chain.getValue(1); 4601 } 4602 4603 ARMCC::CondCodes CondCode = IntCCToARMCC(CC); 4604 4605 // If the RHS is a constant zero then the V (overflow) flag will never be 4606 // set. This can allow us to simplify GE to PL or LT to MI, which can be 4607 // simpler for other passes (like the peephole optimiser) to deal with. 4608 if (isNullConstant(RHS)) { 4609 switch (CondCode) { 4610 default: break; 4611 case ARMCC::GE: 4612 CondCode = ARMCC::PL; 4613 break; 4614 case ARMCC::LT: 4615 CondCode = ARMCC::MI; 4616 break; 4617 } 4618 } 4619 4620 ARMISD::NodeType CompareType; 4621 switch (CondCode) { 4622 default: 4623 CompareType = ARMISD::CMP; 4624 break; 4625 case ARMCC::EQ: 4626 case ARMCC::NE: 4627 // Uses only Z Flag 4628 CompareType = ARMISD::CMPZ; 4629 break; 4630 } 4631 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 4632 return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); 4633 } 4634 4635 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. 
4636 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, 4637 SelectionDAG &DAG, const SDLoc &dl, 4638 bool Signaling) const { 4639 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); 4640 SDValue Cmp; 4641 if (!isFloatingPointZero(RHS)) 4642 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, 4643 dl, MVT::Glue, LHS, RHS); 4644 else 4645 Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, 4646 dl, MVT::Glue, LHS); 4647 return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); 4648 } 4649 4650 /// duplicateCmp - Glue values can have only one use, so this function 4651 /// duplicates a comparison node. 4652 SDValue 4653 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { 4654 unsigned Opc = Cmp.getOpcode(); 4655 SDLoc DL(Cmp); 4656 if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) 4657 return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4658 4659 assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); 4660 Cmp = Cmp.getOperand(0); 4661 Opc = Cmp.getOpcode(); 4662 if (Opc == ARMISD::CMPFP) 4663 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); 4664 else { 4665 assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); 4666 Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0)); 4667 } 4668 return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); 4669 } 4670 4671 // This function returns three things: the arithmetic computation itself 4672 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The 4673 // comparison and the condition code define the case in which the arithmetic 4674 // computation *does not* overflow. 
/// Build the arithmetic node and the flag-setting compare for an i32
/// overflow-checking node (SADDO, UADDO, SSUBO, USUBO, UMULO or SMULO).
///
/// \param Op     The overflow node; operands 0 and 1 are the arithmetic inputs.
/// \param DAG    The DAG in which the new nodes are created.
/// \param ARMcc  [out] i32 constant holding the ARMCC condition code to be
///               tested against the returned compare. The users in this file
///               treat it as the "did not overflow" condition (e.g. EQ for
///               UMULO, where the high word is compared against zero, and
///               LowerBRCOND inverts it to branch on overflow).
/// \returns A pair {Value, OverflowCmp}: the arithmetic result and an
///          ARMISD::CMP node of type MVT::Glue.
std::pair<SDValue, SDValue>
ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                 SDValue &ARMcc) const {
  assert(Op.getValueType() == MVT::i32 && "Unsupported value type");

  SDValue Value, OverflowCmp;
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  // FIXME: We are currently always generating CMPs because we don't support
  // generating CMN through the backend. This is not as good as the natural
  // CMP case because it causes a register dependency and cannot be folded
  // later.

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::SADDO:
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
    // The compare is done on the sum and LHS, not on the original operands.
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::UADDO:
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    // We use ADDC here to correspond to its use in LowerUnsignedALUO.
    // We do not use it in the USUBO case as Value may not be used.
    Value = DAG.getNode(ARMISD::ADDC, dl,
                        DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
                .getValue(0);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS);
    break;
  case ISD::SSUBO:
    ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    // For subtraction the compare is done directly on the original operands.
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::USUBO:
    ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
    Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS);
    break;
  case ISD::UMULO:
    // We generate a UMUL_LOHI and then check if the high word is 0.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::UMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getConstant(0, dl, MVT::i32));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  case ISD::SMULO:
    // We generate a SMUL_LOHI and then check if all the bits of the high word
    // are the same as the sign bit of the low word.
    ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
    Value = DAG.getNode(ISD::SMUL_LOHI, dl,
                        DAG.getVTList(Op.getValueType(), Op.getValueType()),
                        LHS, RHS);
    // (low word >> 31, arithmetic) is the low word's sign bit replicated
    // across all 32 bits; that is what the high word is compared against.
    OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1),
                              DAG.getNode(ISD::SRA, dl, Op.getValueType(),
                                          Value.getValue(0),
                                          DAG.getConstant(31, dl, MVT::i32)));
    Value = Value.getValue(0); // We only want the low 32 bits for the result.
    break;
  } // switch (...)

  return std::make_pair(Value, OverflowCmp);
}

/// Lower an overflow-checking node handled by getARMXALUOOp into a
/// MERGE_VALUES of {arithmetic result, 0/1 overflow value}.
///
/// The overflow value is produced by an ARMISD::CMOV on the compare returned by
/// getARMXALUOOp. Returns an empty SDValue when the value type is not legal so
/// that the legalizer expands the node instead.
SDValue
ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue Value, OverflowCmp;
  SDValue ARMcc;
  std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDLoc dl(Op);
  // We use 0 and 1 as false and true values.
  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
  EVT VT = Op.getValueType();

  // NOTE(review): TVal (1) sits in the operand slot that getCMOV fills with
  // its FalseVal, so this appears to yield 1 when ARMcc does NOT hold, i.e.
  // when the operation overflowed - confirm against the CMOV node definition.
  SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal,
                                 ARMcc, CCR, OverflowCmp);

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

/// Turn a boolean carry value into a carry flag.
///
/// \param BoolCarry  The boolean carry value.
/// \returns Result #1 (the second result) of an ARMISD::SUBC node computing
///          BoolCarry - 1.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
                                              SelectionDAG &DAG) {
  SDLoc DL(BoolCarry);
  EVT CarryVT = BoolCarry.getValueType();

  // This converts the boolean value carry into the carry flag by doing
  // ARMISD::SUBC Carry, 1
  SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
                              DAG.getVTList(CarryVT, MVT::i32),
                              BoolCarry, DAG.getConstant(1, DL, CarryVT));
  return Carry.getValue(1);
}

/// Turn a carry flag into a boolean carry value of type \p VT.
///
/// \param Flags  The flag result of a carry-producing node (the callers in
///               this file pass result #1 of an ARMISD::ADDC / ARMISD::SUBC).
/// \returns An ARMISD::ADDE node adding 0 + 0 + carry.
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
                                              SelectionDAG &DAG) {
  SDLoc DL(Flags);

  // Now convert the carry flag into a boolean carry. We do this
  // using ARMISD::ADDE 0, 0, Carry
  return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32),
                     DAG.getConstant(0, DL, MVT::i32), Flags);
}

/// Lower UADDO / USUBO into ARMISD::ADDC / ARMISD::SUBC plus a boolean
/// overflow value derived from the carry flag.
///
/// Returns an empty SDValue when the value type is not legal so that the
/// legalizer expands the node; any other opcode is unreachable.
/// \returns MERGE_VALUES of {arithmetic node, overflow value}.
SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                             SelectionDAG &DAG) const {
  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
    return SDValue();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDLoc dl(Op);

  EVT VT = Op.getValueType();
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDValue Value;
  SDValue Overflow;
  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("Unknown overflow instruction!");
  case ISD::UADDO:
    Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    break;
  case ISD::USUBO: {
    Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
    // Convert the carry flag into a boolean value.
    Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
    // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
    // value. So compute 1 - C.
    Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
                           DAG.getConstant(1, dl, MVT::i32), Overflow);
    break;
  }
  }

  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}

/// Lower an i8 / i16 saturating add or subtract to the ARMISD QADD8b / QSUB8b /
/// QADD16b / QSUB16b nodes.
///
/// The operands are sign-extended (or truncated) to i32, the ARMISD node is
/// built at i32, and the result is truncated back to the original type.
/// ISD::SADDSAT selects the add form; any other opcode is treated as the
/// subtract form.
///
/// \returns The lowered value, or an empty SDValue when the subtarget lacks
///          v6 ops or DSP, the type is not simple, or the type is neither i8
///          nor i16.
static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *Subtarget) {
  EVT VT = Op.getValueType();
  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
    return SDValue();
  if (!VT.isSimple())
    return SDValue();

  unsigned NewOpcode;
  bool IsAdd = Op->getOpcode() == ISD::SADDSAT;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::i8:
    NewOpcode = IsAdd ? ARMISD::QADD8b : ARMISD::QSUB8b;
    break;
  case MVT::i16:
    NewOpcode = IsAdd ? ARMISD::QADD16b : ARMISD::QSUB16b;
    break;
  }

  SDLoc dl(Op);
  SDValue Add =
      DAG.getNode(NewOpcode, dl, MVT::i32,
                  DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
                  DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
}

/// Lower ISD::SELECT.
///
/// Three strategies are tried in order:
///  1. If the condition is the overflow result (result #1) of an add/sub
///     overflow node, select directly on the flags produced by getARMXALUOOp.
///  2. If the condition is a single-use ARMISD::CMOV of the constants 0 and 1,
///     fold the select into a CMOV on that node's condition.
///  3. Otherwise mask the condition to its low bit and emit a SELECT_CC that
///     compares it against zero.
///
/// Returns an empty SDValue in case 1 if the overflow node's type is not legal.
SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond = Op.getOperand(0);
  SDValue SelectTrue = Op.getOperand(1);
  SDValue SelectFalse = Op.getOperand(2);
  SDLoc dl(Op);
  unsigned Opc = Cond.getOpcode();

  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO)) {
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    EVT VT = Op.getValueType();

    // SelectTrue is passed in getCMOV's FalseVal position: it is the value
    // picked when ARMcc does not hold.
    return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR,
                   OverflowCmp, DAG);
  }

  // Convert:
  //
  //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
  //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
  //
  if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
    const ConstantSDNode *CMOVTrue =
      dyn_cast<ConstantSDNode>(Cond.getOperand(0));
    const ConstantSDNode *CMOVFalse =
      dyn_cast<ConstantSDNode>(Cond.getOperand(1));

    if (CMOVTrue && CMOVFalse) {
      unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
      unsigned CMOVFalseVal = CMOVFalse->getZExtValue();

      // True/False stay null unless the constants are exactly (1, 0) or
      // (0, 1); the null check below then skips the fold.
      SDValue True;
      SDValue False;
      if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
        True = SelectTrue;
        False = SelectFalse;
      } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
        True = SelectFalse;
        False = SelectTrue;
      }

      if (True.getNode() && False.getNode()) {
        EVT VT = Op.getValueType();
        SDValue ARMcc = Cond.getOperand(2);
        SDValue CCR = Cond.getOperand(3);
        // The new CMOV gets its own copy of the compare rather than sharing
        // the one feeding Cond.
        SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
        assert(True.getValueType() == VT);
        return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
      }
    }
  }

  // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
  // undefined bits before doing a full-word comparison with zero.
  Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
                     DAG.getConstant(1, dl, Cond.getValueType()));

  return DAG.getSelectCC(dl, Cond,
                         DAG.getConstant(0, dl, Cond.getValueType()),
                         SelectTrue, SelectFalse, ISD::SETNE);
}

/// Express the condition \p CC using only the condition codes assigned below
/// (GE, GT, VS or EQ).
///
/// \param CC          The ISD condition code to map.
/// \param CondCode    [in,out] Receives the chosen ARM condition code; left
///                    unchanged when \p CC matches none of the cases below.
/// \param swpCmpOps   [in,out] Set/toggled when the caller has to swap the
///                    operands of the compare.
/// \param swpVselOps  [in,out] Set/toggled when the caller has to swap the
///                    operands of the select.
///
/// The swap flags are only ever set or toggled, never cleared; the caller in
/// LowerSELECT_CC initializes both to false.
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                                 bool &swpCmpOps, bool &swpVselOps) {
  // Start by selecting the GE condition code for opcodes that return true for
  // 'equality'
  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
      CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
    CondCode = ARMCC::GE;

  // and GT for opcodes that return false for 'equality'.
  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
           CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
    CondCode = ARMCC::GT;

  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
  // to swap the compare operands.
  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
      CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
    swpCmpOps = true;

  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
  // If we have an unordered opcode, we need to swap the operands to the VSEL
  // instruction (effectively negating the condition).
  //
  // This also has the effect of swapping which one of 'less' or 'greater'
  // returns true, so we also swap the compare operands. It also switches
  // whether we return true for 'equality', so we compensate by picking the
  // opposite condition code to our original choice.
  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
      CC == ISD::SETUGT) {
    swpCmpOps = !swpCmpOps;
    swpVselOps = !swpVselOps;
    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
  }

  // 'ordered' is 'anything but unordered', so use the VS condition code and
  // swap the VSEL operands.
  if (CC == ISD::SETO) {
    CondCode = ARMCC::VS;
    swpVselOps = true;
  }

  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
  // code and swap the VSEL operands. Also do this if we don't care about the
  // unordered case.
  if (CC == ISD::SETUNE || CC == ISD::SETNE) {
    CondCode = ARMCC::EQ;
    swpVselOps = true;
  }
}

/// Build an ARMISD::CMOV selecting between \p FalseVal and \p TrueVal on
/// condition \p ARMcc of the flags produced by \p Cmp.
///
/// When \p VT is f64 and the subtarget has no FP64 support, both values are
/// split into two i32 halves with ARMISD::VMOVRRD, each half is selected with
/// its own i32 CMOV (the second one on a duplicate of \p Cmp), and the halves
/// are recombined with ARMISD::VMOVDRR. Otherwise a single CMOV of type \p VT
/// is emitted.
SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
                                   SDValue TrueVal, SDValue ARMcc, SDValue CCR,
                                   SDValue Cmp, SelectionDAG &DAG) const {
  if (!Subtarget->hasFP64() && VT == MVT::f64) {
    FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                           DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
    TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
                          DAG.getVTList(MVT::i32, MVT::i32), TrueVal);

    SDValue TrueLow = TrueVal.getValue(0);
    SDValue TrueHigh = TrueVal.getValue(1);
    SDValue FalseLow = FalseVal.getValue(0);
    SDValue FalseHigh = FalseVal.getValue(1);

    SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
                              ARMcc, CCR, Cmp);
    // The second CMOV uses its own copy of the compare instead of sharing Cmp.
    SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
                               ARMcc, CCR, duplicateCmp(Cmp, DAG));

    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
  } else {
    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
                       Cmp);
  }
}

/// Return true if \p CC is SETGT or SETGE.
static bool isGTorGE(ISD::CondCode CC) {
  return CC == ISD::SETGT || CC == ISD::SETGE;
}

/// Return true if \p CC is SETLT or SETLE.
static bool isLTorLE(ISD::CondCode CC) {
  return CC == ISD::SETLT || CC == ISD::SETLE;
}

// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
// All of these conditions (and their <= and >= counterparts) will do:
//          x < k ? k : x
//          x > k ? x : k
//          k < x ? x : k
//          k > x ? k : x
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
                            const SDValue TrueVal, const SDValue FalseVal,
                            const ISD::CondCode CC, const SDValue K) {
  return (isGTorGE(CC) &&
          ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
         (isLTorLE(CC) &&
          ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
}

// Check if two chained conditionals could be converted into SSAT or USAT.
//
// SSAT can replace a set of two conditional selectors that bound a number to an
// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
//
//     x < -k ? -k : (x > k ? k : x)
//     x < -k ? -k : (x < k ? x : k)
//     x > -k ? (x > k ? k : x) : -k
//     x < k ? (x < -k ? -k : x) : k
//     etc.
//
// LLVM canonicalizes these to either a min(max()) or a max(min())
// pattern. This function tries to match one of these and will return a SSAT
// node if successful.
//
// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
// is a power of 2.
//
// Op is a SELECT_CC node (operands: LHS, RHS, true value, false value,
// condition code). An empty SDValue is returned when no pattern matches.
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDValue V1 = Op.getOperand(0);
  SDValue K1 = Op.getOperand(1);
  SDValue TrueVal1 = Op.getOperand(2);
  SDValue FalseVal1 = Op.getOperand(3);
  ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();

  // The inner select is whichever value operand of the outer select is not a
  // constant; it has to be a SELECT_CC itself.
  const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
  if (Op2.getOpcode() != ISD::SELECT_CC)
    return SDValue();

  SDValue V2 = Op2.getOperand(0);
  SDValue K2 = Op2.getOperand(1);
  SDValue TrueVal2 = Op2.getOperand(2);
  SDValue FalseVal2 = Op2.getOperand(3);
  ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();

  SDValue V1Tmp = V1;
  SDValue V2Tmp = V2;

  // Check that the registers and the constants match a max(min()) or min(max())
  // pattern
  if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
      K2 != FalseVal2 ||
      !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
    return SDValue();

  // Check that the constant in the lower-bound check is
  // the opposite of the constant in the upper-bound check
  // in 1's complement.
  if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
    return SDValue();

  int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
  int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
  int64_t PosVal = std::max(Val1, Val2);
  int64_t NegVal = std::min(Val1, Val2);

  // The larger constant must belong to the select that uses a 'less'
  // comparison, and the larger constant plus one must be a power of two.
  if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
      !isPowerOf2_64(PosVal + 1))
    return SDValue();

  // Handle the difference between USAT (unsigned) and SSAT (signed)
  // saturation
  // At this point, PosVal is guaranteed to be positive
  uint64_t K = PosVal;
  SDLoc dl(Op);
  // Bounds of the form [~k, k] map to SSAT, bounds of the form [0, k] to USAT.
  // In both cases the node's constant operand is the number of trailing one
  // bits of k.
  if (Val1 == ~Val2)
    return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
                       DAG.getConstant(countTrailingOnes(K), dl, VT));
  if (NegVal == 0)
    return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
                       DAG.getConstant(countTrailingOnes(K), dl, VT));

  return SDValue();
}

// Check if a condition of the type x < k ? k : x can be converted into a
// bit operation instead of conditional moves.
// Currently this is allowed given:
// - The conditions and values match up
// - k is 0 or -1 (all ones)
// This function will not check the last condition, that's up to the caller.
// It returns true if the transformation can be made, and in such case
// returns x in V, and k in SatK.
//
// Op is a SELECT_CC node (operands: LHS, RHS, true value, false value,
// condition code). Note that V is written even when false is returned.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
                                         SDValue &SatK)
{
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);

  // K points at the constant side of the comparison (LHS is preferred when
  // both sides are constants), or is null when neither side is a constant.
  SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
                                               ? &RHS
                                               : nullptr;

  // No constant operation in comparison, early out
  if (!K)
    return false;

  // KTmp is the constant among the selected values (FalseVal if TrueVal is
  // not a constant), V the other selected value, and VTmp the non-K side of
  // the comparison.
  SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
  V = (KTmp == TrueVal) ? FalseVal : TrueVal;
  SDValue VTmp = (K && *K == LHS) ? RHS : LHS;

  // If the constant on left and right side, or variable on left and right,
  // does not match, early out
  if (*K != KTmp || V != VTmp)
    return false;

  if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
    SatK = *K;
    return true;
  }

  return false;
}

/// Return true if the floating-point type \p VT is not supported by the
/// subtarget: f32 requires the VFP2 base, f64 requires FP64 and f16 requires
/// full FP16. Every other type yields false.
bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
  if (VT == MVT::f32)
    return !Subtarget->hasVFP2Base();
  if (VT == MVT::f64)
    return !Subtarget->hasFP64();
  if (VT == MVT::f16)
    return !Subtarget->hasFullFP16();
  return false;
}

/// Lower ISD::SELECT_CC (operands: LHS, RHS, true value, false value,
/// condition code).
///
/// The following strategies are tried in order:
///  1. Two chained saturating selects -> SSAT / USAT (ARM mode with v6 ops, or
///     Thumb2).
///  2. i32 "x < k ? k : x"-style selects with k == 0 or k == -1 -> shift plus
///     AND / OR.
///  3. With v8.1-M Mainline ops, i32 compares selecting between two related
///     constants -> CSINV / CSNEG / CSINC.
///  4. Compare operands of an unsupported floating-point type are softened
///     first.
///  5. i32 compares -> ARM compare plus CMOV.
///  6. Floating-point compares -> VFP compare plus one or two CMOVs.
SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  // Try to convert two saturating conditional selects into a single SSAT
  if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
    if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
      return SatValue;

  // Try to convert expressions of the form x < k ? k : x (and similar forms)
  // into more efficient bit operations, which is possible when k is 0 or -1
  // On ARM and Thumb-2 which have flexible operand 2 this will result in
  // single instructions. On Thumb the shift and the bit operation will be two
  // instructions.
  // Only allow this transformation on full-width (32-bit) operations
  SDValue LowerSatConstant;
  SDValue SatValue;
  if (VT == MVT::i32 &&
      isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
    // ShiftV is the sign bit of x replicated across the word (x >> 31,
    // arithmetic): k == 0 gives x & ~ShiftV, k == -1 gives x | ShiftV.
    SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                 DAG.getConstant(31, dl, VT));
    if (isNullConstant(LowerSatConstant)) {
      SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                      DAG.getAllOnesConstant(dl, VT));
      return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
    } else if (isAllOnesConstant(LowerSatConstant))
      return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
  }

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  SDValue TrueVal = Op.getOperand(2);
  SDValue FalseVal = Op.getOperand(3);
  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);

  if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
      LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
    unsigned TVal = CTVal->getZExtValue();
    unsigned FVal = CFVal->getZExtValue();
    unsigned Opcode = 0;

    // Pick the conditional-select variant from the relation between the two
    // constants: bitwise inverse -> CSINV, arithmetic negation -> CSNEG,
    // differing by one -> CSINC (swapping the values and inverting the
    // condition when it is TVal that is the larger one).
    if (TVal == ~FVal) {
      Opcode = ARMISD::CSINV;
    } else if (TVal == ~FVal + 1) {
      Opcode = ARMISD::CSNEG;
    } else if (TVal + 1 == FVal) {
      Opcode = ARMISD::CSINC;
    } else if (TVal == FVal + 1) {
      Opcode = ARMISD::CSINC;
      std::swap(TrueVal, FalseVal);
      std::swap(TVal, FVal);
      CC = ISD::getSetCCInverse(CC, LHS.getValueType());
    }

    if (Opcode) {
      // If one of the constants is cheaper than another, materialise the
      // cheaper one and let the csel generate the other.
      if (Opcode != ARMISD::CSINC &&
          HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Attempt to use ZR checking TVal is 0, possibly inverting the condition
      // to get there. CSINC is not invertible like the other two (~(~a) == a,
      // -(-a) == a, but (a+1)+1 != a).
      if (FVal == 0 && Opcode != ARMISD::CSINC) {
        std::swap(TrueVal, FalseVal);
        std::swap(TVal, FVal);
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
      }

      // Drops F's value because we can get it by inverting/negating TVal.
      FalseVal = TrueVal;

      SDValue ARMcc;
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      // Note: this VT shadows the function-level VT within this scope.
      EVT VT = TrueVal.getValueType();
      return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
    }
  }

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  if (LHS.getValueType() == MVT::i32) {
    // Try to generate VSEL on ARMv8.
    // The VSEL instruction can't use all the usual ARM condition
    // codes: it only has two bits to select the condition code, so it's
    // constrained to use only GE, GT, VS and EQ.
    //
    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
    // swap the operands of the previous compare instruction (effectively
    // inverting the compare condition, swapping 'less' and 'greater') and
    // sometimes need to swap the operands to the VSEL (which inverts the
    // condition in the sense of firing whenever the previous condition didn't)
    if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
                                        TrueVal.getValueType() == MVT::f32 ||
                                        TrueVal.getValueType() == MVT::f64)) {
      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
      // LT, LE, VC and NE are the inverses of the four codes VSEL can use, so
      // invert the condition and swap the selected values instead.
      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
        CC = ISD::getSetCCInverse(CC, LHS.getValueType());
        std::swap(TrueVal, FalseVal);
      }
    }

    SDValue ARMcc;
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    // Choose GE over PL, which vsel does not support
    if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL)
      ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
    return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  }

  // Floating-point compare. FPCCToARMCC may hand back a second condition code
  // (CondCode2 != AL), in which case a second CMOV is emitted further down.
  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
  // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
  // must use VSEL (limited condition codes), due to not having conditional f16
  // moves.
  if (Subtarget->hasFPARMv8Base() &&
      !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
    bool swpCmpOps = false;
    bool swpVselOps = false;
    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);

    // Only apply the requested swaps when the resulting condition code is one
    // of the four that VSEL can use.
    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
      if (swpCmpOps)
        std::swap(LHS, RHS);
      if (swpVselOps)
        std::swap(TrueVal, FalseVal);
    }
  }

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
  if (CondCode2 != ARMCC::AL) {
    SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
    // FIXME: Needs another CMP because flag can have but one use.
    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
    // Chain the second select on the first: the previous Result is passed in
    // getCMOV's FalseVal position.
    Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
  }
  return Result;
}

/// canChangeToInt - Given the fp compare operand, return true if it is suitable
/// to morph to an integer compare sequence.
///
/// \p SeenZero is set to true when \p Op is a floating-point zero; it is never
/// cleared here.
static bool canChangeToInt(SDValue Op, bool &SeenZero,
                           const ARMSubtarget *Subtarget) {
  SDNode *N = Op.getNode();
  if (!N->hasOneUse())
    // Otherwise it requires moving the value from fp to integer registers.
    return false;
  if (!N->getNumValues())
    return false;
  EVT VT = Op.getValueType();
  if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
    // f32 case is generally profitable. f64 case only makes sense when vcmpe +
    // vmrs are very slow, e.g. cortex-a8.
    return false;

  if (isFloatingPointZero(Op)) {
    SeenZero = true;
    return true;
  }
  // Apart from zero, only a normal load qualifies.
  return ISD::isNormalLoad(N);
}

/// Return an i32 version of an f32 compare operand: the constant 0 for a
/// floating-point zero, or an i32 load built from the chain, base pointer,
/// pointer info, alignment and flags of the original load. Any other operand
/// is unreachable; the caller in this file (OptimizeVFPBrcond) only gets here
/// after canChangeToInt accepted the operand.
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
  if (isFloatingPointZero(Op))
    return DAG.getConstant(0, SDLoc(Op), MVT::i32);

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
    return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
                       Ld->getPointerInfo(), Ld->getAlignment(),
                       Ld->getMemOperand()->getFlags());

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// Split an f64 compare operand into two i32 values.
///
/// For a floating-point zero both outputs are the constant 0. For a load,
/// \p RetVal1 is an i32 load from the original address and \p RetVal2 an i32
/// load from that address plus 4. Any other operand is unreachable.
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                           SDValue &RetVal1, SDValue &RetVal2) {
  SDLoc dl(Op);

  if (isFloatingPointZero(Op)) {
    RetVal1 = DAG.getConstant(0, dl, MVT::i32);
    RetVal2 = DAG.getConstant(0, dl, MVT::i32);
    return;
  }

  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
    SDValue Ptr = Ld->getBasePtr();
    RetVal1 =
        DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
                    Ld->getAlignment(), Ld->getMemOperand()->getFlags());

    EVT PtrType = Ptr.getValueType();
    // The second word sits 4 bytes further on, so its alignment is derived
    // from the original alignment and the offset 4.
    unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
    SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
                                 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
    RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
                          Ld->getPointerInfo().getWithOffset(4), NewAlign,
                          Ld->getMemOperand()->getFlags());
    return;
  }

  llvm_unreachable("Unknown VFP cmp argument!");
}

/// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
/// f32 and even f64 comparisons to integer ones.
SDValue
ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
  // Op is a BR_CC node: chain, condition code, LHS, RHS, destination block.
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  bool LHSSeenZero = false;
  bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
  bool RHSSeenZero = false;
  bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
  // Both operands must be convertible and at least one must be an fp zero.
  if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
    // If unsafe fp math optimization is enabled and there are no other uses of
    // the CMP operands, and the condition code is EQ or NE, we can optimize it
    // to an integer comparison.
    if (CC == ISD::SETOEQ)
      CC = ISD::SETEQ;
    else if (CC == ISD::SETUNE)
      CC = ISD::SETNE;

    // The mask clears the top bit of a 32-bit word.
    // NOTE(review): presumably this strips the IEEE sign bit so that +0.0 and
    // -0.0 compare equal as integers - confirm.
    SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
    SDValue ARMcc;
    if (LHS.getValueType() == MVT::f32) {
      LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(LHS, DAG), Mask);
      RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
                        bitcastf32Toi32(RHS, DAG), Mask);
      SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
      SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
      return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                         Chain, Dest, ARMcc, CCR, Cmp);
    }

    // Remaining case: split each operand into two i32 words, mask the second
    // word of each, and emit a 64-bit compare-and-branch node.
    SDValue LHS1, LHS2;
    SDValue RHS1, RHS2;
    expandf64Toi32(LHS, DAG, LHS1, LHS2);
    expandf64Toi32(RHS, DAG, RHS1, RHS2);
    LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
    RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
    ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
    ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
    SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
  }

  return SDValue();
}

/// Lower ISD::BRCOND (operands: chain, condition, destination block).
///
/// Only the case where the condition is the overflow result (result #1) of an
/// add/sub overflow node - or of a multiply overflow node when the subtarget
/// is not Thumb1-only - is handled: the branch is then taken directly on the
/// flags produced by getARMXALUOOp. Every other case, and an overflow node of
/// a non-legal type, returns an empty SDValue.
SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = Cond.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  if (Cond.getResNo() == 1 &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);

    // Reverse the condition code.
    ARMCC::CondCodes CondCode =
        (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
    CondCode = ARMCC::getOppositeCondition(CondCode);
    ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  return SDValue();
}

/// Lower ISD::BR_CC (operands: chain, condition code, LHS, RHS, destination).
///
/// Steps, in order:
///  1. Compare operands of an unsupported floating-point type are softened.
///  2. A SETEQ/SETNE comparison of an overflow result against 0 or 1 branches
///     directly on the flags produced by getARMXALUOOp.
///  3. i32 compares become an ARM compare plus ARMISD::BRCOND.
///  4. With UnsafeFPMath, EQ/NE floating-point compares are offered to
///     OptimizeVFPBrcond.
///  5. Otherwise a VFP compare is emitted, followed by one or two BRCONDs.
SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
  SDValue LHS = Op.getOperand(2);
  SDValue RHS = Op.getOperand(3);
  SDValue Dest = Op.getOperand(4);
  SDLoc dl(Op);

  if (isUnsupportedFloatingType(LHS.getValueType())) {
    DAG.getTargetLoweringInfo().softenSetCCOperands(
        DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);

    // If softenSetCCOperands only returned one value, we should compare it to
    // zero.
    if (!RHS.getNode()) {
      RHS = DAG.getConstant(0, dl, LHS.getValueType());
      CC = ISD::SETNE;
    }
  }

  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
  // instruction.
  unsigned Opc = LHS.getOpcode();
  bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                     !Subtarget->isThumb1Only();
  if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
       Opc == ISD::USUBO || OptimizeMul) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // Only lower legal XALUO ops.
    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
      return SDValue();

    // The actual operation with overflow check.
    SDValue Value, OverflowCmp;
    SDValue ARMcc;
    std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);

    // ARMcc is used as returned for (SETEQ, 0) and (SETNE, 1), and inverted
    // for (SETNE, 0) and (SETEQ, 1).
    if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
      // Reverse the condition code.
      ARMCC::CondCodes CondCode =
          (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
      CondCode = ARMCC::getOppositeCondition(CondCode);
      ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
    }
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);

    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                       OverflowCmp);
  }

  if (LHS.getValueType() == MVT::i32) {
    SDValue ARMcc;
    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                       Chain, Dest, ARMcc, CCR, Cmp);
  }

  if (getTargetMachine().Options.UnsafeFPMath &&
      (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
       CC == ISD::SETNE || CC == ISD::SETUNE)) {
    if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
      return Result;
  }

  ARMCC::CondCodes CondCode, CondCode2;
  FPCCToARMCC(CC, CondCode, CondCode2);

  SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  if (CondCode2 != ARMCC::AL) {
    // A second condition code is needed: emit a second branch to the same
    // destination, chained on the first branch and fed by its glue result.
    ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
    SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
  }
  return Res;
}

/// Lower ISD::BR_JT (operands: chain, jump table, index).
///
/// The entry address is computed as the wrapped jump-table address plus
/// index * 4. Three forms are then produced:
///  - Thumb2, or Thumb with v8-M Baseline ops: ARMISD::BR2_JT on that address
///    together with the original index operand.
///  - Position-independent or ROPI code: the i32 entry is loaded and added to
///    the table address before ARMISD::BR_JT.
///  - Otherwise: the loaded entry is used as the ARMISD::BR_JT target as is.
SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Table = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PTy = getPointerTy(DAG.getDataLayout());
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
  if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
    // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump
    // table which does another jump to the destination. This also makes it
    // easier to translate it to TBB / TBH later (Thumb2 only).
    // FIXME: This might not work if the function is extremely large.
    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                       Addr, Op.getOperand(2), JTI);
  }
  if (isPositionIndependent() || Subtarget->isROPI()) {
    // The loaded entry is added to the table address to form the target.
    Addr =
        DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  } else {
    // The loaded entry is used as the branch target directly.
    Addr =
        DAG.getLoad(PTy, dl, Chain, Addr,
                    MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
    Chain = Addr.getValue(1);
    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
  }
}

/// Lower a vector floating-point to integer conversion.
///
/// - Result elements of type i32: the node is returned unchanged when the
///   source elements are f32, and unrolled otherwise.
/// - Otherwise the source must be v4f32, or v4f16 / v8f16 with full FP16
///   support (anything else is unreachable). If the result type is v4i16 or
///   v8i16 the conversion is done to the matching integer vector type and then
///   truncated to the result type; any other result type is unrolled.
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getValueType().getVectorElementType() == MVT::i32) {
    if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  const bool HasFullFP16 =
    static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  EVT NewTy;
  const EVT OpTy = Op.getOperand(0).getValueType();
  if (OpTy == MVT::v4f32)
    NewTy = MVT::v4i32;
  else if (OpTy == MVT::v4f16 && HasFullFP16)
    NewTy = MVT::v4i16;
  else if (OpTy == MVT::v8f16 && HasFullFP16)
    NewTy = MVT::v8i16;
  else
    llvm_unreachable("Invalid type for custom lowering!");

  if (VT != MVT::v4i16 && VT != MVT::v8i16)
    return DAG.UnrollVectorOp(Op.getNode());

  Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
  return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
}

/// Lower a (possibly strict) floating-point to integer conversion.
///
/// - Vector results are handed to LowerVectorFP_TO_INT.
/// - If the source type is an unsupported floating-point type, a runtime
///   library call is emitted. For strict nodes the incoming chain is passed to
///   the call and the result is merged with the call's chain.
/// - Remaining strict nodes are rebuilt as the non-strict opcode and merged
///   with the incoming chain.
/// - Everything else is returned unchanged.
SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorFP_TO_INT(Op, DAG);

  // Strict nodes carry the chain as operand 0, so the source is operand 1.
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);

  if (isUnsupportedFloatingType(SrcVal.getValueType())) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::FP_TO_SINT ||
        Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
      LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
                              Op.getValueType());
    SDLoc Loc(Op);
    MakeLibCallOptions CallOptions;
    SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
    SDValue Result;
    std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
                                          CallOptions, Loc, Chain);
    return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
  }

  // FIXME: Remove this when we have strict fp instruction selection patterns
  if (IsStrict) {
    SDLoc Loc(Op);
    SDValue Result =
        DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
                                                             : ISD::FP_TO_UINT,
                    Loc, Op.getValueType(), SrcVal);
    return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
  }

  return Op;
}

/// Lower a vector integer to floating-point conversion.
///
/// - Source elements of type i32: the node is returned unchanged when the
///   result elements are f32, and unrolled otherwise.
/// - Otherwise the source must be v4i16 or v8i16 (asserted). For a v4f32
///   result, or a v4f16 / v8f16 result with full FP16 support, the source is
///   sign- or zero-extended (for SINT_TO_FP / UINT_TO_FP respectively) to the
///   matching integer vector type and then converted; any other result type
///   is unrolled.
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);

  if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
    if (VT.getVectorElementType() == MVT::f32)
      return Op;
    return DAG.UnrollVectorOp(Op.getNode());
  }

  assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
          Op.getOperand(0).getValueType() == MVT::v8i16) &&
         "Invalid type for custom lowering!");

  const bool HasFullFP16 =
    static_cast<const ARMSubtarget&>(DAG.getSubtarget()).hasFullFP16();

  EVT DestVecType;
  if (VT == MVT::v4f32)
    DestVecType = MVT::v4i32;
  else if (VT == MVT::v4f16 && HasFullFP16)
    DestVecType = MVT::v4i16;
  else if (VT == MVT::v8f16 && HasFullFP16)
    DestVecType = MVT::v8i16;
  else
    return DAG.UnrollVectorOp(Op.getNode());

  unsigned CastOpc;
  unsigned Opc;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Invalid opcode!");
  case ISD::SINT_TO_FP:
    CastOpc = ISD::SIGN_EXTEND;
    Opc     = ISD::SINT_TO_FP;
    break;
  case ISD::UINT_TO_FP:
    CastOpc = ISD::ZERO_EXTEND;
    Opc     = ISD::UINT_TO_FP;
    break;
  }

  Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
  return DAG.getNode(Opc, dl, VT, Op);
}

SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  if (VT.isVector())
    return LowerVectorINT_TO_FP(Op, DAG);
  if (isUnsupportedFloatingType(VT)) {
    RTLIB::Libcall LC;
    if (Op.getOpcode() == ISD::SINT_TO_FP)
      LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
                              Op.getValueType());
    else
      LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5718 Op.getValueType()); 5719 MakeLibCallOptions CallOptions; 5720 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), 5721 CallOptions, SDLoc(Op)).first; 5722 } 5723 5724 return Op; 5725 } 5726 5727 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 5728 // Implement fcopysign with a fabs and a conditional fneg. 5729 SDValue Tmp0 = Op.getOperand(0); 5730 SDValue Tmp1 = Op.getOperand(1); 5731 SDLoc dl(Op); 5732 EVT VT = Op.getValueType(); 5733 EVT SrcVT = Tmp1.getValueType(); 5734 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || 5735 Tmp0.getOpcode() == ARMISD::VMOVDRR; 5736 bool UseNEON = !InGPR && Subtarget->hasNEON(); 5737 5738 if (UseNEON) { 5739 // Use VBSL to copy the sign bit. 5740 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80); 5741 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, 5742 DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); 5743 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; 5744 if (VT == MVT::f64) 5745 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5746 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), 5747 DAG.getConstant(32, dl, MVT::i32)); 5748 else /*if (VT == MVT::f32)*/ 5749 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); 5750 if (SrcVT == MVT::f32) { 5751 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); 5752 if (VT == MVT::f64) 5753 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, 5754 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), 5755 DAG.getConstant(32, dl, MVT::i32)); 5756 } else if (VT == MVT::f32) 5757 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, 5758 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), 5759 DAG.getConstant(32, dl, MVT::i32)); 5760 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); 5761 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); 5762 5763 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), 5764 dl, MVT::i32); 5765 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); 5766 SDValue 
MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, 5767 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); 5768 5769 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, 5770 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), 5771 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); 5772 if (VT == MVT::f32) { 5773 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); 5774 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, 5775 DAG.getConstant(0, dl, MVT::i32)); 5776 } else { 5777 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); 5778 } 5779 5780 return Res; 5781 } 5782 5783 // Bitcast operand 1 to i32. 5784 if (SrcVT == MVT::f64) 5785 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5786 Tmp1).getValue(1); 5787 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); 5788 5789 // Or in the signbit with integer operations. 5790 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); 5791 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); 5792 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); 5793 if (VT == MVT::f32) { 5794 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, 5795 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); 5796 return DAG.getNode(ISD::BITCAST, dl, MVT::f32, 5797 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); 5798 } 5799 5800 // f64: Or the high part with signbit and then combine two parts. 
5801 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), 5802 Tmp0); 5803 SDValue Lo = Tmp0.getValue(0); 5804 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); 5805 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); 5806 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); 5807 } 5808 5809 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ 5810 MachineFunction &MF = DAG.getMachineFunction(); 5811 MachineFrameInfo &MFI = MF.getFrameInfo(); 5812 MFI.setReturnAddressIsTaken(true); 5813 5814 if (verifyReturnAddressArgumentIsConstant(Op, DAG)) 5815 return SDValue(); 5816 5817 EVT VT = Op.getValueType(); 5818 SDLoc dl(Op); 5819 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5820 if (Depth) { 5821 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 5822 SDValue Offset = DAG.getConstant(4, dl, MVT::i32); 5823 return DAG.getLoad(VT, dl, DAG.getEntryNode(), 5824 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), 5825 MachinePointerInfo()); 5826 } 5827 5828 // Return LR, which contains the return address. Mark it an implicit live-in. 
5829 unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); 5830 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); 5831 } 5832 5833 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 5834 const ARMBaseRegisterInfo &ARI = 5835 *static_cast<const ARMBaseRegisterInfo*>(RegInfo); 5836 MachineFunction &MF = DAG.getMachineFunction(); 5837 MachineFrameInfo &MFI = MF.getFrameInfo(); 5838 MFI.setFrameAddressIsTaken(true); 5839 5840 EVT VT = Op.getValueType(); 5841 SDLoc dl(Op); // FIXME probably not meaningful 5842 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5843 Register FrameReg = ARI.getFrameRegister(MF); 5844 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 5845 while (Depth--) 5846 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 5847 MachinePointerInfo()); 5848 return FrameAddr; 5849 } 5850 5851 // FIXME? Maybe this could be a TableGen attribute on some registers and 5852 // this table could be generated automatically from RegInfo. 5853 Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT, 5854 const MachineFunction &MF) const { 5855 Register Reg = StringSwitch<unsigned>(RegName) 5856 .Case("sp", ARM::SP) 5857 .Default(0); 5858 if (Reg) 5859 return Reg; 5860 report_fatal_error(Twine("Invalid register name \"" 5861 + StringRef(RegName) + "\".")); 5862 } 5863 5864 // Result is 64 bit value so split into two 32 bit values and return as a 5865 // pair of values. 5866 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, 5867 SelectionDAG &DAG) { 5868 SDLoc DL(N); 5869 5870 // This function is only supposed to be called for i64 type destination. 
5871 assert(N->getValueType(0) == MVT::i64 5872 && "ExpandREAD_REGISTER called for non-i64 type result."); 5873 5874 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, 5875 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), 5876 N->getOperand(0), 5877 N->getOperand(1)); 5878 5879 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), 5880 Read.getValue(1))); 5881 Results.push_back(Read.getOperand(0)); 5882 } 5883 5884 /// \p BC is a bitcast that is about to be turned into a VMOVDRR. 5885 /// When \p DstVT, the destination type of \p BC, is on the vector 5886 /// register bank and the source of bitcast, \p Op, operates on the same bank, 5887 /// it might be possible to combine them, such that everything stays on the 5888 /// vector register bank. 5889 /// \p return The node that would replace \p BT, if the combine 5890 /// is possible. 5891 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, 5892 SelectionDAG &DAG) { 5893 SDValue Op = BC->getOperand(0); 5894 EVT DstVT = BC->getValueType(0); 5895 5896 // The only vector instruction that can produce a scalar (remember, 5897 // since the bitcast was about to be turned into VMOVDRR, the source 5898 // type is i64) from a vector is EXTRACT_VECTOR_ELT. 5899 // Moreover, we can do this combine only if there is one use. 5900 // Finally, if the destination type is not a vector, there is not 5901 // much point on forcing everything on the vector bank. 5902 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 5903 !Op.hasOneUse()) 5904 return SDValue(); 5905 5906 // If the index is not constant, we will introduce an additional 5907 // multiply that will stick. 5908 // Give up in that case. 5909 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5910 if (!Index) 5911 return SDValue(); 5912 unsigned DstNumElt = DstVT.getVectorNumElements(); 5913 5914 // Compute the new index. 
5915 const APInt &APIntIndex = Index->getAPIntValue(); 5916 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); 5917 NewIndex *= APIntIndex; 5918 // Check if the new constant index fits into i32. 5919 if (NewIndex.getBitWidth() > 32) 5920 return SDValue(); 5921 5922 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> 5923 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) 5924 SDLoc dl(Op); 5925 SDValue ExtractSrc = Op.getOperand(0); 5926 EVT VecVT = EVT::getVectorVT( 5927 *DAG.getContext(), DstVT.getScalarType(), 5928 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); 5929 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); 5930 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, 5931 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); 5932 } 5933 5934 /// ExpandBITCAST - If the target supports VFP, this function is called to 5935 /// expand a bit convert where either the source or destination type is i64 to 5936 /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 5937 /// operand type is illegal (e.g., v2f32 for a target that doesn't support 5938 /// vectors), since the legalizer won't know what to do with that. 5939 SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG, 5940 const ARMSubtarget *Subtarget) const { 5941 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 5942 SDLoc dl(N); 5943 SDValue Op = N->getOperand(0); 5944 5945 // This function is only supposed to be called for i16 and i64 types, either 5946 // as the source or destination of the bit convert. 
5947 EVT SrcVT = Op.getValueType(); 5948 EVT DstVT = N->getValueType(0); 5949 5950 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) && 5951 (DstVT == MVT::f16 || DstVT == MVT::bf16)) 5952 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(), 5953 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op)); 5954 5955 if ((DstVT == MVT::i16 || DstVT == MVT::i32) && 5956 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) 5957 return DAG.getNode( 5958 ISD::TRUNCATE, SDLoc(N), DstVT, 5959 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op)); 5960 5961 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) 5962 return SDValue(); 5963 5964 // Turn i64->f64 into VMOVDRR. 5965 if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { 5966 // Do not force values to GPRs (this is what VMOVDRR does for the inputs) 5967 // if we can combine the bitcast with its source. 5968 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) 5969 return Val; 5970 5971 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5972 DAG.getConstant(0, dl, MVT::i32)); 5973 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, 5974 DAG.getConstant(1, dl, MVT::i32)); 5975 return DAG.getNode(ISD::BITCAST, dl, DstVT, 5976 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); 5977 } 5978 5979 // Turn f64->i64 into VMOVRRD. 5980 if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { 5981 SDValue Cvt; 5982 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && 5983 SrcVT.getVectorNumElements() > 1) 5984 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5985 DAG.getVTList(MVT::i32, MVT::i32), 5986 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); 5987 else 5988 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, 5989 DAG.getVTList(MVT::i32, MVT::i32), Op); 5990 // Merge the pieces into a single i64 value. 5991 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); 5992 } 5993 5994 return SDValue(); 5995 } 5996 5997 /// getZeroVector - Returns a vector of specified type with all zero elements. 
5998 /// Zero vectors are used to represent vector negation and in those cases 5999 /// will be implemented with the NEON VNEG instruction. However, VNEG does 6000 /// not support i64 elements, so sometimes the zero vectors will need to be 6001 /// explicitly constructed. Regardless, use a canonical VMOV to create the 6002 /// zero vector. 6003 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { 6004 assert(VT.isVector() && "Expected a vector type"); 6005 // The canonical modified immediate encoding of a zero vector is....0! 6006 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); 6007 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; 6008 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); 6009 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 6010 } 6011 6012 /// LowerShiftRightParts - Lower SRA_PARTS, which returns two 6013 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 6014 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, 6015 SelectionDAG &DAG) const { 6016 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6017 EVT VT = Op.getValueType(); 6018 unsigned VTBits = VT.getSizeInBits(); 6019 SDLoc dl(Op); 6020 SDValue ShOpLo = Op.getOperand(0); 6021 SDValue ShOpHi = Op.getOperand(1); 6022 SDValue ShAmt = Op.getOperand(2); 6023 SDValue ARMcc; 6024 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6025 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? 
ISD::SRA : ISD::SRL; 6026 6027 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); 6028 6029 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6030 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 6031 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); 6032 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 6033 DAG.getConstant(VTBits, dl, MVT::i32)); 6034 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); 6035 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 6036 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); 6037 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6038 ISD::SETGE, ARMcc, DAG, dl); 6039 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, 6040 ARMcc, CCR, CmpLo); 6041 6042 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); 6043 SDValue HiBigShift = Opc == ISD::SRA 6044 ? DAG.getNode(Opc, dl, VT, ShOpHi, 6045 DAG.getConstant(VTBits - 1, dl, VT)) 6046 : DAG.getConstant(0, dl, VT); 6047 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6048 ISD::SETGE, ARMcc, DAG, dl); 6049 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 6050 ARMcc, CCR, CmpHi); 6051 6052 SDValue Ops[2] = { Lo, Hi }; 6053 return DAG.getMergeValues(Ops, dl); 6054 } 6055 6056 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two 6057 /// i32 values and take a 2 x i32 value to shift plus a shift amount. 
6058 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, 6059 SelectionDAG &DAG) const { 6060 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6061 EVT VT = Op.getValueType(); 6062 unsigned VTBits = VT.getSizeInBits(); 6063 SDLoc dl(Op); 6064 SDValue ShOpLo = Op.getOperand(0); 6065 SDValue ShOpHi = Op.getOperand(1); 6066 SDValue ShAmt = Op.getOperand(2); 6067 SDValue ARMcc; 6068 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6069 6070 assert(Op.getOpcode() == ISD::SHL_PARTS); 6071 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6072 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); 6073 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); 6074 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); 6075 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); 6076 6077 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, 6078 DAG.getConstant(VTBits, dl, MVT::i32)); 6079 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); 6080 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6081 ISD::SETGE, ARMcc, DAG, dl); 6082 SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, 6083 ARMcc, CCR, CmpHi); 6084 6085 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), 6086 ISD::SETGE, ARMcc, DAG, dl); 6087 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6088 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, 6089 DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); 6090 6091 SDValue Ops[2] = { Lo, Hi }; 6092 return DAG.getMergeValues(Ops, dl); 6093 } 6094 6095 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, 6096 SelectionDAG &DAG) const { 6097 // The rounding mode is in bits 23:22 of the FPSCR. 
6098 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 6099 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) 6100 // so that the shift + and get folded into a bitfield extract. 6101 SDLoc dl(Op); 6102 SDValue Chain = Op.getOperand(0); 6103 SDValue Ops[] = {Chain, 6104 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)}; 6105 6106 SDValue FPSCR = 6107 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops); 6108 Chain = FPSCR.getValue(1); 6109 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, 6110 DAG.getConstant(1U << 22, dl, MVT::i32)); 6111 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, 6112 DAG.getConstant(22, dl, MVT::i32)); 6113 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, 6114 DAG.getConstant(3, dl, MVT::i32)); 6115 return DAG.getMergeValues({And, Chain}, dl); 6116 } 6117 6118 SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op, 6119 SelectionDAG &DAG) const { 6120 SDLoc DL(Op); 6121 SDValue Chain = Op->getOperand(0); 6122 SDValue RMValue = Op->getOperand(1); 6123 6124 // The rounding mode is in bits 23:22 of the FPSCR. 6125 // The llvm.set.rounding argument value to ARM rounding mode value mapping 6126 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is 6127 // ((arg - 1) & 3) << 22). 6128 // 6129 // It is expected that the argument of llvm.set.rounding is within the 6130 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is 6131 // responsibility of the code generated llvm.set.rounding to ensure this 6132 // condition. 6133 6134 // Calculate new value of FPSCR[23:22]. 
6135 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue, 6136 DAG.getConstant(1, DL, MVT::i32)); 6137 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue, 6138 DAG.getConstant(0x3, DL, MVT::i32)); 6139 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue, 6140 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32)); 6141 6142 // Get current value of FPSCR. 6143 SDValue Ops[] = {Chain, 6144 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)}; 6145 SDValue FPSCR = 6146 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops); 6147 Chain = FPSCR.getValue(1); 6148 FPSCR = FPSCR.getValue(0); 6149 6150 // Put new rounding mode into FPSCR[23:22]. 6151 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos); 6152 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR, 6153 DAG.getConstant(RMMask, DL, MVT::i32)); 6154 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue); 6155 SDValue Ops2[] = { 6156 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR}; 6157 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2); 6158 } 6159 6160 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, 6161 const ARMSubtarget *ST) { 6162 SDLoc dl(N); 6163 EVT VT = N->getValueType(0); 6164 if (VT.isVector() && ST->hasNEON()) { 6165 6166 // Compute the least significant set bit: LSB = X & -X 6167 SDValue X = N->getOperand(0); 6168 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); 6169 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); 6170 6171 EVT ElemTy = VT.getVectorElementType(); 6172 6173 if (ElemTy == MVT::i8) { 6174 // Compute with: cttz(x) = ctpop(lsb - 1) 6175 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6176 DAG.getTargetConstant(1, dl, ElemTy)); 6177 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 6178 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 6179 } 6180 6181 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && 6182 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { 6183 // Compute with: 
cttz(x) = (width - 1) - ctlz(lsb), if x != 0 6184 unsigned NumBits = ElemTy.getSizeInBits(); 6185 SDValue WidthMinus1 = 6186 DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6187 DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); 6188 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); 6189 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); 6190 } 6191 6192 // Compute with: cttz(x) = ctpop(lsb - 1) 6193 6194 // Compute LSB - 1. 6195 SDValue Bits; 6196 if (ElemTy == MVT::i64) { 6197 // Load constant 0xffff'ffff'ffff'ffff to register. 6198 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6199 DAG.getTargetConstant(0x1eff, dl, MVT::i32)); 6200 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); 6201 } else { 6202 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 6203 DAG.getTargetConstant(1, dl, ElemTy)); 6204 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); 6205 } 6206 return DAG.getNode(ISD::CTPOP, dl, VT, Bits); 6207 } 6208 6209 if (!ST->hasV6T2Ops()) 6210 return SDValue(); 6211 6212 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); 6213 return DAG.getNode(ISD::CTLZ, dl, VT, rbit); 6214 } 6215 6216 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, 6217 const ARMSubtarget *ST) { 6218 EVT VT = N->getValueType(0); 6219 SDLoc DL(N); 6220 6221 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); 6222 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 || 6223 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && 6224 "Unexpected type for custom ctpop lowering"); 6225 6226 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 6227 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; 6228 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0)); 6229 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res); 6230 6231 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds. 6232 unsigned EltSize = 8; 6233 unsigned NumElts = VT.is64BitVector() ? 
8 : 16; 6234 while (EltSize != VT.getScalarSizeInBits()) { 6235 SmallVector<SDValue, 8> Ops; 6236 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL, 6237 TLI.getPointerTy(DAG.getDataLayout()))); 6238 Ops.push_back(Res); 6239 6240 EltSize *= 2; 6241 NumElts /= 2; 6242 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts); 6243 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops); 6244 } 6245 6246 return Res; 6247 } 6248 6249 /// Getvshiftimm - Check if this is a valid build_vector for the immediate 6250 /// operand of a vector shift operation, where all the elements of the 6251 /// build_vector must have the same constant integer value. 6252 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { 6253 // Ignore bit_converts. 6254 while (Op.getOpcode() == ISD::BITCAST) 6255 Op = Op.getOperand(0); 6256 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); 6257 APInt SplatBits, SplatUndef; 6258 unsigned SplatBitSize; 6259 bool HasAnyUndefs; 6260 if (!BVN || 6261 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 6262 ElementBits) || 6263 SplatBitSize > ElementBits) 6264 return false; 6265 Cnt = SplatBits.getSExtValue(); 6266 return true; 6267 } 6268 6269 /// isVShiftLImm - Check if this is a valid build_vector for the immediate 6270 /// operand of a vector shift left operation. That value must be in the range: 6271 /// 0 <= Value < ElementBits for a left shift; or 6272 /// 0 <= Value <= ElementBits for a long left shift. 6273 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { 6274 assert(VT.isVector() && "vector shift count is not a vector type"); 6275 int64_t ElementBits = VT.getScalarSizeInBits(); 6276 if (!getVShiftImm(Op, ElementBits, Cnt)) 6277 return false; 6278 return (Cnt >= 0 && (isLong ? 
Cnt - 1 : Cnt) < ElementBits); 6279 } 6280 6281 /// isVShiftRImm - Check if this is a valid build_vector for the immediate 6282 /// operand of a vector shift right operation. For a shift opcode, the value 6283 /// is positive, but for an intrinsic the value count must be negative. The 6284 /// absolute value must be in the range: 6285 /// 1 <= |Value| <= ElementBits for a right shift; or 6286 /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. 6287 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, 6288 int64_t &Cnt) { 6289 assert(VT.isVector() && "vector shift count is not a vector type"); 6290 int64_t ElementBits = VT.getScalarSizeInBits(); 6291 if (!getVShiftImm(Op, ElementBits, Cnt)) 6292 return false; 6293 if (!isIntrinsic) 6294 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); 6295 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { 6296 Cnt = -Cnt; 6297 return true; 6298 } 6299 return false; 6300 } 6301 6302 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, 6303 const ARMSubtarget *ST) { 6304 EVT VT = N->getValueType(0); 6305 SDLoc dl(N); 6306 int64_t Cnt; 6307 6308 if (!VT.isVector()) 6309 return SDValue(); 6310 6311 // We essentially have two forms here. Shift by an immediate and shift by a 6312 // vector register (there are also shift by a gpr, but that is just handled 6313 // with a tablegen pattern). We cannot easily match shift by an immediate in 6314 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. 6315 // For shifting by a vector, we don't have VSHR, only VSHL (which can be 6316 // signed or unsigned, and a negative shift indicates a shift right). 
6317 if (N->getOpcode() == ISD::SHL) { 6318 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) 6319 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 6320 DAG.getConstant(Cnt, dl, MVT::i32)); 6321 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), 6322 N->getOperand(1)); 6323 } 6324 6325 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && 6326 "unexpected vector shift opcode"); 6327 6328 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 6329 unsigned VShiftOpc = 6330 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 6331 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 6332 DAG.getConstant(Cnt, dl, MVT::i32)); 6333 } 6334 6335 // Other right shifts we don't have operations for (we use a shift left by a 6336 // negative number). 6337 EVT ShiftVT = N->getOperand(1).getValueType(); 6338 SDValue NegatedCount = DAG.getNode( 6339 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); 6340 unsigned VShiftOpc = 6341 (N->getOpcode() == ISD::SRA ? 
ARMISD::VSHLs : ARMISD::VSHLu); 6342 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); 6343 } 6344 6345 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, 6346 const ARMSubtarget *ST) { 6347 EVT VT = N->getValueType(0); 6348 SDLoc dl(N); 6349 6350 // We can get here for a node like i32 = ISD::SHL i32, i64 6351 if (VT != MVT::i64) 6352 return SDValue(); 6353 6354 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || 6355 N->getOpcode() == ISD::SHL) && 6356 "Unknown shift to lower!"); 6357 6358 unsigned ShOpc = N->getOpcode(); 6359 if (ST->hasMVEIntegerOps()) { 6360 SDValue ShAmt = N->getOperand(1); 6361 unsigned ShPartsOpc = ARMISD::LSLL; 6362 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); 6363 6364 // If the shift amount is greater than 32 or has a greater bitwidth than 64 6365 // then do the default optimisation 6366 if (ShAmt->getValueType(0).getSizeInBits() > 64 || 6367 (Con && (Con->getZExtValue() == 0 || Con->getZExtValue() >= 32))) 6368 return SDValue(); 6369 6370 // Extract the lower 32 bits of the shift amount if it's not an i32 6371 if (ShAmt->getValueType(0) != MVT::i32) 6372 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32); 6373 6374 if (ShOpc == ISD::SRL) { 6375 if (!Con) 6376 // There is no t2LSRLr instruction so negate and perform an lsll if the 6377 // shift amount is in a register, emulating a right shift. 
6378 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, 6379 DAG.getConstant(0, dl, MVT::i32), ShAmt); 6380 else 6381 // Else generate an lsrl on the immediate shift amount 6382 ShPartsOpc = ARMISD::LSRL; 6383 } else if (ShOpc == ISD::SRA) 6384 ShPartsOpc = ARMISD::ASRL; 6385 6386 // Lower 32 bits of the destination/source 6387 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6388 DAG.getConstant(0, dl, MVT::i32)); 6389 // Upper 32 bits of the destination/source 6390 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6391 DAG.getConstant(1, dl, MVT::i32)); 6392 6393 // Generate the shift operation as computed above 6394 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, 6395 ShAmt); 6396 // The upper 32 bits come from the second return value of lsll 6397 Hi = SDValue(Lo.getNode(), 1); 6398 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6399 } 6400 6401 // We only lower SRA, SRL of 1 here, all others use generic lowering. 6402 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) 6403 return SDValue(); 6404 6405 // If we are in thumb mode, we don't have RRX. 6406 if (ST->isThumb1Only()) 6407 return SDValue(); 6408 6409 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 6410 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6411 DAG.getConstant(0, dl, MVT::i32)); 6412 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), 6413 DAG.getConstant(1, dl, MVT::i32)); 6414 6415 // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and 6416 // captures the result into a carry flag. 6417 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; 6418 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); 6419 6420 // The low part is an ARMISD::RRX operand, which shifts the carry in. 
6421 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); 6422 6423 // Merge the pieces into a single i64 value. 6424 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 6425 } 6426 6427 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, 6428 const ARMSubtarget *ST) { 6429 bool Invert = false; 6430 bool Swap = false; 6431 unsigned Opc = ARMCC::AL; 6432 6433 SDValue Op0 = Op.getOperand(0); 6434 SDValue Op1 = Op.getOperand(1); 6435 SDValue CC = Op.getOperand(2); 6436 EVT VT = Op.getValueType(); 6437 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6438 SDLoc dl(Op); 6439 6440 EVT CmpVT; 6441 if (ST->hasNEON()) 6442 CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); 6443 else { 6444 assert(ST->hasMVEIntegerOps() && 6445 "No hardware support for integer vector comparison!"); 6446 6447 if (Op.getValueType().getVectorElementType() != MVT::i1) 6448 return SDValue(); 6449 6450 // Make sure we expand floating point setcc to scalar if we do not have 6451 // mve.fp, so that we can handle them from there. 6452 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps()) 6453 return SDValue(); 6454 6455 CmpVT = VT; 6456 } 6457 6458 if (Op0.getValueType().getVectorElementType() == MVT::i64 && 6459 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { 6460 // Special-case integer 64-bit equality comparisons. They aren't legal, 6461 // but they can be lowered with a few vector instructions. 
6462 unsigned CmpElements = CmpVT.getVectorNumElements() * 2; 6463 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); 6464 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); 6465 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); 6466 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, 6467 DAG.getCondCode(ISD::SETEQ)); 6468 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); 6469 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); 6470 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); 6471 if (SetCCOpcode == ISD::SETNE) 6472 Merged = DAG.getNOT(dl, Merged, CmpVT); 6473 Merged = DAG.getSExtOrTrunc(Merged, dl, VT); 6474 return Merged; 6475 } 6476 6477 if (CmpVT.getVectorElementType() == MVT::i64) 6478 // 64-bit comparisons are not legal in general. 6479 return SDValue(); 6480 6481 if (Op1.getValueType().isFloatingPoint()) { 6482 switch (SetCCOpcode) { 6483 default: llvm_unreachable("Illegal FP comparison"); 6484 case ISD::SETUNE: 6485 case ISD::SETNE: 6486 if (ST->hasMVEFloatOps()) { 6487 Opc = ARMCC::NE; break; 6488 } else { 6489 Invert = true; LLVM_FALLTHROUGH; 6490 } 6491 case ISD::SETOEQ: 6492 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6493 case ISD::SETOLT: 6494 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6495 case ISD::SETOGT: 6496 case ISD::SETGT: Opc = ARMCC::GT; break; 6497 case ISD::SETOLE: 6498 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6499 case ISD::SETOGE: 6500 case ISD::SETGE: Opc = ARMCC::GE; break; 6501 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; 6502 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break; 6503 case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; 6504 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break; 6505 case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; 6506 case ISD::SETONE: { 6507 // Expand this to (OLT | OGT). 
6508 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6509 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6510 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6511 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6512 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6513 if (Invert) 6514 Result = DAG.getNOT(dl, Result, VT); 6515 return Result; 6516 } 6517 case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; 6518 case ISD::SETO: { 6519 // Expand this to (OLT | OGE). 6520 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0, 6521 DAG.getConstant(ARMCC::GT, dl, MVT::i32)); 6522 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6523 DAG.getConstant(ARMCC::GE, dl, MVT::i32)); 6524 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1); 6525 if (Invert) 6526 Result = DAG.getNOT(dl, Result, VT); 6527 return Result; 6528 } 6529 } 6530 } else { 6531 // Integer comparisons. 6532 switch (SetCCOpcode) { 6533 default: llvm_unreachable("Illegal integer comparison"); 6534 case ISD::SETNE: 6535 if (ST->hasMVEIntegerOps()) { 6536 Opc = ARMCC::NE; break; 6537 } else { 6538 Invert = true; LLVM_FALLTHROUGH; 6539 } 6540 case ISD::SETEQ: Opc = ARMCC::EQ; break; 6541 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; 6542 case ISD::SETGT: Opc = ARMCC::GT; break; 6543 case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; 6544 case ISD::SETGE: Opc = ARMCC::GE; break; 6545 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; 6546 case ISD::SETUGT: Opc = ARMCC::HI; break; 6547 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; 6548 case ISD::SETUGE: Opc = ARMCC::HS; break; 6549 } 6550 6551 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). 6552 if (ST->hasNEON() && Opc == ARMCC::EQ) { 6553 SDValue AndOp; 6554 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6555 AndOp = Op0; 6556 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) 6557 AndOp = Op1; 6558 6559 // Ignore bitconvert. 
6560 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) 6561 AndOp = AndOp.getOperand(0); 6562 6563 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { 6564 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); 6565 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); 6566 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1); 6567 if (!Invert) 6568 Result = DAG.getNOT(dl, Result, VT); 6569 return Result; 6570 } 6571 } 6572 } 6573 6574 if (Swap) 6575 std::swap(Op0, Op1); 6576 6577 // If one of the operands is a constant vector zero, attempt to fold the 6578 // comparison to a specialized compare-against-zero form. 6579 SDValue SingleOp; 6580 if (ISD::isBuildVectorAllZeros(Op1.getNode())) 6581 SingleOp = Op0; 6582 else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { 6583 if (Opc == ARMCC::GE) 6584 Opc = ARMCC::LE; 6585 else if (Opc == ARMCC::GT) 6586 Opc = ARMCC::LT; 6587 SingleOp = Op1; 6588 } 6589 6590 SDValue Result; 6591 if (SingleOp.getNode()) { 6592 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp, 6593 DAG.getConstant(Opc, dl, MVT::i32)); 6594 } else { 6595 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1, 6596 DAG.getConstant(Opc, dl, MVT::i32)); 6597 } 6598 6599 Result = DAG.getSExtOrTrunc(Result, dl, VT); 6600 6601 if (Invert) 6602 Result = DAG.getNOT(dl, Result, VT); 6603 6604 return Result; 6605 } 6606 6607 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { 6608 SDValue LHS = Op.getOperand(0); 6609 SDValue RHS = Op.getOperand(1); 6610 SDValue Carry = Op.getOperand(2); 6611 SDValue Cond = Op.getOperand(3); 6612 SDLoc DL(Op); 6613 6614 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); 6615 6616 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 6617 // have to invert the carry first. 
6618 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 6619 DAG.getConstant(1, DL, MVT::i32), Carry); 6620 // This converts the boolean value carry into the carry flag. 6621 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 6622 6623 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); 6624 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); 6625 6626 SDValue FVal = DAG.getConstant(0, DL, MVT::i32); 6627 SDValue TVal = DAG.getConstant(1, DL, MVT::i32); 6628 SDValue ARMcc = DAG.getConstant( 6629 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32); 6630 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 6631 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, 6632 Cmp.getValue(1), SDValue()); 6633 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, 6634 CCR, Chain.getValue(1)); 6635 } 6636 6637 /// isVMOVModifiedImm - Check if the specified splat value corresponds to a 6638 /// valid vector constant for a NEON or MVE instruction with a "modified 6639 /// immediate" operand (e.g., VMOV). If so, return the encoded value. 6640 static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, 6641 unsigned SplatBitSize, SelectionDAG &DAG, 6642 const SDLoc &dl, EVT &VT, EVT VectorVT, 6643 VMOVModImmType type) { 6644 unsigned OpCmode, Imm; 6645 bool is128Bits = VectorVT.is128BitVector(); 6646 6647 // SplatBitSize is set to the smallest size that splats the vector, so a 6648 // zero vector will always have SplatBitSize == 8. However, NEON modified 6649 // immediate instructions others than VMOV do not support the 8-bit encoding 6650 // of a zero vector, and the default encoding of zero is supposed to be the 6651 // 32-bit version. 6652 if (SplatBits == 0) 6653 SplatBitSize = 32; 6654 6655 switch (SplatBitSize) { 6656 case 8: 6657 if (type != VMOVModImm) 6658 return SDValue(); 6659 // Any 1-byte value is OK. Op=0, Cmode=1110. 
6660 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); 6661 OpCmode = 0xe; 6662 Imm = SplatBits; 6663 VT = is128Bits ? MVT::v16i8 : MVT::v8i8; 6664 break; 6665 6666 case 16: 6667 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. 6668 VT = is128Bits ? MVT::v8i16 : MVT::v4i16; 6669 if ((SplatBits & ~0xff) == 0) { 6670 // Value = 0x00nn: Op=x, Cmode=100x. 6671 OpCmode = 0x8; 6672 Imm = SplatBits; 6673 break; 6674 } 6675 if ((SplatBits & ~0xff00) == 0) { 6676 // Value = 0xnn00: Op=x, Cmode=101x. 6677 OpCmode = 0xa; 6678 Imm = SplatBits >> 8; 6679 break; 6680 } 6681 return SDValue(); 6682 6683 case 32: 6684 // NEON's 32-bit VMOV supports splat values where: 6685 // * only one byte is nonzero, or 6686 // * the least significant byte is 0xff and the second byte is nonzero, or 6687 // * the least significant 2 bytes are 0xff and the third is nonzero. 6688 VT = is128Bits ? MVT::v4i32 : MVT::v2i32; 6689 if ((SplatBits & ~0xff) == 0) { 6690 // Value = 0x000000nn: Op=x, Cmode=000x. 6691 OpCmode = 0; 6692 Imm = SplatBits; 6693 break; 6694 } 6695 if ((SplatBits & ~0xff00) == 0) { 6696 // Value = 0x0000nn00: Op=x, Cmode=001x. 6697 OpCmode = 0x2; 6698 Imm = SplatBits >> 8; 6699 break; 6700 } 6701 if ((SplatBits & ~0xff0000) == 0) { 6702 // Value = 0x00nn0000: Op=x, Cmode=010x. 6703 OpCmode = 0x4; 6704 Imm = SplatBits >> 16; 6705 break; 6706 } 6707 if ((SplatBits & ~0xff000000) == 0) { 6708 // Value = 0xnn000000: Op=x, Cmode=011x. 6709 OpCmode = 0x6; 6710 Imm = SplatBits >> 24; 6711 break; 6712 } 6713 6714 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC 6715 if (type == OtherModImm) return SDValue(); 6716 6717 if ((SplatBits & ~0xffff) == 0 && 6718 ((SplatBits | SplatUndef) & 0xff) == 0xff) { 6719 // Value = 0x0000nnff: Op=x, Cmode=1100. 
6720 OpCmode = 0xc; 6721 Imm = SplatBits >> 8; 6722 break; 6723 } 6724 6725 // cmode == 0b1101 is not supported for MVE VMVN 6726 if (type == MVEVMVNModImm) 6727 return SDValue(); 6728 6729 if ((SplatBits & ~0xffffff) == 0 && 6730 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { 6731 // Value = 0x00nnffff: Op=x, Cmode=1101. 6732 OpCmode = 0xd; 6733 Imm = SplatBits >> 16; 6734 break; 6735 } 6736 6737 // Note: there are a few 32-bit splat values (specifically: 00ffff00, 6738 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not 6739 // VMOV.I32. A (very) minor optimization would be to replicate the value 6740 // and fall through here to test for a valid 64-bit splat. But, then the 6741 // caller would also need to check and handle the change in size. 6742 return SDValue(); 6743 6744 case 64: { 6745 if (type != VMOVModImm) 6746 return SDValue(); 6747 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. 6748 uint64_t BitMask = 0xff; 6749 uint64_t Val = 0; 6750 unsigned ImmMask = 1; 6751 Imm = 0; 6752 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { 6753 if (((SplatBits | SplatUndef) & BitMask) == BitMask) { 6754 Val |= BitMask; 6755 Imm |= ImmMask; 6756 } else if ((SplatBits & BitMask) != 0) { 6757 return SDValue(); 6758 } 6759 BitMask <<= 8; 6760 ImmMask <<= 1; 6761 } 6762 6763 if (DAG.getDataLayout().isBigEndian()) { 6764 // Reverse the order of elements within the vector. 6765 unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8; 6766 unsigned Mask = (1 << BytesPerElem) - 1; 6767 unsigned NumElems = 8 / BytesPerElem; 6768 unsigned NewImm = 0; 6769 for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) { 6770 unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask); 6771 NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem; 6772 } 6773 Imm = NewImm; 6774 } 6775 6776 // Op=1, Cmode=1110. 6777 OpCmode = 0x1e; 6778 VT = is128Bits ? 
MVT::v2i64 : MVT::v1i64; 6779 break; 6780 } 6781 6782 default: 6783 llvm_unreachable("unexpected size for isVMOVModifiedImm"); 6784 } 6785 6786 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm); 6787 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); 6788 } 6789 6790 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, 6791 const ARMSubtarget *ST) const { 6792 EVT VT = Op.getValueType(); 6793 bool IsDouble = (VT == MVT::f64); 6794 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op); 6795 const APFloat &FPVal = CFP->getValueAPF(); 6796 6797 // Prevent floating-point constants from using literal loads 6798 // when execute-only is enabled. 6799 if (ST->genExecuteOnly()) { 6800 // If we can represent the constant as an immediate, don't lower it 6801 if (isFPImmLegal(FPVal, VT)) 6802 return Op; 6803 // Otherwise, construct as integer, and move to float register 6804 APInt INTVal = FPVal.bitcastToAPInt(); 6805 SDLoc DL(CFP); 6806 switch (VT.getSimpleVT().SimpleTy) { 6807 default: 6808 llvm_unreachable("Unknown floating point type!"); 6809 break; 6810 case MVT::f64: { 6811 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); 6812 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); 6813 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); 6814 } 6815 case MVT::f32: 6816 return DAG.getNode(ARMISD::VMOVSR, DL, VT, 6817 DAG.getConstant(INTVal, DL, MVT::i32)); 6818 } 6819 } 6820 6821 if (!ST->hasVFP3Base()) 6822 return SDValue(); 6823 6824 // Use the default (constant pool) lowering for double constants when we have 6825 // an SP-only FPU 6826 if (IsDouble && !Subtarget->hasFP64()) 6827 return SDValue(); 6828 6829 // Try splatting with a VMOV.f32... 6830 int ImmVal = IsDouble ? 
ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); 6831 6832 if (ImmVal != -1) { 6833 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { 6834 // We have code in place to select a valid ConstantFP already, no need to 6835 // do any mangling. 6836 return Op; 6837 } 6838 6839 // It's a float and we are trying to use NEON operations where 6840 // possible. Lower it to a splat followed by an extract. 6841 SDLoc DL(Op); 6842 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); 6843 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, 6844 NewVal); 6845 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, 6846 DAG.getConstant(0, DL, MVT::i32)); 6847 } 6848 6849 // The rest of our options are NEON only, make sure that's allowed before 6850 // proceeding.. 6851 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) 6852 return SDValue(); 6853 6854 EVT VMovVT; 6855 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); 6856 6857 // It wouldn't really be worth bothering for doubles except for one very 6858 // important value, which does happen to match: 0.0. So make sure we don't do 6859 // anything stupid. 6860 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) 6861 return SDValue(); 6862 6863 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). 6864 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), 6865 VMovVT, VT, VMOVModImm); 6866 if (NewVal != SDValue()) { 6867 SDLoc DL(Op); 6868 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, 6869 NewVal); 6870 if (IsDouble) 6871 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 6872 6873 // It's a float: cast and extract a vector element. 
6874 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6875 VecConstant); 6876 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6877 DAG.getConstant(0, DL, MVT::i32)); 6878 } 6879 6880 // Finally, try a VMVN.i32 6881 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, 6882 VT, VMVNModImm); 6883 if (NewVal != SDValue()) { 6884 SDLoc DL(Op); 6885 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); 6886 6887 if (IsDouble) 6888 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); 6889 6890 // It's a float: cast and extract a vector element. 6891 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, 6892 VecConstant); 6893 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, 6894 DAG.getConstant(0, DL, MVT::i32)); 6895 } 6896 6897 return SDValue(); 6898 } 6899 6900 // check if an VEXT instruction can handle the shuffle mask when the 6901 // vector sources of the shuffle are the same. 6902 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { 6903 unsigned NumElts = VT.getVectorNumElements(); 6904 6905 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6906 if (M[0] < 0) 6907 return false; 6908 6909 Imm = M[0]; 6910 6911 // If this is a VEXT shuffle, the immediate value is the index of the first 6912 // element. The other shuffle indices must be the successive elements after 6913 // the first one. 6914 unsigned ExpectedElt = Imm; 6915 for (unsigned i = 1; i < NumElts; ++i) { 6916 // Increment the expected index. If it wraps around, just follow it 6917 // back to index zero and keep going. 
6918 ++ExpectedElt; 6919 if (ExpectedElt == NumElts) 6920 ExpectedElt = 0; 6921 6922 if (M[i] < 0) continue; // ignore UNDEF indices 6923 if (ExpectedElt != static_cast<unsigned>(M[i])) 6924 return false; 6925 } 6926 6927 return true; 6928 } 6929 6930 static bool isVEXTMask(ArrayRef<int> M, EVT VT, 6931 bool &ReverseVEXT, unsigned &Imm) { 6932 unsigned NumElts = VT.getVectorNumElements(); 6933 ReverseVEXT = false; 6934 6935 // Assume that the first shuffle index is not UNDEF. Fail if it is. 6936 if (M[0] < 0) 6937 return false; 6938 6939 Imm = M[0]; 6940 6941 // If this is a VEXT shuffle, the immediate value is the index of the first 6942 // element. The other shuffle indices must be the successive elements after 6943 // the first one. 6944 unsigned ExpectedElt = Imm; 6945 for (unsigned i = 1; i < NumElts; ++i) { 6946 // Increment the expected index. If it wraps around, it may still be 6947 // a VEXT but the source vectors must be swapped. 6948 ExpectedElt += 1; 6949 if (ExpectedElt == NumElts * 2) { 6950 ExpectedElt = 0; 6951 ReverseVEXT = true; 6952 } 6953 6954 if (M[i] < 0) continue; // ignore UNDEF indices 6955 if (ExpectedElt != static_cast<unsigned>(M[i])) 6956 return false; 6957 } 6958 6959 // Adjust the index value if the source operands will be swapped. 6960 if (ReverseVEXT) 6961 Imm -= NumElts; 6962 6963 return true; 6964 } 6965 6966 static bool isVTBLMask(ArrayRef<int> M, EVT VT) { 6967 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of 6968 // range, then 0 is placed into the resulting vector. So pretty much any mask 6969 // of 8 elements can work here. 6970 return VT == MVT::v8i8 && M.size() == 8; 6971 } 6972 6973 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask, 6974 unsigned Index) { 6975 if (Mask.size() == Elements * 2) 6976 return Index / Elements; 6977 return Mask[Index] == 0 ? 
0 : 1; 6978 } 6979 6980 // Checks whether the shuffle mask represents a vector transpose (VTRN) by 6981 // checking that pairs of elements in the shuffle mask represent the same index 6982 // in each vector, incrementing the expected index by 2 at each step. 6983 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] 6984 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} 6985 // v2={e,f,g,h} 6986 // WhichResult gives the offset for each element in the mask based on which 6987 // of the two results it belongs to. 6988 // 6989 // The transpose can be represented either as: 6990 // result1 = shufflevector v1, v2, result1_shuffle_mask 6991 // result2 = shufflevector v1, v2, result2_shuffle_mask 6992 // where v1/v2 and the shuffle masks have the same number of elements 6993 // (here WhichResult (see below) indicates which result is being checked) 6994 // 6995 // or as: 6996 // results = shufflevector v1, v2, shuffle_mask 6997 // where both results are returned in one vector and the shuffle mask has twice 6998 // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we 6999 // want to check the low half and high half of the shuffle mask as if it were 7000 // the other case 7001 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7002 unsigned EltSz = VT.getScalarSizeInBits(); 7003 if (EltSz == 64) 7004 return false; 7005 7006 unsigned NumElts = VT.getVectorNumElements(); 7007 if (M.size() != NumElts && M.size() != NumElts*2) 7008 return false; 7009 7010 // If the mask is twice as long as the input vector then we need to check the 7011 // upper and lower parts of the mask with a matching value for WhichResult 7012 // FIXME: A mask with only even values will be rejected in case the first 7013 // element is undefined, e.g. 
[-1, 4, 2, 6] will be rejected, because only 7014 // M[0] is used to determine WhichResult 7015 for (unsigned i = 0; i < M.size(); i += NumElts) { 7016 WhichResult = SelectPairHalf(NumElts, M, i); 7017 for (unsigned j = 0; j < NumElts; j += 2) { 7018 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 7019 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) 7020 return false; 7021 } 7022 } 7023 7024 if (M.size() == NumElts*2) 7025 WhichResult = 0; 7026 7027 return true; 7028 } 7029 7030 /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of 7031 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7032 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. 7033 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7034 unsigned EltSz = VT.getScalarSizeInBits(); 7035 if (EltSz == 64) 7036 return false; 7037 7038 unsigned NumElts = VT.getVectorNumElements(); 7039 if (M.size() != NumElts && M.size() != NumElts*2) 7040 return false; 7041 7042 for (unsigned i = 0; i < M.size(); i += NumElts) { 7043 WhichResult = SelectPairHalf(NumElts, M, i); 7044 for (unsigned j = 0; j < NumElts; j += 2) { 7045 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || 7046 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) 7047 return false; 7048 } 7049 } 7050 7051 if (M.size() == NumElts*2) 7052 WhichResult = 0; 7053 7054 return true; 7055 } 7056 7057 // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking 7058 // that the mask elements are either all even and in steps of size 2 or all odd 7059 // and in steps of size 2. 7060 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] 7061 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} 7062 // v2={e,f,g,h} 7063 // Requires similar checks to that of isVTRNMask with 7064 // respect the how results are returned. 
7065 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7066 unsigned EltSz = VT.getScalarSizeInBits(); 7067 if (EltSz == 64) 7068 return false; 7069 7070 unsigned NumElts = VT.getVectorNumElements(); 7071 if (M.size() != NumElts && M.size() != NumElts*2) 7072 return false; 7073 7074 for (unsigned i = 0; i < M.size(); i += NumElts) { 7075 WhichResult = SelectPairHalf(NumElts, M, i); 7076 for (unsigned j = 0; j < NumElts; ++j) { 7077 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) 7078 return false; 7079 } 7080 } 7081 7082 if (M.size() == NumElts*2) 7083 WhichResult = 0; 7084 7085 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7086 if (VT.is64BitVector() && EltSz == 32) 7087 return false; 7088 7089 return true; 7090 } 7091 7092 /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of 7093 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7094 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, 7095 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7096 unsigned EltSz = VT.getScalarSizeInBits(); 7097 if (EltSz == 64) 7098 return false; 7099 7100 unsigned NumElts = VT.getVectorNumElements(); 7101 if (M.size() != NumElts && M.size() != NumElts*2) 7102 return false; 7103 7104 unsigned Half = NumElts / 2; 7105 for (unsigned i = 0; i < M.size(); i += NumElts) { 7106 WhichResult = SelectPairHalf(NumElts, M, i); 7107 for (unsigned j = 0; j < NumElts; j += Half) { 7108 unsigned Idx = WhichResult; 7109 for (unsigned k = 0; k < Half; ++k) { 7110 int MIdx = M[i + j + k]; 7111 if (MIdx >= 0 && (unsigned) MIdx != Idx) 7112 return false; 7113 Idx += 2; 7114 } 7115 } 7116 } 7117 7118 if (M.size() == NumElts*2) 7119 WhichResult = 0; 7120 7121 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 
7122 if (VT.is64BitVector() && EltSz == 32) 7123 return false; 7124 7125 return true; 7126 } 7127 7128 // Checks whether the shuffle mask represents a vector zip (VZIP) by checking 7129 // that pairs of elements of the shufflemask represent the same index in each 7130 // vector incrementing sequentially through the vectors. 7131 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] 7132 // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} 7133 // v2={e,f,g,h} 7134 // Requires similar checks to that of isVTRNMask with respect the how results 7135 // are returned. 7136 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { 7137 unsigned EltSz = VT.getScalarSizeInBits(); 7138 if (EltSz == 64) 7139 return false; 7140 7141 unsigned NumElts = VT.getVectorNumElements(); 7142 if (M.size() != NumElts && M.size() != NumElts*2) 7143 return false; 7144 7145 for (unsigned i = 0; i < M.size(); i += NumElts) { 7146 WhichResult = SelectPairHalf(NumElts, M, i); 7147 unsigned Idx = WhichResult * NumElts / 2; 7148 for (unsigned j = 0; j < NumElts; j += 2) { 7149 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 7150 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) 7151 return false; 7152 Idx += 1; 7153 } 7154 } 7155 7156 if (M.size() == NumElts*2) 7157 WhichResult = 0; 7158 7159 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7160 if (VT.is64BitVector() && EltSz == 32) 7161 return false; 7162 7163 return true; 7164 } 7165 7166 /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of 7167 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 7168 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. 
7169 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ 7170 unsigned EltSz = VT.getScalarSizeInBits(); 7171 if (EltSz == 64) 7172 return false; 7173 7174 unsigned NumElts = VT.getVectorNumElements(); 7175 if (M.size() != NumElts && M.size() != NumElts*2) 7176 return false; 7177 7178 for (unsigned i = 0; i < M.size(); i += NumElts) { 7179 WhichResult = SelectPairHalf(NumElts, M, i); 7180 unsigned Idx = WhichResult * NumElts / 2; 7181 for (unsigned j = 0; j < NumElts; j += 2) { 7182 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || 7183 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) 7184 return false; 7185 Idx += 1; 7186 } 7187 } 7188 7189 if (M.size() == NumElts*2) 7190 WhichResult = 0; 7191 7192 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. 7193 if (VT.is64BitVector() && EltSz == 32) 7194 return false; 7195 7196 return true; 7197 } 7198 7199 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), 7200 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. 7201 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT, 7202 unsigned &WhichResult, 7203 bool &isV_UNDEF) { 7204 isV_UNDEF = false; 7205 if (isVTRNMask(ShuffleMask, VT, WhichResult)) 7206 return ARMISD::VTRN; 7207 if (isVUZPMask(ShuffleMask, VT, WhichResult)) 7208 return ARMISD::VUZP; 7209 if (isVZIPMask(ShuffleMask, VT, WhichResult)) 7210 return ARMISD::VZIP; 7211 7212 isV_UNDEF = true; 7213 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7214 return ARMISD::VTRN; 7215 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7216 return ARMISD::VUZP; 7217 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) 7218 return ARMISD::VZIP; 7219 7220 return 0; 7221 } 7222 7223 /// \return true if this is a reverse operation on an vector. 7224 static bool isReverseMask(ArrayRef<int> M, EVT VT) { 7225 unsigned NumElts = VT.getVectorNumElements(); 7226 // Make sure the mask has the right size. 
7227 if (NumElts != M.size()) 7228 return false; 7229 7230 // Look for <15, ..., 3, -1, 1, 0>. 7231 for (unsigned i = 0; i != NumElts; ++i) 7232 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) 7233 return false; 7234 7235 return true; 7236 } 7237 7238 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) { 7239 unsigned NumElts = VT.getVectorNumElements(); 7240 // Make sure the mask has the right size. 7241 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8)) 7242 return false; 7243 7244 // If Top 7245 // Look for <0, N, 2, N+2, 4, N+4, ..>. 7246 // This inserts Input2 into Input1 7247 // else if not Top 7248 // Look for <0, N+1, 2, N+3, 4, N+5, ..> 7249 // This inserts Input1 into Input2 7250 unsigned Offset = Top ? 0 : 1; 7251 unsigned N = SingleSource ? 0 : NumElts; 7252 for (unsigned i = 0; i < NumElts; i += 2) { 7253 if (M[i] >= 0 && M[i] != (int)i) 7254 return false; 7255 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset)) 7256 return false; 7257 } 7258 7259 return true; 7260 } 7261 7262 // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted 7263 // from a pair of inputs. For example: 7264 // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), 7265 // FP_ROUND(EXTRACT_ELT(Y, 0), 7266 // FP_ROUND(EXTRACT_ELT(X, 1), 7267 // FP_ROUND(EXTRACT_ELT(Y, 1), ...) 7268 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, 7269 const ARMSubtarget *ST) { 7270 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7271 if (!ST->hasMVEFloatOps()) 7272 return SDValue(); 7273 7274 SDLoc dl(BV); 7275 EVT VT = BV.getValueType(); 7276 if (VT != MVT::v8f16) 7277 return SDValue(); 7278 7279 // We are looking for a buildvector of fptrunc elements, where all the 7280 // elements are interleavingly extracted from two sources. Check the first two 7281 // items are valid enough and extract some info from them (they are checked 7282 // properly in the loop below). 
7283 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND || 7284 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7285 BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0) 7286 return SDValue(); 7287 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND || 7288 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT || 7289 BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0) 7290 return SDValue(); 7291 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); 7292 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0); 7293 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32) 7294 return SDValue(); 7295 7296 // Check all the values in the BuildVector line up with our expectations. 7297 for (unsigned i = 1; i < 4; i++) { 7298 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { 7299 return Trunc.getOpcode() == ISD::FP_ROUND && 7300 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7301 Trunc.getOperand(0).getOperand(0) == Op && 7302 Trunc.getOperand(0).getConstantOperandVal(1) == Idx; 7303 }; 7304 if (!Check(BV.getOperand(i * 2 + 0), Op0, i)) 7305 return SDValue(); 7306 if (!Check(BV.getOperand(i * 2 + 1), Op1, i)) 7307 return SDValue(); 7308 } 7309 7310 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0, 7311 DAG.getConstant(0, dl, MVT::i32)); 7312 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1, 7313 DAG.getConstant(1, dl, MVT::i32)); 7314 } 7315 7316 // Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted 7317 // from a single input on alternating lanes. For example: 7318 // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0), 7319 // FP_ROUND(EXTRACT_ELT(X, 2), 7320 // FP_ROUND(EXTRACT_ELT(X, 4), ...) 
7321 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, 7322 const ARMSubtarget *ST) { 7323 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7324 if (!ST->hasMVEFloatOps()) 7325 return SDValue(); 7326 7327 SDLoc dl(BV); 7328 EVT VT = BV.getValueType(); 7329 if (VT != MVT::v4f32) 7330 return SDValue(); 7331 7332 // We are looking for a buildvector of fptext elements, where all the 7333 // elements are alternating lanes from a single source. For example <0,2,4,6> 7334 // or <1,3,5,7>. Check the first two items are valid enough and extract some 7335 // info from them (they are checked properly in the loop below). 7336 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND || 7337 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT) 7338 return SDValue(); 7339 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0); 7340 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1); 7341 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1)) 7342 return SDValue(); 7343 7344 // Check all the values in the BuildVector line up with our expectations. 7345 for (unsigned i = 1; i < 4; i++) { 7346 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) { 7347 return Trunc.getOpcode() == ISD::FP_EXTEND && 7348 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7349 Trunc.getOperand(0).getOperand(0) == Op && 7350 Trunc.getOperand(0).getConstantOperandVal(1) == Idx; 7351 }; 7352 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset)) 7353 return SDValue(); 7354 } 7355 7356 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0, 7357 DAG.getConstant(Offset, dl, MVT::i32)); 7358 } 7359 7360 // If N is an integer constant that can be moved into a register in one 7361 // instruction, return an SDValue of such a constant (will become a MOV 7362 // instruction). Otherwise return null. 
7363 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, 7364 const ARMSubtarget *ST, const SDLoc &dl) { 7365 uint64_t Val; 7366 if (!isa<ConstantSDNode>(N)) 7367 return SDValue(); 7368 Val = cast<ConstantSDNode>(N)->getZExtValue(); 7369 7370 if (ST->isThumb1Only()) { 7371 if (Val <= 255 || ~Val <= 255) 7372 return DAG.getConstant(Val, dl, MVT::i32); 7373 } else { 7374 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) 7375 return DAG.getConstant(Val, dl, MVT::i32); 7376 } 7377 return SDValue(); 7378 } 7379 7380 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, 7381 const ARMSubtarget *ST) { 7382 SDLoc dl(Op); 7383 EVT VT = Op.getValueType(); 7384 7385 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!"); 7386 7387 unsigned NumElts = VT.getVectorNumElements(); 7388 unsigned BoolMask; 7389 unsigned BitsPerBool; 7390 if (NumElts == 4) { 7391 BitsPerBool = 4; 7392 BoolMask = 0xf; 7393 } else if (NumElts == 8) { 7394 BitsPerBool = 2; 7395 BoolMask = 0x3; 7396 } else if (NumElts == 16) { 7397 BitsPerBool = 1; 7398 BoolMask = 0x1; 7399 } else 7400 return SDValue(); 7401 7402 // If this is a single value copied into all lanes (a splat), we can just sign 7403 // extend that single value 7404 SDValue FirstOp = Op.getOperand(0); 7405 if (!isa<ConstantSDNode>(FirstOp) && 7406 std::all_of(std::next(Op->op_begin()), Op->op_end(), 7407 [&FirstOp](SDUse &U) { 7408 return U.get().isUndef() || U.get() == FirstOp; 7409 })) { 7410 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp, 7411 DAG.getValueType(MVT::i1)); 7412 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext); 7413 } 7414 7415 // First create base with bits set where known 7416 unsigned Bits32 = 0; 7417 for (unsigned i = 0; i < NumElts; ++i) { 7418 SDValue V = Op.getOperand(i); 7419 if (!isa<ConstantSDNode>(V) && !V.isUndef()) 7420 continue; 7421 bool BitSet = V.isUndef() ? 
false : cast<ConstantSDNode>(V)->getZExtValue(); 7422 if (BitSet) 7423 Bits32 |= BoolMask << (i * BitsPerBool); 7424 } 7425 7426 // Add in unknown nodes 7427 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 7428 DAG.getConstant(Bits32, dl, MVT::i32)); 7429 for (unsigned i = 0; i < NumElts; ++i) { 7430 SDValue V = Op.getOperand(i); 7431 if (isa<ConstantSDNode>(V) || V.isUndef()) 7432 continue; 7433 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V, 7434 DAG.getConstant(i, dl, MVT::i32)); 7435 } 7436 7437 return Base; 7438 } 7439 7440 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, 7441 const ARMSubtarget *ST) { 7442 if (!ST->hasMVEIntegerOps()) 7443 return SDValue(); 7444 7445 // We are looking for a buildvector where each element is Op[0] + i*N 7446 EVT VT = Op.getValueType(); 7447 SDValue Op0 = Op.getOperand(0); 7448 unsigned NumElts = VT.getVectorNumElements(); 7449 7450 // Get the increment value from operand 1 7451 SDValue Op1 = Op.getOperand(1); 7452 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 || 7453 !isa<ConstantSDNode>(Op1.getOperand(1))) 7454 return SDValue(); 7455 unsigned N = Op1.getConstantOperandVal(1); 7456 if (N != 1 && N != 2 && N != 4 && N != 8) 7457 return SDValue(); 7458 7459 // Check that each other operand matches 7460 for (unsigned I = 2; I < NumElts; I++) { 7461 SDValue OpI = Op.getOperand(I); 7462 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 || 7463 !isa<ConstantSDNode>(OpI.getOperand(1)) || 7464 OpI.getConstantOperandVal(1) != I * N) 7465 return SDValue(); 7466 } 7467 7468 SDLoc DL(Op); 7469 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0, 7470 DAG.getConstant(N, DL, MVT::i32)); 7471 } 7472 7473 // If this is a case we can't handle, return null and let the default 7474 // expansion code take care of it. 
7475 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, 7476 const ARMSubtarget *ST) const { 7477 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); 7478 SDLoc dl(Op); 7479 EVT VT = Op.getValueType(); 7480 7481 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 7482 return LowerBUILD_VECTOR_i1(Op, DAG, ST); 7483 7484 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST)) 7485 return R; 7486 7487 APInt SplatBits, SplatUndef; 7488 unsigned SplatBitSize; 7489 bool HasAnyUndefs; 7490 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 7491 if (SplatUndef.isAllOnesValue()) 7492 return DAG.getUNDEF(VT); 7493 7494 if ((ST->hasNEON() && SplatBitSize <= 64) || 7495 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) { 7496 // Check if an immediate VMOV works. 7497 EVT VmovVT; 7498 SDValue Val = 7499 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 7500 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm); 7501 7502 if (Val.getNode()) { 7503 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); 7504 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7505 } 7506 7507 // Try an immediate VMVN. 7508 uint64_t NegatedImm = (~SplatBits).getZExtValue(); 7509 Val = isVMOVModifiedImm( 7510 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, 7511 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); 7512 if (Val.getNode()) { 7513 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); 7514 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); 7515 } 7516 7517 // Use vmov.f32 to materialize other v2f32 and v4f32 splats. 
7518 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { 7519 int ImmVal = ARM_AM::getFP32Imm(SplatBits); 7520 if (ImmVal != -1) { 7521 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); 7522 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); 7523 } 7524 } 7525 } 7526 } 7527 7528 // Scan through the operands to see if only one value is used. 7529 // 7530 // As an optimisation, even if more than one value is used it may be more 7531 // profitable to splat with one value then change some lanes. 7532 // 7533 // Heuristically we decide to do this if the vector has a "dominant" value, 7534 // defined as splatted to more than half of the lanes. 7535 unsigned NumElts = VT.getVectorNumElements(); 7536 bool isOnlyLowElement = true; 7537 bool usesOnlyOneValue = true; 7538 bool hasDominantValue = false; 7539 bool isConstant = true; 7540 7541 // Map of the number of times a particular SDValue appears in the 7542 // element list. 7543 DenseMap<SDValue, unsigned> ValueCounts; 7544 SDValue Value; 7545 for (unsigned i = 0; i < NumElts; ++i) { 7546 SDValue V = Op.getOperand(i); 7547 if (V.isUndef()) 7548 continue; 7549 if (i > 0) 7550 isOnlyLowElement = false; 7551 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) 7552 isConstant = false; 7553 7554 ValueCounts.insert(std::make_pair(V, 0)); 7555 unsigned &Count = ValueCounts[V]; 7556 7557 // Is this value dominant? (takes up more than half of the lanes) 7558 if (++Count > (NumElts / 2)) { 7559 hasDominantValue = true; 7560 Value = V; 7561 } 7562 } 7563 if (ValueCounts.size() != 1) 7564 usesOnlyOneValue = false; 7565 if (!Value.getNode() && !ValueCounts.empty()) 7566 Value = ValueCounts.begin()->first; 7567 7568 if (ValueCounts.empty()) 7569 return DAG.getUNDEF(VT); 7570 7571 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. 7572 // Keep going if we are hitting this case. 
7573 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) 7574 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); 7575 7576 unsigned EltSize = VT.getScalarSizeInBits(); 7577 7578 // Use VDUP for non-constant splats. For f32 constant splats, reduce to 7579 // i32 and try again. 7580 if (hasDominantValue && EltSize <= 32) { 7581 if (!isConstant) { 7582 SDValue N; 7583 7584 // If we are VDUPing a value that comes directly from a vector, that will 7585 // cause an unnecessary move to and from a GPR, where instead we could 7586 // just use VDUPLANE. We can only do this if the lane being extracted 7587 // is at a constant index, as the VDUP from lane instructions only have 7588 // constant-index forms. 7589 ConstantSDNode *constIndex; 7590 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 7591 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { 7592 // We need to create a new undef vector to use for the VDUPLANE if the 7593 // size of the vector from which we get the value is different than the 7594 // size of the vector that we need to create. We will insert the element 7595 // such that the register coalescer will remove unnecessary copies. 7596 if (VT != Value->getOperand(0).getValueType()) { 7597 unsigned index = constIndex->getAPIntValue().getLimitedValue() % 7598 VT.getVectorNumElements(); 7599 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7600 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), 7601 Value, DAG.getConstant(index, dl, MVT::i32)), 7602 DAG.getConstant(index, dl, MVT::i32)); 7603 } else 7604 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, 7605 Value->getOperand(0), Value->getOperand(1)); 7606 } else 7607 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); 7608 7609 if (!usesOnlyOneValue) { 7610 // The dominant value was splatted as 'N', but we now have to insert 7611 // all differing elements. 
7612 for (unsigned I = 0; I < NumElts; ++I) { 7613 if (Op.getOperand(I) == Value) 7614 continue; 7615 SmallVector<SDValue, 3> Ops; 7616 Ops.push_back(N); 7617 Ops.push_back(Op.getOperand(I)); 7618 Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); 7619 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); 7620 } 7621 } 7622 return N; 7623 } 7624 if (VT.getVectorElementType().isFloatingPoint()) { 7625 SmallVector<SDValue, 8> Ops; 7626 MVT FVT = VT.getVectorElementType().getSimpleVT(); 7627 assert(FVT == MVT::f32 || FVT == MVT::f16); 7628 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16; 7629 for (unsigned i = 0; i < NumElts; ++i) 7630 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, 7631 Op.getOperand(i))); 7632 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); 7633 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); 7634 Val = LowerBUILD_VECTOR(Val, DAG, ST); 7635 if (Val.getNode()) 7636 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7637 } 7638 if (usesOnlyOneValue) { 7639 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); 7640 if (isConstant && Val.getNode()) 7641 return DAG.getNode(ARMISD::VDUP, dl, VT, Val); 7642 } 7643 } 7644 7645 // If all elements are constants and the case above didn't get hit, fall back 7646 // to the default expansion, which will generate a load from the constant 7647 // pool. 7648 if (isConstant) 7649 return SDValue(); 7650 7651 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and 7652 // vmovn). Empirical tests suggest this is rarely worth it for vectors of 7653 // length <= 2. 
7654 if (NumElts >= 4) 7655 if (SDValue shuffle = ReconstructShuffle(Op, DAG)) 7656 return shuffle; 7657 7658 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into 7659 // VCVT's 7660 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget)) 7661 return VCVT; 7662 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget)) 7663 return VCVT; 7664 7665 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { 7666 // If we haven't found an efficient lowering, try splitting a 128-bit vector 7667 // into two 64-bit vectors; we might discover a better way to lower it. 7668 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); 7669 EVT ExtVT = VT.getVectorElementType(); 7670 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); 7671 SDValue Lower = 7672 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); 7673 if (Lower.getOpcode() == ISD::BUILD_VECTOR) 7674 Lower = LowerBUILD_VECTOR(Lower, DAG, ST); 7675 SDValue Upper = DAG.getBuildVector( 7676 HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); 7677 if (Upper.getOpcode() == ISD::BUILD_VECTOR) 7678 Upper = LowerBUILD_VECTOR(Upper, DAG, ST); 7679 if (Lower && Upper) 7680 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); 7681 } 7682 7683 // Vectors with 32- or 64-bit elements can be built by directly assigning 7684 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands 7685 // will be legalized. 7686 if (EltSize >= 32) { 7687 // Do the expansion with floating-point types, since that is what the VFP 7688 // registers are defined to use, and since i64 is not legal. 
7689 EVT EltVT = EVT::getFloatingPointVT(EltSize); 7690 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); 7691 SmallVector<SDValue, 8> Ops; 7692 for (unsigned i = 0; i < NumElts; ++i) 7693 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); 7694 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 7695 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 7696 } 7697 7698 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we 7699 // know the default expansion would otherwise fall back on something even 7700 // worse. For a vector with one or two non-undef values, that's 7701 // scalar_to_vector for the elements followed by a shuffle (provided the 7702 // shuffle is valid for the target) and materialization element by element 7703 // on the stack followed by a load for everything else. 7704 if (!isConstant && !usesOnlyOneValue) { 7705 SDValue Vec = DAG.getUNDEF(VT); 7706 for (unsigned i = 0 ; i < NumElts; ++i) { 7707 SDValue V = Op.getOperand(i); 7708 if (V.isUndef()) 7709 continue; 7710 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); 7711 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); 7712 } 7713 return Vec; 7714 } 7715 7716 return SDValue(); 7717 } 7718 7719 // Gather data to see if the operation can be modelled as a 7720 // shuffle in combination with VEXTs. 7721 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, 7722 SelectionDAG &DAG) const { 7723 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); 7724 SDLoc dl(Op); 7725 EVT VT = Op.getValueType(); 7726 unsigned NumElts = VT.getVectorNumElements(); 7727 7728 struct ShuffleSourceInfo { 7729 SDValue Vec; 7730 unsigned MinElt = std::numeric_limits<unsigned>::max(); 7731 unsigned MaxElt = 0; 7732 7733 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to 7734 // be compatible with the shuffle we intend to construct. 
As a result 7735 // ShuffleVec will be some sliding window into the original Vec. 7736 SDValue ShuffleVec; 7737 7738 // Code should guarantee that element i in Vec starts at element "WindowBase 7739 // + i * WindowScale in ShuffleVec". 7740 int WindowBase = 0; 7741 int WindowScale = 1; 7742 7743 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} 7744 7745 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } 7746 }; 7747 7748 // First gather all vectors used as an immediate source for this BUILD_VECTOR 7749 // node. 7750 SmallVector<ShuffleSourceInfo, 2> Sources; 7751 for (unsigned i = 0; i < NumElts; ++i) { 7752 SDValue V = Op.getOperand(i); 7753 if (V.isUndef()) 7754 continue; 7755 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { 7756 // A shuffle can only come from building a vector from various 7757 // elements of other vectors. 7758 return SDValue(); 7759 } else if (!isa<ConstantSDNode>(V.getOperand(1))) { 7760 // Furthermore, shuffles require a constant mask, whereas extractelts 7761 // accept variable indices. 7762 return SDValue(); 7763 } 7764 7765 // Add this element source to the list if it's not already there. 7766 SDValue SourceVec = V.getOperand(0); 7767 auto Source = llvm::find(Sources, SourceVec); 7768 if (Source == Sources.end()) 7769 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); 7770 7771 // Update the minimum and maximum lane number seen. 7772 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); 7773 Source->MinElt = std::min(Source->MinElt, EltNo); 7774 Source->MaxElt = std::max(Source->MaxElt, EltNo); 7775 } 7776 7777 // Currently only do something sane when at most two source vectors 7778 // are involved. 7779 if (Sources.size() > 2) 7780 return SDValue(); 7781 7782 // Find out the smallest element size among result and two sources, and use 7783 // it as element size to build the shuffle_vector. 
7784 EVT SmallestEltTy = VT.getVectorElementType(); 7785 for (auto &Source : Sources) { 7786 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); 7787 if (SrcEltTy.bitsLT(SmallestEltTy)) 7788 SmallestEltTy = SrcEltTy; 7789 } 7790 unsigned ResMultiplier = 7791 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); 7792 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7793 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); 7794 7795 // If the source vector is too wide or too narrow, we may nevertheless be able 7796 // to construct a compatible shuffle either by concatenating it with UNDEF or 7797 // extracting a suitable range of elements. 7798 for (auto &Src : Sources) { 7799 EVT SrcVT = Src.ShuffleVec.getValueType(); 7800 7801 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits(); 7802 uint64_t VTSize = VT.getFixedSizeInBits(); 7803 if (SrcVTSize == VTSize) 7804 continue; 7805 7806 // This stage of the search produces a source with the same element type as 7807 // the original, but with a total width matching the BUILD_VECTOR output. 7808 EVT EltVT = SrcVT.getVectorElementType(); 7809 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits(); 7810 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); 7811 7812 if (SrcVTSize < VTSize) { 7813 if (2 * SrcVTSize != VTSize) 7814 return SDValue(); 7815 // We can pad out the smaller vector for free, so if it's part of a 7816 // shuffle... 
7817 Src.ShuffleVec = 7818 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, 7819 DAG.getUNDEF(Src.ShuffleVec.getValueType())); 7820 continue; 7821 } 7822 7823 if (SrcVTSize != 2 * VTSize) 7824 return SDValue(); 7825 7826 if (Src.MaxElt - Src.MinElt >= NumSrcElts) { 7827 // Span too large for a VEXT to cope 7828 return SDValue(); 7829 } 7830 7831 if (Src.MinElt >= NumSrcElts) { 7832 // The extraction can just take the second half 7833 Src.ShuffleVec = 7834 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7835 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 7836 Src.WindowBase = -NumSrcElts; 7837 } else if (Src.MaxElt < NumSrcElts) { 7838 // The extraction can just take the first half 7839 Src.ShuffleVec = 7840 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7841 DAG.getConstant(0, dl, MVT::i32)); 7842 } else { 7843 // An actual VEXT is needed 7844 SDValue VEXTSrc1 = 7845 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7846 DAG.getConstant(0, dl, MVT::i32)); 7847 SDValue VEXTSrc2 = 7848 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, 7849 DAG.getConstant(NumSrcElts, dl, MVT::i32)); 7850 7851 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, 7852 VEXTSrc2, 7853 DAG.getConstant(Src.MinElt, dl, MVT::i32)); 7854 Src.WindowBase = -Src.MinElt; 7855 } 7856 } 7857 7858 // Another possible incompatibility occurs from the vector element types. We 7859 // can fix this by bitcasting the source vectors to the same type we intend 7860 // for the shuffle. 
7861 for (auto &Src : Sources) { 7862 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); 7863 if (SrcEltTy == SmallestEltTy) 7864 continue; 7865 assert(ShuffleVT.getVectorElementType() == SmallestEltTy); 7866 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec); 7867 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); 7868 Src.WindowBase *= Src.WindowScale; 7869 } 7870 7871 // Final sanity check before we try to actually produce a shuffle. 7872 LLVM_DEBUG(for (auto Src 7873 : Sources) 7874 assert(Src.ShuffleVec.getValueType() == ShuffleVT);); 7875 7876 // The stars all align, our next step is to produce the mask for the shuffle. 7877 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); 7878 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); 7879 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { 7880 SDValue Entry = Op.getOperand(i); 7881 if (Entry.isUndef()) 7882 continue; 7883 7884 auto Src = llvm::find(Sources, Entry.getOperand(0)); 7885 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); 7886 7887 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit 7888 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this 7889 // segment. 7890 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); 7891 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(), 7892 VT.getScalarSizeInBits()); 7893 int LanesDefined = BitsDefined / BitsPerShuffleLane; 7894 7895 // This source is expected to fill ResMultiplier lanes of the final shuffle, 7896 // starting at the appropriate offset. 7897 int *LaneMask = &Mask[i * ResMultiplier]; 7898 7899 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; 7900 ExtractBase += NumElts * (Src - Sources.begin()); 7901 for (int j = 0; j < LanesDefined; ++j) 7902 LaneMask[j] = ExtractBase + j; 7903 } 7904 7905 7906 // We can't handle more than two sources. 
This should have already 7907 // been checked before this point. 7908 assert(Sources.size() <= 2 && "Too many sources!"); 7909 7910 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; 7911 for (unsigned i = 0; i < Sources.size(); ++i) 7912 ShuffleOps[i] = Sources[i].ShuffleVec; 7913 7914 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0], 7915 ShuffleOps[1], Mask, DAG); 7916 if (!Shuffle) 7917 return SDValue(); 7918 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle); 7919 } 7920 7921 enum ShuffleOpCodes { 7922 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 7923 OP_VREV, 7924 OP_VDUP0, 7925 OP_VDUP1, 7926 OP_VDUP2, 7927 OP_VDUP3, 7928 OP_VEXT1, 7929 OP_VEXT2, 7930 OP_VEXT3, 7931 OP_VUZPL, // VUZP, left result 7932 OP_VUZPR, // VUZP, right result 7933 OP_VZIPL, // VZIP, left result 7934 OP_VZIPR, // VZIP, right result 7935 OP_VTRNL, // VTRN, left result 7936 OP_VTRNR // VTRN, right result 7937 }; 7938 7939 static bool isLegalMVEShuffleOp(unsigned PFEntry) { 7940 unsigned OpNum = (PFEntry >> 26) & 0x0F; 7941 switch (OpNum) { 7942 case OP_COPY: 7943 case OP_VREV: 7944 case OP_VDUP0: 7945 case OP_VDUP1: 7946 case OP_VDUP2: 7947 case OP_VDUP3: 7948 return true; 7949 } 7950 return false; 7951 } 7952 7953 /// isShuffleMaskLegal - Targets can use this to indicate that they only 7954 /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7955 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7956 /// are assumed to be legal. 7957 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { 7958 if (VT.getVectorNumElements() == 4 && 7959 (VT.is128BitVector() || VT.is64BitVector())) { 7960 unsigned PFIndexes[4]; 7961 for (unsigned i = 0; i != 4; ++i) { 7962 if (M[i] < 0) 7963 PFIndexes[i] = 8; 7964 else 7965 PFIndexes[i] = M[i]; 7966 } 7967 7968 // Compute the index in the perfect shuffle table. 
7969 unsigned PFTableIndex = 7970 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; 7971 unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; 7972 unsigned Cost = (PFEntry >> 30); 7973 7974 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) 7975 return true; 7976 } 7977 7978 bool ReverseVEXT, isV_UNDEF; 7979 unsigned Imm, WhichResult; 7980 7981 unsigned EltSize = VT.getScalarSizeInBits(); 7982 if (EltSize >= 32 || 7983 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7984 ShuffleVectorInst::isIdentityMask(M) || 7985 isVREVMask(M, VT, 64) || 7986 isVREVMask(M, VT, 32) || 7987 isVREVMask(M, VT, 16)) 7988 return true; 7989 else if (Subtarget->hasNEON() && 7990 (isVEXTMask(M, VT, ReverseVEXT, Imm) || 7991 isVTBLMask(M, VT) || 7992 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) 7993 return true; 7994 else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && 7995 isReverseMask(M, VT)) 7996 return true; 7997 else if (Subtarget->hasMVEIntegerOps() && 7998 (isVMOVNMask(M, VT, true, false) || 7999 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true))) 8000 return true; 8001 else 8002 return false; 8003 } 8004 8005 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit 8006 /// the specified operations to build the shuffle. 
8007 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, 8008 SDValue RHS, SelectionDAG &DAG, 8009 const SDLoc &dl) { 8010 unsigned OpNum = (PFEntry >> 26) & 0x0F; 8011 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); 8012 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); 8013 8014 if (OpNum == OP_COPY) { 8015 if (LHSID == (1*9+2)*9+3) return LHS; 8016 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); 8017 return RHS; 8018 } 8019 8020 SDValue OpLHS, OpRHS; 8021 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); 8022 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); 8023 EVT VT = OpLHS.getValueType(); 8024 8025 switch (OpNum) { 8026 default: llvm_unreachable("Unknown shuffle opcode!"); 8027 case OP_VREV: 8028 // VREV divides the vector in half and swaps within the half. 8029 if (VT.getVectorElementType() == MVT::i32 || 8030 VT.getVectorElementType() == MVT::f32) 8031 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); 8032 // vrev <4 x i16> -> VREV32 8033 if (VT.getVectorElementType() == MVT::i16 || 8034 VT.getVectorElementType() == MVT::f16) 8035 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); 8036 // vrev <4 x i8> -> VREV16 8037 assert(VT.getVectorElementType() == MVT::i8); 8038 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); 8039 case OP_VDUP0: 8040 case OP_VDUP1: 8041 case OP_VDUP2: 8042 case OP_VDUP3: 8043 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, 8044 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); 8045 case OP_VEXT1: 8046 case OP_VEXT2: 8047 case OP_VEXT3: 8048 return DAG.getNode(ARMISD::VEXT, dl, VT, 8049 OpLHS, OpRHS, 8050 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); 8051 case OP_VUZPL: 8052 case OP_VUZPR: 8053 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), 8054 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); 8055 case OP_VZIPL: 8056 case OP_VZIPR: 8057 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), 8058 OpLHS, 
OpRHS).getValue(OpNum-OP_VZIPL); 8059 case OP_VTRNL: 8060 case OP_VTRNR: 8061 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), 8062 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); 8063 } 8064 } 8065 8066 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, 8067 ArrayRef<int> ShuffleMask, 8068 SelectionDAG &DAG) { 8069 // Check to see if we can use the VTBL instruction. 8070 SDValue V1 = Op.getOperand(0); 8071 SDValue V2 = Op.getOperand(1); 8072 SDLoc DL(Op); 8073 8074 SmallVector<SDValue, 8> VTBLMask; 8075 for (ArrayRef<int>::iterator 8076 I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) 8077 VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); 8078 8079 if (V2.getNode()->isUndef()) 8080 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, 8081 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 8082 8083 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, 8084 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); 8085 } 8086 8087 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, 8088 SelectionDAG &DAG) { 8089 SDLoc DL(Op); 8090 SDValue OpLHS = Op.getOperand(0); 8091 EVT VT = OpLHS.getValueType(); 8092 8093 assert((VT == MVT::v8i16 || VT == MVT::v16i8) && 8094 "Expect an v8i16/v16i8 type"); 8095 OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); 8096 // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, 8097 // extract the first 8 bytes into the top double word and the last 8 bytes 8098 // into the bottom double word. The v8i16 case is similar. 8099 unsigned ExtractNum = (VT == MVT::v16i8) ? 
8 : 4; 8100 return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, 8101 DAG.getConstant(ExtractNum, DL, MVT::i32)); 8102 } 8103 8104 static EVT getVectorTyFromPredicateVector(EVT VT) { 8105 switch (VT.getSimpleVT().SimpleTy) { 8106 case MVT::v4i1: 8107 return MVT::v4i32; 8108 case MVT::v8i1: 8109 return MVT::v8i16; 8110 case MVT::v16i1: 8111 return MVT::v16i8; 8112 default: 8113 llvm_unreachable("Unexpected vector predicate type"); 8114 } 8115 } 8116 8117 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, 8118 SelectionDAG &DAG) { 8119 // Converting from boolean predicates to integers involves creating a vector 8120 // of all ones or all zeroes and selecting the lanes based upon the real 8121 // predicate. 8122 SDValue AllOnes = 8123 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32); 8124 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes); 8125 8126 SDValue AllZeroes = 8127 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32); 8128 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes); 8129 8130 // Get full vector type from predicate type 8131 EVT NewVT = getVectorTyFromPredicateVector(VT); 8132 8133 SDValue RecastV1; 8134 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast 8135 // this to a v16i1. This cannot be done with an ordinary bitcast because the 8136 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node, 8137 // since we know in hardware the sizes are really the same. 8138 if (VT != MVT::v16i1) 8139 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred); 8140 else 8141 RecastV1 = Pred; 8142 8143 // Select either all ones or zeroes depending upon the real predicate bits. 8144 SDValue PredAsVector = 8145 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes); 8146 8147 // Recast our new predicate-as-integer v16i8 vector into something 8148 // appropriate for the shuffle, i.e. 
v4i32 for a real v4i1 predicate. 8149 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector); 8150 } 8151 8152 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, 8153 const ARMSubtarget *ST) { 8154 EVT VT = Op.getValueType(); 8155 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 8156 ArrayRef<int> ShuffleMask = SVN->getMask(); 8157 8158 assert(ST->hasMVEIntegerOps() && 8159 "No support for vector shuffle of boolean predicates"); 8160 8161 SDValue V1 = Op.getOperand(0); 8162 SDLoc dl(Op); 8163 if (isReverseMask(ShuffleMask, VT)) { 8164 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1); 8165 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast); 8166 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit, 8167 DAG.getConstant(16, dl, MVT::i32)); 8168 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl); 8169 } 8170 8171 // Until we can come up with optimised cases for every single vector 8172 // shuffle in existence we have chosen the least painful strategy. This is 8173 // to essentially promote the boolean predicate to a 8-bit integer, where 8174 // each predicate represents a byte. Then we fall back on a normal integer 8175 // vector shuffle and convert the result back into a predicate vector. In 8176 // many cases the generated code might be even better than scalar code 8177 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit 8178 // fields in a register into 8 other arbitrary 2-bit fields! 8179 SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG); 8180 EVT NewVT = PredAsVector.getValueType(); 8181 8182 // Do the shuffle! 8183 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector, 8184 DAG.getUNDEF(NewVT), ShuffleMask); 8185 8186 // Now return the result of comparing the shuffled vector with zero, 8187 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 
8188 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled, 8189 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8190 } 8191 8192 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, 8193 ArrayRef<int> ShuffleMask, 8194 SelectionDAG &DAG) { 8195 // Attempt to lower the vector shuffle using as many whole register movs as 8196 // possible. This is useful for types smaller than 32bits, which would 8197 // often otherwise become a series for grp movs. 8198 SDLoc dl(Op); 8199 EVT VT = Op.getValueType(); 8200 if (VT.getScalarSizeInBits() >= 32) 8201 return SDValue(); 8202 8203 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) && 8204 "Unexpected vector type"); 8205 int NumElts = VT.getVectorNumElements(); 8206 int QuarterSize = NumElts / 4; 8207 // The four final parts of the vector, as i32's 8208 SDValue Parts[4]; 8209 8210 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not 8211 // <u,u,u,u>), returning the vmov lane index 8212 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) { 8213 // Detect which mov lane this would be from the first non-undef element. 
8214 int MovIdx = -1; 8215 for (int i = 0; i < Length; i++) { 8216 if (ShuffleMask[Start + i] >= 0) { 8217 if (ShuffleMask[Start + i] % Length != i) 8218 return -1; 8219 MovIdx = ShuffleMask[Start + i] / Length; 8220 break; 8221 } 8222 } 8223 // If all items are undef, leave this for other combines 8224 if (MovIdx == -1) 8225 return -1; 8226 // Check the remaining values are the correct part of the same mov 8227 for (int i = 1; i < Length; i++) { 8228 if (ShuffleMask[Start + i] >= 0 && 8229 (ShuffleMask[Start + i] / Length != MovIdx || 8230 ShuffleMask[Start + i] % Length != i)) 8231 return -1; 8232 } 8233 return MovIdx; 8234 }; 8235 8236 for (int Part = 0; Part < 4; ++Part) { 8237 // Does this part look like a mov 8238 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize); 8239 if (Elt != -1) { 8240 SDValue Input = Op->getOperand(0); 8241 if (Elt >= 4) { 8242 Input = Op->getOperand(1); 8243 Elt -= 4; 8244 } 8245 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input); 8246 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast, 8247 DAG.getConstant(Elt, dl, MVT::i32)); 8248 } 8249 } 8250 8251 // Nothing interesting found, just return 8252 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3]) 8253 return SDValue(); 8254 8255 // The other parts need to be built with the old shuffle vector, cast to a 8256 // v4i32 and extract_vector_elts 8257 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) { 8258 SmallVector<int, 16> NewShuffleMask; 8259 for (int Part = 0; Part < 4; ++Part) 8260 for (int i = 0; i < QuarterSize; i++) 8261 NewShuffleMask.push_back( 8262 Parts[Part] ? 
-1 : ShuffleMask[Part * QuarterSize + i]); 8263 SDValue NewShuffle = DAG.getVectorShuffle( 8264 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask); 8265 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle); 8266 8267 for (int Part = 0; Part < 4; ++Part) 8268 if (!Parts[Part]) 8269 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, 8270 BitCast, DAG.getConstant(Part, dl, MVT::i32)); 8271 } 8272 // Build a vector out of the various parts and bitcast it back to the original 8273 // type. 8274 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts); 8275 return DAG.getBitcast(VT, NewVec); 8276 } 8277 8278 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, 8279 ArrayRef<int> ShuffleMask, 8280 SelectionDAG &DAG) { 8281 SDValue V1 = Op.getOperand(0); 8282 SDValue V2 = Op.getOperand(1); 8283 EVT VT = Op.getValueType(); 8284 unsigned NumElts = VT.getVectorNumElements(); 8285 8286 // An One-Off Identity mask is one that is mostly an identity mask from as 8287 // single source but contains a single element out-of-place, either from a 8288 // different vector or from another position in the same vector. As opposed to 8289 // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert 8290 // pair directly. 
8291 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset, 8292 int &OffElement) { 8293 OffElement = -1; 8294 int NonUndef = 0; 8295 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) { 8296 if (Mask[i] == -1) 8297 continue; 8298 NonUndef++; 8299 if (Mask[i] != i + BaseOffset) { 8300 if (OffElement == -1) 8301 OffElement = i; 8302 else 8303 return false; 8304 } 8305 } 8306 return NonUndef > 2 && OffElement != -1; 8307 }; 8308 int OffElement; 8309 SDValue VInput; 8310 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement)) 8311 VInput = V1; 8312 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement)) 8313 VInput = V2; 8314 else 8315 return SDValue(); 8316 8317 SDLoc dl(Op); 8318 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16 8319 ? MVT::i32 8320 : VT.getScalarType(); 8321 SDValue Elt = DAG.getNode( 8322 ISD::EXTRACT_VECTOR_ELT, dl, SVT, 8323 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2, 8324 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl)); 8325 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt, 8326 DAG.getVectorIdxConstant(OffElement % NumElts, dl)); 8327 } 8328 8329 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, 8330 const ARMSubtarget *ST) { 8331 SDValue V1 = Op.getOperand(0); 8332 SDValue V2 = Op.getOperand(1); 8333 SDLoc dl(Op); 8334 EVT VT = Op.getValueType(); 8335 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); 8336 unsigned EltSize = VT.getScalarSizeInBits(); 8337 8338 if (ST->hasMVEIntegerOps() && EltSize == 1) 8339 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST); 8340 8341 // Convert shuffles that are directly supported on NEON to target-specific 8342 // DAG nodes, instead of keeping them as shuffles and matching them again 8343 // during code selection. This is more efficient and avoids the possibility 8344 // of inconsistencies between legalization and selection. 
// FIXME: floating-point vectors should be canonicalized to integer vectors
  // of the same time so that they get CSEd properly.
  ArrayRef<int> ShuffleMask = SVN->getMask();

  // Patterns for elements of at most 32 bits. The order of the checks below
  // is significant: the first pattern that matches wins.
  if (EltSize <= 32) {
    if (SVN->isSplat()) {
      int Lane = SVN->getSplatIndex();
      // If this is undef splat, generate it via "just" vdup, if possible.
      if (Lane == -1) Lane = 0;

      // Test if V1 is a SCALAR_TO_VECTOR.
      if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
      // (and probably will turn into a SCALAR_TO_VECTOR once legalization
      // reaches it). That is: operand 0 is not a constant and every other
      // operand is undef.
      if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
          !isa<ConstantSDNode>(V1.getOperand(0))) {
        bool IsScalarToVector = true;
        for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
          if (!V1.getOperand(i).isUndef()) {
            IsScalarToVector = false;
            break;
          }
        if (IsScalarToVector)
          return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
      }
      // General splat: duplicate the chosen lane of V1.
      return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                         DAG.getConstant(Lane, dl, MVT::i32));
    }

    bool ReverseVEXT = false;
    unsigned Imm = 0;
    if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
      // isVEXTMask reports through ReverseVEXT when the operands have to be
      // exchanged for the VEXT to match.
      if (ReverseVEXT)
        std::swap(V1, V2);
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    // Element reversals within 64-, 32- or 16-bit blocks.
    if (isVREVMask(ShuffleMask, VT, 64))
      return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 32))
      return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
    if (isVREVMask(ShuffleMask, VT, 16))
      return DAG.getNode(ARMISD::VREV16, dl, VT, V1);

    // Single-input VEXT: V1 is used as both operands.
    if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
      return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
                         DAG.getConstant(Imm, dl, MVT::i32));
    }

    // Check for Neon shuffles that modify both input vectors in place.
    // If both results are used, i.e., if there are two shuffles with the same
    // source operands and with masks corresponding to both results of one of
    // these operations, DAG memoization will ensure that a single node is
    // used for both shuffles.
    unsigned WhichResult = 0;
    bool isV_UNDEF = false;
    if (ST->hasNEON()) {
      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
              ShuffleMask, VT, WhichResult, isV_UNDEF)) {
        // When the mask matched the single-input form, feed V1 to both
        // operands.
        if (isV_UNDEF)
          V2 = V1;
        return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
            .getValue(WhichResult);
      }
    }
    if (ST->hasMVEIntegerOps()) {
      // Three VMOVN forms; note the operand order differs between them.
      if (isVMOVNMask(ShuffleMask, VT, false, false))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
                           DAG.getConstant(0, dl, MVT::i32));
      if (isVMOVNMask(ShuffleMask, VT, true, false))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
                           DAG.getConstant(1, dl, MVT::i32));
      if (isVMOVNMask(ShuffleMask, VT, true, true))
        return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
                           DAG.getConstant(1, dl, MVT::i32));
    }

    // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
    // shuffles that produce a result larger than their operands with:
    //   shuffle(concat(v1, undef), concat(v2, undef))
    // ->
    //   shuffle(concat(v1, v2), undef)
    // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
    //
    // This is useful in the general case, but there are special cases where
    // native shuffles produce larger results: the two-result ops.
    //
    // Look through the concat when lowering them:
    //   shuffle(concat(v1, v2), undef)
    // ->
    //   concat(VZIP(v1, v2):0, :1)
    //
    if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
      SDValue SubV1 = V1->getOperand(0);
      SDValue SubV2 = V1->getOperand(1);
      EVT SubVT = SubV1.getValueType();

      // We expect these to have been canonicalized to -1.
      assert(llvm::all_of(ShuffleMask, [&](int i) {
        return i < (int)VT.getVectorNumElements();
      }) && "Unexpected shuffle index into UNDEF operand!");

      // The mask is matched against the sub-vector type; both results of the
      // two-result node are then concatenated back to the full width.
      if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
              ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
        if (isV_UNDEF)
          SubV2 = SubV1;
        assert((WhichResult == 0) &&
               "In-place shuffle of concat can only have one result!");
        SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
                                  SubV1, SubV2);
        return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
                           Res.getValue(1));
      }
    }
  }

  // MVE only: identity-with-one-misplaced-element shuffles.
  if (ST->hasMVEIntegerOps() && EltSize <= 32)
    if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
      return V;

  // If the shuffle is not directly supported and it has 4 elements, use
  // the PerfectShuffle-generated table to synthesize it from other shuffles.
  unsigned NumElts = VT.getVectorNumElements();
  if (NumElts == 4) {
    // Each mask entry becomes one base-9 digit; undef entries are encoded
    // as 8.
    unsigned PFIndexes[4];
    for (unsigned i = 0; i != 4; ++i) {
      if (ShuffleMask[i] < 0)
        PFIndexes[i] = 8;
      else
        PFIndexes[i] = ShuffleMask[i];
    }

    // Compute the index in the perfect shuffle table.
    unsigned PFTableIndex =
      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
    // The cost is read from the top two bits of the table entry.
    unsigned Cost = (PFEntry >> 30);

    if (Cost <= 4) {
      if (ST->hasNEON())
        return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
      else if (isLegalMVEShuffleOp(PFEntry)) {
        // Without NEON the entry is used only if it and the two entries it
        // refers to (13-bit fields at bit 13 and bit 0) all pass
        // isLegalMVEShuffleOp.
        unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
        unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
        unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
        unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
        if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
          return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
      }
    }
  }

  // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
  if (EltSize >= 32) {
    // Do the expansion with floating-point types, since that is what the VFP
    // registers are defined to use, and since i64 is not legal.
    EVT EltVT = EVT::getFloatingPointVT(EltSize);
    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
    V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
    V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
    SmallVector<SDValue, 8> Ops;
    // One operand per result lane: undef for an undef mask entry, otherwise
    // the named element extracted from V1 or V2.
    for (unsigned i = 0; i < NumElts; ++i) {
      if (ShuffleMask[i] < 0)
        Ops.push_back(DAG.getUNDEF(EltVT));
      else
        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                  ShuffleMask[i] < (int)NumElts ?
V1 : V2, 8516 DAG.getConstant(ShuffleMask[i] & (NumElts-1), 8517 dl, MVT::i32))); 8518 } 8519 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); 8520 return DAG.getNode(ISD::BITCAST, dl, VT, Val); 8521 } 8522 8523 if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) 8524 return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); 8525 8526 if (ST->hasNEON() && VT == MVT::v8i8) 8527 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) 8528 return NewOp; 8529 8530 if (ST->hasMVEIntegerOps()) 8531 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG)) 8532 return NewOp; 8533 8534 return SDValue(); 8535 } 8536 8537 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8538 const ARMSubtarget *ST) { 8539 EVT VecVT = Op.getOperand(0).getValueType(); 8540 SDLoc dl(Op); 8541 8542 assert(ST->hasMVEIntegerOps() && 8543 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8544 8545 SDValue Conv = 8546 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8547 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8548 unsigned LaneWidth = 8549 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8550 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth; 8551 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, 8552 Op.getOperand(1), DAG.getValueType(MVT::i1)); 8553 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext, 8554 DAG.getConstant(~Mask, dl, MVT::i32)); 8555 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI); 8556 } 8557 8558 SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, 8559 SelectionDAG &DAG) const { 8560 // INSERT_VECTOR_ELT is legal only for immediate indexes. 
8561 SDValue Lane = Op.getOperand(2); 8562 if (!isa<ConstantSDNode>(Lane)) 8563 return SDValue(); 8564 8565 SDValue Elt = Op.getOperand(1); 8566 EVT EltVT = Elt.getValueType(); 8567 8568 if (Subtarget->hasMVEIntegerOps() && 8569 Op.getValueType().getScalarSizeInBits() == 1) 8570 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget); 8571 8572 if (getTypeAction(*DAG.getContext(), EltVT) == 8573 TargetLowering::TypePromoteFloat) { 8574 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, 8575 // but the type system will try to do that if we don't intervene. 8576 // Reinterpret any such vector-element insertion as one with the 8577 // corresponding integer types. 8578 8579 SDLoc dl(Op); 8580 8581 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits()); 8582 assert(getTypeAction(*DAG.getContext(), IEltVT) != 8583 TargetLowering::TypePromoteFloat); 8584 8585 SDValue VecIn = Op.getOperand(0); 8586 EVT VecVT = VecIn.getValueType(); 8587 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT, 8588 VecVT.getVectorNumElements()); 8589 8590 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt); 8591 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn); 8592 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, 8593 IVecIn, IElt, Lane); 8594 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut); 8595 } 8596 8597 return Op; 8598 } 8599 8600 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, 8601 const ARMSubtarget *ST) { 8602 EVT VecVT = Op.getOperand(0).getValueType(); 8603 SDLoc dl(Op); 8604 8605 assert(ST->hasMVEIntegerOps() && 8606 "LowerINSERT_VECTOR_ELT_i1 called without MVE!"); 8607 8608 SDValue Conv = 8609 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0)); 8610 unsigned Lane = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8611 unsigned LaneWidth = 8612 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8; 8613 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv, 
8614 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32)); 8615 return Shift; 8616 } 8617 8618 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, 8619 const ARMSubtarget *ST) { 8620 // EXTRACT_VECTOR_ELT is legal only for immediate indexes. 8621 SDValue Lane = Op.getOperand(1); 8622 if (!isa<ConstantSDNode>(Lane)) 8623 return SDValue(); 8624 8625 SDValue Vec = Op.getOperand(0); 8626 EVT VT = Vec.getValueType(); 8627 8628 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8629 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST); 8630 8631 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { 8632 SDLoc dl(Op); 8633 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); 8634 } 8635 8636 return Op; 8637 } 8638 8639 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, 8640 const ARMSubtarget *ST) { 8641 SDValue V1 = Op.getOperand(0); 8642 SDValue V2 = Op.getOperand(1); 8643 SDLoc dl(Op); 8644 EVT VT = Op.getValueType(); 8645 EVT Op1VT = V1.getValueType(); 8646 EVT Op2VT = V2.getValueType(); 8647 unsigned NumElts = VT.getVectorNumElements(); 8648 8649 assert(Op1VT == Op2VT && "Operand types don't match!"); 8650 assert(VT.getScalarSizeInBits() == 1 && 8651 "Unexpected custom CONCAT_VECTORS lowering"); 8652 assert(ST->hasMVEIntegerOps() && 8653 "CONCAT_VECTORS lowering only supported for MVE"); 8654 8655 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 8656 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG); 8657 8658 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets 8659 // promoted to v8i16, etc. 8660 8661 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 8662 8663 // Extract the vector elements from Op1 and Op2 one by one and truncate them 8664 // to be the right size for the destination. For example, if Op1 is v4i1 then 8665 // the promoted vector is v4i32. 
The result of concatentation gives a v8i1, 8666 // which when promoted is v8i16. That means each i32 element from Op1 needs 8667 // truncating to i16 and inserting in the result. 8668 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts); 8669 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT); 8670 auto ExractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) { 8671 EVT NewVT = NewV.getValueType(); 8672 EVT ConcatVT = ConVec.getValueType(); 8673 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) { 8674 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV, 8675 DAG.getIntPtrConstant(i, dl)); 8676 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt, 8677 DAG.getConstant(j, dl, MVT::i32)); 8678 } 8679 return ConVec; 8680 }; 8681 unsigned j = 0; 8682 ConVec = ExractInto(NewV1, ConVec, j); 8683 ConVec = ExractInto(NewV2, ConVec, j); 8684 8685 // Now return the result of comparing the subvector with zero, 8686 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8687 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec, 8688 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8689 } 8690 8691 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, 8692 const ARMSubtarget *ST) { 8693 EVT VT = Op->getValueType(0); 8694 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1) 8695 return LowerCONCAT_VECTORS_i1(Op, DAG, ST); 8696 8697 // The only time a CONCAT_VECTORS operation can have legal types is when 8698 // two 64-bit vectors are concatenated to a 128-bit vector. 
8699 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && 8700 "unexpected CONCAT_VECTORS"); 8701 SDLoc dl(Op); 8702 SDValue Val = DAG.getUNDEF(MVT::v2f64); 8703 SDValue Op0 = Op.getOperand(0); 8704 SDValue Op1 = Op.getOperand(1); 8705 if (!Op0.isUndef()) 8706 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8707 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), 8708 DAG.getIntPtrConstant(0, dl)); 8709 if (!Op1.isUndef()) 8710 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, 8711 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), 8712 DAG.getIntPtrConstant(1, dl)); 8713 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); 8714 } 8715 8716 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, 8717 const ARMSubtarget *ST) { 8718 SDValue V1 = Op.getOperand(0); 8719 SDValue V2 = Op.getOperand(1); 8720 SDLoc dl(Op); 8721 EVT VT = Op.getValueType(); 8722 EVT Op1VT = V1.getValueType(); 8723 unsigned NumElts = VT.getVectorNumElements(); 8724 unsigned Index = cast<ConstantSDNode>(V2)->getZExtValue(); 8725 8726 assert(VT.getScalarSizeInBits() == 1 && 8727 "Unexpected custom EXTRACT_SUBVECTOR lowering"); 8728 assert(ST->hasMVEIntegerOps() && 8729 "EXTRACT_SUBVECTOR lowering only supported for MVE"); 8730 8731 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG); 8732 8733 // We now have Op1 promoted to a vector of integers, where v8i1 gets 8734 // promoted to v8i16, etc. 
8735 8736 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT(); 8737 8738 EVT SubVT = MVT::getVectorVT(ElType, NumElts); 8739 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT); 8740 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) { 8741 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1, 8742 DAG.getIntPtrConstant(i, dl)); 8743 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt, 8744 DAG.getConstant(j, dl, MVT::i32)); 8745 } 8746 8747 // Now return the result of comparing the subvector with zero, 8748 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. 8749 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec, 8750 DAG.getConstant(ARMCC::NE, dl, MVT::i32)); 8751 } 8752 8753 // Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0). 8754 static SDValue LowerTruncatei1(SDValue N, SelectionDAG &DAG, 8755 const ARMSubtarget *ST) { 8756 assert(ST->hasMVEIntegerOps() && "Expected MVE!"); 8757 EVT VT = N.getValueType(); 8758 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) && 8759 "Expected a vector i1 type!"); 8760 SDValue Op = N.getOperand(0); 8761 EVT FromVT = Op.getValueType(); 8762 SDLoc DL(N); 8763 8764 SDValue And = 8765 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT)); 8766 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT), 8767 DAG.getCondCode(ISD::SETNE)); 8768 } 8769 8770 /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each 8771 /// element has been zero/sign-extended, depending on the isSigned parameter, 8772 /// from an integer type half its size. 8773 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, 8774 bool isSigned) { 8775 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. 
8776 EVT VT = N->getValueType(0); 8777 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { 8778 SDNode *BVN = N->getOperand(0).getNode(); 8779 if (BVN->getValueType(0) != MVT::v4i32 || 8780 BVN->getOpcode() != ISD::BUILD_VECTOR) 8781 return false; 8782 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 8783 unsigned HiElt = 1 - LoElt; 8784 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt)); 8785 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt)); 8786 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2)); 8787 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2)); 8788 if (!Lo0 || !Hi0 || !Lo1 || !Hi1) 8789 return false; 8790 if (isSigned) { 8791 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && 8792 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) 8793 return true; 8794 } else { 8795 if (Hi0->isNullValue() && Hi1->isNullValue()) 8796 return true; 8797 } 8798 return false; 8799 } 8800 8801 if (N->getOpcode() != ISD::BUILD_VECTOR) 8802 return false; 8803 8804 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 8805 SDNode *Elt = N->getOperand(i).getNode(); 8806 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { 8807 unsigned EltSize = VT.getScalarSizeInBits(); 8808 unsigned HalfSize = EltSize / 2; 8809 if (isSigned) { 8810 if (!isIntN(HalfSize, C->getSExtValue())) 8811 return false; 8812 } else { 8813 if (!isUIntN(HalfSize, C->getZExtValue())) 8814 return false; 8815 } 8816 continue; 8817 } 8818 return false; 8819 } 8820 8821 return true; 8822 } 8823 8824 /// isSignExtended - Check if a node is a vector value that is sign-extended 8825 /// or a constant BUILD_VECTOR with sign-extended elements. 
8826 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { 8827 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) 8828 return true; 8829 if (isExtendedBUILD_VECTOR(N, DAG, true)) 8830 return true; 8831 return false; 8832 } 8833 8834 /// isZeroExtended - Check if a node is a vector value that is zero-extended (or 8835 /// any-extended) or a constant BUILD_VECTOR with zero-extended elements. 8836 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { 8837 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND || 8838 ISD::isZEXTLoad(N)) 8839 return true; 8840 if (isExtendedBUILD_VECTOR(N, DAG, false)) 8841 return true; 8842 return false; 8843 } 8844 8845 static EVT getExtensionTo64Bits(const EVT &OrigVT) { 8846 if (OrigVT.getSizeInBits() >= 64) 8847 return OrigVT; 8848 8849 assert(OrigVT.isSimple() && "Expecting a simple value type"); 8850 8851 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; 8852 switch (OrigSimpleTy) { 8853 default: llvm_unreachable("Unexpected Vector Type"); 8854 case MVT::v2i8: 8855 case MVT::v2i16: 8856 return MVT::v2i32; 8857 case MVT::v4i8: 8858 return MVT::v4i16; 8859 } 8860 } 8861 8862 /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total 8863 /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. 8864 /// We insert the required extension here to get the vector to fill a D register. 8865 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, 8866 const EVT &OrigTy, 8867 const EVT &ExtTy, 8868 unsigned ExtOpcode) { 8869 // The vector originally had a size of OrigTy. It was then extended to ExtTy. 8870 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than 8871 // 64-bits we need to insert a new extension so that it will be 64-bits. 
8872 assert(ExtTy.is128BitVector() && "Unexpected extension size"); 8873 if (OrigTy.getSizeInBits() >= 64) 8874 return N; 8875 8876 // Must extend size to at least 64 bits to be used as an operand for VMULL. 8877 EVT NewVT = getExtensionTo64Bits(OrigTy); 8878 8879 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); 8880 } 8881 8882 /// SkipLoadExtensionForVMULL - return a load of the original vector size that 8883 /// does not do any sign/zero extension. If the original vector is less 8884 /// than 64 bits, an appropriate extension will be added after the load to 8885 /// reach a total size of 64 bits. We have to add the extension separately 8886 /// because ARM does not have a sign/zero extending load for vectors. 8887 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { 8888 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); 8889 8890 // The load already has the right type. 8891 if (ExtendedTy == LD->getMemoryVT()) 8892 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), 8893 LD->getBasePtr(), LD->getPointerInfo(), 8894 LD->getAlignment(), LD->getMemOperand()->getFlags()); 8895 8896 // We need to create a zextload/sextload. We cannot just create a load 8897 // followed by a zext/zext node because LowerMUL is also run during normal 8898 // operation legalization where we can't create illegal types. 8899 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, 8900 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), 8901 LD->getMemoryVT(), LD->getAlignment(), 8902 LD->getMemOperand()->getFlags()); 8903 } 8904 8905 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, 8906 /// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return 8907 /// the unextended value. The unextended vector should be 64 bits so that it can 8908 /// be used as an operand to a VMULL instruction. 
If the original vector size 8909 /// before extension is less than 64 bits we add a an extension to resize 8910 /// the vector to 64 bits. 8911 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { 8912 if (N->getOpcode() == ISD::SIGN_EXTEND || 8913 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND) 8914 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, 8915 N->getOperand(0)->getValueType(0), 8916 N->getValueType(0), 8917 N->getOpcode()); 8918 8919 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 8920 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) && 8921 "Expected extending load"); 8922 8923 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG); 8924 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1)); 8925 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; 8926 SDValue extLoad = 8927 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad); 8928 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad); 8929 8930 return newLoad; 8931 } 8932 8933 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will 8934 // have been legalized as a BITCAST from v4i32. 8935 if (N->getOpcode() == ISD::BITCAST) { 8936 SDNode *BVN = N->getOperand(0).getNode(); 8937 assert(BVN->getOpcode() == ISD::BUILD_VECTOR && 8938 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); 8939 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; 8940 return DAG.getBuildVector( 8941 MVT::v2i32, SDLoc(N), 8942 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)}); 8943 } 8944 // Construct a new BUILD_VECTOR with elements truncated to half the size. 
8945 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); 8946 EVT VT = N->getValueType(0); 8947 unsigned EltSize = VT.getScalarSizeInBits() / 2; 8948 unsigned NumElts = VT.getVectorNumElements(); 8949 MVT TruncVT = MVT::getIntegerVT(EltSize); 8950 SmallVector<SDValue, 8> Ops; 8951 SDLoc dl(N); 8952 for (unsigned i = 0; i != NumElts; ++i) { 8953 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); 8954 const APInt &CInt = C->getAPIntValue(); 8955 // Element types smaller than 32 bits are not legal, so use i32 elements. 8956 // The values are implicitly truncated so sext vs. zext doesn't matter. 8957 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); 8958 } 8959 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); 8960 } 8961 8962 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { 8963 unsigned Opcode = N->getOpcode(); 8964 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 8965 SDNode *N0 = N->getOperand(0).getNode(); 8966 SDNode *N1 = N->getOperand(1).getNode(); 8967 return N0->hasOneUse() && N1->hasOneUse() && 8968 isSignExtended(N0, DAG) && isSignExtended(N1, DAG); 8969 } 8970 return false; 8971 } 8972 8973 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { 8974 unsigned Opcode = N->getOpcode(); 8975 if (Opcode == ISD::ADD || Opcode == ISD::SUB) { 8976 SDNode *N0 = N->getOperand(0).getNode(); 8977 SDNode *N1 = N->getOperand(1).getNode(); 8978 return N0->hasOneUse() && N1->hasOneUse() && 8979 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); 8980 } 8981 return false; 8982 } 8983 8984 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { 8985 // Multiplications are only custom-lowered for 128-bit vectors so that 8986 // VMULL can be detected. Otherwise v2i64 multiplications are not legal. 
8987 EVT VT = Op.getValueType(); 8988 assert(VT.is128BitVector() && VT.isInteger() && 8989 "unexpected type for custom-lowering ISD::MUL"); 8990 SDNode *N0 = Op.getOperand(0).getNode(); 8991 SDNode *N1 = Op.getOperand(1).getNode(); 8992 unsigned NewOpc = 0; 8993 bool isMLA = false; 8994 bool isN0SExt = isSignExtended(N0, DAG); 8995 bool isN1SExt = isSignExtended(N1, DAG); 8996 if (isN0SExt && isN1SExt) 8997 NewOpc = ARMISD::VMULLs; 8998 else { 8999 bool isN0ZExt = isZeroExtended(N0, DAG); 9000 bool isN1ZExt = isZeroExtended(N1, DAG); 9001 if (isN0ZExt && isN1ZExt) 9002 NewOpc = ARMISD::VMULLu; 9003 else if (isN1SExt || isN1ZExt) { 9004 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these 9005 // into (s/zext A * s/zext C) + (s/zext B * s/zext C) 9006 if (isN1SExt && isAddSubSExt(N0, DAG)) { 9007 NewOpc = ARMISD::VMULLs; 9008 isMLA = true; 9009 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { 9010 NewOpc = ARMISD::VMULLu; 9011 isMLA = true; 9012 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { 9013 std::swap(N0, N1); 9014 NewOpc = ARMISD::VMULLu; 9015 isMLA = true; 9016 } 9017 } 9018 9019 if (!NewOpc) { 9020 if (VT == MVT::v2i64) 9021 // Fall through to expand this. It is not legal. 9022 return SDValue(); 9023 else 9024 // Other vector multiplications are legal. 9025 return Op; 9026 } 9027 } 9028 9029 // Legalize to a VMULL instruction. 9030 SDLoc DL(Op); 9031 SDValue Op0; 9032 SDValue Op1 = SkipExtensionForVMULL(N1, DAG); 9033 if (!isMLA) { 9034 Op0 = SkipExtensionForVMULL(N0, DAG); 9035 assert(Op0.getValueType().is64BitVector() && 9036 Op1.getValueType().is64BitVector() && 9037 "unexpected types for extended operands to VMULL"); 9038 return DAG.getNode(NewOpc, DL, VT, Op0, Op1); 9039 } 9040 9041 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during 9042 // isel lowering to take advantage of no-stall back to back vmul + vmla. 
9043 // vmull q0, d4, d6 9044 // vmlal q0, d5, d6 9045 // is faster than 9046 // vaddl q0, d4, d5 9047 // vmovl q1, d6 9048 // vmul q0, q0, q1 9049 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); 9050 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); 9051 EVT Op1VT = Op1.getValueType(); 9052 return DAG.getNode(N0->getOpcode(), DL, VT, 9053 DAG.getNode(NewOpc, DL, VT, 9054 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), 9055 DAG.getNode(NewOpc, DL, VT, 9056 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); 9057 } 9058 9059 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, 9060 SelectionDAG &DAG) { 9061 // TODO: Should this propagate fast-math-flags? 9062 9063 // Convert to float 9064 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); 9065 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); 9066 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); 9067 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); 9068 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); 9069 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); 9070 // Get reciprocal estimate. 9071 // float4 recip = vrecpeq_f32(yf); 9072 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9073 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9074 Y); 9075 // Because char has a smaller range than uchar, we can actually get away 9076 // without any newton steps. This requires that we use a weird bias 9077 // of 0xb000, however (again, this has been exhaustively tested). 9078 // float4 result = as_float4(as_int4(xf*recip) + 0xb000); 9079 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); 9080 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); 9081 Y = DAG.getConstant(0xb000, dl, MVT::v4i32); 9082 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); 9083 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); 9084 // Convert back to short. 
9085 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); 9086 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); 9087 return X; 9088 } 9089 9090 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, 9091 SelectionDAG &DAG) { 9092 // TODO: Should this propagate fast-math-flags? 9093 9094 SDValue N2; 9095 // Convert to float. 9096 // float4 yf = vcvt_f32_s32(vmovl_s16(y)); 9097 // float4 xf = vcvt_f32_s32(vmovl_s16(x)); 9098 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); 9099 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); 9100 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 9101 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 9102 9103 // Use reciprocal estimate and one refinement step. 9104 // float4 recip = vrecpeq_f32(yf); 9105 // recip *= vrecpsq_f32(yf, recip); 9106 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9107 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9108 N1); 9109 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9110 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9111 N1, N2); 9112 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9113 // Because short has a smaller range than ushort, we can actually get away 9114 // with only a single newton step. This requires that we use a weird bias 9115 // of 89, however (again, this has been exhaustively tested). 9116 // float4 result = as_float4(as_int4(xf*recip) + 0x89); 9117 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 9118 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 9119 N1 = DAG.getConstant(0x89, dl, MVT::v4i32); 9120 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 9121 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 9122 // Convert back to integer and return. 
9123 // return vmovn_s32(vcvt_s32_f32(result)); 9124 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 9125 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 9126 return N0; 9127 } 9128 9129 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, 9130 const ARMSubtarget *ST) { 9131 EVT VT = Op.getValueType(); 9132 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 9133 "unexpected type for custom-lowering ISD::SDIV"); 9134 9135 SDLoc dl(Op); 9136 SDValue N0 = Op.getOperand(0); 9137 SDValue N1 = Op.getOperand(1); 9138 SDValue N2, N3; 9139 9140 if (VT == MVT::v8i8) { 9141 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); 9142 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); 9143 9144 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9145 DAG.getIntPtrConstant(4, dl)); 9146 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9147 DAG.getIntPtrConstant(4, dl)); 9148 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9149 DAG.getIntPtrConstant(0, dl)); 9150 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9151 DAG.getIntPtrConstant(0, dl)); 9152 9153 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 9154 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 9155 9156 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 9157 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 9158 9159 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); 9160 return N0; 9161 } 9162 return LowerSDIV_v4i16(N0, N1, dl, DAG); 9163 } 9164 9165 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, 9166 const ARMSubtarget *ST) { 9167 // TODO: Should this propagate fast-math-flags? 
9168 EVT VT = Op.getValueType(); 9169 assert((VT == MVT::v4i16 || VT == MVT::v8i8) && 9170 "unexpected type for custom-lowering ISD::UDIV"); 9171 9172 SDLoc dl(Op); 9173 SDValue N0 = Op.getOperand(0); 9174 SDValue N1 = Op.getOperand(1); 9175 SDValue N2, N3; 9176 9177 if (VT == MVT::v8i8) { 9178 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); 9179 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); 9180 9181 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9182 DAG.getIntPtrConstant(4, dl)); 9183 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9184 DAG.getIntPtrConstant(4, dl)); 9185 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, 9186 DAG.getIntPtrConstant(0, dl)); 9187 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, 9188 DAG.getIntPtrConstant(0, dl)); 9189 9190 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 9191 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 9192 9193 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); 9194 N0 = LowerCONCAT_VECTORS(N0, DAG, ST); 9195 9196 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, 9197 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, 9198 MVT::i32), 9199 N0); 9200 return N0; 9201 } 9202 9203 // v4i16 sdiv ... Convert to float. 9204 // float4 yf = vcvt_f32_s32(vmovl_u16(y)); 9205 // float4 xf = vcvt_f32_s32(vmovl_u16(x)); 9206 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); 9207 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); 9208 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); 9209 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); 9210 9211 // Use reciprocal estimate and two refinement steps. 
9212 // float4 recip = vrecpeq_f32(yf); 9213 // recip *= vrecpsq_f32(yf, recip); 9214 // recip *= vrecpsq_f32(yf, recip); 9215 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9216 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), 9217 BN1); 9218 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9219 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9220 BN1, N2); 9221 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9222 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, 9223 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), 9224 BN1, N2); 9225 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); 9226 // Simply multiplying by the reciprocal estimate can leave us a few ulps 9227 // too low, so we add 2 ulps (exhaustive testing shows that this is enough, 9228 // and that it will never cause us to return an answer too large). 9229 // float4 result = as_float4(as_int4(xf*recip) + 2); 9230 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); 9231 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); 9232 N1 = DAG.getConstant(2, dl, MVT::v4i32); 9233 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); 9234 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); 9235 // Convert back to integer and return. 9236 // return vmovn_u32(vcvt_s32_f32(result)); 9237 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); 9238 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); 9239 return N0; 9240 } 9241 9242 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { 9243 SDNode *N = Op.getNode(); 9244 EVT VT = N->getValueType(0); 9245 SDVTList VTs = DAG.getVTList(VT, MVT::i32); 9246 9247 SDValue Carry = Op.getOperand(2); 9248 9249 SDLoc DL(Op); 9250 9251 SDValue Result; 9252 if (Op.getOpcode() == ISD::ADDCARRY) { 9253 // This converts the boolean value carry into the carry flag. 9254 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 9255 9256 // Do the addition proper using the carry flag we wanted. 
9257 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), 9258 Op.getOperand(1), Carry); 9259 9260 // Now convert the carry flag into a boolean value. 9261 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 9262 } else { 9263 // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we 9264 // have to invert the carry first. 9265 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 9266 DAG.getConstant(1, DL, MVT::i32), Carry); 9267 // This converts the boolean value carry into the carry flag. 9268 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); 9269 9270 // Do the subtraction proper using the carry flag we wanted. 9271 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), 9272 Op.getOperand(1), Carry); 9273 9274 // Now convert the carry flag into a boolean value. 9275 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); 9276 // But the carry returned by ARMISD::SUBE is not a borrow as expected 9277 // by ISD::SUBCARRY, so compute 1 - C. 9278 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, 9279 DAG.getConstant(1, DL, MVT::i32), Carry); 9280 } 9281 9282 // Return both values. 9283 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); 9284 } 9285 9286 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { 9287 assert(Subtarget->isTargetDarwin()); 9288 9289 // For iOS, we want to call an alternative entry point: __sincos_stret, 9290 // return values are passed via sret. 9291 SDLoc dl(Op); 9292 SDValue Arg = Op.getOperand(0); 9293 EVT ArgVT = Arg.getValueType(); 9294 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 9295 auto PtrVT = getPointerTy(DAG.getDataLayout()); 9296 9297 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); 9298 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9299 9300 // Pair of floats / doubles used to pass the result. 
9301 Type *RetTy = StructType::get(ArgTy, ArgTy); 9302 auto &DL = DAG.getDataLayout(); 9303 9304 ArgListTy Args; 9305 bool ShouldUseSRet = Subtarget->isAPCS_ABI(); 9306 SDValue SRet; 9307 if (ShouldUseSRet) { 9308 // Create stack object for sret. 9309 const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); 9310 const Align StackAlign = DL.getPrefTypeAlign(RetTy); 9311 int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); 9312 SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); 9313 9314 ArgListEntry Entry; 9315 Entry.Node = SRet; 9316 Entry.Ty = RetTy->getPointerTo(); 9317 Entry.IsSExt = false; 9318 Entry.IsZExt = false; 9319 Entry.IsSRet = true; 9320 Args.push_back(Entry); 9321 RetTy = Type::getVoidTy(*DAG.getContext()); 9322 } 9323 9324 ArgListEntry Entry; 9325 Entry.Node = Arg; 9326 Entry.Ty = ArgTy; 9327 Entry.IsSExt = false; 9328 Entry.IsZExt = false; 9329 Args.push_back(Entry); 9330 9331 RTLIB::Libcall LC = 9332 (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; 9333 const char *LibcallName = getLibcallName(LC); 9334 CallingConv::ID CC = getLibcallCallingConv(LC); 9335 SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); 9336 9337 TargetLowering::CallLoweringInfo CLI(DAG); 9338 CLI.setDebugLoc(dl) 9339 .setChain(DAG.getEntryNode()) 9340 .setCallee(CC, RetTy, Callee, std::move(Args)) 9341 .setDiscardResult(ShouldUseSRet); 9342 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 9343 9344 if (!ShouldUseSRet) 9345 return CallResult.first; 9346 9347 SDValue LoadSin = 9348 DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); 9349 9350 // Address of cos field. 
9351 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, 9352 DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); 9353 SDValue LoadCos = 9354 DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); 9355 9356 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); 9357 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, 9358 LoadSin.getValue(0), LoadCos.getValue(0)); 9359 } 9360 9361 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, 9362 bool Signed, 9363 SDValue &Chain) const { 9364 EVT VT = Op.getValueType(); 9365 assert((VT == MVT::i32 || VT == MVT::i64) && 9366 "unexpected type for custom lowering DIV"); 9367 SDLoc dl(Op); 9368 9369 const auto &DL = DAG.getDataLayout(); 9370 const auto &TLI = DAG.getTargetLoweringInfo(); 9371 9372 const char *Name = nullptr; 9373 if (Signed) 9374 Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; 9375 else 9376 Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; 9377 9378 SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); 9379 9380 ARMTargetLowering::ArgListTy Args; 9381 9382 for (auto AI : {1, 0}) { 9383 ArgListEntry Arg; 9384 Arg.Node = Op.getOperand(AI); 9385 Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); 9386 Args.push_back(Arg); 9387 } 9388 9389 CallLoweringInfo CLI(DAG); 9390 CLI.setDebugLoc(dl) 9391 .setChain(Chain) 9392 .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), 9393 ES, std::move(Args)); 9394 9395 return LowerCallTo(CLI).first; 9396 } 9397 9398 // This is a code size optimisation: return the original SDIV node to 9399 // DAGCombiner when we don't want to expand SDIV into a sequence of 9400 // instructions, and an empty node otherwise which will cause the 9401 // SDIV to be expanded in DAGCombine. 
9402 SDValue 9403 ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, 9404 SelectionDAG &DAG, 9405 SmallVectorImpl<SDNode *> &Created) const { 9406 // TODO: Support SREM 9407 if (N->getOpcode() != ISD::SDIV) 9408 return SDValue(); 9409 9410 const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); 9411 const bool MinSize = ST.hasMinSize(); 9412 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() 9413 : ST.hasDivideInARMMode(); 9414 9415 // Don't touch vector types; rewriting this may lead to scalarizing 9416 // the int divs. 9417 if (N->getOperand(0).getValueType().isVector()) 9418 return SDValue(); 9419 9420 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need 9421 // hwdiv support for this to be really profitable. 9422 if (!(MinSize && HasDivide)) 9423 return SDValue(); 9424 9425 // ARM mode is a bit simpler than Thumb: we can handle large power 9426 // of 2 immediates with 1 mov instruction; no further checks required, 9427 // just return the sdiv node. 9428 if (!ST.isThumb()) 9429 return SDValue(N, 0); 9430 9431 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, 9432 // and thus lose the code size benefits of a MOVS that requires only 2. 9433 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, 9434 // but as it's doing exactly this, it's not worth the trouble to get TTI. 
9435 if (Divisor.sgt(128)) 9436 return SDValue(); 9437 9438 return SDValue(N, 0); 9439 } 9440 9441 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, 9442 bool Signed) const { 9443 assert(Op.getValueType() == MVT::i32 && 9444 "unexpected type for custom lowering DIV"); 9445 SDLoc dl(Op); 9446 9447 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, 9448 DAG.getEntryNode(), Op.getOperand(1)); 9449 9450 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9451 } 9452 9453 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { 9454 SDLoc DL(N); 9455 SDValue Op = N->getOperand(1); 9456 if (N->getValueType(0) == MVT::i32) 9457 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); 9458 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9459 DAG.getConstant(0, DL, MVT::i32)); 9460 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, 9461 DAG.getConstant(1, DL, MVT::i32)); 9462 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, 9463 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); 9464 } 9465 9466 void ARMTargetLowering::ExpandDIV_Windows( 9467 SDValue Op, SelectionDAG &DAG, bool Signed, 9468 SmallVectorImpl<SDValue> &Results) const { 9469 const auto &DL = DAG.getDataLayout(); 9470 const auto &TLI = DAG.getTargetLoweringInfo(); 9471 9472 assert(Op.getValueType() == MVT::i64 && 9473 "unexpected type for custom lowering DIV"); 9474 SDLoc dl(Op); 9475 9476 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); 9477 9478 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); 9479 9480 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); 9481 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, 9482 DAG.getConstant(32, dl, TLI.getPointerTy(DL))); 9483 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); 9484 9485 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper)); 9486 } 
9487 9488 static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) { 9489 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode()); 9490 EVT MemVT = LD->getMemoryVT(); 9491 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && 9492 "Expected a predicate type!"); 9493 assert(MemVT == Op.getValueType()); 9494 assert(LD->getExtensionType() == ISD::NON_EXTLOAD && 9495 "Expected a non-extending load"); 9496 assert(LD->isUnindexed() && "Expected a unindexed load"); 9497 9498 // The basic MVE VLDR on a v4i1/v8i1 actually loads the entire 16bit 9499 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We 9500 // need to make sure that 8/4 bits are actually loaded into the correct 9501 // place, which means loading the value and then shuffling the values into 9502 // the bottom bits of the predicate. 9503 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect 9504 // for BE). 9505 // Speaking of BE, apparently the rest of llvm will assume a reverse order to 9506 // a natural VMSR(load), so needs to be reversed. 
9507 9508 SDLoc dl(Op); 9509 SDValue Load = DAG.getExtLoad( 9510 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(), 9511 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 9512 LD->getMemOperand()); 9513 SDValue Val = Load; 9514 if (DAG.getDataLayout().isBigEndian()) 9515 Val = DAG.getNode(ISD::SRL, dl, MVT::i32, 9516 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load), 9517 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32)); 9518 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val); 9519 if (MemVT != MVT::v16i1) 9520 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred, 9521 DAG.getConstant(0, dl, MVT::i32)); 9522 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl); 9523 } 9524 9525 void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results, 9526 SelectionDAG &DAG) const { 9527 LoadSDNode *LD = cast<LoadSDNode>(N); 9528 EVT MemVT = LD->getMemoryVT(); 9529 assert(LD->isUnindexed() && "Loads should be unindexed at this point."); 9530 9531 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && 9532 !Subtarget->isThumb1Only() && LD->isVolatile()) { 9533 SDLoc dl(N); 9534 SDValue Result = DAG.getMemIntrinsicNode( 9535 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}), 9536 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand()); 9537 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1); 9538 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 
1 : 0); 9539 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); 9540 Results.append({Pair, Result.getValue(2)}); 9541 } 9542 } 9543 9544 static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { 9545 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 9546 EVT MemVT = ST->getMemoryVT(); 9547 assert((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || MemVT == MVT::v16i1) && 9548 "Expected a predicate type!"); 9549 assert(MemVT == ST->getValue().getValueType()); 9550 assert(!ST->isTruncatingStore() && "Expected a non-extending store"); 9551 assert(ST->isUnindexed() && "Expected a unindexed store"); 9552 9553 // Only store the v4i1 or v8i1 worth of bits, via a buildvector with top bits 9554 // unset and a scalar store. 9555 SDLoc dl(Op); 9556 SDValue Build = ST->getValue(); 9557 if (MemVT != MVT::v16i1) { 9558 SmallVector<SDValue, 16> Ops; 9559 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) { 9560 unsigned Elt = DAG.getDataLayout().isBigEndian() 9561 ? MemVT.getVectorNumElements() - I - 1 9562 : I; 9563 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build, 9564 DAG.getConstant(Elt, dl, MVT::i32))); 9565 } 9566 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++) 9567 Ops.push_back(DAG.getUNDEF(MVT::i32)); 9568 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops); 9569 } 9570 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build); 9571 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian()) 9572 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32, 9573 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP), 9574 DAG.getConstant(16, dl, MVT::i32)); 9575 return DAG.getTruncStore( 9576 ST->getChain(), dl, GRP, ST->getBasePtr(), 9577 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()), 9578 ST->getMemOperand()); 9579 } 9580 9581 static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, 9582 const ARMSubtarget *Subtarget) { 9583 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode()); 9584 EVT MemVT = 
ST->getMemoryVT(); 9585 assert(ST->isUnindexed() && "Stores should be unindexed at this point."); 9586 9587 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() && 9588 !Subtarget->isThumb1Only() && ST->isVolatile()) { 9589 SDNode *N = Op.getNode(); 9590 SDLoc dl(N); 9591 9592 SDValue Lo = DAG.getNode( 9593 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 9594 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl, 9595 MVT::i32)); 9596 SDValue Hi = DAG.getNode( 9597 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(), 9598 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl, 9599 MVT::i32)); 9600 9601 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other), 9602 {ST->getChain(), Lo, Hi, ST->getBasePtr()}, 9603 MemVT, ST->getMemOperand()); 9604 } else if (Subtarget->hasMVEIntegerOps() && 9605 ((MemVT == MVT::v4i1 || MemVT == MVT::v8i1 || 9606 MemVT == MVT::v16i1))) { 9607 return LowerPredicateStore(Op, DAG); 9608 } 9609 9610 return SDValue(); 9611 } 9612 9613 static bool isZeroVector(SDValue N) { 9614 return (ISD::isBuildVectorAllZeros(N.getNode()) || 9615 (N->getOpcode() == ARMISD::VMOVIMM && 9616 isNullConstant(N->getOperand(0)))); 9617 } 9618 9619 static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { 9620 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); 9621 MVT VT = Op.getSimpleValueType(); 9622 SDValue Mask = N->getMask(); 9623 SDValue PassThru = N->getPassThru(); 9624 SDLoc dl(Op); 9625 9626 if (isZeroVector(PassThru)) 9627 return Op; 9628 9629 // MVE Masked loads use zero as the passthru value. Here we convert undef to 9630 // zero too, and other values are lowered to a select. 
9631 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, 9632 DAG.getTargetConstant(0, dl, MVT::i32)); 9633 SDValue NewLoad = DAG.getMaskedLoad( 9634 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec, 9635 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), 9636 N->getExtensionType(), N->isExpandingLoad()); 9637 SDValue Combo = NewLoad; 9638 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST || 9639 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) && 9640 isZeroVector(PassThru->getOperand(0)); 9641 if (!PassThru.isUndef() && !PassThruIsCastZero) 9642 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); 9643 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); 9644 } 9645 9646 static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, 9647 const ARMSubtarget *ST) { 9648 if (!ST->hasMVEIntegerOps()) 9649 return SDValue(); 9650 9651 SDLoc dl(Op); 9652 unsigned BaseOpcode = 0; 9653 switch (Op->getOpcode()) { 9654 default: llvm_unreachable("Expected VECREDUCE opcode"); 9655 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break; 9656 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break; 9657 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break; 9658 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break; 9659 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break; 9660 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break; 9661 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break; 9662 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break; 9663 } 9664 9665 SDValue Op0 = Op->getOperand(0); 9666 EVT VT = Op0.getValueType(); 9667 EVT EltVT = VT.getVectorElementType(); 9668 unsigned NumElts = VT.getVectorNumElements(); 9669 unsigned NumActiveLanes = NumElts; 9670 9671 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 || 9672 NumActiveLanes == 2) && 9673 "Only expected a power 2 vector size"); 9674 9675 // Use Mul(X, Rev(X)) until 4 items remain. 
Going down to 4 vector elements 9676 // allows us to easily extract vector elements from the lanes. 9677 while (NumActiveLanes > 4) { 9678 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32; 9679 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0); 9680 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev); 9681 NumActiveLanes /= 2; 9682 } 9683 9684 SDValue Res; 9685 if (NumActiveLanes == 4) { 9686 // The remaining 4 elements are summed sequentially 9687 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9688 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32)); 9689 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9690 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32)); 9691 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9692 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32)); 9693 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9694 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32)); 9695 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); 9696 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags()); 9697 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags()); 9698 } else { 9699 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9700 DAG.getConstant(0, dl, MVT::i32)); 9701 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0, 9702 DAG.getConstant(1, dl, MVT::i32)); 9703 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags()); 9704 } 9705 9706 // Result type may be wider than element type. 
9707 if (EltVT != Op->getValueType(0)) 9708 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res); 9709 return Res; 9710 } 9711 9712 static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, 9713 const ARMSubtarget *ST) { 9714 if (!ST->hasMVEFloatOps()) 9715 return SDValue(); 9716 return LowerVecReduce(Op, DAG, ST); 9717 } 9718 9719 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { 9720 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering())) 9721 // Acquire/Release load/store is not legal for targets without a dmb or 9722 // equivalent available. 9723 return SDValue(); 9724 9725 // Monotonic load/store is legal for all targets. 9726 return Op; 9727 } 9728 9729 static void ReplaceREADCYCLECOUNTER(SDNode *N, 9730 SmallVectorImpl<SDValue> &Results, 9731 SelectionDAG &DAG, 9732 const ARMSubtarget *Subtarget) { 9733 SDLoc DL(N); 9734 // Under Power Management extensions, the cycle-count is: 9735 // mrc p15, #0, <Rt>, c9, c13, #0 9736 SDValue Ops[] = { N->getOperand(0), // Chain 9737 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32), 9738 DAG.getTargetConstant(15, DL, MVT::i32), 9739 DAG.getTargetConstant(0, DL, MVT::i32), 9740 DAG.getTargetConstant(9, DL, MVT::i32), 9741 DAG.getTargetConstant(13, DL, MVT::i32), 9742 DAG.getTargetConstant(0, DL, MVT::i32) 9743 }; 9744 9745 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, 9746 DAG.getVTList(MVT::i32, MVT::Other), Ops); 9747 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, 9748 DAG.getConstant(0, DL, MVT::i32))); 9749 Results.push_back(Cycles32.getValue(1)); 9750 } 9751 9752 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { 9753 SDLoc dl(V.getNode()); 9754 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); 9755 SDValue VHi = DAG.getAnyExtOrTrunc( 9756 DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), 9757 dl, MVT::i32); 9758 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9759 if (isBigEndian) 
9760 std::swap (VLo, VHi); 9761 SDValue RegClass = 9762 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); 9763 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); 9764 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); 9765 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; 9766 return SDValue( 9767 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); 9768 } 9769 9770 static void ReplaceCMP_SWAP_64Results(SDNode *N, 9771 SmallVectorImpl<SDValue> & Results, 9772 SelectionDAG &DAG) { 9773 assert(N->getValueType(0) == MVT::i64 && 9774 "AtomicCmpSwap on types less than 64 should be legal"); 9775 SDValue Ops[] = {N->getOperand(1), 9776 createGPRPairNode(DAG, N->getOperand(2)), 9777 createGPRPairNode(DAG, N->getOperand(3)), 9778 N->getOperand(0)}; 9779 SDNode *CmpSwap = DAG.getMachineNode( 9780 ARM::CMP_SWAP_64, SDLoc(N), 9781 DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); 9782 9783 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand(); 9784 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp}); 9785 9786 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 9787 9788 SDValue Lo = 9789 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, 9790 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); 9791 SDValue Hi = 9792 DAG.getTargetExtractSubreg(isBigEndian ? 
ARM::gsub_0 : ARM::gsub_1, 9793 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)); 9794 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi)); 9795 Results.push_back(SDValue(CmpSwap, 2)); 9796 } 9797 9798 SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const { 9799 SDLoc dl(Op); 9800 EVT VT = Op.getValueType(); 9801 SDValue Chain = Op.getOperand(0); 9802 SDValue LHS = Op.getOperand(1); 9803 SDValue RHS = Op.getOperand(2); 9804 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get(); 9805 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; 9806 9807 // If we don't have instructions of this float type then soften to a libcall 9808 // and use SETCC instead. 9809 if (isUnsupportedFloatingType(LHS.getValueType())) { 9810 DAG.getTargetLoweringInfo().softenSetCCOperands( 9811 DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling); 9812 if (!RHS.getNode()) { 9813 RHS = DAG.getConstant(0, dl, LHS.getValueType()); 9814 CC = ISD::SETNE; 9815 } 9816 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS, 9817 DAG.getCondCode(CC)); 9818 return DAG.getMergeValues({Result, Chain}, dl); 9819 } 9820 9821 ARMCC::CondCodes CondCode, CondCode2; 9822 FPCCToARMCC(CC, CondCode, CondCode2); 9823 9824 // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit 9825 // in CMPFP and CMPFPE, but instead it should be made explicit by these 9826 // instructions using a chain instead of glue. This would also fix the problem 9827 // here (and also in LowerSELECT_CC) where we generate two comparisons when 9828 // CondCode2 != AL. 
9829 SDValue True = DAG.getConstant(1, dl, VT); 9830 SDValue False = DAG.getConstant(0, dl, VT); 9831 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); 9832 SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); 9833 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 9834 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG); 9835 if (CondCode2 != ARMCC::AL) { 9836 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); 9837 Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling); 9838 Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG); 9839 } 9840 return DAG.getMergeValues({Result, Chain}, dl); 9841 } 9842 9843 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 9844 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); 9845 switch (Op.getOpcode()) { 9846 default: llvm_unreachable("Don't know how to custom lower this!"); 9847 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); 9848 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 9849 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 9850 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 9851 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 9852 case ISD::SELECT: return LowerSELECT(Op, DAG); 9853 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); 9854 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 9855 case ISD::BR_CC: return LowerBR_CC(Op, DAG); 9856 case ISD::BR_JT: return LowerBR_JT(Op, DAG); 9857 case ISD::VASTART: return LowerVASTART(Op, DAG); 9858 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); 9859 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); 9860 case ISD::SINT_TO_FP: 9861 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); 9862 case ISD::STRICT_FP_TO_SINT: 9863 case ISD::STRICT_FP_TO_UINT: 9864 case ISD::FP_TO_SINT: 9865 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); 9866 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 9867 case ISD::RETURNADDR: 
return LowerRETURNADDR(Op, DAG); 9868 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 9869 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); 9870 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); 9871 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); 9872 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget); 9873 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, 9874 Subtarget); 9875 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); 9876 case ISD::SHL: 9877 case ISD::SRL: 9878 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); 9879 case ISD::SREM: return LowerREM(Op.getNode(), DAG); 9880 case ISD::UREM: return LowerREM(Op.getNode(), DAG); 9881 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); 9882 case ISD::SRL_PARTS: 9883 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); 9884 case ISD::CTTZ: 9885 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); 9886 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); 9887 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget); 9888 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); 9889 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); 9890 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); 9891 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); 9892 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget); 9893 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 9894 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget); 9895 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget); 9896 case ISD::TRUNCATE: return LowerTruncatei1(Op, DAG, Subtarget); 9897 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 9898 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG); 9899 case ISD::MUL: 
return LowerMUL(Op, DAG); 9900 case ISD::SDIV: 9901 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9902 return LowerDIV_Windows(Op, DAG, /* Signed */ true); 9903 return LowerSDIV(Op, DAG, Subtarget); 9904 case ISD::UDIV: 9905 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) 9906 return LowerDIV_Windows(Op, DAG, /* Signed */ false); 9907 return LowerUDIV(Op, DAG, Subtarget); 9908 case ISD::ADDCARRY: 9909 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); 9910 case ISD::SADDO: 9911 case ISD::SSUBO: 9912 return LowerSignedALUO(Op, DAG); 9913 case ISD::UADDO: 9914 case ISD::USUBO: 9915 return LowerUnsignedALUO(Op, DAG); 9916 case ISD::SADDSAT: 9917 case ISD::SSUBSAT: 9918 return LowerSADDSUBSAT(Op, DAG, Subtarget); 9919 case ISD::LOAD: 9920 return LowerPredicateLoad(Op, DAG); 9921 case ISD::STORE: 9922 return LowerSTORE(Op, DAG, Subtarget); 9923 case ISD::MLOAD: 9924 return LowerMLOAD(Op, DAG); 9925 case ISD::VECREDUCE_MUL: 9926 case ISD::VECREDUCE_AND: 9927 case ISD::VECREDUCE_OR: 9928 case ISD::VECREDUCE_XOR: 9929 return LowerVecReduce(Op, DAG, Subtarget); 9930 case ISD::VECREDUCE_FADD: 9931 case ISD::VECREDUCE_FMUL: 9932 case ISD::VECREDUCE_FMIN: 9933 case ISD::VECREDUCE_FMAX: 9934 return LowerVecReduceF(Op, DAG, Subtarget); 9935 case ISD::ATOMIC_LOAD: 9936 case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); 9937 case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); 9938 case ISD::SDIVREM: 9939 case ISD::UDIVREM: return LowerDivRem(Op, DAG); 9940 case ISD::DYNAMIC_STACKALLOC: 9941 if (Subtarget->isTargetWindows()) 9942 return LowerDYNAMIC_STACKALLOC(Op, DAG); 9943 llvm_unreachable("Don't know how to custom lower this!"); 9944 case ISD::STRICT_FP_ROUND: 9945 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); 9946 case ISD::STRICT_FP_EXTEND: 9947 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); 9948 case ISD::STRICT_FSETCC: 9949 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG); 9950 case ARMISD::WIN__DBZCHK: 
return SDValue(); 9951 } 9952 } 9953 9954 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results, 9955 SelectionDAG &DAG) { 9956 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); 9957 unsigned Opc = 0; 9958 if (IntNo == Intrinsic::arm_smlald) 9959 Opc = ARMISD::SMLALD; 9960 else if (IntNo == Intrinsic::arm_smlaldx) 9961 Opc = ARMISD::SMLALDX; 9962 else if (IntNo == Intrinsic::arm_smlsld) 9963 Opc = ARMISD::SMLSLD; 9964 else if (IntNo == Intrinsic::arm_smlsldx) 9965 Opc = ARMISD::SMLSLDX; 9966 else 9967 return; 9968 9969 SDLoc dl(N); 9970 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9971 N->getOperand(3), 9972 DAG.getConstant(0, dl, MVT::i32)); 9973 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 9974 N->getOperand(3), 9975 DAG.getConstant(1, dl, MVT::i32)); 9976 9977 SDValue LongMul = DAG.getNode(Opc, dl, 9978 DAG.getVTList(MVT::i32, MVT::i32), 9979 N->getOperand(1), N->getOperand(2), 9980 Lo, Hi); 9981 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, 9982 LongMul.getValue(0), LongMul.getValue(1))); 9983 } 9984 9985 /// ReplaceNodeResults - Replace the results of node with an illegal result 9986 /// type with new values built out of custom code. 
9987 void ARMTargetLowering::ReplaceNodeResults(SDNode *N, 9988 SmallVectorImpl<SDValue> &Results, 9989 SelectionDAG &DAG) const { 9990 SDValue Res; 9991 switch (N->getOpcode()) { 9992 default: 9993 llvm_unreachable("Don't know how to custom expand this!"); 9994 case ISD::READ_REGISTER: 9995 ExpandREAD_REGISTER(N, Results, DAG); 9996 break; 9997 case ISD::BITCAST: 9998 Res = ExpandBITCAST(N, DAG, Subtarget); 9999 break; 10000 case ISD::SRL: 10001 case ISD::SRA: 10002 case ISD::SHL: 10003 Res = Expand64BitShift(N, DAG, Subtarget); 10004 break; 10005 case ISD::SREM: 10006 case ISD::UREM: 10007 Res = LowerREM(N, DAG); 10008 break; 10009 case ISD::SDIVREM: 10010 case ISD::UDIVREM: 10011 Res = LowerDivRem(SDValue(N, 0), DAG); 10012 assert(Res.getNumOperands() == 2 && "DivRem needs two values"); 10013 Results.push_back(Res.getValue(0)); 10014 Results.push_back(Res.getValue(1)); 10015 return; 10016 case ISD::SADDSAT: 10017 case ISD::SSUBSAT: 10018 Res = LowerSADDSUBSAT(SDValue(N, 0), DAG, Subtarget); 10019 break; 10020 case ISD::READCYCLECOUNTER: 10021 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); 10022 return; 10023 case ISD::UDIV: 10024 case ISD::SDIV: 10025 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); 10026 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, 10027 Results); 10028 case ISD::ATOMIC_CMP_SWAP: 10029 ReplaceCMP_SWAP_64Results(N, Results, DAG); 10030 return; 10031 case ISD::INTRINSIC_WO_CHAIN: 10032 return ReplaceLongIntrinsic(N, Results, DAG); 10033 case ISD::ABS: 10034 lowerABS(N, Results, DAG); 10035 return ; 10036 case ISD::LOAD: 10037 LowerLOAD(N, Results, DAG); 10038 break; 10039 } 10040 if (Res.getNode()) 10041 Results.push_back(Res); 10042 } 10043 10044 //===----------------------------------------------------------------------===// 10045 // ARM Scheduler Hooks 10046 //===----------------------------------------------------------------------===// 10047 10048 /// SetupEntryBlockForSjLj - Insert 
code into the entry block that creates and 10049 /// registers the function context. 10050 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, 10051 MachineBasicBlock *MBB, 10052 MachineBasicBlock *DispatchBB, 10053 int FI) const { 10054 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && 10055 "ROPI/RWPI not currently supported with SjLj"); 10056 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10057 DebugLoc dl = MI.getDebugLoc(); 10058 MachineFunction *MF = MBB->getParent(); 10059 MachineRegisterInfo *MRI = &MF->getRegInfo(); 10060 MachineConstantPool *MCP = MF->getConstantPool(); 10061 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>(); 10062 const Function &F = MF->getFunction(); 10063 10064 bool isThumb = Subtarget->isThumb(); 10065 bool isThumb2 = Subtarget->isThumb2(); 10066 10067 unsigned PCLabelId = AFI->createPICLabelUId(); 10068 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; 10069 ARMConstantPoolValue *CPV = 10070 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); 10071 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4)); 10072 10073 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass 10074 : &ARM::GPRRegClass; 10075 10076 // Grab constant pool and fixed stack memory operands. 10077 MachineMemOperand *CPMMO = 10078 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 10079 MachineMemOperand::MOLoad, 4, Align(4)); 10080 10081 MachineMemOperand *FIMMOSt = 10082 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), 10083 MachineMemOperand::MOStore, 4, Align(4)); 10084 10085 // Load the address of the dispatch MBB into the jump buffer. 
10086 if (isThumb2) { 10087 // Incoming value: jbuf 10088 // ldr.n r5, LCPI1_1 10089 // orr r5, r5, #1 10090 // add r5, pc 10091 // str r5, [$jbuf, #+4] ; &jbuf[1] 10092 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10093 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) 10094 .addConstantPoolIndex(CPI) 10095 .addMemOperand(CPMMO) 10096 .add(predOps(ARMCC::AL)); 10097 // Set the low bit because of thumb mode. 10098 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10099 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) 10100 .addReg(NewVReg1, RegState::Kill) 10101 .addImm(0x01) 10102 .add(predOps(ARMCC::AL)) 10103 .add(condCodeOp()); 10104 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10105 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) 10106 .addReg(NewVReg2, RegState::Kill) 10107 .addImm(PCLabelId); 10108 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) 10109 .addReg(NewVReg3, RegState::Kill) 10110 .addFrameIndex(FI) 10111 .addImm(36) // &jbuf[1] :: pc 10112 .addMemOperand(FIMMOSt) 10113 .add(predOps(ARMCC::AL)); 10114 } else if (isThumb) { 10115 // Incoming value: jbuf 10116 // ldr.n r1, LCPI1_4 10117 // add r1, pc 10118 // mov r2, #1 10119 // orrs r1, r2 10120 // add r2, $jbuf, #+4 ; &jbuf[1] 10121 // str r1, [r2] 10122 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10123 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) 10124 .addConstantPoolIndex(CPI) 10125 .addMemOperand(CPMMO) 10126 .add(predOps(ARMCC::AL)); 10127 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10128 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) 10129 .addReg(NewVReg1, RegState::Kill) 10130 .addImm(PCLabelId); 10131 // Set the low bit because of thumb mode. 
10132 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10133 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) 10134 .addReg(ARM::CPSR, RegState::Define) 10135 .addImm(1) 10136 .add(predOps(ARMCC::AL)); 10137 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10138 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) 10139 .addReg(ARM::CPSR, RegState::Define) 10140 .addReg(NewVReg2, RegState::Kill) 10141 .addReg(NewVReg3, RegState::Kill) 10142 .add(predOps(ARMCC::AL)); 10143 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10144 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) 10145 .addFrameIndex(FI) 10146 .addImm(36); // &jbuf[1] :: pc 10147 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) 10148 .addReg(NewVReg4, RegState::Kill) 10149 .addReg(NewVReg5, RegState::Kill) 10150 .addImm(0) 10151 .addMemOperand(FIMMOSt) 10152 .add(predOps(ARMCC::AL)); 10153 } else { 10154 // Incoming value: jbuf 10155 // ldr r1, LCPI1_1 10156 // add r1, pc, r1 10157 // str r1, [$jbuf, #+4] ; &jbuf[1] 10158 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10159 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) 10160 .addConstantPoolIndex(CPI) 10161 .addImm(0) 10162 .addMemOperand(CPMMO) 10163 .add(predOps(ARMCC::AL)); 10164 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10165 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) 10166 .addReg(NewVReg1, RegState::Kill) 10167 .addImm(PCLabelId) 10168 .add(predOps(ARMCC::AL)); 10169 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) 10170 .addReg(NewVReg2, RegState::Kill) 10171 .addFrameIndex(FI) 10172 .addImm(36) // &jbuf[1] :: pc 10173 .addMemOperand(FIMMOSt) 10174 .add(predOps(ARMCC::AL)); 10175 } 10176 } 10177 10178 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, 10179 MachineBasicBlock *MBB) const { 10180 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10181 DebugLoc dl = MI.getDebugLoc(); 10182 MachineFunction *MF = MBB->getParent(); 10183 MachineRegisterInfo *MRI = 
&MF->getRegInfo(); 10184 MachineFrameInfo &MFI = MF->getFrameInfo(); 10185 int FI = MFI.getFunctionContextIndex(); 10186 10187 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass 10188 : &ARM::GPRnopcRegClass; 10189 10190 // Get a mapping of the call site numbers to all of the landing pads they're 10191 // associated with. 10192 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad; 10193 unsigned MaxCSNum = 0; 10194 for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; 10195 ++BB) { 10196 if (!BB->isEHPad()) continue; 10197 10198 // FIXME: We should assert that the EH_LABEL is the first MI in the landing 10199 // pad. 10200 for (MachineBasicBlock::iterator 10201 II = BB->begin(), IE = BB->end(); II != IE; ++II) { 10202 if (!II->isEHLabel()) continue; 10203 10204 MCSymbol *Sym = II->getOperand(0).getMCSymbol(); 10205 if (!MF->hasCallSiteLandingPad(Sym)) continue; 10206 10207 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); 10208 for (SmallVectorImpl<unsigned>::iterator 10209 CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); 10210 CSI != CSE; ++CSI) { 10211 CallSiteNumToLPad[*CSI].push_back(&*BB); 10212 MaxCSNum = std::max(MaxCSNum, *CSI); 10213 } 10214 break; 10215 } 10216 } 10217 10218 // Get an ordered list of the machine basic blocks for the jump table. 
10219 std::vector<MachineBasicBlock*> LPadList; 10220 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs; 10221 LPadList.reserve(CallSiteNumToLPad.size()); 10222 for (unsigned I = 1; I <= MaxCSNum; ++I) { 10223 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I]; 10224 for (SmallVectorImpl<MachineBasicBlock*>::iterator 10225 II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { 10226 LPadList.push_back(*II); 10227 InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); 10228 } 10229 } 10230 10231 assert(!LPadList.empty() && 10232 "No landing pad destinations for the dispatch jump table!"); 10233 10234 // Create the jump table and associated information. 10235 MachineJumpTableInfo *JTI = 10236 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); 10237 unsigned MJTI = JTI->createJumpTableIndex(LPadList); 10238 10239 // Create the MBBs for the dispatch code. 10240 10241 // Shove the dispatch's address into the return slot in the function context. 10242 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); 10243 DispatchBB->setIsEHPad(); 10244 10245 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 10246 unsigned trap_opcode; 10247 if (Subtarget->isThumb()) 10248 trap_opcode = ARM::tTRAP; 10249 else 10250 trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; 10251 10252 BuildMI(TrapBB, dl, TII->get(trap_opcode)); 10253 DispatchBB->addSuccessor(TrapBB); 10254 10255 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); 10256 DispatchBB->addSuccessor(DispContBB); 10257 10258 // Insert and MBBs. 10259 MF->insert(MF->end(), DispatchBB); 10260 MF->insert(MF->end(), DispContBB); 10261 MF->insert(MF->end(), TrapBB); 10262 10263 // Insert code into the entry block that creates and registers the function 10264 // context. 
10265 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); 10266 10267 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( 10268 MachinePointerInfo::getFixedStack(*MF, FI), 10269 MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4)); 10270 10271 MachineInstrBuilder MIB; 10272 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); 10273 10274 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); 10275 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); 10276 10277 // Add a register mask with no preserved registers. This results in all 10278 // registers being marked as clobbered. This can't work if the dispatch block 10279 // is in a Thumb1 function and is linked with ARM code which uses the FP 10280 // registers, as there is no way to preserve the FP registers in Thumb1 mode. 10281 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); 10282 10283 bool IsPositionIndependent = isPositionIndependent(); 10284 unsigned NumLPads = LPadList.size(); 10285 if (Subtarget->isThumb2()) { 10286 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10287 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) 10288 .addFrameIndex(FI) 10289 .addImm(4) 10290 .addMemOperand(FIMMOLd) 10291 .add(predOps(ARMCC::AL)); 10292 10293 if (NumLPads < 256) { 10294 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) 10295 .addReg(NewVReg1) 10296 .addImm(LPadList.size()) 10297 .add(predOps(ARMCC::AL)); 10298 } else { 10299 Register VReg1 = MRI->createVirtualRegister(TRC); 10300 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) 10301 .addImm(NumLPads & 0xFFFF) 10302 .add(predOps(ARMCC::AL)); 10303 10304 unsigned VReg2 = VReg1; 10305 if ((NumLPads & 0xFFFF0000) != 0) { 10306 VReg2 = MRI->createVirtualRegister(TRC); 10307 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) 10308 .addReg(VReg1) 10309 .addImm(NumLPads >> 16) 10310 .add(predOps(ARMCC::AL)); 10311 } 10312 10313 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) 10314 
.addReg(NewVReg1) 10315 .addReg(VReg2) 10316 .add(predOps(ARMCC::AL)); 10317 } 10318 10319 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) 10320 .addMBB(TrapBB) 10321 .addImm(ARMCC::HI) 10322 .addReg(ARM::CPSR); 10323 10324 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10325 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) 10326 .addJumpTableIndex(MJTI) 10327 .add(predOps(ARMCC::AL)); 10328 10329 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10330 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) 10331 .addReg(NewVReg3, RegState::Kill) 10332 .addReg(NewVReg1) 10333 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 10334 .add(predOps(ARMCC::AL)) 10335 .add(condCodeOp()); 10336 10337 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) 10338 .addReg(NewVReg4, RegState::Kill) 10339 .addReg(NewVReg1) 10340 .addJumpTableIndex(MJTI); 10341 } else if (Subtarget->isThumb()) { 10342 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10343 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) 10344 .addFrameIndex(FI) 10345 .addImm(1) 10346 .addMemOperand(FIMMOLd) 10347 .add(predOps(ARMCC::AL)); 10348 10349 if (NumLPads < 256) { 10350 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) 10351 .addReg(NewVReg1) 10352 .addImm(NumLPads) 10353 .add(predOps(ARMCC::AL)); 10354 } else { 10355 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10356 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10357 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 10358 10359 // MachineConstantPool wants an explicit alignment. 
10360 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10361 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10362 10363 Register VReg1 = MRI->createVirtualRegister(TRC); 10364 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) 10365 .addReg(VReg1, RegState::Define) 10366 .addConstantPoolIndex(Idx) 10367 .add(predOps(ARMCC::AL)); 10368 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) 10369 .addReg(NewVReg1) 10370 .addReg(VReg1) 10371 .add(predOps(ARMCC::AL)); 10372 } 10373 10374 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) 10375 .addMBB(TrapBB) 10376 .addImm(ARMCC::HI) 10377 .addReg(ARM::CPSR); 10378 10379 Register NewVReg2 = MRI->createVirtualRegister(TRC); 10380 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) 10381 .addReg(ARM::CPSR, RegState::Define) 10382 .addReg(NewVReg1) 10383 .addImm(2) 10384 .add(predOps(ARMCC::AL)); 10385 10386 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10387 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) 10388 .addJumpTableIndex(MJTI) 10389 .add(predOps(ARMCC::AL)); 10390 10391 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10392 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) 10393 .addReg(ARM::CPSR, RegState::Define) 10394 .addReg(NewVReg2, RegState::Kill) 10395 .addReg(NewVReg3) 10396 .add(predOps(ARMCC::AL)); 10397 10398 MachineMemOperand *JTMMOLd = 10399 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 10400 MachineMemOperand::MOLoad, 4, Align(4)); 10401 10402 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10403 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) 10404 .addReg(NewVReg4, RegState::Kill) 10405 .addImm(0) 10406 .addMemOperand(JTMMOLd) 10407 .add(predOps(ARMCC::AL)); 10408 10409 unsigned NewVReg6 = NewVReg5; 10410 if (IsPositionIndependent) { 10411 NewVReg6 = MRI->createVirtualRegister(TRC); 10412 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) 10413 .addReg(ARM::CPSR, RegState::Define) 10414 .addReg(NewVReg5, 
RegState::Kill) 10415 .addReg(NewVReg3) 10416 .add(predOps(ARMCC::AL)); 10417 } 10418 10419 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) 10420 .addReg(NewVReg6, RegState::Kill) 10421 .addJumpTableIndex(MJTI); 10422 } else { 10423 Register NewVReg1 = MRI->createVirtualRegister(TRC); 10424 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) 10425 .addFrameIndex(FI) 10426 .addImm(4) 10427 .addMemOperand(FIMMOLd) 10428 .add(predOps(ARMCC::AL)); 10429 10430 if (NumLPads < 256) { 10431 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) 10432 .addReg(NewVReg1) 10433 .addImm(NumLPads) 10434 .add(predOps(ARMCC::AL)); 10435 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { 10436 Register VReg1 = MRI->createVirtualRegister(TRC); 10437 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) 10438 .addImm(NumLPads & 0xFFFF) 10439 .add(predOps(ARMCC::AL)); 10440 10441 unsigned VReg2 = VReg1; 10442 if ((NumLPads & 0xFFFF0000) != 0) { 10443 VReg2 = MRI->createVirtualRegister(TRC); 10444 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) 10445 .addReg(VReg1) 10446 .addImm(NumLPads >> 16) 10447 .add(predOps(ARMCC::AL)); 10448 } 10449 10450 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 10451 .addReg(NewVReg1) 10452 .addReg(VReg2) 10453 .add(predOps(ARMCC::AL)); 10454 } else { 10455 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10456 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10457 const Constant *C = ConstantInt::get(Int32Ty, NumLPads); 10458 10459 // MachineConstantPool wants an explicit alignment. 
10460 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10461 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10462 10463 Register VReg1 = MRI->createVirtualRegister(TRC); 10464 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) 10465 .addReg(VReg1, RegState::Define) 10466 .addConstantPoolIndex(Idx) 10467 .addImm(0) 10468 .add(predOps(ARMCC::AL)); 10469 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) 10470 .addReg(NewVReg1) 10471 .addReg(VReg1, RegState::Kill) 10472 .add(predOps(ARMCC::AL)); 10473 } 10474 10475 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) 10476 .addMBB(TrapBB) 10477 .addImm(ARMCC::HI) 10478 .addReg(ARM::CPSR); 10479 10480 Register NewVReg3 = MRI->createVirtualRegister(TRC); 10481 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) 10482 .addReg(NewVReg1) 10483 .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) 10484 .add(predOps(ARMCC::AL)) 10485 .add(condCodeOp()); 10486 Register NewVReg4 = MRI->createVirtualRegister(TRC); 10487 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) 10488 .addJumpTableIndex(MJTI) 10489 .add(predOps(ARMCC::AL)); 10490 10491 MachineMemOperand *JTMMOLd = 10492 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF), 10493 MachineMemOperand::MOLoad, 4, Align(4)); 10494 Register NewVReg5 = MRI->createVirtualRegister(TRC); 10495 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) 10496 .addReg(NewVReg3, RegState::Kill) 10497 .addReg(NewVReg4) 10498 .addImm(0) 10499 .addMemOperand(JTMMOLd) 10500 .add(predOps(ARMCC::AL)); 10501 10502 if (IsPositionIndependent) { 10503 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) 10504 .addReg(NewVReg5, RegState::Kill) 10505 .addReg(NewVReg4) 10506 .addJumpTableIndex(MJTI); 10507 } else { 10508 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) 10509 .addReg(NewVReg5, RegState::Kill) 10510 .addJumpTableIndex(MJTI); 10511 } 10512 } 10513 10514 // Add the jump table entries as successors to the MBB. 
10515 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs; 10516 for (std::vector<MachineBasicBlock*>::iterator 10517 I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { 10518 MachineBasicBlock *CurMBB = *I; 10519 if (SeenMBBs.insert(CurMBB).second) 10520 DispContBB->addSuccessor(CurMBB); 10521 } 10522 10523 // N.B. the order the invoke BBs are processed in doesn't matter here. 10524 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); 10525 SmallVector<MachineBasicBlock*, 64> MBBLPads; 10526 for (MachineBasicBlock *BB : InvokeBBs) { 10527 10528 // Remove the landing pad successor from the invoke block and replace it 10529 // with the new dispatch block. 10530 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors()); 10531 while (!Successors.empty()) { 10532 MachineBasicBlock *SMBB = Successors.pop_back_val(); 10533 if (SMBB->isEHPad()) { 10534 BB->removeSuccessor(SMBB); 10535 MBBLPads.push_back(SMBB); 10536 } 10537 } 10538 10539 BB->addSuccessor(DispatchBB, BranchProbability::getZero()); 10540 BB->normalizeSuccProbs(); 10541 10542 // Find the invoke call and mark all of the callee-saved registers as 10543 // 'implicit defined' so that they're spilled. This prevents code from 10544 // moving instructions to before the EH block, where they will never be 10545 // executed. 
10546 for (MachineBasicBlock::reverse_iterator 10547 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { 10548 if (!II->isCall()) continue; 10549 10550 DenseMap<unsigned, bool> DefRegs; 10551 for (MachineInstr::mop_iterator 10552 OI = II->operands_begin(), OE = II->operands_end(); 10553 OI != OE; ++OI) { 10554 if (!OI->isReg()) continue; 10555 DefRegs[OI->getReg()] = true; 10556 } 10557 10558 MachineInstrBuilder MIB(*MF, &*II); 10559 10560 for (unsigned i = 0; SavedRegs[i] != 0; ++i) { 10561 unsigned Reg = SavedRegs[i]; 10562 if (Subtarget->isThumb2() && 10563 !ARM::tGPRRegClass.contains(Reg) && 10564 !ARM::hGPRRegClass.contains(Reg)) 10565 continue; 10566 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) 10567 continue; 10568 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) 10569 continue; 10570 if (!DefRegs[Reg]) 10571 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); 10572 } 10573 10574 break; 10575 } 10576 } 10577 10578 // Mark all former landing pads as non-landing pads. The dispatch is the only 10579 // landing pad now. 10580 for (SmallVectorImpl<MachineBasicBlock*>::iterator 10581 I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) 10582 (*I)->setIsEHPad(false); 10583 10584 // The instruction is gone now. 10585 MI.eraseFromParent(); 10586 } 10587 10588 static 10589 MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { 10590 for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), 10591 E = MBB->succ_end(); I != E; ++I) 10592 if (*I != Succ) 10593 return *I; 10594 llvm_unreachable("Expecting a BB with two successors!"); 10595 } 10596 10597 /// Return the load opcode for a given load size. If load size >= 8, 10598 /// neon opcode will be returned. 10599 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { 10600 if (LdSize >= 8) 10601 return LdSize == 16 ? ARM::VLD1q32wb_fixed 10602 : LdSize == 8 ? 
ARM::VLD1d32wb_fixed : 0; 10603 if (IsThumb1) 10604 return LdSize == 4 ? ARM::tLDRi 10605 : LdSize == 2 ? ARM::tLDRHi 10606 : LdSize == 1 ? ARM::tLDRBi : 0; 10607 if (IsThumb2) 10608 return LdSize == 4 ? ARM::t2LDR_POST 10609 : LdSize == 2 ? ARM::t2LDRH_POST 10610 : LdSize == 1 ? ARM::t2LDRB_POST : 0; 10611 return LdSize == 4 ? ARM::LDR_POST_IMM 10612 : LdSize == 2 ? ARM::LDRH_POST 10613 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; 10614 } 10615 10616 /// Return the store opcode for a given store size. If store size >= 8, 10617 /// neon opcode will be returned. 10618 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { 10619 if (StSize >= 8) 10620 return StSize == 16 ? ARM::VST1q32wb_fixed 10621 : StSize == 8 ? ARM::VST1d32wb_fixed : 0; 10622 if (IsThumb1) 10623 return StSize == 4 ? ARM::tSTRi 10624 : StSize == 2 ? ARM::tSTRHi 10625 : StSize == 1 ? ARM::tSTRBi : 0; 10626 if (IsThumb2) 10627 return StSize == 4 ? ARM::t2STR_POST 10628 : StSize == 2 ? ARM::t2STRH_POST 10629 : StSize == 1 ? ARM::t2STRB_POST : 0; 10630 return StSize == 4 ? ARM::STR_POST_IMM 10631 : StSize == 2 ? ARM::STRH_POST 10632 : StSize == 1 ? ARM::STRB_POST_IMM : 0; 10633 } 10634 10635 /// Emit a post-increment load operation with given size. The instructions 10636 /// will be added to BB at Pos. 
10637 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10638 const TargetInstrInfo *TII, const DebugLoc &dl, 10639 unsigned LdSize, unsigned Data, unsigned AddrIn, 10640 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10641 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); 10642 assert(LdOpc != 0 && "Should have a load opcode"); 10643 if (LdSize >= 8) { 10644 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10645 .addReg(AddrOut, RegState::Define) 10646 .addReg(AddrIn) 10647 .addImm(0) 10648 .add(predOps(ARMCC::AL)); 10649 } else if (IsThumb1) { 10650 // load + update AddrIn 10651 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10652 .addReg(AddrIn) 10653 .addImm(0) 10654 .add(predOps(ARMCC::AL)); 10655 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10656 .add(t1CondCodeOp()) 10657 .addReg(AddrIn) 10658 .addImm(LdSize) 10659 .add(predOps(ARMCC::AL)); 10660 } else if (IsThumb2) { 10661 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10662 .addReg(AddrOut, RegState::Define) 10663 .addReg(AddrIn) 10664 .addImm(LdSize) 10665 .add(predOps(ARMCC::AL)); 10666 } else { // arm 10667 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) 10668 .addReg(AddrOut, RegState::Define) 10669 .addReg(AddrIn) 10670 .addReg(0) 10671 .addImm(LdSize) 10672 .add(predOps(ARMCC::AL)); 10673 } 10674 } 10675 10676 /// Emit a post-increment store operation with given size. The instructions 10677 /// will be added to BB at Pos. 
10678 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, 10679 const TargetInstrInfo *TII, const DebugLoc &dl, 10680 unsigned StSize, unsigned Data, unsigned AddrIn, 10681 unsigned AddrOut, bool IsThumb1, bool IsThumb2) { 10682 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); 10683 assert(StOpc != 0 && "Should have a store opcode"); 10684 if (StSize >= 8) { 10685 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10686 .addReg(AddrIn) 10687 .addImm(0) 10688 .addReg(Data) 10689 .add(predOps(ARMCC::AL)); 10690 } else if (IsThumb1) { 10691 // store + update AddrIn 10692 BuildMI(*BB, Pos, dl, TII->get(StOpc)) 10693 .addReg(Data) 10694 .addReg(AddrIn) 10695 .addImm(0) 10696 .add(predOps(ARMCC::AL)); 10697 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) 10698 .add(t1CondCodeOp()) 10699 .addReg(AddrIn) 10700 .addImm(StSize) 10701 .add(predOps(ARMCC::AL)); 10702 } else if (IsThumb2) { 10703 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10704 .addReg(Data) 10705 .addReg(AddrIn) 10706 .addImm(StSize) 10707 .add(predOps(ARMCC::AL)); 10708 } else { // arm 10709 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) 10710 .addReg(Data) 10711 .addReg(AddrIn) 10712 .addReg(0) 10713 .addImm(StSize) 10714 .add(predOps(ARMCC::AL)); 10715 } 10716 } 10717 10718 MachineBasicBlock * 10719 ARMTargetLowering::EmitStructByval(MachineInstr &MI, 10720 MachineBasicBlock *BB) const { 10721 // This pseudo instruction has 3 operands: dst, src, size 10722 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). 10723 // Otherwise, we will generate unrolled scalar copies. 
10724 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 10725 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 10726 MachineFunction::iterator It = ++BB->getIterator(); 10727 10728 Register dest = MI.getOperand(0).getReg(); 10729 Register src = MI.getOperand(1).getReg(); 10730 unsigned SizeVal = MI.getOperand(2).getImm(); 10731 unsigned Alignment = MI.getOperand(3).getImm(); 10732 DebugLoc dl = MI.getDebugLoc(); 10733 10734 MachineFunction *MF = BB->getParent(); 10735 MachineRegisterInfo &MRI = MF->getRegInfo(); 10736 unsigned UnitSize = 0; 10737 const TargetRegisterClass *TRC = nullptr; 10738 const TargetRegisterClass *VecTRC = nullptr; 10739 10740 bool IsThumb1 = Subtarget->isThumb1Only(); 10741 bool IsThumb2 = Subtarget->isThumb2(); 10742 bool IsThumb = Subtarget->isThumb(); 10743 10744 if (Alignment & 1) { 10745 UnitSize = 1; 10746 } else if (Alignment & 2) { 10747 UnitSize = 2; 10748 } else { 10749 // Check whether we can use NEON instructions. 10750 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && 10751 Subtarget->hasNEON()) { 10752 if ((Alignment % 16 == 0) && SizeVal >= 16) 10753 UnitSize = 16; 10754 else if ((Alignment % 8 == 0) && SizeVal >= 8) 10755 UnitSize = 8; 10756 } 10757 // Can't use NEON instructions. 10758 if (UnitSize == 0) 10759 UnitSize = 4; 10760 } 10761 10762 // Select the correct opcode and register class for unit size load/store 10763 bool IsNeon = UnitSize >= 8; 10764 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; 10765 if (IsNeon) 10766 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass 10767 : UnitSize == 8 ? &ARM::DPRRegClass 10768 : nullptr; 10769 10770 unsigned BytesLeft = SizeVal % UnitSize; 10771 unsigned LoopSize = SizeVal - BytesLeft; 10772 10773 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { 10774 // Use LDR and STR to copy. 
10775 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) 10776 // [destOut] = STR_POST(scratch, destIn, UnitSize) 10777 unsigned srcIn = src; 10778 unsigned destIn = dest; 10779 for (unsigned i = 0; i < LoopSize; i+=UnitSize) { 10780 Register srcOut = MRI.createVirtualRegister(TRC); 10781 Register destOut = MRI.createVirtualRegister(TRC); 10782 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); 10783 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, 10784 IsThumb1, IsThumb2); 10785 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, 10786 IsThumb1, IsThumb2); 10787 srcIn = srcOut; 10788 destIn = destOut; 10789 } 10790 10791 // Handle the leftover bytes with LDRB and STRB. 10792 // [scratch, srcOut] = LDRB_POST(srcIn, 1) 10793 // [destOut] = STRB_POST(scratch, destIn, 1) 10794 for (unsigned i = 0; i < BytesLeft; i++) { 10795 Register srcOut = MRI.createVirtualRegister(TRC); 10796 Register destOut = MRI.createVirtualRegister(TRC); 10797 Register scratch = MRI.createVirtualRegister(TRC); 10798 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, 10799 IsThumb1, IsThumb2); 10800 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, 10801 IsThumb1, IsThumb2); 10802 srcIn = srcOut; 10803 destIn = destOut; 10804 } 10805 MI.eraseFromParent(); // The instruction is gone now. 10806 return BB; 10807 } 10808 10809 // Expand the pseudo op to a loop. 10810 // thisMBB: 10811 // ... 
10812 // movw varEnd, # --> with thumb2 10813 // movt varEnd, # 10814 // ldrcp varEnd, idx --> without thumb2 10815 // fallthrough --> loopMBB 10816 // loopMBB: 10817 // PHI varPhi, varEnd, varLoop 10818 // PHI srcPhi, src, srcLoop 10819 // PHI destPhi, dst, destLoop 10820 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 10821 // [destLoop] = STR_POST(scratch, destPhi, UnitSize) 10822 // subs varLoop, varPhi, #UnitSize 10823 // bne loopMBB 10824 // fallthrough --> exitMBB 10825 // exitMBB: 10826 // epilogue to handle left-over bytes 10827 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 10828 // [destOut] = STRB_POST(scratch, destLoop, 1) 10829 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10830 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); 10831 MF->insert(It, loopMBB); 10832 MF->insert(It, exitMBB); 10833 10834 // Transfer the remainder of BB and its successor edges to exitMBB. 10835 exitMBB->splice(exitMBB->begin(), BB, 10836 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 10837 exitMBB->transferSuccessorsAndUpdatePHIs(BB); 10838 10839 // Load an immediate to varEnd. 10840 Register varEnd = MRI.createVirtualRegister(TRC); 10841 if (Subtarget->useMovt()) { 10842 unsigned Vtmp = varEnd; 10843 if ((LoopSize & 0xFFFF0000) != 0) 10844 Vtmp = MRI.createVirtualRegister(TRC); 10845 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) 10846 .addImm(LoopSize & 0xFFFF) 10847 .add(predOps(ARMCC::AL)); 10848 10849 if ((LoopSize & 0xFFFF0000) != 0) 10850 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) 10851 .addReg(Vtmp) 10852 .addImm(LoopSize >> 16) 10853 .add(predOps(ARMCC::AL)); 10854 } else { 10855 MachineConstantPool *ConstantPool = MF->getConstantPool(); 10856 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); 10857 const Constant *C = ConstantInt::get(Int32Ty, LoopSize); 10858 10859 // MachineConstantPool wants an explicit alignment. 
10860 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty); 10861 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment); 10862 MachineMemOperand *CPMMO = 10863 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), 10864 MachineMemOperand::MOLoad, 4, Align(4)); 10865 10866 if (IsThumb) 10867 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) 10868 .addReg(varEnd, RegState::Define) 10869 .addConstantPoolIndex(Idx) 10870 .add(predOps(ARMCC::AL)) 10871 .addMemOperand(CPMMO); 10872 else 10873 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) 10874 .addReg(varEnd, RegState::Define) 10875 .addConstantPoolIndex(Idx) 10876 .addImm(0) 10877 .add(predOps(ARMCC::AL)) 10878 .addMemOperand(CPMMO); 10879 } 10880 BB->addSuccessor(loopMBB); 10881 10882 // Generate the loop body: 10883 // varPhi = PHI(varLoop, varEnd) 10884 // srcPhi = PHI(srcLoop, src) 10885 // destPhi = PHI(destLoop, dst) 10886 MachineBasicBlock *entryBB = BB; 10887 BB = loopMBB; 10888 Register varLoop = MRI.createVirtualRegister(TRC); 10889 Register varPhi = MRI.createVirtualRegister(TRC); 10890 Register srcLoop = MRI.createVirtualRegister(TRC); 10891 Register srcPhi = MRI.createVirtualRegister(TRC); 10892 Register destLoop = MRI.createVirtualRegister(TRC); 10893 Register destPhi = MRI.createVirtualRegister(TRC); 10894 10895 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) 10896 .addReg(varLoop).addMBB(loopMBB) 10897 .addReg(varEnd).addMBB(entryBB); 10898 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) 10899 .addReg(srcLoop).addMBB(loopMBB) 10900 .addReg(src).addMBB(entryBB); 10901 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) 10902 .addReg(destLoop).addMBB(loopMBB) 10903 .addReg(dest).addMBB(entryBB); 10904 10905 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) 10906 // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) 10907 Register scratch = MRI.createVirtualRegister(IsNeon ? 
VecTRC : TRC); 10908 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, 10909 IsThumb1, IsThumb2); 10910 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, 10911 IsThumb1, IsThumb2); 10912 10913 // Decrement loop variable by UnitSize. 10914 if (IsThumb1) { 10915 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) 10916 .add(t1CondCodeOp()) 10917 .addReg(varPhi) 10918 .addImm(UnitSize) 10919 .add(predOps(ARMCC::AL)); 10920 } else { 10921 MachineInstrBuilder MIB = 10922 BuildMI(*BB, BB->end(), dl, 10923 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); 10924 MIB.addReg(varPhi) 10925 .addImm(UnitSize) 10926 .add(predOps(ARMCC::AL)) 10927 .add(condCodeOp()); 10928 MIB->getOperand(5).setReg(ARM::CPSR); 10929 MIB->getOperand(5).setIsDef(true); 10930 } 10931 BuildMI(*BB, BB->end(), dl, 10932 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) 10933 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); 10934 10935 // loopMBB can loop back to loopMBB or fall through to exitMBB. 10936 BB->addSuccessor(loopMBB); 10937 BB->addSuccessor(exitMBB); 10938 10939 // Add epilogue to handle BytesLeft. 10940 BB = exitMBB; 10941 auto StartOfExit = exitMBB->begin(); 10942 10943 // [scratch, srcOut] = LDRB_POST(srcLoop, 1) 10944 // [destOut] = STRB_POST(scratch, destLoop, 1) 10945 unsigned srcIn = srcLoop; 10946 unsigned destIn = destLoop; 10947 for (unsigned i = 0; i < BytesLeft; i++) { 10948 Register srcOut = MRI.createVirtualRegister(TRC); 10949 Register destOut = MRI.createVirtualRegister(TRC); 10950 Register scratch = MRI.createVirtualRegister(TRC); 10951 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, 10952 IsThumb1, IsThumb2); 10953 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, 10954 IsThumb1, IsThumb2); 10955 srcIn = srcOut; 10956 destIn = destOut; 10957 } 10958 10959 MI.eraseFromParent(); // The instruction is gone now. 
10960 return BB; 10961 } 10962 10963 MachineBasicBlock * 10964 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, 10965 MachineBasicBlock *MBB) const { 10966 const TargetMachine &TM = getTargetMachine(); 10967 const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); 10968 DebugLoc DL = MI.getDebugLoc(); 10969 10970 assert(Subtarget->isTargetWindows() && 10971 "__chkstk is only supported on Windows"); 10972 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode"); 10973 10974 // __chkstk takes the number of words to allocate on the stack in R4, and 10975 // returns the stack adjustment in number of bytes in R4. This will not 10976 // clober any other registers (other than the obvious lr). 10977 // 10978 // Although, technically, IP should be considered a register which may be 10979 // clobbered, the call itself will not touch it. Windows on ARM is a pure 10980 // thumb-2 environment, so there is no interworking required. As a result, we 10981 // do not expect a veneer to be emitted by the linker, clobbering IP. 10982 // 10983 // Each module receives its own copy of __chkstk, so no import thunk is 10984 // required, again, ensuring that IP is not clobbered. 10985 // 10986 // Finally, although some linkers may theoretically provide a trampoline for 10987 // out of range calls (which is quite common due to a 32M range limitation of 10988 // branches for Thumb), we can generate the long-call version via 10989 // -mcmodel=large, alleviating the need for the trampoline which may clobber 10990 // IP. 
10991 10992 switch (TM.getCodeModel()) { 10993 case CodeModel::Tiny: 10994 llvm_unreachable("Tiny code model not available on ARM."); 10995 case CodeModel::Small: 10996 case CodeModel::Medium: 10997 case CodeModel::Kernel: 10998 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) 10999 .add(predOps(ARMCC::AL)) 11000 .addExternalSymbol("__chkstk") 11001 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 11002 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 11003 .addReg(ARM::R12, 11004 RegState::Implicit | RegState::Define | RegState::Dead) 11005 .addReg(ARM::CPSR, 11006 RegState::Implicit | RegState::Define | RegState::Dead); 11007 break; 11008 case CodeModel::Large: { 11009 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 11010 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11011 11012 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) 11013 .addExternalSymbol("__chkstk"); 11014 BuildMI(*MBB, MI, DL, TII.get(gettBLXrOpcode(*MBB->getParent()))) 11015 .add(predOps(ARMCC::AL)) 11016 .addReg(Reg, RegState::Kill) 11017 .addReg(ARM::R4, RegState::Implicit | RegState::Kill) 11018 .addReg(ARM::R4, RegState::Implicit | RegState::Define) 11019 .addReg(ARM::R12, 11020 RegState::Implicit | RegState::Define | RegState::Dead) 11021 .addReg(ARM::CPSR, 11022 RegState::Implicit | RegState::Define | RegState::Dead); 11023 break; 11024 } 11025 } 11026 11027 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) 11028 .addReg(ARM::SP, RegState::Kill) 11029 .addReg(ARM::R4, RegState::Kill) 11030 .setMIFlags(MachineInstr::FrameSetup) 11031 .add(predOps(ARMCC::AL)) 11032 .add(condCodeOp()); 11033 11034 MI.eraseFromParent(); 11035 return MBB; 11036 } 11037 11038 MachineBasicBlock * 11039 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, 11040 MachineBasicBlock *MBB) const { 11041 DebugLoc DL = MI.getDebugLoc(); 11042 MachineFunction *MF = MBB->getParent(); 11043 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11044 11045 MachineBasicBlock *ContBB 
= MF->CreateMachineBasicBlock(); 11046 MF->insert(++MBB->getIterator(), ContBB); 11047 ContBB->splice(ContBB->begin(), MBB, 11048 std::next(MachineBasicBlock::iterator(MI)), MBB->end()); 11049 ContBB->transferSuccessorsAndUpdatePHIs(MBB); 11050 MBB->addSuccessor(ContBB); 11051 11052 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 11053 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); 11054 MF->push_back(TrapBB); 11055 MBB->addSuccessor(TrapBB); 11056 11057 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) 11058 .addReg(MI.getOperand(0).getReg()) 11059 .addImm(0) 11060 .add(predOps(ARMCC::AL)); 11061 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) 11062 .addMBB(TrapBB) 11063 .addImm(ARMCC::EQ) 11064 .addReg(ARM::CPSR); 11065 11066 MI.eraseFromParent(); 11067 return ContBB; 11068 } 11069 11070 // The CPSR operand of SelectItr might be missing a kill marker 11071 // because there were multiple uses of CPSR, and ISel didn't know 11072 // which to mark. Figure out whether SelectItr should have had a 11073 // kill marker, and set it if it should. Returns the correct kill 11074 // marker value. 11075 static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, 11076 MachineBasicBlock* BB, 11077 const TargetRegisterInfo* TRI) { 11078 // Scan forward through BB for a use/def of CPSR. 11079 MachineBasicBlock::iterator miI(std::next(SelectItr)); 11080 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) { 11081 const MachineInstr& mi = *miI; 11082 if (mi.readsRegister(ARM::CPSR)) 11083 return false; 11084 if (mi.definesRegister(ARM::CPSR)) 11085 break; // Should have kill-flag - update below. 11086 } 11087 11088 // If we hit the end of the block, check whether CPSR is live into a 11089 // successor. 
11090 if (miI == BB->end()) { 11091 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(), 11092 sEnd = BB->succ_end(); 11093 sItr != sEnd; ++sItr) { 11094 MachineBasicBlock* succ = *sItr; 11095 if (succ->isLiveIn(ARM::CPSR)) 11096 return false; 11097 } 11098 } 11099 11100 // We found a def, or hit the end of the basic block and CPSR wasn't live 11101 // out. SelectMI should have a kill flag on CPSR. 11102 SelectItr->addRegisterKilled(ARM::CPSR, TRI); 11103 return true; 11104 } 11105 11106 /// Adds logic in loop entry MBB to calculate loop iteration count and adds 11107 /// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop 11108 static Register genTPEntry(MachineBasicBlock *TpEntry, 11109 MachineBasicBlock *TpLoopBody, 11110 MachineBasicBlock *TpExit, Register OpSizeReg, 11111 const TargetInstrInfo *TII, DebugLoc Dl, 11112 MachineRegisterInfo &MRI) { 11113 // Calculates loop iteration count = ceil(n/16)/16 = ((n + 15)&(-16)) / 16. 11114 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11115 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg) 11116 .addUse(OpSizeReg) 11117 .addImm(15) 11118 .add(predOps(ARMCC::AL)) 11119 .addReg(0); 11120 11121 Register BicDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11122 BuildMI(TpEntry, Dl, TII->get(ARM::t2BICri), BicDestReg) 11123 .addUse(AddDestReg, RegState::Kill) 11124 .addImm(16) 11125 .add(predOps(ARMCC::AL)) 11126 .addReg(0); 11127 11128 Register LsrDestReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11129 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg) 11130 .addUse(BicDestReg, RegState::Kill) 11131 .addImm(4) 11132 .add(predOps(ARMCC::AL)) 11133 .addReg(0); 11134 11135 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11136 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg) 11137 .addUse(LsrDestReg, RegState::Kill); 11138 11139 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart)) 11140 
.addUse(TotalIterationsReg) 11141 .addMBB(TpExit); 11142 11143 return TotalIterationsReg; 11144 } 11145 11146 /// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and 11147 /// t2DoLoopEnd. These are used by later passes to generate tail predicated 11148 /// loops. 11149 static void genTPLoopBody(MachineBasicBlock *TpLoopBody, 11150 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, 11151 const TargetInstrInfo *TII, DebugLoc Dl, 11152 MachineRegisterInfo &MRI, Register OpSrcReg, 11153 Register OpDestReg, Register ElementCountReg, 11154 Register TotalIterationsReg, bool IsMemcpy) { 11155 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest 11156 // array, loop iteration counter, predication counter. 11157 11158 Register SrcPhiReg, CurrSrcReg; 11159 if (IsMemcpy) { 11160 // Current position in the src array 11161 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11162 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11163 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg) 11164 .addUse(OpSrcReg) 11165 .addMBB(TpEntry) 11166 .addUse(CurrSrcReg) 11167 .addMBB(TpLoopBody); 11168 } 11169 11170 // Current position in the dest array 11171 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11172 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11173 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg) 11174 .addUse(OpDestReg) 11175 .addMBB(TpEntry) 11176 .addUse(CurrDestReg) 11177 .addMBB(TpLoopBody); 11178 11179 // Current loop counter 11180 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11181 Register RemainingLoopIterationsReg = 11182 MRI.createVirtualRegister(&ARM::GPRlrRegClass); 11183 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg) 11184 .addUse(TotalIterationsReg) 11185 .addMBB(TpEntry) 11186 .addUse(RemainingLoopIterationsReg) 11187 .addMBB(TpLoopBody); 11188 11189 // Predication counter 11190 Register 
PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11191 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass); 11192 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg) 11193 .addUse(ElementCountReg) 11194 .addMBB(TpEntry) 11195 .addUse(RemainingElementsReg) 11196 .addMBB(TpLoopBody); 11197 11198 // Pass predication counter to VCTP 11199 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass); 11200 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg) 11201 .addUse(PredCounterPhiReg) 11202 .addImm(ARMVCC::None) 11203 .addReg(0); 11204 11205 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg) 11206 .addUse(PredCounterPhiReg) 11207 .addImm(16) 11208 .add(predOps(ARMCC::AL)) 11209 .addReg(0); 11210 11211 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR 11212 Register SrcValueReg; 11213 if (IsMemcpy) { 11214 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass); 11215 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post)) 11216 .addDef(CurrSrcReg) 11217 .addDef(SrcValueReg) 11218 .addReg(SrcPhiReg) 11219 .addImm(16) 11220 .addImm(ARMVCC::Then) 11221 .addUse(VccrReg); 11222 } else 11223 SrcValueReg = OpSrcReg; 11224 11225 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post)) 11226 .addDef(CurrDestReg) 11227 .addUse(SrcValueReg) 11228 .addReg(DestPhiReg) 11229 .addImm(16) 11230 .addImm(ARMVCC::Then) 11231 .addUse(VccrReg); 11232 11233 // Add the pseudoInstrs for decrementing the loop counter and marking the 11234 // end:t2DoLoopDec and t2DoLoopEnd 11235 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg) 11236 .addUse(LoopCounterPhiReg) 11237 .addImm(1); 11238 11239 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd)) 11240 .addUse(RemainingLoopIterationsReg) 11241 .addMBB(TpLoopBody); 11242 11243 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B)) 11244 .addMBB(TpExit) 11245 .add(predOps(ARMCC::AL)); 11246 } 11247 11248 MachineBasicBlock * 
11249 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 11250 MachineBasicBlock *BB) const { 11251 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 11252 DebugLoc dl = MI.getDebugLoc(); 11253 bool isThumb2 = Subtarget->isThumb2(); 11254 switch (MI.getOpcode()) { 11255 default: { 11256 MI.print(errs()); 11257 llvm_unreachable("Unexpected instr type to insert"); 11258 } 11259 11260 // Thumb1 post-indexed loads are really just single-register LDMs. 11261 case ARM::tLDR_postidx: { 11262 MachineOperand Def(MI.getOperand(1)); 11263 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) 11264 .add(Def) // Rn_wb 11265 .add(MI.getOperand(2)) // Rn 11266 .add(MI.getOperand(3)) // PredImm 11267 .add(MI.getOperand(4)) // PredReg 11268 .add(MI.getOperand(0)) // Rt 11269 .cloneMemRefs(MI); 11270 MI.eraseFromParent(); 11271 return BB; 11272 } 11273 11274 case ARM::MVE_MEMCPYLOOPINST: 11275 case ARM::MVE_MEMSETLOOPINST: { 11276 11277 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo 11278 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate 11279 // the iteration count =ceil(size_in_bytes/16)) in the TP entry block and 11280 // adds the relevant instructions in the TP loop Body for generation of a 11281 // WLSTP loop. 11282 11283 // Below is relevant portion of the CFG after the transformation. 11284 // The Machine Basic Blocks are shown along with branch conditions (in 11285 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this 11286 // portion of the CFG and may not necessarily be the entry/exit of the 11287 // function. 
11288 11289 // (Relevant) CFG after transformation: 11290 // TP entry MBB 11291 // | 11292 // |-----------------| 11293 // (n <= 0) (n > 0) 11294 // | | 11295 // | TP loop Body MBB<--| 11296 // | | | 11297 // \ |___________| 11298 // \ / 11299 // TP exit MBB 11300 11301 MachineFunction *MF = BB->getParent(); 11302 MachineFunctionProperties &Properties = MF->getProperties(); 11303 MachineRegisterInfo &MRI = MF->getRegInfo(); 11304 11305 Register OpDestReg = MI.getOperand(0).getReg(); 11306 Register OpSrcReg = MI.getOperand(1).getReg(); 11307 Register OpSizeReg = MI.getOperand(2).getReg(); 11308 11309 // Allocate the required MBBs and add to parent function. 11310 MachineBasicBlock *TpEntry = BB; 11311 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock(); 11312 MachineBasicBlock *TpExit; 11313 11314 MF->push_back(TpLoopBody); 11315 11316 // If any instructions are present in the current block after 11317 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and 11318 // move the instructions into the newly created exit block. If there are no 11319 // instructions add an explicit branch to the FallThrough block and then 11320 // split. 11321 // 11322 // The split is required for two reasons: 11323 // 1) A terminator(t2WhileLoopStart) will be placed at that site. 11324 // 2) Since a TPLoopBody will be added later, any phis in successive blocks 11325 // need to be updated. splitAt() already handles this. 
11326 TpExit = BB->splitAt(MI, false); 11327 if (TpExit == BB) { 11328 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the " 11329 "block containing memcpy/memset Pseudo"); 11330 TpExit = BB->getFallThrough(); 11331 BuildMI(BB, dl, TII->get(ARM::t2B)) 11332 .addMBB(TpExit) 11333 .add(predOps(ARMCC::AL)); 11334 TpExit = BB->splitAt(MI, false); 11335 } 11336 11337 // Add logic for iteration count 11338 Register TotalIterationsReg = 11339 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI); 11340 11341 // Add the vectorized (and predicated) loads/store instructions 11342 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST; 11343 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg, 11344 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy); 11345 11346 // Required to avoid conflict with the MachineVerifier during testing. 11347 Properties.reset(MachineFunctionProperties::Property::NoPHIs); 11348 11349 // Connect the blocks 11350 TpEntry->addSuccessor(TpLoopBody); 11351 TpLoopBody->addSuccessor(TpLoopBody); 11352 TpLoopBody->addSuccessor(TpExit); 11353 11354 // Reorder for a more natural layout 11355 TpLoopBody->moveAfter(TpEntry); 11356 TpExit->moveAfter(TpLoopBody); 11357 11358 // Finally, remove the memcpy Psuedo Instruction 11359 MI.eraseFromParent(); 11360 11361 // Return the exit block as it may contain other instructions requiring a 11362 // custom inserter 11363 return TpExit; 11364 } 11365 11366 // The Thumb2 pre-indexed stores have the same MI operands, they just 11367 // define them differently in the .td files from the isel patterns, so 11368 // they need pseudos. 
11369 case ARM::t2STR_preidx: 11370 MI.setDesc(TII->get(ARM::t2STR_PRE)); 11371 return BB; 11372 case ARM::t2STRB_preidx: 11373 MI.setDesc(TII->get(ARM::t2STRB_PRE)); 11374 return BB; 11375 case ARM::t2STRH_preidx: 11376 MI.setDesc(TII->get(ARM::t2STRH_PRE)); 11377 return BB; 11378 11379 case ARM::STRi_preidx: 11380 case ARM::STRBi_preidx: { 11381 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM 11382 : ARM::STRB_PRE_IMM; 11383 // Decode the offset. 11384 unsigned Offset = MI.getOperand(4).getImm(); 11385 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; 11386 Offset = ARM_AM::getAM2Offset(Offset); 11387 if (isSub) 11388 Offset = -Offset; 11389 11390 MachineMemOperand *MMO = *MI.memoperands_begin(); 11391 BuildMI(*BB, MI, dl, TII->get(NewOpc)) 11392 .add(MI.getOperand(0)) // Rn_wb 11393 .add(MI.getOperand(1)) // Rt 11394 .add(MI.getOperand(2)) // Rn 11395 .addImm(Offset) // offset (skip GPR==zero_reg) 11396 .add(MI.getOperand(5)) // pred 11397 .add(MI.getOperand(6)) 11398 .addMemOperand(MMO); 11399 MI.eraseFromParent(); 11400 return BB; 11401 } 11402 case ARM::STRr_preidx: 11403 case ARM::STRBr_preidx: 11404 case ARM::STRH_preidx: { 11405 unsigned NewOpc; 11406 switch (MI.getOpcode()) { 11407 default: llvm_unreachable("unexpected opcode!"); 11408 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; 11409 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; 11410 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; 11411 } 11412 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); 11413 for (unsigned i = 0; i < MI.getNumOperands(); ++i) 11414 MIB.add(MI.getOperand(i)); 11415 MI.eraseFromParent(); 11416 return BB; 11417 } 11418 11419 case ARM::tMOVCCr_pseudo: { 11420 // To "insert" a SELECT_CC instruction, we actually have to insert the 11421 // diamond control-flow pattern. 
The incoming instruction knows the 11422 // destination vreg to set, the condition code register to branch on, the 11423 // true/false values to select between, and a branch opcode to use. 11424 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11425 MachineFunction::iterator It = ++BB->getIterator(); 11426 11427 // thisMBB: 11428 // ... 11429 // TrueVal = ... 11430 // cmpTY ccX, r1, r2 11431 // bCC copy1MBB 11432 // fallthrough --> copy0MBB 11433 MachineBasicBlock *thisMBB = BB; 11434 MachineFunction *F = BB->getParent(); 11435 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 11436 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 11437 F->insert(It, copy0MBB); 11438 F->insert(It, sinkMBB); 11439 11440 // Check whether CPSR is live past the tMOVCCr_pseudo. 11441 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); 11442 if (!MI.killsRegister(ARM::CPSR) && 11443 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) { 11444 copy0MBB->addLiveIn(ARM::CPSR); 11445 sinkMBB->addLiveIn(ARM::CPSR); 11446 } 11447 11448 // Transfer the remainder of BB and its successor edges to sinkMBB. 11449 sinkMBB->splice(sinkMBB->begin(), BB, 11450 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11451 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 11452 11453 BB->addSuccessor(copy0MBB); 11454 BB->addSuccessor(sinkMBB); 11455 11456 BuildMI(BB, dl, TII->get(ARM::tBcc)) 11457 .addMBB(sinkMBB) 11458 .addImm(MI.getOperand(3).getImm()) 11459 .addReg(MI.getOperand(4).getReg()); 11460 11461 // copy0MBB: 11462 // %FalseValue = ... 11463 // # fallthrough to sinkMBB 11464 BB = copy0MBB; 11465 11466 // Update machine-CFG edges 11467 BB->addSuccessor(sinkMBB); 11468 11469 // sinkMBB: 11470 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 11471 // ... 
11472 BB = sinkMBB; 11473 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) 11474 .addReg(MI.getOperand(1).getReg()) 11475 .addMBB(copy0MBB) 11476 .addReg(MI.getOperand(2).getReg()) 11477 .addMBB(thisMBB); 11478 11479 MI.eraseFromParent(); // The pseudo instruction is gone now. 11480 return BB; 11481 } 11482 11483 case ARM::BCCi64: 11484 case ARM::BCCZi64: { 11485 // If there is an unconditional branch to the other successor, remove it. 11486 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11487 11488 // Compare both parts that make up the double comparison separately for 11489 // equality. 11490 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; 11491 11492 Register LHS1 = MI.getOperand(1).getReg(); 11493 Register LHS2 = MI.getOperand(2).getReg(); 11494 if (RHSisZero) { 11495 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 11496 .addReg(LHS1) 11497 .addImm(0) 11498 .add(predOps(ARMCC::AL)); 11499 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 11500 .addReg(LHS2).addImm(0) 11501 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 11502 } else { 11503 Register RHS1 = MI.getOperand(3).getReg(); 11504 Register RHS2 = MI.getOperand(4).getReg(); 11505 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 11506 .addReg(LHS1) 11507 .addReg(RHS1) 11508 .add(predOps(ARMCC::AL)); 11509 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) 11510 .addReg(LHS2).addReg(RHS2) 11511 .addImm(ARMCC::EQ).addReg(ARM::CPSR); 11512 } 11513 11514 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); 11515 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); 11516 if (MI.getOperand(0).getImm() == ARMCC::NE) 11517 std::swap(destMBB, exitMBB); 11518 11519 BuildMI(BB, dl, TII->get(isThumb2 ? 
ARM::t2Bcc : ARM::Bcc)) 11520 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); 11521 if (isThumb2) 11522 BuildMI(BB, dl, TII->get(ARM::t2B)) 11523 .addMBB(exitMBB) 11524 .add(predOps(ARMCC::AL)); 11525 else 11526 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); 11527 11528 MI.eraseFromParent(); // The pseudo instruction is gone now. 11529 return BB; 11530 } 11531 11532 case ARM::Int_eh_sjlj_setjmp: 11533 case ARM::Int_eh_sjlj_setjmp_nofp: 11534 case ARM::tInt_eh_sjlj_setjmp: 11535 case ARM::t2Int_eh_sjlj_setjmp: 11536 case ARM::t2Int_eh_sjlj_setjmp_nofp: 11537 return BB; 11538 11539 case ARM::Int_eh_sjlj_setup_dispatch: 11540 EmitSjLjDispatchBlock(MI, BB); 11541 return BB; 11542 11543 case ARM::ABS: 11544 case ARM::t2ABS: { 11545 // To insert an ABS instruction, we have to insert the 11546 // diamond control-flow pattern. The incoming instruction knows the 11547 // source vreg to test against 0, the destination vreg to set, 11548 // the condition code register to branch on, the 11549 // true/false values to select between, and a branch opcode to use. 
11550 // It transforms 11551 // V1 = ABS V0 11552 // into 11553 // V2 = MOVS V0 11554 // BCC (branch to SinkBB if V0 >= 0) 11555 // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) 11556 // SinkBB: V1 = PHI(V2, V3) 11557 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 11558 MachineFunction::iterator BBI = ++BB->getIterator(); 11559 MachineFunction *Fn = BB->getParent(); 11560 MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); 11561 MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); 11562 Fn->insert(BBI, RSBBB); 11563 Fn->insert(BBI, SinkBB); 11564 11565 Register ABSSrcReg = MI.getOperand(1).getReg(); 11566 Register ABSDstReg = MI.getOperand(0).getReg(); 11567 bool ABSSrcKIll = MI.getOperand(1).isKill(); 11568 bool isThumb2 = Subtarget->isThumb2(); 11569 MachineRegisterInfo &MRI = Fn->getRegInfo(); 11570 // In Thumb mode S must not be specified if source register is the SP or 11571 // PC and if destination register is the SP, so restrict register class 11572 Register NewRsbDstReg = MRI.createVirtualRegister( 11573 isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); 11574 11575 // Transfer the remainder of BB and its successor edges to sinkMBB. 11576 SinkBB->splice(SinkBB->begin(), BB, 11577 std::next(MachineBasicBlock::iterator(MI)), BB->end()); 11578 SinkBB->transferSuccessorsAndUpdatePHIs(BB); 11579 11580 BB->addSuccessor(RSBBB); 11581 BB->addSuccessor(SinkBB); 11582 11583 // fall through to SinkMBB 11584 RSBBB->addSuccessor(SinkBB); 11585 11586 // insert a cmp at the end of BB 11587 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) 11588 .addReg(ABSSrcReg) 11589 .addImm(0) 11590 .add(predOps(ARMCC::AL)); 11591 11592 // insert a bcc with opposite CC to ARMCC::MI at the end of BB 11593 BuildMI(BB, dl, 11594 TII->get(isThumb2 ? 
ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) 11595 .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); 11596 11597 // insert rsbri in RSBBB 11598 // Note: BCC and rsbri will be converted into predicated rsbmi 11599 // by if-conversion pass 11600 BuildMI(*RSBBB, RSBBB->begin(), dl, 11601 TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) 11602 .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) 11603 .addImm(0) 11604 .add(predOps(ARMCC::AL)) 11605 .add(condCodeOp()); 11606 11607 // insert PHI in SinkBB, 11608 // reuse ABSDstReg to not change uses of ABS instruction 11609 BuildMI(*SinkBB, SinkBB->begin(), dl, 11610 TII->get(ARM::PHI), ABSDstReg) 11611 .addReg(NewRsbDstReg).addMBB(RSBBB) 11612 .addReg(ABSSrcReg).addMBB(BB); 11613 11614 // remove ABS instruction 11615 MI.eraseFromParent(); 11616 11617 // return last added BB 11618 return SinkBB; 11619 } 11620 case ARM::COPY_STRUCT_BYVAL_I32: 11621 ++NumLoopByVals; 11622 return EmitStructByval(MI, BB); 11623 case ARM::WIN__CHKSTK: 11624 return EmitLowered__chkstk(MI, BB); 11625 case ARM::WIN__DBZCHK: 11626 return EmitLowered__dbzchk(MI, BB); 11627 } 11628 } 11629 11630 /// Attaches vregs to MEMCPY that it will use as scratch registers 11631 /// when it is expanded into LDM/STM. This is done as a post-isel lowering 11632 /// instead of as a custom inserter because we need the use list from the SDNode. 11633 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, 11634 MachineInstr &MI, const SDNode *Node) { 11635 bool isThumb1 = Subtarget->isThumb1Only(); 11636 11637 DebugLoc DL = MI.getDebugLoc(); 11638 MachineFunction *MF = MI.getParent()->getParent(); 11639 MachineRegisterInfo &MRI = MF->getRegInfo(); 11640 MachineInstrBuilder MIB(*MF, MI); 11641 11642 // If the new dst/src is unused mark it as dead. 
11643 if (!Node->hasAnyUseOfValue(0)) { 11644 MI.getOperand(0).setIsDead(true); 11645 } 11646 if (!Node->hasAnyUseOfValue(1)) { 11647 MI.getOperand(1).setIsDead(true); 11648 } 11649 11650 // The MEMCPY both defines and kills the scratch registers. 11651 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { 11652 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass 11653 : &ARM::GPRRegClass); 11654 MIB.addReg(TmpReg, RegState::Define|RegState::Dead); 11655 } 11656 } 11657 11658 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, 11659 SDNode *Node) const { 11660 if (MI.getOpcode() == ARM::MEMCPY) { 11661 attachMEMCPYScratchRegs(Subtarget, MI, Node); 11662 return; 11663 } 11664 11665 const MCInstrDesc *MCID = &MI.getDesc(); 11666 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, 11667 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional 11668 // operand is still set to noreg. If needed, set the optional operand's 11669 // register to CPSR, and remove the redundant implicit def. 11670 // 11671 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). 11672 11673 // Rename pseudo opcodes. 
11674 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); 11675 unsigned ccOutIdx; 11676 if (NewOpc) { 11677 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); 11678 MCID = &TII->get(NewOpc); 11679 11680 assert(MCID->getNumOperands() == 11681 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() 11682 && "converted opcode should be the same except for cc_out" 11683 " (and, on Thumb1, pred)"); 11684 11685 MI.setDesc(*MCID); 11686 11687 // Add the optional cc_out operand 11688 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); 11689 11690 // On Thumb1, move all input operands to the end, then add the predicate 11691 if (Subtarget->isThumb1Only()) { 11692 for (unsigned c = MCID->getNumOperands() - 4; c--;) { 11693 MI.addOperand(MI.getOperand(1)); 11694 MI.RemoveOperand(1); 11695 } 11696 11697 // Restore the ties 11698 for (unsigned i = MI.getNumOperands(); i--;) { 11699 const MachineOperand& op = MI.getOperand(i); 11700 if (op.isReg() && op.isUse()) { 11701 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); 11702 if (DefIdx != -1) 11703 MI.tieOperands(DefIdx, i); 11704 } 11705 } 11706 11707 MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); 11708 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); 11709 ccOutIdx = 1; 11710 } else 11711 ccOutIdx = MCID->getNumOperands() - 1; 11712 } else 11713 ccOutIdx = MCID->getNumOperands() - 1; 11714 11715 // Any ARM instruction that sets the 's' bit should specify an optional 11716 // "cc_out" operand in the last operand position. 11717 if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { 11718 assert(!NewOpc && "Optional cc_out operand required"); 11719 return; 11720 } 11721 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it 11722 // since we already have an optional CPSR def. 
11723 bool definesCPSR = false; 11724 bool deadCPSR = false; 11725 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; 11726 ++i) { 11727 const MachineOperand &MO = MI.getOperand(i); 11728 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { 11729 definesCPSR = true; 11730 if (MO.isDead()) 11731 deadCPSR = true; 11732 MI.RemoveOperand(i); 11733 break; 11734 } 11735 } 11736 if (!definesCPSR) { 11737 assert(!NewOpc && "Optional cc_out operand required"); 11738 return; 11739 } 11740 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); 11741 if (deadCPSR) { 11742 assert(!MI.getOperand(ccOutIdx).getReg() && 11743 "expect uninitialized optional cc_out operand"); 11744 // Thumb1 instructions must have the S bit even if the CPSR is dead. 11745 if (!Subtarget->isThumb1Only()) 11746 return; 11747 } 11748 11749 // If this instruction was defined with an optional CPSR def and its dag node 11750 // had a live implicit CPSR def, then activate the optional CPSR def. 11751 MachineOperand &MO = MI.getOperand(ccOutIdx); 11752 MO.setReg(ARM::CPSR); 11753 MO.setIsDef(true); 11754 } 11755 11756 //===----------------------------------------------------------------------===// 11757 // ARM Optimization Hooks 11758 //===----------------------------------------------------------------------===// 11759 11760 // Helper function that checks if N is a null or all ones constant. 11761 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { 11762 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); 11763 } 11764 11765 // Return true if N is conditionally 0 or all ones. 
11766 // Detects these expressions where cc is an i1 value: 11767 // 11768 // (select cc 0, y) [AllOnes=0] 11769 // (select cc y, 0) [AllOnes=0] 11770 // (zext cc) [AllOnes=0] 11771 // (sext cc) [AllOnes=0/1] 11772 // (select cc -1, y) [AllOnes=1] 11773 // (select cc y, -1) [AllOnes=1] 11774 // 11775 // Invert is set when N is the null/all ones constant when CC is false. 11776 // OtherOp is set to the alternative value of N. 11777 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, 11778 SDValue &CC, bool &Invert, 11779 SDValue &OtherOp, 11780 SelectionDAG &DAG) { 11781 switch (N->getOpcode()) { 11782 default: return false; 11783 case ISD::SELECT: { 11784 CC = N->getOperand(0); 11785 SDValue N1 = N->getOperand(1); 11786 SDValue N2 = N->getOperand(2); 11787 if (isZeroOrAllOnes(N1, AllOnes)) { 11788 Invert = false; 11789 OtherOp = N2; 11790 return true; 11791 } 11792 if (isZeroOrAllOnes(N2, AllOnes)) { 11793 Invert = true; 11794 OtherOp = N1; 11795 return true; 11796 } 11797 return false; 11798 } 11799 case ISD::ZERO_EXTEND: 11800 // (zext cc) can never be the all ones value. 11801 if (AllOnes) 11802 return false; 11803 LLVM_FALLTHROUGH; 11804 case ISD::SIGN_EXTEND: { 11805 SDLoc dl(N); 11806 EVT VT = N->getValueType(0); 11807 CC = N->getOperand(0); 11808 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) 11809 return false; 11810 Invert = !AllOnes; 11811 if (AllOnes) 11812 // When looking for an AllOnes constant, N is an sext, and the 'other' 11813 // value is 0. 11814 OtherOp = DAG.getConstant(0, dl, VT); 11815 else if (N->getOpcode() == ISD::ZERO_EXTEND) 11816 // When looking for a 0 constant, N can be zext or sext. 
11817 OtherOp = DAG.getConstant(1, dl, VT); 11818 else 11819 OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, 11820 VT); 11821 return true; 11822 } 11823 } 11824 } 11825 11826 // Combine a constant select operand into its use: 11827 // 11828 // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 11829 // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 11830 // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] 11831 // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 11832 // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 11833 // 11834 // The transform is rejected if the select doesn't have a constant operand that 11835 // is null, or all ones when AllOnes is set. 11836 // 11837 // Also recognize sext/zext from i1: 11838 // 11839 // (add (zext cc), x) -> (select cc (add x, 1), x) 11840 // (add (sext cc), x) -> (select cc (add x, -1), x) 11841 // 11842 // These transformations eventually create predicated instructions. 11843 // 11844 // @param N The node to transform. 11845 // @param Slct The N operand that is a select. 11846 // @param OtherOp The other N operand (x above). 11847 // @param DCI Context. 11848 // @param AllOnes Require the select constant to be all ones instead of null. 11849 // @returns The new node, or SDValue() on failure. 11850 static 11851 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, 11852 TargetLowering::DAGCombinerInfo &DCI, 11853 bool AllOnes = false) { 11854 SelectionDAG &DAG = DCI.DAG; 11855 EVT VT = N->getValueType(0); 11856 SDValue NonConstantVal; 11857 SDValue CCOp; 11858 bool SwapSelectOps; 11859 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, 11860 NonConstantVal, DAG)) 11861 return SDValue(); 11862 11863 // Slct is now know to be the desired identity constant when CC is true. 
11864 SDValue TrueVal = OtherOp; 11865 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, 11866 OtherOp, NonConstantVal); 11867 // Unless SwapSelectOps says CC should be false. 11868 if (SwapSelectOps) 11869 std::swap(TrueVal, FalseVal); 11870 11871 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, 11872 CCOp, TrueVal, FalseVal); 11873 } 11874 11875 // Attempt combineSelectAndUse on each operand of a commutative operator N. 11876 static 11877 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, 11878 TargetLowering::DAGCombinerInfo &DCI) { 11879 SDValue N0 = N->getOperand(0); 11880 SDValue N1 = N->getOperand(1); 11881 if (N0.getNode()->hasOneUse()) 11882 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) 11883 return Result; 11884 if (N1.getNode()->hasOneUse()) 11885 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) 11886 return Result; 11887 return SDValue(); 11888 } 11889 11890 static bool IsVUZPShuffleNode(SDNode *N) { 11891 // VUZP shuffle node. 11892 if (N->getOpcode() == ARMISD::VUZP) 11893 return true; 11894 11895 // "VUZP" on i32 is an alias for VTRN. 11896 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) 11897 return true; 11898 11899 return false; 11900 } 11901 11902 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, 11903 TargetLowering::DAGCombinerInfo &DCI, 11904 const ARMSubtarget *Subtarget) { 11905 // Look for ADD(VUZP.0, VUZP.1). 11906 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || 11907 N0 == N1) 11908 return SDValue(); 11909 11910 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. 11911 if (!N->getValueType(0).is64BitVector()) 11912 return SDValue(); 11913 11914 // Generate vpadd. 
11915 SelectionDAG &DAG = DCI.DAG; 11916 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11917 SDLoc dl(N); 11918 SDNode *Unzip = N0.getNode(); 11919 EVT VT = N->getValueType(0); 11920 11921 SmallVector<SDValue, 8> Ops; 11922 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, 11923 TLI.getPointerTy(DAG.getDataLayout()))); 11924 Ops.push_back(Unzip->getOperand(0)); 11925 Ops.push_back(Unzip->getOperand(1)); 11926 11927 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 11928 } 11929 11930 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, 11931 TargetLowering::DAGCombinerInfo &DCI, 11932 const ARMSubtarget *Subtarget) { 11933 // Check for two extended operands. 11934 if (!(N0.getOpcode() == ISD::SIGN_EXTEND && 11935 N1.getOpcode() == ISD::SIGN_EXTEND) && 11936 !(N0.getOpcode() == ISD::ZERO_EXTEND && 11937 N1.getOpcode() == ISD::ZERO_EXTEND)) 11938 return SDValue(); 11939 11940 SDValue N00 = N0.getOperand(0); 11941 SDValue N10 = N1.getOperand(0); 11942 11943 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) 11944 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || 11945 N00 == N10) 11946 return SDValue(); 11947 11948 // We only recognize Q register paddl here; this can't be reached until 11949 // after type legalization. 11950 if (!N00.getValueType().is64BitVector() || 11951 !N0.getValueType().is128BitVector()) 11952 return SDValue(); 11953 11954 // Generate vpaddl. 11955 SelectionDAG &DAG = DCI.DAG; 11956 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11957 SDLoc dl(N); 11958 EVT VT = N->getValueType(0); 11959 11960 SmallVector<SDValue, 8> Ops; 11961 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. 
11962 unsigned Opcode; 11963 if (N0.getOpcode() == ISD::SIGN_EXTEND) 11964 Opcode = Intrinsic::arm_neon_vpaddls; 11965 else 11966 Opcode = Intrinsic::arm_neon_vpaddlu; 11967 Ops.push_back(DAG.getConstant(Opcode, dl, 11968 TLI.getPointerTy(DAG.getDataLayout()))); 11969 EVT ElemTy = N00.getValueType().getVectorElementType(); 11970 unsigned NumElts = VT.getVectorNumElements(); 11971 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); 11972 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, 11973 N00.getOperand(0), N00.getOperand(1)); 11974 Ops.push_back(Concat); 11975 11976 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); 11977 } 11978 11979 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in 11980 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is 11981 // much easier to match. 11982 static SDValue 11983 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, 11984 TargetLowering::DAGCombinerInfo &DCI, 11985 const ARMSubtarget *Subtarget) { 11986 // Only perform optimization if after legalize, and if NEON is available. We 11987 // also expected both operands to be BUILD_VECTORs. 11988 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() 11989 || N0.getOpcode() != ISD::BUILD_VECTOR 11990 || N1.getOpcode() != ISD::BUILD_VECTOR) 11991 return SDValue(); 11992 11993 // Check output type since VPADDL operand elements can only be 8, 16, or 32. 11994 EVT VT = N->getValueType(0); 11995 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) 11996 return SDValue(); 11997 11998 // Check that the vector operands are of the right form. 11999 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR 12000 // operands, where N is the size of the formed vector. 12001 // Each EXTRACT_VECTOR should have the same input vector and odd or even 12002 // index such that we have a pair wise add pattern. 
12003 12004 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing. 12005 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 12006 return SDValue(); 12007 SDValue Vec = N0->getOperand(0)->getOperand(0); 12008 SDNode *V = Vec.getNode(); 12009 unsigned nextIndex = 0; 12010 12011 // For each operands to the ADD which are BUILD_VECTORs, 12012 // check to see if each of their operands are an EXTRACT_VECTOR with 12013 // the same vector and appropriate index. 12014 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) { 12015 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT 12016 && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 12017 12018 SDValue ExtVec0 = N0->getOperand(i); 12019 SDValue ExtVec1 = N1->getOperand(i); 12020 12021 // First operand is the vector, verify its the same. 12022 if (V != ExtVec0->getOperand(0).getNode() || 12023 V != ExtVec1->getOperand(0).getNode()) 12024 return SDValue(); 12025 12026 // Second is the constant, verify its correct. 12027 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1)); 12028 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1)); 12029 12030 // For the constant, we want to see all the even or all the odd. 12031 if (!C0 || !C1 || C0->getZExtValue() != nextIndex 12032 || C1->getZExtValue() != nextIndex+1) 12033 return SDValue(); 12034 12035 // Increment index. 12036 nextIndex+=2; 12037 } else 12038 return SDValue(); 12039 } 12040 12041 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure 12042 // we're using the entire input vector, otherwise there's a size/legality 12043 // mismatch somewhere. 12044 if (nextIndex != Vec.getValueType().getVectorNumElements() || 12045 Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) 12046 return SDValue(); 12047 12048 // Create VPADDL node. 
12049 SelectionDAG &DAG = DCI.DAG; 12050 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12051 12052 SDLoc dl(N); 12053 12054 // Build operand list. 12055 SmallVector<SDValue, 8> Ops; 12056 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, 12057 TLI.getPointerTy(DAG.getDataLayout()))); 12058 12059 // Input is the vector. 12060 Ops.push_back(Vec); 12061 12062 // Get widened type and narrowed type. 12063 MVT widenType; 12064 unsigned numElem = VT.getVectorNumElements(); 12065 12066 EVT inputLaneType = Vec.getValueType().getVectorElementType(); 12067 switch (inputLaneType.getSimpleVT().SimpleTy) { 12068 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; 12069 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; 12070 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; 12071 default: 12072 llvm_unreachable("Invalid vector element type for padd optimization."); 12073 } 12074 12075 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); 12076 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; 12077 return DAG.getNode(ExtOp, dl, VT, tmp); 12078 } 12079 12080 static SDValue findMUL_LOHI(SDValue V) { 12081 if (V->getOpcode() == ISD::UMUL_LOHI || 12082 V->getOpcode() == ISD::SMUL_LOHI) 12083 return V; 12084 return SDValue(); 12085 } 12086 12087 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, 12088 TargetLowering::DAGCombinerInfo &DCI, 12089 const ARMSubtarget *Subtarget) { 12090 if (!Subtarget->hasBaseDSP()) 12091 return SDValue(); 12092 12093 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and 12094 // accumulates the product into a 64-bit value. 
The 16-bit values will 12095 // be sign extended somehow or SRA'd into 32-bit values 12096 // (addc (adde (mul 16bit, 16bit), lo), hi) 12097 SDValue Mul = AddcNode->getOperand(0); 12098 SDValue Lo = AddcNode->getOperand(1); 12099 if (Mul.getOpcode() != ISD::MUL) { 12100 Lo = AddcNode->getOperand(0); 12101 Mul = AddcNode->getOperand(1); 12102 if (Mul.getOpcode() != ISD::MUL) 12103 return SDValue(); 12104 } 12105 12106 SDValue SRA = AddeNode->getOperand(0); 12107 SDValue Hi = AddeNode->getOperand(1); 12108 if (SRA.getOpcode() != ISD::SRA) { 12109 SRA = AddeNode->getOperand(1); 12110 Hi = AddeNode->getOperand(0); 12111 if (SRA.getOpcode() != ISD::SRA) 12112 return SDValue(); 12113 } 12114 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) { 12115 if (Const->getZExtValue() != 31) 12116 return SDValue(); 12117 } else 12118 return SDValue(); 12119 12120 if (SRA.getOperand(0) != Mul) 12121 return SDValue(); 12122 12123 SelectionDAG &DAG = DCI.DAG; 12124 SDLoc dl(AddcNode); 12125 unsigned Opcode = 0; 12126 SDValue Op0; 12127 SDValue Op1; 12128 12129 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { 12130 Opcode = ARMISD::SMLALBB; 12131 Op0 = Mul.getOperand(0); 12132 Op1 = Mul.getOperand(1); 12133 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { 12134 Opcode = ARMISD::SMLALBT; 12135 Op0 = Mul.getOperand(0); 12136 Op1 = Mul.getOperand(1).getOperand(0); 12137 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { 12138 Opcode = ARMISD::SMLALTB; 12139 Op0 = Mul.getOperand(0).getOperand(0); 12140 Op1 = Mul.getOperand(1); 12141 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { 12142 Opcode = ARMISD::SMLALTT; 12143 Op0 = Mul->getOperand(0).getOperand(0); 12144 Op1 = Mul->getOperand(1).getOperand(0); 12145 } 12146 12147 if (!Op0 || !Op1) 12148 return SDValue(); 12149 12150 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), 12151 Op0, Op1, Lo, Hi); 12152 // 
Replace the ADDs' nodes uses by the MLA node's values. 12153 SDValue HiMLALResult(SMLAL.getNode(), 1); 12154 SDValue LoMLALResult(SMLAL.getNode(), 0); 12155 12156 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); 12157 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); 12158 12159 // Return original node to notify the driver to stop replacing. 12160 SDValue resNode(AddcNode, 0); 12161 return resNode; 12162 } 12163 12164 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, 12165 TargetLowering::DAGCombinerInfo &DCI, 12166 const ARMSubtarget *Subtarget) { 12167 // Look for multiply add opportunities. 12168 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where 12169 // each add nodes consumes a value from ISD::UMUL_LOHI and there is 12170 // a glue link from the first add to the second add. 12171 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by 12172 // a S/UMLAL instruction. 12173 // UMUL_LOHI 12174 // / :lo \ :hi 12175 // V \ [no multiline comment] 12176 // loAdd -> ADDC | 12177 // \ :carry / 12178 // V V 12179 // ADDE <- hiAdd 12180 // 12181 // In the special case where only the higher part of a signed result is used 12182 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts 12183 // a constant with the exact value of 0x80000000, we recognize we are dealing 12184 // with a "rounded multiply and add" (or subtract) and transform it into 12185 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. 12186 12187 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE || 12188 AddeSubeNode->getOpcode() == ARMISD::SUBE) && 12189 "Expect an ADDE or SUBE"); 12190 12191 assert(AddeSubeNode->getNumOperands() == 3 && 12192 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 && 12193 "ADDE node has the wrong inputs"); 12194 12195 // Check that we are chained to the right ADDC or SUBC node. 
12196 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); 12197 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && 12198 AddcSubcNode->getOpcode() != ARMISD::ADDC) || 12199 (AddeSubeNode->getOpcode() == ARMISD::SUBE && 12200 AddcSubcNode->getOpcode() != ARMISD::SUBC)) 12201 return SDValue(); 12202 12203 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); 12204 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); 12205 12206 // Check if the two operands are from the same mul_lohi node. 12207 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) 12208 return SDValue(); 12209 12210 assert(AddcSubcNode->getNumValues() == 2 && 12211 AddcSubcNode->getValueType(0) == MVT::i32 && 12212 "Expect ADDC with two result values. First: i32"); 12213 12214 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it 12215 // maybe a SMLAL which multiplies two 16-bit values. 12216 if (AddeSubeNode->getOpcode() == ARMISD::ADDE && 12217 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && 12218 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && 12219 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && 12220 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) 12221 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); 12222 12223 // Check for the triangle shape. 12224 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); 12225 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); 12226 12227 // Make sure that the ADDE/SUBE operands are not coming from the same node. 12228 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) 12229 return SDValue(); 12230 12231 // Find the MUL_LOHI node walking up ADDE/SUBE's operands. 12232 bool IsLeftOperandMUL = false; 12233 SDValue MULOp = findMUL_LOHI(AddeSubeOp0); 12234 if (MULOp == SDValue()) 12235 MULOp = findMUL_LOHI(AddeSubeOp1); 12236 else 12237 IsLeftOperandMUL = true; 12238 if (MULOp == SDValue()) 12239 return SDValue(); 12240 12241 // Figure out the right opcode. 
12242 unsigned Opc = MULOp->getOpcode(); 12243 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; 12244 12245 // Figure out the high and low input values to the MLAL node. 12246 SDValue *HiAddSub = nullptr; 12247 SDValue *LoMul = nullptr; 12248 SDValue *LowAddSub = nullptr; 12249 12250 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI. 12251 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1))) 12252 return SDValue(); 12253 12254 if (IsLeftOperandMUL) 12255 HiAddSub = &AddeSubeOp1; 12256 else 12257 HiAddSub = &AddeSubeOp0; 12258 12259 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node 12260 // whose low result is fed to the ADDC/SUBC we are checking. 12261 12262 if (AddcSubcOp0 == MULOp.getValue(0)) { 12263 LoMul = &AddcSubcOp0; 12264 LowAddSub = &AddcSubcOp1; 12265 } 12266 if (AddcSubcOp1 == MULOp.getValue(0)) { 12267 LoMul = &AddcSubcOp1; 12268 LowAddSub = &AddcSubcOp0; 12269 } 12270 12271 if (!LoMul) 12272 return SDValue(); 12273 12274 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC 12275 // the replacement below will create a cycle. 12276 if (AddcSubcNode == HiAddSub->getNode() || 12277 AddcSubcNode->isPredecessorOf(HiAddSub->getNode())) 12278 return SDValue(); 12279 12280 // Create the merged node. 12281 SelectionDAG &DAG = DCI.DAG; 12282 12283 // Start building operand list. 12284 SmallVector<SDValue, 8> Ops; 12285 Ops.push_back(LoMul->getOperand(0)); 12286 Ops.push_back(LoMul->getOperand(1)); 12287 12288 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be 12289 // the case, we must be doing signed multiplication and only use the higher 12290 // part of the result of the MLAL, furthermore the LowAddSub must be a constant 12291 // addition or subtraction with the value of 0x800000. 
12292 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() && 12293 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) && 12294 LowAddSub->getNode()->getOpcode() == ISD::Constant && 12295 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() == 12296 0x80000000) { 12297 Ops.push_back(*HiAddSub); 12298 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) { 12299 FinalOpc = ARMISD::SMMLSR; 12300 } else { 12301 FinalOpc = ARMISD::SMMLAR; 12302 } 12303 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops); 12304 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode); 12305 12306 return SDValue(AddeSubeNode, 0); 12307 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC) 12308 // SMMLS is generated during instruction selection and the rest of this 12309 // function can not handle the case where AddcSubcNode is a SUBC. 12310 return SDValue(); 12311 12312 // Finish building the operand list for {U/S}MLAL 12313 Ops.push_back(*LowAddSub); 12314 Ops.push_back(*HiAddSub); 12315 12316 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), 12317 DAG.getVTList(MVT::i32, MVT::i32), Ops); 12318 12319 // Replace the ADDs' nodes uses by the MLA node's values. 12320 SDValue HiMLALResult(MLALNode.getNode(), 1); 12321 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult); 12322 12323 SDValue LoMLALResult(MLALNode.getNode(), 0); 12324 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult); 12325 12326 // Return original node to notify the driver to stop replacing. 12327 return SDValue(AddeSubeNode, 0); 12328 } 12329 12330 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, 12331 TargetLowering::DAGCombinerInfo &DCI, 12332 const ARMSubtarget *Subtarget) { 12333 // UMAAL is similar to UMLAL except that it adds two unsigned values. 12334 // While trying to combine for the other MLAL nodes, first search for the 12335 // chance to use UMAAL. 
Check if Addc uses a node which has already 12336 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde 12337 // as the addend, and it's handled in PerformUMLALCombine. 12338 12339 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 12340 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 12341 12342 // Check that we have a glued ADDC node. 12343 SDNode* AddcNode = AddeNode->getOperand(2).getNode(); 12344 if (AddcNode->getOpcode() != ARMISD::ADDC) 12345 return SDValue(); 12346 12347 // Find the converted UMAAL or quit if it doesn't exist. 12348 SDNode *UmlalNode = nullptr; 12349 SDValue AddHi; 12350 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { 12351 UmlalNode = AddcNode->getOperand(0).getNode(); 12352 AddHi = AddcNode->getOperand(1); 12353 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { 12354 UmlalNode = AddcNode->getOperand(1).getNode(); 12355 AddHi = AddcNode->getOperand(0); 12356 } else { 12357 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); 12358 } 12359 12360 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as 12361 // the ADDC as well as Zero. 12362 if (!isNullConstant(UmlalNode->getOperand(3))) 12363 return SDValue(); 12364 12365 if ((isNullConstant(AddeNode->getOperand(0)) && 12366 AddeNode->getOperand(1).getNode() == UmlalNode) || 12367 (AddeNode->getOperand(0).getNode() == UmlalNode && 12368 isNullConstant(AddeNode->getOperand(1)))) { 12369 SelectionDAG &DAG = DCI.DAG; 12370 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), 12371 UmlalNode->getOperand(2), AddHi }; 12372 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), 12373 DAG.getVTList(MVT::i32, MVT::i32), Ops); 12374 12375 // Replace the ADDs' nodes uses by the UMAAL node's values. 
12376 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); 12377 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); 12378 12379 // Return original node to notify the driver to stop replacing. 12380 return SDValue(AddeNode, 0); 12381 } 12382 return SDValue(); 12383 } 12384 12385 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, 12386 const ARMSubtarget *Subtarget) { 12387 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) 12388 return SDValue(); 12389 12390 // Check that we have a pair of ADDC and ADDE as operands. 12391 // Both addends of the ADDE must be zero. 12392 SDNode* AddcNode = N->getOperand(2).getNode(); 12393 SDNode* AddeNode = N->getOperand(3).getNode(); 12394 if ((AddcNode->getOpcode() == ARMISD::ADDC) && 12395 (AddeNode->getOpcode() == ARMISD::ADDE) && 12396 isNullConstant(AddeNode->getOperand(0)) && 12397 isNullConstant(AddeNode->getOperand(1)) && 12398 (AddeNode->getOperand(2).getNode() == AddcNode)) 12399 return DAG.getNode(ARMISD::UMAAL, SDLoc(N), 12400 DAG.getVTList(MVT::i32, MVT::i32), 12401 {N->getOperand(0), N->getOperand(1), 12402 AddcNode->getOperand(0), AddcNode->getOperand(1)}); 12403 else 12404 return SDValue(); 12405 } 12406 12407 static SDValue PerformAddcSubcCombine(SDNode *N, 12408 TargetLowering::DAGCombinerInfo &DCI, 12409 const ARMSubtarget *Subtarget) { 12410 SelectionDAG &DAG(DCI.DAG); 12411 12412 if (N->getOpcode() == ARMISD::SUBC) { 12413 // (SUBC (ADDE 0, 0, C), 1) -> C 12414 SDValue LHS = N->getOperand(0); 12415 SDValue RHS = N->getOperand(1); 12416 if (LHS->getOpcode() == ARMISD::ADDE && 12417 isNullConstant(LHS->getOperand(0)) && 12418 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { 12419 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); 12420 } 12421 } 12422 12423 if (Subtarget->isThumb1Only()) { 12424 SDValue RHS = N->getOperand(1); 12425 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 12426 int32_t imm = 
C->getSExtValue(); 12427 if (imm < 0 && imm > std::numeric_limits<int>::min()) { 12428 SDLoc DL(N); 12429 RHS = DAG.getConstant(-imm, DL, MVT::i32); 12430 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC 12431 : ARMISD::ADDC; 12432 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); 12433 } 12434 } 12435 } 12436 12437 return SDValue(); 12438 } 12439 12440 static SDValue PerformAddeSubeCombine(SDNode *N, 12441 TargetLowering::DAGCombinerInfo &DCI, 12442 const ARMSubtarget *Subtarget) { 12443 if (Subtarget->isThumb1Only()) { 12444 SelectionDAG &DAG = DCI.DAG; 12445 SDValue RHS = N->getOperand(1); 12446 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) { 12447 int64_t imm = C->getSExtValue(); 12448 if (imm < 0) { 12449 SDLoc DL(N); 12450 12451 // The with-carry-in form matches bitwise not instead of the negation. 12452 // Effectively, the inverse interpretation of the carry flag already 12453 // accounts for part of the negation. 12454 RHS = DAG.getConstant(~imm, DL, MVT::i32); 12455 12456 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? 
ARMISD::SUBE 12457 : ARMISD::ADDE; 12458 return DAG.getNode(Opcode, DL, N->getVTList(), 12459 N->getOperand(0), RHS, N->getOperand(2)); 12460 } 12461 } 12462 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { 12463 return AddCombineTo64bitMLAL(N, DCI, Subtarget); 12464 } 12465 return SDValue(); 12466 } 12467 12468 static SDValue PerformSELECTCombine(SDNode *N, 12469 TargetLowering::DAGCombinerInfo &DCI, 12470 const ARMSubtarget *Subtarget) { 12471 if (!Subtarget->hasMVEIntegerOps()) 12472 return SDValue(); 12473 12474 SDLoc dl(N); 12475 SDValue SetCC; 12476 SDValue LHS; 12477 SDValue RHS; 12478 ISD::CondCode CC; 12479 SDValue TrueVal; 12480 SDValue FalseVal; 12481 12482 if (N->getOpcode() == ISD::SELECT && 12483 N->getOperand(0)->getOpcode() == ISD::SETCC) { 12484 SetCC = N->getOperand(0); 12485 LHS = SetCC->getOperand(0); 12486 RHS = SetCC->getOperand(1); 12487 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); 12488 TrueVal = N->getOperand(1); 12489 FalseVal = N->getOperand(2); 12490 } else if (N->getOpcode() == ISD::SELECT_CC) { 12491 LHS = N->getOperand(0); 12492 RHS = N->getOperand(1); 12493 CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); 12494 TrueVal = N->getOperand(2); 12495 FalseVal = N->getOperand(3); 12496 } else { 12497 return SDValue(); 12498 } 12499 12500 unsigned int Opcode = 0; 12501 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN || 12502 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) && 12503 (CC == ISD::SETULT || CC == ISD::SETUGT)) { 12504 Opcode = ARMISD::VMINVu; 12505 if (CC == ISD::SETUGT) 12506 std::swap(TrueVal, FalseVal); 12507 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN || 12508 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) && 12509 (CC == ISD::SETLT || CC == ISD::SETGT)) { 12510 Opcode = ARMISD::VMINVs; 12511 if (CC == ISD::SETGT) 12512 std::swap(TrueVal, FalseVal); 12513 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX || 12514 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) && 12515 (CC == 
ISD::SETUGT || CC == ISD::SETULT)) { 12516 Opcode = ARMISD::VMAXVu; 12517 if (CC == ISD::SETULT) 12518 std::swap(TrueVal, FalseVal); 12519 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX || 12520 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) && 12521 (CC == ISD::SETGT || CC == ISD::SETLT)) { 12522 Opcode = ARMISD::VMAXVs; 12523 if (CC == ISD::SETLT) 12524 std::swap(TrueVal, FalseVal); 12525 } else 12526 return SDValue(); 12527 12528 // Normalise to the right hand side being the vector reduction 12529 switch (TrueVal->getOpcode()) { 12530 case ISD::VECREDUCE_UMIN: 12531 case ISD::VECREDUCE_SMIN: 12532 case ISD::VECREDUCE_UMAX: 12533 case ISD::VECREDUCE_SMAX: 12534 std::swap(LHS, RHS); 12535 std::swap(TrueVal, FalseVal); 12536 break; 12537 } 12538 12539 EVT VectorType = FalseVal->getOperand(0).getValueType(); 12540 12541 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 && 12542 VectorType != MVT::v4i32) 12543 return SDValue(); 12544 12545 EVT VectorScalarType = VectorType.getVectorElementType(); 12546 12547 // The values being selected must also be the ones being compared 12548 if (TrueVal != LHS || FalseVal != RHS) 12549 return SDValue(); 12550 12551 EVT LeftType = LHS->getValueType(0); 12552 EVT RightType = RHS->getValueType(0); 12553 12554 // The types must match the reduced type too 12555 if (LeftType != VectorScalarType || RightType != VectorScalarType) 12556 return SDValue(); 12557 12558 // Legalise the scalar to an i32 12559 if (VectorScalarType != MVT::i32) 12560 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 12561 12562 // Generate the reduction as an i32 for legalisation purposes 12563 auto Reduction = 12564 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0)); 12565 12566 // The result isn't actually an i32 so truncate it back to its original type 12567 if (VectorScalarType != MVT::i32) 12568 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction); 12569 12570 return Reduction; 12571 } 12572 12573 
// A special combine for the vqdmulh family of instructions. This is one of the 12574 // potential set of patterns that could patch this instruction. The base pattern 12575 // you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))). 12576 // This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))), 12577 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as 12578 // the max is unnecessary. 12579 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) { 12580 EVT VT = N->getValueType(0); 12581 SDValue Shft; 12582 ConstantSDNode *Clamp; 12583 12584 if (N->getOpcode() == ISD::SMIN) { 12585 Shft = N->getOperand(0); 12586 Clamp = isConstOrConstSplat(N->getOperand(1)); 12587 } else if (N->getOpcode() == ISD::VSELECT) { 12588 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin. 12589 SDValue Cmp = N->getOperand(0); 12590 if (Cmp.getOpcode() != ISD::SETCC || 12591 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT || 12592 Cmp.getOperand(0) != N->getOperand(1) || 12593 Cmp.getOperand(1) != N->getOperand(2)) 12594 return SDValue(); 12595 Shft = N->getOperand(1); 12596 Clamp = isConstOrConstSplat(N->getOperand(2)); 12597 } else 12598 return SDValue(); 12599 12600 if (!Clamp) 12601 return SDValue(); 12602 12603 MVT ScalarType; 12604 int ShftAmt = 0; 12605 switch (Clamp->getSExtValue()) { 12606 case (1 << 7) - 1: 12607 ScalarType = MVT::i8; 12608 ShftAmt = 7; 12609 break; 12610 case (1 << 15) - 1: 12611 ScalarType = MVT::i16; 12612 ShftAmt = 15; 12613 break; 12614 case (1ULL << 31) - 1: 12615 ScalarType = MVT::i32; 12616 ShftAmt = 31; 12617 break; 12618 default: 12619 return SDValue(); 12620 } 12621 12622 if (Shft.getOpcode() != ISD::SRA) 12623 return SDValue(); 12624 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1)); 12625 if (!N1 || N1->getSExtValue() != ShftAmt) 12626 return SDValue(); 12627 12628 SDValue Mul = Shft.getOperand(0); 12629 if (Mul.getOpcode() != 
ISD::MUL) 12630 return SDValue(); 12631 12632 SDValue Ext0 = Mul.getOperand(0); 12633 SDValue Ext1 = Mul.getOperand(1); 12634 if (Ext0.getOpcode() != ISD::SIGN_EXTEND || 12635 Ext1.getOpcode() != ISD::SIGN_EXTEND) 12636 return SDValue(); 12637 EVT VecVT = Ext0.getOperand(0).getValueType(); 12638 if (VecVT != MVT::v4i32 && VecVT != MVT::v8i16 && VecVT != MVT::v16i8) 12639 return SDValue(); 12640 if (Ext1.getOperand(0).getValueType() != VecVT || 12641 VecVT.getScalarType() != ScalarType || 12642 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2) 12643 return SDValue(); 12644 12645 SDLoc DL(Mul); 12646 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, VecVT, Ext0.getOperand(0), 12647 Ext1.getOperand(0)); 12648 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, VQDMULH); 12649 } 12650 12651 static SDValue PerformVSELECTCombine(SDNode *N, 12652 TargetLowering::DAGCombinerInfo &DCI, 12653 const ARMSubtarget *Subtarget) { 12654 if (!Subtarget->hasMVEIntegerOps()) 12655 return SDValue(); 12656 12657 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG)) 12658 return V; 12659 12660 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs). 12661 // 12662 // We need to re-implement this optimization here as the implementation in the 12663 // Target-Independent DAGCombiner does not handle the kind of constant we make 12664 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for 12665 // good reason, allowing truncation there would break other targets). 12666 // 12667 // Currently, this is only done for MVE, as it's the only target that benefits 12668 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL). 12669 if (N->getOperand(0).getOpcode() != ISD::XOR) 12670 return SDValue(); 12671 SDValue XOR = N->getOperand(0); 12672 12673 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s. 
12674 // It is important to check with truncation allowed as the BUILD_VECTORs we 12675 // generate in those situations will truncate their operands. 12676 ConstantSDNode *Const = 12677 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false, 12678 /*AllowTruncation*/ true); 12679 if (!Const || !Const->isOne()) 12680 return SDValue(); 12681 12682 // Rewrite into vselect(cond, rhs, lhs). 12683 SDValue Cond = XOR->getOperand(0); 12684 SDValue LHS = N->getOperand(1); 12685 SDValue RHS = N->getOperand(2); 12686 EVT Type = N->getValueType(0); 12687 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS); 12688 } 12689 12690 static SDValue PerformABSCombine(SDNode *N, 12691 TargetLowering::DAGCombinerInfo &DCI, 12692 const ARMSubtarget *Subtarget) { 12693 SDValue res; 12694 SelectionDAG &DAG = DCI.DAG; 12695 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 12696 12697 if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) 12698 return SDValue(); 12699 12700 if (!TLI.expandABS(N, res, DAG)) 12701 return SDValue(); 12702 12703 return res; 12704 } 12705 12706 /// PerformADDECombine - Target-specific dag combine transform from 12707 /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or 12708 /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL 12709 static SDValue PerformADDECombine(SDNode *N, 12710 TargetLowering::DAGCombinerInfo &DCI, 12711 const ARMSubtarget *Subtarget) { 12712 // Only ARM and Thumb2 support UMLAL/SMLAL. 12713 if (Subtarget->isThumb1Only()) 12714 return PerformAddeSubeCombine(N, DCI, Subtarget); 12715 12716 // Only perform the checks after legalize when the pattern is available. 12717 if (DCI.isBeforeLegalize()) return SDValue(); 12718 12719 return AddCombineTo64bitUMAAL(N, DCI, Subtarget); 12720 } 12721 12722 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with 12723 /// operands N0 and N1. 
This is a helper for PerformADDCombine that is 12724 /// called with the default operands, and if that fails, with commuted 12725 /// operands. 12726 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, 12727 TargetLowering::DAGCombinerInfo &DCI, 12728 const ARMSubtarget *Subtarget){ 12729 // Attempt to create vpadd for this add. 12730 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) 12731 return Result; 12732 12733 // Attempt to create vpaddl for this add. 12734 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) 12735 return Result; 12736 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, 12737 Subtarget)) 12738 return Result; 12739 12740 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) 12741 if (N0.getNode()->hasOneUse()) 12742 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) 12743 return Result; 12744 return SDValue(); 12745 } 12746 12747 static SDValue PerformADDVecReduce(SDNode *N, 12748 TargetLowering::DAGCombinerInfo &DCI, 12749 const ARMSubtarget *Subtarget) { 12750 if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64) 12751 return SDValue(); 12752 12753 SDValue N0 = N->getOperand(0); 12754 SDValue N1 = N->getOperand(1); 12755 12756 // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this 12757 // will look like: 12758 // t1: i32,i32 = ARMISD::VADDLVs x 12759 // t2: i64 = build_pair t1, t1:1 12760 // t3: i64 = add t2, y 12761 // We also need to check for sext / zext and commutitive adds. 
12762 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA, 12763 SDValue NB) { 12764 if (NB->getOpcode() != ISD::BUILD_PAIR) 12765 return SDValue(); 12766 SDValue VecRed = NB->getOperand(0); 12767 if (VecRed->getOpcode() != Opcode || VecRed.getResNo() != 0 || 12768 NB->getOperand(1) != SDValue(VecRed.getNode(), 1)) 12769 return SDValue(); 12770 12771 SDLoc dl(N); 12772 SmallVector<SDValue, 4> Ops; 12773 Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, 12774 DCI.DAG.getConstant(0, dl, MVT::i32))); 12775 Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, 12776 DCI.DAG.getConstant(1, dl, MVT::i32))); 12777 for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++) 12778 Ops.push_back(VecRed->getOperand(i)); 12779 SDValue Red = DCI.DAG.getNode(OpcodeA, dl, 12780 DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops); 12781 return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red, 12782 SDValue(Red.getNode(), 1)); 12783 }; 12784 12785 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1)) 12786 return M; 12787 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1)) 12788 return M; 12789 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0)) 12790 return M; 12791 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0)) 12792 return M; 12793 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1)) 12794 return M; 12795 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1)) 12796 return M; 12797 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0)) 12798 return M; 12799 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0)) 12800 return M; 12801 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1)) 12802 return M; 12803 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1)) 12804 return M; 12805 if (SDValue M = 
MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0)) 12806 return M; 12807 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0)) 12808 return M; 12809 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1)) 12810 return M; 12811 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1)) 12812 return M; 12813 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0)) 12814 return M; 12815 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0)) 12816 return M; 12817 return SDValue(); 12818 } 12819 12820 bool 12821 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, 12822 CombineLevel Level) const { 12823 if (Level == BeforeLegalizeTypes) 12824 return true; 12825 12826 if (N->getOpcode() != ISD::SHL) 12827 return true; 12828 12829 if (Subtarget->isThumb1Only()) { 12830 // Avoid making expensive immediates by commuting shifts. (This logic 12831 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted 12832 // for free.) 12833 if (N->getOpcode() != ISD::SHL) 12834 return true; 12835 SDValue N1 = N->getOperand(0); 12836 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND && 12837 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR) 12838 return true; 12839 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) { 12840 if (Const->getAPIntValue().ult(256)) 12841 return false; 12842 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) && 12843 Const->getAPIntValue().sgt(-256)) 12844 return false; 12845 } 12846 return true; 12847 } 12848 12849 // Turn off commute-with-shift transform after legalization, so it doesn't 12850 // conflict with PerformSHLSimplify. (We could try to detect when 12851 // PerformSHLSimplify would trigger more precisely, but it isn't 12852 // really necessary.) 
12853 return false; 12854 } 12855 12856 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( 12857 const SDNode *N, CombineLevel Level) const { 12858 if (!Subtarget->isThumb1Only()) 12859 return true; 12860 12861 if (Level == BeforeLegalizeTypes) 12862 return true; 12863 12864 return false; 12865 } 12866 12867 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { 12868 if (!Subtarget->hasNEON()) { 12869 if (Subtarget->isThumb1Only()) 12870 return VT.getScalarSizeInBits() <= 32; 12871 return true; 12872 } 12873 return VT.isScalarInteger(); 12874 } 12875 12876 static SDValue PerformSHLSimplify(SDNode *N, 12877 TargetLowering::DAGCombinerInfo &DCI, 12878 const ARMSubtarget *ST) { 12879 // Allow the generic combiner to identify potential bswaps. 12880 if (DCI.isBeforeLegalize()) 12881 return SDValue(); 12882 12883 // DAG combiner will fold: 12884 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) 12885 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2 12886 // Other code patterns that can be also be modified have the following form: 12887 // b + ((a << 1) | 510) 12888 // b + ((a << 1) & 510) 12889 // b + ((a << 1) ^ 510) 12890 // b + ((a << 1) + 510) 12891 12892 // Many instructions can perform the shift for free, but it requires both 12893 // the operands to be registers. If c1 << c2 is too large, a mov immediate 12894 // instruction will needed. So, unfold back to the original pattern if: 12895 // - if c1 and c2 are small enough that they don't require mov imms. 12896 // - the user(s) of the node can perform an shl 12897 12898 // No shifted operands for 16-bit instructions. 12899 if (ST->isThumb() && ST->isThumb1Only()) 12900 return SDValue(); 12901 12902 // Check that all the users could perform the shl themselves. 
12903 for (auto U : N->uses()) { 12904 switch(U->getOpcode()) { 12905 default: 12906 return SDValue(); 12907 case ISD::SUB: 12908 case ISD::ADD: 12909 case ISD::AND: 12910 case ISD::OR: 12911 case ISD::XOR: 12912 case ISD::SETCC: 12913 case ARMISD::CMP: 12914 // Check that the user isn't already using a constant because there 12915 // aren't any instructions that support an immediate operand and a 12916 // shifted operand. 12917 if (isa<ConstantSDNode>(U->getOperand(0)) || 12918 isa<ConstantSDNode>(U->getOperand(1))) 12919 return SDValue(); 12920 12921 // Check that it's not already using a shift. 12922 if (U->getOperand(0).getOpcode() == ISD::SHL || 12923 U->getOperand(1).getOpcode() == ISD::SHL) 12924 return SDValue(); 12925 break; 12926 } 12927 } 12928 12929 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && 12930 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) 12931 return SDValue(); 12932 12933 if (N->getOperand(0).getOpcode() != ISD::SHL) 12934 return SDValue(); 12935 12936 SDValue SHL = N->getOperand(0); 12937 12938 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1)); 12939 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1)); 12940 if (!C1ShlC2 || !C2) 12941 return SDValue(); 12942 12943 APInt C2Int = C2->getAPIntValue(); 12944 APInt C1Int = C1ShlC2->getAPIntValue(); 12945 12946 // Check that performing a lshr will not lose any information. 12947 APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), 12948 C2Int.getBitWidth() - C2->getZExtValue()); 12949 if ((C1Int & Mask) != C1Int) 12950 return SDValue(); 12951 12952 // Shift the first constant. 12953 C1Int.lshrInPlace(C2Int); 12954 12955 // The immediates are encoded as an 8-bit value that can be rotated. 
12956 auto LargeImm = [](const APInt &Imm) { 12957 unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); 12958 return Imm.getBitWidth() - Zeros > 8; 12959 }; 12960 12961 if (LargeImm(C1Int) || LargeImm(C2Int)) 12962 return SDValue(); 12963 12964 SelectionDAG &DAG = DCI.DAG; 12965 SDLoc dl(N); 12966 SDValue X = SHL.getOperand(0); 12967 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, 12968 DAG.getConstant(C1Int, dl, MVT::i32)); 12969 // Shift left to compensate for the lshr of C1Int. 12970 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); 12971 12972 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); 12973 SHL.dump(); N->dump()); 12974 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); 12975 return Res; 12976 } 12977 12978 12979 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. 12980 /// 12981 static SDValue PerformADDCombine(SDNode *N, 12982 TargetLowering::DAGCombinerInfo &DCI, 12983 const ARMSubtarget *Subtarget) { 12984 SDValue N0 = N->getOperand(0); 12985 SDValue N1 = N->getOperand(1); 12986 12987 // Only works one way, because it needs an immediate operand. 12988 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 12989 return Result; 12990 12991 if (SDValue Result = PerformADDVecReduce(N, DCI, Subtarget)) 12992 return Result; 12993 12994 // First try with the default operand order. 12995 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) 12996 return Result; 12997 12998 // If that didn't work, try again with the operands commuted. 12999 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); 13000 } 13001 13002 // Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC) 13003 // providing -X is as cheap as X (currently, just a constant). 
13004 static SDValue PerformSubCSINCCombine(SDNode *N, 13005 TargetLowering::DAGCombinerInfo &DCI) { 13006 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0))) 13007 return SDValue(); 13008 SDValue CSINC = N->getOperand(1); 13009 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse()) 13010 return SDValue(); 13011 13012 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0)); 13013 if (!X) 13014 return SDValue(); 13015 13016 return DCI.DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32, 13017 DCI.DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, 13018 N->getOperand(0), CSINC.getOperand(0)), 13019 CSINC.getOperand(1), CSINC.getOperand(2), 13020 CSINC.getOperand(3)); 13021 } 13022 13023 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. 13024 /// 13025 static SDValue PerformSUBCombine(SDNode *N, 13026 TargetLowering::DAGCombinerInfo &DCI, 13027 const ARMSubtarget *Subtarget) { 13028 SDValue N0 = N->getOperand(0); 13029 SDValue N1 = N->getOperand(1); 13030 13031 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) 13032 if (N1.getNode()->hasOneUse()) 13033 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) 13034 return Result; 13035 13036 if (SDValue R = PerformSubCSINCCombine(N, DCI)) 13037 return R; 13038 13039 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector()) 13040 return SDValue(); 13041 13042 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x)) 13043 // so that we can readily pattern match more mve instructions which can use 13044 // a scalar operand. 
13045 SDValue VDup = N->getOperand(1); 13046 if (VDup->getOpcode() != ARMISD::VDUP) 13047 return SDValue(); 13048 13049 SDValue VMov = N->getOperand(0); 13050 if (VMov->getOpcode() == ISD::BITCAST) 13051 VMov = VMov->getOperand(0); 13052 13053 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov)) 13054 return SDValue(); 13055 13056 SDLoc dl(N); 13057 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32, 13058 DCI.DAG.getConstant(0, dl, MVT::i32), 13059 VDup->getOperand(0)); 13060 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate); 13061 } 13062 13063 /// PerformVMULCombine 13064 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the 13065 /// special multiplier accumulator forwarding. 13066 /// vmul d3, d0, d2 13067 /// vmla d3, d1, d2 13068 /// is faster than 13069 /// vadd d3, d0, d1 13070 /// vmul d3, d3, d2 13071 // However, for (A + B) * (A + B), 13072 // vadd d2, d0, d1 13073 // vmul d3, d0, d2 13074 // vmla d3, d1, d2 13075 // is slower than 13076 // vadd d2, d0, d1 13077 // vmul d3, d2, d2 13078 static SDValue PerformVMULCombine(SDNode *N, 13079 TargetLowering::DAGCombinerInfo &DCI, 13080 const ARMSubtarget *Subtarget) { 13081 if (!Subtarget->hasVMLxForwarding()) 13082 return SDValue(); 13083 13084 SelectionDAG &DAG = DCI.DAG; 13085 SDValue N0 = N->getOperand(0); 13086 SDValue N1 = N->getOperand(1); 13087 unsigned Opcode = N0.getOpcode(); 13088 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 13089 Opcode != ISD::FADD && Opcode != ISD::FSUB) { 13090 Opcode = N1.getOpcode(); 13091 if (Opcode != ISD::ADD && Opcode != ISD::SUB && 13092 Opcode != ISD::FADD && Opcode != ISD::FSUB) 13093 return SDValue(); 13094 std::swap(N0, N1); 13095 } 13096 13097 if (N0 == N1) 13098 return SDValue(); 13099 13100 EVT VT = N->getValueType(0); 13101 SDLoc DL(N); 13102 SDValue N00 = N0->getOperand(0); 13103 SDValue N01 = N0->getOperand(1); 13104 return DAG.getNode(Opcode, DL, VT, 13105 DAG.getNode(ISD::MUL, DL, VT, N00, N1), 13106 
DAG.getNode(ISD::MUL, DL, VT, N01, N1)); 13107 } 13108 13109 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, 13110 const ARMSubtarget *Subtarget) { 13111 EVT VT = N->getValueType(0); 13112 if (VT != MVT::v2i64) 13113 return SDValue(); 13114 13115 SDValue N0 = N->getOperand(0); 13116 SDValue N1 = N->getOperand(1); 13117 13118 auto IsSignExt = [&](SDValue Op) { 13119 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG) 13120 return SDValue(); 13121 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT(); 13122 if (VT.getScalarSizeInBits() == 32) 13123 return Op->getOperand(0); 13124 return SDValue(); 13125 }; 13126 auto IsZeroExt = [&](SDValue Op) { 13127 // Zero extends are a little more awkward. At the point we are matching 13128 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask. 13129 // That might be before of after a bitcast depending on how the and is 13130 // placed. Because this has to look through bitcasts, it is currently only 13131 // supported on LE. 
13132 if (!Subtarget->isLittle()) 13133 return SDValue(); 13134 13135 SDValue And = Op; 13136 if (And->getOpcode() == ISD::BITCAST) 13137 And = And->getOperand(0); 13138 if (And->getOpcode() != ISD::AND) 13139 return SDValue(); 13140 SDValue Mask = And->getOperand(1); 13141 if (Mask->getOpcode() == ISD::BITCAST) 13142 Mask = Mask->getOperand(0); 13143 13144 if (Mask->getOpcode() != ISD::BUILD_VECTOR || 13145 Mask.getValueType() != MVT::v4i32) 13146 return SDValue(); 13147 if (isAllOnesConstant(Mask->getOperand(0)) && 13148 isNullConstant(Mask->getOperand(1)) && 13149 isAllOnesConstant(Mask->getOperand(2)) && 13150 isNullConstant(Mask->getOperand(3))) 13151 return And->getOperand(0); 13152 return SDValue(); 13153 }; 13154 13155 SDLoc dl(N); 13156 if (SDValue Op0 = IsSignExt(N0)) { 13157 if (SDValue Op1 = IsSignExt(N1)) { 13158 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); 13159 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); 13160 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a); 13161 } 13162 } 13163 if (SDValue Op0 = IsZeroExt(N0)) { 13164 if (SDValue Op1 = IsZeroExt(N1)) { 13165 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0); 13166 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1); 13167 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a); 13168 } 13169 } 13170 13171 return SDValue(); 13172 } 13173 13174 static SDValue PerformMULCombine(SDNode *N, 13175 TargetLowering::DAGCombinerInfo &DCI, 13176 const ARMSubtarget *Subtarget) { 13177 SelectionDAG &DAG = DCI.DAG; 13178 13179 EVT VT = N->getValueType(0); 13180 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64) 13181 return PerformMVEVMULLCombine(N, DAG, Subtarget); 13182 13183 if (Subtarget->isThumb1Only()) 13184 return SDValue(); 13185 13186 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13187 return SDValue(); 13188 13189 if (VT.is64BitVector() || VT.is128BitVector()) 13190 return 
PerformVMULCombine(N, DCI, Subtarget); 13191 if (VT != MVT::i32) 13192 return SDValue(); 13193 13194 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13195 if (!C) 13196 return SDValue(); 13197 13198 int64_t MulAmt = C->getSExtValue(); 13199 unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt); 13200 13201 ShiftAmt = ShiftAmt & (32 - 1); 13202 SDValue V = N->getOperand(0); 13203 SDLoc DL(N); 13204 13205 SDValue Res; 13206 MulAmt >>= ShiftAmt; 13207 13208 if (MulAmt >= 0) { 13209 if (isPowerOf2_32(MulAmt - 1)) { 13210 // (mul x, 2^N + 1) => (add (shl x, N), x) 13211 Res = DAG.getNode(ISD::ADD, DL, VT, 13212 V, 13213 DAG.getNode(ISD::SHL, DL, VT, 13214 V, 13215 DAG.getConstant(Log2_32(MulAmt - 1), DL, 13216 MVT::i32))); 13217 } else if (isPowerOf2_32(MulAmt + 1)) { 13218 // (mul x, 2^N - 1) => (sub (shl x, N), x) 13219 Res = DAG.getNode(ISD::SUB, DL, VT, 13220 DAG.getNode(ISD::SHL, DL, VT, 13221 V, 13222 DAG.getConstant(Log2_32(MulAmt + 1), DL, 13223 MVT::i32)), 13224 V); 13225 } else 13226 return SDValue(); 13227 } else { 13228 uint64_t MulAmtAbs = -MulAmt; 13229 if (isPowerOf2_32(MulAmtAbs + 1)) { 13230 // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) 13231 Res = DAG.getNode(ISD::SUB, DL, VT, 13232 V, 13233 DAG.getNode(ISD::SHL, DL, VT, 13234 V, 13235 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, 13236 MVT::i32))); 13237 } else if (isPowerOf2_32(MulAmtAbs - 1)) { 13238 // (mul x, -(2^N + 1)) => - (add (shl x, N), x) 13239 Res = DAG.getNode(ISD::ADD, DL, VT, 13240 V, 13241 DAG.getNode(ISD::SHL, DL, VT, 13242 V, 13243 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, 13244 MVT::i32))); 13245 Res = DAG.getNode(ISD::SUB, DL, VT, 13246 DAG.getConstant(0, DL, MVT::i32), Res); 13247 } else 13248 return SDValue(); 13249 } 13250 13251 if (ShiftAmt != 0) 13252 Res = DAG.getNode(ISD::SHL, DL, VT, 13253 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); 13254 13255 // Do not add new nodes to DAG combiner worklist. 
13256 DCI.CombineTo(N, Res, false); 13257 return SDValue(); 13258 } 13259 13260 static SDValue CombineANDShift(SDNode *N, 13261 TargetLowering::DAGCombinerInfo &DCI, 13262 const ARMSubtarget *Subtarget) { 13263 // Allow DAGCombine to pattern-match before we touch the canonical form. 13264 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 13265 return SDValue(); 13266 13267 if (N->getValueType(0) != MVT::i32) 13268 return SDValue(); 13269 13270 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 13271 if (!N1C) 13272 return SDValue(); 13273 13274 uint32_t C1 = (uint32_t)N1C->getZExtValue(); 13275 // Don't transform uxtb/uxth. 13276 if (C1 == 255 || C1 == 65535) 13277 return SDValue(); 13278 13279 SDNode *N0 = N->getOperand(0).getNode(); 13280 if (!N0->hasOneUse()) 13281 return SDValue(); 13282 13283 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) 13284 return SDValue(); 13285 13286 bool LeftShift = N0->getOpcode() == ISD::SHL; 13287 13288 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 13289 if (!N01C) 13290 return SDValue(); 13291 13292 uint32_t C2 = (uint32_t)N01C->getZExtValue(); 13293 if (!C2 || C2 >= 32) 13294 return SDValue(); 13295 13296 // Clear irrelevant bits in the mask. 13297 if (LeftShift) 13298 C1 &= (-1U << C2); 13299 else 13300 C1 &= (-1U >> C2); 13301 13302 SelectionDAG &DAG = DCI.DAG; 13303 SDLoc DL(N); 13304 13305 // We have a pattern of the form "(and (shl x, c2) c1)" or 13306 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to 13307 // transform to a pair of shifts, to save materializing c1. 13308 13309 // First pattern: right shift, then mask off leading bits. 13310 // FIXME: Use demanded bits? 
13311 if (!LeftShift && isMask_32(C1)) { 13312 uint32_t C3 = countLeadingZeros(C1); 13313 if (C2 < C3) { 13314 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 13315 DAG.getConstant(C3 - C2, DL, MVT::i32)); 13316 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 13317 DAG.getConstant(C3, DL, MVT::i32)); 13318 } 13319 } 13320 13321 // First pattern, reversed: left shift, then mask off trailing bits. 13322 if (LeftShift && isMask_32(~C1)) { 13323 uint32_t C3 = countTrailingZeros(C1); 13324 if (C2 < C3) { 13325 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 13326 DAG.getConstant(C3 - C2, DL, MVT::i32)); 13327 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 13328 DAG.getConstant(C3, DL, MVT::i32)); 13329 } 13330 } 13331 13332 // Second pattern: left shift, then mask off leading bits. 13333 // FIXME: Use demanded bits? 13334 if (LeftShift && isShiftedMask_32(C1)) { 13335 uint32_t Trailing = countTrailingZeros(C1); 13336 uint32_t C3 = countLeadingZeros(C1); 13337 if (Trailing == C2 && C2 + C3 < 32) { 13338 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 13339 DAG.getConstant(C2 + C3, DL, MVT::i32)); 13340 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, 13341 DAG.getConstant(C3, DL, MVT::i32)); 13342 } 13343 } 13344 13345 // Second pattern, reversed: right shift, then mask off trailing bits. 13346 // FIXME: Handle other patterns of known/demanded bits. 13347 if (!LeftShift && isShiftedMask_32(C1)) { 13348 uint32_t Leading = countLeadingZeros(C1); 13349 uint32_t C3 = countTrailingZeros(C1); 13350 if (Leading == C2 && C2 + C3 < 32) { 13351 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0), 13352 DAG.getConstant(C2 + C3, DL, MVT::i32)); 13353 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL, 13354 DAG.getConstant(C3, DL, MVT::i32)); 13355 } 13356 } 13357 13358 // FIXME: Transform "(and (shl x, c2) c1)" -> 13359 // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than 13360 // c1. 
13361 return SDValue(); 13362 } 13363 13364 static SDValue PerformANDCombine(SDNode *N, 13365 TargetLowering::DAGCombinerInfo &DCI, 13366 const ARMSubtarget *Subtarget) { 13367 // Attempt to use immediate-form VBIC 13368 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 13369 SDLoc dl(N); 13370 EVT VT = N->getValueType(0); 13371 SelectionDAG &DAG = DCI.DAG; 13372 13373 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 || 13374 VT == MVT::v8i1 || VT == MVT::v16i1) 13375 return SDValue(); 13376 13377 APInt SplatBits, SplatUndef; 13378 unsigned SplatBitSize; 13379 bool HasAnyUndefs; 13380 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && 13381 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 13382 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || 13383 SplatBitSize == 64) { 13384 EVT VbicVT; 13385 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(), 13386 SplatUndef.getZExtValue(), SplatBitSize, 13387 DAG, dl, VbicVT, VT, OtherModImm); 13388 if (Val.getNode()) { 13389 SDValue Input = 13390 DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); 13391 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); 13392 return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); 13393 } 13394 } 13395 } 13396 13397 if (!Subtarget->isThumb1Only()) { 13398 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) 13399 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) 13400 return Result; 13401 13402 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 13403 return Result; 13404 } 13405 13406 if (Subtarget->isThumb1Only()) 13407 if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) 13408 return Result; 13409 13410 return SDValue(); 13411 } 13412 13413 // Try combining OR nodes to SMULWB, SMULWT. 
13414 static SDValue PerformORCombineToSMULWBT(SDNode *OR, 13415 TargetLowering::DAGCombinerInfo &DCI, 13416 const ARMSubtarget *Subtarget) { 13417 if (!Subtarget->hasV6Ops() || 13418 (Subtarget->isThumb() && 13419 (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) 13420 return SDValue(); 13421 13422 SDValue SRL = OR->getOperand(0); 13423 SDValue SHL = OR->getOperand(1); 13424 13425 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { 13426 SRL = OR->getOperand(1); 13427 SHL = OR->getOperand(0); 13428 } 13429 if (!isSRL16(SRL) || !isSHL16(SHL)) 13430 return SDValue(); 13431 13432 // The first operands to the shifts need to be the two results from the 13433 // same smul_lohi node. 13434 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || 13435 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) 13436 return SDValue(); 13437 13438 SDNode *SMULLOHI = SRL.getOperand(0).getNode(); 13439 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || 13440 SHL.getOperand(0) != SDValue(SMULLOHI, 1)) 13441 return SDValue(); 13442 13443 // Now we have: 13444 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))) 13445 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments. 13446 // For SMUWB the 16-bit value will signed extended somehow. 13447 // For SMULWT only the SRA is required. 
13448 // Check both sides of SMUL_LOHI 13449 SDValue OpS16 = SMULLOHI->getOperand(0); 13450 SDValue OpS32 = SMULLOHI->getOperand(1); 13451 13452 SelectionDAG &DAG = DCI.DAG; 13453 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) { 13454 OpS16 = OpS32; 13455 OpS32 = SMULLOHI->getOperand(0); 13456 } 13457 13458 SDLoc dl(OR); 13459 unsigned Opcode = 0; 13460 if (isS16(OpS16, DAG)) 13461 Opcode = ARMISD::SMULWB; 13462 else if (isSRA16(OpS16)) { 13463 Opcode = ARMISD::SMULWT; 13464 OpS16 = OpS16->getOperand(0); 13465 } 13466 else 13467 return SDValue(); 13468 13469 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16); 13470 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res); 13471 return SDValue(OR, 0); 13472 } 13473 13474 static SDValue PerformORCombineToBFI(SDNode *N, 13475 TargetLowering::DAGCombinerInfo &DCI, 13476 const ARMSubtarget *Subtarget) { 13477 // BFI is only available on V6T2+ 13478 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) 13479 return SDValue(); 13480 13481 EVT VT = N->getValueType(0); 13482 SDValue N0 = N->getOperand(0); 13483 SDValue N1 = N->getOperand(1); 13484 SelectionDAG &DAG = DCI.DAG; 13485 SDLoc DL(N); 13486 // 1) or (and A, mask), val => ARMbfi A, val, mask 13487 // iff (val & mask) == val 13488 // 13489 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 13490 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) 13491 // && mask == ~mask2 13492 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) 13493 // && ~mask == mask2 13494 // (i.e., copy a bitfield value into another bitfield of the same width) 13495 13496 if (VT != MVT::i32) 13497 return SDValue(); 13498 13499 SDValue N00 = N0.getOperand(0); 13500 13501 // The value and the mask need to be constants so we can verify this is 13502 // actually a bitfield set. If the mask is 0xffff, we can do better 13503 // via a movt instruction, so don't use BFI in that case. 
13504 SDValue MaskOp = N0.getOperand(1); 13505 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp); 13506 if (!MaskC) 13507 return SDValue(); 13508 unsigned Mask = MaskC->getZExtValue(); 13509 if (Mask == 0xffff) 13510 return SDValue(); 13511 SDValue Res; 13512 // Case (1): or (and A, mask), val => ARMbfi A, val, mask 13513 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 13514 if (N1C) { 13515 unsigned Val = N1C->getZExtValue(); 13516 if ((Val & ~Mask) != Val) 13517 return SDValue(); 13518 13519 if (ARM::isBitFieldInvertedMask(Mask)) { 13520 Val >>= countTrailingZeros(~Mask); 13521 13522 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, 13523 DAG.getConstant(Val, DL, MVT::i32), 13524 DAG.getConstant(Mask, DL, MVT::i32)); 13525 13526 DCI.CombineTo(N, Res, false); 13527 // Return value from the original node to inform the combiner than N is 13528 // now dead. 13529 return SDValue(N, 0); 13530 } 13531 } else if (N1.getOpcode() == ISD::AND) { 13532 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask 13533 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 13534 if (!N11C) 13535 return SDValue(); 13536 unsigned Mask2 = N11C->getZExtValue(); 13537 13538 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern 13539 // as is to match. 13540 if (ARM::isBitFieldInvertedMask(Mask) && 13541 (Mask == ~Mask2)) { 13542 // The pack halfword instruction works better for masks that fit it, 13543 // so use that when it's available. 13544 if (Subtarget->hasDSP() && 13545 (Mask == 0xffff || Mask == 0xffff0000)) 13546 return SDValue(); 13547 // 2a 13548 unsigned amt = countTrailingZeros(Mask2); 13549 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), 13550 DAG.getConstant(amt, DL, MVT::i32)); 13551 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, 13552 DAG.getConstant(Mask, DL, MVT::i32)); 13553 DCI.CombineTo(N, Res, false); 13554 // Return value from the original node to inform the combiner than N is 13555 // now dead. 
13556 return SDValue(N, 0); 13557 } else if (ARM::isBitFieldInvertedMask(~Mask) && 13558 (~Mask == Mask2)) { 13559 // The pack halfword instruction works better for masks that fit it, 13560 // so use that when it's available. 13561 if (Subtarget->hasDSP() && 13562 (Mask2 == 0xffff || Mask2 == 0xffff0000)) 13563 return SDValue(); 13564 // 2b 13565 unsigned lsb = countTrailingZeros(Mask); 13566 Res = DAG.getNode(ISD::SRL, DL, VT, N00, 13567 DAG.getConstant(lsb, DL, MVT::i32)); 13568 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, 13569 DAG.getConstant(Mask2, DL, MVT::i32)); 13570 DCI.CombineTo(N, Res, false); 13571 // Return value from the original node to inform the combiner than N is 13572 // now dead. 13573 return SDValue(N, 0); 13574 } 13575 } 13576 13577 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && 13578 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) && 13579 ARM::isBitFieldInvertedMask(~Mask)) { 13580 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask 13581 // where lsb(mask) == #shamt and masked bits of B are known zero. 13582 SDValue ShAmt = N00.getOperand(1); 13583 unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue(); 13584 unsigned LSB = countTrailingZeros(Mask); 13585 if (ShAmtC != LSB) 13586 return SDValue(); 13587 13588 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), 13589 DAG.getConstant(~Mask, DL, MVT::i32)); 13590 13591 DCI.CombineTo(N, Res, false); 13592 // Return value from the original node to inform the combiner than N is 13593 // now dead. 
13594 return SDValue(N, 0); 13595 } 13596 13597 return SDValue(); 13598 } 13599 13600 static bool isValidMVECond(unsigned CC, bool IsFloat) { 13601 switch (CC) { 13602 case ARMCC::EQ: 13603 case ARMCC::NE: 13604 case ARMCC::LE: 13605 case ARMCC::GT: 13606 case ARMCC::GE: 13607 case ARMCC::LT: 13608 return true; 13609 case ARMCC::HS: 13610 case ARMCC::HI: 13611 return !IsFloat; 13612 default: 13613 return false; 13614 }; 13615 } 13616 13617 static ARMCC::CondCodes getVCMPCondCode(SDValue N) { 13618 if (N->getOpcode() == ARMISD::VCMP) 13619 return (ARMCC::CondCodes)N->getConstantOperandVal(2); 13620 else if (N->getOpcode() == ARMISD::VCMPZ) 13621 return (ARMCC::CondCodes)N->getConstantOperandVal(1); 13622 else 13623 llvm_unreachable("Not a VCMP/VCMPZ!"); 13624 } 13625 13626 static bool CanInvertMVEVCMP(SDValue N) { 13627 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N)); 13628 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint()); 13629 } 13630 13631 static SDValue PerformORCombine_i1(SDNode *N, 13632 TargetLowering::DAGCombinerInfo &DCI, 13633 const ARMSubtarget *Subtarget) { 13634 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain 13635 // together with predicates 13636 EVT VT = N->getValueType(0); 13637 SDLoc DL(N); 13638 SDValue N0 = N->getOperand(0); 13639 SDValue N1 = N->getOperand(1); 13640 13641 auto IsFreelyInvertable = [&](SDValue V) { 13642 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ) 13643 return CanInvertMVEVCMP(V); 13644 return false; 13645 }; 13646 13647 // At least one operand must be freely invertable. 
13648 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1))) 13649 return SDValue(); 13650 13651 SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT); 13652 SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT); 13653 SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1); 13654 return DCI.DAG.getLogicalNOT(DL, And, VT); 13655 } 13656 13657 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR 13658 static SDValue PerformORCombine(SDNode *N, 13659 TargetLowering::DAGCombinerInfo &DCI, 13660 const ARMSubtarget *Subtarget) { 13661 // Attempt to use immediate-form VORR 13662 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1)); 13663 SDLoc dl(N); 13664 EVT VT = N->getValueType(0); 13665 SelectionDAG &DAG = DCI.DAG; 13666 13667 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13668 return SDValue(); 13669 13670 if (Subtarget->hasMVEIntegerOps() && 13671 (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)) 13672 return PerformORCombine_i1(N, DCI, Subtarget); 13673 13674 APInt SplatBits, SplatUndef; 13675 unsigned SplatBitSize; 13676 bool HasAnyUndefs; 13677 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && 13678 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { 13679 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 || 13680 SplatBitSize == 64) { 13681 EVT VorrVT; 13682 SDValue Val = 13683 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), 13684 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm); 13685 if (Val.getNode()) { 13686 SDValue Input = 13687 DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); 13688 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); 13689 return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); 13690 } 13691 } 13692 } 13693 13694 if (!Subtarget->isThumb1Only()) { 13695 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) 13696 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 13697 
return Result; 13698 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) 13699 return Result; 13700 } 13701 13702 SDValue N0 = N->getOperand(0); 13703 SDValue N1 = N->getOperand(1); 13704 13705 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. 13706 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && 13707 DAG.getTargetLoweringInfo().isTypeLegal(VT)) { 13708 13709 // The code below optimizes (or (and X, Y), Z). 13710 // The AND operand needs to have a single user to make these optimizations 13711 // profitable. 13712 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) 13713 return SDValue(); 13714 13715 APInt SplatUndef; 13716 unsigned SplatBitSize; 13717 bool HasAnyUndefs; 13718 13719 APInt SplatBits0, SplatBits1; 13720 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1)); 13721 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1)); 13722 // Ensure that the second operand of both ands are constants 13723 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, 13724 HasAnyUndefs) && !HasAnyUndefs) { 13725 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, 13726 HasAnyUndefs) && !HasAnyUndefs) { 13727 // Ensure that the bit width of the constants are the same and that 13728 // the splat arguments are logical inverses as per the pattern we 13729 // are trying to simplify. 13730 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && 13731 SplatBits0 == ~SplatBits1) { 13732 // Canonicalize the vector type to make instruction selection 13733 // simpler. 13734 EVT CanonicalVT = VT.is128BitVector() ? 
MVT::v4i32 : MVT::v2i32; 13735 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT, 13736 N0->getOperand(1), 13737 N0->getOperand(0), 13738 N1->getOperand(0)); 13739 return DAG.getNode(ISD::BITCAST, dl, VT, Result); 13740 } 13741 } 13742 } 13743 } 13744 13745 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when 13746 // reasonable. 13747 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { 13748 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) 13749 return Res; 13750 } 13751 13752 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 13753 return Result; 13754 13755 return SDValue(); 13756 } 13757 13758 static SDValue PerformXORCombine(SDNode *N, 13759 TargetLowering::DAGCombinerInfo &DCI, 13760 const ARMSubtarget *Subtarget) { 13761 EVT VT = N->getValueType(0); 13762 SelectionDAG &DAG = DCI.DAG; 13763 13764 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) 13765 return SDValue(); 13766 13767 if (!Subtarget->isThumb1Only()) { 13768 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) 13769 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) 13770 return Result; 13771 13772 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) 13773 return Result; 13774 } 13775 13776 if (Subtarget->hasMVEIntegerOps()) { 13777 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition. 
13778 SDValue N0 = N->getOperand(0); 13779 SDValue N1 = N->getOperand(1); 13780 const TargetLowering *TLI = Subtarget->getTargetLowering(); 13781 if (TLI->isConstTrueVal(N1.getNode()) && 13782 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) { 13783 if (CanInvertMVEVCMP(N0)) { 13784 SDLoc DL(N0); 13785 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0)); 13786 13787 SmallVector<SDValue, 4> Ops; 13788 Ops.push_back(N0->getOperand(0)); 13789 if (N0->getOpcode() == ARMISD::VCMP) 13790 Ops.push_back(N0->getOperand(1)); 13791 Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32)); 13792 return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops); 13793 } 13794 } 13795 } 13796 13797 return SDValue(); 13798 } 13799 13800 // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, 13801 // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and 13802 // their position in "to" (Rd). 13803 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { 13804 assert(N->getOpcode() == ARMISD::BFI); 13805 13806 SDValue From = N->getOperand(1); 13807 ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); 13808 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); 13809 13810 // If the Base came from a SHR #C, we can deduce that it is really testing bit 13811 // #C in the base of the SHR. 13812 if (From->getOpcode() == ISD::SRL && 13813 isa<ConstantSDNode>(From->getOperand(1))) { 13814 APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); 13815 assert(Shift.getLimitedValue() < 32 && "Shift too large!"); 13816 FromMask <<= Shift.getLimitedValue(31); 13817 From = From->getOperand(0); 13818 } 13819 13820 return From; 13821 } 13822 13823 // If A and B contain one contiguous set of bits, does A | B == A . B? 13824 // 13825 // Neither A nor B must be zero. 
13826 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { 13827 unsigned LastActiveBitInA = A.countTrailingZeros(); 13828 unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; 13829 return LastActiveBitInA - 1 == FirstActiveBitInB; 13830 } 13831 13832 static SDValue FindBFIToCombineWith(SDNode *N) { 13833 // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, 13834 // if one exists. 13835 APInt ToMask, FromMask; 13836 SDValue From = ParseBFI(N, ToMask, FromMask); 13837 SDValue To = N->getOperand(0); 13838 13839 // Now check for a compatible BFI to merge with. We can pass through BFIs that 13840 // aren't compatible, but not if they set the same bit in their destination as 13841 // we do (or that of any BFI we're going to combine with). 13842 SDValue V = To; 13843 APInt CombinedToMask = ToMask; 13844 while (V.getOpcode() == ARMISD::BFI) { 13845 APInt NewToMask, NewFromMask; 13846 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); 13847 if (NewFrom != From) { 13848 // This BFI has a different base. Keep going. 13849 CombinedToMask |= NewToMask; 13850 V = V.getOperand(0); 13851 continue; 13852 } 13853 13854 // Do the written bits conflict with any we've seen so far? 13855 if ((NewToMask & CombinedToMask).getBoolValue()) 13856 // Conflicting bits - bail out because going further is unsafe. 13857 return SDValue(); 13858 13859 // Are the new bits contiguous when combined with the old bits? 13860 if (BitsProperlyConcatenate(ToMask, NewToMask) && 13861 BitsProperlyConcatenate(FromMask, NewFromMask)) 13862 return V; 13863 if (BitsProperlyConcatenate(NewToMask, ToMask) && 13864 BitsProperlyConcatenate(NewFromMask, FromMask)) 13865 return V; 13866 13867 // We've seen a write to some bits, so track it. 13868 CombinedToMask |= NewToMask; 13869 // Keep going... 
13870 V = V.getOperand(0); 13871 } 13872 13873 return SDValue(); 13874 } 13875 13876 static SDValue PerformBFICombine(SDNode *N, 13877 TargetLowering::DAGCombinerInfo &DCI) { 13878 SDValue N1 = N->getOperand(1); 13879 if (N1.getOpcode() == ISD::AND) { 13880 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff 13881 // the bits being cleared by the AND are not demanded by the BFI. 13882 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); 13883 if (!N11C) 13884 return SDValue(); 13885 unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 13886 unsigned LSB = countTrailingZeros(~InvMask); 13887 unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; 13888 assert(Width < 13889 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) && 13890 "undefined behavior"); 13891 unsigned Mask = (1u << Width) - 1; 13892 unsigned Mask2 = N11C->getZExtValue(); 13893 if ((Mask & (~Mask2)) == 0) 13894 return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), 13895 N->getOperand(0), N1.getOperand(0), 13896 N->getOperand(2)); 13897 } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { 13898 // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. 13899 // Keep track of any consecutive bits set that all come from the same base 13900 // value. We can combine these together into a single BFI. 13901 SDValue CombineBFI = FindBFIToCombineWith(N); 13902 if (CombineBFI == SDValue()) 13903 return SDValue(); 13904 13905 // We've found a BFI. 13906 APInt ToMask1, FromMask1; 13907 SDValue From1 = ParseBFI(N, ToMask1, FromMask1); 13908 13909 APInt ToMask2, FromMask2; 13910 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); 13911 assert(From1 == From2); 13912 (void)From2; 13913 13914 // First, unlink CombineBFI. 13915 DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); 13916 // Then create a new BFI, combining the two together. 
13917 APInt NewFromMask = FromMask1 | FromMask2; 13918 APInt NewToMask = ToMask1 | ToMask2; 13919 13920 EVT VT = N->getValueType(0); 13921 SDLoc dl(N); 13922 13923 if (NewFromMask[0] == 0) 13924 From1 = DCI.DAG.getNode( 13925 ISD::SRL, dl, VT, From1, 13926 DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); 13927 return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, 13928 DCI.DAG.getConstant(~NewToMask, dl, VT)); 13929 } 13930 return SDValue(); 13931 } 13932 13933 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for 13934 /// ARMISD::VMOVRRD. 13935 static SDValue PerformVMOVRRDCombine(SDNode *N, 13936 TargetLowering::DAGCombinerInfo &DCI, 13937 const ARMSubtarget *Subtarget) { 13938 // vmovrrd(vmovdrr x, y) -> x,y 13939 SDValue InDouble = N->getOperand(0); 13940 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64()) 13941 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); 13942 13943 // vmovrrd(load f64) -> (load i32), (load i32) 13944 SDNode *InNode = InDouble.getNode(); 13945 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && 13946 InNode->getValueType(0) == MVT::f64 && 13947 InNode->getOperand(1).getOpcode() == ISD::FrameIndex && 13948 !cast<LoadSDNode>(InNode)->isVolatile()) { 13949 // TODO: Should this be done for non-FrameIndex operands? 
13950 LoadSDNode *LD = cast<LoadSDNode>(InNode); 13951 13952 SelectionDAG &DAG = DCI.DAG; 13953 SDLoc DL(LD); 13954 SDValue BasePtr = LD->getBasePtr(); 13955 SDValue NewLD1 = 13956 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), 13957 LD->getAlignment(), LD->getMemOperand()->getFlags()); 13958 13959 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 13960 DAG.getConstant(4, DL, MVT::i32)); 13961 13962 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, 13963 LD->getPointerInfo().getWithOffset(4), 13964 std::min(4U, LD->getAlignment()), 13965 LD->getMemOperand()->getFlags()); 13966 13967 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); 13968 if (DCI.DAG.getDataLayout().isBigEndian()) 13969 std::swap (NewLD1, NewLD2); 13970 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); 13971 return Result; 13972 } 13973 13974 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d 13975 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b 13976 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT && 13977 isa<ConstantSDNode>(InDouble.getOperand(1))) { 13978 SDValue BV = InDouble.getOperand(0); 13979 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may 13980 // change lane order under big endian. 13981 bool BVSwap = BV.getOpcode() == ISD::BITCAST; 13982 while ( 13983 (BV.getOpcode() == ISD::BITCAST || 13984 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) && 13985 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) { 13986 BVSwap = BV.getOpcode() == ISD::BITCAST; 13987 BV = BV.getOperand(0); 13988 } 13989 if (BV.getValueType() != MVT::v4i32) 13990 return SDValue(); 13991 13992 // Handle buildvectors, pulling out the correct lane depending on 13993 // endianness. 13994 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 
2 : 0; 13995 if (BV.getOpcode() == ISD::BUILD_VECTOR) { 13996 SDValue Op0 = BV.getOperand(Offset); 13997 SDValue Op1 = BV.getOperand(Offset + 1); 13998 if (!Subtarget->isLittle() && BVSwap) 13999 std::swap(Op0, Op1); 14000 14001 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); 14002 } 14003 14004 // A chain of insert_vectors, grabbing the correct value of the chain of 14005 // inserts. 14006 SDValue Op0, Op1; 14007 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) { 14008 if (isa<ConstantSDNode>(BV.getOperand(2))) { 14009 if (BV.getConstantOperandVal(2) == Offset) 14010 Op0 = BV.getOperand(1); 14011 if (BV.getConstantOperandVal(2) == Offset + 1) 14012 Op1 = BV.getOperand(1); 14013 } 14014 BV = BV.getOperand(0); 14015 } 14016 if (!Subtarget->isLittle() && BVSwap) 14017 std::swap(Op0, Op1); 14018 if (Op0 && Op1) 14019 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N)); 14020 } 14021 14022 return SDValue(); 14023 } 14024 14025 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for 14026 /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. 
14027 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { 14028 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) 14029 SDValue Op0 = N->getOperand(0); 14030 SDValue Op1 = N->getOperand(1); 14031 if (Op0.getOpcode() == ISD::BITCAST) 14032 Op0 = Op0.getOperand(0); 14033 if (Op1.getOpcode() == ISD::BITCAST) 14034 Op1 = Op1.getOperand(0); 14035 if (Op0.getOpcode() == ARMISD::VMOVRRD && 14036 Op0.getNode() == Op1.getNode() && 14037 Op0.getResNo() == 0 && Op1.getResNo() == 1) 14038 return DAG.getNode(ISD::BITCAST, SDLoc(N), 14039 N->getValueType(0), Op0.getOperand(0)); 14040 return SDValue(); 14041 } 14042 14043 static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 14044 SDValue Op0 = N->getOperand(0); 14045 14046 // VMOVhr (VMOVrh (X)) -> X 14047 if (Op0->getOpcode() == ARMISD::VMOVrh) 14048 return Op0->getOperand(0); 14049 14050 // FullFP16: half values are passed in S-registers, and we don't 14051 // need any of the bitcast and moves: 14052 // 14053 // t2: f32,ch = CopyFromReg t0, Register:f32 %0 14054 // t5: i32 = bitcast t2 14055 // t18: f16 = ARMISD::VMOVhr t5 14056 if (Op0->getOpcode() == ISD::BITCAST) { 14057 SDValue Copy = Op0->getOperand(0); 14058 if (Copy.getValueType() == MVT::f32 && 14059 Copy->getOpcode() == ISD::CopyFromReg) { 14060 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)}; 14061 SDValue NewCopy = 14062 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops); 14063 return NewCopy; 14064 } 14065 } 14066 14067 // fold (VMOVhr (load x)) -> (load (f16*)x) 14068 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) { 14069 if (LN0->hasOneUse() && LN0->isUnindexed() && 14070 LN0->getMemoryVT() == MVT::i16) { 14071 SDValue Load = 14072 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(), 14073 LN0->getBasePtr(), LN0->getMemOperand()); 14074 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); 14075 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), 
Load.getValue(1)); 14076 return Load; 14077 } 14078 } 14079 14080 // Only the bottom 16 bits of the source register are used. 14081 APInt DemandedMask = APInt::getLowBitsSet(32, 16); 14082 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 14083 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI)) 14084 return SDValue(N, 0); 14085 14086 return SDValue(); 14087 } 14088 14089 static SDValue PerformVMOVrhCombine(SDNode *N, 14090 TargetLowering::DAGCombinerInfo &DCI) { 14091 SDValue N0 = N->getOperand(0); 14092 EVT VT = N->getValueType(0); 14093 14094 // fold (VMOVrh (fpconst x)) -> const x 14095 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) { 14096 APFloat V = C->getValueAPF(); 14097 return DCI.DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT); 14098 } 14099 14100 // fold (VMOVrh (load x)) -> (zextload (i16*)x) 14101 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { 14102 LoadSDNode *LN0 = cast<LoadSDNode>(N0); 14103 14104 SDValue Load = 14105 DCI.DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), 14106 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand()); 14107 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); 14108 DCI.DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); 14109 return Load; 14110 } 14111 14112 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n) 14113 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 14114 isa<ConstantSDNode>(N0->getOperand(1))) 14115 return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0), 14116 N0->getOperand(1)); 14117 14118 return SDValue(); 14119 } 14120 14121 /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node 14122 /// are normal, non-volatile loads. If so, it is profitable to bitcast an 14123 /// i64 vector to have f64 elements, since the value can then be loaded 14124 /// directly into a VFP register. 
14125 static bool hasNormalLoadOperand(SDNode *N) { 14126 unsigned NumElts = N->getValueType(0).getVectorNumElements(); 14127 for (unsigned i = 0; i < NumElts; ++i) { 14128 SDNode *Elt = N->getOperand(i).getNode(); 14129 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile()) 14130 return true; 14131 } 14132 return false; 14133 } 14134 14135 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for 14136 /// ISD::BUILD_VECTOR. 14137 static SDValue PerformBUILD_VECTORCombine(SDNode *N, 14138 TargetLowering::DAGCombinerInfo &DCI, 14139 const ARMSubtarget *Subtarget) { 14140 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): 14141 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value 14142 // into a pair of GPRs, which is fine when the value is used as a scalar, 14143 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. 14144 SelectionDAG &DAG = DCI.DAG; 14145 if (N->getNumOperands() == 2) 14146 if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) 14147 return RV; 14148 14149 // Load i64 elements as f64 values so that type legalization does not split 14150 // them up into i32 values. 14151 EVT VT = N->getValueType(0); 14152 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) 14153 return SDValue(); 14154 SDLoc dl(N); 14155 SmallVector<SDValue, 8> Ops; 14156 unsigned NumElts = VT.getVectorNumElements(); 14157 for (unsigned i = 0; i < NumElts; ++i) { 14158 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); 14159 Ops.push_back(V); 14160 // Make the DAGCombiner fold the bitcast. 14161 DCI.AddToWorklist(V.getNode()); 14162 } 14163 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); 14164 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); 14165 return DAG.getNode(ISD::BITCAST, dl, VT, BV); 14166 } 14167 14168 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. 
14169 static SDValue 14170 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 14171 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. 14172 // At that time, we may have inserted bitcasts from integer to float. 14173 // If these bitcasts have survived DAGCombine, change the lowering of this 14174 // BUILD_VECTOR in something more vector friendly, i.e., that does not 14175 // force to use floating point types. 14176 14177 // Make sure we can change the type of the vector. 14178 // This is possible iff: 14179 // 1. The vector is only used in a bitcast to a integer type. I.e., 14180 // 1.1. Vector is used only once. 14181 // 1.2. Use is a bit convert to an integer type. 14182 // 2. The size of its operands are 32-bits (64-bits are not legal). 14183 EVT VT = N->getValueType(0); 14184 EVT EltVT = VT.getVectorElementType(); 14185 14186 // Check 1.1. and 2. 14187 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) 14188 return SDValue(); 14189 14190 // By construction, the input type must be float. 14191 assert(EltVT == MVT::f32 && "Unexpected type!"); 14192 14193 // Check 1.2. 14194 SDNode *Use = *N->use_begin(); 14195 if (Use->getOpcode() != ISD::BITCAST || 14196 Use->getValueType(0).isFloatingPoint()) 14197 return SDValue(); 14198 14199 // Check profitability. 14200 // Model is, if more than half of the relevant operands are bitcast from 14201 // i32, turn the build_vector into a sequence of insert_vector_elt. 14202 // Relevant operands are everything that is not statically 14203 // (i.e., at compile time) bitcasted. 14204 unsigned NumOfBitCastedElts = 0; 14205 unsigned NumElts = VT.getVectorNumElements(); 14206 unsigned NumOfRelevantElts = NumElts; 14207 for (unsigned Idx = 0; Idx < NumElts; ++Idx) { 14208 SDValue Elt = N->getOperand(Idx); 14209 if (Elt->getOpcode() == ISD::BITCAST) { 14210 // Assume only bit cast to i32 will go away. 
14211 if (Elt->getOperand(0).getValueType() == MVT::i32) 14212 ++NumOfBitCastedElts; 14213 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt)) 14214 // Constants are statically casted, thus do not count them as 14215 // relevant operands. 14216 --NumOfRelevantElts; 14217 } 14218 14219 // Check if more than half of the elements require a non-free bitcast. 14220 if (NumOfBitCastedElts <= NumOfRelevantElts / 2) 14221 return SDValue(); 14222 14223 SelectionDAG &DAG = DCI.DAG; 14224 // Create the new vector type. 14225 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); 14226 // Check if the type is legal. 14227 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14228 if (!TLI.isTypeLegal(VecVT)) 14229 return SDValue(); 14230 14231 // Combine: 14232 // ARMISD::BUILD_VECTOR E1, E2, ..., EN. 14233 // => BITCAST INSERT_VECTOR_ELT 14234 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), 14235 // (BITCAST EN), N. 14236 SDValue Vec = DAG.getUNDEF(VecVT); 14237 SDLoc dl(N); 14238 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { 14239 SDValue V = N->getOperand(Idx); 14240 if (V.isUndef()) 14241 continue; 14242 if (V.getOpcode() == ISD::BITCAST && 14243 V->getOperand(0).getValueType() == MVT::i32) 14244 // Fold obvious case. 14245 V = V.getOperand(0); 14246 else { 14247 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 14248 // Make the DAGCombiner fold the bitcasts. 14249 DCI.AddToWorklist(V.getNode()); 14250 } 14251 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); 14252 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); 14253 } 14254 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); 14255 // Make the DAGCombiner fold the bitcasts. 
14256 DCI.AddToWorklist(Vec.getNode()); 14257 return Vec; 14258 } 14259 14260 static SDValue 14261 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 14262 EVT VT = N->getValueType(0); 14263 SDValue Op = N->getOperand(0); 14264 SDLoc dl(N); 14265 14266 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x) 14267 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) { 14268 // If the valuetypes are the same, we can remove the cast entirely. 14269 if (Op->getOperand(0).getValueType() == VT) 14270 return Op->getOperand(0); 14271 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); 14272 } 14273 14274 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce 14275 // more VPNOT which might get folded as else predicates. 14276 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) { 14277 SDValue X = 14278 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0)); 14279 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, 14280 DCI.DAG.getConstant(65535, dl, MVT::i32)); 14281 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C); 14282 } 14283 14284 // Only the bottom 16 bits of the source register are used. 
14285 if (Op.getValueType() == MVT::i32) { 14286 APInt DemandedMask = APInt::getLowBitsSet(32, 16); 14287 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo(); 14288 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI)) 14289 return SDValue(N, 0); 14290 } 14291 return SDValue(); 14292 } 14293 14294 static SDValue 14295 PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, 14296 const ARMSubtarget *ST) { 14297 EVT VT = N->getValueType(0); 14298 SDValue Op = N->getOperand(0); 14299 SDLoc dl(N); 14300 14301 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST 14302 if (ST->isLittle()) 14303 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op); 14304 14305 // VECTOR_REG_CAST undef -> undef 14306 if (Op.isUndef()) 14307 return DCI.DAG.getUNDEF(VT); 14308 14309 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x) 14310 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) { 14311 // If the valuetypes are the same, we can remove the cast entirely. 14312 if (Op->getOperand(0).getValueType() == VT) 14313 return Op->getOperand(0); 14314 return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0)); 14315 } 14316 14317 return SDValue(); 14318 } 14319 14320 static SDValue PerformVCMPCombine(SDNode *N, 14321 TargetLowering::DAGCombinerInfo &DCI, 14322 const ARMSubtarget *Subtarget) { 14323 if (!Subtarget->hasMVEIntegerOps()) 14324 return SDValue(); 14325 14326 EVT VT = N->getValueType(0); 14327 SDValue Op0 = N->getOperand(0); 14328 SDValue Op1 = N->getOperand(1); 14329 ARMCC::CondCodes Cond = 14330 (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); 14331 SDLoc dl(N); 14332 14333 // vcmp X, 0, cc -> vcmpz X, cc 14334 if (isZeroVector(Op1)) 14335 return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, 14336 N->getOperand(2)); 14337 14338 unsigned SwappedCond = getSwappedCondition(Cond); 14339 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { 14340 // vcmp 0, X, cc -> vcmpz X, reversed(cc) 14341 if 
(isZeroVector(Op0)) 14342 return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, 14343 DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); 14344 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) 14345 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) 14346 return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0, 14347 DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); 14348 } 14349 14350 return SDValue(); 14351 } 14352 14353 /// PerformInsertEltCombine - Target-specific dag combine xforms for 14354 /// ISD::INSERT_VECTOR_ELT. 14355 static SDValue PerformInsertEltCombine(SDNode *N, 14356 TargetLowering::DAGCombinerInfo &DCI) { 14357 // Bitcast an i64 load inserted into a vector to f64. 14358 // Otherwise, the i64 value will be legalized to a pair of i32 values. 14359 EVT VT = N->getValueType(0); 14360 SDNode *Elt = N->getOperand(1).getNode(); 14361 if (VT.getVectorElementType() != MVT::i64 || 14362 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile()) 14363 return SDValue(); 14364 14365 SelectionDAG &DAG = DCI.DAG; 14366 SDLoc dl(N); 14367 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 14368 VT.getVectorNumElements()); 14369 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); 14370 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); 14371 // Make the DAGCombiner fold the bitcasts. 14372 DCI.AddToWorklist(Vec.getNode()); 14373 DCI.AddToWorklist(V.getNode()); 14374 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, 14375 Vec, V, N->getOperand(2)); 14376 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); 14377 } 14378 14379 // Convert a pair of extracts from the same base vector to a VMOVRRD. Either 14380 // directly or bitcast to an integer if the original is a float vector. 
14381 // extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2) 14382 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2) 14383 static SDValue 14384 PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { 14385 EVT VT = N->getValueType(0); 14386 SDLoc dl(N); 14387 14388 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 || 14389 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64)) 14390 return SDValue(); 14391 14392 SDValue Ext = SDValue(N, 0); 14393 if (Ext.getOpcode() == ISD::BITCAST && 14394 Ext.getOperand(0).getValueType() == MVT::f32) 14395 Ext = Ext.getOperand(0); 14396 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT || 14397 !isa<ConstantSDNode>(Ext.getOperand(1)) || 14398 Ext.getConstantOperandVal(1) % 2 != 0) 14399 return SDValue(); 14400 if (Ext->use_size() == 1 && 14401 (Ext->use_begin()->getOpcode() == ISD::SINT_TO_FP || 14402 Ext->use_begin()->getOpcode() == ISD::UINT_TO_FP)) 14403 return SDValue(); 14404 14405 SDValue Op0 = Ext.getOperand(0); 14406 EVT VecVT = Op0.getValueType(); 14407 unsigned Lane = Ext.getConstantOperandVal(1); 14408 if (VecVT.getVectorNumElements() != 4) 14409 return SDValue(); 14410 14411 // Find another extract, of Lane + 1 14412 auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) { 14413 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT && 14414 isa<ConstantSDNode>(V->getOperand(1)) && 14415 V->getConstantOperandVal(1) == Lane + 1; 14416 }); 14417 if (OtherIt == Op0->uses().end()) 14418 return SDValue(); 14419 14420 // For float extracts, we need to be converting to a i32 for both vector 14421 // lanes. 
14422 SDValue OtherExt(*OtherIt, 0); 14423 if (OtherExt.getValueType() != MVT::i32) { 14424 if (OtherExt->use_size() != 1 || 14425 OtherExt->use_begin()->getOpcode() != ISD::BITCAST || 14426 OtherExt->use_begin()->getValueType(0) != MVT::i32) 14427 return SDValue(); 14428 OtherExt = SDValue(*OtherExt->use_begin(), 0); 14429 } 14430 14431 // Convert the type to a f64 and extract with a VMOVRRD. 14432 SDValue F64 = DCI.DAG.getNode( 14433 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 14434 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0), 14435 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32)); 14436 SDValue VMOVRRD = 14437 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64); 14438 14439 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1)); 14440 return VMOVRRD; 14441 } 14442 14443 static SDValue PerformExtractEltCombine(SDNode *N, 14444 TargetLowering::DAGCombinerInfo &DCI, 14445 const ARMSubtarget *ST) { 14446 SDValue Op0 = N->getOperand(0); 14447 EVT VT = N->getValueType(0); 14448 SDLoc dl(N); 14449 14450 // extract (vdup x) -> x 14451 if (Op0->getOpcode() == ARMISD::VDUP) { 14452 SDValue X = Op0->getOperand(0); 14453 if (VT == MVT::f16 && X.getValueType() == MVT::i32) 14454 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X); 14455 if (VT == MVT::i32 && X.getValueType() == MVT::f16) 14456 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X); 14457 14458 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST) 14459 X = X->getOperand(0); 14460 if (X.getValueType() == VT) 14461 return X; 14462 } 14463 14464 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b 14465 if (Op0.getValueType() == MVT::v4i32 && 14466 isa<ConstantSDNode>(N->getOperand(1)) && 14467 Op0.getOpcode() == ISD::BITCAST && 14468 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && 14469 Op0.getOperand(0).getValueType() == MVT::v2f64) { 14470 SDValue BV = Op0.getOperand(0); 14471 unsigned Offset = N->getConstantOperandVal(1); 14472 
SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1); 14473 if (MOV.getOpcode() == ARMISD::VMOVDRR) 14474 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2); 14475 } 14476 14477 // extract x, n; extract x, n+1 -> VMOVRRD x 14478 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) 14479 return R; 14480 14481 return SDValue(); 14482 } 14483 14484 static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) { 14485 SDValue Op = N->getOperand(0); 14486 EVT VT = N->getValueType(0); 14487 14488 // sext_inreg(VGETLANEu) -> VGETLANEs 14489 if (Op.getOpcode() == ARMISD::VGETLANEu && 14490 cast<VTSDNode>(N->getOperand(1))->getVT() == 14491 Op.getOperand(0).getValueType().getScalarType()) 14492 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0), 14493 Op.getOperand(1)); 14494 14495 return SDValue(); 14496 } 14497 14498 // When lowering complex nodes that we recognize, like VQDMULH and MULH, we 14499 // can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to 14500 // binop as the shuffles cancel out. 14501 static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) { 14502 EVT VT = N->getValueType(0); 14503 if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT) 14504 return SDValue(); 14505 SDValue Op = N->getOperand(0); 14506 14507 // Looking for binary operators that will have been folded from 14508 // truncates/extends. 
14509 switch (Op.getOpcode()) { 14510 case ARMISD::VQDMULH: 14511 case ISD::MULHS: 14512 case ISD::MULHU: 14513 break; 14514 default: 14515 return SDValue(); 14516 } 14517 14518 ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0)); 14519 ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1)); 14520 if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() || 14521 !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() || 14522 Op0->getOperand(0).getValueType() != VT) 14523 return SDValue(); 14524 14525 // Check the mask turns into an identity shuffle. 14526 ArrayRef<int> NMask = N->getMask(); 14527 ArrayRef<int> OpMask = Op0->getMask(); 14528 for (int i = 0, e = NMask.size(); i != e; i++) { 14529 if (NMask[i] > 0 && OpMask[NMask[i]] > 0 && OpMask[NMask[i]] != i) 14530 return SDValue(); 14531 } 14532 14533 return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), 14534 Op0->getOperand(0), Op1->getOperand(0)); 14535 } 14536 14537 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for 14538 /// ISD::VECTOR_SHUFFLE. 14539 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { 14540 if (SDValue R = FlattenVectorShuffle(cast<ShuffleVectorSDNode>(N), DAG)) 14541 return R; 14542 14543 // The LLVM shufflevector instruction does not require the shuffle mask 14544 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does 14545 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the 14546 // operands do not match the mask length, they are extended by concatenating 14547 // them with undef vectors. That is probably the right thing for other 14548 // targets, but for NEON it is better to concatenate two double-register 14549 // size vector operands into a single quad-register size vector. 
Do that 14550 // transformation here: 14551 // shuffle(concat(v1, undef), concat(v2, undef)) -> 14552 // shuffle(concat(v1, v2), undef) 14553 SDValue Op0 = N->getOperand(0); 14554 SDValue Op1 = N->getOperand(1); 14555 if (Op0.getOpcode() != ISD::CONCAT_VECTORS || 14556 Op1.getOpcode() != ISD::CONCAT_VECTORS || 14557 Op0.getNumOperands() != 2 || 14558 Op1.getNumOperands() != 2) 14559 return SDValue(); 14560 SDValue Concat0Op1 = Op0.getOperand(1); 14561 SDValue Concat1Op1 = Op1.getOperand(1); 14562 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) 14563 return SDValue(); 14564 // Skip the transformation if any of the types are illegal. 14565 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 14566 EVT VT = N->getValueType(0); 14567 if (!TLI.isTypeLegal(VT) || 14568 !TLI.isTypeLegal(Concat0Op1.getValueType()) || 14569 !TLI.isTypeLegal(Concat1Op1.getValueType())) 14570 return SDValue(); 14571 14572 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, 14573 Op0.getOperand(0), Op1.getOperand(0)); 14574 // Translate the shuffle mask. 14575 SmallVector<int, 16> NewMask; 14576 unsigned NumElts = VT.getVectorNumElements(); 14577 unsigned HalfElts = NumElts/2; 14578 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 14579 for (unsigned n = 0; n < NumElts; ++n) { 14580 int MaskElt = SVN->getMaskElt(n); 14581 int NewElt = -1; 14582 if (MaskElt < (int)HalfElts) 14583 NewElt = MaskElt; 14584 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) 14585 NewElt = HalfElts + MaskElt - NumElts; 14586 NewMask.push_back(NewElt); 14587 } 14588 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, 14589 DAG.getUNDEF(VT), NewMask); 14590 } 14591 14592 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, 14593 /// NEON load/store intrinsics, and generic vector load/stores, to merge 14594 /// base address updates. 14595 /// For generic load/stores, the memory type is assumed to be a vector. 
14596 /// The caller is assumed to have checked legality. 14597 static SDValue CombineBaseUpdate(SDNode *N, 14598 TargetLowering::DAGCombinerInfo &DCI) { 14599 SelectionDAG &DAG = DCI.DAG; 14600 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || 14601 N->getOpcode() == ISD::INTRINSIC_W_CHAIN); 14602 const bool isStore = N->getOpcode() == ISD::STORE; 14603 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); 14604 SDValue Addr = N->getOperand(AddrOpIdx); 14605 MemSDNode *MemN = cast<MemSDNode>(N); 14606 SDLoc dl(N); 14607 14608 // Search for a use of the address operand that is an increment. 14609 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), 14610 UE = Addr.getNode()->use_end(); UI != UE; ++UI) { 14611 SDNode *User = *UI; 14612 if (User->getOpcode() != ISD::ADD || 14613 UI.getUse().getResNo() != Addr.getResNo()) 14614 continue; 14615 14616 // Check that the add is independent of the load/store. Otherwise, folding 14617 // it would create a cycle. We can avoid searching through Addr as it's a 14618 // predecessor to both. 14619 SmallPtrSet<const SDNode *, 32> Visited; 14620 SmallVector<const SDNode *, 16> Worklist; 14621 Visited.insert(Addr.getNode()); 14622 Worklist.push_back(N); 14623 Worklist.push_back(User); 14624 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || 14625 SDNode::hasPredecessorHelper(User, Visited, Worklist)) 14626 continue; 14627 14628 // Find the new opcode for the updating load/store. 14629 bool isLoadOp = true; 14630 bool isLaneOp = false; 14631 // Workaround for vst1x and vld1x which do not have alignment operand. 
14632 bool hasAlignment = true; 14633 unsigned NewOpc = 0; 14634 unsigned NumVecs = 0; 14635 if (isIntrinsic) { 14636 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); 14637 switch (IntNo) { 14638 default: llvm_unreachable("unexpected intrinsic for Neon base update"); 14639 case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; 14640 NumVecs = 1; break; 14641 case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; 14642 NumVecs = 2; break; 14643 case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; 14644 NumVecs = 3; break; 14645 case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; 14646 NumVecs = 4; break; 14647 case Intrinsic::arm_neon_vld1x2: 14648 case Intrinsic::arm_neon_vld1x3: 14649 case Intrinsic::arm_neon_vld1x4: 14650 case Intrinsic::arm_neon_vld2dup: 14651 case Intrinsic::arm_neon_vld3dup: 14652 case Intrinsic::arm_neon_vld4dup: 14653 // TODO: Support updating VLD1x and VLDxDUP nodes. For now, we just skip 14654 // combining base updates for such intrinsics. 
14655 continue; 14656 case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; 14657 NumVecs = 2; isLaneOp = true; break; 14658 case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; 14659 NumVecs = 3; isLaneOp = true; break; 14660 case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; 14661 NumVecs = 4; isLaneOp = true; break; 14662 case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; 14663 NumVecs = 1; isLoadOp = false; break; 14664 case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; 14665 NumVecs = 2; isLoadOp = false; break; 14666 case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; 14667 NumVecs = 3; isLoadOp = false; break; 14668 case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; 14669 NumVecs = 4; isLoadOp = false; break; 14670 case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; 14671 NumVecs = 2; isLoadOp = false; isLaneOp = true; break; 14672 case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; 14673 NumVecs = 3; isLoadOp = false; isLaneOp = true; break; 14674 case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; 14675 NumVecs = 4; isLoadOp = false; isLaneOp = true; break; 14676 case Intrinsic::arm_neon_vst1x2: NewOpc = ARMISD::VST1x2_UPD; 14677 NumVecs = 2; isLoadOp = false; hasAlignment = false; break; 14678 case Intrinsic::arm_neon_vst1x3: NewOpc = ARMISD::VST1x3_UPD; 14679 NumVecs = 3; isLoadOp = false; hasAlignment = false; break; 14680 case Intrinsic::arm_neon_vst1x4: NewOpc = ARMISD::VST1x4_UPD; 14681 NumVecs = 4; isLoadOp = false; hasAlignment = false; break; 14682 } 14683 } else { 14684 isLaneOp = true; 14685 switch (N->getOpcode()) { 14686 default: llvm_unreachable("unexpected opcode for Neon base update"); 14687 case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; 14688 case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; 14689 case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; 14690 case ARMISD::VLD4DUP: 
NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; 14691 case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; 14692 NumVecs = 1; isLaneOp = false; break; 14693 case ISD::STORE: NewOpc = ARMISD::VST1_UPD; 14694 NumVecs = 1; isLaneOp = false; isLoadOp = false; break; 14695 } 14696 } 14697 14698 // Find the size of memory referenced by the load/store. 14699 EVT VecTy; 14700 if (isLoadOp) { 14701 VecTy = N->getValueType(0); 14702 } else if (isIntrinsic) { 14703 VecTy = N->getOperand(AddrOpIdx+1).getValueType(); 14704 } else { 14705 assert(isStore && "Node has to be a load, a store, or an intrinsic!"); 14706 VecTy = N->getOperand(1).getValueType(); 14707 } 14708 14709 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; 14710 if (isLaneOp) 14711 NumBytes /= VecTy.getVectorNumElements(); 14712 14713 // If the increment is a constant, it must match the memory ref size. 14714 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); 14715 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode()); 14716 if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { 14717 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two 14718 // separate instructions that make it harder to use a non-constant update. 14719 continue; 14720 } 14721 14722 // OK, we found an ADD we can fold into the base update. 14723 // Now, create a _UPD node, taking care of not breaking alignment. 14724 14725 EVT AlignedVecTy = VecTy; 14726 unsigned Alignment = MemN->getAlignment(); 14727 14728 // If this is a less-than-standard-aligned load/store, change the type to 14729 // match the standard alignment. 14730 // The alignment is overlooked when selecting _UPD variants; and it's 14731 // easier to introduce bitcasts here than fix that. 14732 // There are 3 ways to get to this base-update combine: 14733 // - intrinsics: they are assumed to be properly aligned (to the standard 14734 // alignment of the memory type), so we don't need to do anything. 
14735 // - ARMISD::VLDx nodes: they are only generated from the aforementioned 14736 // intrinsics, so, likewise, there's nothing to do. 14737 // - generic load/store instructions: the alignment is specified as an 14738 // explicit operand, rather than implicitly as the standard alignment 14739 // of the memory type (like the intrisics). We need to change the 14740 // memory type to match the explicit alignment. That way, we don't 14741 // generate non-standard-aligned ARMISD::VLDx nodes. 14742 if (isa<LSBaseSDNode>(N)) { 14743 if (Alignment == 0) 14744 Alignment = 1; 14745 if (Alignment < VecTy.getScalarSizeInBits() / 8) { 14746 MVT EltTy = MVT::getIntegerVT(Alignment * 8); 14747 assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); 14748 assert(!isLaneOp && "Unexpected generic load/store lane."); 14749 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); 14750 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); 14751 } 14752 // Don't set an explicit alignment on regular load/stores that we want 14753 // to transform to VLD/VST 1_UPD nodes. 14754 // This matches the behavior of regular load/stores, which only get an 14755 // explicit alignment if the MMO alignment is larger than the standard 14756 // alignment of the memory type. 14757 // Intrinsics, however, always get an explicit alignment, set to the 14758 // alignment of the MMO. 14759 Alignment = 1; 14760 } 14761 14762 // Create the new updating load/store node. 14763 // First, create an SDVTList for the new updating node's results. 14764 EVT Tys[6]; 14765 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); 14766 unsigned n; 14767 for (n = 0; n < NumResultVecs; ++n) 14768 Tys[n] = AlignedVecTy; 14769 Tys[n++] = MVT::i32; 14770 Tys[n] = MVT::Other; 14771 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); 14772 14773 // Then, gather the new node's operands. 
14774 SmallVector<SDValue, 8> Ops; 14775 Ops.push_back(N->getOperand(0)); // incoming chain 14776 Ops.push_back(N->getOperand(AddrOpIdx)); 14777 Ops.push_back(Inc); 14778 14779 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) { 14780 // Try to match the intrinsic's signature 14781 Ops.push_back(StN->getValue()); 14782 } else { 14783 // Loads (and of course intrinsics) match the intrinsics' signature, 14784 // so just add all but the alignment operand. 14785 unsigned LastOperand = 14786 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands(); 14787 for (unsigned i = AddrOpIdx + 1; i < LastOperand; ++i) 14788 Ops.push_back(N->getOperand(i)); 14789 } 14790 14791 // For all node types, the alignment operand is always the last one. 14792 Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); 14793 14794 // If this is a non-standard-aligned STORE, the penultimate operand is the 14795 // stored value. Bitcast it to the aligned type. 14796 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { 14797 SDValue &StVal = Ops[Ops.size()-2]; 14798 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); 14799 } 14800 14801 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; 14802 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, 14803 MemN->getMemOperand()); 14804 14805 // Update the uses. 14806 SmallVector<SDValue, 5> NewResults; 14807 for (unsigned i = 0; i < NumResultVecs; ++i) 14808 NewResults.push_back(SDValue(UpdN.getNode(), i)); 14809 14810 // If this is an non-standard-aligned LOAD, the first result is the loaded 14811 // value. Bitcast it to the expected result type. 
if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
      SDValue &LdVal = NewResults[0];
      LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
    }

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }
  return SDValue();
}

/// PerformVLDCombine - Forward \p N to CombineBaseUpdate, unless the combiner
/// is running before legalization or was invoked by the legalizer, in which
/// case nothing is combined and an empty SDValue is returned.
static SDValue PerformVLDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  return CombineBaseUpdate(N, DCI);
}

/// PerformMVEVLDCombine - Fold an ADD of the address operand of an MVE
/// vld2q/vld4q/vst2q/vst4q intrinsic node into the memory operation, producing
/// the corresponding ARMISD::VLDn_UPD / ARMISD::VSTn_UPD node.
///
/// The fold only happens when the ADD's other operand is a constant equal to
/// the number of bytes accessed (NumVecs * vector size) and the ADD is
/// independent of \p N. The uses of \p N and of the ADD are rewritten through
/// \p DCI; the function itself always returns an empty SDValue.
static SDValue PerformMVEVLDCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDValue Addr = N->getOperand(2);
  MemSDNode *MemN = cast<MemSDNode>(N);
  SDLoc dl(N);

  // For the stores, where there are multiple intrinsics, we only actually want
  // to post-inc the last of them: operand 5 must be 1 for vst2q and operand 7
  // must be 3 for vst4q.
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_mve_vst2q &&
      cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
    return SDValue();
  if (IntNo == Intrinsic::arm_mve_vst4q &&
      cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
    return SDValue();

  // Search for a use of the address operand that is an increment.
  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
                            UE = Addr.getNode()->use_end();
       UI != UE; ++UI) {
    SDNode *User = *UI;
    if (User->getOpcode() != ISD::ADD ||
        UI.getUse().getResNo() != Addr.getResNo())
      continue;

    // Check that the add is independent of the load/store.  Otherwise, folding
    // it would create a cycle. We can avoid searching through Addr as it's a
    // predecessor to both.
    SmallPtrSet<const SDNode *, 32> Visited;
    SmallVector<const SDNode *, 16> Worklist;
    Visited.insert(Addr.getNode());
    Worklist.push_back(N);
    Worklist.push_back(User);
    if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
        SDNode::hasPredecessorHelper(User, Visited, Worklist))
      continue;

    // Find the new opcode for the updating load/store.
    bool isLoadOp = true;
    unsigned NewOpc = 0;
    unsigned NumVecs = 0;
    switch (IntNo) {
    default:
      llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
    case Intrinsic::arm_mve_vld2q:
      NewOpc = ARMISD::VLD2_UPD;
      NumVecs = 2;
      break;
    case Intrinsic::arm_mve_vld4q:
      NewOpc = ARMISD::VLD4_UPD;
      NumVecs = 4;
      break;
    case Intrinsic::arm_mve_vst2q:
      NewOpc = ARMISD::VST2_UPD;
      NumVecs = 2;
      isLoadOp = false;
      break;
    case Intrinsic::arm_mve_vst4q:
      NewOpc = ARMISD::VST4_UPD;
      NumVecs = 4;
      isLoadOp = false;
      break;
    }

    // Find the size of memory referenced by the load/store. Loads take the
    // vector type from result 0, stores from the first stored operand.
    EVT VecTy;
    if (isLoadOp) {
      VecTy = N->getValueType(0);
    } else {
      VecTy = N->getOperand(3).getValueType();
    }

    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;

    // The increment (the ADD operand that is not Addr) must be a constant that
    // matches the memory ref size.
    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
    ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
    if (!CInc || CInc->getZExtValue() != NumBytes)
      continue;

    // Create the new updating load/store node.
    // First, create an SDVTList for the new updating node's results: the
    // loaded vectors (if any), the i32 written-back address, then the chain.
    EVT Tys[6];
    unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
    unsigned n;
    for (n = 0; n < NumResultVecs; ++n)
      Tys[n] = VecTy;
    Tys[n++] = MVT::i32;
    Tys[n] = MVT::Other;
    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

    // Then, gather the new node's operands.
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(N->getOperand(0)); // incoming chain
    Ops.push_back(N->getOperand(2)); // ptr
    Ops.push_back(Inc);

    for (unsigned i = 3; i < N->getNumOperands(); ++i)
      Ops.push_back(N->getOperand(i));

    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
                                           MemN->getMemOperand());

    // Update the uses: N is replaced by the vector results plus the chain, and
    // the ADD is replaced by the written-back address result.
    SmallVector<SDValue, 5> NewResults;
    for (unsigned i = 0; i < NumResultVecs; ++i)
      NewResults.push_back(SDValue(UpdN.getNode(), i));

    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
    DCI.CombineTo(N, NewResults);
    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

    break;
  }

  return SDValue();
}

/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs.  If so, combine them to a vldN-dup operation and
/// return true.
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  // vldN-dup instructions only support 64-bit vectors for N > 1.
  if (!VT.is64BitVector())
    return false;

  // Check if the VDUPLANE operand is a vldN-lane intrinsic.
  SDNode *VLD = N->getOperand(0).getNode();
  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
    return false;
  unsigned NumVecs = 0;
  unsigned NewOpc = 0;
  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
  if (IntNo == Intrinsic::arm_neon_vld2lane) {
    NumVecs = 2;
    NewOpc = ARMISD::VLD2DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
    NumVecs = 3;
    NewOpc = ARMISD::VLD3DUP;
  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
    NumVecs = 4;
    NewOpc = ARMISD::VLD4DUP;
  } else {
    return false;
  }

  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
  // numbers match the load.
  unsigned VLDLaneNo =
      cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    // Ignore uses of the chain result.
    if (UI.getUse().getResNo() == NumVecs)
      continue;
    SDNode *User = *UI;
    if (User->getOpcode() != ARMISD::VDUPLANE ||
        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
      return false;
  }

  // Create the vldN-dup node: NumVecs results of type VT followed by a chain.
  EVT Tys[5];
  unsigned n;
  for (n = 0; n < NumVecs; ++n)
    Tys[n] = VT;
  Tys[n] = MVT::Other;
  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                           Ops, VLDMemInt->getMemoryVT(),
                                           VLDMemInt->getMemOperand());

  // Update the uses: each VDUPLANE user is replaced by the matching result of
  // the new node.
  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
       UI != UE; ++UI) {
    unsigned ResNo = UI.getUse().getResNo();
    // Ignore uses of the chain result.
    if (ResNo == NumVecs)
      continue;
    SDNode *User = *UI;
    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
  }

  // Now the vldN-lane intrinsic is dead except for its chain result.
  // Update uses of the chain.
  std::vector<SDValue> VLDDupResults;
  for (unsigned n = 0; n < NumVecs; ++n)
    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
  DCI.CombineTo(VLD, VLDDupResults);

  return true;
}

/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
  SDValue Op = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
  if (Subtarget->hasMVEIntegerOps()) {
    EVT ExtractVT = VT.getVectorElementType();
    // We need to ensure we are creating a legal type.
    if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
      ExtractVT = MVT::i32;
    SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
                              N->getOperand(0), N->getOperand(1));
    return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
  }

  // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
  // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
  if (CombineVLDDUP(N, DCI))
    return SDValue(N, 0);

  // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
  // redundant.  Ignore bit_converts for now; element sizes are checked below.
  while (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
    return SDValue();

  // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
  unsigned EltSize = Op.getScalarValueSizeInBits();
  // The canonical VMOV for a zero vector uses a 32-bit element size.
  // When the decoded immediate is zero the element size is treated as 8 bits,
  // so the size check below cannot reject it.
  unsigned Imm = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned EltBits;
  if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
    EltSize = 8;
  if (EltSize > VT.getScalarSizeInBits())
    return SDValue();

  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
}

/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op = N->getOperand(0);
  SDLoc dl(N);

  if (Subtarget->hasMVEIntegerOps()) {
    // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
    // need to come from a GPR. An f16 operand is moved to i32 with VMOVrh.
    if (Op.getValueType() == MVT::f32)
      return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
                             DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
    else if (Op.getValueType() == MVT::f16)
      return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
                             DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
  }

  if (!Subtarget->hasNEON())
    return SDValue();

  // Match VDUP(LOAD) -> VLD1DUP.
  // We match this pattern here rather than waiting for isel because the
  // transform is only legal for unindexed loads.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
  if (LD && Op.hasOneUse() && LD->isUnindexed() &&
      LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
    SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
                      DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) };
    SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
    SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys,
                                             Ops, LD->getMemoryVT(),
                                             LD->getMemOperand());
    // Redirect users of the original load's chain to the new node's chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
    return VLDDup;
  }

  return SDValue();
}

/// PerformLOADCombine - For a normal load of a legal vector type, try
/// CombineBaseUpdate; every other load is left alone (empty SDValue).
static SDValue PerformLOADCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N->getValueType(0);

  // If this is a legal vector load, try to combine it into a VLD1_UPD.
  if (ISD::isNormalLoad(N) && VT.isVector() &&
      DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return CombineBaseUpdate(N, DCI);

  return SDValue();
}

// Optimize trunc store (of multiple scalars) to shuffle and store.  First,
// pack all of the elements in one place.  Next, store to memory in fewer
// chunks.
15137 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, 15138 SelectionDAG &DAG) { 15139 SDValue StVal = St->getValue(); 15140 EVT VT = StVal.getValueType(); 15141 if (!St->isTruncatingStore() || !VT.isVector()) 15142 return SDValue(); 15143 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 15144 EVT StVT = St->getMemoryVT(); 15145 unsigned NumElems = VT.getVectorNumElements(); 15146 assert(StVT != VT && "Cannot truncate to the same type"); 15147 unsigned FromEltSz = VT.getScalarSizeInBits(); 15148 unsigned ToEltSz = StVT.getScalarSizeInBits(); 15149 15150 // From, To sizes and ElemCount must be pow of two 15151 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) 15152 return SDValue(); 15153 15154 // We are going to use the original vector elt for storing. 15155 // Accumulated smaller vector elements must be a multiple of the store size. 15156 if (0 != (NumElems * FromEltSz) % ToEltSz) 15157 return SDValue(); 15158 15159 unsigned SizeRatio = FromEltSz / ToEltSz; 15160 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); 15161 15162 // Create a type on which we perform the shuffle. 15163 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), 15164 NumElems * SizeRatio); 15165 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); 15166 15167 SDLoc DL(St); 15168 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); 15169 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1); 15170 for (unsigned i = 0; i < NumElems; ++i) 15171 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 15172 : i * SizeRatio; 15173 15174 // Can't shuffle using an illegal type. 15175 if (!TLI.isTypeLegal(WideVecVT)) 15176 return SDValue(); 15177 15178 SDValue Shuff = DAG.getVectorShuffle( 15179 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); 15180 // At this point all of the data is stored at the bottom of the 15181 // register. We now need to save it to mem. 
15182 15183 // Find the largest store unit 15184 MVT StoreType = MVT::i8; 15185 for (MVT Tp : MVT::integer_valuetypes()) { 15186 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) 15187 StoreType = Tp; 15188 } 15189 // Didn't find a legal store type. 15190 if (!TLI.isTypeLegal(StoreType)) 15191 return SDValue(); 15192 15193 // Bitcast the original vector into a vector of store-size units 15194 EVT StoreVecVT = 15195 EVT::getVectorVT(*DAG.getContext(), StoreType, 15196 VT.getSizeInBits() / EVT(StoreType).getSizeInBits()); 15197 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); 15198 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); 15199 SmallVector<SDValue, 8> Chains; 15200 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, 15201 TLI.getPointerTy(DAG.getDataLayout())); 15202 SDValue BasePtr = St->getBasePtr(); 15203 15204 // Perform one or more big stores into memory. 15205 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits(); 15206 for (unsigned I = 0; I < E; I++) { 15207 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, 15208 ShuffWide, DAG.getIntPtrConstant(I, DL)); 15209 SDValue Ch = 15210 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), 15211 St->getAlignment(), St->getMemOperand()->getFlags()); 15212 BasePtr = 15213 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); 15214 Chains.push_back(Ch); 15215 } 15216 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 15217 } 15218 15219 // Try taking a single vector store from an truncate (which would otherwise turn 15220 // into an expensive buildvector) and splitting it into a series of narrowing 15221 // stores. 
15222 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, 15223 SelectionDAG &DAG) { 15224 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 15225 return SDValue(); 15226 SDValue Trunc = St->getValue(); 15227 if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND) 15228 return SDValue(); 15229 EVT FromVT = Trunc->getOperand(0).getValueType(); 15230 EVT ToVT = Trunc.getValueType(); 15231 if (!ToVT.isVector()) 15232 return SDValue(); 15233 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 15234 EVT ToEltVT = ToVT.getVectorElementType(); 15235 EVT FromEltVT = FromVT.getVectorElementType(); 15236 15237 unsigned NumElements = 0; 15238 if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8)) 15239 NumElements = 4; 15240 if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8) 15241 NumElements = 8; 15242 if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16) 15243 NumElements = 4; 15244 if (NumElements == 0 || 15245 (FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) || 15246 FromVT.getVectorNumElements() % NumElements != 0) 15247 return SDValue(); 15248 15249 // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so 15250 // use the VMOVN over splitting the store. We are looking for patterns of: 15251 // !rev: 0 N 1 N+1 2 N+2 ... 15252 // rev: N 0 N+1 1 N+2 2 ... 15253 // The shuffle may either be a single source (in which case N = NumElts/2) or 15254 // two inputs extended with concat to the same size (in which case N = 15255 // NumElts). 15256 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) { 15257 ArrayRef<int> M = SVN->getMask(); 15258 unsigned NumElts = ToVT.getVectorNumElements(); 15259 if (SVN->getOperand(1).isUndef()) 15260 NumElts /= 2; 15261 15262 unsigned Off0 = Rev ? NumElts : 0; 15263 unsigned Off1 = Rev ? 
0 : NumElts; 15264 15265 for (unsigned I = 0; I < NumElts; I += 2) { 15266 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2)) 15267 return false; 15268 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2)) 15269 return false; 15270 } 15271 15272 return true; 15273 }; 15274 15275 // It may be preferable to keep the store unsplit as the trunc may end up 15276 // being removed. Check that here. 15277 if (Trunc.getOperand(0).getOpcode() == ISD::SMIN) { 15278 if (SDValue U = PerformVQDMULHCombine(Trunc.getOperand(0).getNode(), DAG)) { 15279 DAG.ReplaceAllUsesWith(Trunc.getOperand(0), U); 15280 return SDValue(); 15281 } 15282 } 15283 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0))) 15284 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true)) 15285 return SDValue(); 15286 15287 LLVMContext &C = *DAG.getContext(); 15288 SDLoc DL(St); 15289 // Details about the old store 15290 SDValue Ch = St->getChain(); 15291 SDValue BasePtr = St->getBasePtr(); 15292 Align Alignment = St->getOriginalAlign(); 15293 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 15294 AAMDNodes AAInfo = St->getAAInfo(); 15295 15296 // We split the store into slices of NumElements. fp16 trunc stores are vcvt 15297 // and then stored as truncating integer stores. 
15298 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements); 15299 EVT NewToVT = EVT::getVectorVT( 15300 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements); 15301 15302 SmallVector<SDValue, 4> Stores; 15303 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 15304 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8; 15305 SDValue NewPtr = 15306 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 15307 15308 SDValue Extract = 15309 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0), 15310 DAG.getConstant(i * NumElements, DL, MVT::i32)); 15311 15312 if (ToEltVT == MVT::f16) { 15313 SDValue FPTrunc = 15314 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16), 15315 Extract, DAG.getConstant(0, DL, MVT::i32)); 15316 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc); 15317 } 15318 15319 SDValue Store = DAG.getTruncStore( 15320 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset), 15321 NewToVT, Alignment.value(), MMOFlags, AAInfo); 15322 Stores.push_back(Store); 15323 } 15324 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); 15325 } 15326 15327 // Given a floating point store from an extracted vector, with an integer 15328 // VGETLANE that already exists, store the existing VGETLANEu directly. This can 15329 // help reduce fp register pressure, doesn't require the fp extract and allows 15330 // use of more integer post-inc stores not available with vstr. 15331 static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) { 15332 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) 15333 return SDValue(); 15334 SDValue Extract = St->getValue(); 15335 EVT VT = Extract.getValueType(); 15336 // For now only uses f16. This may be useful for f32 too, but that will 15337 // be bitcast(extract), not the VGETLANEu we currently check here. 
15338 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 15339 return SDValue(); 15340 15341 SDNode *GetLane = 15342 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32), 15343 {Extract.getOperand(0), Extract.getOperand(1)}); 15344 if (!GetLane) 15345 return SDValue(); 15346 15347 LLVMContext &C = *DAG.getContext(); 15348 SDLoc DL(St); 15349 // Create a new integer store to replace the existing floating point version. 15350 SDValue Ch = St->getChain(); 15351 SDValue BasePtr = St->getBasePtr(); 15352 Align Alignment = St->getOriginalAlign(); 15353 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags(); 15354 AAMDNodes AAInfo = St->getAAInfo(); 15355 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits()); 15356 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr, 15357 St->getPointerInfo(), NewToVT, 15358 Alignment.value(), MMOFlags, AAInfo); 15359 15360 return Store; 15361 } 15362 15363 /// PerformSTORECombine - Target-specific dag combine xforms for 15364 /// ISD::STORE. 15365 static SDValue PerformSTORECombine(SDNode *N, 15366 TargetLowering::DAGCombinerInfo &DCI, 15367 const ARMSubtarget *Subtarget) { 15368 StoreSDNode *St = cast<StoreSDNode>(N); 15369 if (St->isVolatile()) 15370 return SDValue(); 15371 SDValue StVal = St->getValue(); 15372 EVT VT = StVal.getValueType(); 15373 15374 if (Subtarget->hasNEON()) 15375 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG)) 15376 return Store; 15377 15378 if (Subtarget->hasMVEIntegerOps()) { 15379 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG)) 15380 return NewToken; 15381 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG)) 15382 return NewChain; 15383 } 15384 15385 if (!ISD::isNormalStore(St)) 15386 return SDValue(); 15387 15388 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and 15389 // ARM stores of arguments in the same cache line. 
15390 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && 15391 StVal.getNode()->hasOneUse()) { 15392 SelectionDAG &DAG = DCI.DAG; 15393 bool isBigEndian = DAG.getDataLayout().isBigEndian(); 15394 SDLoc DL(St); 15395 SDValue BasePtr = St->getBasePtr(); 15396 SDValue NewST1 = DAG.getStore( 15397 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), 15398 BasePtr, St->getPointerInfo(), St->getOriginalAlign(), 15399 St->getMemOperand()->getFlags()); 15400 15401 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, 15402 DAG.getConstant(4, DL, MVT::i32)); 15403 return DAG.getStore(NewST1.getValue(0), DL, 15404 StVal.getNode()->getOperand(isBigEndian ? 0 : 1), 15405 OffsetPtr, St->getPointerInfo().getWithOffset(4), 15406 St->getOriginalAlign(), 15407 St->getMemOperand()->getFlags()); 15408 } 15409 15410 if (StVal.getValueType() == MVT::i64 && 15411 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 15412 15413 // Bitcast an i64 store extracted from a vector to f64. 15414 // Otherwise, the i64 value will be legalized to a pair of i32 values. 15415 SelectionDAG &DAG = DCI.DAG; 15416 SDLoc dl(StVal); 15417 SDValue IntVec = StVal.getOperand(0); 15418 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, 15419 IntVec.getValueType().getVectorNumElements()); 15420 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); 15421 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 15422 Vec, StVal.getOperand(1)); 15423 dl = SDLoc(N); 15424 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); 15425 // Make the DAGCombiner fold the bitcasts. 
15426 DCI.AddToWorklist(Vec.getNode()); 15427 DCI.AddToWorklist(ExtElt.getNode()); 15428 DCI.AddToWorklist(V.getNode()); 15429 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), 15430 St->getPointerInfo(), St->getAlignment(), 15431 St->getMemOperand()->getFlags(), St->getAAInfo()); 15432 } 15433 15434 // If this is a legal vector store, try to combine it into a VST1_UPD. 15435 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() && 15436 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) 15437 return CombineBaseUpdate(N, DCI); 15438 15439 return SDValue(); 15440 } 15441 15442 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) 15443 /// can replace combinations of VMUL and VCVT (floating-point to integer) 15444 /// when the VMUL has a constant operand that is a power of 2. 15445 /// 15446 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 15447 /// vmul.f32 d16, d17, d16 15448 /// vcvt.s32.f32 d16, d16 15449 /// becomes: 15450 /// vcvt.s32.f32 d16, d16, #3 15451 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, 15452 const ARMSubtarget *Subtarget) { 15453 if (!Subtarget->hasNEON()) 15454 return SDValue(); 15455 15456 SDValue Op = N->getOperand(0); 15457 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || 15458 Op.getOpcode() != ISD::FMUL) 15459 return SDValue(); 15460 15461 SDValue ConstVec = Op->getOperand(1); 15462 if (!isa<BuildVectorSDNode>(ConstVec)) 15463 return SDValue(); 15464 15465 MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); 15466 uint32_t FloatBits = FloatTy.getSizeInBits(); 15467 MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); 15468 uint32_t IntBits = IntTy.getSizeInBits(); 15469 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 15470 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 15471 // These instructions only exist converting from f32 to i32. 
We can handle 15472 // smaller integers by generating an extra truncate, but larger ones would 15473 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 15474 // these intructions only support v2i32/v4i32 types. 15475 return SDValue(); 15476 } 15477 15478 BitVector UndefElements; 15479 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 15480 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 15481 if (C == -1 || C == 0 || C > 32) 15482 return SDValue(); 15483 15484 SDLoc dl(N); 15485 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; 15486 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : 15487 Intrinsic::arm_neon_vcvtfp2fxu; 15488 SDValue FixConv = DAG.getNode( 15489 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, 15490 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), 15491 DAG.getConstant(C, dl, MVT::i32)); 15492 15493 if (IntBits < FloatBits) 15494 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); 15495 15496 return FixConv; 15497 } 15498 15499 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) 15500 /// can replace combinations of VCVT (integer to floating-point) and VDIV 15501 /// when the VDIV has a constant operand that is a power of 2. 
15502 /// 15503 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>): 15504 /// vcvt.f32.s32 d16, d16 15505 /// vdiv.f32 d16, d17, d16 15506 /// becomes: 15507 /// vcvt.f32.s32 d16, d16, #3 15508 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, 15509 const ARMSubtarget *Subtarget) { 15510 if (!Subtarget->hasNEON()) 15511 return SDValue(); 15512 15513 SDValue Op = N->getOperand(0); 15514 unsigned OpOpcode = Op.getNode()->getOpcode(); 15515 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || 15516 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) 15517 return SDValue(); 15518 15519 SDValue ConstVec = N->getOperand(1); 15520 if (!isa<BuildVectorSDNode>(ConstVec)) 15521 return SDValue(); 15522 15523 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); 15524 uint32_t FloatBits = FloatTy.getSizeInBits(); 15525 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); 15526 uint32_t IntBits = IntTy.getSizeInBits(); 15527 unsigned NumLanes = Op.getValueType().getVectorNumElements(); 15528 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { 15529 // These instructions only exist converting from i32 to f32. We can handle 15530 // smaller integers by generating an extra extend, but larger ones would 15531 // be lossy. We also can't handle anything other than 2 or 4 lanes, since 15532 // these intructions only support v2i32/v4i32 types. 15533 return SDValue(); 15534 } 15535 15536 BitVector UndefElements; 15537 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); 15538 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); 15539 if (C == -1 || C == 0 || C > 32) 15540 return SDValue(); 15541 15542 SDLoc dl(N); 15543 bool isSigned = OpOpcode == ISD::SINT_TO_FP; 15544 SDValue ConvInput = Op.getOperand(0); 15545 if (IntBits < FloatBits) 15546 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, 15547 dl, NumLanes == 2 ? 
MVT::v2i32 : MVT::v4i32, 15548 ConvInput); 15549 15550 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : 15551 Intrinsic::arm_neon_vcvtfxu2fp; 15552 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, 15553 Op.getValueType(), 15554 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), 15555 ConvInput, DAG.getConstant(C, dl, MVT::i32)); 15556 } 15557 15558 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, 15559 const ARMSubtarget *ST) { 15560 if (!ST->hasMVEIntegerOps()) 15561 return SDValue(); 15562 15563 assert(N->getOpcode() == ISD::VECREDUCE_ADD); 15564 EVT ResVT = N->getValueType(0); 15565 SDValue N0 = N->getOperand(0); 15566 SDLoc dl(N); 15567 15568 // We are looking for something that will have illegal types if left alone, 15569 // but that we can convert to a single instruction undef MVE. For example 15570 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A 15571 // or 15572 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B 15573 15574 // Cases: 15575 // VADDV u/s 8/16/32 15576 // VMLAV u/s 8/16/32 15577 // VADDLV u/s 32 15578 // VMLALV u/s 16/32 15579 15580 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can 15581 // extend it and use v4i32 instead. 
15582 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) { 15583 EVT AVT = A.getValueType(); 15584 if (!AVT.is128BitVector()) 15585 A = DAG.getNode(ExtendCode, dl, 15586 AVT.changeVectorElementType(MVT::getIntegerVT( 15587 128 / AVT.getVectorMinNumElements())), 15588 A); 15589 return A; 15590 }; 15591 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) { 15592 if (ResVT != RetTy || N0->getOpcode() != ExtendCode) 15593 return SDValue(); 15594 SDValue A = N0->getOperand(0); 15595 if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) 15596 return ExtendIfNeeded(A, ExtendCode); 15597 return SDValue(); 15598 }; 15599 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode, 15600 ArrayRef<MVT> ExtTypes, SDValue &Mask) { 15601 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || 15602 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) 15603 return SDValue(); 15604 Mask = N0->getOperand(0); 15605 SDValue Ext = N0->getOperand(1); 15606 if (Ext->getOpcode() != ExtendCode) 15607 return SDValue(); 15608 SDValue A = Ext->getOperand(0); 15609 if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; })) 15610 return ExtendIfNeeded(A, ExtendCode); 15611 return SDValue(); 15612 }; 15613 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 15614 SDValue &A, SDValue &B) { 15615 // For a vmla we are trying to match a larger pattern: 15616 // ExtA = sext/zext A 15617 // ExtB = sext/zext B 15618 // Mul = mul ExtA, ExtB 15619 // vecreduce.add Mul 15620 // There might also be en extra extend between the mul and the addreduce, so 15621 // long as the bitwidth is high enough to make them equivalent (for example 15622 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64). 
15623 if (ResVT != RetTy) 15624 return false; 15625 SDValue Mul = N0; 15626 if (Mul->getOpcode() == ExtendCode && 15627 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= 15628 ResVT.getScalarSizeInBits()) 15629 Mul = Mul->getOperand(0); 15630 if (Mul->getOpcode() != ISD::MUL) 15631 return false; 15632 SDValue ExtA = Mul->getOperand(0); 15633 SDValue ExtB = Mul->getOperand(1); 15634 if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode) 15635 return false; 15636 A = ExtA->getOperand(0); 15637 B = ExtB->getOperand(0); 15638 if (A.getValueType() == B.getValueType() && 15639 llvm::any_of(ExtTypes, 15640 [&A](MVT Ty) { return A.getValueType() == Ty; })) { 15641 A = ExtendIfNeeded(A, ExtendCode); 15642 B = ExtendIfNeeded(B, ExtendCode); 15643 return true; 15644 } 15645 return false; 15646 }; 15647 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes, 15648 SDValue &A, SDValue &B, SDValue &Mask) { 15649 // Same as the pattern above with a select for the zero predicated lanes 15650 // ExtA = sext/zext A 15651 // ExtB = sext/zext B 15652 // Mul = mul ExtA, ExtB 15653 // N0 = select Mask, Mul, 0 15654 // vecreduce.add N0 15655 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT || 15656 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode())) 15657 return false; 15658 Mask = N0->getOperand(0); 15659 SDValue Mul = N0->getOperand(1); 15660 if (Mul->getOpcode() == ExtendCode && 15661 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >= 15662 ResVT.getScalarSizeInBits()) 15663 Mul = Mul->getOperand(0); 15664 if (Mul->getOpcode() != ISD::MUL) 15665 return false; 15666 SDValue ExtA = Mul->getOperand(0); 15667 SDValue ExtB = Mul->getOperand(1); 15668 if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode) 15669 return false; 15670 A = ExtA->getOperand(0); 15671 B = ExtB->getOperand(0); 15672 if (A.getValueType() == B.getValueType() && 15673 llvm::any_of(ExtTypes, 15674 [&A](MVT Ty) { return A.getValueType() == Ty; })) 
{ 15675 A = ExtendIfNeeded(A, ExtendCode); 15676 B = ExtendIfNeeded(B, ExtendCode); 15677 return true; 15678 } 15679 return false; 15680 }; 15681 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) { 15682 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops); 15683 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node, 15684 SDValue(Node.getNode(), 1)); 15685 }; 15686 15687 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8})) 15688 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A); 15689 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8})) 15690 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A); 15691 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, 15692 {MVT::v4i8, MVT::v4i16, MVT::v4i32})) 15693 return Create64bitNode(ARMISD::VADDLVs, {A}); 15694 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, 15695 {MVT::v4i8, MVT::v4i16, MVT::v4i32})) 15696 return Create64bitNode(ARMISD::VADDLVu, {A}); 15697 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8})) 15698 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 15699 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A)); 15700 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8})) 15701 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 15702 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A)); 15703 15704 SDValue Mask; 15705 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) 15706 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask); 15707 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask)) 15708 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask); 15709 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, 15710 {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask)) 15711 return Create64bitNode(ARMISD::VADDLVps, {A, Mask}); 15712 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, 15713 {MVT::v4i8, MVT::v4i16, MVT::v4i32}, Mask)) 15714 return 
Create64bitNode(ARMISD::VADDLVpu, {A, Mask}); 15715 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask)) 15716 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 15717 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask)); 15718 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask)) 15719 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 15720 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask)); 15721 15722 SDValue A, B; 15723 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 15724 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B); 15725 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B)) 15726 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B); 15727 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, 15728 {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B)) 15729 return Create64bitNode(ARMISD::VMLALVs, {A, B}); 15730 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, 15731 {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, B)) 15732 return Create64bitNode(ARMISD::VMLALVu, {A, B}); 15733 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B)) 15734 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 15735 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B)); 15736 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B)) 15737 return DAG.getNode(ISD::TRUNCATE, dl, ResVT, 15738 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B)); 15739 15740 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask)) 15741 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask); 15742 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B, Mask)) 15743 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask); 15744 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, 15745 {MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A, 15746 B, Mask)) 15747 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask}); 15748 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, 
{MVT::v8i8, MVT::v8i16, MVT::v4i8, MVT::v4i16, MVT::v4i32}, A,
                  B, Mask))
    return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
  // i16 results: reduce at i32 and truncate the scalar back to ResVT.
  if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
  if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
    return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
                       DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));

  // Some complications. We can get a case where the two inputs of the mul are
  // the same, then the output sext will have been helpfully converted to a
  // zext. Turn it back.
  SDValue Op = N0;
  // Look through a predicating select to the value it selects when true.
  if (Op->getOpcode() == ISD::VSELECT)
    Op = Op->getOperand(1);
  if (Op->getOpcode() == ISD::ZERO_EXTEND &&
      Op->getOperand(0)->getOpcode() == ISD::MUL) {
    SDValue Mul = Op->getOperand(0);
    if (Mul->getOperand(0) == Mul->getOperand(1) &&
        Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
      SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
      // If we looked through a select above, rebuild it around the new extend
      // with the original mask and false operand.
      if (Op != N0)
        Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
                          N0->getOperand(0), Ext, N0->getOperand(2));
      return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
    }
  }

  return SDValue();
}

/// Combine a VMOVN-style node \p N whose operands are (Op0, Op1, IsTop).
///
/// Folds away undef operands, merges a bottom VQMOVN feeding Op1 into a single
/// VQMOVN, and otherwise tries to simplify both vector operands based on the
/// lanes that are actually demanded. Returns the replacement value, \p N
/// itself when one of its operands was simplified, or an empty SDValue when
/// nothing changed.
static SDValue PerformVMOVNCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  // Operand 2 is a constant flag: non-zero selects the "top" variant.
  unsigned IsTop = N->getConstantOperandVal(2);

  // VMOVNT a undef -> a
  // VMOVNB a undef -> a
  // VMOVNB undef a -> a
  if (Op1->isUndef())
    return Op0;
  if (Op0->isUndef() && !IsTop)
    return Op1;

  // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
  // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
  // The new node reuses the VQMOVN opcode (signed or unsigned) but takes the
  // top/bottom flag from N.
  if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
       Op1->getOpcode() == ARMISD::VQMOVNu) &&
      Op1->getConstantOperandVal(2) == 0)
    return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
                           Op0, Op1->getOperand(1), N->getOperand(2));

  // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
  // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
  // into the top or bottom lanes.
  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  // Splat of the 2-bit pattern 0b01: one demanded-bit per element, set for
  // every even-numbered element.
  APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
  // For the bottom variant Op0 uses the complementary 0b10 pattern instead.
  APInt Op0DemandedElts =
      IsTop ? Op1DemandedElts
            : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));

  APInt KnownUndef, KnownZero;
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);
  if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);

  return SDValue();
}

/// Combine a VQMOVN-style node \p N (operand 2 is the top/bottom flag) by
/// simplifying operand 0 according to the lanes of it that are demanded.
/// Returns \p N itself when the operand was simplified, otherwise an empty
/// SDValue. Only operand 0 is examined here.
static SDValue PerformVQMOVNCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Op0 = N->getOperand(0);
  unsigned IsTop = N->getConstantOperandVal(2);

  unsigned NumElts = N->getValueType(0).getVectorNumElements();
  // Top variant: demand pattern 0b01 (even elements); bottom variant: pattern
  // 0b10 (odd elements).
  APInt Op0DemandedElts =
      APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
                                     : APInt::getHighBitsSet(2, 1));

  APInt KnownUndef, KnownZero;
  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
                                     KnownZero, DCI))
    return SDValue(N, 0);
  return SDValue();
}

/// Combine a long shift node \p N with operands (Op0, Op1, ShiftAmount) when
/// the shift amount is a constant.
///
/// A zero shift is replaced by its two inputs; a negative shift in [-32, -1]
/// is replaced by the opposite-direction shift (LSLL <-> LSRL) with the
/// negated amount.
/// NOTE(review): the opcode swap treats every non-LSLL opcode as LSRL, so this
/// presumably is only reached for those two opcodes - confirm at the caller.
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
  // uses of the intrinsics.
  if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    int ShiftAmt = C->getSExtValue();
    if (ShiftAmt == 0) {
      // Shifting by zero is the identity: forward both inputs to N's users.
      // The uses are rewritten here directly, so an empty SDValue is returned.
      SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
      DAG.ReplaceAllUsesWith(N, Merge.getNode());
      return SDValue();
    }

    if (ShiftAmt >= -32 && ShiftAmt < 0) {
      unsigned NewOpcode =
          N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
      SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
                                     DAG.getConstant(-ShiftAmt, DL, MVT::i32));
      DAG.ReplaceAllUsesWith(N, NewShift.getNode());
      return NewShift;
    }
  }

  return SDValue();
}

/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
///
/// Operand 0 of \p N is the constant intrinsic ID. Returns the replacement
/// node for the intrinsics handled below, or an empty SDValue when nothing is
/// changed.
SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    // First switch: validate the shift-count operand (operand 2) and extract
    // the immediate into Cnt. Cases either break with Cnt set, bail out with
    // an empty SDValue, or hit llvm_unreachable for counts treated as invalid.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // The plain shifts accept either direction, so the opcode is chosen
      // here rather than in the second switch.
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHLIMM;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
                                                          : ARMISD::VSHRuIMM);
        break;
      }
      return SDValue();

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");

    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift "
                       "intrinsic");

    default:
      llvm_unreachable("unhandled vector shift");
    }

    // Second switch: map the intrinsic to the ARMISD immediate-shift opcode.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRsIMM;
      break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRuIMM;
      break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRNIMM;
      break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNsIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNuIMM;
      break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsuIMM;
      break;
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    // Shift-and-insert: operands 1 and 2 are the vectors, operand 3 is the
    // shift count, which must be a valid left or right immediate.
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLIIMM;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRIIMM;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }

    SDLoc dl(N);
    return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, dl, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;

  case Intrinsic::arm_mve_vqdmlah:
  case Intrinsic::arm_mve_vqdmlash:
  case Intrinsic::arm_mve_vqrdmlah:
  case Intrinsic::arm_mve_vqrdmlash:
  case Intrinsic::arm_mve_vmla_n_predicated:
  case Intrinsic::arm_mve_vmlas_n_predicated:
  case Intrinsic::arm_mve_vqdmlah_predicated:
  case Intrinsic::arm_mve_vqdmlash_predicated:
  case Intrinsic::arm_mve_vqrdmlah_predicated:
  case Intrinsic::arm_mve_vqrdmlash_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they return. So we don't need
    // any bits of that operand above that point, which allows us to eliminate
    // uxth/sxth.
    unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    // The scalar is operand 3 here; an empty SDValue is returned whether or
    // not the operand was simplified.
    if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
      return SDValue();
    break;
  }

  case Intrinsic::arm_mve_minv:
  case Intrinsic::arm_mve_maxv:
  case Intrinsic::arm_mve_minav:
  case Intrinsic::arm_mve_maxav:
  case Intrinsic::arm_mve_minv_predicated:
  case Intrinsic::arm_mve_maxv_predicated:
  case Intrinsic::arm_mve_minav_predicated:
  case Intrinsic::arm_mve_maxav_predicated: {
    // These intrinsics all take an i32 scalar operand which is narrowed to the
    // size of a single lane of the vector type they take as the other input.
    // Here the vector is operand 2 and the scalar is operand 1.
    unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
      return SDValue();
    break;
  }

  case Intrinsic::arm_mve_addv: {
    // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
    // which allow PerformADDVecReduce to turn it into VADDLV when possible.
    // Operand 2 is the constant "unsigned" flag; operand 1 is the vector.
    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
  }

  case Intrinsic::arm_mve_addlv:
  case Intrinsic::arm_mve_addlv_predicated: {
    // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
    // which recombines the two outputs into an i64
    bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
    unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
      (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
      (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);

    // Forward every operand except the intrinsic ID (0) and the flag (2).
    SmallVector<SDValue, 4> Ops;
    for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
      if (i != 2)                   // skip the unsigned flag
        Ops.push_back(N->getOperand(i));

    SDLoc dl(N);
    SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
    return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
                       val.getValue(1));
  }
  }

  return SDValue();
}

/// PerformShiftCombine - Checks for immediate versions of vector shifts and
/// lowers them.
As with the vector shift intrinsics, this is done during DAG 16095 /// combining instead of DAG legalizing because the build_vectors for 64-bit 16096 /// vector element shift counts are generally not legal, and it is hard to see 16097 /// their values after they get legalized to loads from a constant pool. 16098 static SDValue PerformShiftCombine(SDNode *N, 16099 TargetLowering::DAGCombinerInfo &DCI, 16100 const ARMSubtarget *ST) { 16101 SelectionDAG &DAG = DCI.DAG; 16102 EVT VT = N->getValueType(0); 16103 if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { 16104 // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16105 // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. 16106 SDValue N1 = N->getOperand(1); 16107 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { 16108 SDValue N0 = N->getOperand(0); 16109 if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && 16110 DAG.MaskedValueIsZero(N0.getOperand(0), 16111 APInt::getHighBitsSet(32, 16))) 16112 return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); 16113 } 16114 } 16115 16116 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && 16117 N->getOperand(0)->getOpcode() == ISD::AND && 16118 N->getOperand(0)->hasOneUse()) { 16119 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 16120 return SDValue(); 16121 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't 16122 // usually show up because instcombine prefers to canonicalize it to 16123 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come 16124 // out of GEP lowering in some cases. 
16125 SDValue N0 = N->getOperand(0); 16126 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); 16127 if (!ShiftAmtNode) 16128 return SDValue(); 16129 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); 16130 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); 16131 if (!AndMaskNode) 16132 return SDValue(); 16133 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); 16134 // Don't transform uxtb/uxth. 16135 if (AndMask == 255 || AndMask == 65535) 16136 return SDValue(); 16137 if (isMask_32(AndMask)) { 16138 uint32_t MaskedBits = countLeadingZeros(AndMask); 16139 if (MaskedBits > ShiftAmt) { 16140 SDLoc DL(N); 16141 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), 16142 DAG.getConstant(MaskedBits, DL, MVT::i32)); 16143 return DAG.getNode( 16144 ISD::SRL, DL, MVT::i32, SHL, 16145 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); 16146 } 16147 } 16148 } 16149 16150 // Nothing to be done for scalar shifts. 16151 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16152 if (!VT.isVector() || !TLI.isTypeLegal(VT)) 16153 return SDValue(); 16154 if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) 16155 return SDValue(); 16156 16157 int64_t Cnt; 16158 16159 switch (N->getOpcode()) { 16160 default: llvm_unreachable("unexpected shift opcode"); 16161 16162 case ISD::SHL: 16163 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { 16164 SDLoc dl(N); 16165 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), 16166 DAG.getConstant(Cnt, dl, MVT::i32)); 16167 } 16168 break; 16169 16170 case ISD::SRA: 16171 case ISD::SRL: 16172 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { 16173 unsigned VShiftOpc = 16174 (N->getOpcode() == ISD::SRA ? 
ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); 16175 SDLoc dl(N); 16176 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), 16177 DAG.getConstant(Cnt, dl, MVT::i32)); 16178 } 16179 } 16180 return SDValue(); 16181 } 16182 16183 // Look for a sign/zero/fpextend extend of a larger than legal load. This can be 16184 // split into multiple extending loads, which are simpler to deal with than an 16185 // arbitrary extend. For fp extends we use an integer extending load and a VCVTL 16186 // to convert the type to an f32. 16187 static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { 16188 SDValue N0 = N->getOperand(0); 16189 if (N0.getOpcode() != ISD::LOAD) 16190 return SDValue(); 16191 LoadSDNode *LD = cast<LoadSDNode>(N0.getNode()); 16192 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() || 16193 LD->getExtensionType() != ISD::NON_EXTLOAD) 16194 return SDValue(); 16195 EVT FromVT = LD->getValueType(0); 16196 EVT ToVT = N->getValueType(0); 16197 if (!ToVT.isVector()) 16198 return SDValue(); 16199 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements()); 16200 EVT ToEltVT = ToVT.getVectorElementType(); 16201 EVT FromEltVT = FromVT.getVectorElementType(); 16202 16203 unsigned NumElements = 0; 16204 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8)) 16205 NumElements = 4; 16206 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8) 16207 NumElements = 8; 16208 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16) 16209 NumElements = 4; 16210 if (NumElements == 0 || 16211 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) || 16212 FromVT.getVectorNumElements() % NumElements != 0 || 16213 !isPowerOf2_32(NumElements)) 16214 return SDValue(); 16215 16216 LLVMContext &C = *DAG.getContext(); 16217 SDLoc DL(LD); 16218 // Details about the old load 16219 SDValue Ch = LD->getChain(); 16220 SDValue BasePtr = LD->getBasePtr(); 16221 Align Alignment = LD->getOriginalAlign(); 16222 MachineMemOperand::Flags MMOFlags 
= LD->getMemOperand()->getFlags(); 16223 AAMDNodes AAInfo = LD->getAAInfo(); 16224 16225 ISD::LoadExtType NewExtType = 16226 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; 16227 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType()); 16228 EVT NewFromVT = EVT::getVectorVT( 16229 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements); 16230 EVT NewToVT = EVT::getVectorVT( 16231 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements); 16232 16233 SmallVector<SDValue, 4> Loads; 16234 SmallVector<SDValue, 4> Chains; 16235 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) { 16236 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8; 16237 SDValue NewPtr = 16238 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset)); 16239 16240 SDValue NewLoad = 16241 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset, 16242 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT, 16243 Alignment, MMOFlags, AAInfo); 16244 Loads.push_back(NewLoad); 16245 Chains.push_back(SDValue(NewLoad.getNode(), 1)); 16246 } 16247 16248 // Float truncs need to extended with VCVTB's into their floating point types. 16249 if (FromEltVT == MVT::f16) { 16250 SmallVector<SDValue, 4> Extends; 16251 16252 for (unsigned i = 0; i < Loads.size(); i++) { 16253 SDValue LoadBC = 16254 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]); 16255 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC, 16256 DAG.getConstant(0, DL, MVT::i32)); 16257 Extends.push_back(FPExt); 16258 } 16259 16260 Loads = Extends; 16261 } 16262 16263 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); 16264 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain); 16265 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads); 16266 } 16267 16268 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, 16269 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. 
16270 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, 16271 const ARMSubtarget *ST) { 16272 SDValue N0 = N->getOperand(0); 16273 16274 // Check for sign- and zero-extensions of vector extract operations of 8- and 16275 // 16-bit vector elements. NEON and MVE support these directly. They are 16276 // handled during DAG combining because type legalization will promote them 16277 // to 32-bit types and it is messy to recognize the operations after that. 16278 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) && 16279 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { 16280 SDValue Vec = N0.getOperand(0); 16281 SDValue Lane = N0.getOperand(1); 16282 EVT VT = N->getValueType(0); 16283 EVT EltVT = N0.getValueType(); 16284 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 16285 16286 if (VT == MVT::i32 && 16287 (EltVT == MVT::i8 || EltVT == MVT::i16) && 16288 TLI.isTypeLegal(Vec.getValueType()) && 16289 isa<ConstantSDNode>(Lane)) { 16290 16291 unsigned Opc = 0; 16292 switch (N->getOpcode()) { 16293 default: llvm_unreachable("unexpected opcode"); 16294 case ISD::SIGN_EXTEND: 16295 Opc = ARMISD::VGETLANEs; 16296 break; 16297 case ISD::ZERO_EXTEND: 16298 case ISD::ANY_EXTEND: 16299 Opc = ARMISD::VGETLANEu; 16300 break; 16301 } 16302 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); 16303 } 16304 } 16305 16306 if (ST->hasMVEIntegerOps()) 16307 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 16308 return NewLoad; 16309 16310 return SDValue(); 16311 } 16312 16313 static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, 16314 const ARMSubtarget *ST) { 16315 if (ST->hasMVEFloatOps()) 16316 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG)) 16317 return NewLoad; 16318 16319 return SDValue(); 16320 } 16321 16322 /// PerformMinMaxCombine - Target-specific DAG combining for creating truncating 16323 /// saturates. 
16324 static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, 16325 const ARMSubtarget *ST) { 16326 EVT VT = N->getValueType(0); 16327 SDValue N0 = N->getOperand(0); 16328 if (!ST->hasMVEIntegerOps()) 16329 return SDValue(); 16330 16331 if (SDValue V = PerformVQDMULHCombine(N, DAG)) 16332 return V; 16333 16334 if (VT != MVT::v4i32 && VT != MVT::v8i16) 16335 return SDValue(); 16336 16337 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) { 16338 // Check one is a smin and the other is a smax 16339 if (Min->getOpcode() != ISD::SMIN) 16340 std::swap(Min, Max); 16341 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX) 16342 return false; 16343 16344 APInt SaturateC; 16345 if (VT == MVT::v4i32) 16346 SaturateC = APInt(32, (1 << 15) - 1, true); 16347 else //if (VT == MVT::v8i16) 16348 SaturateC = APInt(16, (1 << 7) - 1, true); 16349 16350 APInt MinC, MaxC; 16351 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || 16352 MinC != SaturateC) 16353 return false; 16354 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) || 16355 MaxC != ~SaturateC) 16356 return false; 16357 return true; 16358 }; 16359 16360 if (IsSignedSaturate(N, N0.getNode())) { 16361 SDLoc DL(N); 16362 MVT ExtVT, HalfVT; 16363 if (VT == MVT::v4i32) { 16364 HalfVT = MVT::v8i16; 16365 ExtVT = MVT::v4i16; 16366 } else { // if (VT == MVT::v8i16) 16367 HalfVT = MVT::v16i8; 16368 ExtVT = MVT::v8i8; 16369 } 16370 16371 // Create a VQMOVNB with undef top lanes, then signed extended into the top 16372 // half. That extend will hopefully be removed if only the bottom bits are 16373 // demanded (though a truncating store, for example). 
16374 SDValue VQMOVN = 16375 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT), 16376 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32)); 16377 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); 16378 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast, 16379 DAG.getValueType(ExtVT)); 16380 } 16381 16382 auto IsUnsignedSaturate = [&](SDNode *Min) { 16383 // For unsigned, we just need to check for <= 0xffff 16384 if (Min->getOpcode() != ISD::UMIN) 16385 return false; 16386 16387 APInt SaturateC; 16388 if (VT == MVT::v4i32) 16389 SaturateC = APInt(32, (1 << 16) - 1, true); 16390 else //if (VT == MVT::v8i16) 16391 SaturateC = APInt(16, (1 << 8) - 1, true); 16392 16393 APInt MinC; 16394 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) || 16395 MinC != SaturateC) 16396 return false; 16397 return true; 16398 }; 16399 16400 if (IsUnsignedSaturate(N)) { 16401 SDLoc DL(N); 16402 MVT HalfVT; 16403 unsigned ExtConst; 16404 if (VT == MVT::v4i32) { 16405 HalfVT = MVT::v8i16; 16406 ExtConst = 0x0000FFFF; 16407 } else { //if (VT == MVT::v8i16) 16408 HalfVT = MVT::v16i8; 16409 ExtConst = 0x00FF; 16410 } 16411 16412 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with 16413 // an AND. That extend will hopefully be removed if only the bottom bits are 16414 // demanded (though a truncating store, for example). 16415 SDValue VQMOVN = 16416 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0, 16417 DAG.getConstant(0, DL, MVT::i32)); 16418 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN); 16419 return DAG.getNode(ISD::AND, DL, VT, Bitcast, 16420 DAG.getConstant(ExtConst, DL, VT)); 16421 } 16422 16423 return SDValue(); 16424 } 16425 16426 static const APInt *isPowerOf2Constant(SDValue V) { 16427 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); 16428 if (!C) 16429 return nullptr; 16430 const APInt *CV = &C->getAPIntValue(); 16431 return CV->isPowerOf2() ? 
CV : nullptr; 16432 } 16433 16434 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { 16435 // If we have a CMOV, OR and AND combination such as: 16436 // if (x & CN) 16437 // y |= CM; 16438 // 16439 // And: 16440 // * CN is a single bit; 16441 // * All bits covered by CM are known zero in y 16442 // 16443 // Then we can convert this into a sequence of BFI instructions. This will 16444 // always be a win if CM is a single bit, will always be no worse than the 16445 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is 16446 // three bits (due to the extra IT instruction). 16447 16448 SDValue Op0 = CMOV->getOperand(0); 16449 SDValue Op1 = CMOV->getOperand(1); 16450 auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); 16451 auto CC = CCNode->getAPIntValue().getLimitedValue(); 16452 SDValue CmpZ = CMOV->getOperand(4); 16453 16454 // The compare must be against zero. 16455 if (!isNullConstant(CmpZ->getOperand(1))) 16456 return SDValue(); 16457 16458 assert(CmpZ->getOpcode() == ARMISD::CMPZ); 16459 SDValue And = CmpZ->getOperand(0); 16460 if (And->getOpcode() != ISD::AND) 16461 return SDValue(); 16462 const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); 16463 if (!AndC) 16464 return SDValue(); 16465 SDValue X = And->getOperand(0); 16466 16467 if (CC == ARMCC::EQ) { 16468 // We're performing an "equal to zero" compare. Swap the operands so we 16469 // canonicalize on a "not equal to zero" compare. 16470 std::swap(Op0, Op1); 16471 } else { 16472 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); 16473 } 16474 16475 if (Op1->getOpcode() != ISD::OR) 16476 return SDValue(); 16477 16478 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); 16479 if (!OrC) 16480 return SDValue(); 16481 SDValue Y = Op1->getOperand(0); 16482 16483 if (Op0 != Y) 16484 return SDValue(); 16485 16486 // Now, is it profitable to continue? 
16487 APInt OrCI = OrC->getAPIntValue(); 16488 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; 16489 if (OrCI.countPopulation() > Heuristic) 16490 return SDValue(); 16491 16492 // Lastly, can we determine that the bits defined by OrCI 16493 // are zero in Y? 16494 KnownBits Known = DAG.computeKnownBits(Y); 16495 if ((OrCI & Known.Zero) != OrCI) 16496 return SDValue(); 16497 16498 // OK, we can do the combine. 16499 SDValue V = Y; 16500 SDLoc dl(X); 16501 EVT VT = X.getValueType(); 16502 unsigned BitInX = AndC->logBase2(); 16503 16504 if (BitInX != 0) { 16505 // We must shift X first. 16506 X = DAG.getNode(ISD::SRL, dl, VT, X, 16507 DAG.getConstant(BitInX, dl, VT)); 16508 } 16509 16510 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); 16511 BitInY < NumActiveBits; ++BitInY) { 16512 if (OrCI[BitInY] == 0) 16513 continue; 16514 APInt Mask(VT.getSizeInBits(), 0); 16515 Mask.setBit(BitInY); 16516 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, 16517 // Confusingly, the operand is an *inverted* mask. 16518 DAG.getConstant(~Mask, dl, VT)); 16519 } 16520 16521 return V; 16522 } 16523 16524 // Given N, the value controlling the conditional branch, search for the loop 16525 // intrinsic, returning it, along with how the value is used. 
We need to handle 16526 // patterns such as the following: 16527 // (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit) 16528 // (brcond (setcc (loop.decrement), 0, eq), exit) 16529 // (brcond (setcc (loop.decrement), 0, ne), header) 16530 static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, 16531 bool &Negate) { 16532 switch (N->getOpcode()) { 16533 default: 16534 break; 16535 case ISD::XOR: { 16536 if (!isa<ConstantSDNode>(N.getOperand(1))) 16537 return SDValue(); 16538 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne()) 16539 return SDValue(); 16540 Negate = !Negate; 16541 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate); 16542 } 16543 case ISD::SETCC: { 16544 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1)); 16545 if (!Const) 16546 return SDValue(); 16547 if (Const->isNullValue()) 16548 Imm = 0; 16549 else if (Const->isOne()) 16550 Imm = 1; 16551 else 16552 return SDValue(); 16553 CC = cast<CondCodeSDNode>(N.getOperand(2))->get(); 16554 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate); 16555 } 16556 case ISD::INTRINSIC_W_CHAIN: { 16557 unsigned IntOp = cast<ConstantSDNode>(N.getOperand(1))->getZExtValue(); 16558 if (IntOp != Intrinsic::test_start_loop_iterations && 16559 IntOp != Intrinsic::loop_decrement_reg) 16560 return SDValue(); 16561 return N; 16562 } 16563 } 16564 return SDValue(); 16565 } 16566 16567 static SDValue PerformHWLoopCombine(SDNode *N, 16568 TargetLowering::DAGCombinerInfo &DCI, 16569 const ARMSubtarget *ST) { 16570 16571 // The hwloop intrinsics that we're interested are used for control-flow, 16572 // either for entering or exiting the loop: 16573 // - test.start.loop.iterations will test whether its operand is zero. If it 16574 // is zero, the proceeding branch should not enter the loop. 16575 // - loop.decrement.reg also tests whether its operand is zero. If it is 16576 // zero, the proceeding branch should not branch back to the beginning of 16577 // the loop. 
// So here, we need to check how the brcond is using the result of each
  // of the intrinsics to ensure that we're branching to the right place at the
  // right time.

  ISD::CondCode CC;
  SDValue Cond;
  // Imm is the constant the condition is compared against (0 or 1); Negate is
  // toggled by SearchLoopIntrinsic for every (xor X, 1) it looks through.
  int Imm = 1;
  bool Negate = false;
  SDValue Chain = N->getOperand(0);
  SDValue Dest;

  if (N->getOpcode() == ISD::BRCOND) {
    // A plain BRCOND is treated as "Cond == 1": SETEQ with the default Imm.
    CC = ISD::SETEQ;
    Cond = N->getOperand(1);
    Dest = N->getOperand(2);
  } else {
    assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
    CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    Cond = N->getOperand(2);
    Dest = N->getOperand(4);
    // Only comparisons against the constants 0 and 1 are handled.
    if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
      if (!Const->isOne() && !Const->isNullValue())
        return SDValue();
      Imm = Const->getZExtValue();
    } else
      return SDValue();
  }

  // Find the loop intrinsic feeding the condition; CC, Imm and Negate are
  // updated to describe how its result is being tested.
  SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
  if (!Int)
    return SDValue();

  if (Negate)
    CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);

  // (CC, Imm) combinations for which the branch is taken when the tested
  // value is zero.
  auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 0) ||
           (CC == ISD::SETNE && Imm == 1) ||
           (CC == ISD::SETLT && Imm == 1) ||
           (CC == ISD::SETULT && Imm == 1);
  };

  // (CC, Imm) combinations for which the branch is taken when the tested
  // value is non-zero.
  auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
    return (CC == ISD::SETEQ && Imm == 1) ||
           (CC == ISD::SETNE && Imm == 0) ||
           (CC == ISD::SETGT && Imm == 0) ||
           (CC == ISD::SETUGT && Imm == 0) ||
           (CC == ISD::SETGE && Imm == 1) ||
           (CC == ISD::SETUGE && Imm == 1);
  };

  // Any other combination is rejected only by this assertion.
  assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
         "unsupported condition");

  SDLoc dl(Int);
  SelectionDAG &DAG = DCI.DAG;
  SDValue Elements = Int.getOperand(2);
  unsigned IntOp = cast<ConstantSDNode>(Int->getOperand(1))->getZExtValue();
  // The conditional branch is expected to be followed by exactly one
  // unconditional branch, whose destination is the "other" successor.
  assert((N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BR)
          && "expected single br user");
  SDNode *Br = *N->use_begin();
  SDValue OtherTarget = Br->getOperand(1);

  // Update the unconditional branch to branch to the given Dest.
  auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
    SDValue NewBrOps[] = { Br->getOperand(0), Dest };
    SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
  };

  if (IntOp == Intrinsic::test_start_loop_iterations) {
    SDValue Res;
    SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
    // We expect this 'instruction' to branch when the counter is zero.
    if (IsTrueIfZero(CC, Imm)) {
      SDValue Ops[] = {Chain, Setup, Dest};
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    } else {
      // The logic is the reverse of what we need for WLS, so find the other
      // basic block target: the target of the following br.
      UpdateUncondBr(Br, Dest, DAG);

      SDValue Ops[] = {Chain, Setup, OtherTarget};
      Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
    }
    // Update LR count to the new value
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
    // Update chain
    DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
    return Res;
  } else {
    // loop.decrement.reg: operand 3 of the intrinsic is re-emitted as a
    // target constant for the LOOP_DEC node.
    SDValue Size = DAG.getTargetConstant(
      cast<ConstantSDNode>(Int.getOperand(3))->getZExtValue(), dl, MVT::i32);
    SDValue Args[] = { Int.getOperand(0), Elements, Size, };
    SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
                                  DAG.getVTList(MVT::i32, MVT::Other), Args);
    DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());

    // We expect this instruction to branch when the count is not zero.
    SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;

    // Update the unconditional branch to target the loop preheader if we've
    // found the condition has been reversed.
    if (Target == OtherTarget)
      UpdateUncondBr(Br, Dest, DAG);

    // Join the LOOP_DEC chain result with the incoming chain.
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        SDValue(LoopDec.getNode(), 1), Chain);

    SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
    return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
  }
  // Not reached: both branches above return.
  return SDValue();
}

/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
///
/// Operands of N as read below: 0 = chain, 1 = destination block,
/// 2 = ARM condition code constant, 4 = the comparison node. Returns the
/// replacement node, or an empty SDValue when the pattern does not match.
SDValue
ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue Chain = N->getOperand(0);
  SDValue BB = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
  // -> (brcond Chain BB CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
      LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
      LHS->getOperand(0)->hasOneUse()) {
    auto *LHS00C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(0));
    auto *LHS01C = dyn_cast<ConstantSDNode>(LHS->getOperand(0)->getOperand(1));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS00C && LHS00C->getZExtValue() == 0) &&
        (LHS01C && LHS01C->getZExtValue() == 1) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      // Branch directly on the inner CMOV's condition, flags and compare.
      return DAG.getNode(
          ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2),
          LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4));
    }
  }

  return SDValue();
}

/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
///
/// Operands of N as read below: 0 = false value, 1 = true value,
/// 2 = ARM condition code constant, 3 = passed through unchanged,
/// 4 = the comparison node. Returns the replacement value, or an empty
/// SDValue when no combine applies.
SDValue
ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
  SDValue Cmp = N->getOperand(4);
  if (Cmp.getOpcode() != ARMISD::CMPZ)
    // Only looking at EQ and NE cases.
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc dl(N);
  SDValue LHS = Cmp.getOperand(0);
  SDValue RHS = Cmp.getOperand(1);
  SDValue FalseVal = N->getOperand(0);
  SDValue TrueVal = N->getOperand(1);
  SDValue ARMcc = N->getOperand(2);
  ARMCC::CondCodes CC =
    (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();

  // BFI is only available on V6T2+.
  if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
    SDValue R = PerformCMOVToBFICombine(N, DAG);
    if (R)
      return R;
  }

  // Simplify
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, y
  //   moveq   r0, x
  // to
  //   cmp     r0, x
  //   movne   r0, y
  //
  //   mov     r1, r0
  //   cmp     r1, x
  //   mov     r0, x
  //   movne   r0, y
  // to
  //   cmp     r0, x
  //   movne   r0, y
  /// FIXME: Turn this into a target neutral optimization?
  SDValue Res;
  if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                      N->getOperand(3), Cmp);
  } else if (CC == ARMCC::EQ && TrueVal == RHS) {
    // This local shadows the outer ARMcc; it is handed to getARMCmp and then
    // used as the new node's condition operand (presumably filled in by
    // getARMCmp -- confirm against its definition).
    SDValue ARMcc;
    SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
    Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                      N->getOperand(3), NewCmp);
  }

  // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
  // -> (cmov F T CC CPSR Cmp)
  if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
    auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
    auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
    auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
    if ((LHS0C && LHS0C->getZExtValue() == 0) &&
        (LHS1C && LHS1C->getZExtValue() == 1) &&
        (RHSC && RHSC->getZExtValue() == 0)) {
      return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                         LHS->getOperand(2), LHS->getOperand(3),
                         LHS->getOperand(4));
    }
  }

  // The remaining combines apply to integer results only; a Res computed
  // above is discarded when the type is not an integer.
  if (!VT.isInteger())
      return SDValue();

  // Materialize a boolean comparison for integers so we can avoid branching.
  if (isNullConstant(FalseVal)) {
    if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
      if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
        // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
        // right 5 bits will make that 32 be 1, otherwise it will be 0.
        // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                          DAG.getConstant(5, dl, MVT::i32));
      } else {
        // CMOV 0, 1, ==, (CMPZ x, y) ->
        //     (ADDCARRY (SUB x, y), t:0, t:1)
        // where t = (SUBCARRY 0, (SUB x, y), 0)
        //
        // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
        // x != y. In other words, a carry C == 1 when x == y, C == 0
        // otherwise.
        // The final ADDCARRY computes
        //     x - y + (0 - (x - y)) + C == C
        //
        // NOTE(review): the node built below is ISD::USUBO rather than the
        // SUBCARRY named in this comment -- confirm the comment is just
        // out of date.
        SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
        SDVTList VTs = DAG.getVTList(VT, MVT::i32);
        SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
        // ISD::SUBCARRY returns a borrow but we want the carry here
        // actually.
        SDValue Carry =
            DAG.getNode(ISD::SUB, dl, MVT::i32,
                        DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
        Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
      }
    } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
               (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
      // This seems pointless but will allow us to combine it further below.
      // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                        N->getOperand(3), CPSRGlue.getValue(1));
      // Remember the SUBS as the false value so the Thumb1 follow-up below
      // can recognise this shape.
      FalseVal = Sub;
    }
  } else if (isNullConstant(TrueVal)) {
    if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
        (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
      // This seems pointless but will allow us to combine it further below
      // Note that we change == for != as this is the dual for the case above.
      // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBS x, y), z, !=, (SUBS x, y):1
      //
      // NOTE(review): CC and TrueVal are left unchanged here, while the
      // Thumb1 follow-up below requires CC == NE and a power-of-2 TrueVal, so
      // it does not appear to fire for this dual case -- confirm intent.
      SDValue Sub =
          DAG.getNode(ARMISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
      SDValue CPSRGlue = DAG.getCopyToReg(DAG.getEntryNode(), dl, ARM::CPSR,
                                          Sub.getValue(1), SDValue());
      Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                        DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                        N->getOperand(3), CPSRGlue.getValue(1));
      FalseVal = Sub;
    }
  }

  // On Thumb1, the DAG above may be further combined if z is a power of 2
  // (z == 2 ^ K).
  // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 ->
  // t1 = (USUBO (SUB x, y), 1)
  // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  //
  // This also handles the special case of comparing against zero; it's
  // essentially the same pattern, except there's no SUBS:
  // CMOV x, z, !=, (CMPZ x, 0) ->
  // t1 = (USUBO x, 1)
  // t2 = (SUBCARRY x, t1:0, t1:1)
  // Result = if K != 0 then (SHL t2:0, K) else t2:0
  const APInt *TrueConst;
  if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
      ((FalseVal.getOpcode() == ARMISD::SUBS &&
        FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) ||
       (FalseVal == LHS && isNullConstant(RHS))) &&
      (TrueConst = isPowerOf2Constant(TrueVal))) {
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    unsigned ShiftAmount = TrueConst->logBase2();
    // For z != 1, compute the 0/1 result first and shift it into place after.
    if (ShiftAmount)
      TrueVal = DAG.getConstant(1, dl, VT);
    SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
    Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));

    if (ShiftAmount)
      Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                        DAG.getConstant(ShiftAmount, dl, MVT::i32));
  }

  if (Res.getNode()) {
    KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
    // Capture demanded bits information that would be otherwise lost.
    // Only the exact i1, i8 and i16 known-zero patterns are recognised.
    if (Known.Zero == 0xfffffffe)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i1));
    else if (Known.Zero == 0xffffff00)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i8));
    else if (Known.Zero == 0xffff0000)
      Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                        DAG.getValueType(MVT::i16));
  }

  return Res;
}

static SDValue PerformBITCASTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const ARMSubtarget *ST) {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Src = N->getOperand(0);
  EVT DstVT = N->getValueType(0);

  // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
  if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
      return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
  }

  // We may have a bitcast of something that has already had this bitcast
  // combine performed on it, so skip past any VECTOR_REG_CASTs.
  while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
    Src = Src.getOperand(0);

  // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
  // would be generated is at least the width of the element type.
16932 EVT SrcVT = Src.getValueType(); 16933 if ((Src.getOpcode() == ARMISD::VMOVIMM || 16934 Src.getOpcode() == ARMISD::VMVNIMM || 16935 Src.getOpcode() == ARMISD::VMOVFPIMM) && 16936 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() && 16937 DAG.getDataLayout().isBigEndian()) 16938 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src); 16939 16940 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x 16941 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI)) 16942 return R; 16943 16944 return SDValue(); 16945 } 16946 16947 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, 16948 DAGCombinerInfo &DCI) const { 16949 switch (N->getOpcode()) { 16950 default: break; 16951 case ISD::SELECT_CC: 16952 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget); 16953 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget); 16954 case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); 16955 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); 16956 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); 16957 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); 16958 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget); 16959 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); 16960 case ISD::OR: return PerformORCombine(N, DCI, Subtarget); 16961 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); 16962 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); 16963 case ISD::BRCOND: 16964 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget); 16965 case ARMISD::ADDC: 16966 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); 16967 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); 16968 case ARMISD::BFI: return PerformBFICombine(N, DCI); 16969 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); 16970 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); 16971 case ARMISD::VMOVhr: return 
PerformVMOVhrCombine(N, DCI); 16972 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI); 16973 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); 16974 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); 16975 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); 16976 case ISD::EXTRACT_VECTOR_ELT: 16977 return PerformExtractEltCombine(N, DCI, Subtarget); 16978 case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG); 16979 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); 16980 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget); 16981 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); 16982 case ISD::FP_TO_SINT: 16983 case ISD::FP_TO_UINT: 16984 return PerformVCVTCombine(N, DCI.DAG, Subtarget); 16985 case ISD::FDIV: 16986 return PerformVDIVCombine(N, DCI.DAG, Subtarget); 16987 case ISD::INTRINSIC_WO_CHAIN: 16988 return PerformIntrinsicCombine(N, DCI); 16989 case ISD::SHL: 16990 case ISD::SRA: 16991 case ISD::SRL: 16992 return PerformShiftCombine(N, DCI, Subtarget); 16993 case ISD::SIGN_EXTEND: 16994 case ISD::ZERO_EXTEND: 16995 case ISD::ANY_EXTEND: 16996 return PerformExtendCombine(N, DCI.DAG, Subtarget); 16997 case ISD::FP_EXTEND: 16998 return PerformFPExtendCombine(N, DCI.DAG, Subtarget); 16999 case ISD::SMIN: 17000 case ISD::UMIN: 17001 case ISD::SMAX: 17002 case ISD::UMAX: 17003 return PerformMinMaxCombine(N, DCI.DAG, Subtarget); 17004 case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); 17005 case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); 17006 case ISD::LOAD: return PerformLOADCombine(N, DCI); 17007 case ARMISD::VLD1DUP: 17008 case ARMISD::VLD2DUP: 17009 case ARMISD::VLD3DUP: 17010 case ARMISD::VLD4DUP: 17011 return PerformVLDCombine(N, DCI); 17012 case ARMISD::BUILD_VECTOR: 17013 return PerformARMBUILD_VECTORCombine(N, DCI); 17014 case ISD::BITCAST: 17015 return PerformBITCASTCombine(N, DCI, 
Subtarget); 17016 case ARMISD::PREDICATE_CAST: 17017 return PerformPREDICATE_CASTCombine(N, DCI); 17018 case ARMISD::VECTOR_REG_CAST: 17019 return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget); 17020 case ARMISD::VCMP: 17021 return PerformVCMPCombine(N, DCI, Subtarget); 17022 case ISD::VECREDUCE_ADD: 17023 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget); 17024 case ARMISD::VMOVN: 17025 return PerformVMOVNCombine(N, DCI); 17026 case ARMISD::VQMOVNs: 17027 case ARMISD::VQMOVNu: 17028 return PerformVQMOVNCombine(N, DCI); 17029 case ARMISD::ASRL: 17030 case ARMISD::LSRL: 17031 case ARMISD::LSLL: 17032 return PerformLongShiftCombine(N, DCI.DAG); 17033 case ARMISD::SMULWB: { 17034 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17035 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 17036 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 17037 return SDValue(); 17038 break; 17039 } 17040 case ARMISD::SMULWT: { 17041 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17042 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 17043 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) 17044 return SDValue(); 17045 break; 17046 } 17047 case ARMISD::SMLALBB: 17048 case ARMISD::QADD16b: 17049 case ARMISD::QSUB16b: { 17050 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17051 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); 17052 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 17053 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 17054 return SDValue(); 17055 break; 17056 } 17057 case ARMISD::SMLALBT: { 17058 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); 17059 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 17060 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); 17061 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 17062 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || 17063 
(SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) 17064 return SDValue(); 17065 break; 17066 } 17067 case ARMISD::SMLALTB: { 17068 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); 17069 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); 17070 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); 17071 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); 17072 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || 17073 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) 17074 return SDValue(); 17075 break; 17076 } 17077 case ARMISD::SMLALTT: { 17078 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17079 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); 17080 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 17081 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 17082 return SDValue(); 17083 break; 17084 } 17085 case ARMISD::QADD8b: 17086 case ARMISD::QSUB8b: { 17087 unsigned BitWidth = N->getValueType(0).getSizeInBits(); 17088 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8); 17089 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || 17090 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) 17091 return SDValue(); 17092 break; 17093 } 17094 case ISD::INTRINSIC_VOID: 17095 case ISD::INTRINSIC_W_CHAIN: 17096 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { 17097 case Intrinsic::arm_neon_vld1: 17098 case Intrinsic::arm_neon_vld1x2: 17099 case Intrinsic::arm_neon_vld1x3: 17100 case Intrinsic::arm_neon_vld1x4: 17101 case Intrinsic::arm_neon_vld2: 17102 case Intrinsic::arm_neon_vld3: 17103 case Intrinsic::arm_neon_vld4: 17104 case Intrinsic::arm_neon_vld2lane: 17105 case Intrinsic::arm_neon_vld3lane: 17106 case Intrinsic::arm_neon_vld4lane: 17107 case Intrinsic::arm_neon_vld2dup: 17108 case Intrinsic::arm_neon_vld3dup: 17109 case Intrinsic::arm_neon_vld4dup: 17110 case Intrinsic::arm_neon_vst1: 17111 case 
Intrinsic::arm_neon_vst1x2: 17112 case Intrinsic::arm_neon_vst1x3: 17113 case Intrinsic::arm_neon_vst1x4: 17114 case Intrinsic::arm_neon_vst2: 17115 case Intrinsic::arm_neon_vst3: 17116 case Intrinsic::arm_neon_vst4: 17117 case Intrinsic::arm_neon_vst2lane: 17118 case Intrinsic::arm_neon_vst3lane: 17119 case Intrinsic::arm_neon_vst4lane: 17120 return PerformVLDCombine(N, DCI); 17121 case Intrinsic::arm_mve_vld2q: 17122 case Intrinsic::arm_mve_vld4q: 17123 case Intrinsic::arm_mve_vst2q: 17124 case Intrinsic::arm_mve_vst4q: 17125 return PerformMVEVLDCombine(N, DCI); 17126 default: break; 17127 } 17128 break; 17129 } 17130 return SDValue(); 17131 } 17132 17133 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, 17134 EVT VT) const { 17135 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); 17136 } 17137 17138 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, 17139 Align Alignment, 17140 MachineMemOperand::Flags, 17141 bool *Fast) const { 17142 // Depends what it gets converted into if the type is weird. 17143 if (!VT.isSimple()) 17144 return false; 17145 17146 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus 17147 bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); 17148 auto Ty = VT.getSimpleVT().SimpleTy; 17149 17150 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) { 17151 // Unaligned access can use (for example) LRDB, LRDH, LDR 17152 if (AllowsUnaligned) { 17153 if (Fast) 17154 *Fast = Subtarget->hasV7Ops(); 17155 return true; 17156 } 17157 } 17158 17159 if (Ty == MVT::f64 || Ty == MVT::v2f64) { 17160 // For any little-endian targets with neon, we can support unaligned ld/st 17161 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. 
17162 // A big-endian target may also explicitly support unaligned accesses 17163 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { 17164 if (Fast) 17165 *Fast = true; 17166 return true; 17167 } 17168 } 17169 17170 if (!Subtarget->hasMVEIntegerOps()) 17171 return false; 17172 17173 // These are for predicates 17174 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1)) { 17175 if (Fast) 17176 *Fast = true; 17177 return true; 17178 } 17179 17180 // These are for truncated stores/narrowing loads. They are fine so long as 17181 // the alignment is at least the size of the item being loaded 17182 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) && 17183 Alignment >= VT.getScalarSizeInBits() / 8) { 17184 if (Fast) 17185 *Fast = true; 17186 return true; 17187 } 17188 17189 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and 17190 // VSTRW.U32 all store the vector register in exactly the same format, and 17191 // differ only in the range of their immediate offset field and the required 17192 // alignment. So there is always a store that can be used, regardless of 17193 // actual type. 17194 // 17195 // For big endian, that is not the case. But can still emit a (VSTRB.U8; 17196 // VREV64.8) pair and get the same effect. This will likely be better than 17197 // aligning the vector through the stack. 17198 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 || 17199 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 || 17200 Ty == MVT::v2f64) { 17201 if (Fast) 17202 *Fast = true; 17203 return true; 17204 } 17205 17206 return false; 17207 } 17208 17209 17210 EVT ARMTargetLowering::getOptimalMemOpType( 17211 const MemOp &Op, const AttributeList &FuncAttributes) const { 17212 // See if we can use NEON instructions for this... 
17213 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() && 17214 !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { 17215 bool Fast; 17216 if (Op.size() >= 16 && 17217 (Op.isAligned(Align(16)) || 17218 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1), 17219 MachineMemOperand::MONone, &Fast) && 17220 Fast))) { 17221 return MVT::v2f64; 17222 } else if (Op.size() >= 8 && 17223 (Op.isAligned(Align(8)) || 17224 (allowsMisalignedMemoryAccesses( 17225 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) && 17226 Fast))) { 17227 return MVT::f64; 17228 } 17229 } 17230 17231 // Let the target-independent logic figure it out. 17232 return MVT::Other; 17233 } 17234 17235 // 64-bit integers are split into their high and low parts and held in two 17236 // different registers, so the trunc is free since the low register can just 17237 // be used. 17238 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { 17239 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) 17240 return false; 17241 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); 17242 unsigned DestBits = DstTy->getPrimitiveSizeInBits(); 17243 return (SrcBits == 64 && DestBits == 32); 17244 } 17245 17246 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { 17247 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || 17248 !DstVT.isInteger()) 17249 return false; 17250 unsigned SrcBits = SrcVT.getSizeInBits(); 17251 unsigned DestBits = DstVT.getSizeInBits(); 17252 return (SrcBits == 64 && DestBits == 32); 17253 } 17254 17255 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { 17256 if (Val.getOpcode() != ISD::LOAD) 17257 return false; 17258 17259 EVT VT1 = Val.getValueType(); 17260 if (!VT1.isSimple() || !VT1.isInteger() || 17261 !VT2.isSimple() || !VT2.isInteger()) 17262 return false; 17263 17264 switch (VT1.getSimpleVT().SimpleTy) { 17265 default: break; 17266 case MVT::i1: 17267 case MVT::i8: 17268 case MVT::i16: 17269 // 
8-bit and 16-bit loads implicitly zero-extend to 32-bits. 17270 return true; 17271 } 17272 17273 return false; 17274 } 17275 17276 bool ARMTargetLowering::isFNegFree(EVT VT) const { 17277 if (!VT.isSimple()) 17278 return false; 17279 17280 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that 17281 // negate values directly (fneg is free). So, we don't want to let the DAG 17282 // combiner rewrite fneg into xors and some other instructions. For f16 and 17283 // FullFP16 argument passing, some bitcast nodes may be introduced, 17284 // triggering this DAG combine rewrite, so we are avoiding that with this. 17285 switch (VT.getSimpleVT().SimpleTy) { 17286 default: break; 17287 case MVT::f16: 17288 return Subtarget->hasFullFP16(); 17289 } 17290 17291 return false; 17292 } 17293 17294 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth 17295 /// of the vector elements. 17296 static bool areExtractExts(Value *Ext1, Value *Ext2) { 17297 auto areExtDoubled = [](Instruction *Ext) { 17298 return Ext->getType()->getScalarSizeInBits() == 17299 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); 17300 }; 17301 17302 if (!match(Ext1, m_ZExtOrSExt(m_Value())) || 17303 !match(Ext2, m_ZExtOrSExt(m_Value())) || 17304 !areExtDoubled(cast<Instruction>(Ext1)) || 17305 !areExtDoubled(cast<Instruction>(Ext2))) 17306 return false; 17307 17308 return true; 17309 } 17310 17311 /// Check if sinking \p I's operands to I's basic block is profitable, because 17312 /// the operands can be folded into a target instruction, e.g. 17313 /// sext/zext can be folded into vsubl. 
17314 bool ARMTargetLowering::shouldSinkOperands(Instruction *I, 17315 SmallVectorImpl<Use *> &Ops) const { 17316 if (!I->getType()->isVectorTy()) 17317 return false; 17318 17319 if (Subtarget->hasNEON()) { 17320 switch (I->getOpcode()) { 17321 case Instruction::Sub: 17322 case Instruction::Add: { 17323 if (!areExtractExts(I->getOperand(0), I->getOperand(1))) 17324 return false; 17325 Ops.push_back(&I->getOperandUse(0)); 17326 Ops.push_back(&I->getOperandUse(1)); 17327 return true; 17328 } 17329 default: 17330 return false; 17331 } 17332 } 17333 17334 if (!Subtarget->hasMVEIntegerOps()) 17335 return false; 17336 17337 auto IsFMSMul = [&](Instruction *I) { 17338 if (!I->hasOneUse()) 17339 return false; 17340 auto *Sub = cast<Instruction>(*I->users().begin()); 17341 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I; 17342 }; 17343 auto IsFMS = [&](Instruction *I) { 17344 if (match(I->getOperand(0), m_FNeg(m_Value())) || 17345 match(I->getOperand(1), m_FNeg(m_Value()))) 17346 return true; 17347 return false; 17348 }; 17349 17350 auto IsSinker = [&](Instruction *I, int Operand) { 17351 switch (I->getOpcode()) { 17352 case Instruction::Add: 17353 case Instruction::Mul: 17354 case Instruction::FAdd: 17355 case Instruction::ICmp: 17356 case Instruction::FCmp: 17357 return true; 17358 case Instruction::FMul: 17359 return !IsFMSMul(I); 17360 case Instruction::Sub: 17361 case Instruction::FSub: 17362 case Instruction::Shl: 17363 case Instruction::LShr: 17364 case Instruction::AShr: 17365 return Operand == 1; 17366 case Instruction::Call: 17367 if (auto *II = dyn_cast<IntrinsicInst>(I)) { 17368 switch (II->getIntrinsicID()) { 17369 case Intrinsic::fma: 17370 return !IsFMS(I); 17371 case Intrinsic::arm_mve_add_predicated: 17372 case Intrinsic::arm_mve_mul_predicated: 17373 case Intrinsic::arm_mve_qadd_predicated: 17374 case Intrinsic::arm_mve_hadd_predicated: 17375 case Intrinsic::arm_mve_vqdmull_predicated: 17376 case 
Intrinsic::arm_mve_qdmulh_predicated:
        case Intrinsic::arm_mve_qrdmulh_predicated:
        case Intrinsic::arm_mve_fma_predicated:
          return true;
        case Intrinsic::arm_mve_sub_predicated:
        case Intrinsic::arm_mve_qsub_predicated:
        case Intrinsic::arm_mve_hsub_predicated:
          return Operand == 1;
        default:
          return false;
        }
      }
      return false;
    default:
      return false;
    }
  };

  for (auto OpIdx : enumerate(I->operands())) {
    Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
    // Make sure we are not already sinking this operand
    if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
      continue;

    Instruction *Shuffle = Op;
    if (Shuffle->getOpcode() == Instruction::BitCast)
      Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
    // We are looking for a splat that can be sunk.
    if (!Shuffle ||
        !match(Shuffle, m_Shuffle(
                            m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
                            m_Undef(), m_ZeroMask())))
      continue;
    if (!IsSinker(I, OpIdx.index()))
      continue;

    // All uses of the shuffle should be sunk to avoid duplicating it across gpr
    // and vector registers
    for (Use &U : Op->uses()) {
      Instruction *Insn = cast<Instruction>(U.getUser());
      if (!IsSinker(Insn, U.getOperandNo()))
        return false;
    }

    Ops.push_back(&Shuffle->getOperandUse(0));
    if (Shuffle != Op)
      Ops.push_back(&Op->getOperandUse(0));
    Ops.push_back(&OpIdx.value());
  }
  return true;
}

/// Return the integer type matching the scalar element of the splat \p SVI:
/// i32 for float elements and i16 for half elements. Returns nullptr when the
/// subtarget has no MVE integer ops or the element type is anything else.
Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
  if (!Subtarget->hasMVEIntegerOps())
    return nullptr;
  Type *SVIType = SVI->getType();
  Type *ScalarType = SVIType->getScalarType();

  if (ScalarType->isFloatTy())
    return Type::getInt32Ty(SVIType->getContext());
  if (ScalarType->isHalfTy())
    return Type::getInt16Ty(SVIType->getContext());
  return nullptr;
}

/// Return true if folding the extension \p ExtVal into its load operand is
/// considered worthwhile. The result type must be legal and the operand must
/// not be an expanding masked load. With MVE integer ops the answer is then
/// always true; otherwise it is false only when the single user is an ADD,
/// SUB, SHL or VSHLIMM node.
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  EVT VT = ExtVal.getValueType();

  if (!isTypeLegal(VT))
    return false;

  if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
    if (Ld->isExpandingLoad())
      return false;
  }

  if (Subtarget->hasMVEIntegerOps())
    return true;

  // Don't create a loadext if we can fold the extension into a wide/long
  // instruction.
  // If there's more than one user instruction, the loadext is desirable no
  // matter what.  There can be two uses by the same instruction.
  if (ExtVal->use_empty() ||
      !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode()))
    return true;

  SDNode *U = *ExtVal->use_begin();
  if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
       U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
    return false;

  return true;
}

/// Return true if a value of type \p Ty1 may be truncated to \p Ty2 in a tail
/// call position. Both types must be integers and \p Ty1 must be a legal type;
/// no further check is made on \p Ty2.
bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

/// Return the cost of the scaling used by addressing mode \p AM for an access
/// of type \p Ty in address space \p AS. A legal mode costs 0, except that a
/// negative scale costs 1 on subtargets reporting hasFPAO(). An illegal mode
/// yields -1.
InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                                        const AddrMode &AM,
                                                        Type *Ty,
                                                        unsigned AS) const {
  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
    if (Subtarget->hasFPAO())
      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
    return 0;
  }
  return -1;
}

/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
/// expanded to FMAs when this method returns true, otherwise fmuladd is
/// expanded to fmul + fadd.
///
/// ARM supports both fused and unfused multiply-add operations; we already
/// lower a pair of fmul and fadd to the latter so it's not clear that there
/// would be a gain or that the gain would be worthwhile enough to risk
/// correctness bugs.
///
/// For MVE, we set this to true as it helps simplify the need for some
/// patterns (and we don't have the non-fused floating point instruction).
bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                   EVT VT) const {
  // Extended (non-simple) value types are never reported as profitable.
  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::v4f32:
  case MVT::v8f16:
    return Subtarget->hasMVEFloatOps();
  case MVT::f16:
    return Subtarget->useFPVFMx16();
  case MVT::f32:
    return Subtarget->useFPVFMx();
  case MVT::f64:
    return Subtarget->useFPVFMx64();
  default:
    break;
  }

  return false;
}

/// Return true if \p V is usable as a Thumb1 load/store offset for type
/// \p VT: non-negative, a multiple of the access scale (1 for i1/i8, 2 for
/// i16, 4 for everything else), and fitting in 5 bits once divided by that
/// scale.
static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
  if (V < 0)
    return false;

  unsigned Scale = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::i1:
  case MVT::i8:
    // Scale == 1;
    break;
  case MVT::i16:
    // Scale == 2;
    Scale = 2;
    break;
  default:
    // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
    // Scale == 4;
    Scale = 4;
    break;
  }

  // Scale is a power of two, so this rejects offsets that are not a multiple
  // of it.
  if ((V & (Scale - 1)) != 0)
    return false;
  return isUInt<5>(V / Scale);
}

/// Return true if \p V is usable as a Thumb2 load/store offset for type
/// \p VT on \p Subtarget. Only integer and floating-point types are
/// considered; vectors are rejected when NEON is available, and FP vectors
/// are rejected when MVE integer ops are present without MVE float ops.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                      const ARMSubtarget *Subtarget) {
  if (!VT.isInteger() && !VT.isFloatingPoint())
    return false;
  if (VT.isVector() && Subtarget->hasNEON())
    return false;
  if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
      !Subtarget->hasMVEFloatOps())
    return false;

  // From here on V holds the magnitude of the offset; IsNeg remembers the
  // sign for the one case below where the range differs by sign.
  bool IsNeg = false;
  if (V < 0) {
    IsNeg = true;
    V = -V;
  }

  unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);

  // MVE: size * imm7
  if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
    switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
    case MVT::i32:
    case MVT::f32:
      return isShiftedUInt<7,2>(V);
    case MVT::i16:
    case MVT::f16:
      return isShiftedUInt<7,1>(V);
    case MVT::i8:
      return isUInt<7>(V);
    default:
      return false;
    }
  }

  // half VLDR: 2 * imm8
  if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
    return isShiftedUInt<8, 1>(V);
  // VLDR and LDRD: 4 * imm8
  if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
    return isShiftedUInt<8, 2>(V);

  if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
    // + imm12 or - imm8
    if (IsNeg)
      return isUInt<8>(V);
    return isUInt<12>(V);
  }

  return false;
}

/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
17611 static bool isLegalAddressImmediate(int64_t V, EVT VT, 17612 const ARMSubtarget *Subtarget) { 17613 if (V == 0) 17614 return true; 17615 17616 if (!VT.isSimple()) 17617 return false; 17618 17619 if (Subtarget->isThumb1Only()) 17620 return isLegalT1AddressImmediate(V, VT); 17621 else if (Subtarget->isThumb2()) 17622 return isLegalT2AddressImmediate(V, VT, Subtarget); 17623 17624 // ARM mode. 17625 if (V < 0) 17626 V = - V; 17627 switch (VT.getSimpleVT().SimpleTy) { 17628 default: return false; 17629 case MVT::i1: 17630 case MVT::i8: 17631 case MVT::i32: 17632 // +- imm12 17633 return isUInt<12>(V); 17634 case MVT::i16: 17635 // +- imm8 17636 return isUInt<8>(V); 17637 case MVT::f32: 17638 case MVT::f64: 17639 if (!Subtarget->hasVFP2Base()) // FIXME: NEON? 17640 return false; 17641 return isShiftedUInt<8, 2>(V); 17642 } 17643 } 17644 17645 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, 17646 EVT VT) const { 17647 int Scale = AM.Scale; 17648 if (Scale < 0) 17649 return false; 17650 17651 switch (VT.getSimpleVT().SimpleTy) { 17652 default: return false; 17653 case MVT::i1: 17654 case MVT::i8: 17655 case MVT::i16: 17656 case MVT::i32: 17657 if (Scale == 1) 17658 return true; 17659 // r + r << imm 17660 Scale = Scale & ~1; 17661 return Scale == 2 || Scale == 4 || Scale == 8; 17662 case MVT::i64: 17663 // FIXME: What are we trying to model here? ldrd doesn't have an r + r 17664 // version in Thumb mode. 17665 // r + r 17666 if (Scale == 1) 17667 return true; 17668 // r * 2 (this can be lowered to r + r). 17669 if (!AM.HasBaseReg && Scale == 2) 17670 return true; 17671 return false; 17672 case MVT::isVoid: 17673 // Note, we allow "void" uses (basically, uses that aren't loads or 17674 // stores), because arm allows folding a scale into many arithmetic 17675 // operations. This should be made more precise and revisited later. 17676 17677 // Allow r << imm, but the imm has to be a multiple of two. 
17678 if (Scale & 1) return false; 17679 return isPowerOf2_32(Scale); 17680 } 17681 } 17682 17683 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, 17684 EVT VT) const { 17685 const int Scale = AM.Scale; 17686 17687 // Negative scales are not supported in Thumb1. 17688 if (Scale < 0) 17689 return false; 17690 17691 // Thumb1 addressing modes do not support register scaling excepting the 17692 // following cases: 17693 // 1. Scale == 1 means no scaling. 17694 // 2. Scale == 2 this can be lowered to r + r if there is no base register. 17695 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); 17696 } 17697 17698 /// isLegalAddressingMode - Return true if the addressing mode represented 17699 /// by AM is legal for this target, for a load/store of the specified type. 17700 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, 17701 const AddrMode &AM, Type *Ty, 17702 unsigned AS, Instruction *I) const { 17703 EVT VT = getValueType(DL, Ty, true); 17704 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) 17705 return false; 17706 17707 // Can never fold addr of global into load/store. 17708 if (AM.BaseGV) 17709 return false; 17710 17711 switch (AM.Scale) { 17712 case 0: // no scale reg, must be "r+i" or "r", or "i". 17713 break; 17714 default: 17715 // ARM doesn't support any R+R*scale+imm addr modes. 
17716 if (AM.BaseOffs) 17717 return false; 17718 17719 if (!VT.isSimple()) 17720 return false; 17721 17722 if (Subtarget->isThumb1Only()) 17723 return isLegalT1ScaledAddressingMode(AM, VT); 17724 17725 if (Subtarget->isThumb2()) 17726 return isLegalT2ScaledAddressingMode(AM, VT); 17727 17728 int Scale = AM.Scale; 17729 switch (VT.getSimpleVT().SimpleTy) { 17730 default: return false; 17731 case MVT::i1: 17732 case MVT::i8: 17733 case MVT::i32: 17734 if (Scale < 0) Scale = -Scale; 17735 if (Scale == 1) 17736 return true; 17737 // r + r << imm 17738 return isPowerOf2_32(Scale & ~1); 17739 case MVT::i16: 17740 case MVT::i64: 17741 // r +/- r 17742 if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) 17743 return true; 17744 // r * 2 (this can be lowered to r + r). 17745 if (!AM.HasBaseReg && Scale == 2) 17746 return true; 17747 return false; 17748 17749 case MVT::isVoid: 17750 // Note, we allow "void" uses (basically, uses that aren't loads or 17751 // stores), because arm allows folding a scale into many arithmetic 17752 // operations. This should be made more precise and revisited later. 17753 17754 // Allow r << imm, but the imm has to be a multiple of two. 17755 if (Scale & 1) return false; 17756 return isPowerOf2_32(Scale); 17757 } 17758 } 17759 return true; 17760 } 17761 17762 /// isLegalICmpImmediate - Return true if the specified immediate is legal 17763 /// icmp immediate, that is the target has icmp instructions which can compare 17764 /// a register against the immediate without having to materialize the 17765 /// immediate into a register. 17766 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { 17767 // Thumb2 and ARM modes can use cmn for negative immediates. 
17768 if (!Subtarget->isThumb()) 17769 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || 17770 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; 17771 if (Subtarget->isThumb2()) 17772 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || 17773 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; 17774 // Thumb1 doesn't have cmn, and only 8-bit immediates. 17775 return Imm >= 0 && Imm <= 255; 17776 } 17777 17778 /// isLegalAddImmediate - Return true if the specified immediate is a legal add 17779 /// *or sub* immediate, that is the target has add or sub instructions which can 17780 /// add a register with the immediate without having to materialize the 17781 /// immediate into a register. 17782 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { 17783 // Same encoding for add/sub, just flip the sign. 17784 int64_t AbsImm = std::abs(Imm); 17785 if (!Subtarget->isThumb()) 17786 return ARM_AM::getSOImmVal(AbsImm) != -1; 17787 if (Subtarget->isThumb2()) 17788 return ARM_AM::getT2SOImmVal(AbsImm) != -1; 17789 // Thumb1 only has 8-bit unsigned immediate. 
17790 return AbsImm >= 0 && AbsImm <= 255; 17791 } 17792 17793 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, 17794 bool isSEXTLoad, SDValue &Base, 17795 SDValue &Offset, bool &isInc, 17796 SelectionDAG &DAG) { 17797 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 17798 return false; 17799 17800 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { 17801 // AddressingMode 3 17802 Base = Ptr->getOperand(0); 17803 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 17804 int RHSC = (int)RHS->getZExtValue(); 17805 if (RHSC < 0 && RHSC > -256) { 17806 assert(Ptr->getOpcode() == ISD::ADD); 17807 isInc = false; 17808 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 17809 return true; 17810 } 17811 } 17812 isInc = (Ptr->getOpcode() == ISD::ADD); 17813 Offset = Ptr->getOperand(1); 17814 return true; 17815 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { 17816 // AddressingMode 2 17817 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 17818 int RHSC = (int)RHS->getZExtValue(); 17819 if (RHSC < 0 && RHSC > -0x1000) { 17820 assert(Ptr->getOpcode() == ISD::ADD); 17821 isInc = false; 17822 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 17823 Base = Ptr->getOperand(0); 17824 return true; 17825 } 17826 } 17827 17828 if (Ptr->getOpcode() == ISD::ADD) { 17829 isInc = true; 17830 ARM_AM::ShiftOpc ShOpcVal= 17831 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); 17832 if (ShOpcVal != ARM_AM::no_shift) { 17833 Base = Ptr->getOperand(1); 17834 Offset = Ptr->getOperand(0); 17835 } else { 17836 Base = Ptr->getOperand(0); 17837 Offset = Ptr->getOperand(1); 17838 } 17839 return true; 17840 } 17841 17842 isInc = (Ptr->getOpcode() == ISD::ADD); 17843 Base = Ptr->getOperand(0); 17844 Offset = Ptr->getOperand(1); 17845 return true; 17846 } 17847 17848 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. 
17849 return false; 17850 } 17851 17852 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, 17853 bool isSEXTLoad, SDValue &Base, 17854 SDValue &Offset, bool &isInc, 17855 SelectionDAG &DAG) { 17856 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 17857 return false; 17858 17859 Base = Ptr->getOperand(0); 17860 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) { 17861 int RHSC = (int)RHS->getZExtValue(); 17862 if (RHSC < 0 && RHSC > -0x100) { // 8 bits. 17863 assert(Ptr->getOpcode() == ISD::ADD); 17864 isInc = false; 17865 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 17866 return true; 17867 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. 17868 isInc = Ptr->getOpcode() == ISD::ADD; 17869 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 17870 return true; 17871 } 17872 } 17873 17874 return false; 17875 } 17876 17877 static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, 17878 bool isSEXTLoad, bool IsMasked, bool isLE, 17879 SDValue &Base, SDValue &Offset, 17880 bool &isInc, SelectionDAG &DAG) { 17881 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) 17882 return false; 17883 if (!isa<ConstantSDNode>(Ptr->getOperand(1))) 17884 return false; 17885 17886 // We allow LE non-masked loads to change the type (for example use a vldrb.8 17887 // as opposed to a vldrw.32). This can allow extra addressing modes or 17888 // alignments for what is otherwise an equivalent instruction. 
17889 bool CanChangeType = isLE && !IsMasked; 17890 17891 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1)); 17892 int RHSC = (int)RHS->getZExtValue(); 17893 17894 auto IsInRange = [&](int RHSC, int Limit, int Scale) { 17895 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) { 17896 assert(Ptr->getOpcode() == ISD::ADD); 17897 isInc = false; 17898 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); 17899 return true; 17900 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) { 17901 isInc = Ptr->getOpcode() == ISD::ADD; 17902 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); 17903 return true; 17904 } 17905 return false; 17906 }; 17907 17908 // Try to find a matching instruction based on s/zext, Alignment, Offset and 17909 // (in BE/masked) type. 17910 Base = Ptr->getOperand(0); 17911 if (VT == MVT::v4i16) { 17912 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2)) 17913 return true; 17914 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { 17915 if (IsInRange(RHSC, 0x80, 1)) 17916 return true; 17917 } else if (Alignment >= 4 && 17918 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && 17919 IsInRange(RHSC, 0x80, 4)) 17920 return true; 17921 else if (Alignment >= 2 && 17922 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && 17923 IsInRange(RHSC, 0x80, 2)) 17924 return true; 17925 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) 17926 return true; 17927 return false; 17928 } 17929 17930 /// getPreIndexedAddressParts - returns true by value, base pointer and 17931 /// offset pointer and addressing mode by reference if the node's address 17932 /// can be legally represented as pre-indexed load / store address. 
17933 bool 17934 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, 17935 SDValue &Offset, 17936 ISD::MemIndexedMode &AM, 17937 SelectionDAG &DAG) const { 17938 if (Subtarget->isThumb1Only()) 17939 return false; 17940 17941 EVT VT; 17942 SDValue Ptr; 17943 Align Alignment; 17944 bool isSEXTLoad = false; 17945 bool IsMasked = false; 17946 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 17947 Ptr = LD->getBasePtr(); 17948 VT = LD->getMemoryVT(); 17949 Alignment = LD->getAlign(); 17950 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 17951 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 17952 Ptr = ST->getBasePtr(); 17953 VT = ST->getMemoryVT(); 17954 Alignment = ST->getAlign(); 17955 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 17956 Ptr = LD->getBasePtr(); 17957 VT = LD->getMemoryVT(); 17958 Alignment = LD->getAlign(); 17959 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 17960 IsMasked = true; 17961 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 17962 Ptr = ST->getBasePtr(); 17963 VT = ST->getMemoryVT(); 17964 Alignment = ST->getAlign(); 17965 IsMasked = true; 17966 } else 17967 return false; 17968 17969 bool isInc; 17970 bool isLegal = false; 17971 if (VT.isVector()) 17972 isLegal = Subtarget->hasMVEIntegerOps() && 17973 getMVEIndexedAddressParts( 17974 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked, 17975 Subtarget->isLittle(), Base, Offset, isInc, DAG); 17976 else { 17977 if (Subtarget->isThumb2()) 17978 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 17979 Offset, isInc, DAG); 17980 else 17981 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, 17982 Offset, isInc, DAG); 17983 } 17984 if (!isLegal) 17985 return false; 17986 17987 AM = isInc ? 
ISD::PRE_INC : ISD::PRE_DEC; 17988 return true; 17989 } 17990 17991 /// getPostIndexedAddressParts - returns true by value, base pointer and 17992 /// offset pointer and addressing mode by reference if this node can be 17993 /// combined with a load / store to form a post-indexed load / store. 17994 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, 17995 SDValue &Base, 17996 SDValue &Offset, 17997 ISD::MemIndexedMode &AM, 17998 SelectionDAG &DAG) const { 17999 EVT VT; 18000 SDValue Ptr; 18001 Align Alignment; 18002 bool isSEXTLoad = false, isNonExt; 18003 bool IsMasked = false; 18004 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) { 18005 VT = LD->getMemoryVT(); 18006 Ptr = LD->getBasePtr(); 18007 Alignment = LD->getAlign(); 18008 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 18009 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 18010 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) { 18011 VT = ST->getMemoryVT(); 18012 Ptr = ST->getBasePtr(); 18013 Alignment = ST->getAlign(); 18014 isNonExt = !ST->isTruncatingStore(); 18015 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) { 18016 VT = LD->getMemoryVT(); 18017 Ptr = LD->getBasePtr(); 18018 Alignment = LD->getAlign(); 18019 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; 18020 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; 18021 IsMasked = true; 18022 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) { 18023 VT = ST->getMemoryVT(); 18024 Ptr = ST->getBasePtr(); 18025 Alignment = ST->getAlign(); 18026 isNonExt = !ST->isTruncatingStore(); 18027 IsMasked = true; 18028 } else 18029 return false; 18030 18031 if (Subtarget->isThumb1Only()) { 18032 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It 18033 // must be non-extending/truncating, i32, with an offset of 4. 
18034 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); 18035 if (Op->getOpcode() != ISD::ADD || !isNonExt) 18036 return false; 18037 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1)); 18038 if (!RHS || RHS->getZExtValue() != 4) 18039 return false; 18040 18041 Offset = Op->getOperand(1); 18042 Base = Op->getOperand(0); 18043 AM = ISD::POST_INC; 18044 return true; 18045 } 18046 18047 bool isInc; 18048 bool isLegal = false; 18049 if (VT.isVector()) 18050 isLegal = Subtarget->hasMVEIntegerOps() && 18051 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked, 18052 Subtarget->isLittle(), Base, Offset, 18053 isInc, DAG); 18054 else { 18055 if (Subtarget->isThumb2()) 18056 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 18057 isInc, DAG); 18058 else 18059 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, 18060 isInc, DAG); 18061 } 18062 if (!isLegal) 18063 return false; 18064 18065 if (Ptr != Base) { 18066 // Swap base ptr and offset to catch more post-index load / store when 18067 // it's legal. In Thumb2 mode, offset must be an immediate. 18068 if (Ptr == Offset && Op->getOpcode() == ISD::ADD && 18069 !Subtarget->isThumb2()) 18070 std::swap(Base, Offset); 18071 18072 // Post-indexed load / store update the base pointer. 18073 if (Ptr != Base) 18074 return false; 18075 } 18076 18077 AM = isInc ? ISD::POST_INC : ISD::POST_DEC; 18078 return true; 18079 } 18080 18081 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, 18082 KnownBits &Known, 18083 const APInt &DemandedElts, 18084 const SelectionDAG &DAG, 18085 unsigned Depth) const { 18086 unsigned BitWidth = Known.getBitWidth(); 18087 Known.resetAll(); 18088 switch (Op.getOpcode()) { 18089 default: break; 18090 case ARMISD::ADDC: 18091 case ARMISD::ADDE: 18092 case ARMISD::SUBC: 18093 case ARMISD::SUBE: 18094 // Special cases when we convert a carry to a boolean. 
18095 if (Op.getResNo() == 0) { 18096 SDValue LHS = Op.getOperand(0); 18097 SDValue RHS = Op.getOperand(1); 18098 // (ADDE 0, 0, C) will give us a single bit. 18099 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) && 18100 isNullConstant(RHS)) { 18101 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); 18102 return; 18103 } 18104 } 18105 break; 18106 case ARMISD::CMOV: { 18107 // Bits are known zero/one if known on the LHS and RHS. 18108 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1); 18109 if (Known.isUnknown()) 18110 return; 18111 18112 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1); 18113 Known = KnownBits::commonBits(Known, KnownRHS); 18114 return; 18115 } 18116 case ISD::INTRINSIC_W_CHAIN: { 18117 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); 18118 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); 18119 switch (IntID) { 18120 default: return; 18121 case Intrinsic::arm_ldaex: 18122 case Intrinsic::arm_ldrex: { 18123 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT(); 18124 unsigned MemBits = VT.getScalarSizeInBits(); 18125 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); 18126 return; 18127 } 18128 } 18129 } 18130 case ARMISD::BFI: { 18131 // Conservatively, we can recurse down the first operand 18132 // and just mask out all affected bits. 18133 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); 18134 18135 // The operand to BFI is already a mask suitable for removing the bits it 18136 // sets. 
18137 ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); 18138 const APInt &Mask = CI->getAPIntValue(); 18139 Known.Zero &= Mask; 18140 Known.One &= Mask; 18141 return; 18142 } 18143 case ARMISD::VGETLANEs: 18144 case ARMISD::VGETLANEu: { 18145 const SDValue &SrcSV = Op.getOperand(0); 18146 EVT VecVT = SrcSV.getValueType(); 18147 assert(VecVT.isVector() && "VGETLANE expected a vector type"); 18148 const unsigned NumSrcElts = VecVT.getVectorNumElements(); 18149 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode()); 18150 assert(Pos->getAPIntValue().ult(NumSrcElts) && 18151 "VGETLANE index out of bounds"); 18152 unsigned Idx = Pos->getZExtValue(); 18153 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx); 18154 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1); 18155 18156 EVT VT = Op.getValueType(); 18157 const unsigned DstSz = VT.getScalarSizeInBits(); 18158 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); 18159 (void)SrcSz; 18160 assert(SrcSz == Known.getBitWidth()); 18161 assert(DstSz > SrcSz); 18162 if (Op.getOpcode() == ARMISD::VGETLANEs) 18163 Known = Known.sext(DstSz); 18164 else { 18165 Known = Known.zext(DstSz); 18166 } 18167 assert(DstSz == Known.getBitWidth()); 18168 break; 18169 } 18170 case ARMISD::VMOVrh: { 18171 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 18172 assert(KnownOp.getBitWidth() == 16); 18173 Known = KnownOp.zext(32); 18174 break; 18175 } 18176 case ARMISD::CSINC: 18177 case ARMISD::CSINV: 18178 case ARMISD::CSNEG: { 18179 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); 18180 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); 18181 18182 // The result is either: 18183 // CSINC: KnownOp0 or KnownOp1 + 1 18184 // CSINV: KnownOp0 or ~KnownOp1 18185 // CSNEG: KnownOp0 or KnownOp1 * -1 18186 if (Op.getOpcode() == ARMISD::CSINC) 18187 KnownOp1 = KnownBits::computeForAddSub( 18188 true, false, KnownOp1, 
KnownBits::makeConstant(APInt(32, 1))); 18189 else if (Op.getOpcode() == ARMISD::CSINV) 18190 std::swap(KnownOp1.Zero, KnownOp1.One); 18191 else if (Op.getOpcode() == ARMISD::CSNEG) 18192 KnownOp1 = KnownBits::mul( 18193 KnownOp1, KnownBits::makeConstant(APInt(32, -1))); 18194 18195 Known = KnownBits::commonBits(KnownOp0, KnownOp1); 18196 break; 18197 } 18198 } 18199 } 18200 18201 bool ARMTargetLowering::targetShrinkDemandedConstant( 18202 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, 18203 TargetLoweringOpt &TLO) const { 18204 // Delay optimization, so we don't have to deal with illegal types, or block 18205 // optimizations. 18206 if (!TLO.LegalOps) 18207 return false; 18208 18209 // Only optimize AND for now. 18210 if (Op.getOpcode() != ISD::AND) 18211 return false; 18212 18213 EVT VT = Op.getValueType(); 18214 18215 // Ignore vectors. 18216 if (VT.isVector()) 18217 return false; 18218 18219 assert(VT == MVT::i32 && "Unexpected integer type"); 18220 18221 // Make sure the RHS really is a constant. 18222 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 18223 if (!C) 18224 return false; 18225 18226 unsigned Mask = C->getZExtValue(); 18227 18228 unsigned Demanded = DemandedBits.getZExtValue(); 18229 unsigned ShrunkMask = Mask & Demanded; 18230 unsigned ExpandedMask = Mask | ~Demanded; 18231 18232 // If the mask is all zeros, let the target-independent code replace the 18233 // result with zero. 18234 if (ShrunkMask == 0) 18235 return false; 18236 18237 // If the mask is all ones, erase the AND. (Currently, the target-independent 18238 // code won't do this, so we have to do it explicitly to avoid an infinite 18239 // loop in obscure cases.) 
18240 if (ExpandedMask == ~0U) 18241 return TLO.CombineTo(Op, Op.getOperand(0)); 18242 18243 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool { 18244 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0; 18245 }; 18246 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool { 18247 if (NewMask == Mask) 18248 return true; 18249 SDLoc DL(Op); 18250 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT); 18251 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); 18252 return TLO.CombineTo(Op, NewOp); 18253 }; 18254 18255 // Prefer uxtb mask. 18256 if (IsLegalMask(0xFF)) 18257 return UseMask(0xFF); 18258 18259 // Prefer uxth mask. 18260 if (IsLegalMask(0xFFFF)) 18261 return UseMask(0xFFFF); 18262 18263 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2. 18264 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 18265 if (ShrunkMask < 256) 18266 return UseMask(ShrunkMask); 18267 18268 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2. 18269 // FIXME: Prefer a contiguous sequence of bits for other optimizations. 18270 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256) 18271 return UseMask(ExpandedMask); 18272 18273 // Potential improvements: 18274 // 18275 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here. 18276 // We could try to prefer Thumb1 immediates which can be lowered to a 18277 // two-instruction sequence. 18278 // We could try to recognize more legal ARM/Thumb2 immediates here. 
18279 18280 return false; 18281 } 18282 18283 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode( 18284 SDValue Op, const APInt &OriginalDemandedBits, 18285 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, 18286 unsigned Depth) const { 18287 unsigned Opc = Op.getOpcode(); 18288 18289 switch (Opc) { 18290 case ARMISD::ASRL: 18291 case ARMISD::LSRL: { 18292 // If this is result 0 and the other result is unused, see if the demand 18293 // bits allow us to shrink this long shift into a standard small shift in 18294 // the opposite direction. 18295 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) && 18296 isa<ConstantSDNode>(Op->getOperand(2))) { 18297 unsigned ShAmt = Op->getConstantOperandVal(2); 18298 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf( 18299 APInt::getAllOnesValue(32) << (32 - ShAmt))) 18300 return TLO.CombineTo( 18301 Op, TLO.DAG.getNode( 18302 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1), 18303 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32))); 18304 } 18305 break; 18306 } 18307 } 18308 18309 return TargetLowering::SimplifyDemandedBitsForTargetNode( 18310 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); 18311 } 18312 18313 //===----------------------------------------------------------------------===// 18314 // ARM Inline Assembly Support 18315 //===----------------------------------------------------------------------===// 18316 18317 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { 18318 // Looking for "rev" which is V6+. 
18319 if (!Subtarget->hasV6Ops()) 18320 return false; 18321 18322 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); 18323 std::string AsmStr = IA->getAsmString(); 18324 SmallVector<StringRef, 4> AsmPieces; 18325 SplitString(AsmStr, AsmPieces, ";\n"); 18326 18327 switch (AsmPieces.size()) { 18328 default: return false; 18329 case 1: 18330 AsmStr = std::string(AsmPieces[0]); 18331 AsmPieces.clear(); 18332 SplitString(AsmStr, AsmPieces, " \t,"); 18333 18334 // rev $0, $1 18335 if (AsmPieces.size() == 3 && 18336 AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && 18337 IA->getConstraintString().compare(0, 4, "=l,l") == 0) { 18338 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 18339 if (Ty && Ty->getBitWidth() == 32) 18340 return IntrinsicLowering::LowerToByteSwap(CI); 18341 } 18342 break; 18343 } 18344 18345 return false; 18346 } 18347 18348 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { 18349 // At this point, we have to lower this constraint to something else, so we 18350 // lower it to an "r" or "w". However, by doing this we will force the result 18351 // to be in register, while the X constraint is much more permissive. 18352 // 18353 // Although we are correct (we are free to emit anything, without 18354 // constraints), we might break use cases that would expect us to be more 18355 // efficient and emit something else. 18356 if (!Subtarget->hasVFP2Base()) 18357 return "r"; 18358 if (ConstraintVT.isFloatingPoint()) 18359 return "w"; 18360 if (ConstraintVT.isVector() && Subtarget->hasNEON() && 18361 (ConstraintVT.getSizeInBits() == 64 || 18362 ConstraintVT.getSizeInBits() == 128)) 18363 return "w"; 18364 18365 return "r"; 18366 } 18367 18368 /// getConstraintType - Given a constraint letter, return the type of 18369 /// constraint it is for this target. 
18370 ARMTargetLowering::ConstraintType 18371 ARMTargetLowering::getConstraintType(StringRef Constraint) const { 18372 unsigned S = Constraint.size(); 18373 if (S == 1) { 18374 switch (Constraint[0]) { 18375 default: break; 18376 case 'l': return C_RegisterClass; 18377 case 'w': return C_RegisterClass; 18378 case 'h': return C_RegisterClass; 18379 case 'x': return C_RegisterClass; 18380 case 't': return C_RegisterClass; 18381 case 'j': return C_Immediate; // Constant for movw. 18382 // An address with a single base register. Due to the way we 18383 // currently handle addresses it is the same as an 'r' memory constraint. 18384 case 'Q': return C_Memory; 18385 } 18386 } else if (S == 2) { 18387 switch (Constraint[0]) { 18388 default: break; 18389 case 'T': return C_RegisterClass; 18390 // All 'U+' constraints are addresses. 18391 case 'U': return C_Memory; 18392 } 18393 } 18394 return TargetLowering::getConstraintType(Constraint); 18395 } 18396 18397 /// Examine constraint type and operand type and determine a weight value. 18398 /// This object must already have been set up with the operand type 18399 /// and the current alternative constraint selected. 18400 TargetLowering::ConstraintWeight 18401 ARMTargetLowering::getSingleConstraintMatchWeight( 18402 AsmOperandInfo &info, const char *constraint) const { 18403 ConstraintWeight weight = CW_Invalid; 18404 Value *CallOperandVal = info.CallOperandVal; 18405 // If we don't have a value, we can't do a match, 18406 // but allow it at the lowest weight. 18407 if (!CallOperandVal) 18408 return CW_Default; 18409 Type *type = CallOperandVal->getType(); 18410 // Look at the constraint type. 
18411 switch (*constraint) { 18412 default: 18413 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 18414 break; 18415 case 'l': 18416 if (type->isIntegerTy()) { 18417 if (Subtarget->isThumb()) 18418 weight = CW_SpecificReg; 18419 else 18420 weight = CW_Register; 18421 } 18422 break; 18423 case 'w': 18424 if (type->isFloatingPointTy()) 18425 weight = CW_Register; 18426 break; 18427 } 18428 return weight; 18429 } 18430 18431 using RCPair = std::pair<unsigned, const TargetRegisterClass *>; 18432 18433 RCPair ARMTargetLowering::getRegForInlineAsmConstraint( 18434 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { 18435 switch (Constraint.size()) { 18436 case 1: 18437 // GCC ARM Constraint Letters 18438 switch (Constraint[0]) { 18439 case 'l': // Low regs or general regs. 18440 if (Subtarget->isThumb()) 18441 return RCPair(0U, &ARM::tGPRRegClass); 18442 return RCPair(0U, &ARM::GPRRegClass); 18443 case 'h': // High regs or no regs. 18444 if (Subtarget->isThumb()) 18445 return RCPair(0U, &ARM::hGPRRegClass); 18446 break; 18447 case 'r': 18448 if (Subtarget->isThumb1Only()) 18449 return RCPair(0U, &ARM::tGPRRegClass); 18450 return RCPair(0U, &ARM::GPRRegClass); 18451 case 'w': 18452 if (VT == MVT::Other) 18453 break; 18454 if (VT == MVT::f32) 18455 return RCPair(0U, &ARM::SPRRegClass); 18456 if (VT.getSizeInBits() == 64) 18457 return RCPair(0U, &ARM::DPRRegClass); 18458 if (VT.getSizeInBits() == 128) 18459 return RCPair(0U, &ARM::QPRRegClass); 18460 break; 18461 case 'x': 18462 if (VT == MVT::Other) 18463 break; 18464 if (VT == MVT::f32) 18465 return RCPair(0U, &ARM::SPR_8RegClass); 18466 if (VT.getSizeInBits() == 64) 18467 return RCPair(0U, &ARM::DPR_8RegClass); 18468 if (VT.getSizeInBits() == 128) 18469 return RCPair(0U, &ARM::QPR_8RegClass); 18470 break; 18471 case 't': 18472 if (VT == MVT::Other) 18473 break; 18474 if (VT == MVT::f32 || VT == MVT::i32) 18475 return RCPair(0U, &ARM::SPRRegClass); 18476 if (VT.getSizeInBits() 
== 64) 18477 return RCPair(0U, &ARM::DPR_VFP2RegClass); 18478 if (VT.getSizeInBits() == 128) 18479 return RCPair(0U, &ARM::QPR_VFP2RegClass); 18480 break; 18481 } 18482 break; 18483 18484 case 2: 18485 if (Constraint[0] == 'T') { 18486 switch (Constraint[1]) { 18487 default: 18488 break; 18489 case 'e': 18490 return RCPair(0U, &ARM::tGPREvenRegClass); 18491 case 'o': 18492 return RCPair(0U, &ARM::tGPROddRegClass); 18493 } 18494 } 18495 break; 18496 18497 default: 18498 break; 18499 } 18500 18501 if (StringRef("{cc}").equals_lower(Constraint)) 18502 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); 18503 18504 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); 18505 } 18506 18507 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 18508 /// vector. If it is invalid, don't add anything to Ops. 18509 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, 18510 std::string &Constraint, 18511 std::vector<SDValue>&Ops, 18512 SelectionDAG &DAG) const { 18513 SDValue Result; 18514 18515 // Currently only support length 1 constraints. 18516 if (Constraint.length() != 1) return; 18517 18518 char ConstraintLetter = Constraint[0]; 18519 switch (ConstraintLetter) { 18520 default: break; 18521 case 'j': 18522 case 'I': case 'J': case 'K': case 'L': 18523 case 'M': case 'N': case 'O': 18524 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); 18525 if (!C) 18526 return; 18527 18528 int64_t CVal64 = C->getSExtValue(); 18529 int CVal = (int) CVal64; 18530 // None of these constraints allow values larger than 32 bits. Check 18531 // that the value fits in an int. 18532 if (CVal != CVal64) 18533 return; 18534 18535 switch (ConstraintLetter) { 18536 case 'j': 18537 // Constant suitable for movw, must be between 0 and 18538 // 65535. 
18539 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps())) 18540 if (CVal >= 0 && CVal <= 65535) 18541 break; 18542 return; 18543 case 'I': 18544 if (Subtarget->isThumb1Only()) { 18545 // This must be a constant between 0 and 255, for ADD 18546 // immediates. 18547 if (CVal >= 0 && CVal <= 255) 18548 break; 18549 } else if (Subtarget->isThumb2()) { 18550 // A constant that can be used as an immediate value in a 18551 // data-processing instruction. 18552 if (ARM_AM::getT2SOImmVal(CVal) != -1) 18553 break; 18554 } else { 18555 // A constant that can be used as an immediate value in a 18556 // data-processing instruction. 18557 if (ARM_AM::getSOImmVal(CVal) != -1) 18558 break; 18559 } 18560 return; 18561 18562 case 'J': 18563 if (Subtarget->isThumb1Only()) { 18564 // This must be a constant between -255 and -1, for negated ADD 18565 // immediates. This can be used in GCC with an "n" modifier that 18566 // prints the negated value, for use with SUB instructions. It is 18567 // not useful otherwise but is implemented for compatibility. 18568 if (CVal >= -255 && CVal <= -1) 18569 break; 18570 } else { 18571 // This must be a constant between -4095 and 4095. It is not clear 18572 // what this constraint is intended for. Implemented for 18573 // compatibility with GCC. 18574 if (CVal >= -4095 && CVal <= 4095) 18575 break; 18576 } 18577 return; 18578 18579 case 'K': 18580 if (Subtarget->isThumb1Only()) { 18581 // A 32-bit value where only one byte has a nonzero value. Exclude 18582 // zero to match GCC. This constraint is used by GCC internally for 18583 // constants that can be loaded with a move/shift combination. 18584 // It is not useful otherwise but is implemented for compatibility. 18585 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) 18586 break; 18587 } else if (Subtarget->isThumb2()) { 18588 // A constant whose bitwise inverse can be used as an immediate 18589 // value in a data-processing instruction. 
This can be used in GCC 18590 // with a "B" modifier that prints the inverted value, for use with 18591 // BIC and MVN instructions. It is not useful otherwise but is 18592 // implemented for compatibility. 18593 if (ARM_AM::getT2SOImmVal(~CVal) != -1) 18594 break; 18595 } else { 18596 // A constant whose bitwise inverse can be used as an immediate 18597 // value in a data-processing instruction. This can be used in GCC 18598 // with a "B" modifier that prints the inverted value, for use with 18599 // BIC and MVN instructions. It is not useful otherwise but is 18600 // implemented for compatibility. 18601 if (ARM_AM::getSOImmVal(~CVal) != -1) 18602 break; 18603 } 18604 return; 18605 18606 case 'L': 18607 if (Subtarget->isThumb1Only()) { 18608 // This must be a constant between -7 and 7, 18609 // for 3-operand ADD/SUB immediate instructions. 18610 if (CVal >= -7 && CVal < 7) 18611 break; 18612 } else if (Subtarget->isThumb2()) { 18613 // A constant whose negation can be used as an immediate value in a 18614 // data-processing instruction. This can be used in GCC with an "n" 18615 // modifier that prints the negated value, for use with SUB 18616 // instructions. It is not useful otherwise but is implemented for 18617 // compatibility. 18618 if (ARM_AM::getT2SOImmVal(-CVal) != -1) 18619 break; 18620 } else { 18621 // A constant whose negation can be used as an immediate value in a 18622 // data-processing instruction. This can be used in GCC with an "n" 18623 // modifier that prints the negated value, for use with SUB 18624 // instructions. It is not useful otherwise but is implemented for 18625 // compatibility. 18626 if (ARM_AM::getSOImmVal(-CVal) != -1) 18627 break; 18628 } 18629 return; 18630 18631 case 'M': 18632 if (Subtarget->isThumb1Only()) { 18633 // This must be a multiple of 4 between 0 and 1020, for 18634 // ADD sp + immediate. 
18635 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) 18636 break; 18637 } else { 18638 // A power of two or a constant between 0 and 32. This is used in 18639 // GCC for the shift amount on shifted register operands, but it is 18640 // useful in general for any shift amounts. 18641 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) 18642 break; 18643 } 18644 return; 18645 18646 case 'N': 18647 if (Subtarget->isThumb1Only()) { 18648 // This must be a constant between 0 and 31, for shift amounts. 18649 if (CVal >= 0 && CVal <= 31) 18650 break; 18651 } 18652 return; 18653 18654 case 'O': 18655 if (Subtarget->isThumb1Only()) { 18656 // This must be a multiple of 4 between -508 and 508, for 18657 // ADD/SUB sp = sp + immediate. 18658 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) 18659 break; 18660 } 18661 return; 18662 } 18663 Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); 18664 break; 18665 } 18666 18667 if (Result.getNode()) { 18668 Ops.push_back(Result); 18669 return; 18670 } 18671 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 18672 } 18673 18674 static RTLIB::Libcall getDivRemLibcall( 18675 const SDNode *N, MVT::SimpleValueType SVT) { 18676 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 18677 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 18678 "Unhandled Opcode in getDivRemLibcall"); 18679 bool isSigned = N->getOpcode() == ISD::SDIVREM || 18680 N->getOpcode() == ISD::SREM; 18681 RTLIB::Libcall LC; 18682 switch (SVT) { 18683 default: llvm_unreachable("Unexpected request for libcall!"); 18684 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; 18685 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; 18686 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; 18687 case MVT::i64: LC = isSigned ? 
RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; 18688 } 18689 return LC; 18690 } 18691 18692 static TargetLowering::ArgListTy getDivRemArgList( 18693 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { 18694 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || 18695 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && 18696 "Unhandled Opcode in getDivRemArgList"); 18697 bool isSigned = N->getOpcode() == ISD::SDIVREM || 18698 N->getOpcode() == ISD::SREM; 18699 TargetLowering::ArgListTy Args; 18700 TargetLowering::ArgListEntry Entry; 18701 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 18702 EVT ArgVT = N->getOperand(i).getValueType(); 18703 Type *ArgTy = ArgVT.getTypeForEVT(*Context); 18704 Entry.Node = N->getOperand(i); 18705 Entry.Ty = ArgTy; 18706 Entry.IsSExt = isSigned; 18707 Entry.IsZExt = !isSigned; 18708 Args.push_back(Entry); 18709 } 18710 if (Subtarget->isTargetWindows() && Args.size() >= 2) 18711 std::swap(Args[0], Args[1]); 18712 return Args; 18713 } 18714 18715 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { 18716 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || 18717 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || 18718 Subtarget->isTargetWindows()) && 18719 "Register-based DivRem lowering only"); 18720 unsigned Opcode = Op->getOpcode(); 18721 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && 18722 "Invalid opcode for Div/Rem lowering"); 18723 bool isSigned = (Opcode == ISD::SDIVREM); 18724 EVT VT = Op->getValueType(0); 18725 Type *Ty = VT.getTypeForEVT(*DAG.getContext()); 18726 SDLoc dl(Op); 18727 18728 // If the target has hardware divide, use divide + multiply + subtract: 18729 // div = a / b 18730 // rem = a - b * div 18731 // return {div, rem} 18732 // This should be lowered into UDIV/SDIV + MLS later on. 18733 bool hasDivide = Subtarget->isThumb() ? 
Subtarget->hasDivideInThumbMode() 18734 : Subtarget->hasDivideInARMMode(); 18735 if (hasDivide && Op->getValueType(0).isSimple() && 18736 Op->getSimpleValueType(0) == MVT::i32) { 18737 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV; 18738 const SDValue Dividend = Op->getOperand(0); 18739 const SDValue Divisor = Op->getOperand(1); 18740 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); 18741 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); 18742 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); 18743 18744 SDValue Values[2] = {Div, Rem}; 18745 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); 18746 } 18747 18748 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), 18749 VT.getSimpleVT().SimpleTy); 18750 SDValue InChain = DAG.getEntryNode(); 18751 18752 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), 18753 DAG.getContext(), 18754 Subtarget); 18755 18756 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 18757 getPointerTy(DAG.getDataLayout())); 18758 18759 Type *RetTy = StructType::get(Ty, Ty); 18760 18761 if (Subtarget->isTargetWindows()) 18762 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); 18763 18764 TargetLowering::CallLoweringInfo CLI(DAG); 18765 CLI.setDebugLoc(dl).setChain(InChain) 18766 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) 18767 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); 18768 18769 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); 18770 return CallInfo.first; 18771 } 18772 18773 // Lowers REM using divmod helpers 18774 // see RTABI section 4.2/4.3 18775 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { 18776 // Build return types (div and rem) 18777 std::vector<Type*> RetTyParams; 18778 Type *RetTyElement; 18779 18780 switch (N->getValueType(0).getSimpleVT().SimpleTy) { 18781 default: llvm_unreachable("Unexpected request for libcall!"); 18782 case MVT::i8: RetTyElement = 
Type::getInt8Ty(*DAG.getContext()); break; 18783 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; 18784 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; 18785 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; 18786 } 18787 18788 RetTyParams.push_back(RetTyElement); 18789 RetTyParams.push_back(RetTyElement); 18790 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); 18791 Type *RetTy = StructType::get(*DAG.getContext(), ret); 18792 18793 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 18794 SimpleTy); 18795 SDValue InChain = DAG.getEntryNode(); 18796 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), 18797 Subtarget); 18798 bool isSigned = N->getOpcode() == ISD::SREM; 18799 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), 18800 getPointerTy(DAG.getDataLayout())); 18801 18802 if (Subtarget->isTargetWindows()) 18803 InChain = WinDBZCheckDenominator(DAG, N, InChain); 18804 18805 // Lower call 18806 CallLoweringInfo CLI(DAG); 18807 CLI.setChain(InChain) 18808 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) 18809 .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); 18810 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); 18811 18812 // Return second (rem) result operand (first contains div) 18813 SDNode *ResNode = CallResult.first.getNode(); 18814 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); 18815 return ResNode->getOperand(1); 18816 } 18817 18818 SDValue 18819 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { 18820 assert(Subtarget->isTargetWindows() && "unsupported target platform"); 18821 SDLoc DL(Op); 18822 18823 // Get the inputs. 
18824 SDValue Chain = Op.getOperand(0); 18825 SDValue Size = Op.getOperand(1); 18826 18827 if (DAG.getMachineFunction().getFunction().hasFnAttribute( 18828 "no-stack-arg-probe")) { 18829 MaybeAlign Align = 18830 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); 18831 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 18832 Chain = SP.getValue(1); 18833 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); 18834 if (Align) 18835 SP = 18836 DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), 18837 DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32)); 18838 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); 18839 SDValue Ops[2] = { SP, Chain }; 18840 return DAG.getMergeValues(Ops, DL); 18841 } 18842 18843 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, 18844 DAG.getConstant(2, DL, MVT::i32)); 18845 18846 SDValue Flag; 18847 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); 18848 Flag = Chain.getValue(1); 18849 18850 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); 18851 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); 18852 18853 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); 18854 Chain = NewSP.getValue(1); 18855 18856 SDValue Ops[2] = { NewSP, Chain }; 18857 return DAG.getMergeValues(Ops, DL); 18858 } 18859 18860 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { 18861 bool IsStrict = Op->isStrictFPOpcode(); 18862 SDValue SrcVal = Op.getOperand(IsStrict ? 
1 : 0); 18863 const unsigned DstSz = Op.getValueType().getSizeInBits(); 18864 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); 18865 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && 18866 "Unexpected type for custom-lowering FP_EXTEND"); 18867 18868 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 18869 "With both FP DP and 16, any FP conversion is legal!"); 18870 18871 assert(!(DstSz == 32 && Subtarget->hasFP16()) && 18872 "With FP16, 16 to 32 conversion is legal!"); 18873 18874 // Converting from 32 -> 64 is valid if we have FP64. 18875 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) { 18876 // FIXME: Remove this when we have strict fp instruction selection patterns 18877 if (IsStrict) { 18878 SDLoc Loc(Op); 18879 SDValue Result = DAG.getNode(ISD::FP_EXTEND, 18880 Loc, Op.getValueType(), SrcVal); 18881 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc); 18882 } 18883 return Op; 18884 } 18885 18886 // Either we are converting from 16 -> 64, without FP16 and/or 18887 // FP.double-precision or without Armv8-fp. So we must do it in two 18888 // steps. 18889 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32 18890 // without FP16. So we must do a function call. 18891 SDLoc Loc(Op); 18892 RTLIB::Libcall LC; 18893 MakeLibCallOptions CallOptions; 18894 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 18895 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) { 18896 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64()); 18897 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32); 18898 MVT DstVT = (Sz == 16 ? 
MVT::f32 : MVT::f64); 18899 if (Supported) { 18900 if (IsStrict) { 18901 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc, 18902 {DstVT, MVT::Other}, {Chain, SrcVal}); 18903 Chain = SrcVal.getValue(1); 18904 } else { 18905 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal); 18906 } 18907 } else { 18908 LC = RTLIB::getFPEXT(SrcVT, DstVT); 18909 assert(LC != RTLIB::UNKNOWN_LIBCALL && 18910 "Unexpected type for custom-lowering FP_EXTEND"); 18911 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 18912 Loc, Chain); 18913 } 18914 } 18915 18916 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal; 18917 } 18918 18919 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { 18920 bool IsStrict = Op->isStrictFPOpcode(); 18921 18922 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); 18923 EVT SrcVT = SrcVal.getValueType(); 18924 EVT DstVT = Op.getValueType(); 18925 const unsigned DstSz = Op.getValueType().getSizeInBits(); 18926 const unsigned SrcSz = SrcVT.getSizeInBits(); 18927 (void)DstSz; 18928 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 && 18929 "Unexpected type for custom-lowering FP_ROUND"); 18930 18931 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && 18932 "With both FP DP and 16, any FP conversion is legal!"); 18933 18934 SDLoc Loc(Op); 18935 18936 // Instruction from 32 -> 16 if hasFP16 is valid 18937 if (SrcSz == 32 && Subtarget->hasFP16()) 18938 return Op; 18939 18940 // Lib call from 32 -> 16 / 64 -> [32, 16] 18941 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); 18942 assert(LC != RTLIB::UNKNOWN_LIBCALL && 18943 "Unexpected type for custom-lowering FP_ROUND"); 18944 MakeLibCallOptions CallOptions; 18945 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); 18946 SDValue Result; 18947 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions, 18948 Loc, Chain); 18949 return IsStrict ? 
DAG.getMergeValues({Result, Chain}, Loc) : Result; 18950 } 18951 18952 void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, 18953 SelectionDAG &DAG) const { 18954 assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); 18955 MVT HalfT = MVT::i32; 18956 SDLoc dl(N); 18957 SDValue Hi, Lo, Tmp; 18958 18959 if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || 18960 !isOperationLegalOrCustom(ISD::UADDO, HalfT)) 18961 return ; 18962 18963 unsigned OpTypeBits = HalfT.getScalarSizeInBits(); 18964 SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); 18965 18966 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 18967 DAG.getConstant(0, dl, HalfT)); 18968 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), 18969 DAG.getConstant(1, dl, HalfT)); 18970 18971 Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, 18972 DAG.getConstant(OpTypeBits - 1, dl, 18973 getShiftAmountTy(HalfT, DAG.getDataLayout()))); 18974 Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); 18975 Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, 18976 SDValue(Lo.getNode(), 1)); 18977 Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); 18978 Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); 18979 18980 Results.push_back(Lo); 18981 Results.push_back(Hi); 18982 } 18983 18984 bool 18985 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { 18986 // The ARM target isn't yet aware of offsets. 18987 return false; 18988 } 18989 18990 bool ARM::isBitFieldInvertedMask(unsigned v) { 18991 if (v == 0xffffffff) 18992 return false; 18993 18994 // there can be 1's on either or both "outsides", all the "inside" 18995 // bits must be 0's 18996 return isShiftedMask_32(~v); 18997 } 18998 18999 /// isFPImmLegal - Returns true if the target can instruction select the 19000 /// specified FP immediate natively. If false, the legalizer will 19001 /// materialize the FP immediate as a load from a constant pool. 
19002 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, 19003 bool ForCodeSize) const { 19004 if (!Subtarget->hasVFP3Base()) 19005 return false; 19006 if (VT == MVT::f16 && Subtarget->hasFullFP16()) 19007 return ARM_AM::getFP16Imm(Imm) != -1; 19008 if (VT == MVT::f32 && Subtarget->hasFullFP16() && 19009 ARM_AM::getFP32FP16Imm(Imm) != -1) 19010 return true; 19011 if (VT == MVT::f32) 19012 return ARM_AM::getFP32Imm(Imm) != -1; 19013 if (VT == MVT::f64 && Subtarget->hasFP64()) 19014 return ARM_AM::getFP64Imm(Imm) != -1; 19015 return false; 19016 } 19017 19018 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as 19019 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment 19020 /// specified in the intrinsic calls. 19021 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, 19022 const CallInst &I, 19023 MachineFunction &MF, 19024 unsigned Intrinsic) const { 19025 switch (Intrinsic) { 19026 case Intrinsic::arm_neon_vld1: 19027 case Intrinsic::arm_neon_vld2: 19028 case Intrinsic::arm_neon_vld3: 19029 case Intrinsic::arm_neon_vld4: 19030 case Intrinsic::arm_neon_vld2lane: 19031 case Intrinsic::arm_neon_vld3lane: 19032 case Intrinsic::arm_neon_vld4lane: 19033 case Intrinsic::arm_neon_vld2dup: 19034 case Intrinsic::arm_neon_vld3dup: 19035 case Intrinsic::arm_neon_vld4dup: { 19036 Info.opc = ISD::INTRINSIC_W_CHAIN; 19037 // Conservatively set memVT to the entire set of vectors loaded. 
19038 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19039 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 19040 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 19041 Info.ptrVal = I.getArgOperand(0); 19042 Info.offset = 0; 19043 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 19044 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); 19045 // volatile loads with NEON intrinsics not supported 19046 Info.flags = MachineMemOperand::MOLoad; 19047 return true; 19048 } 19049 case Intrinsic::arm_neon_vld1x2: 19050 case Intrinsic::arm_neon_vld1x3: 19051 case Intrinsic::arm_neon_vld1x4: { 19052 Info.opc = ISD::INTRINSIC_W_CHAIN; 19053 // Conservatively set memVT to the entire set of vectors loaded. 19054 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19055 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; 19056 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 19057 Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); 19058 Info.offset = 0; 19059 Info.align.reset(); 19060 // volatile loads with NEON intrinsics not supported 19061 Info.flags = MachineMemOperand::MOLoad; 19062 return true; 19063 } 19064 case Intrinsic::arm_neon_vst1: 19065 case Intrinsic::arm_neon_vst2: 19066 case Intrinsic::arm_neon_vst3: 19067 case Intrinsic::arm_neon_vst4: 19068 case Intrinsic::arm_neon_vst2lane: 19069 case Intrinsic::arm_neon_vst3lane: 19070 case Intrinsic::arm_neon_vst4lane: { 19071 Info.opc = ISD::INTRINSIC_VOID; 19072 // Conservatively set memVT to the entire set of vectors stored. 
19073 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19074 unsigned NumElts = 0; 19075 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 19076 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 19077 if (!ArgTy->isVectorTy()) 19078 break; 19079 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 19080 } 19081 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 19082 Info.ptrVal = I.getArgOperand(0); 19083 Info.offset = 0; 19084 Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); 19085 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue(); 19086 // volatile stores with NEON intrinsics not supported 19087 Info.flags = MachineMemOperand::MOStore; 19088 return true; 19089 } 19090 case Intrinsic::arm_neon_vst1x2: 19091 case Intrinsic::arm_neon_vst1x3: 19092 case Intrinsic::arm_neon_vst1x4: { 19093 Info.opc = ISD::INTRINSIC_VOID; 19094 // Conservatively set memVT to the entire set of vectors stored. 19095 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19096 unsigned NumElts = 0; 19097 for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { 19098 Type *ArgTy = I.getArgOperand(ArgI)->getType(); 19099 if (!ArgTy->isVectorTy()) 19100 break; 19101 NumElts += DL.getTypeSizeInBits(ArgTy) / 64; 19102 } 19103 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); 19104 Info.ptrVal = I.getArgOperand(0); 19105 Info.offset = 0; 19106 Info.align.reset(); 19107 // volatile stores with NEON intrinsics not supported 19108 Info.flags = MachineMemOperand::MOStore; 19109 return true; 19110 } 19111 case Intrinsic::arm_mve_vld2q: 19112 case Intrinsic::arm_mve_vld4q: { 19113 Info.opc = ISD::INTRINSIC_W_CHAIN; 19114 // Conservatively set memVT to the entire set of vectors loaded. 19115 Type *VecTy = cast<StructType>(I.getType())->getElementType(1); 19116 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 
2 : 4; 19117 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); 19118 Info.ptrVal = I.getArgOperand(0); 19119 Info.offset = 0; 19120 Info.align = Align(VecTy->getScalarSizeInBits() / 8); 19121 // volatile loads with MVE intrinsics not supported 19122 Info.flags = MachineMemOperand::MOLoad; 19123 return true; 19124 } 19125 case Intrinsic::arm_mve_vst2q: 19126 case Intrinsic::arm_mve_vst4q: { 19127 Info.opc = ISD::INTRINSIC_VOID; 19128 // Conservatively set memVT to the entire set of vectors stored. 19129 Type *VecTy = I.getArgOperand(1)->getType(); 19130 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4; 19131 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2); 19132 Info.ptrVal = I.getArgOperand(0); 19133 Info.offset = 0; 19134 Info.align = Align(VecTy->getScalarSizeInBits() / 8); 19135 // volatile stores with MVE intrinsics not supported 19136 Info.flags = MachineMemOperand::MOStore; 19137 return true; 19138 } 19139 case Intrinsic::arm_mve_vldr_gather_base: 19140 case Intrinsic::arm_mve_vldr_gather_base_predicated: { 19141 Info.opc = ISD::INTRINSIC_W_CHAIN; 19142 Info.ptrVal = nullptr; 19143 Info.memVT = MVT::getVT(I.getType()); 19144 Info.align = Align(1); 19145 Info.flags |= MachineMemOperand::MOLoad; 19146 return true; 19147 } 19148 case Intrinsic::arm_mve_vldr_gather_base_wb: 19149 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: { 19150 Info.opc = ISD::INTRINSIC_W_CHAIN; 19151 Info.ptrVal = nullptr; 19152 Info.memVT = MVT::getVT(I.getType()->getContainedType(0)); 19153 Info.align = Align(1); 19154 Info.flags |= MachineMemOperand::MOLoad; 19155 return true; 19156 } 19157 case Intrinsic::arm_mve_vldr_gather_offset: 19158 case Intrinsic::arm_mve_vldr_gather_offset_predicated: { 19159 Info.opc = ISD::INTRINSIC_W_CHAIN; 19160 Info.ptrVal = nullptr; 19161 MVT DataVT = MVT::getVT(I.getType()); 19162 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue(); 19163 Info.memVT = 
MVT::getVectorVT(MVT::getIntegerVT(MemSize), 19164 DataVT.getVectorNumElements()); 19165 Info.align = Align(1); 19166 Info.flags |= MachineMemOperand::MOLoad; 19167 return true; 19168 } 19169 case Intrinsic::arm_mve_vstr_scatter_base: 19170 case Intrinsic::arm_mve_vstr_scatter_base_predicated: { 19171 Info.opc = ISD::INTRINSIC_VOID; 19172 Info.ptrVal = nullptr; 19173 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); 19174 Info.align = Align(1); 19175 Info.flags |= MachineMemOperand::MOStore; 19176 return true; 19177 } 19178 case Intrinsic::arm_mve_vstr_scatter_base_wb: 19179 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: { 19180 Info.opc = ISD::INTRINSIC_W_CHAIN; 19181 Info.ptrVal = nullptr; 19182 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType()); 19183 Info.align = Align(1); 19184 Info.flags |= MachineMemOperand::MOStore; 19185 return true; 19186 } 19187 case Intrinsic::arm_mve_vstr_scatter_offset: 19188 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: { 19189 Info.opc = ISD::INTRINSIC_VOID; 19190 Info.ptrVal = nullptr; 19191 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType()); 19192 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue(); 19193 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize), 19194 DataVT.getVectorNumElements()); 19195 Info.align = Align(1); 19196 Info.flags |= MachineMemOperand::MOStore; 19197 return true; 19198 } 19199 case Intrinsic::arm_ldaex: 19200 case Intrinsic::arm_ldrex: { 19201 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19202 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); 19203 Info.opc = ISD::INTRINSIC_W_CHAIN; 19204 Info.memVT = MVT::getVT(PtrTy->getElementType()); 19205 Info.ptrVal = I.getArgOperand(0); 19206 Info.offset = 0; 19207 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 19208 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 19209 return true; 19210 } 19211 case Intrinsic::arm_stlex: 19212 
case Intrinsic::arm_strex: { 19213 auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); 19214 PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); 19215 Info.opc = ISD::INTRINSIC_W_CHAIN; 19216 Info.memVT = MVT::getVT(PtrTy->getElementType()); 19217 Info.ptrVal = I.getArgOperand(1); 19218 Info.offset = 0; 19219 Info.align = DL.getABITypeAlign(PtrTy->getElementType()); 19220 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 19221 return true; 19222 } 19223 case Intrinsic::arm_stlexd: 19224 case Intrinsic::arm_strexd: 19225 Info.opc = ISD::INTRINSIC_W_CHAIN; 19226 Info.memVT = MVT::i64; 19227 Info.ptrVal = I.getArgOperand(2); 19228 Info.offset = 0; 19229 Info.align = Align(8); 19230 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; 19231 return true; 19232 19233 case Intrinsic::arm_ldaexd: 19234 case Intrinsic::arm_ldrexd: 19235 Info.opc = ISD::INTRINSIC_W_CHAIN; 19236 Info.memVT = MVT::i64; 19237 Info.ptrVal = I.getArgOperand(0); 19238 Info.offset = 0; 19239 Info.align = Align(8); 19240 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; 19241 return true; 19242 19243 default: 19244 break; 19245 } 19246 19247 return false; 19248 } 19249 19250 /// Returns true if it is beneficial to convert a load of a constant 19251 /// to just the constant itself. 
19252 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, 19253 Type *Ty) const { 19254 assert(Ty->isIntegerTy()); 19255 19256 unsigned Bits = Ty->getPrimitiveSizeInBits(); 19257 if (Bits == 0 || Bits > 32) 19258 return false; 19259 return true; 19260 } 19261 19262 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, 19263 unsigned Index) const { 19264 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) 19265 return false; 19266 19267 return (Index == 0 || Index == ResVT.getVectorNumElements()); 19268 } 19269 19270 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, 19271 ARM_MB::MemBOpt Domain) const { 19272 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 19273 19274 // First, if the target has no DMB, see what fallback we can use. 19275 if (!Subtarget->hasDataBarrier()) { 19276 // Some ARMv6 cpus can support data barriers with an mcr instruction. 19277 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get 19278 // here. 19279 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { 19280 Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); 19281 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), 19282 Builder.getInt32(0), Builder.getInt32(7), 19283 Builder.getInt32(10), Builder.getInt32(5)}; 19284 return Builder.CreateCall(MCR, args); 19285 } else { 19286 // Instead of using barriers, atomic accesses on these subtargets use 19287 // libcalls. 19288 llvm_unreachable("makeDMB on a target so old that it has no barriers"); 19289 } 19290 } else { 19291 Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); 19292 // Only a full system barrier exists in the M-class architectures. 19293 Domain = Subtarget->isMClass() ? 
ARM_MB::SY : Domain; 19294 Constant *CDomain = Builder.getInt32(Domain); 19295 return Builder.CreateCall(DMB, CDomain); 19296 } 19297 } 19298 19299 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html 19300 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, 19301 Instruction *Inst, 19302 AtomicOrdering Ord) const { 19303 switch (Ord) { 19304 case AtomicOrdering::NotAtomic: 19305 case AtomicOrdering::Unordered: 19306 llvm_unreachable("Invalid fence: unordered/non-atomic"); 19307 case AtomicOrdering::Monotonic: 19308 case AtomicOrdering::Acquire: 19309 return nullptr; // Nothing to do 19310 case AtomicOrdering::SequentiallyConsistent: 19311 if (!Inst->hasAtomicStore()) 19312 return nullptr; // Nothing to do 19313 LLVM_FALLTHROUGH; 19314 case AtomicOrdering::Release: 19315 case AtomicOrdering::AcquireRelease: 19316 if (Subtarget->preferISHSTBarriers()) 19317 return makeDMB(Builder, ARM_MB::ISHST); 19318 // FIXME: add a comment with a link to documentation justifying this. 
19319 else 19320 return makeDMB(Builder, ARM_MB::ISH); 19321 } 19322 llvm_unreachable("Unknown fence ordering in emitLeadingFence"); 19323 } 19324 19325 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, 19326 Instruction *Inst, 19327 AtomicOrdering Ord) const { 19328 switch (Ord) { 19329 case AtomicOrdering::NotAtomic: 19330 case AtomicOrdering::Unordered: 19331 llvm_unreachable("Invalid fence: unordered/not-atomic"); 19332 case AtomicOrdering::Monotonic: 19333 case AtomicOrdering::Release: 19334 return nullptr; // Nothing to do 19335 case AtomicOrdering::Acquire: 19336 case AtomicOrdering::AcquireRelease: 19337 case AtomicOrdering::SequentiallyConsistent: 19338 return makeDMB(Builder, ARM_MB::ISH); 19339 } 19340 llvm_unreachable("Unknown fence ordering in emitTrailingFence"); 19341 } 19342 19343 // Loads and stores less than 64-bits are already atomic; ones above that 19344 // are doomed anyway, so defer to the default libcall and blame the OS when 19345 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 19346 // anything for those. 19347 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { 19348 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); 19349 return (Size == 64) && !Subtarget->isMClass(); 19350 } 19351 19352 // Loads and stores less than 64-bits are already atomic; ones above that 19353 // are doomed anyway, so defer to the default libcall and blame the OS when 19354 // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit 19355 // anything for those. 19356 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. 
A15 has that 19357 // guarantee, see DDI0406C ARM architecture reference manual, 19358 // sections A8.8.72-74 LDRD) 19359 TargetLowering::AtomicExpansionKind 19360 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { 19361 unsigned Size = LI->getType()->getPrimitiveSizeInBits(); 19362 return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly 19363 : AtomicExpansionKind::None; 19364 } 19365 19366 // For the real atomic operations, we have ldrex/strex up to 32 bits, 19367 // and up to 64 bits on the non-M profiles 19368 TargetLowering::AtomicExpansionKind 19369 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { 19370 if (AI->isFloatingPointOperation()) 19371 return AtomicExpansionKind::CmpXChg; 19372 19373 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 19374 // implement atomicrmw without spilling. If the target address is also on the 19375 // stack and close enough to the spill slot, this can lead to a situation 19376 // where the monitor always gets cleared and the atomic operation can never 19377 // succeed. So at -O0 lower this operation to a CAS loop. 19378 if (getTargetMachine().getOptLevel() == CodeGenOpt::None) 19379 return AtomicExpansionKind::CmpXChg; 19380 19381 unsigned Size = AI->getType()->getPrimitiveSizeInBits(); 19382 bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); 19383 return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) 19384 ? AtomicExpansionKind::LLSC 19385 : AtomicExpansionKind::None; 19386 } 19387 19388 // Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32 19389 // bits, and up to 64 bits on the non-M profiles. 19390 TargetLowering::AtomicExpansionKind 19391 ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { 19392 // At -O0, fast-regalloc cannot cope with the live vregs necessary to 19393 // implement cmpxchg without spilling. 
If the address being exchanged is also 19394 // on the stack and close enough to the spill slot, this can lead to a 19395 // situation where the monitor always gets cleared and the atomic operation 19396 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 19397 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits(); 19398 bool HasAtomicCmpXchg = 19399 !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); 19400 if (getTargetMachine().getOptLevel() != 0 && HasAtomicCmpXchg && 19401 Size <= (Subtarget->isMClass() ? 32U : 64U)) 19402 return AtomicExpansionKind::LLSC; 19403 return AtomicExpansionKind::None; 19404 } 19405 19406 bool ARMTargetLowering::shouldInsertFencesForAtomic( 19407 const Instruction *I) const { 19408 return InsertFencesForAtomic; 19409 } 19410 19411 // This has so far only been implemented for MachO. 19412 bool ARMTargetLowering::useLoadStackGuardNode() const { 19413 return Subtarget->isTargetMachO(); 19414 } 19415 19416 void ARMTargetLowering::insertSSPDeclarations(Module &M) const { 19417 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 19418 return TargetLowering::insertSSPDeclarations(M); 19419 19420 // MSVC CRT has a global variable holding security cookie. 19421 M.getOrInsertGlobal("__security_cookie", 19422 Type::getInt8PtrTy(M.getContext())); 19423 19424 // MSVC CRT has a function to validate security cookie. 19425 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( 19426 "__security_check_cookie", Type::getVoidTy(M.getContext()), 19427 Type::getInt8PtrTy(M.getContext())); 19428 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) 19429 F->addAttribute(1, Attribute::AttrKind::InReg); 19430 } 19431 19432 Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const { 19433 // MSVC CRT has a global variable holding security cookie. 
19434 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 19435 return M.getGlobalVariable("__security_cookie"); 19436 return TargetLowering::getSDagStackGuard(M); 19437 } 19438 19439 Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { 19440 // MSVC CRT has a function to validate security cookie. 19441 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) 19442 return M.getFunction("__security_check_cookie"); 19443 return TargetLowering::getSSPStackGuardCheck(M); 19444 } 19445 19446 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, 19447 unsigned &Cost) const { 19448 // If we do not have NEON, vector types are not natively supported. 19449 if (!Subtarget->hasNEON()) 19450 return false; 19451 19452 // Floating point values and vector values map to the same register file. 19453 // Therefore, although we could do a store extract of a vector type, this is 19454 // better to leave at float as we have more freedom in the addressing mode for 19455 // those. 19456 if (VectorTy->isFPOrFPVectorTy()) 19457 return false; 19458 19459 // If the index is unknown at compile time, this is very expensive to lower 19460 // and it is not possible to combine the store with the extract. 19461 if (!isa<ConstantInt>(Idx)) 19462 return false; 19463 19464 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); 19465 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize(); 19466 // We can do a store + vector extract on any vector that fits perfectly in a D 19467 // or Q register. 
19468 if (BitWidth == 64 || BitWidth == 128) { 19469 Cost = 0; 19470 return true; 19471 } 19472 return false; 19473 } 19474 19475 bool ARMTargetLowering::isCheapToSpeculateCttz() const { 19476 return Subtarget->hasV6T2Ops(); 19477 } 19478 19479 bool ARMTargetLowering::isCheapToSpeculateCtlz() const { 19480 return Subtarget->hasV6T2Ops(); 19481 } 19482 19483 bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { 19484 return !Subtarget->hasMinSize() || Subtarget->isTargetWindows(); 19485 } 19486 19487 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, 19488 AtomicOrdering Ord) const { 19489 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 19490 Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); 19491 bool IsAcquire = isAcquireOrStronger(Ord); 19492 19493 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd 19494 // intrinsic must return {i32, i32} and we have to recombine them into a 19495 // single i64 here. 19496 if (ValTy->getPrimitiveSizeInBits() == 64) { 19497 Intrinsic::ID Int = 19498 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; 19499 Function *Ldrex = Intrinsic::getDeclaration(M, Int); 19500 19501 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 19502 Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); 19503 19504 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); 19505 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); 19506 if (!Subtarget->isLittle()) 19507 std::swap (Lo, Hi); 19508 Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); 19509 Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); 19510 return Builder.CreateOr( 19511 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); 19512 } 19513 19514 Type *Tys[] = { Addr->getType() }; 19515 Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; 19516 Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); 19517 19518 return Builder.CreateTruncOrBitCast( 19519 Builder.CreateCall(Ldrex, Addr), 19520 cast<PointerType>(Addr->getType())->getElementType()); 19521 } 19522 19523 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( 19524 IRBuilder<> &Builder) const { 19525 if (!Subtarget->hasV7Ops()) 19526 return; 19527 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 19528 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); 19529 } 19530 19531 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, 19532 Value *Addr, 19533 AtomicOrdering Ord) const { 19534 Module *M = Builder.GetInsertBlock()->getParent()->getParent(); 19535 bool IsRelease = isReleaseOrStronger(Ord); 19536 19537 // Since the intrinsics must have legal type, the i64 intrinsics take two 19538 // parameters: "i32, i32". We must marshal Val into the appropriate form 19539 // before the call. 19540 if (Val->getType()->getPrimitiveSizeInBits() == 64) { 19541 Intrinsic::ID Int = 19542 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; 19543 Function *Strex = Intrinsic::getDeclaration(M, Int); 19544 Type *Int32Ty = Type::getInt32Ty(M->getContext()); 19545 19546 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); 19547 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); 19548 if (!Subtarget->isLittle()) 19549 std::swap(Lo, Hi); 19550 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); 19551 return Builder.CreateCall(Strex, {Lo, Hi, Addr}); 19552 } 19553 19554 Intrinsic::ID Int = IsRelease ? 
Intrinsic::arm_stlex : Intrinsic::arm_strex; 19555 Type *Tys[] = { Addr->getType() }; 19556 Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); 19557 19558 return Builder.CreateCall( 19559 Strex, {Builder.CreateZExtOrBitCast( 19560 Val, Strex->getFunctionType()->getParamType(0)), 19561 Addr}); 19562 } 19563 19564 19565 bool ARMTargetLowering::alignLoopsWithOptSize() const { 19566 return Subtarget->isMClass(); 19567 } 19568 19569 /// A helper function for determining the number of interleaved accesses we 19570 /// will generate when lowering accesses of the given type. 19571 unsigned 19572 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, 19573 const DataLayout &DL) const { 19574 return (DL.getTypeSizeInBits(VecTy) + 127) / 128; 19575 } 19576 19577 bool ARMTargetLowering::isLegalInterleavedAccessType( 19578 unsigned Factor, FixedVectorType *VecTy, Align Alignment, 19579 const DataLayout &DL) const { 19580 19581 unsigned VecSize = DL.getTypeSizeInBits(VecTy); 19582 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); 19583 19584 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps()) 19585 return false; 19586 19587 // Ensure the vector doesn't have f16 elements. Even though we could do an 19588 // i16 vldN, we can't hold the f16 vectors and will end up converting via 19589 // f32. 19590 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy()) 19591 return false; 19592 if (Subtarget->hasMVEIntegerOps() && Factor == 3) 19593 return false; 19594 19595 // Ensure the number of vector elements is greater than 1. 19596 if (VecTy->getNumElements() < 2) 19597 return false; 19598 19599 // Ensure the element type is legal. 19600 if (ElSize != 8 && ElSize != 16 && ElSize != 32) 19601 return false; 19602 // And the alignment if high enough under MVE. 19603 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8) 19604 return false; 19605 19606 // Ensure the total vector size is 64 or a multiple of 128. 
Types larger than 19607 // 128 will be split into multiple interleaved accesses. 19608 if (Subtarget->hasNEON() && VecSize == 64) 19609 return true; 19610 return VecSize % 128 == 0; 19611 } 19612 19613 unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { 19614 if (Subtarget->hasNEON()) 19615 return 4; 19616 if (Subtarget->hasMVEIntegerOps()) 19617 return MVEMaxSupportedInterleaveFactor; 19618 return TargetLoweringBase::getMaxSupportedInterleaveFactor(); 19619 } 19620 19621 /// Lower an interleaved load into a vldN intrinsic. 19622 /// 19623 /// E.g. Lower an interleaved load (Factor = 2): 19624 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 19625 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements 19626 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements 19627 /// 19628 /// Into: 19629 /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) 19630 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 19631 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 19632 bool ARMTargetLowering::lowerInterleavedLoad( 19633 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, 19634 ArrayRef<unsigned> Indices, unsigned Factor) const { 19635 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 19636 "Invalid interleave factor"); 19637 assert(!Shuffles.empty() && "Empty shufflevector input"); 19638 assert(Shuffles.size() == Indices.size() && 19639 "Unmatched number of shufflevectors and indices"); 19640 19641 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); 19642 Type *EltTy = VecTy->getElementType(); 19643 19644 const DataLayout &DL = LI->getModule()->getDataLayout(); 19645 Align Alignment = LI->getAlign(); 19646 19647 // Skip if we do not have NEON and skip illegal vector types. We can 19648 // "legalize" wide vector types into multiple interleaved accesses as long as 19649 // the vector types are divisible by 128. 
19650 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL)) 19651 return false; 19652 19653 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); 19654 19655 // A pointer vector can not be the return type of the ldN intrinsics. Need to 19656 // load integer vectors first and then convert to pointer vectors. 19657 if (EltTy->isPointerTy()) 19658 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy); 19659 19660 IRBuilder<> Builder(LI); 19661 19662 // The base address of the load. 19663 Value *BaseAddr = LI->getPointerOperand(); 19664 19665 if (NumLoads > 1) { 19666 // If we're going to generate more than one load, reset the sub-vector type 19667 // to something legal. 19668 VecTy = FixedVectorType::get(VecTy->getElementType(), 19669 VecTy->getNumElements() / NumLoads); 19670 19671 // We will compute the pointer operand of each load from the original base 19672 // address using GEPs. Cast the base address to a pointer to the scalar 19673 // element type. 19674 BaseAddr = Builder.CreateBitCast( 19675 BaseAddr, 19676 VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())); 19677 } 19678 19679 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); 19680 19681 auto createLoadIntrinsic = [&](Value *BaseAddr) { 19682 if (Subtarget->hasNEON()) { 19683 Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); 19684 Type *Tys[] = {VecTy, Int8Ptr}; 19685 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, 19686 Intrinsic::arm_neon_vld3, 19687 Intrinsic::arm_neon_vld4}; 19688 Function *VldnFunc = 19689 Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); 19690 19691 SmallVector<Value *, 2> Ops; 19692 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 19693 Ops.push_back(Builder.getInt32(LI->getAlignment())); 19694 19695 return Builder.CreateCall(VldnFunc, Ops, "vldN"); 19696 } else { 19697 assert((Factor == 2 || Factor == 4) && 19698 "expected interleave factor of 2 or 4 for 
MVE"); 19699 Intrinsic::ID LoadInts = 19700 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q; 19701 Type *VecEltTy = 19702 VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()); 19703 Type *Tys[] = {VecTy, VecEltTy}; 19704 Function *VldnFunc = 19705 Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); 19706 19707 SmallVector<Value *, 2> Ops; 19708 Ops.push_back(Builder.CreateBitCast(BaseAddr, VecEltTy)); 19709 return Builder.CreateCall(VldnFunc, Ops, "vldN"); 19710 } 19711 }; 19712 19713 // Holds sub-vectors extracted from the load intrinsic return values. The 19714 // sub-vectors are associated with the shufflevector instructions they will 19715 // replace. 19716 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs; 19717 19718 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { 19719 // If we're generating more than one load, compute the base address of 19720 // subsequent loads as an offset from the previous. 19721 if (LoadCount > 0) 19722 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr, 19723 VecTy->getNumElements() * Factor); 19724 19725 CallInst *VldN = createLoadIntrinsic(BaseAddr); 19726 19727 // Replace uses of each shufflevector with the corresponding vector loaded 19728 // by ldN. 19729 for (unsigned i = 0; i < Shuffles.size(); i++) { 19730 ShuffleVectorInst *SV = Shuffles[i]; 19731 unsigned Index = Indices[i]; 19732 19733 Value *SubVec = Builder.CreateExtractValue(VldN, Index); 19734 19735 // Convert the integer vector to pointer vector if the element is pointer. 19736 if (EltTy->isPointerTy()) 19737 SubVec = Builder.CreateIntToPtr( 19738 SubVec, 19739 FixedVectorType::get(SV->getType()->getElementType(), VecTy)); 19740 19741 SubVecs[SV].push_back(SubVec); 19742 } 19743 } 19744 19745 // Replace uses of the shufflevector instructions with the sub-vectors 19746 // returned by the load intrinsic. 
If a shufflevector instruction is 19747 // associated with more than one sub-vector, those sub-vectors will be 19748 // concatenated into a single wide vector. 19749 for (ShuffleVectorInst *SVI : Shuffles) { 19750 auto &SubVec = SubVecs[SVI]; 19751 auto *WideVec = 19752 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; 19753 SVI->replaceAllUsesWith(WideVec); 19754 } 19755 19756 return true; 19757 } 19758 19759 /// Lower an interleaved store into a vstN intrinsic. 19760 /// 19761 /// E.g. Lower an interleaved store (Factor = 3): 19762 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, 19763 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> 19764 /// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 19765 /// 19766 /// Into: 19767 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> 19768 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> 19769 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> 19770 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 19771 /// 19772 /// Note that the new shufflevectors will be removed and we'll only generate one 19773 /// vst3 instruction in CodeGen. 19774 /// 19775 /// Example for a more general valid mask (Factor 3). 
Lower: 19776 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, 19777 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> 19778 /// store <12 x i32> %i.vec, <12 x i32>* %ptr 19779 /// 19780 /// Into: 19781 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> 19782 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> 19783 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> 19784 /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) 19785 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, 19786 ShuffleVectorInst *SVI, 19787 unsigned Factor) const { 19788 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && 19789 "Invalid interleave factor"); 19790 19791 auto *VecTy = cast<FixedVectorType>(SVI->getType()); 19792 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); 19793 19794 unsigned LaneLen = VecTy->getNumElements() / Factor; 19795 Type *EltTy = VecTy->getElementType(); 19796 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); 19797 19798 const DataLayout &DL = SI->getModule()->getDataLayout(); 19799 Align Alignment = SI->getAlign(); 19800 19801 // Skip if we do not have NEON and skip illegal vector types. We can 19802 // "legalize" wide vector types into multiple interleaved accesses as long as 19803 // the vector types are divisible by 128. 19804 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL)) 19805 return false; 19806 19807 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); 19808 19809 Value *Op0 = SVI->getOperand(0); 19810 Value *Op1 = SVI->getOperand(1); 19811 IRBuilder<> Builder(SI); 19812 19813 // StN intrinsics don't support pointer vectors as arguments. Convert pointer 19814 // vectors to integer vectors. 19815 if (EltTy->isPointerTy()) { 19816 Type *IntTy = DL.getIntPtrType(EltTy); 19817 19818 // Convert to the corresponding integer vector. 
19819 auto *IntVecTy = 19820 FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType())); 19821 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); 19822 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); 19823 19824 SubVecTy = FixedVectorType::get(IntTy, LaneLen); 19825 } 19826 19827 // The base address of the store. 19828 Value *BaseAddr = SI->getPointerOperand(); 19829 19830 if (NumStores > 1) { 19831 // If we're going to generate more than one store, reset the lane length 19832 // and sub-vector type to something legal. 19833 LaneLen /= NumStores; 19834 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); 19835 19836 // We will compute the pointer operand of each store from the original base 19837 // address using GEPs. Cast the base address to a pointer to the scalar 19838 // element type. 19839 BaseAddr = Builder.CreateBitCast( 19840 BaseAddr, 19841 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())); 19842 } 19843 19844 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); 19845 19846 auto Mask = SVI->getShuffleMask(); 19847 19848 auto createStoreIntrinsic = [&](Value *BaseAddr, 19849 SmallVectorImpl<Value *> &Shuffles) { 19850 if (Subtarget->hasNEON()) { 19851 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, 19852 Intrinsic::arm_neon_vst3, 19853 Intrinsic::arm_neon_vst4}; 19854 Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); 19855 Type *Tys[] = {Int8Ptr, SubVecTy}; 19856 19857 Function *VstNFunc = Intrinsic::getDeclaration( 19858 SI->getModule(), StoreInts[Factor - 2], Tys); 19859 19860 SmallVector<Value *, 6> Ops; 19861 Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); 19862 append_range(Ops, Shuffles); 19863 Ops.push_back(Builder.getInt32(SI->getAlignment())); 19864 Builder.CreateCall(VstNFunc, Ops); 19865 } else { 19866 assert((Factor == 2 || Factor == 4) && 19867 "expected interleave factor of 2 or 4 for MVE"); 19868 Intrinsic::ID StoreInts = 19869 Factor 
== 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q; 19870 Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo( 19871 SI->getPointerAddressSpace()); 19872 Type *Tys[] = {EltPtrTy, SubVecTy}; 19873 Function *VstNFunc = 19874 Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys); 19875 19876 SmallVector<Value *, 6> Ops; 19877 Ops.push_back(Builder.CreateBitCast(BaseAddr, EltPtrTy)); 19878 append_range(Ops, Shuffles); 19879 for (unsigned F = 0; F < Factor; F++) { 19880 Ops.push_back(Builder.getInt32(F)); 19881 Builder.CreateCall(VstNFunc, Ops); 19882 Ops.pop_back(); 19883 } 19884 } 19885 }; 19886 19887 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { 19888 // If we generating more than one store, we compute the base address of 19889 // subsequent stores as an offset from the previous. 19890 if (StoreCount > 0) 19891 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(), 19892 BaseAddr, LaneLen * Factor); 19893 19894 SmallVector<Value *, 4> Shuffles; 19895 19896 // Split the shufflevector operands into sub vectors for the new vstN call. 19897 for (unsigned i = 0; i < Factor; i++) { 19898 unsigned IdxI = StoreCount * LaneLen * Factor + i; 19899 if (Mask[IdxI] >= 0) { 19900 Shuffles.push_back(Builder.CreateShuffleVector( 19901 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0))); 19902 } else { 19903 unsigned StartMask = 0; 19904 for (unsigned j = 1; j < LaneLen; j++) { 19905 unsigned IdxJ = StoreCount * LaneLen * Factor + j; 19906 if (Mask[IdxJ * Factor + IdxI] >= 0) { 19907 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; 19908 break; 19909 } 19910 } 19911 // Note: If all elements in a chunk are undefs, StartMask=0! 19912 // Note: Filling undef gaps with random elements is ok, since 19913 // those elements were being written anyway (with undefs). 
19914 // In the case of all undefs we're defaulting to using elems from 0 19915 // Note: StartMask cannot be negative, it's checked in 19916 // isReInterleaveMask 19917 Shuffles.push_back(Builder.CreateShuffleVector( 19918 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0))); 19919 } 19920 } 19921 19922 createStoreIntrinsic(BaseAddr, Shuffles); 19923 } 19924 return true; 19925 } 19926 19927 enum HABaseType { 19928 HA_UNKNOWN = 0, 19929 HA_FLOAT, 19930 HA_DOUBLE, 19931 HA_VECT64, 19932 HA_VECT128 19933 }; 19934 19935 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, 19936 uint64_t &Members) { 19937 if (auto *ST = dyn_cast<StructType>(Ty)) { 19938 for (unsigned i = 0; i < ST->getNumElements(); ++i) { 19939 uint64_t SubMembers = 0; 19940 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) 19941 return false; 19942 Members += SubMembers; 19943 } 19944 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { 19945 uint64_t SubMembers = 0; 19946 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) 19947 return false; 19948 Members += SubMembers * AT->getNumElements(); 19949 } else if (Ty->isFloatTy()) { 19950 if (Base != HA_UNKNOWN && Base != HA_FLOAT) 19951 return false; 19952 Members = 1; 19953 Base = HA_FLOAT; 19954 } else if (Ty->isDoubleTy()) { 19955 if (Base != HA_UNKNOWN && Base != HA_DOUBLE) 19956 return false; 19957 Members = 1; 19958 Base = HA_DOUBLE; 19959 } else if (auto *VT = dyn_cast<VectorType>(Ty)) { 19960 Members = 1; 19961 switch (Base) { 19962 case HA_FLOAT: 19963 case HA_DOUBLE: 19964 return false; 19965 case HA_VECT64: 19966 return VT->getPrimitiveSizeInBits().getFixedSize() == 64; 19967 case HA_VECT128: 19968 return VT->getPrimitiveSizeInBits().getFixedSize() == 128; 19969 case HA_UNKNOWN: 19970 switch (VT->getPrimitiveSizeInBits().getFixedSize()) { 19971 case 64: 19972 Base = HA_VECT64; 19973 return true; 19974 case 128: 19975 Base = HA_VECT128; 19976 return true; 19977 default: 19978 return false; 19979 } 
19980 } 19981 } 19982 19983 return (Members > 0 && Members <= 4); 19984 } 19985 19986 /// Return the correct alignment for the current calling convention. 19987 Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, 19988 DataLayout DL) const { 19989 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy); 19990 if (!ArgTy->isVectorTy()) 19991 return ABITypeAlign; 19992 19993 // Avoid over-aligning vector parameters. It would require realigning the 19994 // stack and waste space for no real benefit. 19995 return std::min(ABITypeAlign, DL.getStackAlignment()); 19996 } 19997 19998 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of 19999 /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when 20000 /// passing according to AAPCS rules. 20001 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( 20002 Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { 20003 if (getEffectiveCallingConv(CallConv, isVarArg) != 20004 CallingConv::ARM_AAPCS_VFP) 20005 return false; 20006 20007 HABaseType Base = HA_UNKNOWN; 20008 uint64_t Members = 0; 20009 bool IsHA = isHomogeneousAggregate(Ty, Base, Members); 20010 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); 20011 20012 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); 20013 return IsHA || IsIntArray; 20014 } 20015 20016 Register ARMTargetLowering::getExceptionPointerRegister( 20017 const Constant *PersonalityFn) const { 20018 // Platforms which do not use SjLj EH may return values in these registers 20019 // via the personality function. 20020 return Subtarget->useSjLjEH() ? Register() : ARM::R0; 20021 } 20022 20023 Register ARMTargetLowering::getExceptionSelectorRegister( 20024 const Constant *PersonalityFn) const { 20025 // Platforms which do not use SjLj EH may return values in these registers 20026 // via the personality function. 20027 return Subtarget->useSjLjEH() ? 
Register() : ARM::R1; 20028 } 20029 20030 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { 20031 // Update IsSplitCSR in ARMFunctionInfo. 20032 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); 20033 AFI->setIsSplitCSR(true); 20034 } 20035 20036 void ARMTargetLowering::insertCopiesSplitCSR( 20037 MachineBasicBlock *Entry, 20038 const SmallVectorImpl<MachineBasicBlock *> &Exits) const { 20039 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); 20040 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); 20041 if (!IStart) 20042 return; 20043 20044 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); 20045 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); 20046 MachineBasicBlock::iterator MBBI = Entry->begin(); 20047 for (const MCPhysReg *I = IStart; *I; ++I) { 20048 const TargetRegisterClass *RC = nullptr; 20049 if (ARM::GPRRegClass.contains(*I)) 20050 RC = &ARM::GPRRegClass; 20051 else if (ARM::DPRRegClass.contains(*I)) 20052 RC = &ARM::DPRRegClass; 20053 else 20054 llvm_unreachable("Unexpected register class in CSRsViaCopy!"); 20055 20056 Register NewVR = MRI->createVirtualRegister(RC); 20057 // Create copy from CSR to a virtual register. 20058 // FIXME: this currently does not emit CFI pseudo-instructions, it works 20059 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be 20060 // nounwind. If we want to generalize this later, we may need to emit 20061 // CFI pseudo-instructions. 20062 assert(Entry->getParent()->getFunction().hasFnAttribute( 20063 Attribute::NoUnwind) && 20064 "Function should be nounwind in insertCopiesSplitCSR!"); 20065 Entry->addLiveIn(*I); 20066 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) 20067 .addReg(*I); 20068 20069 // Insert the copy-back instructions right before the terminator. 
20070 for (auto *Exit : Exits) 20071 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), 20072 TII->get(TargetOpcode::COPY), *I) 20073 .addReg(NewVR); 20074 } 20075 } 20076 20077 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const { 20078 MF.getFrameInfo().computeMaxCallFrameSize(MF); 20079 TargetLoweringBase::finalizeLowering(MF); 20080 } 20081