1 //===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains the AArch64 implementation of the TargetInstrInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "AArch64InstrInfo.h" 14 #include "AArch64MachineFunctionInfo.h" 15 #include "AArch64Subtarget.h" 16 #include "MCTargetDesc/AArch64AddressingModes.h" 17 #include "Utils/AArch64BaseInfo.h" 18 #include "llvm/ADT/ArrayRef.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/CodeGen/MachineBasicBlock.h" 22 #include "llvm/CodeGen/MachineFrameInfo.h" 23 #include "llvm/CodeGen/MachineFunction.h" 24 #include "llvm/CodeGen/MachineInstr.h" 25 #include "llvm/CodeGen/MachineInstrBuilder.h" 26 #include "llvm/CodeGen/MachineMemOperand.h" 27 #include "llvm/CodeGen/MachineModuleInfo.h" 28 #include "llvm/CodeGen/MachineOperand.h" 29 #include "llvm/CodeGen/MachineRegisterInfo.h" 30 #include "llvm/CodeGen/StackMaps.h" 31 #include "llvm/CodeGen/TargetRegisterInfo.h" 32 #include "llvm/CodeGen/TargetSubtargetInfo.h" 33 #include "llvm/IR/DebugInfoMetadata.h" 34 #include "llvm/IR/DebugLoc.h" 35 #include "llvm/IR/GlobalValue.h" 36 #include "llvm/MC/MCAsmInfo.h" 37 #include "llvm/MC/MCInst.h" 38 #include "llvm/MC/MCInstBuilder.h" 39 #include "llvm/MC/MCInstrDesc.h" 40 #include "llvm/Support/Casting.h" 41 #include "llvm/Support/CodeGen.h" 42 #include "llvm/Support/CommandLine.h" 43 #include "llvm/Support/Compiler.h" 44 #include "llvm/Support/ErrorHandling.h" 45 #include "llvm/Support/MathExtras.h" 46 #include "llvm/Target/TargetMachine.h" 47 #include "llvm/Target/TargetOptions.h" 48 
#include <cassert> 49 #include <cstdint> 50 #include <iterator> 51 #include <utility> 52 53 using namespace llvm; 54 55 #define GET_INSTRINFO_CTOR_DTOR 56 #include "AArch64GenInstrInfo.inc" 57 58 static cl::opt<unsigned> TBZDisplacementBits( 59 "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), 60 cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); 61 62 static cl::opt<unsigned> CBZDisplacementBits( 63 "aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), 64 cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); 65 66 static cl::opt<unsigned> 67 BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), 68 cl::desc("Restrict range of Bcc instructions (DEBUG)")); 69 70 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) 71 : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, 72 AArch64::CATCHRET), 73 RI(STI.getTargetTriple()), Subtarget(STI) {} 74 75 /// GetInstSize - Return the number of bytes of code the specified 76 /// instruction may be. This returns the maximum number of bytes. 77 unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { 78 const MachineBasicBlock &MBB = *MI.getParent(); 79 const MachineFunction *MF = MBB.getParent(); 80 const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); 81 82 { 83 auto Op = MI.getOpcode(); 84 if (Op == AArch64::INLINEASM || Op == AArch64::INLINEASM_BR) 85 return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); 86 } 87 88 // Meta-instructions emit no code. 89 if (MI.isMetaInstruction()) 90 return 0; 91 92 // FIXME: We currently only handle pseudoinstructions that don't get expanded 93 // before the assembly printer. 94 unsigned NumBytes = 0; 95 const MCInstrDesc &Desc = MI.getDesc(); 96 switch (Desc.getOpcode()) { 97 default: 98 // Anything not explicitly designated otherwise is a normal 4-byte insn. 
99 NumBytes = 4; 100 break; 101 case TargetOpcode::STACKMAP: 102 // The upper bound for a stackmap intrinsic is the full length of its shadow 103 NumBytes = StackMapOpers(&MI).getNumPatchBytes(); 104 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 105 break; 106 case TargetOpcode::PATCHPOINT: 107 // The size of the patchpoint intrinsic is the number of bytes requested 108 NumBytes = PatchPointOpers(&MI).getNumPatchBytes(); 109 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 110 break; 111 case TargetOpcode::STATEPOINT: 112 NumBytes = StatepointOpers(&MI).getNumPatchBytes(); 113 assert(NumBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); 114 // No patch bytes means a normal call inst is emitted 115 if (NumBytes == 0) 116 NumBytes = 4; 117 break; 118 case AArch64::TLSDESC_CALLSEQ: 119 // This gets lowered to an instruction sequence which takes 16 bytes 120 NumBytes = 16; 121 break; 122 case AArch64::SpeculationBarrierISBDSBEndBB: 123 // This gets lowered to 2 4-byte instructions. 124 NumBytes = 8; 125 break; 126 case AArch64::SpeculationBarrierSBEndBB: 127 // This gets lowered to 1 4-byte instructions. 
128 NumBytes = 4; 129 break; 130 case AArch64::JumpTableDest32: 131 case AArch64::JumpTableDest16: 132 case AArch64::JumpTableDest8: 133 NumBytes = 12; 134 break; 135 case AArch64::SPACE: 136 NumBytes = MI.getOperand(1).getImm(); 137 break; 138 case AArch64::StoreSwiftAsyncContext: 139 NumBytes = 20; 140 break; 141 case TargetOpcode::BUNDLE: 142 NumBytes = getInstBundleLength(MI); 143 break; 144 } 145 146 return NumBytes; 147 } 148 149 unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const { 150 unsigned Size = 0; 151 MachineBasicBlock::const_instr_iterator I = MI.getIterator(); 152 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end(); 153 while (++I != E && I->isInsideBundle()) { 154 assert(!I->isBundle() && "No nested bundle!"); 155 Size += getInstSizeInBytes(*I); 156 } 157 return Size; 158 } 159 160 static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, 161 SmallVectorImpl<MachineOperand> &Cond) { 162 // Block ends with fall-through condbranch. 
163 switch (LastInst->getOpcode()) { 164 default: 165 llvm_unreachable("Unknown branch instruction?"); 166 case AArch64::Bcc: 167 Target = LastInst->getOperand(1).getMBB(); 168 Cond.push_back(LastInst->getOperand(0)); 169 break; 170 case AArch64::CBZW: 171 case AArch64::CBZX: 172 case AArch64::CBNZW: 173 case AArch64::CBNZX: 174 Target = LastInst->getOperand(1).getMBB(); 175 Cond.push_back(MachineOperand::CreateImm(-1)); 176 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 177 Cond.push_back(LastInst->getOperand(0)); 178 break; 179 case AArch64::TBZW: 180 case AArch64::TBZX: 181 case AArch64::TBNZW: 182 case AArch64::TBNZX: 183 Target = LastInst->getOperand(2).getMBB(); 184 Cond.push_back(MachineOperand::CreateImm(-1)); 185 Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); 186 Cond.push_back(LastInst->getOperand(0)); 187 Cond.push_back(LastInst->getOperand(1)); 188 } 189 } 190 191 static unsigned getBranchDisplacementBits(unsigned Opc) { 192 switch (Opc) { 193 default: 194 llvm_unreachable("unexpected opcode!"); 195 case AArch64::B: 196 return 64; 197 case AArch64::TBNZW: 198 case AArch64::TBZW: 199 case AArch64::TBNZX: 200 case AArch64::TBZX: 201 return TBZDisplacementBits; 202 case AArch64::CBNZW: 203 case AArch64::CBZW: 204 case AArch64::CBNZX: 205 case AArch64::CBZX: 206 return CBZDisplacementBits; 207 case AArch64::Bcc: 208 return BCCDisplacementBits; 209 } 210 } 211 212 bool AArch64InstrInfo::isBranchOffsetInRange(unsigned BranchOp, 213 int64_t BrOffset) const { 214 unsigned Bits = getBranchDisplacementBits(BranchOp); 215 assert(Bits >= 3 && "max branch displacement must be enough to jump" 216 "over conditional branch expansion"); 217 return isIntN(Bits, BrOffset / 4); 218 } 219 220 MachineBasicBlock * 221 AArch64InstrInfo::getBranchDestBlock(const MachineInstr &MI) const { 222 switch (MI.getOpcode()) { 223 default: 224 llvm_unreachable("unexpected opcode!"); 225 case AArch64::B: 226 return MI.getOperand(0).getMBB(); 227 case 
AArch64::TBZW: 228 case AArch64::TBNZW: 229 case AArch64::TBZX: 230 case AArch64::TBNZX: 231 return MI.getOperand(2).getMBB(); 232 case AArch64::CBZW: 233 case AArch64::CBNZW: 234 case AArch64::CBZX: 235 case AArch64::CBNZX: 236 case AArch64::Bcc: 237 return MI.getOperand(1).getMBB(); 238 } 239 } 240 241 // Branch analysis. 242 bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, 243 MachineBasicBlock *&TBB, 244 MachineBasicBlock *&FBB, 245 SmallVectorImpl<MachineOperand> &Cond, 246 bool AllowModify) const { 247 // If the block has no terminators, it just falls into the block after it. 248 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 249 if (I == MBB.end()) 250 return false; 251 252 // Skip over SpeculationBarrierEndBB terminators 253 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 254 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 255 --I; 256 } 257 258 if (!isUnpredicatedTerminator(*I)) 259 return false; 260 261 // Get the last instruction in the block. 262 MachineInstr *LastInst = &*I; 263 264 // If there is only one terminator instruction, process it. 265 unsigned LastOpc = LastInst->getOpcode(); 266 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 267 if (isUncondBranchOpcode(LastOpc)) { 268 TBB = LastInst->getOperand(0).getMBB(); 269 return false; 270 } 271 if (isCondBranchOpcode(LastOpc)) { 272 // Block ends with fall-through condbranch. 273 parseCondBranch(LastInst, TBB, Cond); 274 return false; 275 } 276 return true; // Can't handle indirect branch. 277 } 278 279 // Get the instruction before it if it is a terminator. 280 MachineInstr *SecondLastInst = &*I; 281 unsigned SecondLastOpc = SecondLastInst->getOpcode(); 282 283 // If AllowModify is true and the block ends with two or more unconditional 284 // branches, delete all but the first unconditional branch. 
285 if (AllowModify && isUncondBranchOpcode(LastOpc)) { 286 while (isUncondBranchOpcode(SecondLastOpc)) { 287 LastInst->eraseFromParent(); 288 LastInst = SecondLastInst; 289 LastOpc = LastInst->getOpcode(); 290 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 291 // Return now the only terminator is an unconditional branch. 292 TBB = LastInst->getOperand(0).getMBB(); 293 return false; 294 } else { 295 SecondLastInst = &*I; 296 SecondLastOpc = SecondLastInst->getOpcode(); 297 } 298 } 299 } 300 301 // If we're allowed to modify and the block ends in a unconditional branch 302 // which could simply fallthrough, remove the branch. (Note: This case only 303 // matters when we can't understand the whole sequence, otherwise it's also 304 // handled by BranchFolding.cpp.) 305 if (AllowModify && isUncondBranchOpcode(LastOpc) && 306 MBB.isLayoutSuccessor(getBranchDestBlock(*LastInst))) { 307 LastInst->eraseFromParent(); 308 LastInst = SecondLastInst; 309 LastOpc = LastInst->getOpcode(); 310 if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { 311 assert(!isUncondBranchOpcode(LastOpc) && 312 "unreachable unconditional branches removed above"); 313 314 if (isCondBranchOpcode(LastOpc)) { 315 // Block ends with fall-through condbranch. 316 parseCondBranch(LastInst, TBB, Cond); 317 return false; 318 } 319 return true; // Can't handle indirect branch. 320 } else { 321 SecondLastInst = &*I; 322 SecondLastOpc = SecondLastInst->getOpcode(); 323 } 324 } 325 326 // If there are three terminators, we don't know what sort of block this is. 327 if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) 328 return true; 329 330 // If the block ends with a B and a Bcc, handle it. 331 if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 332 parseCondBranch(SecondLastInst, TBB, Cond); 333 FBB = LastInst->getOperand(0).getMBB(); 334 return false; 335 } 336 337 // If the block ends with two unconditional branches, handle it. 
The second 338 // one is not executed, so remove it. 339 if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 340 TBB = SecondLastInst->getOperand(0).getMBB(); 341 I = LastInst; 342 if (AllowModify) 343 I->eraseFromParent(); 344 return false; 345 } 346 347 // ...likewise if it ends with an indirect branch followed by an unconditional 348 // branch. 349 if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { 350 I = LastInst; 351 if (AllowModify) 352 I->eraseFromParent(); 353 return true; 354 } 355 356 // Otherwise, can't handle this. 357 return true; 358 } 359 360 bool AArch64InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, 361 MachineBranchPredicate &MBP, 362 bool AllowModify) const { 363 // For the moment, handle only a block which ends with a cb(n)zx followed by 364 // a fallthrough. Why this? Because it is a common form. 365 // TODO: Should we handle b.cc? 366 367 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 368 if (I == MBB.end()) 369 return true; 370 371 // Skip over SpeculationBarrierEndBB terminators 372 if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB || 373 I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) { 374 --I; 375 } 376 377 if (!isUnpredicatedTerminator(*I)) 378 return true; 379 380 // Get the last instruction in the block. 381 MachineInstr *LastInst = &*I; 382 unsigned LastOpc = LastInst->getOpcode(); 383 if (!isCondBranchOpcode(LastOpc)) 384 return true; 385 386 switch (LastOpc) { 387 default: 388 return true; 389 case AArch64::CBZW: 390 case AArch64::CBZX: 391 case AArch64::CBNZW: 392 case AArch64::CBNZX: 393 break; 394 }; 395 396 MBP.TrueDest = LastInst->getOperand(1).getMBB(); 397 assert(MBP.TrueDest && "expected!"); 398 MBP.FalseDest = MBB.getNextNode(); 399 400 MBP.ConditionDef = nullptr; 401 MBP.SingleUseCondition = false; 402 403 MBP.LHS = LastInst->getOperand(0); 404 MBP.RHS = MachineOperand::CreateImm(0); 405 MBP.Predicate = LastOpc == AArch64::CBNZX ? 
MachineBranchPredicate::PRED_NE 406 : MachineBranchPredicate::PRED_EQ; 407 return false; 408 } 409 410 bool AArch64InstrInfo::reverseBranchCondition( 411 SmallVectorImpl<MachineOperand> &Cond) const { 412 if (Cond[0].getImm() != -1) { 413 // Regular Bcc 414 AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); 415 Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); 416 } else { 417 // Folded compare-and-branch 418 switch (Cond[1].getImm()) { 419 default: 420 llvm_unreachable("Unknown conditional branch!"); 421 case AArch64::CBZW: 422 Cond[1].setImm(AArch64::CBNZW); 423 break; 424 case AArch64::CBNZW: 425 Cond[1].setImm(AArch64::CBZW); 426 break; 427 case AArch64::CBZX: 428 Cond[1].setImm(AArch64::CBNZX); 429 break; 430 case AArch64::CBNZX: 431 Cond[1].setImm(AArch64::CBZX); 432 break; 433 case AArch64::TBZW: 434 Cond[1].setImm(AArch64::TBNZW); 435 break; 436 case AArch64::TBNZW: 437 Cond[1].setImm(AArch64::TBZW); 438 break; 439 case AArch64::TBZX: 440 Cond[1].setImm(AArch64::TBNZX); 441 break; 442 case AArch64::TBNZX: 443 Cond[1].setImm(AArch64::TBZX); 444 break; 445 } 446 } 447 448 return false; 449 } 450 451 unsigned AArch64InstrInfo::removeBranch(MachineBasicBlock &MBB, 452 int *BytesRemoved) const { 453 MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); 454 if (I == MBB.end()) 455 return 0; 456 457 if (!isUncondBranchOpcode(I->getOpcode()) && 458 !isCondBranchOpcode(I->getOpcode())) 459 return 0; 460 461 // Remove the branch. 462 I->eraseFromParent(); 463 464 I = MBB.end(); 465 466 if (I == MBB.begin()) { 467 if (BytesRemoved) 468 *BytesRemoved = 4; 469 return 1; 470 } 471 --I; 472 if (!isCondBranchOpcode(I->getOpcode())) { 473 if (BytesRemoved) 474 *BytesRemoved = 4; 475 return 1; 476 } 477 478 // Remove the branch. 
479 I->eraseFromParent(); 480 if (BytesRemoved) 481 *BytesRemoved = 8; 482 483 return 2; 484 } 485 486 void AArch64InstrInfo::instantiateCondBranch( 487 MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, 488 ArrayRef<MachineOperand> Cond) const { 489 if (Cond[0].getImm() != -1) { 490 // Regular Bcc 491 BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); 492 } else { 493 // Folded compare-and-branch 494 // Note that we use addOperand instead of addReg to keep the flags. 495 const MachineInstrBuilder MIB = 496 BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]); 497 if (Cond.size() > 3) 498 MIB.addImm(Cond[3].getImm()); 499 MIB.addMBB(TBB); 500 } 501 } 502 503 unsigned AArch64InstrInfo::insertBranch( 504 MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, 505 ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { 506 // Shouldn't be a fall through. 507 assert(TBB && "insertBranch must not be told to insert a fallthrough"); 508 509 if (!FBB) { 510 if (Cond.empty()) // Unconditional branch? 511 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); 512 else 513 instantiateCondBranch(MBB, DL, TBB, Cond); 514 515 if (BytesAdded) 516 *BytesAdded = 4; 517 518 return 1; 519 } 520 521 // Two-way conditional branch. 522 instantiateCondBranch(MBB, DL, TBB, Cond); 523 BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); 524 525 if (BytesAdded) 526 *BytesAdded = 8; 527 528 return 2; 529 } 530 531 // Find the original register that VReg is copied from. 532 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { 533 while (Register::isVirtualRegister(VReg)) { 534 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 535 if (!DefMI->isFullCopy()) 536 return VReg; 537 VReg = DefMI->getOperand(1).getReg(); 538 } 539 return VReg; 540 } 541 542 // Determine if VReg is defined by an instruction that can be folded into a 543 // csel instruction. 
If so, return the folded opcode, and the replacement 544 // register. 545 static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, 546 unsigned *NewVReg = nullptr) { 547 VReg = removeCopies(MRI, VReg); 548 if (!Register::isVirtualRegister(VReg)) 549 return 0; 550 551 bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); 552 const MachineInstr *DefMI = MRI.getVRegDef(VReg); 553 unsigned Opc = 0; 554 unsigned SrcOpNum = 0; 555 switch (DefMI->getOpcode()) { 556 case AArch64::ADDSXri: 557 case AArch64::ADDSWri: 558 // if NZCV is used, do not fold. 559 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 560 return 0; 561 // fall-through to ADDXri and ADDWri. 562 LLVM_FALLTHROUGH; 563 case AArch64::ADDXri: 564 case AArch64::ADDWri: 565 // add x, 1 -> csinc. 566 if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || 567 DefMI->getOperand(3).getImm() != 0) 568 return 0; 569 SrcOpNum = 1; 570 Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; 571 break; 572 573 case AArch64::ORNXrr: 574 case AArch64::ORNWrr: { 575 // not x -> csinv, represented as orn dst, xzr, src. 576 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 577 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 578 return 0; 579 SrcOpNum = 2; 580 Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; 581 break; 582 } 583 584 case AArch64::SUBSXrr: 585 case AArch64::SUBSWrr: 586 // if NZCV is used, do not fold. 587 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) 588 return 0; 589 // fall-through to SUBXrr and SUBWrr. 590 LLVM_FALLTHROUGH; 591 case AArch64::SUBXrr: 592 case AArch64::SUBWrr: { 593 // neg x -> csneg, represented as sub dst, xzr, src. 594 unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); 595 if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) 596 return 0; 597 SrcOpNum = 2; 598 Opc = Is64Bit ? 
AArch64::CSNEGXr : AArch64::CSNEGWr; 599 break; 600 } 601 default: 602 return 0; 603 } 604 assert(Opc && SrcOpNum && "Missing parameters"); 605 606 if (NewVReg) 607 *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); 608 return Opc; 609 } 610 611 bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB, 612 ArrayRef<MachineOperand> Cond, 613 Register DstReg, Register TrueReg, 614 Register FalseReg, int &CondCycles, 615 int &TrueCycles, 616 int &FalseCycles) const { 617 // Check register classes. 618 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 619 const TargetRegisterClass *RC = 620 RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); 621 if (!RC) 622 return false; 623 624 // Also need to check the dest regclass, in case we're trying to optimize 625 // something like: 626 // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2 627 if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg))) 628 return false; 629 630 // Expanding cbz/tbz requires an extra cycle of latency on the condition. 631 unsigned ExtraCondLat = Cond.size() != 1; 632 633 // GPRs are handled by csel. 634 // FIXME: Fold in x+1, -x, and ~x when applicable. 635 if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || 636 AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 637 // Single-cycle csel, csinc, csinv, and csneg. 638 CondCycles = 1 + ExtraCondLat; 639 TrueCycles = FalseCycles = 1; 640 if (canFoldIntoCSel(MRI, TrueReg)) 641 TrueCycles = 0; 642 else if (canFoldIntoCSel(MRI, FalseReg)) 643 FalseCycles = 0; 644 return true; 645 } 646 647 // Scalar floating point is handled by fcsel. 648 // FIXME: Form fabs, fmin, and fmax when applicable. 649 if (AArch64::FPR64RegClass.hasSubClassEq(RC) || 650 AArch64::FPR32RegClass.hasSubClassEq(RC)) { 651 CondCycles = 5 + ExtraCondLat; 652 TrueCycles = FalseCycles = 2; 653 return true; 654 } 655 656 // Can't do vectors. 
657 return false; 658 } 659 660 void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, 661 MachineBasicBlock::iterator I, 662 const DebugLoc &DL, Register DstReg, 663 ArrayRef<MachineOperand> Cond, 664 Register TrueReg, Register FalseReg) const { 665 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 666 667 // Parse the condition code, see parseCondBranch() above. 668 AArch64CC::CondCode CC; 669 switch (Cond.size()) { 670 default: 671 llvm_unreachable("Unknown condition opcode in Cond"); 672 case 1: // b.cc 673 CC = AArch64CC::CondCode(Cond[0].getImm()); 674 break; 675 case 3: { // cbz/cbnz 676 // We must insert a compare against 0. 677 bool Is64Bit; 678 switch (Cond[1].getImm()) { 679 default: 680 llvm_unreachable("Unknown branch opcode in Cond"); 681 case AArch64::CBZW: 682 Is64Bit = false; 683 CC = AArch64CC::EQ; 684 break; 685 case AArch64::CBZX: 686 Is64Bit = true; 687 CC = AArch64CC::EQ; 688 break; 689 case AArch64::CBNZW: 690 Is64Bit = false; 691 CC = AArch64CC::NE; 692 break; 693 case AArch64::CBNZX: 694 Is64Bit = true; 695 CC = AArch64CC::NE; 696 break; 697 } 698 Register SrcReg = Cond[2].getReg(); 699 if (Is64Bit) { 700 // cmp reg, #0 is actually subs xzr, reg, #0. 701 MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); 702 BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) 703 .addReg(SrcReg) 704 .addImm(0) 705 .addImm(0); 706 } else { 707 MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); 708 BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) 709 .addReg(SrcReg) 710 .addImm(0) 711 .addImm(0); 712 } 713 break; 714 } 715 case 4: { // tbz/tbnz 716 // We must insert a tst instruction. 717 switch (Cond[1].getImm()) { 718 default: 719 llvm_unreachable("Unknown branch opcode in Cond"); 720 case AArch64::TBZW: 721 case AArch64::TBZX: 722 CC = AArch64CC::EQ; 723 break; 724 case AArch64::TBNZW: 725 case AArch64::TBNZX: 726 CC = AArch64CC::NE; 727 break; 728 } 729 // cmp reg, #foo is actually ands xzr, reg, #1<<foo. 
730 if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW) 731 BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR) 732 .addReg(Cond[2].getReg()) 733 .addImm( 734 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32)); 735 else 736 BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR) 737 .addReg(Cond[2].getReg()) 738 .addImm( 739 AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64)); 740 break; 741 } 742 } 743 744 unsigned Opc = 0; 745 const TargetRegisterClass *RC = nullptr; 746 bool TryFold = false; 747 if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) { 748 RC = &AArch64::GPR64RegClass; 749 Opc = AArch64::CSELXr; 750 TryFold = true; 751 } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) { 752 RC = &AArch64::GPR32RegClass; 753 Opc = AArch64::CSELWr; 754 TryFold = true; 755 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) { 756 RC = &AArch64::FPR64RegClass; 757 Opc = AArch64::FCSELDrrr; 758 } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) { 759 RC = &AArch64::FPR32RegClass; 760 Opc = AArch64::FCSELSrrr; 761 } 762 assert(RC && "Unsupported regclass"); 763 764 // Try folding simple instructions into the csel. 765 if (TryFold) { 766 unsigned NewVReg = 0; 767 unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg); 768 if (FoldedOpc) { 769 // The folded opcodes csinc, csinc and csneg apply the operation to 770 // FalseReg, so we need to invert the condition. 771 CC = AArch64CC::getInvertedCondCode(CC); 772 TrueReg = FalseReg; 773 } else 774 FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg); 775 776 // Fold the operation. Leave any dead instructions for DCE to clean up. 777 if (FoldedOpc) { 778 FalseReg = NewVReg; 779 Opc = FoldedOpc; 780 // The extends the live range of NewVReg. 781 MRI.clearKillFlags(NewVReg); 782 } 783 } 784 785 // Pull all virtual register into the appropriate class. 
786 MRI.constrainRegClass(TrueReg, RC); 787 MRI.constrainRegClass(FalseReg, RC); 788 789 // Insert the csel. 790 BuildMI(MBB, I, DL, get(Opc), DstReg) 791 .addReg(TrueReg) 792 .addReg(FalseReg) 793 .addImm(CC); 794 } 795 796 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. 797 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { 798 uint64_t Imm = MI.getOperand(1).getImm(); 799 uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); 800 uint64_t Encoding; 801 return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); 802 } 803 804 // FIXME: this implementation should be micro-architecture dependent, so a 805 // micro-architecture target hook should be introduced here in future. 806 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { 807 if (!Subtarget.hasCustomCheapAsMoveHandling()) 808 return MI.isAsCheapAsAMove(); 809 810 const unsigned Opcode = MI.getOpcode(); 811 812 // Firstly, check cases gated by features. 813 814 if (Subtarget.hasZeroCycleZeroingFP()) { 815 if (Opcode == AArch64::FMOVH0 || 816 Opcode == AArch64::FMOVS0 || 817 Opcode == AArch64::FMOVD0) 818 return true; 819 } 820 821 if (Subtarget.hasZeroCycleZeroingGP()) { 822 if (Opcode == TargetOpcode::COPY && 823 (MI.getOperand(1).getReg() == AArch64::WZR || 824 MI.getOperand(1).getReg() == AArch64::XZR)) 825 return true; 826 } 827 828 // Secondly, check cases specific to sub-targets. 829 830 if (Subtarget.hasExynosCheapAsMoveHandling()) { 831 if (isExynosCheapAsMove(MI)) 832 return true; 833 834 return MI.isAsCheapAsAMove(); 835 } 836 837 // Finally, check generic cases. 
838 839 switch (Opcode) { 840 default: 841 return false; 842 843 // add/sub on register without shift 844 case AArch64::ADDWri: 845 case AArch64::ADDXri: 846 case AArch64::SUBWri: 847 case AArch64::SUBXri: 848 return (MI.getOperand(3).getImm() == 0); 849 850 // logical ops on immediate 851 case AArch64::ANDWri: 852 case AArch64::ANDXri: 853 case AArch64::EORWri: 854 case AArch64::EORXri: 855 case AArch64::ORRWri: 856 case AArch64::ORRXri: 857 return true; 858 859 // logical ops on register without shift 860 case AArch64::ANDWrr: 861 case AArch64::ANDXrr: 862 case AArch64::BICWrr: 863 case AArch64::BICXrr: 864 case AArch64::EONWrr: 865 case AArch64::EONXrr: 866 case AArch64::EORWrr: 867 case AArch64::EORXrr: 868 case AArch64::ORNWrr: 869 case AArch64::ORNXrr: 870 case AArch64::ORRWrr: 871 case AArch64::ORRXrr: 872 return true; 873 874 // If MOVi32imm or MOVi64imm can be expanded into ORRWri or 875 // ORRXri, it is as cheap as MOV 876 case AArch64::MOVi32imm: 877 return canBeExpandedToORR(MI, 32); 878 case AArch64::MOVi64imm: 879 return canBeExpandedToORR(MI, 64); 880 } 881 882 llvm_unreachable("Unknown opcode to check as cheap as a move!"); 883 } 884 885 bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) { 886 switch (MI.getOpcode()) { 887 default: 888 return false; 889 890 case AArch64::ADDWrs: 891 case AArch64::ADDXrs: 892 case AArch64::ADDSWrs: 893 case AArch64::ADDSXrs: { 894 unsigned Imm = MI.getOperand(3).getImm(); 895 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 896 if (ShiftVal == 0) 897 return true; 898 return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; 899 } 900 901 case AArch64::ADDWrx: 902 case AArch64::ADDXrx: 903 case AArch64::ADDXrx64: 904 case AArch64::ADDSWrx: 905 case AArch64::ADDSXrx: 906 case AArch64::ADDSXrx64: { 907 unsigned Imm = MI.getOperand(3).getImm(); 908 switch (AArch64_AM::getArithExtendType(Imm)) { 909 default: 910 return false; 911 case AArch64_AM::UXTB: 912 case AArch64_AM::UXTH: 913 
case AArch64_AM::UXTW: 914 case AArch64_AM::UXTX: 915 return AArch64_AM::getArithShiftValue(Imm) <= 4; 916 } 917 } 918 919 case AArch64::SUBWrs: 920 case AArch64::SUBSWrs: { 921 unsigned Imm = MI.getOperand(3).getImm(); 922 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 923 return ShiftVal == 0 || 924 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); 925 } 926 927 case AArch64::SUBXrs: 928 case AArch64::SUBSXrs: { 929 unsigned Imm = MI.getOperand(3).getImm(); 930 unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); 931 return ShiftVal == 0 || 932 (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); 933 } 934 935 case AArch64::SUBWrx: 936 case AArch64::SUBXrx: 937 case AArch64::SUBXrx64: 938 case AArch64::SUBSWrx: 939 case AArch64::SUBSXrx: 940 case AArch64::SUBSXrx64: { 941 unsigned Imm = MI.getOperand(3).getImm(); 942 switch (AArch64_AM::getArithExtendType(Imm)) { 943 default: 944 return false; 945 case AArch64_AM::UXTB: 946 case AArch64_AM::UXTH: 947 case AArch64_AM::UXTW: 948 case AArch64_AM::UXTX: 949 return AArch64_AM::getArithShiftValue(Imm) == 0; 950 } 951 } 952 953 case AArch64::LDRBBroW: 954 case AArch64::LDRBBroX: 955 case AArch64::LDRBroW: 956 case AArch64::LDRBroX: 957 case AArch64::LDRDroW: 958 case AArch64::LDRDroX: 959 case AArch64::LDRHHroW: 960 case AArch64::LDRHHroX: 961 case AArch64::LDRHroW: 962 case AArch64::LDRHroX: 963 case AArch64::LDRQroW: 964 case AArch64::LDRQroX: 965 case AArch64::LDRSBWroW: 966 case AArch64::LDRSBWroX: 967 case AArch64::LDRSBXroW: 968 case AArch64::LDRSBXroX: 969 case AArch64::LDRSHWroW: 970 case AArch64::LDRSHWroX: 971 case AArch64::LDRSHXroW: 972 case AArch64::LDRSHXroX: 973 case AArch64::LDRSWroW: 974 case AArch64::LDRSWroX: 975 case AArch64::LDRSroW: 976 case AArch64::LDRSroX: 977 case AArch64::LDRWroW: 978 case AArch64::LDRWroX: 979 case AArch64::LDRXroW: 980 case AArch64::LDRXroX: 981 case AArch64::PRFMroW: 982 case AArch64::PRFMroX: 983 case AArch64::STRBBroW: 984 case 
AArch64::STRBBroX: 985 case AArch64::STRBroW: 986 case AArch64::STRBroX: 987 case AArch64::STRDroW: 988 case AArch64::STRDroX: 989 case AArch64::STRHHroW: 990 case AArch64::STRHHroX: 991 case AArch64::STRHroW: 992 case AArch64::STRHroX: 993 case AArch64::STRQroW: 994 case AArch64::STRQroX: 995 case AArch64::STRSroW: 996 case AArch64::STRSroX: 997 case AArch64::STRWroW: 998 case AArch64::STRWroX: 999 case AArch64::STRXroW: 1000 case AArch64::STRXroX: { 1001 unsigned IsSigned = MI.getOperand(3).getImm(); 1002 return !IsSigned; 1003 } 1004 } 1005 } 1006 1007 bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { 1008 unsigned Opc = MI.getOpcode(); 1009 switch (Opc) { 1010 default: 1011 return false; 1012 case AArch64::SEH_StackAlloc: 1013 case AArch64::SEH_SaveFPLR: 1014 case AArch64::SEH_SaveFPLR_X: 1015 case AArch64::SEH_SaveReg: 1016 case AArch64::SEH_SaveReg_X: 1017 case AArch64::SEH_SaveRegP: 1018 case AArch64::SEH_SaveRegP_X: 1019 case AArch64::SEH_SaveFReg: 1020 case AArch64::SEH_SaveFReg_X: 1021 case AArch64::SEH_SaveFRegP: 1022 case AArch64::SEH_SaveFRegP_X: 1023 case AArch64::SEH_SetFP: 1024 case AArch64::SEH_AddFP: 1025 case AArch64::SEH_Nop: 1026 case AArch64::SEH_PrologEnd: 1027 case AArch64::SEH_EpilogStart: 1028 case AArch64::SEH_EpilogEnd: 1029 return true; 1030 } 1031 } 1032 1033 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, 1034 Register &SrcReg, Register &DstReg, 1035 unsigned &SubIdx) const { 1036 switch (MI.getOpcode()) { 1037 default: 1038 return false; 1039 case AArch64::SBFMXri: // aka sxtw 1040 case AArch64::UBFMXri: // aka uxtw 1041 // Check for the 32 -> 64 bit extension case, these instructions can do 1042 // much more. 1043 if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) 1044 return false; 1045 // This is a signed or unsigned 32 -> 64 bit extension. 
1046 SrcReg = MI.getOperand(1).getReg(); 1047 DstReg = MI.getOperand(0).getReg(); 1048 SubIdx = AArch64::sub_32; 1049 return true; 1050 } 1051 } 1052 1053 bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( 1054 const MachineInstr &MIa, const MachineInstr &MIb) const { 1055 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1056 const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr; 1057 int64_t OffsetA = 0, OffsetB = 0; 1058 unsigned WidthA = 0, WidthB = 0; 1059 bool OffsetAIsScalable = false, OffsetBIsScalable = false; 1060 1061 assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); 1062 assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); 1063 1064 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || 1065 MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) 1066 return false; 1067 1068 // Retrieve the base, offset from the base and width. Width 1069 // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If 1070 // base are identical, and the offset of a lower memory access + 1071 // the width doesn't overlap the offset of a higher memory access, 1072 // then the memory accesses are different. 1073 // If OffsetAIsScalable and OffsetBIsScalable are both true, they 1074 // are assumed to have the same scale (vscale). 1075 if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable, 1076 WidthA, TRI) && 1077 getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable, 1078 WidthB, TRI)) { 1079 if (BaseOpA->isIdenticalTo(*BaseOpB) && 1080 OffsetAIsScalable == OffsetBIsScalable) { 1081 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; 1082 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA; 1083 int LowWidth = (LowOffset == OffsetA) ? 
WidthA : WidthB; 1084 if (LowOffset + LowWidth <= HighOffset) 1085 return true; 1086 } 1087 } 1088 return false; 1089 } 1090 1091 bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI, 1092 const MachineBasicBlock *MBB, 1093 const MachineFunction &MF) const { 1094 if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF)) 1095 return true; 1096 switch (MI.getOpcode()) { 1097 case AArch64::HINT: 1098 // CSDB hints are scheduling barriers. 1099 if (MI.getOperand(0).getImm() == 0x14) 1100 return true; 1101 break; 1102 case AArch64::DSB: 1103 case AArch64::ISB: 1104 // DSB and ISB also are scheduling barriers. 1105 return true; 1106 default:; 1107 } 1108 return isSEHInstruction(MI); 1109 } 1110 1111 /// analyzeCompare - For a comparison instruction, return the source registers 1112 /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. 1113 /// Return true if the comparison instruction can be analyzed. 1114 bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, 1115 Register &SrcReg2, int &CmpMask, 1116 int &CmpValue) const { 1117 // The first operand can be a frame index where we'd normally expect a 1118 // register. 1119 assert(MI.getNumOperands() >= 2 && "All AArch64 cmps should have 2 operands"); 1120 if (!MI.getOperand(1).isReg()) 1121 return false; 1122 1123 switch (MI.getOpcode()) { 1124 default: 1125 break; 1126 case AArch64::PTEST_PP: 1127 SrcReg = MI.getOperand(0).getReg(); 1128 SrcReg2 = MI.getOperand(1).getReg(); 1129 // Not sure about the mask and value for now... 
1130 CmpMask = ~0; 1131 CmpValue = 0; 1132 return true; 1133 case AArch64::SUBSWrr: 1134 case AArch64::SUBSWrs: 1135 case AArch64::SUBSWrx: 1136 case AArch64::SUBSXrr: 1137 case AArch64::SUBSXrs: 1138 case AArch64::SUBSXrx: 1139 case AArch64::ADDSWrr: 1140 case AArch64::ADDSWrs: 1141 case AArch64::ADDSWrx: 1142 case AArch64::ADDSXrr: 1143 case AArch64::ADDSXrs: 1144 case AArch64::ADDSXrx: 1145 // Replace SUBSWrr with SUBWrr if NZCV is not used. 1146 SrcReg = MI.getOperand(1).getReg(); 1147 SrcReg2 = MI.getOperand(2).getReg(); 1148 CmpMask = ~0; 1149 CmpValue = 0; 1150 return true; 1151 case AArch64::SUBSWri: 1152 case AArch64::ADDSWri: 1153 case AArch64::SUBSXri: 1154 case AArch64::ADDSXri: 1155 SrcReg = MI.getOperand(1).getReg(); 1156 SrcReg2 = 0; 1157 CmpMask = ~0; 1158 // FIXME: In order to convert CmpValue to 0 or 1 1159 CmpValue = MI.getOperand(2).getImm() != 0; 1160 return true; 1161 case AArch64::ANDSWri: 1162 case AArch64::ANDSXri: 1163 // ANDS does not use the same encoding scheme as the others xxxS 1164 // instructions. 1165 SrcReg = MI.getOperand(1).getReg(); 1166 SrcReg2 = 0; 1167 CmpMask = ~0; 1168 // FIXME:The return val type of decodeLogicalImmediate is uint64_t, 1169 // while the type of CmpValue is int. When converting uint64_t to int, 1170 // the high 32 bits of uint64_t will be lost. 1171 // In fact it causes a bug in spec2006-483.xalancbmk 1172 // CmpValue is only used to compare with zero in OptimizeCompareInstr 1173 CmpValue = AArch64_AM::decodeLogicalImmediate( 1174 MI.getOperand(2).getImm(), 1175 MI.getOpcode() == AArch64::ANDSWri ? 
32 : 64) != 0; 1176 return true; 1177 } 1178 1179 return false; 1180 } 1181 1182 static bool UpdateOperandRegClass(MachineInstr &Instr) { 1183 MachineBasicBlock *MBB = Instr.getParent(); 1184 assert(MBB && "Can't get MachineBasicBlock here"); 1185 MachineFunction *MF = MBB->getParent(); 1186 assert(MF && "Can't get MachineFunction here"); 1187 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); 1188 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 1189 MachineRegisterInfo *MRI = &MF->getRegInfo(); 1190 1191 for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; 1192 ++OpIdx) { 1193 MachineOperand &MO = Instr.getOperand(OpIdx); 1194 const TargetRegisterClass *OpRegCstraints = 1195 Instr.getRegClassConstraint(OpIdx, TII, TRI); 1196 1197 // If there's no constraint, there's nothing to do. 1198 if (!OpRegCstraints) 1199 continue; 1200 // If the operand is a frame index, there's nothing to do here. 1201 // A frame index operand will resolve correctly during PEI. 1202 if (MO.isFI()) 1203 continue; 1204 1205 assert(MO.isReg() && 1206 "Operand has register constraints without being a register!"); 1207 1208 Register Reg = MO.getReg(); 1209 if (Register::isPhysicalRegister(Reg)) { 1210 if (!OpRegCstraints->contains(Reg)) 1211 return false; 1212 } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && 1213 !MRI->constrainRegClass(Reg, OpRegCstraints)) 1214 return false; 1215 } 1216 1217 return true; 1218 } 1219 1220 /// Return the opcode that does not set flags when possible - otherwise 1221 /// return the original opcode. The caller is responsible to do the actual 1222 /// substitution and legality checking. 1223 static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) { 1224 // Don't convert all compare instructions, because for some the zero register 1225 // encoding becomes the sp register. 
1226 bool MIDefinesZeroReg = false; 1227 if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) 1228 MIDefinesZeroReg = true; 1229 1230 switch (MI.getOpcode()) { 1231 default: 1232 return MI.getOpcode(); 1233 case AArch64::ADDSWrr: 1234 return AArch64::ADDWrr; 1235 case AArch64::ADDSWri: 1236 return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri; 1237 case AArch64::ADDSWrs: 1238 return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs; 1239 case AArch64::ADDSWrx: 1240 return AArch64::ADDWrx; 1241 case AArch64::ADDSXrr: 1242 return AArch64::ADDXrr; 1243 case AArch64::ADDSXri: 1244 return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri; 1245 case AArch64::ADDSXrs: 1246 return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs; 1247 case AArch64::ADDSXrx: 1248 return AArch64::ADDXrx; 1249 case AArch64::SUBSWrr: 1250 return AArch64::SUBWrr; 1251 case AArch64::SUBSWri: 1252 return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri; 1253 case AArch64::SUBSWrs: 1254 return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs; 1255 case AArch64::SUBSWrx: 1256 return AArch64::SUBWrx; 1257 case AArch64::SUBSXrr: 1258 return AArch64::SUBXrr; 1259 case AArch64::SUBSXri: 1260 return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri; 1261 case AArch64::SUBSXrs: 1262 return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs; 1263 case AArch64::SUBSXrx: 1264 return AArch64::SUBXrx; 1265 } 1266 } 1267 1268 enum AccessKind { AK_Write = 0x01, AK_Read = 0x10, AK_All = 0x11 }; 1269 1270 /// True when condition flags are accessed (either by writing or reading) 1271 /// on the instruction trace starting at From and ending at To. 1272 /// 1273 /// Note: If From and To are from different blocks it's assumed CC are accessed 1274 /// on the path. 
1275 static bool areCFlagsAccessedBetweenInstrs( 1276 MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, 1277 const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { 1278 // Early exit if To is at the beginning of the BB. 1279 if (To == To->getParent()->begin()) 1280 return true; 1281 1282 // Check whether the instructions are in the same basic block 1283 // If not, assume the condition flags might get modified somewhere. 1284 if (To->getParent() != From->getParent()) 1285 return true; 1286 1287 // From must be above To. 1288 assert(std::any_of( 1289 ++To.getReverse(), To->getParent()->rend(), 1290 [From](MachineInstr &MI) { return MI.getIterator() == From; })); 1291 1292 // We iterate backward starting at \p To until we hit \p From. 1293 for (const MachineInstr &Instr : 1294 instructionsWithoutDebug(++To.getReverse(), From.getReverse())) { 1295 if (((AccessToCheck & AK_Write) && 1296 Instr.modifiesRegister(AArch64::NZCV, TRI)) || 1297 ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) 1298 return true; 1299 } 1300 return false; 1301 } 1302 1303 /// optimizePTestInstr - Attempt to remove a ptest of a predicate-generating 1304 /// operation which could set the flags in an identical manner 1305 bool AArch64InstrInfo::optimizePTestInstr( 1306 MachineInstr *PTest, unsigned MaskReg, unsigned PredReg, 1307 const MachineRegisterInfo *MRI) const { 1308 auto *Mask = MRI->getUniqueVRegDef(MaskReg); 1309 auto *Pred = MRI->getUniqueVRegDef(PredReg); 1310 auto NewOp = Pred->getOpcode(); 1311 bool OpChanged = false; 1312 1313 unsigned MaskOpcode = Mask->getOpcode(); 1314 unsigned PredOpcode = Pred->getOpcode(); 1315 bool PredIsPTestLike = isPTestLikeOpcode(PredOpcode); 1316 bool PredIsWhileLike = isWhileOpcode(PredOpcode); 1317 1318 if (isPTrueOpcode(MaskOpcode) && (PredIsPTestLike || PredIsWhileLike)) { 1319 // For PTEST(PTRUE, OTHER_INST), PTEST is redundant when PTRUE doesn't 1320 // deactivate any lanes OTHER_INST might 
set. 1321 uint64_t MaskElementSize = getElementSizeForOpcode(MaskOpcode); 1322 uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode); 1323 1324 // Must be an all active predicate of matching element size. 1325 if ((PredElementSize != MaskElementSize) || 1326 (Mask->getOperand(1).getImm() != 31)) 1327 return false; 1328 1329 // Fallthough to simply remove the PTEST. 1330 } else if ((Mask == Pred) && (PredIsPTestLike || PredIsWhileLike)) { 1331 // For PTEST(PG, PG), PTEST is redundant when PG is the result of an 1332 // instruction that sets the flags as PTEST would. 1333 1334 // Fallthough to simply remove the PTEST. 1335 } else if (PredIsPTestLike) { 1336 // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both 1337 // instructions use the same predicate. 1338 auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1339 if (Mask != PTestLikeMask) 1340 return false; 1341 1342 // Fallthough to simply remove the PTEST. 1343 } else { 1344 switch (Pred->getOpcode()) { 1345 case AArch64::BRKB_PPzP: 1346 case AArch64::BRKPB_PPzPP: { 1347 // Op 0 is chain, 1 is the mask, 2 the previous predicate to 1348 // propagate, 3 the new predicate. 1349 1350 // Check to see if our mask is the same as the brkpb's. If 1351 // not the resulting flag bits may be different and we 1352 // can't remove the ptest. 1353 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1354 if (Mask != PredMask) 1355 return false; 1356 1357 // Switch to the new opcode 1358 NewOp = Pred->getOpcode() == AArch64::BRKB_PPzP ? 
AArch64::BRKBS_PPzP 1359 : AArch64::BRKPBS_PPzPP; 1360 OpChanged = true; 1361 break; 1362 } 1363 case AArch64::BRKN_PPzP: { 1364 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1365 if (Mask != PredMask) 1366 return false; 1367 1368 NewOp = AArch64::BRKNS_PPzP; 1369 OpChanged = true; 1370 break; 1371 } 1372 case AArch64::RDFFR_PPz: { 1373 // rdffr p1.b, PredMask=p0/z <--- Definition of Pred 1374 // ptest Mask=p0, Pred=p1.b <--- If equal masks, remove this and use 1375 // `rdffrs p1.b, p0/z` above. 1376 auto *PredMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg()); 1377 if (Mask != PredMask) 1378 return false; 1379 1380 NewOp = AArch64::RDFFRS_PPz; 1381 OpChanged = true; 1382 break; 1383 } 1384 default: 1385 // Bail out if we don't recognize the input 1386 return false; 1387 } 1388 } 1389 1390 const TargetRegisterInfo *TRI = &getRegisterInfo(); 1391 1392 // If another instruction between Pred and PTest accesses flags, don't remove 1393 // the ptest or update the earlier instruction to modify them. 1394 if (areCFlagsAccessedBetweenInstrs(Pred, PTest, TRI)) 1395 return false; 1396 1397 // If we pass all the checks, it's safe to remove the PTEST and use the flags 1398 // as they are prior to PTEST. Sometimes this requires the tested PTEST 1399 // operand to be replaced with an equivalent instruction that also sets the 1400 // flags. 1401 Pred->setDesc(get(NewOp)); 1402 PTest->eraseFromParent(); 1403 if (OpChanged) { 1404 bool succeeded = UpdateOperandRegClass(*Pred); 1405 (void)succeeded; 1406 assert(succeeded && "Operands have incompatible register classes!"); 1407 Pred->addRegisterDefined(AArch64::NZCV, TRI); 1408 } 1409 1410 // Ensure that the flags def is live. 
1411 if (Pred->registerDefIsDead(AArch64::NZCV, TRI)) { 1412 unsigned i = 0, e = Pred->getNumOperands(); 1413 for (; i != e; ++i) { 1414 MachineOperand &MO = Pred->getOperand(i); 1415 if (MO.isReg() && MO.isDef() && MO.getReg() == AArch64::NZCV) { 1416 MO.setIsDead(false); 1417 break; 1418 } 1419 } 1420 } 1421 return true; 1422 } 1423 1424 /// Try to optimize a compare instruction. A compare instruction is an 1425 /// instruction which produces AArch64::NZCV. It can be truly compare 1426 /// instruction 1427 /// when there are no uses of its destination register. 1428 /// 1429 /// The following steps are tried in order: 1430 /// 1. Convert CmpInstr into an unconditional version. 1431 /// 2. Remove CmpInstr if above there is an instruction producing a needed 1432 /// condition code or an instruction which can be converted into such an 1433 /// instruction. 1434 /// Only comparison with zero is supported. 1435 bool AArch64InstrInfo::optimizeCompareInstr( 1436 MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask, 1437 int CmpValue, const MachineRegisterInfo *MRI) const { 1438 assert(CmpInstr.getParent()); 1439 assert(MRI); 1440 1441 // Replace SUBSWrr with SUBWrr if NZCV is not used. 
1442 int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); 1443 if (DeadNZCVIdx != -1) { 1444 if (CmpInstr.definesRegister(AArch64::WZR) || 1445 CmpInstr.definesRegister(AArch64::XZR)) { 1446 CmpInstr.eraseFromParent(); 1447 return true; 1448 } 1449 unsigned Opc = CmpInstr.getOpcode(); 1450 unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr); 1451 if (NewOpc == Opc) 1452 return false; 1453 const MCInstrDesc &MCID = get(NewOpc); 1454 CmpInstr.setDesc(MCID); 1455 CmpInstr.RemoveOperand(DeadNZCVIdx); 1456 bool succeeded = UpdateOperandRegClass(CmpInstr); 1457 (void)succeeded; 1458 assert(succeeded && "Some operands reg class are incompatible!"); 1459 return true; 1460 } 1461 1462 if (CmpInstr.getOpcode() == AArch64::PTEST_PP) 1463 return optimizePTestInstr(&CmpInstr, SrcReg, SrcReg2, MRI); 1464 1465 // Continue only if we have a "ri" where immediate is zero. 1466 // FIXME:CmpValue has already been converted to 0 or 1 in analyzeCompare 1467 // function. 1468 assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!"); 1469 if (SrcReg2 != 0) 1470 return false; 1471 1472 // CmpInstr is a Compare instruction if destination register is not used. 1473 if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) 1474 return false; 1475 1476 if (!CmpValue && substituteCmpToZero(CmpInstr, SrcReg, *MRI)) 1477 return true; 1478 return removeCmpToZeroOrOne(CmpInstr, SrcReg, CmpValue, *MRI); 1479 } 1480 1481 /// Get opcode of S version of Instr. 1482 /// If Instr is S version its opcode is returned. 1483 /// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version 1484 /// or we are not interested in it. 
1485 static unsigned sForm(MachineInstr &Instr) { 1486 switch (Instr.getOpcode()) { 1487 default: 1488 return AArch64::INSTRUCTION_LIST_END; 1489 1490 case AArch64::ADDSWrr: 1491 case AArch64::ADDSWri: 1492 case AArch64::ADDSXrr: 1493 case AArch64::ADDSXri: 1494 case AArch64::SUBSWrr: 1495 case AArch64::SUBSWri: 1496 case AArch64::SUBSXrr: 1497 case AArch64::SUBSXri: 1498 return Instr.getOpcode(); 1499 1500 case AArch64::ADDWrr: 1501 return AArch64::ADDSWrr; 1502 case AArch64::ADDWri: 1503 return AArch64::ADDSWri; 1504 case AArch64::ADDXrr: 1505 return AArch64::ADDSXrr; 1506 case AArch64::ADDXri: 1507 return AArch64::ADDSXri; 1508 case AArch64::ADCWr: 1509 return AArch64::ADCSWr; 1510 case AArch64::ADCXr: 1511 return AArch64::ADCSXr; 1512 case AArch64::SUBWrr: 1513 return AArch64::SUBSWrr; 1514 case AArch64::SUBWri: 1515 return AArch64::SUBSWri; 1516 case AArch64::SUBXrr: 1517 return AArch64::SUBSXrr; 1518 case AArch64::SUBXri: 1519 return AArch64::SUBSXri; 1520 case AArch64::SBCWr: 1521 return AArch64::SBCSWr; 1522 case AArch64::SBCXr: 1523 return AArch64::SBCSXr; 1524 case AArch64::ANDWri: 1525 return AArch64::ANDSWri; 1526 case AArch64::ANDXri: 1527 return AArch64::ANDSXri; 1528 } 1529 } 1530 1531 /// Check if AArch64::NZCV should be alive in successors of MBB. 1532 static bool areCFlagsAliveInSuccessors(const MachineBasicBlock *MBB) { 1533 for (auto *BB : MBB->successors()) 1534 if (BB->isLiveIn(AArch64::NZCV)) 1535 return true; 1536 return false; 1537 } 1538 1539 /// \returns The condition code operand index for \p Instr if it is a branch 1540 /// or select and -1 otherwise. 
1541 static int 1542 findCondCodeUseOperandIdxForBranchOrSelect(const MachineInstr &Instr) { 1543 switch (Instr.getOpcode()) { 1544 default: 1545 return -1; 1546 1547 case AArch64::Bcc: { 1548 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1549 assert(Idx >= 2); 1550 return Idx - 2; 1551 } 1552 1553 case AArch64::CSINVWr: 1554 case AArch64::CSINVXr: 1555 case AArch64::CSINCWr: 1556 case AArch64::CSINCXr: 1557 case AArch64::CSELWr: 1558 case AArch64::CSELXr: 1559 case AArch64::CSNEGWr: 1560 case AArch64::CSNEGXr: 1561 case AArch64::FCSELSrrr: 1562 case AArch64::FCSELDrrr: { 1563 int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); 1564 assert(Idx >= 1); 1565 return Idx - 1; 1566 } 1567 } 1568 } 1569 1570 namespace { 1571 1572 struct UsedNZCV { 1573 bool N = false; 1574 bool Z = false; 1575 bool C = false; 1576 bool V = false; 1577 1578 UsedNZCV() = default; 1579 1580 UsedNZCV &operator|=(const UsedNZCV &UsedFlags) { 1581 this->N |= UsedFlags.N; 1582 this->Z |= UsedFlags.Z; 1583 this->C |= UsedFlags.C; 1584 this->V |= UsedFlags.V; 1585 return *this; 1586 } 1587 }; 1588 1589 } // end anonymous namespace 1590 1591 /// Find a condition code used by the instruction. 1592 /// Returns AArch64CC::Invalid if either the instruction does not use condition 1593 /// codes or we don't optimize CmpInstr in the presence of such instructions. 1594 static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { 1595 int CCIdx = findCondCodeUseOperandIdxForBranchOrSelect(Instr); 1596 return CCIdx >= 0 ? 
static_cast<AArch64CC::CondCode>( 1597 Instr.getOperand(CCIdx).getImm()) 1598 : AArch64CC::Invalid; 1599 } 1600 1601 static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { 1602 assert(CC != AArch64CC::Invalid); 1603 UsedNZCV UsedFlags; 1604 switch (CC) { 1605 default: 1606 break; 1607 1608 case AArch64CC::EQ: // Z set 1609 case AArch64CC::NE: // Z clear 1610 UsedFlags.Z = true; 1611 break; 1612 1613 case AArch64CC::HI: // Z clear and C set 1614 case AArch64CC::LS: // Z set or C clear 1615 UsedFlags.Z = true; 1616 LLVM_FALLTHROUGH; 1617 case AArch64CC::HS: // C set 1618 case AArch64CC::LO: // C clear 1619 UsedFlags.C = true; 1620 break; 1621 1622 case AArch64CC::MI: // N set 1623 case AArch64CC::PL: // N clear 1624 UsedFlags.N = true; 1625 break; 1626 1627 case AArch64CC::VS: // V set 1628 case AArch64CC::VC: // V clear 1629 UsedFlags.V = true; 1630 break; 1631 1632 case AArch64CC::GT: // Z clear, N and V the same 1633 case AArch64CC::LE: // Z set, N and V differ 1634 UsedFlags.Z = true; 1635 LLVM_FALLTHROUGH; 1636 case AArch64CC::GE: // N and V the same 1637 case AArch64CC::LT: // N and V differ 1638 UsedFlags.N = true; 1639 UsedFlags.V = true; 1640 break; 1641 } 1642 return UsedFlags; 1643 } 1644 1645 /// \returns Conditions flags used after \p CmpInstr in its MachineBB if they 1646 /// are not containing C or V flags and NZCV flags are not alive in successors 1647 /// of the same \p CmpInstr and \p MI parent. \returns None otherwise. 1648 /// 1649 /// Collect instructions using that flags in \p CCUseInstrs if provided. 
1650 static Optional<UsedNZCV> 1651 examineCFlagsUse(MachineInstr &MI, MachineInstr &CmpInstr, 1652 const TargetRegisterInfo &TRI, 1653 SmallVectorImpl<MachineInstr *> *CCUseInstrs = nullptr) { 1654 MachineBasicBlock *CmpParent = CmpInstr.getParent(); 1655 if (MI.getParent() != CmpParent) 1656 return None; 1657 1658 if (areCFlagsAliveInSuccessors(CmpParent)) 1659 return None; 1660 1661 UsedNZCV NZCVUsedAfterCmp; 1662 for (MachineInstr &Instr : instructionsWithoutDebug( 1663 std::next(CmpInstr.getIterator()), CmpParent->instr_end())) { 1664 if (Instr.readsRegister(AArch64::NZCV, &TRI)) { 1665 AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); 1666 if (CC == AArch64CC::Invalid) // Unsupported conditional instruction 1667 return None; 1668 NZCVUsedAfterCmp |= getUsedNZCV(CC); 1669 if (CCUseInstrs) 1670 CCUseInstrs->push_back(&Instr); 1671 } 1672 if (Instr.modifiesRegister(AArch64::NZCV, &TRI)) 1673 break; 1674 } 1675 if (NZCVUsedAfterCmp.C || NZCVUsedAfterCmp.V) 1676 return None; 1677 return NZCVUsedAfterCmp; 1678 } 1679 1680 static bool isADDSRegImm(unsigned Opcode) { 1681 return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; 1682 } 1683 1684 static bool isSUBSRegImm(unsigned Opcode) { 1685 return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; 1686 } 1687 1688 /// Check if CmpInstr can be substituted by MI. 1689 /// 1690 /// CmpInstr can be substituted: 1691 /// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1692 /// - and, MI and CmpInstr are from the same MachineBB 1693 /// - and, condition flags are not alive in successors of the CmpInstr parent 1694 /// - and, if MI opcode is the S form there must be no defs of flags between 1695 /// MI and CmpInstr 1696 /// or if MI opcode is not the S form there must be neither defs of flags 1697 /// nor uses of flags between MI and CmpInstr. 
1698 /// - and C/V flags are not used after CmpInstr 1699 static bool canInstrSubstituteCmpInstr(MachineInstr &MI, MachineInstr &CmpInstr, 1700 const TargetRegisterInfo &TRI) { 1701 assert(sForm(MI) != AArch64::INSTRUCTION_LIST_END); 1702 1703 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1704 if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) 1705 return false; 1706 1707 if (!examineCFlagsUse(MI, CmpInstr, TRI)) 1708 return false; 1709 1710 AccessKind AccessToCheck = AK_Write; 1711 if (sForm(MI) != MI.getOpcode()) 1712 AccessToCheck = AK_All; 1713 return !areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AccessToCheck); 1714 } 1715 1716 /// Substitute an instruction comparing to zero with another instruction 1717 /// which produces needed condition flags. 1718 /// 1719 /// Return true on success. 1720 bool AArch64InstrInfo::substituteCmpToZero( 1721 MachineInstr &CmpInstr, unsigned SrcReg, 1722 const MachineRegisterInfo &MRI) const { 1723 // Get the unique definition of SrcReg. 1724 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1725 if (!MI) 1726 return false; 1727 1728 const TargetRegisterInfo &TRI = getRegisterInfo(); 1729 1730 unsigned NewOpc = sForm(*MI); 1731 if (NewOpc == AArch64::INSTRUCTION_LIST_END) 1732 return false; 1733 1734 if (!canInstrSubstituteCmpInstr(*MI, CmpInstr, TRI)) 1735 return false; 1736 1737 // Update the instruction to set NZCV. 1738 MI->setDesc(get(NewOpc)); 1739 CmpInstr.eraseFromParent(); 1740 bool succeeded = UpdateOperandRegClass(*MI); 1741 (void)succeeded; 1742 assert(succeeded && "Some operands reg class are incompatible!"); 1743 MI->addRegisterDefined(AArch64::NZCV, &TRI); 1744 return true; 1745 } 1746 1747 /// \returns True if \p CmpInstr can be removed. 1748 /// 1749 /// \p IsInvertCC is true if, after removing \p CmpInstr, condition 1750 /// codes used in \p CCUseInstrs must be inverted. 
1751 static bool canCmpInstrBeRemoved(MachineInstr &MI, MachineInstr &CmpInstr, 1752 int CmpValue, const TargetRegisterInfo &TRI, 1753 SmallVectorImpl<MachineInstr *> &CCUseInstrs, 1754 bool &IsInvertCC) { 1755 assert((CmpValue == 0 || CmpValue == 1) && 1756 "Only comparisons to 0 or 1 considered for removal!"); 1757 1758 // MI is 'CSINCWr %vreg, wzr, wzr, <cc>' or 'CSINCXr %vreg, xzr, xzr, <cc>' 1759 unsigned MIOpc = MI.getOpcode(); 1760 if (MIOpc == AArch64::CSINCWr) { 1761 if (MI.getOperand(1).getReg() != AArch64::WZR || 1762 MI.getOperand(2).getReg() != AArch64::WZR) 1763 return false; 1764 } else if (MIOpc == AArch64::CSINCXr) { 1765 if (MI.getOperand(1).getReg() != AArch64::XZR || 1766 MI.getOperand(2).getReg() != AArch64::XZR) 1767 return false; 1768 } else { 1769 return false; 1770 } 1771 AArch64CC::CondCode MICC = findCondCodeUsedByInstr(MI); 1772 if (MICC == AArch64CC::Invalid) 1773 return false; 1774 1775 // NZCV needs to be defined 1776 if (MI.findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 1777 return false; 1778 1779 // CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' or 'SUBS %vreg, 1' 1780 const unsigned CmpOpcode = CmpInstr.getOpcode(); 1781 bool IsSubsRegImm = isSUBSRegImm(CmpOpcode); 1782 if (CmpValue && !IsSubsRegImm) 1783 return false; 1784 if (!CmpValue && !IsSubsRegImm && !isADDSRegImm(CmpOpcode)) 1785 return false; 1786 1787 // MI conditions allowed: eq, ne, mi, pl 1788 UsedNZCV MIUsedNZCV = getUsedNZCV(MICC); 1789 if (MIUsedNZCV.C || MIUsedNZCV.V) 1790 return false; 1791 1792 Optional<UsedNZCV> NZCVUsedAfterCmp = 1793 examineCFlagsUse(MI, CmpInstr, TRI, &CCUseInstrs); 1794 // Condition flags are not used in CmpInstr basic block successors and only 1795 // Z or N flags allowed to be used after CmpInstr within its basic block 1796 if (!NZCVUsedAfterCmp) 1797 return false; 1798 // Z or N flag used after CmpInstr must correspond to the flag used in MI 1799 if ((MIUsedNZCV.Z && NZCVUsedAfterCmp->N) || 1800 (MIUsedNZCV.N && 
NZCVUsedAfterCmp->Z)) 1801 return false; 1802 // If CmpInstr is comparison to zero MI conditions are limited to eq, ne 1803 if (MIUsedNZCV.N && !CmpValue) 1804 return false; 1805 1806 // There must be no defs of flags between MI and CmpInstr 1807 if (areCFlagsAccessedBetweenInstrs(&MI, &CmpInstr, &TRI, AK_Write)) 1808 return false; 1809 1810 // Condition code is inverted in the following cases: 1811 // 1. MI condition is ne; CmpInstr is 'ADDS %vreg, 0' or 'SUBS %vreg, 0' 1812 // 2. MI condition is eq, pl; CmpInstr is 'SUBS %vreg, 1' 1813 IsInvertCC = (CmpValue && (MICC == AArch64CC::EQ || MICC == AArch64CC::PL)) || 1814 (!CmpValue && MICC == AArch64CC::NE); 1815 return true; 1816 } 1817 1818 /// Remove comparision in csinc-cmp sequence 1819 /// 1820 /// Examples: 1821 /// 1. \code 1822 /// csinc w9, wzr, wzr, ne 1823 /// cmp w9, #0 1824 /// b.eq 1825 /// \endcode 1826 /// to 1827 /// \code 1828 /// csinc w9, wzr, wzr, ne 1829 /// b.ne 1830 /// \endcode 1831 /// 1832 /// 2. \code 1833 /// csinc x2, xzr, xzr, mi 1834 /// cmp x2, #1 1835 /// b.pl 1836 /// \endcode 1837 /// to 1838 /// \code 1839 /// csinc x2, xzr, xzr, mi 1840 /// b.pl 1841 /// \endcode 1842 /// 1843 /// \param CmpInstr comparison instruction 1844 /// \return True when comparison removed 1845 bool AArch64InstrInfo::removeCmpToZeroOrOne( 1846 MachineInstr &CmpInstr, unsigned SrcReg, int CmpValue, 1847 const MachineRegisterInfo &MRI) const { 1848 MachineInstr *MI = MRI.getUniqueVRegDef(SrcReg); 1849 if (!MI) 1850 return false; 1851 const TargetRegisterInfo &TRI = getRegisterInfo(); 1852 SmallVector<MachineInstr *, 4> CCUseInstrs; 1853 bool IsInvertCC = false; 1854 if (!canCmpInstrBeRemoved(*MI, CmpInstr, CmpValue, TRI, CCUseInstrs, 1855 IsInvertCC)) 1856 return false; 1857 // Make transformation 1858 CmpInstr.eraseFromParent(); 1859 if (IsInvertCC) { 1860 // Invert condition codes in CmpInstr CC users 1861 for (MachineInstr *CCUseInstr : CCUseInstrs) { 1862 int Idx = 
findCondCodeUseOperandIdxForBranchOrSelect(*CCUseInstr); 1863 assert(Idx >= 0 && "Unexpected instruction using CC."); 1864 MachineOperand &CCOperand = CCUseInstr->getOperand(Idx); 1865 AArch64CC::CondCode CCUse = AArch64CC::getInvertedCondCode( 1866 static_cast<AArch64CC::CondCode>(CCOperand.getImm())); 1867 CCOperand.setImm(CCUse); 1868 } 1869 } 1870 return true; 1871 } 1872 1873 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { 1874 if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD && 1875 MI.getOpcode() != AArch64::CATCHRET) 1876 return false; 1877 1878 MachineBasicBlock &MBB = *MI.getParent(); 1879 auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>(); 1880 auto TRI = Subtarget.getRegisterInfo(); 1881 DebugLoc DL = MI.getDebugLoc(); 1882 1883 if (MI.getOpcode() == AArch64::CATCHRET) { 1884 // Skip to the first instruction before the epilog. 1885 const TargetInstrInfo *TII = 1886 MBB.getParent()->getSubtarget().getInstrInfo(); 1887 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); 1888 auto MBBI = MachineBasicBlock::iterator(MI); 1889 MachineBasicBlock::iterator FirstEpilogSEH = std::prev(MBBI); 1890 while (FirstEpilogSEH->getFlag(MachineInstr::FrameDestroy) && 1891 FirstEpilogSEH != MBB.begin()) 1892 FirstEpilogSEH = std::prev(FirstEpilogSEH); 1893 if (FirstEpilogSEH != MBB.begin()) 1894 FirstEpilogSEH = std::next(FirstEpilogSEH); 1895 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADRP)) 1896 .addReg(AArch64::X0, RegState::Define) 1897 .addMBB(TargetMBB); 1898 BuildMI(MBB, FirstEpilogSEH, DL, TII->get(AArch64::ADDXri)) 1899 .addReg(AArch64::X0, RegState::Define) 1900 .addReg(AArch64::X0) 1901 .addMBB(TargetMBB) 1902 .addImm(0); 1903 return true; 1904 } 1905 1906 Register Reg = MI.getOperand(0).getReg(); 1907 Module &M = *MBB.getParent()->getFunction().getParent(); 1908 if (M.getStackProtectorGuard() == "sysreg") { 1909 const AArch64SysReg::SysReg *SrcReg = 1910 
AArch64SysReg::lookupSysRegByName(M.getStackProtectorGuardReg()); 1911 if (!SrcReg) 1912 report_fatal_error("Unknown SysReg for Stack Protector Guard Register"); 1913 1914 // mrs xN, sysreg 1915 BuildMI(MBB, MI, DL, get(AArch64::MRS)) 1916 .addDef(Reg, RegState::Renamable) 1917 .addImm(SrcReg->Encoding); 1918 int Offset = M.getStackProtectorGuardOffset(); 1919 if (Offset >= 0 && Offset <= 32760 && Offset % 8 == 0) { 1920 // ldr xN, [xN, #offset] 1921 BuildMI(MBB, MI, DL, get(AArch64::LDRXui)) 1922 .addDef(Reg) 1923 .addUse(Reg, RegState::Kill) 1924 .addImm(Offset / 8); 1925 } else if (Offset >= -256 && Offset <= 255) { 1926 // ldur xN, [xN, #offset] 1927 BuildMI(MBB, MI, DL, get(AArch64::LDURXi)) 1928 .addDef(Reg) 1929 .addUse(Reg, RegState::Kill) 1930 .addImm(Offset); 1931 } else if (Offset >= -4095 && Offset <= 4095) { 1932 if (Offset > 0) { 1933 // add xN, xN, #offset 1934 BuildMI(MBB, MI, DL, get(AArch64::ADDXri)) 1935 .addDef(Reg) 1936 .addUse(Reg, RegState::Kill) 1937 .addImm(Offset) 1938 .addImm(0); 1939 } else { 1940 // sub xN, xN, #offset 1941 BuildMI(MBB, MI, DL, get(AArch64::SUBXri)) 1942 .addDef(Reg) 1943 .addUse(Reg, RegState::Kill) 1944 .addImm(-Offset) 1945 .addImm(0); 1946 } 1947 // ldr xN, [xN] 1948 BuildMI(MBB, MI, DL, get(AArch64::LDRXui)) 1949 .addDef(Reg) 1950 .addUse(Reg, RegState::Kill) 1951 .addImm(0); 1952 } else { 1953 // Cases that are larger than +/- 4095 and not a multiple of 8, or larger 1954 // than 23760. 1955 // It might be nice to use AArch64::MOVi32imm here, which would get 1956 // expanded in PreSched2 after PostRA, but our lone scratch Reg already 1957 // contains the MRS result. findScratchNonCalleeSaveRegister() in 1958 // AArch64FrameLowering might help us find such a scratch register 1959 // though. If we failed to find a scratch register, we could emit a 1960 // stream of add instructions to build up the immediate. 
Or, we could try 1961 // to insert a AArch64::MOVi32imm before register allocation so that we 1962 // didn't need to scavenge for a scratch register. 1963 report_fatal_error("Unable to encode Stack Protector Guard Offset"); 1964 } 1965 MBB.erase(MI); 1966 return true; 1967 } 1968 1969 const GlobalValue *GV = 1970 cast<GlobalValue>((*MI.memoperands_begin())->getValue()); 1971 const TargetMachine &TM = MBB.getParent()->getTarget(); 1972 unsigned OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); 1973 const unsigned char MO_NC = AArch64II::MO_NC; 1974 1975 if ((OpFlags & AArch64II::MO_GOT) != 0) { 1976 BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) 1977 .addGlobalAddress(GV, 0, OpFlags); 1978 if (Subtarget.isTargetILP32()) { 1979 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 1980 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 1981 .addDef(Reg32, RegState::Dead) 1982 .addUse(Reg, RegState::Kill) 1983 .addImm(0) 1984 .addMemOperand(*MI.memoperands_begin()) 1985 .addDef(Reg, RegState::Implicit); 1986 } else { 1987 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 1988 .addReg(Reg, RegState::Kill) 1989 .addImm(0) 1990 .addMemOperand(*MI.memoperands_begin()); 1991 } 1992 } else if (TM.getCodeModel() == CodeModel::Large) { 1993 assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); 1994 BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) 1995 .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) 1996 .addImm(0); 1997 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 1998 .addReg(Reg, RegState::Kill) 1999 .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC) 2000 .addImm(16); 2001 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2002 .addReg(Reg, RegState::Kill) 2003 .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC) 2004 .addImm(32); 2005 BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg) 2006 .addReg(Reg, RegState::Kill) 2007 .addGlobalAddress(GV, 0, AArch64II::MO_G3) 2008 .addImm(48); 2009 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2010 .addReg(Reg, 
RegState::Kill) 2011 .addImm(0) 2012 .addMemOperand(*MI.memoperands_begin()); 2013 } else if (TM.getCodeModel() == CodeModel::Tiny) { 2014 BuildMI(MBB, MI, DL, get(AArch64::ADR), Reg) 2015 .addGlobalAddress(GV, 0, OpFlags); 2016 } else { 2017 BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) 2018 .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); 2019 unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; 2020 if (Subtarget.isTargetILP32()) { 2021 unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); 2022 BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) 2023 .addDef(Reg32, RegState::Dead) 2024 .addUse(Reg, RegState::Kill) 2025 .addGlobalAddress(GV, 0, LoFlags) 2026 .addMemOperand(*MI.memoperands_begin()) 2027 .addDef(Reg, RegState::Implicit); 2028 } else { 2029 BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) 2030 .addReg(Reg, RegState::Kill) 2031 .addGlobalAddress(GV, 0, LoFlags) 2032 .addMemOperand(*MI.memoperands_begin()); 2033 } 2034 } 2035 2036 MBB.erase(MI); 2037 2038 return true; 2039 } 2040 2041 // Return true if this instruction simply sets its single destination register 2042 // to zero. This is equivalent to a register rename of the zero-register. 
2043 bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) { 2044 switch (MI.getOpcode()) { 2045 default: 2046 break; 2047 case AArch64::MOVZWi: 2048 case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) 2049 if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { 2050 assert(MI.getDesc().getNumOperands() == 3 && 2051 MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); 2052 return true; 2053 } 2054 break; 2055 case AArch64::ANDWri: // and Rd, Rzr, #imm 2056 return MI.getOperand(1).getReg() == AArch64::WZR; 2057 case AArch64::ANDXri: 2058 return MI.getOperand(1).getReg() == AArch64::XZR; 2059 case TargetOpcode::COPY: 2060 return MI.getOperand(1).getReg() == AArch64::WZR; 2061 } 2062 return false; 2063 } 2064 2065 // Return true if this instruction simply renames a general register without 2066 // modifying bits. 2067 bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) { 2068 switch (MI.getOpcode()) { 2069 default: 2070 break; 2071 case TargetOpcode::COPY: { 2072 // GPR32 copies will by lowered to ORRXrs 2073 Register DstReg = MI.getOperand(0).getReg(); 2074 return (AArch64::GPR32RegClass.contains(DstReg) || 2075 AArch64::GPR64RegClass.contains(DstReg)); 2076 } 2077 case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) 2078 if (MI.getOperand(1).getReg() == AArch64::XZR) { 2079 assert(MI.getDesc().getNumOperands() == 4 && 2080 MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); 2081 return true; 2082 } 2083 break; 2084 case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) 2085 if (MI.getOperand(2).getImm() == 0) { 2086 assert(MI.getDesc().getNumOperands() == 4 && 2087 MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); 2088 return true; 2089 } 2090 break; 2091 } 2092 return false; 2093 } 2094 2095 // Return true if this instruction simply renames a general register without 2096 // modifying bits. 
2097 bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) { 2098 switch (MI.getOpcode()) { 2099 default: 2100 break; 2101 case TargetOpcode::COPY: { 2102 // FPR64 copies will by lowered to ORR.16b 2103 Register DstReg = MI.getOperand(0).getReg(); 2104 return (AArch64::FPR64RegClass.contains(DstReg) || 2105 AArch64::FPR128RegClass.contains(DstReg)); 2106 } 2107 case AArch64::ORRv16i8: 2108 if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { 2109 assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && 2110 "invalid ORRv16i8 operands"); 2111 return true; 2112 } 2113 break; 2114 } 2115 return false; 2116 } 2117 2118 unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, 2119 int &FrameIndex) const { 2120 switch (MI.getOpcode()) { 2121 default: 2122 break; 2123 case AArch64::LDRWui: 2124 case AArch64::LDRXui: 2125 case AArch64::LDRBui: 2126 case AArch64::LDRHui: 2127 case AArch64::LDRSui: 2128 case AArch64::LDRDui: 2129 case AArch64::LDRQui: 2130 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2131 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2132 FrameIndex = MI.getOperand(1).getIndex(); 2133 return MI.getOperand(0).getReg(); 2134 } 2135 break; 2136 } 2137 2138 return 0; 2139 } 2140 2141 unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, 2142 int &FrameIndex) const { 2143 switch (MI.getOpcode()) { 2144 default: 2145 break; 2146 case AArch64::STRWui: 2147 case AArch64::STRXui: 2148 case AArch64::STRBui: 2149 case AArch64::STRHui: 2150 case AArch64::STRSui: 2151 case AArch64::STRDui: 2152 case AArch64::STRQui: 2153 case AArch64::LDR_PXI: 2154 case AArch64::STR_PXI: 2155 if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && 2156 MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { 2157 FrameIndex = MI.getOperand(1).getIndex(); 2158 return MI.getOperand(0).getReg(); 2159 } 2160 break; 2161 } 2162 return 0; 2163 } 2164 2165 /// Check all 
MachineMemOperands for a hint to suppress pairing. 2166 bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) { 2167 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2168 return MMO->getFlags() & MOSuppressPair; 2169 }); 2170 } 2171 2172 /// Set a flag on the first MachineMemOperand to suppress pairing. 2173 void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) { 2174 if (MI.memoperands_empty()) 2175 return; 2176 (*MI.memoperands_begin())->setFlags(MOSuppressPair); 2177 } 2178 2179 /// Check all MachineMemOperands for a hint that the load/store is strided. 2180 bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) { 2181 return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { 2182 return MMO->getFlags() & MOStridedAccess; 2183 }); 2184 } 2185 2186 bool AArch64InstrInfo::hasUnscaledLdStOffset(unsigned Opc) { 2187 switch (Opc) { 2188 default: 2189 return false; 2190 case AArch64::STURSi: 2191 case AArch64::STRSpre: 2192 case AArch64::STURDi: 2193 case AArch64::STRDpre: 2194 case AArch64::STURQi: 2195 case AArch64::STRQpre: 2196 case AArch64::STURBBi: 2197 case AArch64::STURHHi: 2198 case AArch64::STURWi: 2199 case AArch64::STRWpre: 2200 case AArch64::STURXi: 2201 case AArch64::STRXpre: 2202 case AArch64::LDURSi: 2203 case AArch64::LDRSpre: 2204 case AArch64::LDURDi: 2205 case AArch64::LDRDpre: 2206 case AArch64::LDURQi: 2207 case AArch64::LDRQpre: 2208 case AArch64::LDURWi: 2209 case AArch64::LDRWpre: 2210 case AArch64::LDURXi: 2211 case AArch64::LDRXpre: 2212 case AArch64::LDURSWi: 2213 case AArch64::LDURHHi: 2214 case AArch64::LDURBBi: 2215 case AArch64::LDURSBWi: 2216 case AArch64::LDURSHWi: 2217 return true; 2218 } 2219 } 2220 2221 Optional<unsigned> AArch64InstrInfo::getUnscaledLdSt(unsigned Opc) { 2222 switch (Opc) { 2223 default: return {}; 2224 case AArch64::PRFMui: return AArch64::PRFUMi; 2225 case AArch64::LDRXui: return AArch64::LDURXi; 2226 case AArch64::LDRWui: return AArch64::LDURWi; 2227 case 
AArch64::LDRBui: return AArch64::LDURBi; 2228 case AArch64::LDRHui: return AArch64::LDURHi; 2229 case AArch64::LDRSui: return AArch64::LDURSi; 2230 case AArch64::LDRDui: return AArch64::LDURDi; 2231 case AArch64::LDRQui: return AArch64::LDURQi; 2232 case AArch64::LDRBBui: return AArch64::LDURBBi; 2233 case AArch64::LDRHHui: return AArch64::LDURHHi; 2234 case AArch64::LDRSBXui: return AArch64::LDURSBXi; 2235 case AArch64::LDRSBWui: return AArch64::LDURSBWi; 2236 case AArch64::LDRSHXui: return AArch64::LDURSHXi; 2237 case AArch64::LDRSHWui: return AArch64::LDURSHWi; 2238 case AArch64::LDRSWui: return AArch64::LDURSWi; 2239 case AArch64::STRXui: return AArch64::STURXi; 2240 case AArch64::STRWui: return AArch64::STURWi; 2241 case AArch64::STRBui: return AArch64::STURBi; 2242 case AArch64::STRHui: return AArch64::STURHi; 2243 case AArch64::STRSui: return AArch64::STURSi; 2244 case AArch64::STRDui: return AArch64::STURDi; 2245 case AArch64::STRQui: return AArch64::STURQi; 2246 case AArch64::STRBBui: return AArch64::STURBBi; 2247 case AArch64::STRHHui: return AArch64::STURHHi; 2248 } 2249 } 2250 2251 unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { 2252 switch (Opc) { 2253 default: 2254 return 2; 2255 case AArch64::LDPXi: 2256 case AArch64::LDPDi: 2257 case AArch64::STPXi: 2258 case AArch64::STPDi: 2259 case AArch64::LDNPXi: 2260 case AArch64::LDNPDi: 2261 case AArch64::STNPXi: 2262 case AArch64::STNPDi: 2263 case AArch64::LDPQi: 2264 case AArch64::STPQi: 2265 case AArch64::LDNPQi: 2266 case AArch64::STNPQi: 2267 case AArch64::LDPWi: 2268 case AArch64::LDPSi: 2269 case AArch64::STPWi: 2270 case AArch64::STPSi: 2271 case AArch64::LDNPWi: 2272 case AArch64::LDNPSi: 2273 case AArch64::STNPWi: 2274 case AArch64::STNPSi: 2275 case AArch64::LDG: 2276 case AArch64::STGPi: 2277 case AArch64::LD1B_IMM: 2278 case AArch64::LD1H_IMM: 2279 case AArch64::LD1W_IMM: 2280 case AArch64::LD1D_IMM: 2281 case AArch64::ST1B_IMM: 2282 case AArch64::ST1H_IMM: 2283 case 
AArch64::ST1W_IMM: 2284 case AArch64::ST1D_IMM: 2285 case AArch64::LD1B_H_IMM: 2286 case AArch64::LD1SB_H_IMM: 2287 case AArch64::LD1H_S_IMM: 2288 case AArch64::LD1SH_S_IMM: 2289 case AArch64::LD1W_D_IMM: 2290 case AArch64::LD1SW_D_IMM: 2291 case AArch64::ST1B_H_IMM: 2292 case AArch64::ST1H_S_IMM: 2293 case AArch64::ST1W_D_IMM: 2294 case AArch64::LD1B_S_IMM: 2295 case AArch64::LD1SB_S_IMM: 2296 case AArch64::LD1H_D_IMM: 2297 case AArch64::LD1SH_D_IMM: 2298 case AArch64::ST1B_S_IMM: 2299 case AArch64::ST1H_D_IMM: 2300 case AArch64::LD1B_D_IMM: 2301 case AArch64::LD1SB_D_IMM: 2302 case AArch64::ST1B_D_IMM: 2303 return 3; 2304 case AArch64::ADDG: 2305 case AArch64::STGOffset: 2306 case AArch64::LDR_PXI: 2307 case AArch64::STR_PXI: 2308 return 2; 2309 } 2310 } 2311 2312 bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) { 2313 switch (MI.getOpcode()) { 2314 default: 2315 return false; 2316 // Scaled instructions. 2317 case AArch64::STRSui: 2318 case AArch64::STRDui: 2319 case AArch64::STRQui: 2320 case AArch64::STRXui: 2321 case AArch64::STRWui: 2322 case AArch64::LDRSui: 2323 case AArch64::LDRDui: 2324 case AArch64::LDRQui: 2325 case AArch64::LDRXui: 2326 case AArch64::LDRWui: 2327 case AArch64::LDRSWui: 2328 // Unscaled instructions. 
2329 case AArch64::STURSi: 2330 case AArch64::STRSpre: 2331 case AArch64::STURDi: 2332 case AArch64::STRDpre: 2333 case AArch64::STURQi: 2334 case AArch64::STRQpre: 2335 case AArch64::STURWi: 2336 case AArch64::STRWpre: 2337 case AArch64::STURXi: 2338 case AArch64::STRXpre: 2339 case AArch64::LDURSi: 2340 case AArch64::LDRSpre: 2341 case AArch64::LDURDi: 2342 case AArch64::LDRDpre: 2343 case AArch64::LDURQi: 2344 case AArch64::LDRQpre: 2345 case AArch64::LDURWi: 2346 case AArch64::LDRWpre: 2347 case AArch64::LDURXi: 2348 case AArch64::LDRXpre: 2349 case AArch64::LDURSWi: 2350 return true; 2351 } 2352 } 2353 2354 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc, 2355 bool &Is64Bit) { 2356 switch (Opc) { 2357 default: 2358 llvm_unreachable("Opcode has no flag setting equivalent!"); 2359 // 32-bit cases: 2360 case AArch64::ADDWri: 2361 Is64Bit = false; 2362 return AArch64::ADDSWri; 2363 case AArch64::ADDWrr: 2364 Is64Bit = false; 2365 return AArch64::ADDSWrr; 2366 case AArch64::ADDWrs: 2367 Is64Bit = false; 2368 return AArch64::ADDSWrs; 2369 case AArch64::ADDWrx: 2370 Is64Bit = false; 2371 return AArch64::ADDSWrx; 2372 case AArch64::ANDWri: 2373 Is64Bit = false; 2374 return AArch64::ANDSWri; 2375 case AArch64::ANDWrr: 2376 Is64Bit = false; 2377 return AArch64::ANDSWrr; 2378 case AArch64::ANDWrs: 2379 Is64Bit = false; 2380 return AArch64::ANDSWrs; 2381 case AArch64::BICWrr: 2382 Is64Bit = false; 2383 return AArch64::BICSWrr; 2384 case AArch64::BICWrs: 2385 Is64Bit = false; 2386 return AArch64::BICSWrs; 2387 case AArch64::SUBWri: 2388 Is64Bit = false; 2389 return AArch64::SUBSWri; 2390 case AArch64::SUBWrr: 2391 Is64Bit = false; 2392 return AArch64::SUBSWrr; 2393 case AArch64::SUBWrs: 2394 Is64Bit = false; 2395 return AArch64::SUBSWrs; 2396 case AArch64::SUBWrx: 2397 Is64Bit = false; 2398 return AArch64::SUBSWrx; 2399 // 64-bit cases: 2400 case AArch64::ADDXri: 2401 Is64Bit = true; 2402 return AArch64::ADDSXri; 2403 case AArch64::ADDXrr: 2404 Is64Bit = 
true; 2405 return AArch64::ADDSXrr; 2406 case AArch64::ADDXrs: 2407 Is64Bit = true; 2408 return AArch64::ADDSXrs; 2409 case AArch64::ADDXrx: 2410 Is64Bit = true; 2411 return AArch64::ADDSXrx; 2412 case AArch64::ANDXri: 2413 Is64Bit = true; 2414 return AArch64::ANDSXri; 2415 case AArch64::ANDXrr: 2416 Is64Bit = true; 2417 return AArch64::ANDSXrr; 2418 case AArch64::ANDXrs: 2419 Is64Bit = true; 2420 return AArch64::ANDSXrs; 2421 case AArch64::BICXrr: 2422 Is64Bit = true; 2423 return AArch64::BICSXrr; 2424 case AArch64::BICXrs: 2425 Is64Bit = true; 2426 return AArch64::BICSXrs; 2427 case AArch64::SUBXri: 2428 Is64Bit = true; 2429 return AArch64::SUBSXri; 2430 case AArch64::SUBXrr: 2431 Is64Bit = true; 2432 return AArch64::SUBSXrr; 2433 case AArch64::SUBXrs: 2434 Is64Bit = true; 2435 return AArch64::SUBSXrs; 2436 case AArch64::SUBXrx: 2437 Is64Bit = true; 2438 return AArch64::SUBSXrx; 2439 } 2440 } 2441 2442 // Is this a candidate for ld/st merging or pairing? For example, we don't 2443 // touch volatiles or load/stores that have a hint to avoid pair formation. 2444 bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const { 2445 2446 bool IsPreLdSt = isPreLdSt(MI); 2447 2448 // If this is a volatile load/store, don't mess with it. 2449 if (MI.hasOrderedMemoryRef()) 2450 return false; 2451 2452 // Make sure this is a reg/fi+imm (as opposed to an address reloc). 2453 // For Pre-inc LD/ST, the operand is shifted by one. 2454 assert((MI.getOperand(IsPreLdSt ? 2 : 1).isReg() || 2455 MI.getOperand(IsPreLdSt ? 2 : 1).isFI()) && 2456 "Expected a reg or frame index operand."); 2457 2458 // For Pre-indexed addressing quadword instructions, the third operand is the 2459 // immediate value. 2460 bool IsImmPreLdSt = IsPreLdSt && MI.getOperand(3).isImm(); 2461 2462 if (!MI.getOperand(2).isImm() && !IsImmPreLdSt) 2463 return false; 2464 2465 // Can't merge/pair if the instruction modifies the base register. 
2466 // e.g., ldr x0, [x0] 2467 // This case will never occur with an FI base. 2468 // However, if the instruction is an LDR/STR<S,D,Q,W,X>pre, it can be merged. 2469 // For example: 2470 // ldr q0, [x11, #32]! 2471 // ldr q1, [x11, #16] 2472 // to 2473 // ldp q0, q1, [x11, #32]! 2474 if (MI.getOperand(1).isReg() && !IsPreLdSt) { 2475 Register BaseReg = MI.getOperand(1).getReg(); 2476 const TargetRegisterInfo *TRI = &getRegisterInfo(); 2477 if (MI.modifiesRegister(BaseReg, TRI)) 2478 return false; 2479 } 2480 2481 // Check if this load/store has a hint to avoid pair formation. 2482 // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. 2483 if (isLdStPairSuppressed(MI)) 2484 return false; 2485 2486 // Do not pair any callee-save store/reload instructions in the 2487 // prologue/epilogue if the CFI information encoded the operations as separate 2488 // instructions, as that will cause the size of the actual prologue to mismatch 2489 // with the prologue size recorded in the Windows CFI. 2490 const MCAsmInfo *MAI = MI.getMF()->getTarget().getMCAsmInfo(); 2491 bool NeedsWinCFI = MAI->usesWindowsCFI() && 2492 MI.getMF()->getFunction().needsUnwindTableEntry(); 2493 if (NeedsWinCFI && (MI.getFlag(MachineInstr::FrameSetup) || 2494 MI.getFlag(MachineInstr::FrameDestroy))) 2495 return false; 2496 2497 // On some CPUs quad load/store pairs are slower than two single load/stores. 
2498 if (Subtarget.isPaired128Slow()) { 2499 switch (MI.getOpcode()) { 2500 default: 2501 break; 2502 case AArch64::LDURQi: 2503 case AArch64::STURQi: 2504 case AArch64::LDRQui: 2505 case AArch64::STRQui: 2506 return false; 2507 } 2508 } 2509 2510 return true; 2511 } 2512 2513 bool AArch64InstrInfo::getMemOperandsWithOffsetWidth( 2514 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, 2515 int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, 2516 const TargetRegisterInfo *TRI) const { 2517 if (!LdSt.mayLoadOrStore()) 2518 return false; 2519 2520 const MachineOperand *BaseOp; 2521 if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable, 2522 Width, TRI)) 2523 return false; 2524 BaseOps.push_back(BaseOp); 2525 return true; 2526 } 2527 2528 Optional<ExtAddrMode> 2529 AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI, 2530 const TargetRegisterInfo *TRI) const { 2531 const MachineOperand *Base; // Filled with the base operand of MI. 2532 int64_t Offset; // Filled with the offset of MI. 2533 bool OffsetIsScalable; 2534 if (!getMemOperandWithOffset(MemI, Base, Offset, OffsetIsScalable, TRI)) 2535 return None; 2536 2537 if (!Base->isReg()) 2538 return None; 2539 ExtAddrMode AM; 2540 AM.BaseReg = Base->getReg(); 2541 AM.Displacement = Offset; 2542 AM.ScaledReg = 0; 2543 return AM; 2544 } 2545 2546 bool AArch64InstrInfo::getMemOperandWithOffsetWidth( 2547 const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, 2548 bool &OffsetIsScalable, unsigned &Width, 2549 const TargetRegisterInfo *TRI) const { 2550 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2551 // Handle only loads/stores with base register followed by immediate offset. 2552 if (LdSt.getNumExplicitOperands() == 3) { 2553 // Non-paired instruction (e.g., ldr x1, [x0, #8]). 
2554 if ((!LdSt.getOperand(1).isReg() && !LdSt.getOperand(1).isFI()) || 2555 !LdSt.getOperand(2).isImm()) 2556 return false; 2557 } else if (LdSt.getNumExplicitOperands() == 4) { 2558 // Paired instruction (e.g., ldp x1, x2, [x0, #8]). 2559 if (!LdSt.getOperand(1).isReg() || 2560 (!LdSt.getOperand(2).isReg() && !LdSt.getOperand(2).isFI()) || 2561 !LdSt.getOperand(3).isImm()) 2562 return false; 2563 } else 2564 return false; 2565 2566 // Get the scaling factor for the instruction and set the width for the 2567 // instruction. 2568 TypeSize Scale(0U, false); 2569 int64_t Dummy1, Dummy2; 2570 2571 // If this returns false, then it's an instruction we don't want to handle. 2572 if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2)) 2573 return false; 2574 2575 // Compute the offset. Offset is calculated as the immediate operand 2576 // multiplied by the scaling factor. Unscaled instructions have scaling factor 2577 // set to 1. 2578 if (LdSt.getNumExplicitOperands() == 3) { 2579 BaseOp = &LdSt.getOperand(1); 2580 Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize(); 2581 } else { 2582 assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); 2583 BaseOp = &LdSt.getOperand(2); 2584 Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize(); 2585 } 2586 OffsetIsScalable = Scale.isScalable(); 2587 2588 if (!BaseOp->isReg() && !BaseOp->isFI()) 2589 return false; 2590 2591 return true; 2592 } 2593 2594 MachineOperand & 2595 AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const { 2596 assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); 2597 MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands() - 1); 2598 assert(OfsOp.isImm() && "Offset operand wasn't immediate."); 2599 return OfsOp; 2600 } 2601 2602 bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, 2603 unsigned &Width, int64_t &MinOffset, 2604 int64_t &MaxOffset) { 2605 const unsigned SVEMaxBytesPerVector = 
AArch64::SVEMaxBitsPerVector / 8; 2606 switch (Opcode) { 2607 // Not a memory operation or something we want to handle. 2608 default: 2609 Scale = TypeSize::Fixed(0); 2610 Width = 0; 2611 MinOffset = MaxOffset = 0; 2612 return false; 2613 case AArch64::STRWpost: 2614 case AArch64::LDRWpost: 2615 Width = 32; 2616 Scale = TypeSize::Fixed(4); 2617 MinOffset = -256; 2618 MaxOffset = 255; 2619 break; 2620 case AArch64::LDURQi: 2621 case AArch64::STURQi: 2622 Width = 16; 2623 Scale = TypeSize::Fixed(1); 2624 MinOffset = -256; 2625 MaxOffset = 255; 2626 break; 2627 case AArch64::PRFUMi: 2628 case AArch64::LDURXi: 2629 case AArch64::LDURDi: 2630 case AArch64::STURXi: 2631 case AArch64::STURDi: 2632 Width = 8; 2633 Scale = TypeSize::Fixed(1); 2634 MinOffset = -256; 2635 MaxOffset = 255; 2636 break; 2637 case AArch64::LDURWi: 2638 case AArch64::LDURSi: 2639 case AArch64::LDURSWi: 2640 case AArch64::STURWi: 2641 case AArch64::STURSi: 2642 Width = 4; 2643 Scale = TypeSize::Fixed(1); 2644 MinOffset = -256; 2645 MaxOffset = 255; 2646 break; 2647 case AArch64::LDURHi: 2648 case AArch64::LDURHHi: 2649 case AArch64::LDURSHXi: 2650 case AArch64::LDURSHWi: 2651 case AArch64::STURHi: 2652 case AArch64::STURHHi: 2653 Width = 2; 2654 Scale = TypeSize::Fixed(1); 2655 MinOffset = -256; 2656 MaxOffset = 255; 2657 break; 2658 case AArch64::LDURBi: 2659 case AArch64::LDURBBi: 2660 case AArch64::LDURSBXi: 2661 case AArch64::LDURSBWi: 2662 case AArch64::STURBi: 2663 case AArch64::STURBBi: 2664 Width = 1; 2665 Scale = TypeSize::Fixed(1); 2666 MinOffset = -256; 2667 MaxOffset = 255; 2668 break; 2669 case AArch64::LDPQi: 2670 case AArch64::LDNPQi: 2671 case AArch64::STPQi: 2672 case AArch64::STNPQi: 2673 Scale = TypeSize::Fixed(16); 2674 Width = 32; 2675 MinOffset = -64; 2676 MaxOffset = 63; 2677 break; 2678 case AArch64::LDRQui: 2679 case AArch64::STRQui: 2680 Scale = TypeSize::Fixed(16); 2681 Width = 16; 2682 MinOffset = 0; 2683 MaxOffset = 4095; 2684 break; 2685 case AArch64::LDPXi: 2686 case 
AArch64::LDPDi: 2687 case AArch64::LDNPXi: 2688 case AArch64::LDNPDi: 2689 case AArch64::STPXi: 2690 case AArch64::STPDi: 2691 case AArch64::STNPXi: 2692 case AArch64::STNPDi: 2693 Scale = TypeSize::Fixed(8); 2694 Width = 16; 2695 MinOffset = -64; 2696 MaxOffset = 63; 2697 break; 2698 case AArch64::PRFMui: 2699 case AArch64::LDRXui: 2700 case AArch64::LDRDui: 2701 case AArch64::STRXui: 2702 case AArch64::STRDui: 2703 Scale = TypeSize::Fixed(8); 2704 Width = 8; 2705 MinOffset = 0; 2706 MaxOffset = 4095; 2707 break; 2708 case AArch64::StoreSwiftAsyncContext: 2709 // Store is an STRXui, but there might be an ADDXri in the expansion too. 2710 Scale = TypeSize::Fixed(1); 2711 Width = 8; 2712 MinOffset = 0; 2713 MaxOffset = 4095; 2714 break; 2715 case AArch64::LDPWi: 2716 case AArch64::LDPSi: 2717 case AArch64::LDNPWi: 2718 case AArch64::LDNPSi: 2719 case AArch64::STPWi: 2720 case AArch64::STPSi: 2721 case AArch64::STNPWi: 2722 case AArch64::STNPSi: 2723 Scale = TypeSize::Fixed(4); 2724 Width = 8; 2725 MinOffset = -64; 2726 MaxOffset = 63; 2727 break; 2728 case AArch64::LDRWui: 2729 case AArch64::LDRSui: 2730 case AArch64::LDRSWui: 2731 case AArch64::STRWui: 2732 case AArch64::STRSui: 2733 Scale = TypeSize::Fixed(4); 2734 Width = 4; 2735 MinOffset = 0; 2736 MaxOffset = 4095; 2737 break; 2738 case AArch64::LDRHui: 2739 case AArch64::LDRHHui: 2740 case AArch64::LDRSHWui: 2741 case AArch64::LDRSHXui: 2742 case AArch64::STRHui: 2743 case AArch64::STRHHui: 2744 Scale = TypeSize::Fixed(2); 2745 Width = 2; 2746 MinOffset = 0; 2747 MaxOffset = 4095; 2748 break; 2749 case AArch64::LDRBui: 2750 case AArch64::LDRBBui: 2751 case AArch64::LDRSBWui: 2752 case AArch64::LDRSBXui: 2753 case AArch64::STRBui: 2754 case AArch64::STRBBui: 2755 Scale = TypeSize::Fixed(1); 2756 Width = 1; 2757 MinOffset = 0; 2758 MaxOffset = 4095; 2759 break; 2760 case AArch64::STPXpre: 2761 case AArch64::LDPXpost: 2762 case AArch64::STPDpre: 2763 case AArch64::LDPDpost: 2764 Scale = TypeSize::Fixed(8); 2765 
Width = 8; 2766 MinOffset = -512; 2767 MaxOffset = 504; 2768 break; 2769 case AArch64::STPQpre: 2770 case AArch64::LDPQpost: 2771 Scale = TypeSize::Fixed(16); 2772 Width = 16; 2773 MinOffset = -1024; 2774 MaxOffset = 1008; 2775 break; 2776 case AArch64::STRXpre: 2777 case AArch64::STRDpre: 2778 case AArch64::LDRXpost: 2779 case AArch64::LDRDpost: 2780 Scale = TypeSize::Fixed(1); 2781 Width = 8; 2782 MinOffset = -256; 2783 MaxOffset = 255; 2784 break; 2785 case AArch64::STRQpre: 2786 case AArch64::LDRQpost: 2787 Scale = TypeSize::Fixed(1); 2788 Width = 16; 2789 MinOffset = -256; 2790 MaxOffset = 255; 2791 break; 2792 case AArch64::ADDG: 2793 Scale = TypeSize::Fixed(16); 2794 Width = 0; 2795 MinOffset = 0; 2796 MaxOffset = 63; 2797 break; 2798 case AArch64::TAGPstack: 2799 Scale = TypeSize::Fixed(16); 2800 Width = 0; 2801 // TAGP with a negative offset turns into SUBP, which has a maximum offset 2802 // of 63 (not 64!). 2803 MinOffset = -63; 2804 MaxOffset = 63; 2805 break; 2806 case AArch64::LDG: 2807 case AArch64::STGOffset: 2808 case AArch64::STZGOffset: 2809 Scale = TypeSize::Fixed(16); 2810 Width = 16; 2811 MinOffset = -256; 2812 MaxOffset = 255; 2813 break; 2814 case AArch64::STR_ZZZZXI: 2815 case AArch64::LDR_ZZZZXI: 2816 Scale = TypeSize::Scalable(16); 2817 Width = SVEMaxBytesPerVector * 4; 2818 MinOffset = -256; 2819 MaxOffset = 252; 2820 break; 2821 case AArch64::STR_ZZZXI: 2822 case AArch64::LDR_ZZZXI: 2823 Scale = TypeSize::Scalable(16); 2824 Width = SVEMaxBytesPerVector * 3; 2825 MinOffset = -256; 2826 MaxOffset = 253; 2827 break; 2828 case AArch64::STR_ZZXI: 2829 case AArch64::LDR_ZZXI: 2830 Scale = TypeSize::Scalable(16); 2831 Width = SVEMaxBytesPerVector * 2; 2832 MinOffset = -256; 2833 MaxOffset = 254; 2834 break; 2835 case AArch64::LDR_PXI: 2836 case AArch64::STR_PXI: 2837 Scale = TypeSize::Scalable(2); 2838 Width = SVEMaxBytesPerVector / 8; 2839 MinOffset = -256; 2840 MaxOffset = 255; 2841 break; 2842 case AArch64::LDR_ZXI: 2843 case 
AArch64::STR_ZXI: 2844 Scale = TypeSize::Scalable(16); 2845 Width = SVEMaxBytesPerVector; 2846 MinOffset = -256; 2847 MaxOffset = 255; 2848 break; 2849 case AArch64::LD1B_IMM: 2850 case AArch64::LD1H_IMM: 2851 case AArch64::LD1W_IMM: 2852 case AArch64::LD1D_IMM: 2853 case AArch64::ST1B_IMM: 2854 case AArch64::ST1H_IMM: 2855 case AArch64::ST1W_IMM: 2856 case AArch64::ST1D_IMM: 2857 // A full vectors worth of data 2858 // Width = mbytes * elements 2859 Scale = TypeSize::Scalable(16); 2860 Width = SVEMaxBytesPerVector; 2861 MinOffset = -8; 2862 MaxOffset = 7; 2863 break; 2864 case AArch64::LD1B_H_IMM: 2865 case AArch64::LD1SB_H_IMM: 2866 case AArch64::LD1H_S_IMM: 2867 case AArch64::LD1SH_S_IMM: 2868 case AArch64::LD1W_D_IMM: 2869 case AArch64::LD1SW_D_IMM: 2870 case AArch64::ST1B_H_IMM: 2871 case AArch64::ST1H_S_IMM: 2872 case AArch64::ST1W_D_IMM: 2873 // A half vector worth of data 2874 // Width = mbytes * elements 2875 Scale = TypeSize::Scalable(8); 2876 Width = SVEMaxBytesPerVector / 2; 2877 MinOffset = -8; 2878 MaxOffset = 7; 2879 break; 2880 case AArch64::LD1B_S_IMM: 2881 case AArch64::LD1SB_S_IMM: 2882 case AArch64::LD1H_D_IMM: 2883 case AArch64::LD1SH_D_IMM: 2884 case AArch64::ST1B_S_IMM: 2885 case AArch64::ST1H_D_IMM: 2886 // A quarter vector worth of data 2887 // Width = mbytes * elements 2888 Scale = TypeSize::Scalable(4); 2889 Width = SVEMaxBytesPerVector / 4; 2890 MinOffset = -8; 2891 MaxOffset = 7; 2892 break; 2893 case AArch64::LD1B_D_IMM: 2894 case AArch64::LD1SB_D_IMM: 2895 case AArch64::ST1B_D_IMM: 2896 // A eighth vector worth of data 2897 // Width = mbytes * elements 2898 Scale = TypeSize::Scalable(2); 2899 Width = SVEMaxBytesPerVector / 8; 2900 MinOffset = -8; 2901 MaxOffset = 7; 2902 break; 2903 case AArch64::ST2GOffset: 2904 case AArch64::STZ2GOffset: 2905 Scale = TypeSize::Fixed(16); 2906 Width = 32; 2907 MinOffset = -256; 2908 MaxOffset = 255; 2909 break; 2910 case AArch64::STGPi: 2911 Scale = TypeSize::Fixed(16); 2912 Width = 16; 2913 
MinOffset = -64; 2914 MaxOffset = 63; 2915 break; 2916 } 2917 2918 return true; 2919 } 2920 2921 // Scaling factor for unscaled load or store. 2922 int AArch64InstrInfo::getMemScale(unsigned Opc) { 2923 switch (Opc) { 2924 default: 2925 llvm_unreachable("Opcode has unknown scale!"); 2926 case AArch64::LDRBBui: 2927 case AArch64::LDURBBi: 2928 case AArch64::LDRSBWui: 2929 case AArch64::LDURSBWi: 2930 case AArch64::STRBBui: 2931 case AArch64::STURBBi: 2932 return 1; 2933 case AArch64::LDRHHui: 2934 case AArch64::LDURHHi: 2935 case AArch64::LDRSHWui: 2936 case AArch64::LDURSHWi: 2937 case AArch64::STRHHui: 2938 case AArch64::STURHHi: 2939 return 2; 2940 case AArch64::LDRSui: 2941 case AArch64::LDURSi: 2942 case AArch64::LDRSpre: 2943 case AArch64::LDRSWui: 2944 case AArch64::LDURSWi: 2945 case AArch64::LDRWpre: 2946 case AArch64::LDRWui: 2947 case AArch64::LDURWi: 2948 case AArch64::STRSui: 2949 case AArch64::STURSi: 2950 case AArch64::STRSpre: 2951 case AArch64::STRWui: 2952 case AArch64::STURWi: 2953 case AArch64::STRWpre: 2954 case AArch64::LDPSi: 2955 case AArch64::LDPSWi: 2956 case AArch64::LDPWi: 2957 case AArch64::STPSi: 2958 case AArch64::STPWi: 2959 return 4; 2960 case AArch64::LDRDui: 2961 case AArch64::LDURDi: 2962 case AArch64::LDRDpre: 2963 case AArch64::LDRXui: 2964 case AArch64::LDURXi: 2965 case AArch64::LDRXpre: 2966 case AArch64::STRDui: 2967 case AArch64::STURDi: 2968 case AArch64::STRDpre: 2969 case AArch64::STRXui: 2970 case AArch64::STURXi: 2971 case AArch64::STRXpre: 2972 case AArch64::LDPDi: 2973 case AArch64::LDPXi: 2974 case AArch64::STPDi: 2975 case AArch64::STPXi: 2976 return 8; 2977 case AArch64::LDRQui: 2978 case AArch64::LDURQi: 2979 case AArch64::STRQui: 2980 case AArch64::STURQi: 2981 case AArch64::STRQpre: 2982 case AArch64::LDPQi: 2983 case AArch64::LDRQpre: 2984 case AArch64::STPQi: 2985 case AArch64::STGOffset: 2986 case AArch64::STZGOffset: 2987 case AArch64::ST2GOffset: 2988 case AArch64::STZ2GOffset: 2989 case AArch64::STGPi: 
2990 return 16; 2991 } 2992 } 2993 2994 bool AArch64InstrInfo::isPreLd(const MachineInstr &MI) { 2995 switch (MI.getOpcode()) { 2996 default: 2997 return false; 2998 case AArch64::LDRWpre: 2999 case AArch64::LDRXpre: 3000 case AArch64::LDRSpre: 3001 case AArch64::LDRDpre: 3002 case AArch64::LDRQpre: 3003 return true; 3004 } 3005 } 3006 3007 bool AArch64InstrInfo::isPreSt(const MachineInstr &MI) { 3008 switch (MI.getOpcode()) { 3009 default: 3010 return false; 3011 case AArch64::STRWpre: 3012 case AArch64::STRXpre: 3013 case AArch64::STRSpre: 3014 case AArch64::STRDpre: 3015 case AArch64::STRQpre: 3016 return true; 3017 } 3018 } 3019 3020 bool AArch64InstrInfo::isPreLdSt(const MachineInstr &MI) { 3021 return isPreLd(MI) || isPreSt(MI); 3022 } 3023 3024 // Scale the unscaled offsets. Returns false if the unscaled offset can't be 3025 // scaled. 3026 static bool scaleOffset(unsigned Opc, int64_t &Offset) { 3027 int Scale = AArch64InstrInfo::getMemScale(Opc); 3028 3029 // If the byte-offset isn't a multiple of the stride, we can't scale this 3030 // offset. 3031 if (Offset % Scale != 0) 3032 return false; 3033 3034 // Convert the byte-offset used by unscaled into an "element" offset used 3035 // by the scaled pair load/store instructions. 3036 Offset /= Scale; 3037 return true; 3038 } 3039 3040 static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { 3041 if (FirstOpc == SecondOpc) 3042 return true; 3043 // We can also pair sign-ext and zero-ext instructions. 3044 switch (FirstOpc) { 3045 default: 3046 return false; 3047 case AArch64::LDRWui: 3048 case AArch64::LDURWi: 3049 return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; 3050 case AArch64::LDRSWui: 3051 case AArch64::LDURSWi: 3052 return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; 3053 } 3054 // These instructions can't be paired based on their opcodes. 
3055 return false; 3056 } 3057 3058 static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, 3059 int64_t Offset1, unsigned Opcode1, int FI2, 3060 int64_t Offset2, unsigned Opcode2) { 3061 // Accesses through fixed stack object frame indices may access a different 3062 // fixed stack slot. Check that the object offsets + offsets match. 3063 if (MFI.isFixedObjectIndex(FI1) && MFI.isFixedObjectIndex(FI2)) { 3064 int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); 3065 int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); 3066 assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); 3067 // Convert to scaled object offsets. 3068 int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); 3069 if (ObjectOffset1 % Scale1 != 0) 3070 return false; 3071 ObjectOffset1 /= Scale1; 3072 int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); 3073 if (ObjectOffset2 % Scale2 != 0) 3074 return false; 3075 ObjectOffset2 /= Scale2; 3076 ObjectOffset1 += Offset1; 3077 ObjectOffset2 += Offset2; 3078 return ObjectOffset1 + 1 == ObjectOffset2; 3079 } 3080 3081 return FI1 == FI2; 3082 } 3083 3084 /// Detect opportunities for ldp/stp formation. 3085 /// 3086 /// Only called for LdSt for which getMemOperandWithOffset returns true. 3087 bool AArch64InstrInfo::shouldClusterMemOps( 3088 ArrayRef<const MachineOperand *> BaseOps1, 3089 ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads, 3090 unsigned NumBytes) const { 3091 assert(BaseOps1.size() == 1 && BaseOps2.size() == 1); 3092 const MachineOperand &BaseOp1 = *BaseOps1.front(); 3093 const MachineOperand &BaseOp2 = *BaseOps2.front(); 3094 const MachineInstr &FirstLdSt = *BaseOp1.getParent(); 3095 const MachineInstr &SecondLdSt = *BaseOp2.getParent(); 3096 if (BaseOp1.getType() != BaseOp2.getType()) 3097 return false; 3098 3099 assert((BaseOp1.isReg() || BaseOp1.isFI()) && 3100 "Only base registers and frame indices are supported."); 3101 3102 // Check for both base regs and base FI. 
3103 if (BaseOp1.isReg() && BaseOp1.getReg() != BaseOp2.getReg()) 3104 return false; 3105 3106 // Only cluster up to a single pair. 3107 if (NumLoads > 2) 3108 return false; 3109 3110 if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt)) 3111 return false; 3112 3113 // Can we pair these instructions based on their opcodes? 3114 unsigned FirstOpc = FirstLdSt.getOpcode(); 3115 unsigned SecondOpc = SecondLdSt.getOpcode(); 3116 if (!canPairLdStOpc(FirstOpc, SecondOpc)) 3117 return false; 3118 3119 // Can't merge volatiles or load/stores that have a hint to avoid pair 3120 // formation, for example. 3121 if (!isCandidateToMergeOrPair(FirstLdSt) || 3122 !isCandidateToMergeOrPair(SecondLdSt)) 3123 return false; 3124 3125 // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. 3126 int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); 3127 if (hasUnscaledLdStOffset(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) 3128 return false; 3129 3130 int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); 3131 if (hasUnscaledLdStOffset(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) 3132 return false; 3133 3134 // Pairwise instructions have a 7-bit signed offset field. 3135 if (Offset1 > 63 || Offset1 < -64) 3136 return false; 3137 3138 // The caller should already have ordered First/SecondLdSt by offset. 
3139 // Note: except for non-equal frame index bases 3140 if (BaseOp1.isFI()) { 3141 assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && 3142 "Caller should have ordered offsets."); 3143 3144 const MachineFrameInfo &MFI = 3145 FirstLdSt.getParent()->getParent()->getFrameInfo(); 3146 return shouldClusterFI(MFI, BaseOp1.getIndex(), Offset1, FirstOpc, 3147 BaseOp2.getIndex(), Offset2, SecondOpc); 3148 } 3149 3150 assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); 3151 3152 return Offset1 + 1 == Offset2; 3153 } 3154 3155 static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, 3156 unsigned Reg, unsigned SubIdx, 3157 unsigned State, 3158 const TargetRegisterInfo *TRI) { 3159 if (!SubIdx) 3160 return MIB.addReg(Reg, State); 3161 3162 if (Register::isPhysicalRegister(Reg)) 3163 return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); 3164 return MIB.addReg(Reg, State, SubIdx); 3165 } 3166 3167 static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, 3168 unsigned NumRegs) { 3169 // We really want the positive remainder mod 32 here, that happens to be 3170 // easily obtainable with a mask. 
3171 return ((DestReg - SrcReg) & 0x1f) < NumRegs; 3172 } 3173 3174 void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, 3175 MachineBasicBlock::iterator I, 3176 const DebugLoc &DL, MCRegister DestReg, 3177 MCRegister SrcReg, bool KillSrc, 3178 unsigned Opcode, 3179 ArrayRef<unsigned> Indices) const { 3180 assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); 3181 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3182 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3183 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3184 unsigned NumRegs = Indices.size(); 3185 3186 int SubReg = 0, End = NumRegs, Incr = 1; 3187 if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { 3188 SubReg = NumRegs - 1; 3189 End = -1; 3190 Incr = -1; 3191 } 3192 3193 for (; SubReg != End; SubReg += Incr) { 3194 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3195 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 3196 AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); 3197 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3198 } 3199 } 3200 3201 void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, 3202 MachineBasicBlock::iterator I, 3203 DebugLoc DL, unsigned DestReg, 3204 unsigned SrcReg, bool KillSrc, 3205 unsigned Opcode, unsigned ZeroReg, 3206 llvm::ArrayRef<unsigned> Indices) const { 3207 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3208 unsigned NumRegs = Indices.size(); 3209 3210 #ifndef NDEBUG 3211 uint16_t DestEncoding = TRI->getEncodingValue(DestReg); 3212 uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); 3213 assert(DestEncoding % NumRegs == 0 && SrcEncoding % NumRegs == 0 && 3214 "GPR reg sequences should not be able to overlap"); 3215 #endif 3216 3217 for (unsigned SubReg = 0; SubReg != NumRegs; ++SubReg) { 3218 const MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opcode)); 3219 AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); 
3220 MIB.addReg(ZeroReg); 3221 AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); 3222 MIB.addImm(0); 3223 } 3224 } 3225 3226 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, 3227 MachineBasicBlock::iterator I, 3228 const DebugLoc &DL, MCRegister DestReg, 3229 MCRegister SrcReg, bool KillSrc) const { 3230 if (AArch64::GPR32spRegClass.contains(DestReg) && 3231 (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { 3232 const TargetRegisterInfo *TRI = &getRegisterInfo(); 3233 3234 if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { 3235 // If either operand is WSP, expand to ADD #0. 3236 if (Subtarget.hasZeroCycleRegMove()) { 3237 // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 3238 MCRegister DestRegX = TRI->getMatchingSuperReg( 3239 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3240 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3241 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3242 // This instruction is reading and writing X registers. This may upset 3243 // the register scavenger and machine verifier, so we need to indicate 3244 // that we are reading an undefined value from SrcRegX, but a proper 3245 // value from SrcReg. 
3246 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) 3247 .addReg(SrcRegX, RegState::Undef) 3248 .addImm(0) 3249 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) 3250 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3251 } else { 3252 BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) 3253 .addReg(SrcReg, getKillRegState(KillSrc)) 3254 .addImm(0) 3255 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3256 } 3257 } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { 3258 BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) 3259 .addImm(0) 3260 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3261 } else { 3262 if (Subtarget.hasZeroCycleRegMove()) { 3263 // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. 3264 MCRegister DestRegX = TRI->getMatchingSuperReg( 3265 DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3266 MCRegister SrcRegX = TRI->getMatchingSuperReg( 3267 SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); 3268 // This instruction is reading and writing X registers. This may upset 3269 // the register scavenger and machine verifier, so we need to indicate 3270 // that we are reading an undefined value from SrcRegX, but a proper 3271 // value from SrcReg. 3272 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) 3273 .addReg(AArch64::XZR) 3274 .addReg(SrcRegX, RegState::Undef) 3275 .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); 3276 } else { 3277 // Otherwise, expand to ORR WZR. 3278 BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) 3279 .addReg(AArch64::WZR) 3280 .addReg(SrcReg, getKillRegState(KillSrc)); 3281 } 3282 } 3283 return; 3284 } 3285 3286 // Copy a Predicate register by ORRing with itself. 
3287 if (AArch64::PPRRegClass.contains(DestReg) && 3288 AArch64::PPRRegClass.contains(SrcReg)) { 3289 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 3290 BuildMI(MBB, I, DL, get(AArch64::ORR_PPzPP), DestReg) 3291 .addReg(SrcReg) // Pg 3292 .addReg(SrcReg) 3293 .addReg(SrcReg, getKillRegState(KillSrc)); 3294 return; 3295 } 3296 3297 // Copy a Z register by ORRing with itself. 3298 if (AArch64::ZPRRegClass.contains(DestReg) && 3299 AArch64::ZPRRegClass.contains(SrcReg)) { 3300 assert(Subtarget.hasSVE() && "Unexpected SVE register."); 3301 BuildMI(MBB, I, DL, get(AArch64::ORR_ZZZ), DestReg) 3302 .addReg(SrcReg) 3303 .addReg(SrcReg, getKillRegState(KillSrc)); 3304 return; 3305 } 3306 3307 // Copy a Z register pair by copying the individual sub-registers. 3308 if (AArch64::ZPR2RegClass.contains(DestReg) && 3309 AArch64::ZPR2RegClass.contains(SrcReg)) { 3310 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1}; 3311 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3312 Indices); 3313 return; 3314 } 3315 3316 // Copy a Z register triple by copying the individual sub-registers. 3317 if (AArch64::ZPR3RegClass.contains(DestReg) && 3318 AArch64::ZPR3RegClass.contains(SrcReg)) { 3319 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3320 AArch64::zsub2}; 3321 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3322 Indices); 3323 return; 3324 } 3325 3326 // Copy a Z register quad by copying the individual sub-registers. 
3327 if (AArch64::ZPR4RegClass.contains(DestReg) && 3328 AArch64::ZPR4RegClass.contains(SrcReg)) { 3329 static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1, 3330 AArch64::zsub2, AArch64::zsub3}; 3331 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ, 3332 Indices); 3333 return; 3334 } 3335 3336 if (AArch64::GPR64spRegClass.contains(DestReg) && 3337 (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { 3338 if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { 3339 // If either operand is SP, expand to ADD #0. 3340 BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) 3341 .addReg(SrcReg, getKillRegState(KillSrc)) 3342 .addImm(0) 3343 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3344 } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { 3345 BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) 3346 .addImm(0) 3347 .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); 3348 } else { 3349 // Otherwise, expand to ORR XZR. 3350 BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) 3351 .addReg(AArch64::XZR) 3352 .addReg(SrcReg, getKillRegState(KillSrc)); 3353 } 3354 return; 3355 } 3356 3357 // Copy a DDDD register quad by copying the individual sub-registers. 3358 if (AArch64::DDDDRegClass.contains(DestReg) && 3359 AArch64::DDDDRegClass.contains(SrcReg)) { 3360 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3361 AArch64::dsub2, AArch64::dsub3}; 3362 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3363 Indices); 3364 return; 3365 } 3366 3367 // Copy a DDD register triple by copying the individual sub-registers. 
3368 if (AArch64::DDDRegClass.contains(DestReg) && 3369 AArch64::DDDRegClass.contains(SrcReg)) { 3370 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1, 3371 AArch64::dsub2}; 3372 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3373 Indices); 3374 return; 3375 } 3376 3377 // Copy a DD register pair by copying the individual sub-registers. 3378 if (AArch64::DDRegClass.contains(DestReg) && 3379 AArch64::DDRegClass.contains(SrcReg)) { 3380 static const unsigned Indices[] = {AArch64::dsub0, AArch64::dsub1}; 3381 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, 3382 Indices); 3383 return; 3384 } 3385 3386 // Copy a QQQQ register quad by copying the individual sub-registers. 3387 if (AArch64::QQQQRegClass.contains(DestReg) && 3388 AArch64::QQQQRegClass.contains(SrcReg)) { 3389 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3390 AArch64::qsub2, AArch64::qsub3}; 3391 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3392 Indices); 3393 return; 3394 } 3395 3396 // Copy a QQQ register triple by copying the individual sub-registers. 3397 if (AArch64::QQQRegClass.contains(DestReg) && 3398 AArch64::QQQRegClass.contains(SrcReg)) { 3399 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1, 3400 AArch64::qsub2}; 3401 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3402 Indices); 3403 return; 3404 } 3405 3406 // Copy a QQ register pair by copying the individual sub-registers. 
3407 if (AArch64::QQRegClass.contains(DestReg) && 3408 AArch64::QQRegClass.contains(SrcReg)) { 3409 static const unsigned Indices[] = {AArch64::qsub0, AArch64::qsub1}; 3410 copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, 3411 Indices); 3412 return; 3413 } 3414 3415 if (AArch64::XSeqPairsClassRegClass.contains(DestReg) && 3416 AArch64::XSeqPairsClassRegClass.contains(SrcReg)) { 3417 static const unsigned Indices[] = {AArch64::sube64, AArch64::subo64}; 3418 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRXrs, 3419 AArch64::XZR, Indices); 3420 return; 3421 } 3422 3423 if (AArch64::WSeqPairsClassRegClass.contains(DestReg) && 3424 AArch64::WSeqPairsClassRegClass.contains(SrcReg)) { 3425 static const unsigned Indices[] = {AArch64::sube32, AArch64::subo32}; 3426 copyGPRRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRWrs, 3427 AArch64::WZR, Indices); 3428 return; 3429 } 3430 3431 if (AArch64::FPR128RegClass.contains(DestReg) && 3432 AArch64::FPR128RegClass.contains(SrcReg)) { 3433 if (Subtarget.hasNEON()) { 3434 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3435 .addReg(SrcReg) 3436 .addReg(SrcReg, getKillRegState(KillSrc)); 3437 } else { 3438 BuildMI(MBB, I, DL, get(AArch64::STRQpre)) 3439 .addReg(AArch64::SP, RegState::Define) 3440 .addReg(SrcReg, getKillRegState(KillSrc)) 3441 .addReg(AArch64::SP) 3442 .addImm(-16); 3443 BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) 3444 .addReg(AArch64::SP, RegState::Define) 3445 .addReg(DestReg, RegState::Define) 3446 .addReg(AArch64::SP) 3447 .addImm(16); 3448 } 3449 return; 3450 } 3451 3452 if (AArch64::FPR64RegClass.contains(DestReg) && 3453 AArch64::FPR64RegClass.contains(SrcReg)) { 3454 if (Subtarget.hasNEON()) { 3455 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, 3456 &AArch64::FPR128RegClass); 3457 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, 3458 &AArch64::FPR128RegClass); 3459 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3460 
.addReg(SrcReg) 3461 .addReg(SrcReg, getKillRegState(KillSrc)); 3462 } else { 3463 BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) 3464 .addReg(SrcReg, getKillRegState(KillSrc)); 3465 } 3466 return; 3467 } 3468 3469 if (AArch64::FPR32RegClass.contains(DestReg) && 3470 AArch64::FPR32RegClass.contains(SrcReg)) { 3471 if (Subtarget.hasNEON()) { 3472 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, 3473 &AArch64::FPR128RegClass); 3474 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, 3475 &AArch64::FPR128RegClass); 3476 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3477 .addReg(SrcReg) 3478 .addReg(SrcReg, getKillRegState(KillSrc)); 3479 } else { 3480 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3481 .addReg(SrcReg, getKillRegState(KillSrc)); 3482 } 3483 return; 3484 } 3485 3486 if (AArch64::FPR16RegClass.contains(DestReg) && 3487 AArch64::FPR16RegClass.contains(SrcReg)) { 3488 if (Subtarget.hasNEON()) { 3489 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 3490 &AArch64::FPR128RegClass); 3491 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 3492 &AArch64::FPR128RegClass); 3493 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3494 .addReg(SrcReg) 3495 .addReg(SrcReg, getKillRegState(KillSrc)); 3496 } else { 3497 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, 3498 &AArch64::FPR32RegClass); 3499 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, 3500 &AArch64::FPR32RegClass); 3501 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3502 .addReg(SrcReg, getKillRegState(KillSrc)); 3503 } 3504 return; 3505 } 3506 3507 if (AArch64::FPR8RegClass.contains(DestReg) && 3508 AArch64::FPR8RegClass.contains(SrcReg)) { 3509 if (Subtarget.hasNEON()) { 3510 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 3511 &AArch64::FPR128RegClass); 3512 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 3513 &AArch64::FPR128RegClass); 3514 BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) 3515 .addReg(SrcReg) 3516 
.addReg(SrcReg, getKillRegState(KillSrc)); 3517 } else { 3518 DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, 3519 &AArch64::FPR32RegClass); 3520 SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, 3521 &AArch64::FPR32RegClass); 3522 BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) 3523 .addReg(SrcReg, getKillRegState(KillSrc)); 3524 } 3525 return; 3526 } 3527 3528 // Copies between GPR64 and FPR64. 3529 if (AArch64::FPR64RegClass.contains(DestReg) && 3530 AArch64::GPR64RegClass.contains(SrcReg)) { 3531 BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) 3532 .addReg(SrcReg, getKillRegState(KillSrc)); 3533 return; 3534 } 3535 if (AArch64::GPR64RegClass.contains(DestReg) && 3536 AArch64::FPR64RegClass.contains(SrcReg)) { 3537 BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) 3538 .addReg(SrcReg, getKillRegState(KillSrc)); 3539 return; 3540 } 3541 // Copies between GPR32 and FPR32. 3542 if (AArch64::FPR32RegClass.contains(DestReg) && 3543 AArch64::GPR32RegClass.contains(SrcReg)) { 3544 BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) 3545 .addReg(SrcReg, getKillRegState(KillSrc)); 3546 return; 3547 } 3548 if (AArch64::GPR32RegClass.contains(DestReg) && 3549 AArch64::FPR32RegClass.contains(SrcReg)) { 3550 BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) 3551 .addReg(SrcReg, getKillRegState(KillSrc)); 3552 return; 3553 } 3554 3555 if (DestReg == AArch64::NZCV) { 3556 assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy"); 3557 BuildMI(MBB, I, DL, get(AArch64::MSR)) 3558 .addImm(AArch64SysReg::NZCV) 3559 .addReg(SrcReg, getKillRegState(KillSrc)) 3560 .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define); 3561 return; 3562 } 3563 3564 if (SrcReg == AArch64::NZCV) { 3565 assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); 3566 BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) 3567 .addImm(AArch64SysReg::NZCV) 3568 .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); 3569 return; 3570 } 3571 
3572 llvm_unreachable("unimplemented reg-to-reg copy"); 3573 } 3574 3575 static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI, 3576 MachineBasicBlock &MBB, 3577 MachineBasicBlock::iterator InsertBefore, 3578 const MCInstrDesc &MCID, 3579 Register SrcReg, bool IsKill, 3580 unsigned SubIdx0, unsigned SubIdx1, int FI, 3581 MachineMemOperand *MMO) { 3582 Register SrcReg0 = SrcReg; 3583 Register SrcReg1 = SrcReg; 3584 if (Register::isPhysicalRegister(SrcReg)) { 3585 SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0); 3586 SubIdx0 = 0; 3587 SrcReg1 = TRI.getSubReg(SrcReg, SubIdx1); 3588 SubIdx1 = 0; 3589 } 3590 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3591 .addReg(SrcReg0, getKillRegState(IsKill), SubIdx0) 3592 .addReg(SrcReg1, getKillRegState(IsKill), SubIdx1) 3593 .addFrameIndex(FI) 3594 .addImm(0) 3595 .addMemOperand(MMO); 3596 } 3597 3598 void AArch64InstrInfo::storeRegToStackSlot( 3599 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg, 3600 bool isKill, int FI, const TargetRegisterClass *RC, 3601 const TargetRegisterInfo *TRI) const { 3602 MachineFunction &MF = *MBB.getParent(); 3603 MachineFrameInfo &MFI = MF.getFrameInfo(); 3604 3605 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3606 MachineMemOperand *MMO = 3607 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 3608 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3609 unsigned Opc = 0; 3610 bool Offset = true; 3611 unsigned StackID = TargetStackID::Default; 3612 switch (TRI->getSpillSize(*RC)) { 3613 case 1: 3614 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3615 Opc = AArch64::STRBui; 3616 break; 3617 case 2: 3618 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3619 Opc = AArch64::STRHui; 3620 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3621 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3622 Opc = AArch64::STR_PXI; 3623 StackID = TargetStackID::ScalableVector; 3624 } 3625 break; 3626 case 4: 3627 if 
(AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3628 Opc = AArch64::STRWui; 3629 if (Register::isVirtualRegister(SrcReg)) 3630 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); 3631 else 3632 assert(SrcReg != AArch64::WSP); 3633 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3634 Opc = AArch64::STRSui; 3635 break; 3636 case 8: 3637 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3638 Opc = AArch64::STRXui; 3639 if (Register::isVirtualRegister(SrcReg)) 3640 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 3641 else 3642 assert(SrcReg != AArch64::SP); 3643 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3644 Opc = AArch64::STRDui; 3645 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3646 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3647 get(AArch64::STPWi), SrcReg, isKill, 3648 AArch64::sube32, AArch64::subo32, FI, MMO); 3649 return; 3650 } 3651 break; 3652 case 16: 3653 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3654 Opc = AArch64::STRQui; 3655 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3656 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3657 Opc = AArch64::ST1Twov1d; 3658 Offset = false; 3659 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3660 storeRegPairToStackSlot(getRegisterInfo(), MBB, MBBI, 3661 get(AArch64::STPXi), SrcReg, isKill, 3662 AArch64::sube64, AArch64::subo64, FI, MMO); 3663 return; 3664 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3665 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3666 Opc = AArch64::STR_ZXI; 3667 StackID = TargetStackID::ScalableVector; 3668 } 3669 break; 3670 case 24: 3671 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3672 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3673 Opc = AArch64::ST1Threev1d; 3674 Offset = false; 3675 } 3676 break; 3677 case 32: 3678 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3679 assert(Subtarget.hasNEON() && 
"Unexpected register store without NEON"); 3680 Opc = AArch64::ST1Fourv1d; 3681 Offset = false; 3682 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3683 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3684 Opc = AArch64::ST1Twov2d; 3685 Offset = false; 3686 } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3687 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3688 Opc = AArch64::STR_ZZXI; 3689 StackID = TargetStackID::ScalableVector; 3690 } 3691 break; 3692 case 48: 3693 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3694 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3695 Opc = AArch64::ST1Threev2d; 3696 Offset = false; 3697 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3698 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3699 Opc = AArch64::STR_ZZZXI; 3700 StackID = TargetStackID::ScalableVector; 3701 } 3702 break; 3703 case 64: 3704 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3705 assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); 3706 Opc = AArch64::ST1Fourv2d; 3707 Offset = false; 3708 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3709 assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); 3710 Opc = AArch64::STR_ZZZZXI; 3711 StackID = TargetStackID::ScalableVector; 3712 } 3713 break; 3714 } 3715 assert(Opc && "Unknown register class"); 3716 MFI.setStackID(FI, StackID); 3717 3718 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3719 .addReg(SrcReg, getKillRegState(isKill)) 3720 .addFrameIndex(FI); 3721 3722 if (Offset) 3723 MI.addImm(0); 3724 MI.addMemOperand(MMO); 3725 } 3726 3727 static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI, 3728 MachineBasicBlock &MBB, 3729 MachineBasicBlock::iterator InsertBefore, 3730 const MCInstrDesc &MCID, 3731 Register DestReg, unsigned SubIdx0, 3732 unsigned SubIdx1, int FI, 3733 MachineMemOperand *MMO) { 3734 Register DestReg0 = 
DestReg; 3735 Register DestReg1 = DestReg; 3736 bool IsUndef = true; 3737 if (Register::isPhysicalRegister(DestReg)) { 3738 DestReg0 = TRI.getSubReg(DestReg, SubIdx0); 3739 SubIdx0 = 0; 3740 DestReg1 = TRI.getSubReg(DestReg, SubIdx1); 3741 SubIdx1 = 0; 3742 IsUndef = false; 3743 } 3744 BuildMI(MBB, InsertBefore, DebugLoc(), MCID) 3745 .addReg(DestReg0, RegState::Define | getUndefRegState(IsUndef), SubIdx0) 3746 .addReg(DestReg1, RegState::Define | getUndefRegState(IsUndef), SubIdx1) 3747 .addFrameIndex(FI) 3748 .addImm(0) 3749 .addMemOperand(MMO); 3750 } 3751 3752 void AArch64InstrInfo::loadRegFromStackSlot( 3753 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg, 3754 int FI, const TargetRegisterClass *RC, 3755 const TargetRegisterInfo *TRI) const { 3756 MachineFunction &MF = *MBB.getParent(); 3757 MachineFrameInfo &MFI = MF.getFrameInfo(); 3758 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); 3759 MachineMemOperand *MMO = 3760 MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 3761 MFI.getObjectSize(FI), MFI.getObjectAlign(FI)); 3762 3763 unsigned Opc = 0; 3764 bool Offset = true; 3765 unsigned StackID = TargetStackID::Default; 3766 switch (TRI->getSpillSize(*RC)) { 3767 case 1: 3768 if (AArch64::FPR8RegClass.hasSubClassEq(RC)) 3769 Opc = AArch64::LDRBui; 3770 break; 3771 case 2: 3772 if (AArch64::FPR16RegClass.hasSubClassEq(RC)) 3773 Opc = AArch64::LDRHui; 3774 else if (AArch64::PPRRegClass.hasSubClassEq(RC)) { 3775 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3776 Opc = AArch64::LDR_PXI; 3777 StackID = TargetStackID::ScalableVector; 3778 } 3779 break; 3780 case 4: 3781 if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { 3782 Opc = AArch64::LDRWui; 3783 if (Register::isVirtualRegister(DestReg)) 3784 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); 3785 else 3786 assert(DestReg != AArch64::WSP); 3787 } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) 3788 Opc = 
AArch64::LDRSui; 3789 break; 3790 case 8: 3791 if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { 3792 Opc = AArch64::LDRXui; 3793 if (Register::isVirtualRegister(DestReg)) 3794 MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); 3795 else 3796 assert(DestReg != AArch64::SP); 3797 } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) { 3798 Opc = AArch64::LDRDui; 3799 } else if (AArch64::WSeqPairsClassRegClass.hasSubClassEq(RC)) { 3800 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3801 get(AArch64::LDPWi), DestReg, AArch64::sube32, 3802 AArch64::subo32, FI, MMO); 3803 return; 3804 } 3805 break; 3806 case 16: 3807 if (AArch64::FPR128RegClass.hasSubClassEq(RC)) 3808 Opc = AArch64::LDRQui; 3809 else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 3810 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3811 Opc = AArch64::LD1Twov1d; 3812 Offset = false; 3813 } else if (AArch64::XSeqPairsClassRegClass.hasSubClassEq(RC)) { 3814 loadRegPairFromStackSlot(getRegisterInfo(), MBB, MBBI, 3815 get(AArch64::LDPXi), DestReg, AArch64::sube64, 3816 AArch64::subo64, FI, MMO); 3817 return; 3818 } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { 3819 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3820 Opc = AArch64::LDR_ZXI; 3821 StackID = TargetStackID::ScalableVector; 3822 } 3823 break; 3824 case 24: 3825 if (AArch64::DDDRegClass.hasSubClassEq(RC)) { 3826 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3827 Opc = AArch64::LD1Threev1d; 3828 Offset = false; 3829 } 3830 break; 3831 case 32: 3832 if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { 3833 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3834 Opc = AArch64::LD1Fourv1d; 3835 Offset = false; 3836 } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { 3837 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3838 Opc = AArch64::LD1Twov2d; 3839 Offset = false; 3840 } else if 
(AArch64::ZPR2RegClass.hasSubClassEq(RC)) { 3841 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3842 Opc = AArch64::LDR_ZZXI; 3843 StackID = TargetStackID::ScalableVector; 3844 } 3845 break; 3846 case 48: 3847 if (AArch64::QQQRegClass.hasSubClassEq(RC)) { 3848 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3849 Opc = AArch64::LD1Threev2d; 3850 Offset = false; 3851 } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) { 3852 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3853 Opc = AArch64::LDR_ZZZXI; 3854 StackID = TargetStackID::ScalableVector; 3855 } 3856 break; 3857 case 64: 3858 if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { 3859 assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); 3860 Opc = AArch64::LD1Fourv2d; 3861 Offset = false; 3862 } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { 3863 assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); 3864 Opc = AArch64::LDR_ZZZZXI; 3865 StackID = TargetStackID::ScalableVector; 3866 } 3867 break; 3868 } 3869 3870 assert(Opc && "Unknown register class"); 3871 MFI.setStackID(FI, StackID); 3872 3873 const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) 3874 .addReg(DestReg, getDefRegState(true)) 3875 .addFrameIndex(FI); 3876 if (Offset) 3877 MI.addImm(0); 3878 MI.addMemOperand(MMO); 3879 } 3880 3881 bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI, 3882 const MachineInstr &UseMI, 3883 const TargetRegisterInfo *TRI) { 3884 return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()), 3885 UseMI.getIterator()), 3886 [TRI](const MachineInstr &I) { 3887 return I.modifiesRegister(AArch64::NZCV, TRI) || 3888 I.readsRegister(AArch64::NZCV, TRI); 3889 }); 3890 } 3891 3892 void AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets( 3893 const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized) { 3894 // The smallest scalable element supported by scaled SVE addressing 
3895 // modes are predicates, which are 2 scalable bytes in size. So the scalable 3896 // byte offset must always be a multiple of 2. 3897 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 3898 3899 // VGSized offsets are divided by '2', because the VG register is the 3900 // the number of 64bit granules as opposed to 128bit vector chunks, 3901 // which is how the 'n' in e.g. MVT::nxv1i8 is modelled. 3902 // So, for a stack offset of 16 MVT::nxv1i8's, the size is n x 16 bytes. 3903 // VG = n * 2 and the dwarf offset must be VG * 8 bytes. 3904 ByteSized = Offset.getFixed(); 3905 VGSized = Offset.getScalable() / 2; 3906 } 3907 3908 /// Returns the offset in parts to which this frame offset can be 3909 /// decomposed for the purpose of describing a frame offset. 3910 /// For non-scalable offsets this is simply its byte size. 3911 void AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 3912 const StackOffset &Offset, int64_t &NumBytes, int64_t &NumPredicateVectors, 3913 int64_t &NumDataVectors) { 3914 // The smallest scalable element supported by scaled SVE addressing 3915 // modes are predicates, which are 2 scalable bytes in size. So the scalable 3916 // byte offset must always be a multiple of 2. 3917 assert(Offset.getScalable() % 2 == 0 && "Invalid frame offset"); 3918 3919 NumBytes = Offset.getFixed(); 3920 NumDataVectors = 0; 3921 NumPredicateVectors = Offset.getScalable() / 2; 3922 // This method is used to get the offsets to adjust the frame offset. 3923 // If the function requires ADDPL to be used and needs more than two ADDPL 3924 // instructions, part of the offset is folded into NumDataVectors so that it 3925 // uses ADDVL for part of it, reducing the number of ADDPL instructions. 
3926 if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || 3927 NumPredicateVectors > 62) { 3928 NumDataVectors = NumPredicateVectors / 8; 3929 NumPredicateVectors -= NumDataVectors * 8; 3930 } 3931 } 3932 3933 // Helper function to emit a frame offset adjustment from a given 3934 // pointer (SrcReg), stored into DestReg. This function is explicit 3935 // in that it requires the opcode. 3936 static void emitFrameOffsetAdj(MachineBasicBlock &MBB, 3937 MachineBasicBlock::iterator MBBI, 3938 const DebugLoc &DL, unsigned DestReg, 3939 unsigned SrcReg, int64_t Offset, unsigned Opc, 3940 const TargetInstrInfo *TII, 3941 MachineInstr::MIFlag Flag, bool NeedsWinCFI, 3942 bool *HasWinCFI) { 3943 int Sign = 1; 3944 unsigned MaxEncoding, ShiftSize; 3945 switch (Opc) { 3946 case AArch64::ADDXri: 3947 case AArch64::ADDSXri: 3948 case AArch64::SUBXri: 3949 case AArch64::SUBSXri: 3950 MaxEncoding = 0xfff; 3951 ShiftSize = 12; 3952 break; 3953 case AArch64::ADDVL_XXI: 3954 case AArch64::ADDPL_XXI: 3955 MaxEncoding = 31; 3956 ShiftSize = 0; 3957 if (Offset < 0) { 3958 MaxEncoding = 32; 3959 Sign = -1; 3960 Offset = -Offset; 3961 } 3962 break; 3963 default: 3964 llvm_unreachable("Unsupported opcode"); 3965 } 3966 3967 // FIXME: If the offset won't fit in 24-bits, compute the offset into a 3968 // scratch register. If DestReg is a virtual register, use it as the 3969 // scratch register; otherwise, create a new virtual register (to be 3970 // replaced by the scavenger at the end of PEI). That case can be optimized 3971 // slightly if DestReg is SP which is always 16-byte aligned, so the scratch 3972 // register can be loaded with offset%8 and the add/sub can use an extending 3973 // instruction with LSL#3. 3974 // Currently the function handles any offsets but generates a poor sequence 3975 // of code. 
3976 // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); 3977 3978 const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; 3979 Register TmpReg = DestReg; 3980 if (TmpReg == AArch64::XZR) 3981 TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister( 3982 &AArch64::GPR64RegClass); 3983 do { 3984 uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); 3985 unsigned LocalShiftSize = 0; 3986 if (ThisVal > MaxEncoding) { 3987 ThisVal = ThisVal >> ShiftSize; 3988 LocalShiftSize = ShiftSize; 3989 } 3990 assert((ThisVal >> ShiftSize) <= MaxEncoding && 3991 "Encoding cannot handle value that big"); 3992 3993 Offset -= ThisVal << LocalShiftSize; 3994 if (Offset == 0) 3995 TmpReg = DestReg; 3996 auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg) 3997 .addReg(SrcReg) 3998 .addImm(Sign * (int)ThisVal); 3999 if (ShiftSize) 4000 MBI = MBI.addImm( 4001 AArch64_AM::getShifterImm(AArch64_AM::LSL, LocalShiftSize)); 4002 MBI = MBI.setMIFlag(Flag); 4003 4004 if (NeedsWinCFI) { 4005 assert(Sign == 1 && "SEH directives should always have a positive sign"); 4006 int Imm = (int)(ThisVal << LocalShiftSize); 4007 if ((DestReg == AArch64::FP && SrcReg == AArch64::SP) || 4008 (SrcReg == AArch64::FP && DestReg == AArch64::SP)) { 4009 if (HasWinCFI) 4010 *HasWinCFI = true; 4011 if (Imm == 0) 4012 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_SetFP)).setMIFlag(Flag); 4013 else 4014 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP)) 4015 .addImm(Imm) 4016 .setMIFlag(Flag); 4017 assert(Offset == 0 && "Expected remaining offset to be zero to " 4018 "emit a single SEH directive"); 4019 } else if (DestReg == AArch64::SP) { 4020 if (HasWinCFI) 4021 *HasWinCFI = true; 4022 assert(SrcReg == AArch64::SP && "Unexpected SrcReg for SEH_StackAlloc"); 4023 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc)) 4024 .addImm(Imm) 4025 .setMIFlag(Flag); 4026 } 4027 if (HasWinCFI) 4028 *HasWinCFI = true; 4029 } 4030 4031 SrcReg = TmpReg; 4032 } while (Offset); 4033 } 
4034 4035 void llvm::emitFrameOffset(MachineBasicBlock &MBB, 4036 MachineBasicBlock::iterator MBBI, const DebugLoc &DL, 4037 unsigned DestReg, unsigned SrcReg, 4038 StackOffset Offset, const TargetInstrInfo *TII, 4039 MachineInstr::MIFlag Flag, bool SetNZCV, 4040 bool NeedsWinCFI, bool *HasWinCFI) { 4041 int64_t Bytes, NumPredicateVectors, NumDataVectors; 4042 AArch64InstrInfo::decomposeStackOffsetForFrameOffsets( 4043 Offset, Bytes, NumPredicateVectors, NumDataVectors); 4044 4045 // First emit non-scalable frame offsets, or a simple 'mov'. 4046 if (Bytes || (!Offset && SrcReg != DestReg)) { 4047 assert((DestReg != AArch64::SP || Bytes % 8 == 0) && 4048 "SP increment/decrement not 8-byte aligned"); 4049 unsigned Opc = SetNZCV ? AArch64::ADDSXri : AArch64::ADDXri; 4050 if (Bytes < 0) { 4051 Bytes = -Bytes; 4052 Opc = SetNZCV ? AArch64::SUBSXri : AArch64::SUBXri; 4053 } 4054 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, Bytes, Opc, TII, Flag, 4055 NeedsWinCFI, HasWinCFI); 4056 SrcReg = DestReg; 4057 } 4058 4059 assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && 4060 "SetNZCV not supported with SVE vectors"); 4061 assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && 4062 "WinCFI not supported with SVE vectors"); 4063 4064 if (NumDataVectors) { 4065 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, 4066 AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); 4067 SrcReg = DestReg; 4068 } 4069 4070 if (NumPredicateVectors) { 4071 assert(DestReg != AArch64::SP && "Unaligned access to SP"); 4072 emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, 4073 AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); 4074 } 4075 } 4076 4077 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( 4078 MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, 4079 MachineBasicBlock::iterator InsertPt, int FrameIndex, 4080 LiveIntervals *LIS, VirtRegMap *VRM) const { 4081 // This is a bit of a hack. 
Consider this instruction: 4082 // 4083 // %0 = COPY %sp; GPR64all:%0 4084 // 4085 // We explicitly chose GPR64all for the virtual register so such a copy might 4086 // be eliminated by RegisterCoalescer. However, that may not be possible, and 4087 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all 4088 // register class, TargetInstrInfo::foldMemoryOperand() is going to try. 4089 // 4090 // To prevent that, we are going to constrain the %0 register class here. 4091 // 4092 // <rdar://problem/11522048> 4093 // 4094 if (MI.isFullCopy()) { 4095 Register DstReg = MI.getOperand(0).getReg(); 4096 Register SrcReg = MI.getOperand(1).getReg(); 4097 if (SrcReg == AArch64::SP && Register::isVirtualRegister(DstReg)) { 4098 MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); 4099 return nullptr; 4100 } 4101 if (DstReg == AArch64::SP && Register::isVirtualRegister(SrcReg)) { 4102 MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); 4103 return nullptr; 4104 } 4105 } 4106 4107 // Handle the case where a copy is being spilled or filled but the source 4108 // and destination register class don't match. For example: 4109 // 4110 // %0 = COPY %xzr; GPR64common:%0 4111 // 4112 // In this case we can still safely fold away the COPY and generate the 4113 // following spill code: 4114 // 4115 // STRXui %xzr, %stack.0 4116 // 4117 // This also eliminates spilled cross register class COPYs (e.g. between x and 4118 // d regs) of the same size. For example: 4119 // 4120 // %0 = COPY %1; GPR64:%0, FPR64:%1 4121 // 4122 // will be filled as 4123 // 4124 // LDRDui %0, fi<#0> 4125 // 4126 // instead of 4127 // 4128 // LDRXui %Temp, fi<#0> 4129 // %0 = FMOV %Temp 4130 // 4131 if (MI.isCopy() && Ops.size() == 1 && 4132 // Make sure we're only folding the explicit COPY defs/uses. 
4133 (Ops[0] == 0 || Ops[0] == 1)) { 4134 bool IsSpill = Ops[0] == 0; 4135 bool IsFill = !IsSpill; 4136 const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); 4137 const MachineRegisterInfo &MRI = MF.getRegInfo(); 4138 MachineBasicBlock &MBB = *MI.getParent(); 4139 const MachineOperand &DstMO = MI.getOperand(0); 4140 const MachineOperand &SrcMO = MI.getOperand(1); 4141 Register DstReg = DstMO.getReg(); 4142 Register SrcReg = SrcMO.getReg(); 4143 // This is slightly expensive to compute for physical regs since 4144 // getMinimalPhysRegClass is slow. 4145 auto getRegClass = [&](unsigned Reg) { 4146 return Register::isVirtualRegister(Reg) ? MRI.getRegClass(Reg) 4147 : TRI.getMinimalPhysRegClass(Reg); 4148 }; 4149 4150 if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) { 4151 assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) == 4152 TRI.getRegSizeInBits(*getRegClass(SrcReg)) && 4153 "Mismatched register size in non subreg COPY"); 4154 if (IsSpill) 4155 storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex, 4156 getRegClass(SrcReg), &TRI); 4157 else 4158 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, 4159 getRegClass(DstReg), &TRI); 4160 return &*--InsertPt; 4161 } 4162 4163 // Handle cases like spilling def of: 4164 // 4165 // %0:sub_32<def,read-undef> = COPY %wzr; GPR64common:%0 4166 // 4167 // where the physical register source can be widened and stored to the full 4168 // virtual reg destination stack slot, in this case producing: 4169 // 4170 // STRXui %xzr, %stack.0 4171 // 4172 if (IsSpill && DstMO.isUndef() && Register::isPhysicalRegister(SrcReg)) { 4173 assert(SrcMO.getSubReg() == 0 && 4174 "Unexpected subreg on physical register"); 4175 const TargetRegisterClass *SpillRC; 4176 unsigned SpillSubreg; 4177 switch (DstMO.getSubReg()) { 4178 default: 4179 SpillRC = nullptr; 4180 break; 4181 case AArch64::sub_32: 4182 case AArch64::ssub: 4183 if (AArch64::GPR32RegClass.contains(SrcReg)) { 4184 SpillRC = 
&AArch64::GPR64RegClass; 4185 SpillSubreg = AArch64::sub_32; 4186 } else if (AArch64::FPR32RegClass.contains(SrcReg)) { 4187 SpillRC = &AArch64::FPR64RegClass; 4188 SpillSubreg = AArch64::ssub; 4189 } else 4190 SpillRC = nullptr; 4191 break; 4192 case AArch64::dsub: 4193 if (AArch64::FPR64RegClass.contains(SrcReg)) { 4194 SpillRC = &AArch64::FPR128RegClass; 4195 SpillSubreg = AArch64::dsub; 4196 } else 4197 SpillRC = nullptr; 4198 break; 4199 } 4200 4201 if (SpillRC) 4202 if (unsigned WidenedSrcReg = 4203 TRI.getMatchingSuperReg(SrcReg, SpillSubreg, SpillRC)) { 4204 storeRegToStackSlot(MBB, InsertPt, WidenedSrcReg, SrcMO.isKill(), 4205 FrameIndex, SpillRC, &TRI); 4206 return &*--InsertPt; 4207 } 4208 } 4209 4210 // Handle cases like filling use of: 4211 // 4212 // %0:sub_32<def,read-undef> = COPY %1; GPR64:%0, GPR32:%1 4213 // 4214 // where we can load the full virtual reg source stack slot, into the subreg 4215 // destination, in this case producing: 4216 // 4217 // LDRWui %0:sub_32<def,read-undef>, %stack.0 4218 // 4219 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { 4220 const TargetRegisterClass *FillRC; 4221 switch (DstMO.getSubReg()) { 4222 default: 4223 FillRC = nullptr; 4224 break; 4225 case AArch64::sub_32: 4226 FillRC = &AArch64::GPR32RegClass; 4227 break; 4228 case AArch64::ssub: 4229 FillRC = &AArch64::FPR32RegClass; 4230 break; 4231 case AArch64::dsub: 4232 FillRC = &AArch64::FPR64RegClass; 4233 break; 4234 } 4235 4236 if (FillRC) { 4237 assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) == 4238 TRI.getRegSizeInBits(*FillRC) && 4239 "Mismatched regclass size on folded subreg COPY"); 4240 loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI); 4241 MachineInstr &LoadMI = *--InsertPt; 4242 MachineOperand &LoadDst = LoadMI.getOperand(0); 4243 assert(LoadDst.getSubReg() == 0 && "unexpected subreg on fill load"); 4244 LoadDst.setSubReg(DstMO.getSubReg()); 4245 LoadDst.setIsUndef(); 4246 return &LoadMI; 4247 } 4248 } 4249 } 4250 4251 
// Cannot fold. 4252 return nullptr; 4253 } 4254 4255 int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, 4256 StackOffset &SOffset, 4257 bool *OutUseUnscaledOp, 4258 unsigned *OutUnscaledOp, 4259 int64_t *EmittableOffset) { 4260 // Set output values in case of early exit. 4261 if (EmittableOffset) 4262 *EmittableOffset = 0; 4263 if (OutUseUnscaledOp) 4264 *OutUseUnscaledOp = false; 4265 if (OutUnscaledOp) 4266 *OutUnscaledOp = 0; 4267 4268 // Exit early for structured vector spills/fills as they can't take an 4269 // immediate offset. 4270 switch (MI.getOpcode()) { 4271 default: 4272 break; 4273 case AArch64::LD1Twov2d: 4274 case AArch64::LD1Threev2d: 4275 case AArch64::LD1Fourv2d: 4276 case AArch64::LD1Twov1d: 4277 case AArch64::LD1Threev1d: 4278 case AArch64::LD1Fourv1d: 4279 case AArch64::ST1Twov2d: 4280 case AArch64::ST1Threev2d: 4281 case AArch64::ST1Fourv2d: 4282 case AArch64::ST1Twov1d: 4283 case AArch64::ST1Threev1d: 4284 case AArch64::ST1Fourv1d: 4285 case AArch64::IRG: 4286 case AArch64::IRGstack: 4287 case AArch64::STGloop: 4288 case AArch64::STZGloop: 4289 return AArch64FrameOffsetCannotUpdate; 4290 } 4291 4292 // Get the min/max offset and the scale. 4293 TypeSize ScaleValue(0U, false); 4294 unsigned Width; 4295 int64_t MinOff, MaxOff; 4296 if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff, 4297 MaxOff)) 4298 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4299 4300 // Construct the complete offset. 4301 bool IsMulVL = ScaleValue.isScalable(); 4302 unsigned Scale = ScaleValue.getKnownMinSize(); 4303 int64_t Offset = IsMulVL ? SOffset.getScalable() : SOffset.getFixed(); 4304 4305 const MachineOperand &ImmOpnd = 4306 MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode())); 4307 Offset += ImmOpnd.getImm() * Scale; 4308 4309 // If the offset doesn't match the scale, we rewrite the instruction to 4310 // use the unscaled instruction instead. 
Likewise, if we have a negative 4311 // offset and there is an unscaled op to use. 4312 Optional<unsigned> UnscaledOp = 4313 AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode()); 4314 bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0); 4315 if (useUnscaledOp && 4316 !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff, 4317 MaxOff)) 4318 llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal"); 4319 4320 Scale = ScaleValue.getKnownMinSize(); 4321 assert(IsMulVL == ScaleValue.isScalable() && 4322 "Unscaled opcode has different value for scalable"); 4323 4324 int64_t Remainder = Offset % Scale; 4325 assert(!(Remainder && useUnscaledOp) && 4326 "Cannot have remainder when using unscaled op"); 4327 4328 assert(MinOff < MaxOff && "Unexpected Min/Max offsets"); 4329 int64_t NewOffset = Offset / Scale; 4330 if (MinOff <= NewOffset && NewOffset <= MaxOff) 4331 Offset = Remainder; 4332 else { 4333 NewOffset = NewOffset < 0 ? MinOff : MaxOff; 4334 Offset = Offset - NewOffset * Scale + Remainder; 4335 } 4336 4337 if (EmittableOffset) 4338 *EmittableOffset = NewOffset; 4339 if (OutUseUnscaledOp) 4340 *OutUseUnscaledOp = useUnscaledOp; 4341 if (OutUnscaledOp && UnscaledOp) 4342 *OutUnscaledOp = *UnscaledOp; 4343 4344 if (IsMulVL) 4345 SOffset = StackOffset::get(SOffset.getFixed(), Offset); 4346 else 4347 SOffset = StackOffset::get(Offset, SOffset.getScalable()); 4348 return AArch64FrameOffsetCanUpdate | 4349 (SOffset ? 
0 : AArch64FrameOffsetIsLegal); 4350 } 4351 4352 bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, 4353 unsigned FrameReg, StackOffset &Offset, 4354 const AArch64InstrInfo *TII) { 4355 unsigned Opcode = MI.getOpcode(); 4356 unsigned ImmIdx = FrameRegIdx + 1; 4357 4358 if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { 4359 Offset += StackOffset::getFixed(MI.getOperand(ImmIdx).getImm()); 4360 emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), 4361 MI.getOperand(0).getReg(), FrameReg, Offset, TII, 4362 MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); 4363 MI.eraseFromParent(); 4364 Offset = StackOffset(); 4365 return true; 4366 } 4367 4368 int64_t NewOffset; 4369 unsigned UnscaledOp; 4370 bool UseUnscaledOp; 4371 int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, 4372 &UnscaledOp, &NewOffset); 4373 if (Status & AArch64FrameOffsetCanUpdate) { 4374 if (Status & AArch64FrameOffsetIsLegal) 4375 // Replace the FrameIndex with FrameReg. 4376 MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); 4377 if (UseUnscaledOp) 4378 MI.setDesc(TII->get(UnscaledOp)); 4379 4380 MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); 4381 return !Offset; 4382 } 4383 4384 return false; 4385 } 4386 4387 MCInst AArch64InstrInfo::getNop() const { 4388 return MCInstBuilder(AArch64::HINT).addImm(0); 4389 } 4390 4391 // AArch64 supports MachineCombiner. 4392 bool AArch64InstrInfo::useMachineCombiner() const { return true; } 4393 4394 // True when Opc sets flag 4395 static bool isCombineInstrSettingFlag(unsigned Opc) { 4396 switch (Opc) { 4397 case AArch64::ADDSWrr: 4398 case AArch64::ADDSWri: 4399 case AArch64::ADDSXrr: 4400 case AArch64::ADDSXri: 4401 case AArch64::SUBSWrr: 4402 case AArch64::SUBSXrr: 4403 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
4404 case AArch64::SUBSWri: 4405 case AArch64::SUBSXri: 4406 return true; 4407 default: 4408 break; 4409 } 4410 return false; 4411 } 4412 4413 // 32b Opcodes that can be combined with a MUL 4414 static bool isCombineInstrCandidate32(unsigned Opc) { 4415 switch (Opc) { 4416 case AArch64::ADDWrr: 4417 case AArch64::ADDWri: 4418 case AArch64::SUBWrr: 4419 case AArch64::ADDSWrr: 4420 case AArch64::ADDSWri: 4421 case AArch64::SUBSWrr: 4422 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4423 case AArch64::SUBWri: 4424 case AArch64::SUBSWri: 4425 return true; 4426 default: 4427 break; 4428 } 4429 return false; 4430 } 4431 4432 // 64b Opcodes that can be combined with a MUL 4433 static bool isCombineInstrCandidate64(unsigned Opc) { 4434 switch (Opc) { 4435 case AArch64::ADDXrr: 4436 case AArch64::ADDXri: 4437 case AArch64::SUBXrr: 4438 case AArch64::ADDSXrr: 4439 case AArch64::ADDSXri: 4440 case AArch64::SUBSXrr: 4441 // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 4442 case AArch64::SUBXri: 4443 case AArch64::SUBSXri: 4444 case AArch64::ADDv8i8: 4445 case AArch64::ADDv16i8: 4446 case AArch64::ADDv4i16: 4447 case AArch64::ADDv8i16: 4448 case AArch64::ADDv2i32: 4449 case AArch64::ADDv4i32: 4450 case AArch64::SUBv8i8: 4451 case AArch64::SUBv16i8: 4452 case AArch64::SUBv4i16: 4453 case AArch64::SUBv8i16: 4454 case AArch64::SUBv2i32: 4455 case AArch64::SUBv4i32: 4456 return true; 4457 default: 4458 break; 4459 } 4460 return false; 4461 } 4462 4463 // FP Opcodes that can be combined with a FMUL. 
4464 static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { 4465 switch (Inst.getOpcode()) { 4466 default: 4467 break; 4468 case AArch64::FADDHrr: 4469 case AArch64::FADDSrr: 4470 case AArch64::FADDDrr: 4471 case AArch64::FADDv4f16: 4472 case AArch64::FADDv8f16: 4473 case AArch64::FADDv2f32: 4474 case AArch64::FADDv2f64: 4475 case AArch64::FADDv4f32: 4476 case AArch64::FSUBHrr: 4477 case AArch64::FSUBSrr: 4478 case AArch64::FSUBDrr: 4479 case AArch64::FSUBv4f16: 4480 case AArch64::FSUBv8f16: 4481 case AArch64::FSUBv2f32: 4482 case AArch64::FSUBv2f64: 4483 case AArch64::FSUBv4f32: 4484 TargetOptions Options = Inst.getParent()->getParent()->getTarget().Options; 4485 // We can fuse FADD/FSUB with FMUL, if fusion is either allowed globally by 4486 // the target options or if FADD/FSUB has the contract fast-math flag. 4487 return Options.UnsafeFPMath || 4488 Options.AllowFPOpFusion == FPOpFusion::Fast || 4489 Inst.getFlag(MachineInstr::FmContract); 4490 return true; 4491 } 4492 return false; 4493 } 4494 4495 // Opcodes that can be combined with a MUL 4496 static bool isCombineInstrCandidate(unsigned Opc) { 4497 return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); 4498 } 4499 4500 // 4501 // Utility routine that checks if \param MO is defined by an 4502 // \param CombineOpc instruction in the basic block \param MBB 4503 static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, 4504 unsigned CombineOpc, unsigned ZeroReg = 0, 4505 bool CheckZeroReg = false) { 4506 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4507 MachineInstr *MI = nullptr; 4508 4509 if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) 4510 MI = MRI.getUniqueVRegDef(MO.getReg()); 4511 // And it needs to be in the trace (otherwise, it won't have a depth). 4512 if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) 4513 return false; 4514 // Must only used by the user we combine with. 
4515 if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) 4516 return false; 4517 4518 if (CheckZeroReg) { 4519 assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && 4520 MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && 4521 MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); 4522 // The third input reg must be zero. 4523 if (MI->getOperand(3).getReg() != ZeroReg) 4524 return false; 4525 } 4526 4527 return true; 4528 } 4529 4530 // 4531 // Is \param MO defined by an integer multiply and can be combined? 4532 static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, 4533 unsigned MulOpc, unsigned ZeroReg) { 4534 return canCombine(MBB, MO, MulOpc, ZeroReg, true); 4535 } 4536 4537 // 4538 // Is \param MO defined by a floating-point multiply and can be combined? 4539 static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, 4540 unsigned MulOpc) { 4541 return canCombine(MBB, MO, MulOpc); 4542 } 4543 4544 // TODO: There are many more machine instruction opcodes to match: 4545 // 1. Other data types (integer, vectors) 4546 // 2. Other math / logic operations (xor, or) 4547 // 3. Other forms of the same operation (intrinsics and other variants) 4548 bool AArch64InstrInfo::isAssociativeAndCommutative( 4549 const MachineInstr &Inst) const { 4550 switch (Inst.getOpcode()) { 4551 case AArch64::FADDDrr: 4552 case AArch64::FADDSrr: 4553 case AArch64::FADDv2f32: 4554 case AArch64::FADDv2f64: 4555 case AArch64::FADDv4f32: 4556 case AArch64::FMULDrr: 4557 case AArch64::FMULSrr: 4558 case AArch64::FMULX32: 4559 case AArch64::FMULX64: 4560 case AArch64::FMULXv2f32: 4561 case AArch64::FMULXv2f64: 4562 case AArch64::FMULXv4f32: 4563 case AArch64::FMULv2f32: 4564 case AArch64::FMULv2f64: 4565 case AArch64::FMULv4f32: 4566 return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; 4567 default: 4568 return false; 4569 } 4570 } 4571 4572 /// Find instructions that can be turned into madd. 
4573 static bool getMaddPatterns(MachineInstr &Root, 4574 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 4575 unsigned Opc = Root.getOpcode(); 4576 MachineBasicBlock &MBB = *Root.getParent(); 4577 bool Found = false; 4578 4579 if (!isCombineInstrCandidate(Opc)) 4580 return false; 4581 if (isCombineInstrSettingFlag(Opc)) { 4582 int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); 4583 // When NZCV is live bail out. 4584 if (Cmp_NZCV == -1) 4585 return false; 4586 unsigned NewOpc = convertToNonFlagSettingOpc(Root); 4587 // When opcode can't change bail out. 4588 // CHECKME: do we miss any cases for opcode conversion? 4589 if (NewOpc == Opc) 4590 return false; 4591 Opc = NewOpc; 4592 } 4593 4594 auto setFound = [&](int Opcode, int Operand, unsigned ZeroReg, 4595 MachineCombinerPattern Pattern) { 4596 if (canCombineWithMUL(MBB, Root.getOperand(Operand), Opcode, ZeroReg)) { 4597 Patterns.push_back(Pattern); 4598 Found = true; 4599 } 4600 }; 4601 4602 auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { 4603 if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { 4604 Patterns.push_back(Pattern); 4605 Found = true; 4606 } 4607 }; 4608 4609 typedef MachineCombinerPattern MCP; 4610 4611 switch (Opc) { 4612 default: 4613 break; 4614 case AArch64::ADDWrr: 4615 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4616 "ADDWrr does not have register operands"); 4617 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDW_OP1); 4618 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULADDW_OP2); 4619 break; 4620 case AArch64::ADDXrr: 4621 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDX_OP1); 4622 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULADDX_OP2); 4623 break; 4624 case AArch64::SUBWrr: 4625 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBW_OP1); 4626 setFound(AArch64::MADDWrrr, 2, AArch64::WZR, MCP::MULSUBW_OP2); 4627 break; 4628 case AArch64::SUBXrr: 4629 setFound(AArch64::MADDXrrr, 
1, AArch64::XZR, MCP::MULSUBX_OP1); 4630 setFound(AArch64::MADDXrrr, 2, AArch64::XZR, MCP::MULSUBX_OP2); 4631 break; 4632 case AArch64::ADDWri: 4633 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULADDWI_OP1); 4634 break; 4635 case AArch64::ADDXri: 4636 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULADDXI_OP1); 4637 break; 4638 case AArch64::SUBWri: 4639 setFound(AArch64::MADDWrrr, 1, AArch64::WZR, MCP::MULSUBWI_OP1); 4640 break; 4641 case AArch64::SUBXri: 4642 setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); 4643 break; 4644 case AArch64::ADDv8i8: 4645 setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); 4646 setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); 4647 break; 4648 case AArch64::ADDv16i8: 4649 setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); 4650 setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); 4651 break; 4652 case AArch64::ADDv4i16: 4653 setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); 4654 setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); 4655 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); 4656 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); 4657 break; 4658 case AArch64::ADDv8i16: 4659 setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); 4660 setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); 4661 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); 4662 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); 4663 break; 4664 case AArch64::ADDv2i32: 4665 setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); 4666 setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); 4667 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); 4668 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); 4669 break; 4670 case AArch64::ADDv4i32: 4671 setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); 4672 setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); 4673 setVFound(AArch64::MULv4i32_indexed, 
1, MCP::MULADDv4i32_indexed_OP1); 4674 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); 4675 break; 4676 case AArch64::SUBv8i8: 4677 setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); 4678 setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); 4679 break; 4680 case AArch64::SUBv16i8: 4681 setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); 4682 setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); 4683 break; 4684 case AArch64::SUBv4i16: 4685 setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); 4686 setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); 4687 setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); 4688 setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); 4689 break; 4690 case AArch64::SUBv8i16: 4691 setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); 4692 setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); 4693 setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); 4694 setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); 4695 break; 4696 case AArch64::SUBv2i32: 4697 setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); 4698 setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); 4699 setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); 4700 setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); 4701 break; 4702 case AArch64::SUBv4i32: 4703 setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); 4704 setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); 4705 setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); 4706 setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); 4707 break; 4708 } 4709 return Found; 4710 } 4711 /// Floating-Point Support 4712 4713 /// Find instructions that can be turned into madd. 
4714 static bool getFMAPatterns(MachineInstr &Root, 4715 SmallVectorImpl<MachineCombinerPattern> &Patterns) { 4716 4717 if (!isCombineInstrCandidateFP(Root)) 4718 return false; 4719 4720 MachineBasicBlock &MBB = *Root.getParent(); 4721 bool Found = false; 4722 4723 auto Match = [&](int Opcode, int Operand, 4724 MachineCombinerPattern Pattern) -> bool { 4725 if (canCombineWithFMUL(MBB, Root.getOperand(Operand), Opcode)) { 4726 Patterns.push_back(Pattern); 4727 return true; 4728 } 4729 return false; 4730 }; 4731 4732 typedef MachineCombinerPattern MCP; 4733 4734 switch (Root.getOpcode()) { 4735 default: 4736 assert(false && "Unsupported FP instruction in combiner\n"); 4737 break; 4738 case AArch64::FADDHrr: 4739 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4740 "FADDHrr does not have register operands"); 4741 4742 Found = Match(AArch64::FMULHrr, 1, MCP::FMULADDH_OP1); 4743 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULADDH_OP2); 4744 break; 4745 case AArch64::FADDSrr: 4746 assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && 4747 "FADDSrr does not have register operands"); 4748 4749 Found |= Match(AArch64::FMULSrr, 1, MCP::FMULADDS_OP1) || 4750 Match(AArch64::FMULv1i32_indexed, 1, MCP::FMLAv1i32_indexed_OP1); 4751 4752 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULADDS_OP2) || 4753 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLAv1i32_indexed_OP2); 4754 break; 4755 case AArch64::FADDDrr: 4756 Found |= Match(AArch64::FMULDrr, 1, MCP::FMULADDD_OP1) || 4757 Match(AArch64::FMULv1i64_indexed, 1, MCP::FMLAv1i64_indexed_OP1); 4758 4759 Found |= Match(AArch64::FMULDrr, 2, MCP::FMULADDD_OP2) || 4760 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLAv1i64_indexed_OP2); 4761 break; 4762 case AArch64::FADDv4f16: 4763 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLAv4i16_indexed_OP1) || 4764 Match(AArch64::FMULv4f16, 1, MCP::FMLAv4f16_OP1); 4765 4766 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLAv4i16_indexed_OP2) || 4767 
Match(AArch64::FMULv4f16, 2, MCP::FMLAv4f16_OP2); 4768 break; 4769 case AArch64::FADDv8f16: 4770 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLAv8i16_indexed_OP1) || 4771 Match(AArch64::FMULv8f16, 1, MCP::FMLAv8f16_OP1); 4772 4773 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLAv8i16_indexed_OP2) || 4774 Match(AArch64::FMULv8f16, 2, MCP::FMLAv8f16_OP2); 4775 break; 4776 case AArch64::FADDv2f32: 4777 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLAv2i32_indexed_OP1) || 4778 Match(AArch64::FMULv2f32, 1, MCP::FMLAv2f32_OP1); 4779 4780 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLAv2i32_indexed_OP2) || 4781 Match(AArch64::FMULv2f32, 2, MCP::FMLAv2f32_OP2); 4782 break; 4783 case AArch64::FADDv2f64: 4784 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLAv2i64_indexed_OP1) || 4785 Match(AArch64::FMULv2f64, 1, MCP::FMLAv2f64_OP1); 4786 4787 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLAv2i64_indexed_OP2) || 4788 Match(AArch64::FMULv2f64, 2, MCP::FMLAv2f64_OP2); 4789 break; 4790 case AArch64::FADDv4f32: 4791 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLAv4i32_indexed_OP1) || 4792 Match(AArch64::FMULv4f32, 1, MCP::FMLAv4f32_OP1); 4793 4794 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLAv4i32_indexed_OP2) || 4795 Match(AArch64::FMULv4f32, 2, MCP::FMLAv4f32_OP2); 4796 break; 4797 case AArch64::FSUBHrr: 4798 Found = Match(AArch64::FMULHrr, 1, MCP::FMULSUBH_OP1); 4799 Found |= Match(AArch64::FMULHrr, 2, MCP::FMULSUBH_OP2); 4800 Found |= Match(AArch64::FNMULHrr, 1, MCP::FNMULSUBH_OP1); 4801 break; 4802 case AArch64::FSUBSrr: 4803 Found = Match(AArch64::FMULSrr, 1, MCP::FMULSUBS_OP1); 4804 4805 Found |= Match(AArch64::FMULSrr, 2, MCP::FMULSUBS_OP2) || 4806 Match(AArch64::FMULv1i32_indexed, 2, MCP::FMLSv1i32_indexed_OP2); 4807 4808 Found |= Match(AArch64::FNMULSrr, 1, MCP::FNMULSUBS_OP1); 4809 break; 4810 case AArch64::FSUBDrr: 4811 Found = Match(AArch64::FMULDrr, 1, MCP::FMULSUBD_OP1); 4812 4813 Found |= 
Match(AArch64::FMULDrr, 2, MCP::FMULSUBD_OP2) || 4814 Match(AArch64::FMULv1i64_indexed, 2, MCP::FMLSv1i64_indexed_OP2); 4815 4816 Found |= Match(AArch64::FNMULDrr, 1, MCP::FNMULSUBD_OP1); 4817 break; 4818 case AArch64::FSUBv4f16: 4819 Found |= Match(AArch64::FMULv4i16_indexed, 2, MCP::FMLSv4i16_indexed_OP2) || 4820 Match(AArch64::FMULv4f16, 2, MCP::FMLSv4f16_OP2); 4821 4822 Found |= Match(AArch64::FMULv4i16_indexed, 1, MCP::FMLSv4i16_indexed_OP1) || 4823 Match(AArch64::FMULv4f16, 1, MCP::FMLSv4f16_OP1); 4824 break; 4825 case AArch64::FSUBv8f16: 4826 Found |= Match(AArch64::FMULv8i16_indexed, 2, MCP::FMLSv8i16_indexed_OP2) || 4827 Match(AArch64::FMULv8f16, 2, MCP::FMLSv8f16_OP2); 4828 4829 Found |= Match(AArch64::FMULv8i16_indexed, 1, MCP::FMLSv8i16_indexed_OP1) || 4830 Match(AArch64::FMULv8f16, 1, MCP::FMLSv8f16_OP1); 4831 break; 4832 case AArch64::FSUBv2f32: 4833 Found |= Match(AArch64::FMULv2i32_indexed, 2, MCP::FMLSv2i32_indexed_OP2) || 4834 Match(AArch64::FMULv2f32, 2, MCP::FMLSv2f32_OP2); 4835 4836 Found |= Match(AArch64::FMULv2i32_indexed, 1, MCP::FMLSv2i32_indexed_OP1) || 4837 Match(AArch64::FMULv2f32, 1, MCP::FMLSv2f32_OP1); 4838 break; 4839 case AArch64::FSUBv2f64: 4840 Found |= Match(AArch64::FMULv2i64_indexed, 2, MCP::FMLSv2i64_indexed_OP2) || 4841 Match(AArch64::FMULv2f64, 2, MCP::FMLSv2f64_OP2); 4842 4843 Found |= Match(AArch64::FMULv2i64_indexed, 1, MCP::FMLSv2i64_indexed_OP1) || 4844 Match(AArch64::FMULv2f64, 1, MCP::FMLSv2f64_OP1); 4845 break; 4846 case AArch64::FSUBv4f32: 4847 Found |= Match(AArch64::FMULv4i32_indexed, 2, MCP::FMLSv4i32_indexed_OP2) || 4848 Match(AArch64::FMULv4f32, 2, MCP::FMLSv4f32_OP2); 4849 4850 Found |= Match(AArch64::FMULv4i32_indexed, 1, MCP::FMLSv4i32_indexed_OP1) || 4851 Match(AArch64::FMULv4f32, 1, MCP::FMLSv4f32_OP1); 4852 break; 4853 } 4854 return Found; 4855 } 4856 4857 /// Return true when a code sequence can improve throughput. It 4858 /// should be called only for instructions in loops. 
4859 /// \param Pattern - combiner pattern 4860 bool AArch64InstrInfo::isThroughputPattern( 4861 MachineCombinerPattern Pattern) const { 4862 switch (Pattern) { 4863 default: 4864 break; 4865 case MachineCombinerPattern::FMULADDH_OP1: 4866 case MachineCombinerPattern::FMULADDH_OP2: 4867 case MachineCombinerPattern::FMULSUBH_OP1: 4868 case MachineCombinerPattern::FMULSUBH_OP2: 4869 case MachineCombinerPattern::FMULADDS_OP1: 4870 case MachineCombinerPattern::FMULADDS_OP2: 4871 case MachineCombinerPattern::FMULSUBS_OP1: 4872 case MachineCombinerPattern::FMULSUBS_OP2: 4873 case MachineCombinerPattern::FMULADDD_OP1: 4874 case MachineCombinerPattern::FMULADDD_OP2: 4875 case MachineCombinerPattern::FMULSUBD_OP1: 4876 case MachineCombinerPattern::FMULSUBD_OP2: 4877 case MachineCombinerPattern::FNMULSUBH_OP1: 4878 case MachineCombinerPattern::FNMULSUBS_OP1: 4879 case MachineCombinerPattern::FNMULSUBD_OP1: 4880 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 4881 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 4882 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 4883 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 4884 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 4885 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 4886 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 4887 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 4888 case MachineCombinerPattern::FMLAv4f16_OP2: 4889 case MachineCombinerPattern::FMLAv4f16_OP1: 4890 case MachineCombinerPattern::FMLAv8f16_OP1: 4891 case MachineCombinerPattern::FMLAv8f16_OP2: 4892 case MachineCombinerPattern::FMLAv2f32_OP2: 4893 case MachineCombinerPattern::FMLAv2f32_OP1: 4894 case MachineCombinerPattern::FMLAv2f64_OP1: 4895 case MachineCombinerPattern::FMLAv2f64_OP2: 4896 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 4897 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 4898 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 4899 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 4900 case 
MachineCombinerPattern::FMLAv4f32_OP1: 4901 case MachineCombinerPattern::FMLAv4f32_OP2: 4902 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 4903 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 4904 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: 4905 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 4906 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: 4907 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 4908 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 4909 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 4910 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 4911 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 4912 case MachineCombinerPattern::FMLSv4f16_OP1: 4913 case MachineCombinerPattern::FMLSv4f16_OP2: 4914 case MachineCombinerPattern::FMLSv8f16_OP1: 4915 case MachineCombinerPattern::FMLSv8f16_OP2: 4916 case MachineCombinerPattern::FMLSv2f32_OP2: 4917 case MachineCombinerPattern::FMLSv2f64_OP2: 4918 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 4919 case MachineCombinerPattern::FMLSv4f32_OP2: 4920 case MachineCombinerPattern::MULADDv8i8_OP1: 4921 case MachineCombinerPattern::MULADDv8i8_OP2: 4922 case MachineCombinerPattern::MULADDv16i8_OP1: 4923 case MachineCombinerPattern::MULADDv16i8_OP2: 4924 case MachineCombinerPattern::MULADDv4i16_OP1: 4925 case MachineCombinerPattern::MULADDv4i16_OP2: 4926 case MachineCombinerPattern::MULADDv8i16_OP1: 4927 case MachineCombinerPattern::MULADDv8i16_OP2: 4928 case MachineCombinerPattern::MULADDv2i32_OP1: 4929 case MachineCombinerPattern::MULADDv2i32_OP2: 4930 case MachineCombinerPattern::MULADDv4i32_OP1: 4931 case MachineCombinerPattern::MULADDv4i32_OP2: 4932 case MachineCombinerPattern::MULSUBv8i8_OP1: 4933 case MachineCombinerPattern::MULSUBv8i8_OP2: 4934 case MachineCombinerPattern::MULSUBv16i8_OP1: 4935 case MachineCombinerPattern::MULSUBv16i8_OP2: 4936 case MachineCombinerPattern::MULSUBv4i16_OP1: 4937 case MachineCombinerPattern::MULSUBv4i16_OP2: 4938 case 
MachineCombinerPattern::MULSUBv8i16_OP1: 4939 case MachineCombinerPattern::MULSUBv8i16_OP2: 4940 case MachineCombinerPattern::MULSUBv2i32_OP1: 4941 case MachineCombinerPattern::MULSUBv2i32_OP2: 4942 case MachineCombinerPattern::MULSUBv4i32_OP1: 4943 case MachineCombinerPattern::MULSUBv4i32_OP2: 4944 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 4945 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 4946 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 4947 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 4948 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 4949 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 4950 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 4951 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 4952 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 4953 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 4954 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 4955 case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 4956 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 4957 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 4958 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 4959 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 4960 return true; 4961 } // end switch (Pattern) 4962 return false; 4963 } 4964 /// Return true when there is potentially a faster code sequence for an 4965 /// instruction chain ending in \p Root. All potential patterns are listed in 4966 /// the \p Pattern vector. Pattern should be sorted in priority order since the 4967 /// pattern evaluator stops checking as soon as it finds a faster sequence. 
4968 4969 bool AArch64InstrInfo::getMachineCombinerPatterns( 4970 MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns, 4971 bool DoRegPressureReduce) const { 4972 // Integer patterns 4973 if (getMaddPatterns(Root, Patterns)) 4974 return true; 4975 // Floating point patterns 4976 if (getFMAPatterns(Root, Patterns)) 4977 return true; 4978 4979 return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, 4980 DoRegPressureReduce); 4981 } 4982 4983 enum class FMAInstKind { Default, Indexed, Accumulator }; 4984 /// genFusedMultiply - Generate fused multiply instructions. 4985 /// This function supports both integer and floating point instructions. 4986 /// A typical example: 4987 /// F|MUL I=A,B,0 4988 /// F|ADD R,I,C 4989 /// ==> F|MADD R,A,B,C 4990 /// \param MF Containing MachineFunction 4991 /// \param MRI Register information 4992 /// \param TII Target information 4993 /// \param Root is the F|ADD instruction 4994 /// \param [out] InsInstrs is a vector of machine instructions and will 4995 /// contain the generated madd instruction 4996 /// \param IdxMulOpd is index of operand in Root that is the result of 4997 /// the F|MUL. In the example above IdxMulOpd is 1. 4998 /// \param MaddOpc the opcode fo the f|madd instruction 4999 /// \param RC Register class of operands 5000 /// \param kind of fma instruction (addressing mode) to be generated 5001 /// \param ReplacedAddend is the result register from the instruction 5002 /// replacing the non-combined operand, if any. 5003 static MachineInstr * 5004 genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, 5005 const TargetInstrInfo *TII, MachineInstr &Root, 5006 SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, 5007 unsigned MaddOpc, const TargetRegisterClass *RC, 5008 FMAInstKind kind = FMAInstKind::Default, 5009 const Register *ReplacedAddend = nullptr) { 5010 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 5011 5012 unsigned IdxOtherOpd = IdxMulOpd == 1 ? 
2 : 1; 5013 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 5014 Register ResultReg = Root.getOperand(0).getReg(); 5015 Register SrcReg0 = MUL->getOperand(1).getReg(); 5016 bool Src0IsKill = MUL->getOperand(1).isKill(); 5017 Register SrcReg1 = MUL->getOperand(2).getReg(); 5018 bool Src1IsKill = MUL->getOperand(2).isKill(); 5019 5020 unsigned SrcReg2; 5021 bool Src2IsKill; 5022 if (ReplacedAddend) { 5023 // If we just generated a new addend, we must be it's only use. 5024 SrcReg2 = *ReplacedAddend; 5025 Src2IsKill = true; 5026 } else { 5027 SrcReg2 = Root.getOperand(IdxOtherOpd).getReg(); 5028 Src2IsKill = Root.getOperand(IdxOtherOpd).isKill(); 5029 } 5030 5031 if (Register::isVirtualRegister(ResultReg)) 5032 MRI.constrainRegClass(ResultReg, RC); 5033 if (Register::isVirtualRegister(SrcReg0)) 5034 MRI.constrainRegClass(SrcReg0, RC); 5035 if (Register::isVirtualRegister(SrcReg1)) 5036 MRI.constrainRegClass(SrcReg1, RC); 5037 if (Register::isVirtualRegister(SrcReg2)) 5038 MRI.constrainRegClass(SrcReg2, RC); 5039 5040 MachineInstrBuilder MIB; 5041 if (kind == FMAInstKind::Default) 5042 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 5043 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5044 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5045 .addReg(SrcReg2, getKillRegState(Src2IsKill)); 5046 else if (kind == FMAInstKind::Indexed) 5047 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 5048 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 5049 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5050 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5051 .addImm(MUL->getOperand(3).getImm()); 5052 else if (kind == FMAInstKind::Accumulator) 5053 MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 5054 .addReg(SrcReg2, getKillRegState(Src2IsKill)) 5055 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5056 .addReg(SrcReg1, getKillRegState(Src1IsKill)); 5057 else 5058 assert(false && "Invalid FMA instruction 
kind \n"); 5059 // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) 5060 InsInstrs.push_back(MIB); 5061 return MUL; 5062 } 5063 5064 /// genFusedMultiplyAcc - Helper to generate fused multiply accumulate 5065 /// instructions. 5066 /// 5067 /// \see genFusedMultiply 5068 static MachineInstr *genFusedMultiplyAcc( 5069 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5070 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5071 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 5072 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5073 FMAInstKind::Accumulator); 5074 } 5075 5076 /// genNeg - Helper to generate an intermediate negation of the second operand 5077 /// of Root 5078 static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, 5079 const TargetInstrInfo *TII, MachineInstr &Root, 5080 SmallVectorImpl<MachineInstr *> &InsInstrs, 5081 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, 5082 unsigned MnegOpc, const TargetRegisterClass *RC) { 5083 Register NewVR = MRI.createVirtualRegister(RC); 5084 MachineInstrBuilder MIB = 5085 BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR) 5086 .add(Root.getOperand(2)); 5087 InsInstrs.push_back(MIB); 5088 5089 assert(InstrIdxForVirtReg.empty()); 5090 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5091 5092 return NewVR; 5093 } 5094 5095 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 5096 /// instructions with an additional negation of the accumulator 5097 static MachineInstr *genFusedMultiplyAccNeg( 5098 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5099 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5100 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 5101 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 5102 assert(IdxMulOpd == 1); 5103 5104 Register NewVR = 5105 genNeg(MF, MRI, TII, Root, InsInstrs, 
InstrIdxForVirtReg, MnegOpc, RC); 5106 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5107 FMAInstKind::Accumulator, &NewVR); 5108 } 5109 5110 /// genFusedMultiplyIdx - Helper to generate fused multiply accumulate 5111 /// instructions. 5112 /// 5113 /// \see genFusedMultiply 5114 static MachineInstr *genFusedMultiplyIdx( 5115 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5116 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5117 unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { 5118 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5119 FMAInstKind::Indexed); 5120 } 5121 5122 /// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate 5123 /// instructions with an additional negation of the accumulator 5124 static MachineInstr *genFusedMultiplyIdxNeg( 5125 MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, 5126 MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, 5127 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, 5128 unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { 5129 assert(IdxMulOpd == 1); 5130 5131 Register NewVR = 5132 genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); 5133 5134 return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, 5135 FMAInstKind::Indexed, &NewVR); 5136 } 5137 5138 /// genMaddR - Generate madd instruction and combine mul and add using 5139 /// an extra virtual register 5140 /// Example - an ADD intermediate needs to be stored in a register: 5141 /// MUL I=A,B,0 5142 /// ADD R,I,Imm 5143 /// ==> ORR V, ZR, Imm 5144 /// ==> MADD R,A,B,V 5145 /// \param MF Containing MachineFunction 5146 /// \param MRI Register information 5147 /// \param TII Target information 5148 /// \param Root is the ADD instruction 5149 /// \param [out] InsInstrs is a vector of machine instructions and will 
5150 /// contain the generated madd instruction 5151 /// \param IdxMulOpd is index of operand in Root that is the result of 5152 /// the MUL. In the example above IdxMulOpd is 1. 5153 /// \param MaddOpc the opcode fo the madd instruction 5154 /// \param VR is a virtual register that holds the value of an ADD operand 5155 /// (V in the example above). 5156 /// \param RC Register class of operands 5157 static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, 5158 const TargetInstrInfo *TII, MachineInstr &Root, 5159 SmallVectorImpl<MachineInstr *> &InsInstrs, 5160 unsigned IdxMulOpd, unsigned MaddOpc, unsigned VR, 5161 const TargetRegisterClass *RC) { 5162 assert(IdxMulOpd == 1 || IdxMulOpd == 2); 5163 5164 MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); 5165 Register ResultReg = Root.getOperand(0).getReg(); 5166 Register SrcReg0 = MUL->getOperand(1).getReg(); 5167 bool Src0IsKill = MUL->getOperand(1).isKill(); 5168 Register SrcReg1 = MUL->getOperand(2).getReg(); 5169 bool Src1IsKill = MUL->getOperand(2).isKill(); 5170 5171 if (Register::isVirtualRegister(ResultReg)) 5172 MRI.constrainRegClass(ResultReg, RC); 5173 if (Register::isVirtualRegister(SrcReg0)) 5174 MRI.constrainRegClass(SrcReg0, RC); 5175 if (Register::isVirtualRegister(SrcReg1)) 5176 MRI.constrainRegClass(SrcReg1, RC); 5177 if (Register::isVirtualRegister(VR)) 5178 MRI.constrainRegClass(VR, RC); 5179 5180 MachineInstrBuilder MIB = 5181 BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) 5182 .addReg(SrcReg0, getKillRegState(Src0IsKill)) 5183 .addReg(SrcReg1, getKillRegState(Src1IsKill)) 5184 .addReg(VR); 5185 // Insert the MADD 5186 InsInstrs.push_back(MIB); 5187 return MUL; 5188 } 5189 5190 /// When getMachineCombinerPatterns() finds potential patterns, 5191 /// this function generates the instructions that could replace the 5192 /// original code sequence 5193 void AArch64InstrInfo::genAlternativeCodeSequence( 5194 MachineInstr &Root, 
MachineCombinerPattern Pattern, 5195 SmallVectorImpl<MachineInstr *> &InsInstrs, 5196 SmallVectorImpl<MachineInstr *> &DelInstrs, 5197 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { 5198 MachineBasicBlock &MBB = *Root.getParent(); 5199 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 5200 MachineFunction &MF = *MBB.getParent(); 5201 const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); 5202 5203 MachineInstr *MUL = nullptr; 5204 const TargetRegisterClass *RC; 5205 unsigned Opc; 5206 switch (Pattern) { 5207 default: 5208 // Reassociate instructions. 5209 TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, 5210 DelInstrs, InstrIdxForVirtReg); 5211 return; 5212 case MachineCombinerPattern::MULADDW_OP1: 5213 case MachineCombinerPattern::MULADDX_OP1: 5214 // MUL I=A,B,0 5215 // ADD R,I,C 5216 // ==> MADD R,A,B,C 5217 // --- Create(MADD); 5218 if (Pattern == MachineCombinerPattern::MULADDW_OP1) { 5219 Opc = AArch64::MADDWrrr; 5220 RC = &AArch64::GPR32RegClass; 5221 } else { 5222 Opc = AArch64::MADDXrrr; 5223 RC = &AArch64::GPR64RegClass; 5224 } 5225 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5226 break; 5227 case MachineCombinerPattern::MULADDW_OP2: 5228 case MachineCombinerPattern::MULADDX_OP2: 5229 // MUL I=A,B,0 5230 // ADD R,C,I 5231 // ==> MADD R,A,B,C 5232 // --- Create(MADD); 5233 if (Pattern == MachineCombinerPattern::MULADDW_OP2) { 5234 Opc = AArch64::MADDWrrr; 5235 RC = &AArch64::GPR32RegClass; 5236 } else { 5237 Opc = AArch64::MADDXrrr; 5238 RC = &AArch64::GPR64RegClass; 5239 } 5240 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5241 break; 5242 case MachineCombinerPattern::MULADDWI_OP1: 5243 case MachineCombinerPattern::MULADDXI_OP1: { 5244 // MUL I=A,B,0 5245 // ADD R,I,Imm 5246 // ==> ORR V, ZR, Imm 5247 // ==> MADD R,A,B,V 5248 // --- Create(MADD); 5249 const TargetRegisterClass *OrrRC; 5250 unsigned BitSize, OrrOpc, ZeroReg; 5251 if (Pattern == 
MachineCombinerPattern::MULADDWI_OP1) { 5252 OrrOpc = AArch64::ORRWri; 5253 OrrRC = &AArch64::GPR32spRegClass; 5254 BitSize = 32; 5255 ZeroReg = AArch64::WZR; 5256 Opc = AArch64::MADDWrrr; 5257 RC = &AArch64::GPR32RegClass; 5258 } else { 5259 OrrOpc = AArch64::ORRXri; 5260 OrrRC = &AArch64::GPR64spRegClass; 5261 BitSize = 64; 5262 ZeroReg = AArch64::XZR; 5263 Opc = AArch64::MADDXrrr; 5264 RC = &AArch64::GPR64RegClass; 5265 } 5266 Register NewVR = MRI.createVirtualRegister(OrrRC); 5267 uint64_t Imm = Root.getOperand(2).getImm(); 5268 5269 if (Root.getOperand(3).isImm()) { 5270 unsigned Val = Root.getOperand(3).getImm(); 5271 Imm = Imm << Val; 5272 } 5273 uint64_t UImm = SignExtend64(Imm, BitSize); 5274 uint64_t Encoding; 5275 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 5276 MachineInstrBuilder MIB1 = 5277 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 5278 .addReg(ZeroReg) 5279 .addImm(Encoding); 5280 InsInstrs.push_back(MIB1); 5281 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5282 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5283 } 5284 break; 5285 } 5286 case MachineCombinerPattern::MULSUBW_OP1: 5287 case MachineCombinerPattern::MULSUBX_OP1: { 5288 // MUL I=A,B,0 5289 // SUB R,I, C 5290 // ==> SUB V, 0, C 5291 // ==> MADD R,A,B,V // = -C + A*B 5292 // --- Create(MADD); 5293 const TargetRegisterClass *SubRC; 5294 unsigned SubOpc, ZeroReg; 5295 if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { 5296 SubOpc = AArch64::SUBWrr; 5297 SubRC = &AArch64::GPR32spRegClass; 5298 ZeroReg = AArch64::WZR; 5299 Opc = AArch64::MADDWrrr; 5300 RC = &AArch64::GPR32RegClass; 5301 } else { 5302 SubOpc = AArch64::SUBXrr; 5303 SubRC = &AArch64::GPR64spRegClass; 5304 ZeroReg = AArch64::XZR; 5305 Opc = AArch64::MADDXrrr; 5306 RC = &AArch64::GPR64RegClass; 5307 } 5308 Register NewVR = MRI.createVirtualRegister(SubRC); 5309 // SUB NewVR, 0, C 5310 MachineInstrBuilder MIB1 = 5311 BuildMI(MF, Root.getDebugLoc(), 
TII->get(SubOpc), NewVR) 5312 .addReg(ZeroReg) 5313 .add(Root.getOperand(2)); 5314 InsInstrs.push_back(MIB1); 5315 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5316 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5317 break; 5318 } 5319 case MachineCombinerPattern::MULSUBW_OP2: 5320 case MachineCombinerPattern::MULSUBX_OP2: 5321 // MUL I=A,B,0 5322 // SUB R,C,I 5323 // ==> MSUB R,A,B,C (computes C - A*B) 5324 // --- Create(MSUB); 5325 if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { 5326 Opc = AArch64::MSUBWrrr; 5327 RC = &AArch64::GPR32RegClass; 5328 } else { 5329 Opc = AArch64::MSUBXrrr; 5330 RC = &AArch64::GPR64RegClass; 5331 } 5332 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5333 break; 5334 case MachineCombinerPattern::MULSUBWI_OP1: 5335 case MachineCombinerPattern::MULSUBXI_OP1: { 5336 // MUL I=A,B,0 5337 // SUB R,I, Imm 5338 // ==> ORR V, ZR, -Imm 5339 // ==> MADD R,A,B,V // = -Imm + A*B 5340 // --- Create(MADD); 5341 const TargetRegisterClass *OrrRC; 5342 unsigned BitSize, OrrOpc, ZeroReg; 5343 if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { 5344 OrrOpc = AArch64::ORRWri; 5345 OrrRC = &AArch64::GPR32spRegClass; 5346 BitSize = 32; 5347 ZeroReg = AArch64::WZR; 5348 Opc = AArch64::MADDWrrr; 5349 RC = &AArch64::GPR32RegClass; 5350 } else { 5351 OrrOpc = AArch64::ORRXri; 5352 OrrRC = &AArch64::GPR64spRegClass; 5353 BitSize = 64; 5354 ZeroReg = AArch64::XZR; 5355 Opc = AArch64::MADDXrrr; 5356 RC = &AArch64::GPR64RegClass; 5357 } 5358 Register NewVR = MRI.createVirtualRegister(OrrRC); 5359 uint64_t Imm = Root.getOperand(2).getImm(); 5360 if (Root.getOperand(3).isImm()) { 5361 unsigned Val = Root.getOperand(3).getImm(); 5362 Imm = Imm << Val; 5363 } 5364 uint64_t UImm = SignExtend64(-Imm, BitSize); 5365 uint64_t Encoding; 5366 if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { 5367 MachineInstrBuilder MIB1 = 5368 BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR) 5369 
.addReg(ZeroReg) 5370 .addImm(Encoding); 5371 InsInstrs.push_back(MIB1); 5372 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5373 MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); 5374 } 5375 break; 5376 } 5377 5378 case MachineCombinerPattern::MULADDv8i8_OP1: 5379 Opc = AArch64::MLAv8i8; 5380 RC = &AArch64::FPR64RegClass; 5381 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5382 break; 5383 case MachineCombinerPattern::MULADDv8i8_OP2: 5384 Opc = AArch64::MLAv8i8; 5385 RC = &AArch64::FPR64RegClass; 5386 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5387 break; 5388 case MachineCombinerPattern::MULADDv16i8_OP1: 5389 Opc = AArch64::MLAv16i8; 5390 RC = &AArch64::FPR128RegClass; 5391 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5392 break; 5393 case MachineCombinerPattern::MULADDv16i8_OP2: 5394 Opc = AArch64::MLAv16i8; 5395 RC = &AArch64::FPR128RegClass; 5396 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5397 break; 5398 case MachineCombinerPattern::MULADDv4i16_OP1: 5399 Opc = AArch64::MLAv4i16; 5400 RC = &AArch64::FPR64RegClass; 5401 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5402 break; 5403 case MachineCombinerPattern::MULADDv4i16_OP2: 5404 Opc = AArch64::MLAv4i16; 5405 RC = &AArch64::FPR64RegClass; 5406 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5407 break; 5408 case MachineCombinerPattern::MULADDv8i16_OP1: 5409 Opc = AArch64::MLAv8i16; 5410 RC = &AArch64::FPR128RegClass; 5411 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5412 break; 5413 case MachineCombinerPattern::MULADDv8i16_OP2: 5414 Opc = AArch64::MLAv8i16; 5415 RC = &AArch64::FPR128RegClass; 5416 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5417 break; 5418 case MachineCombinerPattern::MULADDv2i32_OP1: 5419 Opc = AArch64::MLAv2i32; 5420 RC = &AArch64::FPR64RegClass; 5421 MUL = 
genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5422 break; 5423 case MachineCombinerPattern::MULADDv2i32_OP2: 5424 Opc = AArch64::MLAv2i32; 5425 RC = &AArch64::FPR64RegClass; 5426 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5427 break; 5428 case MachineCombinerPattern::MULADDv4i32_OP1: 5429 Opc = AArch64::MLAv4i32; 5430 RC = &AArch64::FPR128RegClass; 5431 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5432 break; 5433 case MachineCombinerPattern::MULADDv4i32_OP2: 5434 Opc = AArch64::MLAv4i32; 5435 RC = &AArch64::FPR128RegClass; 5436 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5437 break; 5438 5439 case MachineCombinerPattern::MULSUBv8i8_OP1: 5440 Opc = AArch64::MLAv8i8; 5441 RC = &AArch64::FPR64RegClass; 5442 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5443 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, 5444 RC); 5445 break; 5446 case MachineCombinerPattern::MULSUBv8i8_OP2: 5447 Opc = AArch64::MLSv8i8; 5448 RC = &AArch64::FPR64RegClass; 5449 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5450 break; 5451 case MachineCombinerPattern::MULSUBv16i8_OP1: 5452 Opc = AArch64::MLAv16i8; 5453 RC = &AArch64::FPR128RegClass; 5454 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5455 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, 5456 RC); 5457 break; 5458 case MachineCombinerPattern::MULSUBv16i8_OP2: 5459 Opc = AArch64::MLSv16i8; 5460 RC = &AArch64::FPR128RegClass; 5461 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5462 break; 5463 case MachineCombinerPattern::MULSUBv4i16_OP1: 5464 Opc = AArch64::MLAv4i16; 5465 RC = &AArch64::FPR64RegClass; 5466 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5467 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 5468 RC); 5469 break; 5470 case MachineCombinerPattern::MULSUBv4i16_OP2: 5471 Opc = AArch64::MLSv4i16; 5472 RC = &AArch64::FPR64RegClass; 5473 MUL = 
genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5474 break; 5475 case MachineCombinerPattern::MULSUBv8i16_OP1: 5476 Opc = AArch64::MLAv8i16; 5477 RC = &AArch64::FPR128RegClass; 5478 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5479 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 5480 RC); 5481 break; 5482 case MachineCombinerPattern::MULSUBv8i16_OP2: 5483 Opc = AArch64::MLSv8i16; 5484 RC = &AArch64::FPR128RegClass; 5485 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5486 break; 5487 case MachineCombinerPattern::MULSUBv2i32_OP1: 5488 Opc = AArch64::MLAv2i32; 5489 RC = &AArch64::FPR64RegClass; 5490 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5491 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 5492 RC); 5493 break; 5494 case MachineCombinerPattern::MULSUBv2i32_OP2: 5495 Opc = AArch64::MLSv2i32; 5496 RC = &AArch64::FPR64RegClass; 5497 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5498 break; 5499 case MachineCombinerPattern::MULSUBv4i32_OP1: 5500 Opc = AArch64::MLAv4i32; 5501 RC = &AArch64::FPR128RegClass; 5502 MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, 5503 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 5504 RC); 5505 break; 5506 case MachineCombinerPattern::MULSUBv4i32_OP2: 5507 Opc = AArch64::MLSv4i32; 5508 RC = &AArch64::FPR128RegClass; 5509 MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5510 break; 5511 5512 case MachineCombinerPattern::MULADDv4i16_indexed_OP1: 5513 Opc = AArch64::MLAv4i16_indexed; 5514 RC = &AArch64::FPR64RegClass; 5515 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5516 break; 5517 case MachineCombinerPattern::MULADDv4i16_indexed_OP2: 5518 Opc = AArch64::MLAv4i16_indexed; 5519 RC = &AArch64::FPR64RegClass; 5520 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5521 break; 5522 case MachineCombinerPattern::MULADDv8i16_indexed_OP1: 5523 Opc = AArch64::MLAv8i16_indexed; 5524 
RC = &AArch64::FPR128RegClass; 5525 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5526 break; 5527 case MachineCombinerPattern::MULADDv8i16_indexed_OP2: 5528 Opc = AArch64::MLAv8i16_indexed; 5529 RC = &AArch64::FPR128RegClass; 5530 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5531 break; 5532 case MachineCombinerPattern::MULADDv2i32_indexed_OP1: 5533 Opc = AArch64::MLAv2i32_indexed; 5534 RC = &AArch64::FPR64RegClass; 5535 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5536 break; 5537 case MachineCombinerPattern::MULADDv2i32_indexed_OP2: 5538 Opc = AArch64::MLAv2i32_indexed; 5539 RC = &AArch64::FPR64RegClass; 5540 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5541 break; 5542 case MachineCombinerPattern::MULADDv4i32_indexed_OP1: 5543 Opc = AArch64::MLAv4i32_indexed; 5544 RC = &AArch64::FPR128RegClass; 5545 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5546 break; 5547 case MachineCombinerPattern::MULADDv4i32_indexed_OP2: 5548 Opc = AArch64::MLAv4i32_indexed; 5549 RC = &AArch64::FPR128RegClass; 5550 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5551 break; 5552 5553 case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: 5554 Opc = AArch64::MLAv4i16_indexed; 5555 RC = &AArch64::FPR64RegClass; 5556 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5557 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, 5558 RC); 5559 break; 5560 case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: 5561 Opc = AArch64::MLSv4i16_indexed; 5562 RC = &AArch64::FPR64RegClass; 5563 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5564 break; 5565 case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: 5566 Opc = AArch64::MLAv8i16_indexed; 5567 RC = &AArch64::FPR128RegClass; 5568 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5569 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, 5570 RC); 5571 break; 5572 case 
MachineCombinerPattern::MULSUBv8i16_indexed_OP2: 5573 Opc = AArch64::MLSv8i16_indexed; 5574 RC = &AArch64::FPR128RegClass; 5575 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5576 break; 5577 case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: 5578 Opc = AArch64::MLAv2i32_indexed; 5579 RC = &AArch64::FPR64RegClass; 5580 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5581 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, 5582 RC); 5583 break; 5584 case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: 5585 Opc = AArch64::MLSv2i32_indexed; 5586 RC = &AArch64::FPR64RegClass; 5587 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5588 break; 5589 case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: 5590 Opc = AArch64::MLAv4i32_indexed; 5591 RC = &AArch64::FPR128RegClass; 5592 MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, 5593 InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, 5594 RC); 5595 break; 5596 case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: 5597 Opc = AArch64::MLSv4i32_indexed; 5598 RC = &AArch64::FPR128RegClass; 5599 MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5600 break; 5601 5602 // Floating Point Support 5603 case MachineCombinerPattern::FMULADDH_OP1: 5604 Opc = AArch64::FMADDHrrr; 5605 RC = &AArch64::FPR16RegClass; 5606 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5607 break; 5608 case MachineCombinerPattern::FMULADDS_OP1: 5609 Opc = AArch64::FMADDSrrr; 5610 RC = &AArch64::FPR32RegClass; 5611 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5612 break; 5613 case MachineCombinerPattern::FMULADDD_OP1: 5614 Opc = AArch64::FMADDDrrr; 5615 RC = &AArch64::FPR64RegClass; 5616 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5617 break; 5618 5619 case MachineCombinerPattern::FMULADDH_OP2: 5620 Opc = AArch64::FMADDHrrr; 5621 RC = &AArch64::FPR16RegClass; 5622 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 
2, Opc, RC); 5623 break; 5624 case MachineCombinerPattern::FMULADDS_OP2: 5625 Opc = AArch64::FMADDSrrr; 5626 RC = &AArch64::FPR32RegClass; 5627 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5628 break; 5629 case MachineCombinerPattern::FMULADDD_OP2: 5630 Opc = AArch64::FMADDDrrr; 5631 RC = &AArch64::FPR64RegClass; 5632 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5633 break; 5634 5635 case MachineCombinerPattern::FMLAv1i32_indexed_OP1: 5636 Opc = AArch64::FMLAv1i32_indexed; 5637 RC = &AArch64::FPR32RegClass; 5638 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5639 FMAInstKind::Indexed); 5640 break; 5641 case MachineCombinerPattern::FMLAv1i32_indexed_OP2: 5642 Opc = AArch64::FMLAv1i32_indexed; 5643 RC = &AArch64::FPR32RegClass; 5644 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5645 FMAInstKind::Indexed); 5646 break; 5647 5648 case MachineCombinerPattern::FMLAv1i64_indexed_OP1: 5649 Opc = AArch64::FMLAv1i64_indexed; 5650 RC = &AArch64::FPR64RegClass; 5651 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5652 FMAInstKind::Indexed); 5653 break; 5654 case MachineCombinerPattern::FMLAv1i64_indexed_OP2: 5655 Opc = AArch64::FMLAv1i64_indexed; 5656 RC = &AArch64::FPR64RegClass; 5657 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5658 FMAInstKind::Indexed); 5659 break; 5660 5661 case MachineCombinerPattern::FMLAv4i16_indexed_OP1: 5662 RC = &AArch64::FPR64RegClass; 5663 Opc = AArch64::FMLAv4i16_indexed; 5664 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5665 FMAInstKind::Indexed); 5666 break; 5667 case MachineCombinerPattern::FMLAv4f16_OP1: 5668 RC = &AArch64::FPR64RegClass; 5669 Opc = AArch64::FMLAv4f16; 5670 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5671 FMAInstKind::Accumulator); 5672 break; 5673 case MachineCombinerPattern::FMLAv4i16_indexed_OP2: 5674 RC = &AArch64::FPR64RegClass; 5675 Opc = 
AArch64::FMLAv4i16_indexed; 5676 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5677 FMAInstKind::Indexed); 5678 break; 5679 case MachineCombinerPattern::FMLAv4f16_OP2: 5680 RC = &AArch64::FPR64RegClass; 5681 Opc = AArch64::FMLAv4f16; 5682 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5683 FMAInstKind::Accumulator); 5684 break; 5685 5686 case MachineCombinerPattern::FMLAv2i32_indexed_OP1: 5687 case MachineCombinerPattern::FMLAv2f32_OP1: 5688 RC = &AArch64::FPR64RegClass; 5689 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { 5690 Opc = AArch64::FMLAv2i32_indexed; 5691 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5692 FMAInstKind::Indexed); 5693 } else { 5694 Opc = AArch64::FMLAv2f32; 5695 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5696 FMAInstKind::Accumulator); 5697 } 5698 break; 5699 case MachineCombinerPattern::FMLAv2i32_indexed_OP2: 5700 case MachineCombinerPattern::FMLAv2f32_OP2: 5701 RC = &AArch64::FPR64RegClass; 5702 if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { 5703 Opc = AArch64::FMLAv2i32_indexed; 5704 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5705 FMAInstKind::Indexed); 5706 } else { 5707 Opc = AArch64::FMLAv2f32; 5708 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5709 FMAInstKind::Accumulator); 5710 } 5711 break; 5712 5713 case MachineCombinerPattern::FMLAv8i16_indexed_OP1: 5714 RC = &AArch64::FPR128RegClass; 5715 Opc = AArch64::FMLAv8i16_indexed; 5716 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5717 FMAInstKind::Indexed); 5718 break; 5719 case MachineCombinerPattern::FMLAv8f16_OP1: 5720 RC = &AArch64::FPR128RegClass; 5721 Opc = AArch64::FMLAv8f16; 5722 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5723 FMAInstKind::Accumulator); 5724 break; 5725 case MachineCombinerPattern::FMLAv8i16_indexed_OP2: 5726 RC = &AArch64::FPR128RegClass; 5727 Opc = 
AArch64::FMLAv8i16_indexed; 5728 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5729 FMAInstKind::Indexed); 5730 break; 5731 case MachineCombinerPattern::FMLAv8f16_OP2: 5732 RC = &AArch64::FPR128RegClass; 5733 Opc = AArch64::FMLAv8f16; 5734 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5735 FMAInstKind::Accumulator); 5736 break; 5737 5738 case MachineCombinerPattern::FMLAv2i64_indexed_OP1: 5739 case MachineCombinerPattern::FMLAv2f64_OP1: 5740 RC = &AArch64::FPR128RegClass; 5741 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { 5742 Opc = AArch64::FMLAv2i64_indexed; 5743 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5744 FMAInstKind::Indexed); 5745 } else { 5746 Opc = AArch64::FMLAv2f64; 5747 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5748 FMAInstKind::Accumulator); 5749 } 5750 break; 5751 case MachineCombinerPattern::FMLAv2i64_indexed_OP2: 5752 case MachineCombinerPattern::FMLAv2f64_OP2: 5753 RC = &AArch64::FPR128RegClass; 5754 if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { 5755 Opc = AArch64::FMLAv2i64_indexed; 5756 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5757 FMAInstKind::Indexed); 5758 } else { 5759 Opc = AArch64::FMLAv2f64; 5760 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5761 FMAInstKind::Accumulator); 5762 } 5763 break; 5764 5765 case MachineCombinerPattern::FMLAv4i32_indexed_OP1: 5766 case MachineCombinerPattern::FMLAv4f32_OP1: 5767 RC = &AArch64::FPR128RegClass; 5768 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { 5769 Opc = AArch64::FMLAv4i32_indexed; 5770 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5771 FMAInstKind::Indexed); 5772 } else { 5773 Opc = AArch64::FMLAv4f32; 5774 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5775 FMAInstKind::Accumulator); 5776 } 5777 break; 5778 5779 case MachineCombinerPattern::FMLAv4i32_indexed_OP2: 5780 case 
MachineCombinerPattern::FMLAv4f32_OP2: 5781 RC = &AArch64::FPR128RegClass; 5782 if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { 5783 Opc = AArch64::FMLAv4i32_indexed; 5784 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5785 FMAInstKind::Indexed); 5786 } else { 5787 Opc = AArch64::FMLAv4f32; 5788 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5789 FMAInstKind::Accumulator); 5790 } 5791 break; 5792 5793 case MachineCombinerPattern::FMULSUBH_OP1: 5794 Opc = AArch64::FNMSUBHrrr; 5795 RC = &AArch64::FPR16RegClass; 5796 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5797 break; 5798 case MachineCombinerPattern::FMULSUBS_OP1: 5799 Opc = AArch64::FNMSUBSrrr; 5800 RC = &AArch64::FPR32RegClass; 5801 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5802 break; 5803 case MachineCombinerPattern::FMULSUBD_OP1: 5804 Opc = AArch64::FNMSUBDrrr; 5805 RC = &AArch64::FPR64RegClass; 5806 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5807 break; 5808 5809 case MachineCombinerPattern::FNMULSUBH_OP1: 5810 Opc = AArch64::FNMADDHrrr; 5811 RC = &AArch64::FPR16RegClass; 5812 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5813 break; 5814 case MachineCombinerPattern::FNMULSUBS_OP1: 5815 Opc = AArch64::FNMADDSrrr; 5816 RC = &AArch64::FPR32RegClass; 5817 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5818 break; 5819 case MachineCombinerPattern::FNMULSUBD_OP1: 5820 Opc = AArch64::FNMADDDrrr; 5821 RC = &AArch64::FPR64RegClass; 5822 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); 5823 break; 5824 5825 case MachineCombinerPattern::FMULSUBH_OP2: 5826 Opc = AArch64::FMSUBHrrr; 5827 RC = &AArch64::FPR16RegClass; 5828 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5829 break; 5830 case MachineCombinerPattern::FMULSUBS_OP2: 5831 Opc = AArch64::FMSUBSrrr; 5832 RC = &AArch64::FPR32RegClass; 5833 MUL = 
genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5834 break; 5835 case MachineCombinerPattern::FMULSUBD_OP2: 5836 Opc = AArch64::FMSUBDrrr; 5837 RC = &AArch64::FPR64RegClass; 5838 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); 5839 break; 5840 5841 case MachineCombinerPattern::FMLSv1i32_indexed_OP2: 5842 Opc = AArch64::FMLSv1i32_indexed; 5843 RC = &AArch64::FPR32RegClass; 5844 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5845 FMAInstKind::Indexed); 5846 break; 5847 5848 case MachineCombinerPattern::FMLSv1i64_indexed_OP2: 5849 Opc = AArch64::FMLSv1i64_indexed; 5850 RC = &AArch64::FPR64RegClass; 5851 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5852 FMAInstKind::Indexed); 5853 break; 5854 5855 case MachineCombinerPattern::FMLSv4f16_OP1: 5856 case MachineCombinerPattern::FMLSv4i16_indexed_OP1: { 5857 RC = &AArch64::FPR64RegClass; 5858 Register NewVR = MRI.createVirtualRegister(RC); 5859 MachineInstrBuilder MIB1 = 5860 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv4f16), NewVR) 5861 .add(Root.getOperand(2)); 5862 InsInstrs.push_back(MIB1); 5863 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5864 if (Pattern == MachineCombinerPattern::FMLSv4f16_OP1) { 5865 Opc = AArch64::FMLAv4f16; 5866 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5867 FMAInstKind::Accumulator, &NewVR); 5868 } else { 5869 Opc = AArch64::FMLAv4i16_indexed; 5870 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5871 FMAInstKind::Indexed, &NewVR); 5872 } 5873 break; 5874 } 5875 case MachineCombinerPattern::FMLSv4f16_OP2: 5876 RC = &AArch64::FPR64RegClass; 5877 Opc = AArch64::FMLSv4f16; 5878 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5879 FMAInstKind::Accumulator); 5880 break; 5881 case MachineCombinerPattern::FMLSv4i16_indexed_OP2: 5882 RC = &AArch64::FPR64RegClass; 5883 Opc = AArch64::FMLSv4i16_indexed; 5884 MUL = genFusedMultiply(MF, MRI, TII, Root, 
InsInstrs, 2, Opc, RC, 5885 FMAInstKind::Indexed); 5886 break; 5887 5888 case MachineCombinerPattern::FMLSv2f32_OP2: 5889 case MachineCombinerPattern::FMLSv2i32_indexed_OP2: 5890 RC = &AArch64::FPR64RegClass; 5891 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { 5892 Opc = AArch64::FMLSv2i32_indexed; 5893 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5894 FMAInstKind::Indexed); 5895 } else { 5896 Opc = AArch64::FMLSv2f32; 5897 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5898 FMAInstKind::Accumulator); 5899 } 5900 break; 5901 5902 case MachineCombinerPattern::FMLSv8f16_OP1: 5903 case MachineCombinerPattern::FMLSv8i16_indexed_OP1: { 5904 RC = &AArch64::FPR128RegClass; 5905 Register NewVR = MRI.createVirtualRegister(RC); 5906 MachineInstrBuilder MIB1 = 5907 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv8f16), NewVR) 5908 .add(Root.getOperand(2)); 5909 InsInstrs.push_back(MIB1); 5910 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5911 if (Pattern == MachineCombinerPattern::FMLSv8f16_OP1) { 5912 Opc = AArch64::FMLAv8f16; 5913 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5914 FMAInstKind::Accumulator, &NewVR); 5915 } else { 5916 Opc = AArch64::FMLAv8i16_indexed; 5917 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5918 FMAInstKind::Indexed, &NewVR); 5919 } 5920 break; 5921 } 5922 case MachineCombinerPattern::FMLSv8f16_OP2: 5923 RC = &AArch64::FPR128RegClass; 5924 Opc = AArch64::FMLSv8f16; 5925 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5926 FMAInstKind::Accumulator); 5927 break; 5928 case MachineCombinerPattern::FMLSv8i16_indexed_OP2: 5929 RC = &AArch64::FPR128RegClass; 5930 Opc = AArch64::FMLSv8i16_indexed; 5931 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5932 FMAInstKind::Indexed); 5933 break; 5934 5935 case MachineCombinerPattern::FMLSv2f64_OP2: 5936 case MachineCombinerPattern::FMLSv2i64_indexed_OP2: 5937 RC = 
&AArch64::FPR128RegClass; 5938 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { 5939 Opc = AArch64::FMLSv2i64_indexed; 5940 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5941 FMAInstKind::Indexed); 5942 } else { 5943 Opc = AArch64::FMLSv2f64; 5944 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5945 FMAInstKind::Accumulator); 5946 } 5947 break; 5948 5949 case MachineCombinerPattern::FMLSv4f32_OP2: 5950 case MachineCombinerPattern::FMLSv4i32_indexed_OP2: 5951 RC = &AArch64::FPR128RegClass; 5952 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { 5953 Opc = AArch64::FMLSv4i32_indexed; 5954 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5955 FMAInstKind::Indexed); 5956 } else { 5957 Opc = AArch64::FMLSv4f32; 5958 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, 5959 FMAInstKind::Accumulator); 5960 } 5961 break; 5962 case MachineCombinerPattern::FMLSv2f32_OP1: 5963 case MachineCombinerPattern::FMLSv2i32_indexed_OP1: { 5964 RC = &AArch64::FPR64RegClass; 5965 Register NewVR = MRI.createVirtualRegister(RC); 5966 MachineInstrBuilder MIB1 = 5967 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f32), NewVR) 5968 .add(Root.getOperand(2)); 5969 InsInstrs.push_back(MIB1); 5970 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5971 if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP1) { 5972 Opc = AArch64::FMLAv2i32_indexed; 5973 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5974 FMAInstKind::Indexed, &NewVR); 5975 } else { 5976 Opc = AArch64::FMLAv2f32; 5977 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5978 FMAInstKind::Accumulator, &NewVR); 5979 } 5980 break; 5981 } 5982 case MachineCombinerPattern::FMLSv4f32_OP1: 5983 case MachineCombinerPattern::FMLSv4i32_indexed_OP1: { 5984 RC = &AArch64::FPR128RegClass; 5985 Register NewVR = MRI.createVirtualRegister(RC); 5986 MachineInstrBuilder MIB1 = 5987 BuildMI(MF, 
Root.getDebugLoc(), TII->get(AArch64::FNEGv4f32), NewVR) 5988 .add(Root.getOperand(2)); 5989 InsInstrs.push_back(MIB1); 5990 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 5991 if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP1) { 5992 Opc = AArch64::FMLAv4i32_indexed; 5993 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5994 FMAInstKind::Indexed, &NewVR); 5995 } else { 5996 Opc = AArch64::FMLAv4f32; 5997 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 5998 FMAInstKind::Accumulator, &NewVR); 5999 } 6000 break; 6001 } 6002 case MachineCombinerPattern::FMLSv2f64_OP1: 6003 case MachineCombinerPattern::FMLSv2i64_indexed_OP1: { 6004 RC = &AArch64::FPR128RegClass; 6005 Register NewVR = MRI.createVirtualRegister(RC); 6006 MachineInstrBuilder MIB1 = 6007 BuildMI(MF, Root.getDebugLoc(), TII->get(AArch64::FNEGv2f64), NewVR) 6008 .add(Root.getOperand(2)); 6009 InsInstrs.push_back(MIB1); 6010 InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); 6011 if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP1) { 6012 Opc = AArch64::FMLAv2i64_indexed; 6013 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6014 FMAInstKind::Indexed, &NewVR); 6015 } else { 6016 Opc = AArch64::FMLAv2f64; 6017 MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, 6018 FMAInstKind::Accumulator, &NewVR); 6019 } 6020 break; 6021 } 6022 } // end switch (Pattern) 6023 // Record MUL and ADD/SUB for deletion 6024 // FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and 6025 // CodeGen/AArch64/urem-seteq-nonzero.ll. 6026 // assert(MUL && "MUL was never set"); 6027 DelInstrs.push_back(MUL); 6028 DelInstrs.push_back(&Root); 6029 } 6030 6031 /// Replace csincr-branch sequence by simple conditional branch 6032 /// 6033 /// Examples: 6034 /// 1. 
\code 6035 /// csinc w9, wzr, wzr, <condition code> 6036 /// tbnz w9, #0, 0x44 6037 /// \endcode 6038 /// to 6039 /// \code 6040 /// b.<inverted condition code> 6041 /// \endcode 6042 /// 6043 /// 2. \code 6044 /// csinc w9, wzr, wzr, <condition code> 6045 /// tbz w9, #0, 0x44 6046 /// \endcode 6047 /// to 6048 /// \code 6049 /// b.<condition code> 6050 /// \endcode 6051 /// 6052 /// Replace compare and branch sequence by TBZ/TBNZ instruction when the 6053 /// compare's constant operand is power of 2. 6054 /// 6055 /// Examples: 6056 /// \code 6057 /// and w8, w8, #0x400 6058 /// cbnz w8, L1 6059 /// \endcode 6060 /// to 6061 /// \code 6062 /// tbnz w8, #10, L1 6063 /// \endcode 6064 /// 6065 /// \param MI Conditional Branch 6066 /// \return True when the simple conditional branch is generated 6067 /// 6068 bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { 6069 bool IsNegativeBranch = false; 6070 bool IsTestAndBranch = false; 6071 unsigned TargetBBInMI = 0; 6072 switch (MI.getOpcode()) { 6073 default: 6074 llvm_unreachable("Unknown branch instruction?"); 6075 case AArch64::Bcc: 6076 return false; 6077 case AArch64::CBZW: 6078 case AArch64::CBZX: 6079 TargetBBInMI = 1; 6080 break; 6081 case AArch64::CBNZW: 6082 case AArch64::CBNZX: 6083 TargetBBInMI = 1; 6084 IsNegativeBranch = true; 6085 break; 6086 case AArch64::TBZW: 6087 case AArch64::TBZX: 6088 TargetBBInMI = 2; 6089 IsTestAndBranch = true; 6090 break; 6091 case AArch64::TBNZW: 6092 case AArch64::TBNZX: 6093 TargetBBInMI = 2; 6094 IsNegativeBranch = true; 6095 IsTestAndBranch = true; 6096 break; 6097 } 6098 // So we increment a zero register and test for bits other 6099 // than bit 0? Conservatively bail out in case the verifier 6100 // missed this case. 6101 if (IsTestAndBranch && MI.getOperand(1).getImm()) 6102 return false; 6103 6104 // Find Definition. 
6105 assert(MI.getParent() && "Incomplete machine instruciton\n"); 6106 MachineBasicBlock *MBB = MI.getParent(); 6107 MachineFunction *MF = MBB->getParent(); 6108 MachineRegisterInfo *MRI = &MF->getRegInfo(); 6109 Register VReg = MI.getOperand(0).getReg(); 6110 if (!Register::isVirtualRegister(VReg)) 6111 return false; 6112 6113 MachineInstr *DefMI = MRI->getVRegDef(VReg); 6114 6115 // Look through COPY instructions to find definition. 6116 while (DefMI->isCopy()) { 6117 Register CopyVReg = DefMI->getOperand(1).getReg(); 6118 if (!MRI->hasOneNonDBGUse(CopyVReg)) 6119 return false; 6120 if (!MRI->hasOneDef(CopyVReg)) 6121 return false; 6122 DefMI = MRI->getVRegDef(CopyVReg); 6123 } 6124 6125 switch (DefMI->getOpcode()) { 6126 default: 6127 return false; 6128 // Fold AND into a TBZ/TBNZ if constant operand is power of 2. 6129 case AArch64::ANDWri: 6130 case AArch64::ANDXri: { 6131 if (IsTestAndBranch) 6132 return false; 6133 if (DefMI->getParent() != MBB) 6134 return false; 6135 if (!MRI->hasOneNonDBGUse(VReg)) 6136 return false; 6137 6138 bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); 6139 uint64_t Mask = AArch64_AM::decodeLogicalImmediate( 6140 DefMI->getOperand(2).getImm(), Is32Bit ? 32 : 64); 6141 if (!isPowerOf2_64(Mask)) 6142 return false; 6143 6144 MachineOperand &MO = DefMI->getOperand(1); 6145 Register NewReg = MO.getReg(); 6146 if (!Register::isVirtualRegister(NewReg)) 6147 return false; 6148 6149 assert(!MRI->def_empty(NewReg) && "Register must be defined."); 6150 6151 MachineBasicBlock &RefToMBB = *MBB; 6152 MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); 6153 DebugLoc DL = MI.getDebugLoc(); 6154 unsigned Imm = Log2_64(Mask); 6155 unsigned Opc = (Imm < 32) 6156 ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) 6157 : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); 6158 MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) 6159 .addReg(NewReg) 6160 .addImm(Imm) 6161 .addMBB(TBB); 6162 // Register lives on to the CBZ now. 
6163 MO.setIsKill(false); 6164 6165 // For immediate smaller than 32, we need to use the 32-bit 6166 // variant (W) in all cases. Indeed the 64-bit variant does not 6167 // allow to encode them. 6168 // Therefore, if the input register is 64-bit, we need to take the 6169 // 32-bit sub-part. 6170 if (!Is32Bit && Imm < 32) 6171 NewMI->getOperand(0).setSubReg(AArch64::sub_32); 6172 MI.eraseFromParent(); 6173 return true; 6174 } 6175 // Look for CSINC 6176 case AArch64::CSINCWr: 6177 case AArch64::CSINCXr: { 6178 if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && 6179 DefMI->getOperand(2).getReg() == AArch64::WZR) && 6180 !(DefMI->getOperand(1).getReg() == AArch64::XZR && 6181 DefMI->getOperand(2).getReg() == AArch64::XZR)) 6182 return false; 6183 6184 if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) 6185 return false; 6186 6187 AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); 6188 // Convert only when the condition code is not modified between 6189 // the CSINC and the branch. The CC may be used by other 6190 // instructions in between. 
6191 if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) 6192 return false; 6193 MachineBasicBlock &RefToMBB = *MBB; 6194 MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); 6195 DebugLoc DL = MI.getDebugLoc(); 6196 if (IsNegativeBranch) 6197 CC = AArch64CC::getInvertedCondCode(CC); 6198 BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); 6199 MI.eraseFromParent(); 6200 return true; 6201 } 6202 } 6203 } 6204 6205 std::pair<unsigned, unsigned> 6206 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { 6207 const unsigned Mask = AArch64II::MO_FRAGMENT; 6208 return std::make_pair(TF & Mask, TF & ~Mask); 6209 } 6210 6211 ArrayRef<std::pair<unsigned, const char *>> 6212 AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { 6213 using namespace AArch64II; 6214 6215 static const std::pair<unsigned, const char *> TargetFlags[] = { 6216 {MO_PAGE, "aarch64-page"}, {MO_PAGEOFF, "aarch64-pageoff"}, 6217 {MO_G3, "aarch64-g3"}, {MO_G2, "aarch64-g2"}, 6218 {MO_G1, "aarch64-g1"}, {MO_G0, "aarch64-g0"}, 6219 {MO_HI12, "aarch64-hi12"}}; 6220 return makeArrayRef(TargetFlags); 6221 } 6222 6223 ArrayRef<std::pair<unsigned, const char *>> 6224 AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { 6225 using namespace AArch64II; 6226 6227 static const std::pair<unsigned, const char *> TargetFlags[] = { 6228 {MO_COFFSTUB, "aarch64-coffstub"}, 6229 {MO_GOT, "aarch64-got"}, 6230 {MO_NC, "aarch64-nc"}, 6231 {MO_S, "aarch64-s"}, 6232 {MO_TLS, "aarch64-tls"}, 6233 {MO_DLLIMPORT, "aarch64-dllimport"}, 6234 {MO_PREL, "aarch64-prel"}, 6235 {MO_TAGGED, "aarch64-tagged"}}; 6236 return makeArrayRef(TargetFlags); 6237 } 6238 6239 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> 6240 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { 6241 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = 6242 {{MOSuppressPair, 
"aarch64-suppress-pair"}, 6243 {MOStridedAccess, "aarch64-strided-access"}}; 6244 return makeArrayRef(TargetFlags); 6245 } 6246 6247 /// Constants defining how certain sequences should be outlined. 6248 /// This encompasses how an outlined function should be called, and what kind of 6249 /// frame should be emitted for that outlined function. 6250 /// 6251 /// \p MachineOutlinerDefault implies that the function should be called with 6252 /// a save and restore of LR to the stack. 6253 /// 6254 /// That is, 6255 /// 6256 /// I1 Save LR OUTLINED_FUNCTION: 6257 /// I2 --> BL OUTLINED_FUNCTION I1 6258 /// I3 Restore LR I2 6259 /// I3 6260 /// RET 6261 /// 6262 /// * Call construction overhead: 3 (save + BL + restore) 6263 /// * Frame construction overhead: 1 (ret) 6264 /// * Requires stack fixups? Yes 6265 /// 6266 /// \p MachineOutlinerTailCall implies that the function is being created from 6267 /// a sequence of instructions ending in a return. 6268 /// 6269 /// That is, 6270 /// 6271 /// I1 OUTLINED_FUNCTION: 6272 /// I2 --> B OUTLINED_FUNCTION I1 6273 /// RET I2 6274 /// RET 6275 /// 6276 /// * Call construction overhead: 1 (B) 6277 /// * Frame construction overhead: 0 (Return included in sequence) 6278 /// * Requires stack fixups? No 6279 /// 6280 /// \p MachineOutlinerNoLRSave implies that the function should be called using 6281 /// a BL instruction, but doesn't require LR to be saved and restored. This 6282 /// happens when LR is known to be dead. 6283 /// 6284 /// That is, 6285 /// 6286 /// I1 OUTLINED_FUNCTION: 6287 /// I2 --> BL OUTLINED_FUNCTION I1 6288 /// I3 I2 6289 /// I3 6290 /// RET 6291 /// 6292 /// * Call construction overhead: 1 (BL) 6293 /// * Frame construction overhead: 1 (RET) 6294 /// * Requires stack fixups? No 6295 /// 6296 /// \p MachineOutlinerThunk implies that the function is being created from 6297 /// a sequence of instructions ending in a call. 
The outlined function is 6298 /// called with a BL instruction, and the outlined function tail-calls the 6299 /// original call destination. 6300 /// 6301 /// That is, 6302 /// 6303 /// I1 OUTLINED_FUNCTION: 6304 /// I2 --> BL OUTLINED_FUNCTION I1 6305 /// BL f I2 6306 /// B f 6307 /// * Call construction overhead: 1 (BL) 6308 /// * Frame construction overhead: 0 6309 /// * Requires stack fixups? No 6310 /// 6311 /// \p MachineOutlinerRegSave implies that the function should be called with a 6312 /// save and restore of LR to an available register. This allows us to avoid 6313 /// stack fixups. Note that this outlining variant is compatible with the 6314 /// NoLRSave case. 6315 /// 6316 /// That is, 6317 /// 6318 /// I1 Save LR OUTLINED_FUNCTION: 6319 /// I2 --> BL OUTLINED_FUNCTION I1 6320 /// I3 Restore LR I2 6321 /// I3 6322 /// RET 6323 /// 6324 /// * Call construction overhead: 3 (save + BL + restore) 6325 /// * Frame construction overhead: 1 (ret) 6326 /// * Requires stack fixups? No 6327 enum MachineOutlinerClass { 6328 MachineOutlinerDefault, /// Emit a save, restore, call, and return. 6329 MachineOutlinerTailCall, /// Only emit a branch. 6330 MachineOutlinerNoLRSave, /// Emit a call and return. 6331 MachineOutlinerThunk, /// Emit a call and tail-call. 6332 MachineOutlinerRegSave /// Same as default, but save to a register. 6333 }; 6334 6335 enum MachineOutlinerMBBFlags { 6336 LRUnavailableSomewhere = 0x2, 6337 HasCalls = 0x4, 6338 UnsafeRegsDead = 0x8 6339 }; 6340 6341 unsigned 6342 AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { 6343 assert(C.LRUWasSet && "LRU wasn't set?"); 6344 MachineFunction *MF = C.getMF(); 6345 const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>( 6346 MF->getSubtarget().getRegisterInfo()); 6347 6348 // Check if there is an available register across the sequence that we can 6349 // use. 
6350 for (unsigned Reg : AArch64::GPR64RegClass) { 6351 if (!ARI->isReservedReg(*MF, Reg) && 6352 Reg != AArch64::LR && // LR is not reserved, but don't use it. 6353 Reg != AArch64::X16 && // X16 is not guaranteed to be preserved. 6354 Reg != AArch64::X17 && // Ditto for X17. 6355 C.LRU.available(Reg) && C.UsedInSequence.available(Reg)) 6356 return Reg; 6357 } 6358 6359 // No suitable register. Return 0. 6360 return 0u; 6361 } 6362 6363 static bool 6364 outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, 6365 const outliner::Candidate &b) { 6366 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 6367 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 6368 6369 return MFIa->shouldSignReturnAddress(false) == MFIb->shouldSignReturnAddress(false) && 6370 MFIa->shouldSignReturnAddress(true) == MFIb->shouldSignReturnAddress(true); 6371 } 6372 6373 static bool 6374 outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, 6375 const outliner::Candidate &b) { 6376 const auto &MFIa = a.getMF()->getInfo<AArch64FunctionInfo>(); 6377 const auto &MFIb = b.getMF()->getInfo<AArch64FunctionInfo>(); 6378 6379 return MFIa->shouldSignWithBKey() == MFIb->shouldSignWithBKey(); 6380 } 6381 6382 static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, 6383 const outliner::Candidate &b) { 6384 const AArch64Subtarget &SubtargetA = 6385 a.getMF()->getSubtarget<AArch64Subtarget>(); 6386 const AArch64Subtarget &SubtargetB = 6387 b.getMF()->getSubtarget<AArch64Subtarget>(); 6388 return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps(); 6389 } 6390 6391 outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( 6392 std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { 6393 outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; 6394 unsigned SequenceSize = 6395 std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, 6396 [this](unsigned Sum, const MachineInstr &MI) { 6397 return Sum 
+ getInstSizeInBytes(MI); 6398 }); 6399 unsigned NumBytesToCreateFrame = 0; 6400 6401 // We only allow outlining for functions having exactly matching return 6402 // address signing attributes, i.e., all share the same value for the 6403 // attribute "sign-return-address" and all share the same type of key they 6404 // are signed with. 6405 // Additionally we require all functions to simultaniously either support 6406 // v8.3a features or not. Otherwise an outlined function could get signed 6407 // using dedicated v8.3 instructions and a call from a function that doesn't 6408 // support v8.3 instructions would therefore be invalid. 6409 if (std::adjacent_find( 6410 RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 6411 [](const outliner::Candidate &a, const outliner::Candidate &b) { 6412 // Return true if a and b are non-equal w.r.t. return address 6413 // signing or support of v8.3a features 6414 if (outliningCandidatesSigningScopeConsensus(a, b) && 6415 outliningCandidatesSigningKeyConsensus(a, b) && 6416 outliningCandidatesV8_3OpsConsensus(a, b)) { 6417 return false; 6418 } 6419 return true; 6420 }) != RepeatedSequenceLocs.end()) { 6421 return outliner::OutlinedFunction(); 6422 } 6423 6424 // Since at this point all candidates agree on their return address signing 6425 // picking just one is fine. If the candidate functions potentially sign their 6426 // return addresses, the outlined function should do the same. Note that in 6427 // the case of "sign-return-address"="non-leaf" this is an assumption: It is 6428 // not certainly true that the outlined function will have to sign its return 6429 // address but this decision is made later, when the decision to outline 6430 // has already been made. 6431 // The same holds for the number of additional instructions we need: On 6432 // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is 6433 // necessary. 
However, at this point we don't know if the outlined function 6434 // will have a RET instruction so we assume the worst. 6435 const TargetRegisterInfo &TRI = getRegisterInfo(); 6436 if (FirstCand.getMF() 6437 ->getInfo<AArch64FunctionInfo>() 6438 ->shouldSignReturnAddress(true)) { 6439 // One PAC and one AUT instructions 6440 NumBytesToCreateFrame += 8; 6441 6442 // We have to check if sp modifying instructions would get outlined. 6443 // If so we only allow outlining if sp is unchanged overall, so matching 6444 // sub and add instructions are okay to outline, all other sp modifications 6445 // are not 6446 auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { 6447 int SPValue = 0; 6448 MachineBasicBlock::iterator MBBI = C.front(); 6449 for (;;) { 6450 if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { 6451 switch (MBBI->getOpcode()) { 6452 case AArch64::ADDXri: 6453 case AArch64::ADDWri: 6454 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 6455 assert(MBBI->getOperand(2).isImm() && 6456 "Expected operand to be immediate"); 6457 assert(MBBI->getOperand(1).isReg() && 6458 "Expected operand to be a register"); 6459 // Check if the add just increments sp. If so, we search for 6460 // matching sub instructions that decrement sp. If not, the 6461 // modification is illegal 6462 if (MBBI->getOperand(1).getReg() == AArch64::SP) 6463 SPValue += MBBI->getOperand(2).getImm(); 6464 else 6465 return true; 6466 break; 6467 case AArch64::SUBXri: 6468 case AArch64::SUBWri: 6469 assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); 6470 assert(MBBI->getOperand(2).isImm() && 6471 "Expected operand to be immediate"); 6472 assert(MBBI->getOperand(1).isReg() && 6473 "Expected operand to be a register"); 6474 // Check if the sub just decrements sp. If so, we search for 6475 // matching add instructions that increment sp. 
If not, the 6476 // modification is illegal 6477 if (MBBI->getOperand(1).getReg() == AArch64::SP) 6478 SPValue -= MBBI->getOperand(2).getImm(); 6479 else 6480 return true; 6481 break; 6482 default: 6483 return true; 6484 } 6485 } 6486 if (MBBI == C.back()) 6487 break; 6488 ++MBBI; 6489 } 6490 if (SPValue) 6491 return true; 6492 return false; 6493 }; 6494 // Remove candidates with illegal stack modifying instructions 6495 llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification); 6496 6497 // If the sequence doesn't have enough candidates left, then we're done. 6498 if (RepeatedSequenceLocs.size() < 2) 6499 return outliner::OutlinedFunction(); 6500 } 6501 6502 // Properties about candidate MBBs that hold for all of them. 6503 unsigned FlagsSetInAll = 0xF; 6504 6505 // Compute liveness information for each candidate, and set FlagsSetInAll. 6506 std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), 6507 [&FlagsSetInAll](outliner::Candidate &C) { 6508 FlagsSetInAll &= C.Flags; 6509 }); 6510 6511 // According to the AArch64 Procedure Call Standard, the following are 6512 // undefined on entry/exit from a function call: 6513 // 6514 // * Registers x16, x17, (and thus w16, w17) 6515 // * Condition codes (and thus the NZCV register) 6516 // 6517 // Because if this, we can't outline any sequence of instructions where 6518 // one 6519 // of these registers is live into/across it. Thus, we need to delete 6520 // those 6521 // candidates. 6522 auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) { 6523 // If the unsafe registers in this block are all dead, then we don't need 6524 // to compute liveness here. 6525 if (C.Flags & UnsafeRegsDead) 6526 return false; 6527 C.initLRU(TRI); 6528 LiveRegUnits LRU = C.LRU; 6529 return (!LRU.available(AArch64::W16) || !LRU.available(AArch64::W17) || 6530 !LRU.available(AArch64::NZCV)); 6531 }; 6532 6533 // Are there any candidates where those registers are live? 
6534 if (!(FlagsSetInAll & UnsafeRegsDead)) { 6535 // Erase every candidate that violates the restrictions above. (It could be 6536 // true that we have viable candidates, so it's not worth bailing out in 6537 // the case that, say, 1 out of 20 candidates violate the restructions.) 6538 llvm::erase_if(RepeatedSequenceLocs, CantGuaranteeValueAcrossCall); 6539 6540 // If the sequence doesn't have enough candidates left, then we're done. 6541 if (RepeatedSequenceLocs.size() < 2) 6542 return outliner::OutlinedFunction(); 6543 } 6544 6545 // At this point, we have only "safe" candidates to outline. Figure out 6546 // frame + call instruction information. 6547 6548 unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); 6549 6550 // Helper lambda which sets call information for every candidate. 6551 auto SetCandidateCallInfo = 6552 [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) { 6553 for (outliner::Candidate &C : RepeatedSequenceLocs) 6554 C.setCallInfo(CallID, NumBytesForCall); 6555 }; 6556 6557 unsigned FrameID = MachineOutlinerDefault; 6558 NumBytesToCreateFrame += 4; 6559 6560 bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { 6561 return C.getMF()->getInfo<AArch64FunctionInfo>()->branchTargetEnforcement(); 6562 }); 6563 6564 // We check to see if CFI Instructions are present, and if they are 6565 // we find the number of CFI Instructions in the candidates. 
6566 unsigned CFICount = 0; 6567 MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front(); 6568 for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx(); 6569 Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) { 6570 const std::vector<MCCFIInstruction> &CFIInstructions = 6571 RepeatedSequenceLocs[0].getMF()->getFrameInstructions(); 6572 if (MBBI->isCFIInstruction()) { 6573 unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex(); 6574 MCCFIInstruction CFI = CFIInstructions[CFIIndex]; 6575 CFICount++; 6576 } 6577 MBBI++; 6578 } 6579 6580 // We compare the number of found CFI Instructions to the number of CFI 6581 // instructions in the parent function for each candidate. We must check this 6582 // since if we outline one of the CFI instructions in a function, we have to 6583 // outline them all for correctness. If we do not, the address offsets will be 6584 // incorrect between the two sections of the program. 6585 for (outliner::Candidate &C : RepeatedSequenceLocs) { 6586 std::vector<MCCFIInstruction> CFIInstructions = 6587 C.getMF()->getFrameInstructions(); 6588 6589 if (CFICount > 0 && CFICount != CFIInstructions.size()) 6590 return outliner::OutlinedFunction(); 6591 } 6592 6593 // Returns true if an instructions is safe to fix up, false otherwise. 6594 auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) { 6595 if (MI.isCall()) 6596 return true; 6597 6598 if (!MI.modifiesRegister(AArch64::SP, &TRI) && 6599 !MI.readsRegister(AArch64::SP, &TRI)) 6600 return true; 6601 6602 // Any modification of SP will break our code to save/restore LR. 6603 // FIXME: We could handle some instructions which add a constant 6604 // offset to SP, with a bit more work. 6605 if (MI.modifiesRegister(AArch64::SP, &TRI)) 6606 return false; 6607 6608 // At this point, we have a stack instruction that we might need to 6609 // fix up. We'll handle it if it's a load or store. 6610 if (MI.mayLoadOrStore()) { 6611 const MachineOperand *Base; // Filled with the base operand of MI. 
6612 int64_t Offset; // Filled with the offset of MI. 6613 bool OffsetIsScalable; 6614 6615 // Does it allow us to offset the base operand and is the base the 6616 // register SP? 6617 if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) || 6618 !Base->isReg() || Base->getReg() != AArch64::SP) 6619 return false; 6620 6621 // Fixe-up code below assumes bytes. 6622 if (OffsetIsScalable) 6623 return false; 6624 6625 // Find the minimum/maximum offset for this instruction and check 6626 // if fixing it up would be in range. 6627 int64_t MinOffset, 6628 MaxOffset; // Unscaled offsets for the instruction. 6629 TypeSize Scale(0U, false); // The scale to multiply the offsets by. 6630 unsigned DummyWidth; 6631 getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset); 6632 6633 Offset += 16; // Update the offset to what it would be if we outlined. 6634 if (Offset < MinOffset * (int64_t)Scale.getFixedSize() || 6635 Offset > MaxOffset * (int64_t)Scale.getFixedSize()) 6636 return false; 6637 6638 // It's in range, so we can outline it. 6639 return true; 6640 } 6641 6642 // FIXME: Add handling for instructions like "add x0, sp, #8". 6643 6644 // We can't fix it up, so don't outline it. 6645 return false; 6646 }; 6647 6648 // True if it's possible to fix up each stack instruction in this sequence. 6649 // Important for frames/call variants that modify the stack. 6650 bool AllStackInstrsSafe = std::all_of( 6651 FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); 6652 6653 // If the last instruction in any candidate is a terminator, then we should 6654 // tail call all of the candidates. 
6655 if (RepeatedSequenceLocs[0].back()->isTerminator()) { 6656 FrameID = MachineOutlinerTailCall; 6657 NumBytesToCreateFrame = 0; 6658 SetCandidateCallInfo(MachineOutlinerTailCall, 4); 6659 } 6660 6661 else if (LastInstrOpcode == AArch64::BL || 6662 ((LastInstrOpcode == AArch64::BLR || 6663 LastInstrOpcode == AArch64::BLRNoIP) && 6664 !HasBTI)) { 6665 // FIXME: Do we need to check if the code after this uses the value of LR? 6666 FrameID = MachineOutlinerThunk; 6667 NumBytesToCreateFrame = 0; 6668 SetCandidateCallInfo(MachineOutlinerThunk, 4); 6669 } 6670 6671 else { 6672 // We need to decide how to emit calls + frames. We can always emit the same 6673 // frame if we don't need to save to the stack. If we have to save to the 6674 // stack, then we need a different frame. 6675 unsigned NumBytesNoStackCalls = 0; 6676 std::vector<outliner::Candidate> CandidatesWithoutStackFixups; 6677 6678 // Check if we have to save LR. 6679 for (outliner::Candidate &C : RepeatedSequenceLocs) { 6680 C.initLRU(TRI); 6681 6682 // If we have a noreturn caller, then we're going to be conservative and 6683 // say that we have to save LR. If we don't have a ret at the end of the 6684 // block, then we can't reason about liveness accurately. 6685 // 6686 // FIXME: We can probably do better than always disabling this in 6687 // noreturn functions by fixing up the liveness info. 6688 bool IsNoReturn = 6689 C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); 6690 6691 // Is LR available? If so, we don't need a save. 6692 if (C.LRU.available(AArch64::LR) && !IsNoReturn) { 6693 NumBytesNoStackCalls += 4; 6694 C.setCallInfo(MachineOutlinerNoLRSave, 4); 6695 CandidatesWithoutStackFixups.push_back(C); 6696 } 6697 6698 // Is an unused register available? If so, we won't modify the stack, so 6699 // we can outline with the same frame type as those that don't save LR. 
6700 else if (findRegisterToSaveLRTo(C)) { 6701 NumBytesNoStackCalls += 12; 6702 C.setCallInfo(MachineOutlinerRegSave, 12); 6703 CandidatesWithoutStackFixups.push_back(C); 6704 } 6705 6706 // Is SP used in the sequence at all? If not, we don't have to modify 6707 // the stack, so we are guaranteed to get the same frame. 6708 else if (C.UsedInSequence.available(AArch64::SP)) { 6709 NumBytesNoStackCalls += 12; 6710 C.setCallInfo(MachineOutlinerDefault, 12); 6711 CandidatesWithoutStackFixups.push_back(C); 6712 } 6713 6714 // If we outline this, we need to modify the stack. Pretend we don't 6715 // outline this by saving all of its bytes. 6716 else { 6717 NumBytesNoStackCalls += SequenceSize; 6718 } 6719 } 6720 6721 // If there are no places where we have to save LR, then note that we 6722 // don't have to update the stack. Otherwise, give every candidate the 6723 // default call type, as long as it's safe to do so. 6724 if (!AllStackInstrsSafe || 6725 NumBytesNoStackCalls <= RepeatedSequenceLocs.size() * 12) { 6726 RepeatedSequenceLocs = CandidatesWithoutStackFixups; 6727 FrameID = MachineOutlinerNoLRSave; 6728 } else { 6729 SetCandidateCallInfo(MachineOutlinerDefault, 12); 6730 6731 // Bugzilla ID: 46767 6732 // TODO: Check if fixing up the stack more than once is safe so we can 6733 // outline these. 6734 // 6735 // An outline resulting in a caller that requires stack fixups at the 6736 // callsite to a callee that also requires stack fixups can happen when 6737 // there are no available registers at the candidate callsite for a 6738 // candidate that itself also has calls. 6739 // 6740 // In other words if function_containing_sequence in the following pseudo 6741 // assembly requires that we save LR at the point of the call, but there 6742 // are no available registers: in this case we save using SP and as a 6743 // result the SP offsets requires stack fixups by multiples of 16. 6744 // 6745 // function_containing_sequence: 6746 // ... 
6747 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 6748 // call OUTLINED_FUNCTION_N 6749 // restore LR from SP 6750 // ... 6751 // 6752 // OUTLINED_FUNCTION_N: 6753 // save LR to SP <- Requires stack instr fixups in OUTLINED_FUNCTION_N 6754 // ... 6755 // bl foo 6756 // restore LR from SP 6757 // ret 6758 // 6759 // Because the code to handle more than one stack fixup does not 6760 // currently have the proper checks for legality, these cases will assert 6761 // in the AArch64 MachineOutliner. This is because the code to do this 6762 // needs more hardening, testing, better checks that generated code is 6763 // legal, etc and because it is only verified to handle a single pass of 6764 // stack fixup. 6765 // 6766 // The assert happens in AArch64InstrInfo::buildOutlinedFrame to catch 6767 // these cases until they are known to be handled. Bugzilla 46767 is 6768 // referenced in comments at the assert site. 6769 // 6770 // To avoid asserting (or generating non-legal code on noassert builds) 6771 // we remove all candidates which would need more than one stack fixup by 6772 // pruning the cases where the candidate has calls while also having no 6773 // available LR and having no available general purpose registers to copy 6774 // LR to (ie one extra stack save/restore). 6775 // 6776 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 6777 erase_if(RepeatedSequenceLocs, [this](outliner::Candidate &C) { 6778 return (std::any_of( 6779 C.front(), std::next(C.back()), 6780 [](const MachineInstr &MI) { return MI.isCall(); })) && 6781 (!C.LRU.available(AArch64::LR) || !findRegisterToSaveLRTo(C)); 6782 }); 6783 } 6784 } 6785 6786 // If we dropped all of the candidates, bail out here. 6787 if (RepeatedSequenceLocs.size() < 2) { 6788 RepeatedSequenceLocs.clear(); 6789 return outliner::OutlinedFunction(); 6790 } 6791 } 6792 6793 // Does every candidate's MBB contain a call? If so, then we might have a call 6794 // in the range. 
6795 if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { 6796 // Check if the range contains a call. These require a save + restore of the 6797 // link register. 6798 bool ModStackToSaveLR = false; 6799 if (std::any_of(FirstCand.front(), FirstCand.back(), 6800 [](const MachineInstr &MI) { return MI.isCall(); })) 6801 ModStackToSaveLR = true; 6802 6803 // Handle the last instruction separately. If this is a tail call, then the 6804 // last instruction is a call. We don't want to save + restore in this case. 6805 // However, it could be possible that the last instruction is a call without 6806 // it being valid to tail call this sequence. We should consider this as 6807 // well. 6808 else if (FrameID != MachineOutlinerThunk && 6809 FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) 6810 ModStackToSaveLR = true; 6811 6812 if (ModStackToSaveLR) { 6813 // We can't fix up the stack. Bail out. 6814 if (!AllStackInstrsSafe) { 6815 RepeatedSequenceLocs.clear(); 6816 return outliner::OutlinedFunction(); 6817 } 6818 6819 // Save + restore LR. 6820 NumBytesToCreateFrame += 8; 6821 } 6822 } 6823 6824 // If we have CFI instructions, we can only outline if the outlined section 6825 // can be a tail call 6826 if (FrameID != MachineOutlinerTailCall && CFICount > 0) 6827 return outliner::OutlinedFunction(); 6828 6829 return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize, 6830 NumBytesToCreateFrame, FrameID); 6831 } 6832 6833 bool AArch64InstrInfo::isFunctionSafeToOutlineFrom( 6834 MachineFunction &MF, bool OutlineFromLinkOnceODRs) const { 6835 const Function &F = MF.getFunction(); 6836 6837 // Can F be deduplicated by the linker? If it can, don't outline from it. 6838 if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) 6839 return false; 6840 6841 // Don't outline from functions with section markings; the program could 6842 // expect that all the code is in the named section. 
6843 // FIXME: Allow outlining from multiple functions with the same section 6844 // marking. 6845 if (F.hasSection()) 6846 return false; 6847 6848 // Outlining from functions with redzones is unsafe since the outliner may 6849 // modify the stack. Check if hasRedZone is true or unknown; if yes, don't 6850 // outline from it. 6851 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); 6852 if (!AFI || AFI->hasRedZone().getValueOr(true)) 6853 return false; 6854 6855 // FIXME: Teach the outliner to generate/handle Windows unwind info. 6856 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) 6857 return false; 6858 6859 // It's safe to outline from MF. 6860 return true; 6861 } 6862 6863 bool AArch64InstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB, 6864 unsigned &Flags) const { 6865 // Check if LR is available through all of the MBB. If it's not, then set 6866 // a flag. 6867 assert(MBB.getParent()->getRegInfo().tracksLiveness() && 6868 "Suitable Machine Function for outlining must track liveness"); 6869 LiveRegUnits LRU(getRegisterInfo()); 6870 6871 std::for_each(MBB.rbegin(), MBB.rend(), 6872 [&LRU](MachineInstr &MI) { LRU.accumulate(MI); }); 6873 6874 // Check if each of the unsafe registers are available... 6875 bool W16AvailableInBlock = LRU.available(AArch64::W16); 6876 bool W17AvailableInBlock = LRU.available(AArch64::W17); 6877 bool NZCVAvailableInBlock = LRU.available(AArch64::NZCV); 6878 6879 // If all of these are dead (and not live out), we know we don't have to check 6880 // them later. 6881 if (W16AvailableInBlock && W17AvailableInBlock && NZCVAvailableInBlock) 6882 Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead; 6883 6884 // Now, add the live outs to the set. 6885 LRU.addLiveOuts(MBB); 6886 6887 // If any of these registers is available in the MBB, but also a live out of 6888 // the block, then we know outlining is unsafe. 
if (W16AvailableInBlock && !LRU.available(AArch64::W16))
    return false;
  if (W17AvailableInBlock && !LRU.available(AArch64::W17))
    return false;
  if (NZCVAvailableInBlock && !LRU.available(AArch64::NZCV))
    return false;

  // Check if there's a call inside this MachineBasicBlock. If there is, then
  // set a flag.
  if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
    Flags |= MachineOutlinerMBBFlags::HasCalls;

  MachineFunction *MF = MBB.getParent();

  // In the event that we outline, we may have to save LR. If there is an
  // available register in the MBB, then we'll always save LR there. Check if
  // this is true.
  bool CanSaveLR = false;
  const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(
      MF->getSubtarget().getRegisterInfo());

  // Check if there is an available register across the sequence that we can
  // use. Reserved registers, LR itself, and X16/X17 are never picked.
  for (unsigned Reg : AArch64::GPR64RegClass) {
    if (!ARI->isReservedReg(*MF, Reg) && Reg != AArch64::LR &&
        Reg != AArch64::X16 && Reg != AArch64::X17 && LRU.available(Reg)) {
      CanSaveLR = true;
      break;
    }
  }

  // If there is no register we could save LR to, and LR itself is not
  // available across the block, record that so the safety of outlining stack
  // instructions gets evaluated later.
  if (!CanSaveLR && !LRU.available(AArch64::LR))
    Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;

  return true;
}

/// Classify the instruction at \p MIT for the outliner.
///
/// Returns Illegal for instructions that must never be outlined, Invisible for
/// instructions that should not influence candidate selection, LegalTerminator
/// for calls that may only end an outlined sequence, and Legal otherwise.
/// \p Flags is not read by this implementation.
outliner::InstrType
AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
                                   unsigned Flags) const {
  MachineInstr &MI = *MIT;
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();

  // Don't outline anything used for return address signing. The outlined
  // function will get signed later if needed
  switch (MI.getOpcode()) {
  case AArch64::PACIASP:
  case AArch64::PACIBSP:
  case AArch64::AUTIASP:
  case AArch64::AUTIBSP:
  case AArch64::RETAA:
  case AArch64::RETAB:
  case AArch64::EMITBKEY:
    return outliner::InstrType::Illegal;
  }

  // Don't outline LOHs.
  if (FuncInfo->getLOHRelated().count(&MI))
    return outliner::InstrType::Illegal;

  // We can only outline these if we will tail call the outlined function, or
  // fix up the CFI offsets. Currently, CFI instructions are outlined only if
  // in a tail call.
  //
  // FIXME: If the proper fixups for the offset are implemented, this should be
  // possible.
  if (MI.isCFIInstruction())
    return outliner::InstrType::Legal;

  // Don't allow debug values to impact outlining type.
  if (MI.isDebugInstr() || MI.isIndirectDebugValue())
    return outliner::InstrType::Invisible;

  // At this point, KILL instructions don't really tell us much so we can go
  // ahead and skip over them.
  if (MI.isKill())
    return outliner::InstrType::Invisible;

  // Is this a terminator for a basic block?
  if (MI.isTerminator()) {

    // Is this the end of a function? (The block has no successors.)
    if (MI.getParent()->succ_empty())
      return outliner::InstrType::Legal;

    // It's not, so don't outline it.
    return outliner::InstrType::Illegal;
  }

  // Make sure none of the operands are un-outlinable.
  for (const MachineOperand &MOP : MI.operands()) {
    if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
        MOP.isTargetIndex())
      return outliner::InstrType::Illegal;

    // If it uses LR or W30 explicitly, then don't touch it.
    if (MOP.isReg() && !MOP.isImplicit() &&
        (MOP.getReg() == AArch64::LR || MOP.getReg() == AArch64::W30))
      return outliner::InstrType::Illegal;
  }

  // Special cases for instructions that can always be outlined, but will fail
  // the later tests. e.g, ADRPs, which are PC-relative use LR, but can always
  // be outlined because they don't require a *specific* value to be in LR.
  if (MI.getOpcode() == AArch64::ADRP)
    return outliner::InstrType::Legal;

  // If MI is a call we might be able to outline it. We don't want to outline
  // any calls that rely on the position of items on the stack. When we outline
  // something containing a call, we have to emit a save and restore of LR in
  // the outlined function. Currently, this always happens by saving LR to the
  // stack. Thus, if we outline, say, half the parameters for a function call
  // plus the call, then we'll break the callee's expectations for the layout
  // of the stack.
  //
  // FIXME: Allow calls to functions which construct a stack frame, as long
  // as they don't access arguments on the stack.
  // FIXME: Figure out some way to analyze functions defined in other modules.
  // We should be able to compute the memory usage based on the IR calling
  // convention, even if we can't see the definition.
  if (MI.isCall()) {
    // Get the function associated with the call. Look at each operand and find
    // the one that represents the callee. Only the first global operand is
    // examined; Callee stays null if it is not a Function.
    const Function *Callee = nullptr;
    for (const MachineOperand &MOP : MI.operands()) {
      if (MOP.isGlobal()) {
        Callee = dyn_cast<Function>(MOP.getGlobal());
        break;
      }
    }

    // Never outline calls to mcount. There isn't any rule that would require
    // this, but the Linux kernel's "ftrace" feature depends on it.
    if (Callee && Callee->getName() == "\01_mcount")
      return outliner::InstrType::Illegal;

    // If we don't know anything about the callee, assume it depends on the
    // stack layout of the caller. In that case, it's only legal to outline
    // as a tail-call. Explicitly list the call instructions we know about so we
    // don't get unexpected results with call pseudo-instructions.
    auto UnknownCallOutlineType = outliner::InstrType::Illegal;
    if (MI.getOpcode() == AArch64::BLR ||
        MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
      UnknownCallOutlineType = outliner::InstrType::LegalTerminator;

    if (!Callee)
      return UnknownCallOutlineType;

    // We have a function we have information about. Check whether it is
    // something we can safely outline.
    MachineFunction *CalleeMF = MF->getMMI().getMachineFunction(*Callee);

    // We don't know what's going on with the callee at all. Don't touch it.
    if (!CalleeMF)
      return UnknownCallOutlineType;

    // Check if we know anything about the callee saves on the function. If we
    // don't, then don't touch it, since that implies that we haven't
    // computed anything about its stack frame yet. A callee with a non-empty
    // stack or any frame objects is treated the same way.
    MachineFrameInfo &MFI = CalleeMF->getFrameInfo();
    if (!MFI.isCalleeSavedInfoValid() || MFI.getStackSize() > 0 ||
        MFI.getNumObjects() > 0)
      return UnknownCallOutlineType;

    // At this point, we can say that CalleeMF ought to not pass anything on the
    // stack. Therefore, we can outline it.
    return outliner::InstrType::Legal;
  }

  // Don't outline positions.
  if (MI.isPosition())
    return outliner::InstrType::Illegal;

  // Don't touch the link register or W30.
7068 if (MI.readsRegister(AArch64::W30, &getRegisterInfo()) || 7069 MI.modifiesRegister(AArch64::W30, &getRegisterInfo())) 7070 return outliner::InstrType::Illegal; 7071 7072 // Don't outline BTI instructions, because that will prevent the outlining 7073 // site from being indirectly callable. 7074 if (MI.getOpcode() == AArch64::HINT) { 7075 int64_t Imm = MI.getOperand(0).getImm(); 7076 if (Imm == 32 || Imm == 34 || Imm == 36 || Imm == 38) 7077 return outliner::InstrType::Illegal; 7078 } 7079 7080 return outliner::InstrType::Legal; 7081 } 7082 7083 void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { 7084 for (MachineInstr &MI : MBB) { 7085 const MachineOperand *Base; 7086 unsigned Width; 7087 int64_t Offset; 7088 bool OffsetIsScalable; 7089 7090 // Is this a load or store with an immediate offset with SP as the base? 7091 if (!MI.mayLoadOrStore() || 7092 !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width, 7093 &RI) || 7094 (Base->isReg() && Base->getReg() != AArch64::SP)) 7095 continue; 7096 7097 // It is, so we have to fix it up. 7098 TypeSize Scale(0U, false); 7099 int64_t Dummy1, Dummy2; 7100 7101 MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI); 7102 assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!"); 7103 getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2); 7104 assert(Scale != 0 && "Unexpected opcode!"); 7105 assert(!OffsetIsScalable && "Expected offset to be a byte offset"); 7106 7107 // We've pushed the return address to the stack, so add 16 to the offset. 7108 // This is safe, since we already checked if it would overflow when we 7109 // checked if this instruction was legal to outline. 
7110 int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize(); 7111 StackOffsetOperand.setImm(NewImm); 7112 } 7113 } 7114 7115 static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, 7116 bool ShouldSignReturnAddr, 7117 bool ShouldSignReturnAddrWithAKey) { 7118 if (ShouldSignReturnAddr) { 7119 MachineBasicBlock::iterator MBBPAC = MBB.begin(); 7120 MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator(); 7121 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); 7122 const TargetInstrInfo *TII = Subtarget.getInstrInfo(); 7123 DebugLoc DL; 7124 7125 if (MBBAUT != MBB.end()) 7126 DL = MBBAUT->getDebugLoc(); 7127 7128 // At the very beginning of the basic block we insert the following 7129 // depending on the key type 7130 // 7131 // a_key: b_key: 7132 // PACIASP EMITBKEY 7133 // CFI_INSTRUCTION PACIBSP 7134 // CFI_INSTRUCTION 7135 if (ShouldSignReturnAddrWithAKey) { 7136 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP)) 7137 .setMIFlag(MachineInstr::FrameSetup); 7138 } else { 7139 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY)) 7140 .setMIFlag(MachineInstr::FrameSetup); 7141 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP)) 7142 .setMIFlag(MachineInstr::FrameSetup); 7143 } 7144 unsigned CFIIndex = 7145 MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); 7146 BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) 7147 .addCFIIndex(CFIIndex) 7148 .setMIFlags(MachineInstr::FrameSetup); 7149 7150 // If v8.3a features are available we can replace a RET instruction by 7151 // RETAA or RETAB and omit the AUT instructions 7152 if (Subtarget.hasPAuth() && MBBAUT != MBB.end() && 7153 MBBAUT->getOpcode() == AArch64::RET) { 7154 BuildMI(MBB, MBBAUT, DL, 7155 TII->get(ShouldSignReturnAddrWithAKey ? 
AArch64::RETAA 7156 : AArch64::RETAB)) 7157 .copyImplicitOps(*MBBAUT); 7158 MBB.erase(MBBAUT); 7159 } else { 7160 BuildMI(MBB, MBBAUT, DL, 7161 TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP 7162 : AArch64::AUTIBSP)) 7163 .setMIFlag(MachineInstr::FrameDestroy); 7164 } 7165 } 7166 } 7167 7168 void AArch64InstrInfo::buildOutlinedFrame( 7169 MachineBasicBlock &MBB, MachineFunction &MF, 7170 const outliner::OutlinedFunction &OF) const { 7171 7172 AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>(); 7173 7174 if (OF.FrameConstructionID == MachineOutlinerTailCall) 7175 FI->setOutliningStyle("Tail Call"); 7176 else if (OF.FrameConstructionID == MachineOutlinerThunk) { 7177 // For thunk outlining, rewrite the last instruction from a call to a 7178 // tail-call. 7179 MachineInstr *Call = &*--MBB.instr_end(); 7180 unsigned TailOpcode; 7181 if (Call->getOpcode() == AArch64::BL) { 7182 TailOpcode = AArch64::TCRETURNdi; 7183 } else { 7184 assert(Call->getOpcode() == AArch64::BLR || 7185 Call->getOpcode() == AArch64::BLRNoIP); 7186 TailOpcode = AArch64::TCRETURNriALL; 7187 } 7188 MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) 7189 .add(Call->getOperand(0)) 7190 .addImm(0); 7191 MBB.insert(MBB.end(), TC); 7192 Call->eraseFromParent(); 7193 7194 FI->setOutliningStyle("Thunk"); 7195 } 7196 7197 bool IsLeafFunction = true; 7198 7199 // Is there a call in the outlined range? 7200 auto IsNonTailCall = [](const MachineInstr &MI) { 7201 return MI.isCall() && !MI.isReturn(); 7202 }; 7203 7204 if (llvm::any_of(MBB.instrs(), IsNonTailCall)) { 7205 // Fix up the instructions in the range, since we're going to modify the 7206 // stack. 7207 7208 // Bugzilla ID: 46767 7209 // TODO: Check if fixing up twice is safe so we can outline these. 
7210 assert(OF.FrameConstructionID != MachineOutlinerDefault && 7211 "Can only fix up stack references once"); 7212 fixupPostOutline(MBB); 7213 7214 IsLeafFunction = false; 7215 7216 // LR has to be a live in so that we can save it. 7217 if (!MBB.isLiveIn(AArch64::LR)) 7218 MBB.addLiveIn(AArch64::LR); 7219 7220 MachineBasicBlock::iterator It = MBB.begin(); 7221 MachineBasicBlock::iterator Et = MBB.end(); 7222 7223 if (OF.FrameConstructionID == MachineOutlinerTailCall || 7224 OF.FrameConstructionID == MachineOutlinerThunk) 7225 Et = std::prev(MBB.end()); 7226 7227 // Insert a save before the outlined region 7228 MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 7229 .addReg(AArch64::SP, RegState::Define) 7230 .addReg(AArch64::LR) 7231 .addReg(AArch64::SP) 7232 .addImm(-16); 7233 It = MBB.insert(It, STRXpre); 7234 7235 const TargetSubtargetInfo &STI = MF.getSubtarget(); 7236 const MCRegisterInfo *MRI = STI.getRegisterInfo(); 7237 unsigned DwarfReg = MRI->getDwarfRegNum(AArch64::LR, true); 7238 7239 // Add a CFI saying the stack was moved 16 B down. 7240 int64_t StackPosEntry = 7241 MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16)); 7242 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 7243 .addCFIIndex(StackPosEntry) 7244 .setMIFlags(MachineInstr::FrameSetup); 7245 7246 // Add a CFI saying that the LR that we want to find is now 16 B higher than 7247 // before. 7248 int64_t LRPosEntry = 7249 MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16)); 7250 BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION)) 7251 .addCFIIndex(LRPosEntry) 7252 .setMIFlags(MachineInstr::FrameSetup); 7253 7254 // Insert a restore before the terminator for the function. 
7255 MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 7256 .addReg(AArch64::SP, RegState::Define) 7257 .addReg(AArch64::LR, RegState::Define) 7258 .addReg(AArch64::SP) 7259 .addImm(16); 7260 Et = MBB.insert(Et, LDRXpost); 7261 } 7262 7263 // If a bunch of candidates reach this point they must agree on their return 7264 // address signing. It is therefore enough to just consider the signing 7265 // behaviour of one of them 7266 const auto &MFI = *OF.Candidates.front().getMF()->getInfo<AArch64FunctionInfo>(); 7267 bool ShouldSignReturnAddr = MFI.shouldSignReturnAddress(!IsLeafFunction); 7268 7269 // a_key is the default 7270 bool ShouldSignReturnAddrWithAKey = !MFI.shouldSignWithBKey(); 7271 7272 // If this is a tail call outlined function, then there's already a return. 7273 if (OF.FrameConstructionID == MachineOutlinerTailCall || 7274 OF.FrameConstructionID == MachineOutlinerThunk) { 7275 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 7276 ShouldSignReturnAddrWithAKey); 7277 return; 7278 } 7279 7280 // It's not a tail call, so we have to insert the return ourselves. 7281 7282 // LR has to be a live in so that we can return to it. 7283 if (!MBB.isLiveIn(AArch64::LR)) 7284 MBB.addLiveIn(AArch64::LR); 7285 7286 MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) 7287 .addReg(AArch64::LR); 7288 MBB.insert(MBB.end(), ret); 7289 7290 signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, 7291 ShouldSignReturnAddrWithAKey); 7292 7293 FI->setOutliningStyle("Function"); 7294 7295 // Did we have to modify the stack by saving the link register? 7296 if (OF.FrameConstructionID != MachineOutlinerDefault) 7297 return; 7298 7299 // We modified the stack. 7300 // Walk over the basic block and fix up all the stack accesses. 
7301 fixupPostOutline(MBB); 7302 } 7303 7304 MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall( 7305 Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, 7306 MachineFunction &MF, const outliner::Candidate &C) const { 7307 7308 // Are we tail calling? 7309 if (C.CallConstructionID == MachineOutlinerTailCall) { 7310 // If yes, then we can just branch to the label. 7311 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::TCRETURNdi)) 7312 .addGlobalAddress(M.getNamedValue(MF.getName())) 7313 .addImm(0)); 7314 return It; 7315 } 7316 7317 // Are we saving the link register? 7318 if (C.CallConstructionID == MachineOutlinerNoLRSave || 7319 C.CallConstructionID == MachineOutlinerThunk) { 7320 // No, so just insert the call. 7321 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 7322 .addGlobalAddress(M.getNamedValue(MF.getName()))); 7323 return It; 7324 } 7325 7326 // We want to return the spot where we inserted the call. 7327 MachineBasicBlock::iterator CallPt; 7328 7329 // Instructions for saving and restoring LR around the call instruction we're 7330 // going to insert. 7331 MachineInstr *Save; 7332 MachineInstr *Restore; 7333 // Can we save to a register? 7334 if (C.CallConstructionID == MachineOutlinerRegSave) { 7335 // FIXME: This logic should be sunk into a target-specific interface so that 7336 // we don't have to recompute the register. 7337 unsigned Reg = findRegisterToSaveLRTo(C); 7338 assert(Reg != 0 && "No callee-saved register available?"); 7339 7340 // Save and restore LR from that register. 7341 Save = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), Reg) 7342 .addReg(AArch64::XZR) 7343 .addReg(AArch64::LR) 7344 .addImm(0); 7345 Restore = BuildMI(MF, DebugLoc(), get(AArch64::ORRXrs), AArch64::LR) 7346 .addReg(AArch64::XZR) 7347 .addReg(Reg) 7348 .addImm(0); 7349 } else { 7350 // We have the default case. Save and restore from SP. 
7351 Save = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre)) 7352 .addReg(AArch64::SP, RegState::Define) 7353 .addReg(AArch64::LR) 7354 .addReg(AArch64::SP) 7355 .addImm(-16); 7356 Restore = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost)) 7357 .addReg(AArch64::SP, RegState::Define) 7358 .addReg(AArch64::LR, RegState::Define) 7359 .addReg(AArch64::SP) 7360 .addImm(16); 7361 } 7362 7363 It = MBB.insert(It, Save); 7364 It++; 7365 7366 // Insert the call. 7367 It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(AArch64::BL)) 7368 .addGlobalAddress(M.getNamedValue(MF.getName()))); 7369 CallPt = It; 7370 It++; 7371 7372 It = MBB.insert(It, Restore); 7373 return CallPt; 7374 } 7375 7376 bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( 7377 MachineFunction &MF) const { 7378 return MF.getFunction().hasMinSize(); 7379 } 7380 7381 Optional<DestSourcePair> 7382 AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { 7383 7384 // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg 7385 // and zero immediate operands used as an alias for mov instruction. 7386 if (MI.getOpcode() == AArch64::ORRWrs && 7387 MI.getOperand(1).getReg() == AArch64::WZR && 7388 MI.getOperand(3).getImm() == 0x0) { 7389 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 7390 } 7391 7392 if (MI.getOpcode() == AArch64::ORRXrs && 7393 MI.getOperand(1).getReg() == AArch64::XZR && 7394 MI.getOperand(3).getImm() == 0x0) { 7395 return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; 7396 } 7397 7398 return None; 7399 } 7400 7401 Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, 7402 Register Reg) const { 7403 int Sign = 1; 7404 int64_t Offset = 0; 7405 7406 // TODO: Handle cases where Reg is a super- or sub-register of the 7407 // destination register. 
7408 const MachineOperand &Op0 = MI.getOperand(0); 7409 if (!Op0.isReg() || Reg != Op0.getReg()) 7410 return None; 7411 7412 switch (MI.getOpcode()) { 7413 default: 7414 return None; 7415 case AArch64::SUBWri: 7416 case AArch64::SUBXri: 7417 case AArch64::SUBSWri: 7418 case AArch64::SUBSXri: 7419 Sign *= -1; 7420 LLVM_FALLTHROUGH; 7421 case AArch64::ADDSWri: 7422 case AArch64::ADDSXri: 7423 case AArch64::ADDWri: 7424 case AArch64::ADDXri: { 7425 // TODO: Third operand can be global address (usually some string). 7426 if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || 7427 !MI.getOperand(2).isImm()) 7428 return None; 7429 int Shift = MI.getOperand(3).getImm(); 7430 assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); 7431 Offset = Sign * (MI.getOperand(2).getImm() << Shift); 7432 } 7433 } 7434 return RegImmPair{MI.getOperand(1).getReg(), Offset}; 7435 } 7436 7437 /// If the given ORR instruction is a copy, and \p DescribedReg overlaps with 7438 /// the destination register then, if possible, describe the value in terms of 7439 /// the source register. 7440 static Optional<ParamLoadedValue> 7441 describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, 7442 const TargetInstrInfo *TII, 7443 const TargetRegisterInfo *TRI) { 7444 auto DestSrc = TII->isCopyInstr(MI); 7445 if (!DestSrc) 7446 return None; 7447 7448 Register DestReg = DestSrc->Destination->getReg(); 7449 Register SrcReg = DestSrc->Source->getReg(); 7450 7451 auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); 7452 7453 // If the described register is the destination, just return the source. 7454 if (DestReg == DescribedReg) 7455 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 7456 7457 // ORRWrs zero-extends to 64-bits, so we need to consider such cases. 
7458 if (MI.getOpcode() == AArch64::ORRWrs && 7459 TRI->isSuperRegister(DestReg, DescribedReg)) 7460 return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); 7461 7462 // We may need to describe the lower part of a ORRXrs move. 7463 if (MI.getOpcode() == AArch64::ORRXrs && 7464 TRI->isSubRegister(DestReg, DescribedReg)) { 7465 Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32); 7466 return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); 7467 } 7468 7469 assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) && 7470 "Unhandled ORR[XW]rs copy case"); 7471 7472 return None; 7473 } 7474 7475 Optional<ParamLoadedValue> 7476 AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, 7477 Register Reg) const { 7478 const MachineFunction *MF = MI.getMF(); 7479 const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); 7480 switch (MI.getOpcode()) { 7481 case AArch64::MOVZWi: 7482 case AArch64::MOVZXi: { 7483 // MOVZWi may be used for producing zero-extended 32-bit immediates in 7484 // 64-bit parameters, so we need to consider super-registers. 7485 if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) 7486 return None; 7487 7488 if (!MI.getOperand(1).isImm()) 7489 return None; 7490 int64_t Immediate = MI.getOperand(1).getImm(); 7491 int Shift = MI.getOperand(2).getImm(); 7492 return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift), 7493 nullptr); 7494 } 7495 case AArch64::ORRWrs: 7496 case AArch64::ORRXrs: 7497 return describeORRLoadedValue(MI, Reg, this, TRI); 7498 } 7499 7500 return TargetInstrInfo::describeLoadedValue(MI, Reg); 7501 } 7502 7503 bool AArch64InstrInfo::isExtendLikelyToBeFolded( 7504 MachineInstr &ExtMI, MachineRegisterInfo &MRI) const { 7505 assert(ExtMI.getOpcode() == TargetOpcode::G_SEXT || 7506 ExtMI.getOpcode() == TargetOpcode::G_ZEXT || 7507 ExtMI.getOpcode() == TargetOpcode::G_ANYEXT); 7508 7509 // Anyexts are nops. 
7510 if (ExtMI.getOpcode() == TargetOpcode::G_ANYEXT) 7511 return true; 7512 7513 Register DefReg = ExtMI.getOperand(0).getReg(); 7514 if (!MRI.hasOneNonDBGUse(DefReg)) 7515 return false; 7516 7517 // It's likely that a sext/zext as a G_PTR_ADD offset will be folded into an 7518 // addressing mode. 7519 auto *UserMI = &*MRI.use_instr_nodbg_begin(DefReg); 7520 return UserMI->getOpcode() == TargetOpcode::G_PTR_ADD; 7521 } 7522 7523 uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const { 7524 return get(Opc).TSFlags & AArch64::ElementSizeMask; 7525 } 7526 7527 bool AArch64InstrInfo::isPTestLikeOpcode(unsigned Opc) const { 7528 return get(Opc).TSFlags & AArch64::InstrFlagIsPTestLike; 7529 } 7530 7531 bool AArch64InstrInfo::isWhileOpcode(unsigned Opc) const { 7532 return get(Opc).TSFlags & AArch64::InstrFlagIsWhile; 7533 } 7534 7535 unsigned int 7536 AArch64InstrInfo::getTailDuplicateSize(CodeGenOpt::Level OptLevel) const { 7537 return OptLevel >= CodeGenOpt::Aggressive ? 6 : 2; 7538 } 7539 7540 unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { 7541 if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr()) 7542 return AArch64::BLRNoIP; 7543 else 7544 return AArch64::BLR; 7545 } 7546 7547 #define GET_INSTRINFO_HELPERS 7548 #define GET_INSTRMAP_INFO 7549 #include "AArch64GenInstrInfo.inc" 7550