Home | History | Annotate | Line # | Download | only in VE
      1 //===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements the interfaces that VE uses to lower LLVM code into a
     10 // selection DAG.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #include "VEISelLowering.h"
     15 #include "MCTargetDesc/VEMCExpr.h"
     16 #include "VEInstrBuilder.h"
     17 #include "VEMachineFunctionInfo.h"
     18 #include "VERegisterInfo.h"
     19 #include "VETargetMachine.h"
     20 #include "llvm/ADT/StringSwitch.h"
     21 #include "llvm/CodeGen/CallingConvLower.h"
     22 #include "llvm/CodeGen/MachineFrameInfo.h"
     23 #include "llvm/CodeGen/MachineFunction.h"
     24 #include "llvm/CodeGen/MachineInstrBuilder.h"
     25 #include "llvm/CodeGen/MachineJumpTableInfo.h"
     26 #include "llvm/CodeGen/MachineModuleInfo.h"
     27 #include "llvm/CodeGen/MachineRegisterInfo.h"
     28 #include "llvm/CodeGen/SelectionDAG.h"
     29 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
     30 #include "llvm/IR/DerivedTypes.h"
     31 #include "llvm/IR/Function.h"
     32 #include "llvm/IR/Module.h"
     33 #include "llvm/Support/ErrorHandling.h"
     34 #include "llvm/Support/KnownBits.h"
     35 using namespace llvm;
     36 
     37 #define DEBUG_TYPE "ve-lower"
     38 
     39 //===----------------------------------------------------------------------===//
     40 // Calling Convention Implementation
     41 //===----------------------------------------------------------------------===//
     42 
     43 #include "VEGenCallingConv.inc"
     44 
     45 CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
     46   switch (CallConv) {
     47   default:
     48     return RetCC_VE_C;
     49   case CallingConv::Fast:
     50     return RetCC_VE_Fast;
     51   }
     52 }
     53 
     54 CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
     55   if (IsVarArg)
     56     return CC_VE2;
     57   switch (CallConv) {
     58   default:
     59     return CC_VE_C;
     60   case CallingConv::Fast:
     61     return CC_VE_Fast;
     62   }
     63 }
     64 
     65 bool VETargetLowering::CanLowerReturn(
     66     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
     67     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
     68   CCAssignFn *RetCC = getReturnCC(CallConv);
     69   SmallVector<CCValAssign, 16> RVLocs;
     70   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
     71   return CCInfo.CheckReturn(Outs, RetCC);
     72 }
     73 
     74 static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
     75                                    MVT::v256f32, MVT::v512f32, MVT::v256f64};
     76 
     77 static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};
     78 
     79 void VETargetLowering::initRegisterClasses() {
     80   // Set up the register classes.
     81   addRegisterClass(MVT::i32, &VE::I32RegClass);
     82   addRegisterClass(MVT::i64, &VE::I64RegClass);
     83   addRegisterClass(MVT::f32, &VE::F32RegClass);
     84   addRegisterClass(MVT::f64, &VE::I64RegClass);
     85   addRegisterClass(MVT::f128, &VE::F128RegClass);
     86 
     87   if (Subtarget->enableVPU()) {
     88     for (MVT VecVT : AllVectorVTs)
     89       addRegisterClass(VecVT, &VE::V64RegClass);
     90     addRegisterClass(MVT::v256i1, &VE::VMRegClass);
     91     addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
     92   }
     93 }
     94 
     95 void VETargetLowering::initSPUActions() {
     96   const auto &TM = getTargetMachine();
     97   /// Load & Store {
     98 
     99   // VE doesn't have i1 sign extending load.
    100   for (MVT VT : MVT::integer_valuetypes()) {
    101     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    102     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    103     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    104     setTruncStoreAction(VT, MVT::i1, Expand);
    105   }
    106 
    107   // VE doesn't have floating point extload/truncstore, so expand them.
    108   for (MVT FPVT : MVT::fp_valuetypes()) {
    109     for (MVT OtherFPVT : MVT::fp_valuetypes()) {
    110       setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
    111       setTruncStoreAction(FPVT, OtherFPVT, Expand);
    112     }
    113   }
    114 
    115   // VE doesn't have fp128 load/store, so expand them in custom lower.
    116   setOperationAction(ISD::LOAD, MVT::f128, Custom);
    117   setOperationAction(ISD::STORE, MVT::f128, Custom);
    118 
    119   /// } Load & Store
    120 
    121   // Custom legalize address nodes into LO/HI parts.
    122   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
    123   setOperationAction(ISD::BlockAddress, PtrVT, Custom);
    124   setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
    125   setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
    126   setOperationAction(ISD::ConstantPool, PtrVT, Custom);
    127   setOperationAction(ISD::JumpTable, PtrVT, Custom);
    128 
    129   /// VAARG handling {
    130   setOperationAction(ISD::VASTART, MVT::Other, Custom);
    131   // VAARG needs to be lowered to access with 8 bytes alignment.
    132   setOperationAction(ISD::VAARG, MVT::Other, Custom);
    133   // Use the default implementation.
    134   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
    135   setOperationAction(ISD::VAEND, MVT::Other, Expand);
    136   /// } VAARG handling
    137 
    138   /// Stack {
    139   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
    140   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
    141 
    142   // Use the default implementation.
    143   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
    144   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
    145   /// } Stack
    146 
    147   /// Branch {
    148 
    149   // VE doesn't have BRCOND
    150   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
    151 
    152   // BR_JT is not implemented yet.
    153   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
    154 
    155   /// } Branch
    156 
    157   /// Int Ops {
    158   for (MVT IntVT : {MVT::i32, MVT::i64}) {
    159     // VE has no REM or DIVREM operations.
    160     setOperationAction(ISD::UREM, IntVT, Expand);
    161     setOperationAction(ISD::SREM, IntVT, Expand);
    162     setOperationAction(ISD::SDIVREM, IntVT, Expand);
    163     setOperationAction(ISD::UDIVREM, IntVT, Expand);
    164 
    165     // VE has no SHL_PARTS/SRA_PARTS/SRL_PARTS operations.
    166     setOperationAction(ISD::SHL_PARTS, IntVT, Expand);
    167     setOperationAction(ISD::SRA_PARTS, IntVT, Expand);
    168     setOperationAction(ISD::SRL_PARTS, IntVT, Expand);
    169 
    170     // VE has no MULHU/S or U/SMUL_LOHI operations.
    171     // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type.
    172     setOperationAction(ISD::MULHU, IntVT, Expand);
    173     setOperationAction(ISD::MULHS, IntVT, Expand);
    174     setOperationAction(ISD::UMUL_LOHI, IntVT, Expand);
    175     setOperationAction(ISD::SMUL_LOHI, IntVT, Expand);
    176 
    177     // VE has no CTTZ, ROTL, ROTR operations.
    178     setOperationAction(ISD::CTTZ, IntVT, Expand);
    179     setOperationAction(ISD::ROTL, IntVT, Expand);
    180     setOperationAction(ISD::ROTR, IntVT, Expand);
    181 
    182     // VE has 64 bits instruction which works as i64 BSWAP operation.  This
    183     // instruction works fine as i32 BSWAP operation with an additional
    184     // parameter.  Use isel patterns to lower BSWAP.
    185     setOperationAction(ISD::BSWAP, IntVT, Legal);
    186 
    187     // VE has only 64 bits instructions which work as i64 BITREVERSE/CTLZ/CTPOP
    188     // operations.  Use isel patterns for i64, promote for i32.
    189     LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
    190     setOperationAction(ISD::BITREVERSE, IntVT, Act);
    191     setOperationAction(ISD::CTLZ, IntVT, Act);
    192     setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act);
    193     setOperationAction(ISD::CTPOP, IntVT, Act);
    194 
    195     // VE has only 64 bits instructions which work as i64 AND/OR/XOR operations.
    196     // Use isel patterns for i64, promote for i32.
    197     setOperationAction(ISD::AND, IntVT, Act);
    198     setOperationAction(ISD::OR, IntVT, Act);
    199     setOperationAction(ISD::XOR, IntVT, Act);
    200   }
    201   /// } Int Ops
    202 
    203   /// Conversion {
    204   // VE doesn't have instructions for fp<->uint, so expand them by llvm
    205   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
    206   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
    207   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    208   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
    209 
    210   // fp16 not supported
    211   for (MVT FPVT : MVT::fp_valuetypes()) {
    212     setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
    213     setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
    214   }
    215   /// } Conversion
    216 
    217   /// Floating-point Ops {
    218   /// Note: Floating-point operations are fneg, fadd, fsub, fmul, fdiv, frem,
    219   ///       and fcmp.
    220 
    221   // VE doesn't have following floating point operations.
    222   for (MVT VT : MVT::fp_valuetypes()) {
    223     setOperationAction(ISD::FNEG, VT, Expand);
    224     setOperationAction(ISD::FREM, VT, Expand);
    225   }
    226 
    227   // VE doesn't have fdiv of f128.
    228   setOperationAction(ISD::FDIV, MVT::f128, Expand);
    229 
    230   for (MVT FPVT : {MVT::f32, MVT::f64}) {
    231     // f32 and f64 uses ConstantFP.  f128 uses ConstantPool.
    232     setOperationAction(ISD::ConstantFP, FPVT, Legal);
    233   }
    234   /// } Floating-point Ops
    235 
    236   /// Floating-point math functions {
    237 
    238   // VE doesn't have following floating point math functions.
    239   for (MVT VT : MVT::fp_valuetypes()) {
    240     setOperationAction(ISD::FABS, VT, Expand);
    241     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    242     setOperationAction(ISD::FCOS, VT, Expand);
    243     setOperationAction(ISD::FSIN, VT, Expand);
    244     setOperationAction(ISD::FSQRT, VT, Expand);
    245   }
    246 
    247   /// } Floating-point math functions
    248 
    249   /// Atomic instructions {
    250 
    251   setMaxAtomicSizeInBitsSupported(64);
    252   setMinCmpXchgSizeInBits(32);
    253   setSupportsUnalignedAtomics(false);
    254 
    255   // Use custom inserter for ATOMIC_FENCE.
    256   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
    257 
    258   // Other atomic instructions.
    259   for (MVT VT : MVT::integer_valuetypes()) {
    260     // Support i8/i16 atomic swap.
    261     setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);
    262 
    263     // FIXME: Support "atmam" instructions.
    264     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Expand);
    265     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Expand);
    266     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Expand);
    267     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Expand);
    268 
    269     // VE doesn't have follwing instructions.
    270     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
    271     setOperationAction(ISD::ATOMIC_LOAD_CLR, VT, Expand);
    272     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Expand);
    273     setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
    274     setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
    275     setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
    276     setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
    277     setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
    278   }
    279 
    280   /// } Atomic instructions
    281 
    282   /// SJLJ instructions {
    283   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
    284   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
    285   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
    286   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    287     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
    288   /// } SJLJ instructions
    289 
    290   // Intrinsic instructions
    291   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
    292 }
    293 
    294 void VETargetLowering::initVPUActions() {
    295   for (MVT LegalVecVT : AllVectorVTs) {
    296     setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom);
    297     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
    298     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
    299     // Translate all vector instructions with legal element types to VVP_*
    300     // nodes.
    301     // TODO We will custom-widen into VVP_* nodes in the future. While we are
    302     // buildling the infrastructure for this, we only do this for legal vector
    303     // VTs.
    304 #define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME)                                     \
    305   setOperationAction(ISD::VP_OPC, LegalVecVT, Custom);
    306 #define ADD_VVP_OP(VVP_NAME, ISD_NAME)                                         \
    307   setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
    308 #include "VVPNodes.def"
    309   }
    310 
    311   for (MVT LegalPackedVT : AllPackedVTs) {
    312     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
    313     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
    314   }
    315 }
    316 
    317 SDValue
    318 VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
    319                               bool IsVarArg,
    320                               const SmallVectorImpl<ISD::OutputArg> &Outs,
    321                               const SmallVectorImpl<SDValue> &OutVals,
    322                               const SDLoc &DL, SelectionDAG &DAG) const {
    323   // CCValAssign - represent the assignment of the return value to locations.
    324   SmallVector<CCValAssign, 16> RVLocs;
    325 
    326   // CCState - Info about the registers and stack slot.
    327   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
    328                  *DAG.getContext());
    329 
    330   // Analyze return values.
    331   CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv));
    332 
    333   SDValue Flag;
    334   SmallVector<SDValue, 4> RetOps(1, Chain);
    335 
    336   // Copy the result values into the output registers.
    337   for (unsigned i = 0; i != RVLocs.size(); ++i) {
    338     CCValAssign &VA = RVLocs[i];
    339     assert(VA.isRegLoc() && "Can only return in registers!");
    340     assert(!VA.needsCustom() && "Unexpected custom lowering");
    341     SDValue OutVal = OutVals[i];
    342 
    343     // Integer return values must be sign or zero extended by the callee.
    344     switch (VA.getLocInfo()) {
    345     case CCValAssign::Full:
    346       break;
    347     case CCValAssign::SExt:
    348       OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
    349       break;
    350     case CCValAssign::ZExt:
    351       OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
    352       break;
    353     case CCValAssign::AExt:
    354       OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
    355       break;
    356     case CCValAssign::BCvt: {
    357       // Convert a float return value to i64 with padding.
    358       //     63     31   0
    359       //    +------+------+
    360       //    | float|   0  |
    361       //    +------+------+
    362       assert(VA.getLocVT() == MVT::i64);
    363       assert(VA.getValVT() == MVT::f32);
    364       SDValue Undef = SDValue(
    365           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
    366       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
    367       OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
    368                                           MVT::i64, Undef, OutVal, Sub_f32),
    369                        0);
    370       break;
    371     }
    372     default:
    373       llvm_unreachable("Unknown loc info!");
    374     }
    375 
    376     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);
    377 
    378     // Guarantee that all emitted copies are stuck together with flags.
    379     Flag = Chain.getValue(1);
    380     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
    381   }
    382 
    383   RetOps[0] = Chain; // Update chain.
    384 
    385   // Add the flag if we have it.
    386   if (Flag.getNode())
    387     RetOps.push_back(Flag);
    388 
    389   return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, RetOps);
    390 }
    391 
    392 SDValue VETargetLowering::LowerFormalArguments(
    393     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
    394     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    395     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
    396   MachineFunction &MF = DAG.getMachineFunction();
    397 
    398   // Get the base offset of the incoming arguments stack space.
    399   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
    400   // Get the size of the preserved arguments area
    401   unsigned ArgsPreserved = 64;
    402 
    403   // Analyze arguments according to CC_VE.
    404   SmallVector<CCValAssign, 16> ArgLocs;
    405   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
    406                  *DAG.getContext());
    407   // Allocate the preserved area first.
    408   CCInfo.AllocateStack(ArgsPreserved, Align(8));
    409   // We already allocated the preserved area, so the stack offset computed
    410   // by CC_VE would be correct now.
    411   CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false));
    412 
    413   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    414     CCValAssign &VA = ArgLocs[i];
    415     assert(!VA.needsCustom() && "Unexpected custom lowering");
    416     if (VA.isRegLoc()) {
    417       // This argument is passed in a register.
    418       // All integer register arguments are promoted by the caller to i64.
    419 
    420       // Create a virtual register for the promoted live-in value.
    421       unsigned VReg =
    422           MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
    423       SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
    424 
    425       // The caller promoted the argument, so insert an Assert?ext SDNode so we
    426       // won't promote the value again in this function.
    427       switch (VA.getLocInfo()) {
    428       case CCValAssign::SExt:
    429         Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
    430                           DAG.getValueType(VA.getValVT()));
    431         break;
    432       case CCValAssign::ZExt:
    433         Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
    434                           DAG.getValueType(VA.getValVT()));
    435         break;
    436       case CCValAssign::BCvt: {
    437         // Extract a float argument from i64 with padding.
    438         //     63     31   0
    439         //    +------+------+
    440         //    | float|   0  |
    441         //    +------+------+
    442         assert(VA.getLocVT() == MVT::i64);
    443         assert(VA.getValVT() == MVT::f32);
    444         SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
    445         Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
    446                                          MVT::f32, Arg, Sub_f32),
    447                       0);
    448         break;
    449       }
    450       default:
    451         break;
    452       }
    453 
    454       // Truncate the register down to the argument type.
    455       if (VA.isExtInLoc())
    456         Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
    457 
    458       InVals.push_back(Arg);
    459       continue;
    460     }
    461 
    462     // The registers are exhausted. This argument was passed on the stack.
    463     assert(VA.isMemLoc());
    464     // The CC_VE_Full/Half functions compute stack offsets relative to the
    465     // beginning of the arguments area at %fp + the size of reserved area.
    466     unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset;
    467     unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
    468 
    469     // Adjust offset for a float argument by adding 4 since the argument is
    470     // stored in 8 bytes buffer with offset like below.  LLVM generates
    471     // 4 bytes load instruction, so need to adjust offset here.  This
    472     // adjustment is required in only LowerFormalArguments.  In LowerCall,
    473     // a float argument is converted to i64 first, and stored as 8 bytes
    474     // data, which is required by ABI, so no need for adjustment.
    475     //    0      4
    476     //    +------+------+
    477     //    | empty| float|
    478     //    +------+------+
    479     if (VA.getValVT() == MVT::f32)
    480       Offset += 4;
    481 
    482     int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
    483     InVals.push_back(
    484         DAG.getLoad(VA.getValVT(), DL, Chain,
    485                     DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
    486                     MachinePointerInfo::getFixedStack(MF, FI)));
    487   }
    488 
    489   if (!IsVarArg)
    490     return Chain;
    491 
    492   // This function takes variable arguments, some of which may have been passed
    493   // in registers %s0-%s8.
    494   //
    495   // The va_start intrinsic needs to know the offset to the first variable
    496   // argument.
    497   // TODO: need to calculate offset correctly once we support f128.
    498   unsigned ArgOffset = ArgLocs.size() * 8;
    499   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
    500   // Skip the reserved area at the top of stack.
    501   FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);
    502 
    503   return Chain;
    504 }
    505 
    506 // FIXME? Maybe this could be a TableGen attribute on some registers and
    507 // this table could be generated automatically from RegInfo.
    508 Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
    509                                              const MachineFunction &MF) const {
    510   Register Reg = StringSwitch<Register>(RegName)
    511                      .Case("sp", VE::SX11)    // Stack pointer
    512                      .Case("fp", VE::SX9)     // Frame pointer
    513                      .Case("sl", VE::SX8)     // Stack limit
    514                      .Case("lr", VE::SX10)    // Link register
    515                      .Case("tp", VE::SX14)    // Thread pointer
    516                      .Case("outer", VE::SX12) // Outer regiser
    517                      .Case("info", VE::SX17)  // Info area register
    518                      .Case("got", VE::SX15)   // Global offset table register
    519                      .Case("plt", VE::SX16) // Procedure linkage table register
    520                      .Default(0);
    521 
    522   if (Reg)
    523     return Reg;
    524 
    525   report_fatal_error("Invalid register name global variable");
    526 }
    527 
    528 //===----------------------------------------------------------------------===//
    529 // TargetLowering Implementation
    530 //===----------------------------------------------------------------------===//
    531 
    532 SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
    533                                     SmallVectorImpl<SDValue> &InVals) const {
    534   SelectionDAG &DAG = CLI.DAG;
    535   SDLoc DL = CLI.DL;
    536   SDValue Chain = CLI.Chain;
    537   auto PtrVT = getPointerTy(DAG.getDataLayout());
    538 
    539   // VE target does not yet support tail call optimization.
    540   CLI.IsTailCall = false;
    541 
    542   // Get the base offset of the outgoing arguments stack space.
    543   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
    544   // Get the size of the preserved arguments area
    545   unsigned ArgsPreserved = 8 * 8u;
    546 
    547   // Analyze operands of the call, assigning locations to each operand.
    548   SmallVector<CCValAssign, 16> ArgLocs;
    549   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
    550                  *DAG.getContext());
    551   // Allocate the preserved area first.
    552   CCInfo.AllocateStack(ArgsPreserved, Align(8));
    553   // We already allocated the preserved area, so the stack offset computed
    554   // by CC_VE would be correct now.
    555   CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false));
    556 
    557   // VE requires to use both register and stack for varargs or no-prototyped
    558   // functions.
    559   bool UseBoth = CLI.IsVarArg;
    560 
    561   // Analyze operands again if it is required to store BOTH.
    562   SmallVector<CCValAssign, 16> ArgLocs2;
    563   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
    564                   ArgLocs2, *DAG.getContext());
    565   if (UseBoth)
    566     CCInfo2.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, true));
    567 
    568   // Get the size of the outgoing arguments stack space requirement.
    569   unsigned ArgsSize = CCInfo.getNextStackOffset();
    570 
    571   // Keep stack frames 16-byte aligned.
    572   ArgsSize = alignTo(ArgsSize, 16);
    573 
    574   // Adjust the stack pointer to make room for the arguments.
    575   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
    576   // with more than 6 arguments.
    577   Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
    578 
    579   // Collect the set of registers to pass to the function and their values.
    580   // This will be emitted as a sequence of CopyToReg nodes glued to the call
    581   // instruction.
    582   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
    583 
    584   // Collect chains from all the memory opeations that copy arguments to the
    585   // stack. They must follow the stack pointer adjustment above and precede the
    586   // call instruction itself.
    587   SmallVector<SDValue, 8> MemOpChains;
    588 
    589   // VE needs to get address of callee function in a register
    590   // So, prepare to copy it to SX12 here.
    591 
    592   // If the callee is a GlobalAddress node (quite common, every direct call is)
    593   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
    594   // Likewise ExternalSymbol -> TargetExternalSymbol.
    595   SDValue Callee = CLI.Callee;
    596 
    597   bool IsPICCall = isPositionIndependent();
    598 
    599   // PC-relative references to external symbols should go through $stub.
    600   // If so, we need to prepare GlobalBaseReg first.
    601   const TargetMachine &TM = DAG.getTarget();
    602   const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
    603   const GlobalValue *GV = nullptr;
    604   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
    605   if (CalleeG)
    606     GV = CalleeG->getGlobal();
    607   bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
    608   bool UsePlt = !Local;
    609   MachineFunction &MF = DAG.getMachineFunction();
    610 
    611   // Turn GlobalAddress/ExternalSymbol node into a value node
    612   // containing the address of them here.
    613   if (CalleeG) {
    614     if (IsPICCall) {
    615       if (UsePlt)
    616         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
    617       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
    618       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
    619     } else {
    620       Callee =
    621           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
    622     }
    623   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    624     if (IsPICCall) {
    625       if (UsePlt)
    626         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
    627       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
    628       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
    629     } else {
    630       Callee =
    631           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
    632     }
    633   }
    634 
    635   RegsToPass.push_back(std::make_pair(VE::SX12, Callee));
    636 
    637   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    638     CCValAssign &VA = ArgLocs[i];
    639     SDValue Arg = CLI.OutVals[i];
    640 
    641     // Promote the value if needed.
    642     switch (VA.getLocInfo()) {
    643     default:
    644       llvm_unreachable("Unknown location info!");
    645     case CCValAssign::Full:
    646       break;
    647     case CCValAssign::SExt:
    648       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
    649       break;
    650     case CCValAssign::ZExt:
    651       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
    652       break;
    653     case CCValAssign::AExt:
    654       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
    655       break;
    656     case CCValAssign::BCvt: {
    657       // Convert a float argument to i64 with padding.
    658       //     63     31   0
    659       //    +------+------+
    660       //    | float|   0  |
    661       //    +------+------+
    662       assert(VA.getLocVT() == MVT::i64);
    663       assert(VA.getValVT() == MVT::f32);
    664       SDValue Undef = SDValue(
    665           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
    666       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
    667       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
    668                                        MVT::i64, Undef, Arg, Sub_f32),
    669                     0);
    670       break;
    671     }
    672     }
    673 
    674     if (VA.isRegLoc()) {
    675       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    676       if (!UseBoth)
    677         continue;
    678       VA = ArgLocs2[i];
    679     }
    680 
    681     assert(VA.isMemLoc());
    682 
    683     // Create a store off the stack pointer for this argument.
    684     SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
    685     // The argument area starts at %fp/%sp + the size of reserved area.
    686     SDValue PtrOff =
    687         DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
    688     PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
    689     MemOpChains.push_back(
    690         DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
    691   }
    692 
    693   // Emit all stores, make sure they occur before the call.
    694   if (!MemOpChains.empty())
    695     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
    696 
    697   // Build a sequence of CopyToReg nodes glued together with token chain and
    698   // glue operands which copy the outgoing args into registers. The InGlue is
    699   // necessary since all emitted instructions must be stuck together in order
    700   // to pass the live physical registers.
    701   SDValue InGlue;
    702   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    703     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
    704                              RegsToPass[i].second, InGlue);
    705     InGlue = Chain.getValue(1);
    706   }
    707 
    708   // Build the operands for the call instruction itself.
    709   SmallVector<SDValue, 8> Ops;
    710   Ops.push_back(Chain);
    711   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    712     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
    713                                   RegsToPass[i].second.getValueType()));
    714 
    715   // Add a register mask operand representing the call-preserved registers.
    716   const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
    717   const uint32_t *Mask =
    718       TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
    719   assert(Mask && "Missing call preserved mask for calling convention");
    720   Ops.push_back(DAG.getRegisterMask(Mask));
    721 
    722   // Make sure the CopyToReg nodes are glued to the call instruction which
    723   // consumes the registers.
    724   if (InGlue.getNode())
    725     Ops.push_back(InGlue);
    726 
    727   // Now the call itself.
    728   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    729   Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
    730   InGlue = Chain.getValue(1);
    731 
    732   // Revert the stack pointer immediately after the call.
    733   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
    734                              DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
    735   InGlue = Chain.getValue(1);
    736 
    737   // Now extract the return values. This is more or less the same as
    738   // LowerFormalArguments.
    739 
    740   // Assign locations to each value returned by this call.
    741   SmallVector<CCValAssign, 16> RVLocs;
    742   CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
    743                  *DAG.getContext());
    744 
    745   // Set inreg flag manually for codegen generated library calls that
    746   // return float.
    747   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
    748     CLI.Ins[0].Flags.setInReg();
    749 
    750   RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv));
    751 
    752   // Copy all of the result registers out of their specified physreg.
    753   for (unsigned i = 0; i != RVLocs.size(); ++i) {
    754     CCValAssign &VA = RVLocs[i];
    755     assert(!VA.needsCustom() && "Unexpected custom lowering");
    756     unsigned Reg = VA.getLocReg();
    757 
    758     // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
    759     // reside in the same register in the high and low bits. Reuse the
    760     // CopyFromReg previous node to avoid duplicate copies.
    761     SDValue RV;
    762     if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
    763       if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
    764         RV = Chain.getValue(0);
    765 
    766     // But usually we'll create a new CopyFromReg for a different register.
    767     if (!RV.getNode()) {
    768       RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
    769       Chain = RV.getValue(1);
    770       InGlue = Chain.getValue(2);
    771     }
    772 
    773     // The callee promoted the return value, so insert an Assert?ext SDNode so
    774     // we won't promote the value again in this function.
    775     switch (VA.getLocInfo()) {
    776     case CCValAssign::SExt:
    777       RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
    778                        DAG.getValueType(VA.getValVT()));
    779       break;
    780     case CCValAssign::ZExt:
    781       RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
    782                        DAG.getValueType(VA.getValVT()));
    783       break;
    784     case CCValAssign::BCvt: {
    785       // Extract a float return value from i64 with padding.
    786       //     63     31   0
    787       //    +------+------+
    788       //    | float|   0  |
    789       //    +------+------+
    790       assert(VA.getLocVT() == MVT::i64);
    791       assert(VA.getValVT() == MVT::f32);
    792       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
    793       RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
    794                                       MVT::f32, RV, Sub_f32),
    795                    0);
    796       break;
    797     }
    798     default:
    799       break;
    800     }
    801 
    802     // Truncate the register down to the return value type.
    803     if (VA.isExtInLoc())
    804       RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
    805 
    806     InVals.push_back(RV);
    807   }
    808 
    809   return Chain;
    810 }
    811 
    812 bool VETargetLowering::isOffsetFoldingLegal(
    813     const GlobalAddressSDNode *GA) const {
    814   // VE uses 64-bit addressing, so materializing an address already takes
    815   // several instructions.  Folding an offset into the address would increase
    816   // the instruction count further, so folding is refused here for every
    817   // global (GA is not inspected).  Offsets will be folded later in the DAG
          // combine if it is worth doing so.
    818   return false;
    819 }
    820 
    821 /// isFPImmLegal - Returns true if the target can instruction select the
    822 /// specified FP immediate natively. If false, the legalizer will
    823 /// materialize the FP immediate as a load from a constant pool.
        ///
        /// For VE the answer depends only on the type: every f32 and f64 immediate
        /// is reported as legal. \p Imm and \p ForCodeSize are not consulted.
    824 bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
    825                                     bool ForCodeSize) const {
    826   return VT == MVT::f32 || VT == MVT::f64;
    827 }
    828 
    829 /// Determine if the target supports unaligned memory accesses.
    830 ///
    831 /// This function returns true if the target allows unaligned memory accesses
    832 /// of the specified type in the given address space. If true, it also returns
    833 /// whether the unaligned memory access is "fast" in the last argument by
    834 /// reference. This is used, for example, in situations where an array
    835 /// copy/move/set is converted to a sequence of store operations. Its use
    836 /// helps to ensure that such replacements don't generate code that causes an
    837 /// alignment error (trap) on the target machine.
        ///
        /// The VE implementation answers unconditionally: it always returns true and,
        /// when \p Fast is non-null, sets *Fast to true. The value type, address
        /// space, alignment and memory-operand flags are all ignored.
    838 bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
    839                                                       unsigned AddrSpace,
    840                                                       Align A,
    841                                                       MachineMemOperand::Flags,
    842                                                       bool *Fast) const {
          // Fast is an optional out-parameter; only write through it when provided.
    843   if (Fast) {
    844     // It's fast anytime on VE
    845     *Fast = true;
    846   }
    847   return true;
    848 }
    849 
    850 VETargetLowering::VETargetLowering(const TargetMachine &TM,
    851                                    const VESubtarget &STI)
    852     : TargetLowering(TM), Subtarget(&STI) {
          // Configure the VE lowering object: boolean representation, register
          // classes and operation actions, the stack pointer register, target DAG
          // combines, and alignment requirements.
          //
    853   // Instructions which use registers as conditionals examine all the
    854   // bits (as does the pseudo SELECT_CC expansion). I don't think it
    855   // matters much whether it's ZeroOrOneBooleanContent, or
    856   // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
    857   // former.
    858   setBooleanContents(ZeroOrOneBooleanContent);
    859   setBooleanVectorContents(ZeroOrOneBooleanContent);
    860 
          // These helpers are defined elsewhere in this class; presumably they
          // register the register classes and the scalar (SPU) / vector (VPU)
          // operation actions -- confirm against their definitions.
    861   initRegisterClasses();
    862   initSPUActions();
    863   initVPUActions();
    864 
          // %sx11 is the register used as the stack pointer when building call
          // frames in this file.
    865   setStackPointerRegisterToSaveRestore(VE::SX11);
    866 
    867   // We have target-specific dag combine patterns for the following nodes:
    868   setTargetDAGCombine(ISD::TRUNCATE);
    869 
    870   // Set function alignment to 16 bytes
    871   setMinFunctionAlignment(Align(16));
    872 
    873   // VE stores all arguments with 8-byte alignment
    874   setMinStackArgumentAlignment(Align(8));
    875 
          // Runs last, after the register classes above have been set up.
    876   computeRegisterProperties(Subtarget->getRegisterInfo());
    877 }
    878 
    879 const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
          // Map a VEISD opcode to its printable name ("VEISD::<NAME>").  Returns
          // nullptr for VEISD::FIRST_NUMBER and for any opcode without a case below.
          //
          // TARGET_NODE_CASE(NAME) expands to a `case VEISD::NAME:` label that returns
          // the stringified, "VEISD::"-prefixed name.
    880 #define TARGET_NODE_CASE(NAME)                                                 \
    881   case VEISD::NAME:                                                            \
    882     return "VEISD::" #NAME;
    883   switch ((VEISD::NodeType)Opcode) {
    884   case VEISD::FIRST_NUMBER:
            // Not a real node; fall out of the switch and return nullptr.
    885     break;
    886     TARGET_NODE_CASE(CALL)
    887     TARGET_NODE_CASE(EH_SJLJ_LONGJMP)
    888     TARGET_NODE_CASE(EH_SJLJ_SETJMP)
    889     TARGET_NODE_CASE(EH_SJLJ_SETUP_DISPATCH)
    890     TARGET_NODE_CASE(GETFUNPLT)
    891     TARGET_NODE_CASE(GETSTACKTOP)
    892     TARGET_NODE_CASE(GETTLSADDR)
    893     TARGET_NODE_CASE(GLOBAL_BASE_REG)
    894     TARGET_NODE_CASE(Hi)
    895     TARGET_NODE_CASE(Lo)
    896     TARGET_NODE_CASE(MEMBARRIER)
    897     TARGET_NODE_CASE(RET_FLAG)
    898     TARGET_NODE_CASE(TS1AM)
    899     TARGET_NODE_CASE(VEC_BROADCAST)
    900 
    901     // Register the VVP_* SDNodes.
            // Each ADD_VVP_OP entry in VVPNodes.def becomes one TARGET_NODE_CASE.
            // NOTE(review): ADD_VVP_OP is not #undef'd here; presumably VVPNodes.def
            // undefines its own macros -- confirm.
    902 #define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
    903 #include "VVPNodes.def"
    904   }
    905 #undef TARGET_NODE_CASE
    906   return nullptr;
    907 }
    908 
    909 EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
    910                                          EVT VT) const {
          // SETCC results are always i32 on VE, whatever the compared type VT is.
    911   return MVT::i32;
    912 }
    913 
    914 // Convert to a target node and set target flags.
        //
        // Op must be a GlobalAddress, BlockAddress, ConstantPool, ExternalSymbol or
        // JumpTable node; it is rebuilt as the corresponding Target* node carrying
        // the target flag TF.  Any other node kind hits llvm_unreachable.
    915 SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
    916                                           SelectionDAG &DAG) const {
    917   if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
    918     return DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
    919                                       GA->getValueType(0), GA->getOffset(), TF);
    920 
          // NOTE(review): the block address is rebuilt with offset 0 rather than
          // BA->getOffset() -- confirm that dropping the offset is intentional.
    921   if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
    922     return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(),
    923                                      0, TF);
    924 
    925   if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
    926     return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0),
    927                                      CP->getAlign(), CP->getOffset(), TF);
    928 
    929   if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
    930     return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
    931                                        TF);
    932 
    933   if (const JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op))
    934     return DAG.getTargetJumpTable(JT->getIndex(), JT->getValueType(0), TF);
    935 
    936   llvm_unreachable("Unhandled address SDNode");
    937 }
    938 
    939 // Split Op into high and low parts according to HiTF and LoTF.
    940 // Return an ADD node combining the parts.
        //
        // The same address node Op is wrapped twice via withTargetFlags(), once with
        // each flag, and fed to VEISD::Hi and VEISD::Lo respectively.
    941 SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
    942                                        SelectionDAG &DAG) const {
    943   SDLoc DL(Op);
    944   EVT VT = Op.getValueType();
    945   SDValue Hi = DAG.getNode(VEISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
    946   SDValue Lo = DAG.getNode(VEISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
    947   return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
    948 }
    949 
    950 // Build SDNodes for producing an address from a GlobalAddress, BlockAddress,
    951 // ConstantPool, JumpTable, or ExternalSymbol SDNode.
        //
        // In PIC mode the address is formed relative to the global base register:
        // constant pools, jump tables and local-linkage globals use a GOT-relative
        // offset (@gotoff), everything else is loaded from its GOT slot (@got).
        // Otherwise a 64-bit absolute Hi/Lo pair is produced.
    952 SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
    953   SDLoc DL(Op);
    954   EVT PtrVT = Op.getValueType();
    955 
    956   // Handle PIC mode first. VE needs a got load for every variable!
    957   if (isPositionIndependent()) {
            // GlobalN is null when Op is not a GlobalAddress node.
    958     auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op);
    959 
    960     if (isa<ConstantPoolSDNode>(Op) || isa<JumpTableSDNode>(Op) ||
    961         (GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) {
    962       // Create following instructions for local linkage PIC code.
    963       //     lea %reg, label@gotoff_lo
    964       //     and %reg, %reg, (32)0
    965       //     lea.sl %reg, label@gotoff_hi(%reg, %got)
    966       SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
    967                                   VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
    968       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
    969       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
    970     }
    971     // Create following instructions for not local linkage PIC code.
    972     //     lea %reg, label@got_lo
    973     //     and %reg, %reg, (32)0
    974     //     lea.sl %reg, label@got_hi(%reg)
    975     //     ld %reg, (%reg, %got)
    976     SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32,
    977                                 VEMCExpr::VK_VE_GOT_LO32, DAG);
    978     SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
    979     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
            // The GOT slot load is chained to the entry node, not to any prior store.
    980     return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
    981                        MachinePointerInfo::getGOT(DAG.getMachineFunction()));
    982   }
    983 
    984   // This is one of the absolute code models.
          // Small, Medium and Large all use the same 64-bit absolute sequence; any
          // other code model is rejected.
    985   switch (getTargetMachine().getCodeModel()) {
    986   default:
    987     llvm_unreachable("Unsupported absolute code model");
    988   case CodeModel::Small:
    989   case CodeModel::Medium:
    990   case CodeModel::Large:
    991     // abs64.
    992     return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
    993   }
    994 }
    995 
    996 /// Custom Lower {
    997 
    998 // The mappings for emitLeading/TrailingFence for VE are designed by following
    999 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
        //
        // emitLeadingFence creates the fence that must precede the atomic instruction
        // Inst for ordering Ord, or returns nullptr when no leading fence is needed.
   1000 Instruction *VETargetLowering::emitLeadingFence(IRBuilder<> &Builder,
   1001                                                 Instruction *Inst,
   1002                                                 AtomicOrdering Ord) const {
   1003   switch (Ord) {
   1004   case AtomicOrdering::NotAtomic:
   1005   case AtomicOrdering::Unordered:
   1006     llvm_unreachable("Invalid fence: unordered/non-atomic");
   1007   case AtomicOrdering::Monotonic:
   1008   case AtomicOrdering::Acquire:
   1009     return nullptr; // Nothing to do
   1010   case AtomicOrdering::Release:
   1011   case AtomicOrdering::AcquireRelease:
            // Release semantics are provided by a release fence before the operation.
   1012     return Builder.CreateFence(AtomicOrdering::Release);
   1013   case AtomicOrdering::SequentiallyConsistent:
            // Only instructions that perform an atomic store get a leading seq_cst
            // fence; others need none here.
   1014     if (!Inst->hasAtomicStore())
   1015       return nullptr; // Nothing to do
   1016     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
   1017   }
   1018   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
   1019 }
   1020 
   1021 Instruction *VETargetLowering::emitTrailingFence(IRBuilder<> &Builder,
   1022                                                  Instruction *Inst,
   1023                                                  AtomicOrdering Ord) const {
          // Create the fence that must follow an atomic instruction with ordering
          // Ord, or return nullptr when no trailing fence is needed.  Inst is not
          // inspected here.
   1024   switch (Ord) {
   1025   case AtomicOrdering::NotAtomic:
   1026   case AtomicOrdering::Unordered:
   1027     llvm_unreachable("Invalid fence: unordered/not-atomic");
   1028   case AtomicOrdering::Monotonic:
   1029   case AtomicOrdering::Release:
   1030     return nullptr; // Nothing to do
   1031   case AtomicOrdering::Acquire:
   1032   case AtomicOrdering::AcquireRelease:
            // Acquire semantics are provided by an acquire fence after the operation.
   1033     return Builder.CreateFence(AtomicOrdering::Acquire);
   1034   case AtomicOrdering::SequentiallyConsistent:
   1035     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
   1036   }
   1037   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
   1038 }
   1039 
   1040 SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op,
   1041                                             SelectionDAG &DAG) const {
          // Lower a fence node.  Operand 0 is the chain, operand 1 the atomic ordering
          // and operand 2 the synchronization scope.  System-scope fences stronger
          // than monotonic become a FENCEM machine node; everything else becomes a
          // VEISD::MEMBARRIER.
   1042   SDLoc DL(Op);
   1043   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
   1044       cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
   1045   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
   1046       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
   1047 
   1048   // VE uses Release consistency, so need a fence instruction if it is a
   1049   // cross-thread fence.
   1050   if (FenceSSID == SyncScope::System) {
   1051     switch (FenceOrdering) {
   1052     case AtomicOrdering::NotAtomic:
   1053     case AtomicOrdering::Unordered:
   1054     case AtomicOrdering::Monotonic:
   1055       // No need to generate fencem instruction here.
   1056       break;
   1057     case AtomicOrdering::Acquire:
   1058       // Generate "fencem 2" as acquire fence.
   1059       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
   1060                                         DAG.getTargetConstant(2, DL, MVT::i32),
   1061                                         Op.getOperand(0)),
   1062                      0);
   1063     case AtomicOrdering::Release:
   1064       // Generate "fencem 1" as release fence.
   1065       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
   1066                                         DAG.getTargetConstant(1, DL, MVT::i32),
   1067                                         Op.getOperand(0)),
   1068                      0);
   1069     case AtomicOrdering::AcquireRelease:
   1070     case AtomicOrdering::SequentiallyConsistent:
   1071       // Generate "fencem 3" as acq_rel and seq_cst fence.
   1072       // FIXME: "fencem 3" doesn't wait for PCIe device accesses,
   1073       //        so seq_cst may require more instructions for them.
   1074       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
   1075                                         DAG.getTargetConstant(3, DL, MVT::i32),
   1076                                         Op.getOperand(0)),
   1077                      0);
   1078     }
   1079   }
   1080 
   1081   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
          // Reached for non-System scopes and for System-scope orderings that broke
          // out of the switch above.
   1082   return DAG.getNode(VEISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
   1083 }
   1084 
   1085 TargetLowering::AtomicExpansionKind
   1086 VETargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
          // Decide how an atomicrmw instruction is expanded: exchange is kept as-is,
          // every other operation is rewritten as a compare-and-exchange loop.
   1087   // We have TS1AM implementation for i8/i16/i32/i64, so use it.
   1088   if (AI->getOperation() == AtomicRMWInst::Xchg) {
   1089     return AtomicExpansionKind::None;
   1090   }
   1091   // FIXME: Support "ATMAM" instruction for LOAD_ADD/SUB/AND/OR.
   1092 
   1093   // Otherwise, expand it using compare and exchange instruction to not call
   1094   // __sync_fetch_and_* functions.
   1095   return AtomicExpansionKind::CmpXChg;
   1096 }
   1097 
   1098 static SDValue prepareTS1AM(SDValue Op, SelectionDAG &DAG, SDValue &Flag,
   1099                             SDValue &Bits) {
          // Compute the TS1AM operands for a sub-word atomic swap.  Op must be an
          // AtomicSDNode whose operand 1 is the pointer and operand 2 the new value.
          // Outputs: Flag (byte-enable mask, i32) and Bits (shift amount in bits,
          // pointer-typed).  Returns the new value shifted left by Bits.
   1100   SDLoc DL(Op);
   1101   AtomicSDNode *N = cast<AtomicSDNode>(Op);
   1102   SDValue Ptr = N->getOperand(1);
   1103   SDValue Val = N->getOperand(2);
   1104   EVT PtrVT = Ptr.getValueType();
          // Any memory type other than i8 is handled as the 2-byte case.
   1105   bool Byte = N->getMemoryVT() == MVT::i8;
   1106   //   Remainder = AND Ptr, 3
   1107   //   Flag = 1 << Remainder  ; If Byte is true (1 byte swap flag)
   1108   //   Flag = 3 << Remainder  ; If Byte is false (2 bytes swap flag)
   1109   //   Bits = Remainder << 3
   1110   //   NewVal = Val << Bits
   1111   SDValue Const3 = DAG.getConstant(3, DL, PtrVT);
   1112   SDValue Remainder = DAG.getNode(ISD::AND, DL, PtrVT, {Ptr, Const3});
   1113   SDValue Mask = Byte ? DAG.getConstant(1, DL, MVT::i32)
   1114                       : DAG.getConstant(3, DL, MVT::i32);
          // Const3 serves both as the low-bit mask above and as the shift count
          // (x8, bytes to bits) below.
   1115   Flag = DAG.getNode(ISD::SHL, DL, MVT::i32, {Mask, Remainder});
   1116   Bits = DAG.getNode(ISD::SHL, DL, PtrVT, {Remainder, Const3});
   1117   return DAG.getNode(ISD::SHL, DL, Val.getValueType(), {Val, Bits});
   1118 }
   1119 
   1120 static SDValue finalizeTS1AM(SDValue Op, SelectionDAG &DAG, SDValue Data,
   1121                              SDValue Bits) {
          // Extract the sub-word result of a TS1AM from the returned word Data: shift
          // it right by Bits (as computed by prepareTS1AM) and mask it to the memory
          // width of the atomic node Op.
   1122   SDLoc DL(Op);
   1123   EVT VT = Data.getValueType();
   1124   bool Byte = cast<AtomicSDNode>(Op)->getMemoryVT() == MVT::i8;
   1125   //   NewData = Data >> Bits
   1126   //   Result = NewData & 0xff   ; If Byte is true (1 byte)
   1127   //   Result = NewData & 0xffff ; If Byte is false (2 bytes)
   1128 
   1129   SDValue NewData = DAG.getNode(ISD::SRL, DL, VT, Data, Bits);
   1130   return DAG.getNode(ISD::AND, DL, VT,
   1131                      {NewData, DAG.getConstant(Byte ? 0xff : 0xffff, DL, VT)});
   1132 }
   1133 
   1134 SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op,
   1135                                            SelectionDAG &DAG) const {
          // Lower an atomic swap.  i8 and i16 swaps are emulated with a VEISD::TS1AM
          // on the enclosing 4-byte-aligned word; the result is merged with the
          // TS1AM chain.  Other widths are returned unchanged.
   1136   SDLoc DL(Op);
   1137   AtomicSDNode *N = cast<AtomicSDNode>(Op);
   1138 
   1139   if (N->getMemoryVT() == MVT::i8) {
   1140     // For i8, use "ts1am"
   1141     //   Input:
   1142     //     ATOMIC_SWAP Ptr, Val, Order
   1143     //
   1144     //   Output:
   1145     //     Remainder = AND Ptr, 3
   1146     //     Flag = 1 << Remainder   ; 1 byte swap flag for TS1AM inst.
   1147     //     Bits = Remainder << 3
   1148     //     NewVal = Val << Bits
   1149     //
   1150     //     Aligned = AND Ptr, -4
   1151     //     Data = TS1AM Aligned, Flag, NewVal
   1152     //
   1153     //     NewData = Data >> Bits
   1154     //     Result = NewData & 0xff ; 1 byte result
   1155     SDValue Flag;
   1156     SDValue Bits;
   1157     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
   1158 
            // NOTE(review): the -4 constant is built as i64 while the AND uses
            // Ptr's type; presumably pointers are always i64 on VE -- confirm.
   1159     SDValue Ptr = N->getOperand(1);
   1160     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
   1161                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
   1162     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
   1163                                   DAG.getVTList(Op.getNode()->getValueType(0),
   1164                                                 Op.getNode()->getValueType(1)),
   1165                                   {N->getChain(), Aligned, Flag, NewVal},
   1166                                   N->getMemOperand());
   1167 
   1168     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
   1169     SDValue Chain = TS1AM.getValue(1);
   1170     return DAG.getMergeValues({Result, Chain}, DL);
   1171   }
   1172   if (N->getMemoryVT() == MVT::i16) {
   1173     // For i16, use "ts1am"
            // Same sequence as the i8 case; prepareTS1AM/finalizeTS1AM select the
            // 2-byte flag (3 << Remainder) and the 0xffff mask from the memory type.
            // NOTE(review): a 2-byte value at byte offset 3 would not fit in the
            // aligned word; presumably i16 atomics are always 2-byte aligned --
            // confirm.
   1174     SDValue Flag;
   1175     SDValue Bits;
   1176     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
   1177 
   1178     SDValue Ptr = N->getOperand(1);
   1179     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
   1180                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
   1181     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
   1182                                   DAG.getVTList(Op.getNode()->getValueType(0),
   1183                                                 Op.getNode()->getValueType(1)),
   1184                                   {N->getChain(), Aligned, Flag, NewVal},
   1185                                   N->getMemOperand());
   1186 
   1187     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
   1188     SDValue Chain = TS1AM.getValue(1);
   1189     return DAG.getMergeValues({Result, Chain}, DL);
   1190   }
   1191   // Otherwise, let llvm legalize it.
   1192   return Op;
   1193 }
   1194 
   1195 SDValue VETargetLowering::lowerGlobalAddress(SDValue Op,
   1196                                              SelectionDAG &DAG) const {
          // Global addresses share the common address-materialization path.
   1197   return makeAddress(Op, DAG);
   1198 }
   1199 
   1200 SDValue VETargetLowering::lowerBlockAddress(SDValue Op,
   1201                                             SelectionDAG &DAG) const {
          // Block addresses share the common address-materialization path.
   1202   return makeAddress(Op, DAG);
   1203 }
   1204 
   1205 SDValue VETargetLowering::lowerConstantPool(SDValue Op,
   1206                                             SelectionDAG &DAG) const {
          // Constant-pool addresses share the common address-materialization path.
   1207   return makeAddress(Op, DAG);
   1208 }
   1209 
   1210 SDValue
   1211 VETargetLowering::lowerToTLSGeneralDynamicModel(SDValue Op,
   1212                                                 SelectionDAG &DAG) const {
          // Lower a TLS address using the general dynamic model: wrap a
          // VEISD::GETTLSADDR pseudo-call in a call sequence and read the resulting
          // address back from %sx0.
   1213   SDLoc DL(Op);
   1214 
   1215   // Generate the following code:
   1216   //   t1: ch,glue = callseq_start t0, 64, 0
   1217   //   t2: ch,glue = VEISD::GETTLSADDR t1, label, regmask, t1:1
   1218   //   t3: ch,glue = callseq_end t2, 64, 0, t2:1
   1219   //   t4: i64,ch,glue = CopyFromReg t3, Register:i64 $sx0, t3:1
   1220   SDValue Label = withTargetFlags(Op, 0, DAG);
   1221   EVT PtrVT = Op.getValueType();
   1222 
   1223   // Lowering the machine isd will make sure everything is in the right
   1224   // location.
   1225   SDValue Chain = DAG.getEntryNode();
   1226   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   1227   const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
   1228       DAG.getMachineFunction(), CallingConv::C);
   1229   Chain = DAG.getCALLSEQ_START(Chain, 64, 0, DL);
   1230   SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)};
   1231   Chain = DAG.getNode(VEISD::GETTLSADDR, DL, NodeTys, Args);
   1232   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, DL, true),
   1233                              DAG.getIntPtrConstant(0, DL, true),
   1234                              Chain.getValue(1), DL);
          // From here on "Chain" holds result 0 of the CopyFromReg, i.e. the pointer
          // value read from %sx0; that value is what the function returns.
   1235   Chain = DAG.getCopyFromReg(Chain, DL, VE::SX0, PtrVT, Chain.getValue(1));
   1236 
   1237   // GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls.
   1238   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
   1239   MFI.setHasCalls(true);
   1240 
   1241   // Also generate code to prepare a GOT register if it is PIC.
          // The returned register is unused; the call is made for its side effect.
   1242   if (isPositionIndependent()) {
   1243     MachineFunction &MF = DAG.getMachineFunction();
   1244     Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
   1245   }
   1246 
   1247   return Chain;
   1248 }
   1249 
   1250 SDValue VETargetLowering::lowerGlobalTLSAddress(SDValue Op,
   1251                                                 SelectionDAG &DAG) const {
   1252   // The current implementation of nld (2.26) doesn't allow local exec model
   1253   // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
   1254   // generate the general dynamic model code sequence.
   1255   //
   1256   // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
          //
          // Consequently the TLS model requested for the global is not consulted.
   1257   return lowerToTLSGeneralDynamicModel(Op, DAG);
   1258 }
   1259 
   1260 SDValue VETargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
          // Jump-table addresses share the common address-materialization path.
   1261   return makeAddress(Op, DAG);
   1262 }
   1263 
   1264 // Lower a f128 load into two f64 loads.
        //
        // Returns a merged pair: the f128 value assembled from the two halves, and a
        // TokenFactor joining the output chains of both loads.
   1265 static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) {
   1266   SDLoc DL(Op);
   1267   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
   1268   assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
          // Each half is only 8 bytes wide, so cap the alignment claimed for it at 8.
   1269   unsigned Alignment = LdNode->getAlign().value();
   1270   if (Alignment > 8)
   1271     Alignment = 8;
   1272 
          // Both loads hang off the original load's input chain and preserve its
          // volatility; no other memory-operand flags are carried over.
   1273   SDValue Lo64 =
   1274       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), LdNode->getBasePtr(),
   1275                   LdNode->getPointerInfo(), Alignment,
   1276                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
   1277                                        : MachineMemOperand::MONone);
   1278   EVT AddrVT = LdNode->getBasePtr().getValueType();
   1279   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, LdNode->getBasePtr(),
   1280                               DAG.getConstant(8, DL, AddrVT));
          // NOTE(review): this load reads from base+8 but reuses the original
          // pointer info without an 8-byte offset -- confirm whether
          // getPointerInfo().getWithOffset(8) is intended.
   1281   SDValue Hi64 =
   1282       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), HiPtr,
   1283                   LdNode->getPointerInfo(), Alignment,
   1284                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
   1285                                        : MachineMemOperand::MONone);
   1286 
   1287   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
   1288   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
   1289 
   1290   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
          // Hi64 goes into the even sub-register and Lo64 into the odd one.
   1291   SDNode *InFP128 =
   1292       DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f128);
   1293   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
   1294                                SDValue(InFP128, 0), Hi64, SubRegEven);
   1295   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
   1296                                SDValue(InFP128, 0), Lo64, SubRegOdd);
   1297   SDValue OutChains[2] = {SDValue(Lo64.getNode(), 1),
   1298                           SDValue(Hi64.getNode(), 1)};
   1299   SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
   1300   SDValue Ops[2] = {SDValue(InFP128, 0), OutChain};
   1301   return DAG.getMergeValues(Ops, DL);
   1302 }
   1303 
   1304 SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
          // Custom-lower a load: f128 loads are split into two f64 loads unless the
          // base pointer is a frame index; every other load is returned unchanged.
   1305   LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
   1306 
   1307   SDValue BasePtr = LdNode->getBasePtr();
   1308   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
   1309     // Do not expand load instruction with frame index here because of
   1310     // dependency problems.  We expand it later in eliminateFrameIndex().
   1311     return Op;
   1312   }
   1313 
   1314   EVT MemVT = LdNode->getMemoryVT();
   1315   if (MemVT == MVT::f128)
   1316     return lowerLoadF128(Op, DAG);
   1317 
   1318   return Op;
   1319 }
   1320 
   1321 // Lower a f128 store into two f64 stores.
        //
        // Returns a TokenFactor joining the chains of the two 8-byte stores.
   1322 static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) {
   1323   SDLoc DL(Op);
   1324   StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
   1325   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
   1326 
   1327   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
   1328   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
   1329 
          // The two halves are extracted as i64 values: the even sub-register is the
          // high half and the odd sub-register the low half.
   1330   SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
   1331                                     StNode->getValue(), SubRegEven);
   1332   SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
   1333                                     StNode->getValue(), SubRegOdd);
   1334 
          // Each half is only 8 bytes wide, so cap the alignment claimed for it at 8.
   1335   unsigned Alignment = StNode->getAlign().value();
   1336   if (Alignment > 8)
   1337     Alignment = 8;
   1338 
   1339   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
          // Both stores use the original store's input chain and preserve its
          // volatility.  They are created with an empty MachinePointerInfo, so the
          // original store's pointer info is not carried over.
   1340   SDValue OutChains[2];
   1341   OutChains[0] =
   1342       DAG.getStore(StNode->getChain(), DL, SDValue(Lo64, 0),
   1343                    StNode->getBasePtr(), MachinePointerInfo(), Alignment,
   1344                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
   1345                                         : MachineMemOperand::MONone);
   1346   EVT AddrVT = StNode->getBasePtr().getValueType();
   1347   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, StNode->getBasePtr(),
   1348                               DAG.getConstant(8, DL, AddrVT));
   1349   OutChains[1] =
   1350       DAG.getStore(StNode->getChain(), DL, SDValue(Hi64, 0), HiPtr,
   1351                    MachinePointerInfo(), Alignment,
   1352                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
   1353                                         : MachineMemOperand::MONone);
   1354   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
   1355 }
   1356 
   1357 SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   1358   StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
   1359   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
   1360 
   1361   SDValue BasePtr = StNode->getBasePtr();
   1362   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
   1363     // Do not expand store instruction with frame index here because of
   1364     // dependency problems.  We expand it later in eliminateFrameIndex().
   1365     return Op;
   1366   }
   1367 
   1368   EVT MemVT = StNode->getMemoryVT();
   1369   if (MemVT == MVT::f128)
   1370     return lowerStoreF128(Op, DAG);
   1371 
   1372   // Otherwise, ask llvm to expand it.
   1373   return SDValue();
   1374 }
   1375 
   1376 SDValue VETargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   1377   MachineFunction &MF = DAG.getMachineFunction();
   1378   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
   1379   auto PtrVT = getPointerTy(DAG.getDataLayout());
   1380 
   1381   // Need frame address to find the address of VarArgsFrameIndex.
   1382   MF.getFrameInfo().setFrameAddressIsTaken(true);
   1383 
   1384   // vastart just stores the address of the VarArgsFrameIndex slot into the
   1385   // memory location argument.
   1386   SDLoc DL(Op);
   1387   SDValue Offset =
   1388       DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
   1389                   DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
   1390   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   1391   return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
   1392                       MachinePointerInfo(SV));
   1393 }
   1394 
   1395 SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   1396   SDNode *Node = Op.getNode();
   1397   EVT VT = Node->getValueType(0);
   1398   SDValue InChain = Node->getOperand(0);
   1399   SDValue VAListPtr = Node->getOperand(1);
   1400   EVT PtrVT = VAListPtr.getValueType();
   1401   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
   1402   SDLoc DL(Node);
   1403   SDValue VAList =
   1404       DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
   1405   SDValue Chain = VAList.getValue(1);
   1406   SDValue NextPtr;
   1407 
   1408   if (VT == MVT::f128) {
   1409     // VE f128 values must be stored with 16 bytes alignment.  We doesn't
   1410     // know the actual alignment of VAList, so we take alignment of it
   1411     // dyanmically.
   1412     int Align = 16;
   1413     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
   1414                          DAG.getConstant(Align - 1, DL, PtrVT));
   1415     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
   1416                          DAG.getConstant(-Align, DL, PtrVT));
   1417     // Increment the pointer, VAList, by 16 to the next vaarg.
   1418     NextPtr =
   1419         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL));
   1420   } else if (VT == MVT::f32) {
   1421     // float --> need special handling like below.
   1422     //    0      4
   1423     //    +------+------+
   1424     //    | empty| float|
   1425     //    +------+------+
   1426     // Increment the pointer, VAList, by 8 to the next vaarg.
   1427     NextPtr =
   1428         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
   1429     // Then, adjust VAList.
   1430     unsigned InternalOffset = 4;
   1431     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
   1432                          DAG.getConstant(InternalOffset, DL, PtrVT));
   1433   } else {
   1434     // Increment the pointer, VAList, by 8 to the next vaarg.
   1435     NextPtr =
   1436         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
   1437   }
   1438 
   1439   // Store the incremented VAList to the legalized pointer.
   1440   InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV));
   1441 
   1442   // Load the actual argument out of the pointer VAList.
   1443   // We can't count on greater alignment than the word size.
   1444   return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
   1445                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
   1446 }
   1447 
   1448 SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
   1449                                                   SelectionDAG &DAG) const {
   1450   // Generate following code.
   1451   //   (void)__llvm_grow_stack(size);
   1452   //   ret = GETSTACKTOP;        // pseudo instruction
   1453   SDLoc DL(Op);
   1454 
   1455   // Get the inputs.
   1456   SDNode *Node = Op.getNode();
   1457   SDValue Chain = Op.getOperand(0);
   1458   SDValue Size = Op.getOperand(1);
   1459   MaybeAlign Alignment(Op.getConstantOperandVal(2));
   1460   EVT VT = Node->getValueType(0);
   1461 
   1462   // Chain the dynamic stack allocation so that it doesn't modify the stack
   1463   // pointer when other instructions are using the stack.
   1464   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
   1465 
   1466   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
   1467   Align StackAlign = TFI.getStackAlign();
   1468   bool NeedsAlign = Alignment.valueOrOne() > StackAlign;
   1469 
   1470   // Prepare arguments
   1471   TargetLowering::ArgListTy Args;
   1472   TargetLowering::ArgListEntry Entry;
   1473   Entry.Node = Size;
   1474   Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
   1475   Args.push_back(Entry);
   1476   if (NeedsAlign) {
   1477     Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
   1478     Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
   1479     Args.push_back(Entry);
   1480   }
   1481   Type *RetTy = Type::getVoidTy(*DAG.getContext());
   1482 
   1483   EVT PtrVT = Op.getValueType();
   1484   SDValue Callee;
   1485   if (NeedsAlign) {
   1486     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0);
   1487   } else {
   1488     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0);
   1489   }
   1490 
   1491   TargetLowering::CallLoweringInfo CLI(DAG);
   1492   CLI.setDebugLoc(DL)
   1493       .setChain(Chain)
   1494       .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args))
   1495       .setDiscardResult(true);
   1496   std::pair<SDValue, SDValue> pair = LowerCallTo(CLI);
   1497   Chain = pair.second;
   1498   SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain);
   1499   if (NeedsAlign) {
   1500     Result = DAG.getNode(ISD::ADD, DL, VT, Result,
   1501                          DAG.getConstant((Alignment->value() - 1ULL), DL, VT));
   1502     Result = DAG.getNode(ISD::AND, DL, VT, Result,
   1503                          DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT));
   1504   }
   1505   //  Chain = Result.getValue(1);
   1506   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
   1507                              DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
   1508 
   1509   SDValue Ops[2] = {Result, Chain};
   1510   return DAG.getMergeValues(Ops, DL);
   1511 }
   1512 
   1513 SDValue VETargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
   1514                                                SelectionDAG &DAG) const {
   1515   SDLoc DL(Op);
   1516   return DAG.getNode(VEISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
   1517                      Op.getOperand(1));
   1518 }
   1519 
   1520 SDValue VETargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
   1521                                               SelectionDAG &DAG) const {
   1522   SDLoc DL(Op);
   1523   return DAG.getNode(VEISD::EH_SJLJ_SETJMP, DL,
   1524                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
   1525                      Op.getOperand(1));
   1526 }
   1527 
   1528 SDValue VETargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
   1529                                                       SelectionDAG &DAG) const {
   1530   SDLoc DL(Op);
   1531   return DAG.getNode(VEISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
   1532                      Op.getOperand(0));
   1533 }
   1534 
   1535 static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
   1536                               const VETargetLowering &TLI,
   1537                               const VESubtarget *Subtarget) {
   1538   SDLoc DL(Op);
   1539   MachineFunction &MF = DAG.getMachineFunction();
   1540   EVT PtrVT = TLI.getPointerTy(MF.getDataLayout());
   1541 
   1542   MachineFrameInfo &MFI = MF.getFrameInfo();
   1543   MFI.setFrameAddressIsTaken(true);
   1544 
   1545   unsigned Depth = Op.getConstantOperandVal(0);
   1546   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   1547   unsigned FrameReg = RegInfo->getFrameRegister(MF);
   1548   SDValue FrameAddr =
   1549       DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
   1550   while (Depth--)
   1551     FrameAddr = DAG.getLoad(Op.getValueType(), DL, DAG.getEntryNode(),
   1552                             FrameAddr, MachinePointerInfo());
   1553   return FrameAddr;
   1554 }
   1555 
   1556 static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
   1557                                const VETargetLowering &TLI,
   1558                                const VESubtarget *Subtarget) {
   1559   MachineFunction &MF = DAG.getMachineFunction();
   1560   MachineFrameInfo &MFI = MF.getFrameInfo();
   1561   MFI.setReturnAddressIsTaken(true);
   1562 
   1563   if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
   1564     return SDValue();
   1565 
   1566   SDValue FrameAddr = lowerFRAMEADDR(Op, DAG, TLI, Subtarget);
   1567 
   1568   SDLoc DL(Op);
   1569   EVT VT = Op.getValueType();
   1570   SDValue Offset = DAG.getConstant(8, DL, VT);
   1571   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
   1572                      DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
   1573                      MachinePointerInfo());
   1574 }
   1575 
   1576 SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
   1577                                                   SelectionDAG &DAG) const {
   1578   SDLoc DL(Op);
   1579   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   1580   switch (IntNo) {
   1581   default: // Don't custom lower most intrinsics.
   1582     return SDValue();
   1583   case Intrinsic::eh_sjlj_lsda: {
   1584     MachineFunction &MF = DAG.getMachineFunction();
   1585     MVT VT = Op.getSimpleValueType();
   1586     const VETargetMachine *TM =
   1587         static_cast<const VETargetMachine *>(&DAG.getTarget());
   1588 
   1589     // Create GCC_except_tableXX string.  The real symbol for that will be
   1590     // generated in EHStreamer::emitExceptionTable() later.  So, we just
   1591     // borrow it's name here.
   1592     TM->getStrList()->push_back(std::string(
   1593         (Twine("GCC_except_table") + Twine(MF.getFunctionNumber())).str()));
   1594     SDValue Addr =
   1595         DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
   1596     if (isPositionIndependent()) {
   1597       Addr = makeHiLoPair(Addr, VEMCExpr::VK_VE_GOTOFF_HI32,
   1598                           VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
   1599       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
   1600       return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
   1601     }
   1602     return makeHiLoPair(Addr, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
   1603   }
   1604   }
   1605 }
   1606 
   1607 static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) {
   1608   if (!isa<BuildVectorSDNode>(N))
   1609     return false;
   1610   const auto *BVN = cast<BuildVectorSDNode>(N);
   1611 
   1612   // Find first non-undef insertion.
   1613   unsigned Idx;
   1614   for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
   1615     auto ElemV = BVN->getOperand(Idx);
   1616     if (!ElemV->isUndef())
   1617       break;
   1618   }
   1619   // Catch the (hypothetical) all-undef case.
   1620   if (Idx == BVN->getNumOperands())
   1621     return false;
   1622   // Remember insertion.
   1623   UniqueIdx = Idx++;
   1624   // Verify that all other insertions are undef.
   1625   for (; Idx < BVN->getNumOperands(); ++Idx) {
   1626     auto ElemV = BVN->getOperand(Idx);
   1627     if (!ElemV->isUndef())
   1628       return false;
   1629   }
   1630   return true;
   1631 }
   1632 
   1633 static SDValue getSplatValue(SDNode *N) {
   1634   if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
   1635     return BuildVec->getSplatValue();
   1636   }
   1637   return SDValue();
   1638 }
   1639 
   1640 SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
   1641                                             SelectionDAG &DAG) const {
   1642   SDLoc DL(Op);
   1643   unsigned NumEls = Op.getValueType().getVectorNumElements();
   1644   MVT ElemVT = Op.getSimpleValueType().getVectorElementType();
   1645 
   1646   // If there is just one element, expand to INSERT_VECTOR_ELT.
   1647   unsigned UniqueIdx;
   1648   if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
   1649     SDValue AccuV = DAG.getUNDEF(Op.getValueType());
   1650     auto ElemV = Op->getOperand(UniqueIdx);
   1651     SDValue IdxV = DAG.getConstant(UniqueIdx, DL, MVT::i64);
   1652     return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), AccuV,
   1653                        ElemV, IdxV);
   1654   }
   1655 
   1656   // Else emit a broadcast.
   1657   if (SDValue ScalarV = getSplatValue(Op.getNode())) {
   1658     // lower to VEC_BROADCAST
   1659     MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);
   1660 
   1661     auto AVL = DAG.getConstant(NumEls, DL, MVT::i32);
   1662     return DAG.getNode(VEISD::VEC_BROADCAST, DL, LegalResVT, Op.getOperand(0),
   1663                        AVL);
   1664   }
   1665 
   1666   // Expand
   1667   return SDValue();
   1668 }
   1669 
   1670 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   1671   unsigned Opcode = Op.getOpcode();
   1672   if (ISD::isVPOpcode(Opcode))
   1673     return lowerToVVP(Op, DAG);
   1674 
   1675   switch (Opcode) {
   1676   default:
   1677     llvm_unreachable("Should not custom lower this!");
   1678   case ISD::ATOMIC_FENCE:
   1679     return lowerATOMIC_FENCE(Op, DAG);
   1680   case ISD::ATOMIC_SWAP:
   1681     return lowerATOMIC_SWAP(Op, DAG);
   1682   case ISD::BlockAddress:
   1683     return lowerBlockAddress(Op, DAG);
   1684   case ISD::ConstantPool:
   1685     return lowerConstantPool(Op, DAG);
   1686   case ISD::DYNAMIC_STACKALLOC:
   1687     return lowerDYNAMIC_STACKALLOC(Op, DAG);
   1688   case ISD::EH_SJLJ_LONGJMP:
   1689     return lowerEH_SJLJ_LONGJMP(Op, DAG);
   1690   case ISD::EH_SJLJ_SETJMP:
   1691     return lowerEH_SJLJ_SETJMP(Op, DAG);
   1692   case ISD::EH_SJLJ_SETUP_DISPATCH:
   1693     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
   1694   case ISD::FRAMEADDR:
   1695     return lowerFRAMEADDR(Op, DAG, *this, Subtarget);
   1696   case ISD::GlobalAddress:
   1697     return lowerGlobalAddress(Op, DAG);
   1698   case ISD::GlobalTLSAddress:
   1699     return lowerGlobalTLSAddress(Op, DAG);
   1700   case ISD::INTRINSIC_WO_CHAIN:
   1701     return lowerINTRINSIC_WO_CHAIN(Op, DAG);
   1702   case ISD::JumpTable:
   1703     return lowerJumpTable(Op, DAG);
   1704   case ISD::LOAD:
   1705     return lowerLOAD(Op, DAG);
   1706   case ISD::RETURNADDR:
   1707     return lowerRETURNADDR(Op, DAG, *this, Subtarget);
   1708   case ISD::BUILD_VECTOR:
   1709     return lowerBUILD_VECTOR(Op, DAG);
   1710   case ISD::STORE:
   1711     return lowerSTORE(Op, DAG);
   1712   case ISD::VASTART:
   1713     return lowerVASTART(Op, DAG);
   1714   case ISD::VAARG:
   1715     return lowerVAARG(Op, DAG);
   1716 
   1717   case ISD::INSERT_VECTOR_ELT:
   1718     return lowerINSERT_VECTOR_ELT(Op, DAG);
   1719   case ISD::EXTRACT_VECTOR_ELT:
   1720     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
   1721 
   1722 #define ADD_BINARY_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
   1723 #include "VVPNodes.def"
   1724     return lowerToVVP(Op, DAG);
   1725   }
   1726 }
   1727 /// } Custom Lower
   1728 
   1729 void VETargetLowering::ReplaceNodeResults(SDNode *N,
   1730                                           SmallVectorImpl<SDValue> &Results,
   1731                                           SelectionDAG &DAG) const {
   1732   switch (N->getOpcode()) {
   1733   case ISD::ATOMIC_SWAP:
   1734     // Let LLVM expand atomic swap instruction through LowerOperation.
   1735     return;
   1736   default:
   1737     LLVM_DEBUG(N->dumpr(&DAG));
   1738     llvm_unreachable("Do not know how to custom type legalize this operation!");
   1739   }
   1740 }
   1741 
   1742 /// JumpTable for VE.
   1743 ///
   1744 ///   VE cannot generate relocatable symbol in jump table.  VE cannot
   1745 ///   generate expressions using symbols in both text segment and data
   1746 ///   segment like below.
   1747 ///             .4byte  .LBB0_2-.LJTI0_0
   1748 ///   So, we generate offset from the top of function like below as
   1749 ///   a custom label.
   1750 ///             .4byte  .LBB0_2-<function name>
   1751 
   1752 unsigned VETargetLowering::getJumpTableEncoding() const {
   1753   // Use custom label for PIC.
   1754   if (isPositionIndependent())
   1755     return MachineJumpTableInfo::EK_Custom32;
   1756 
   1757   // Otherwise, use the normal jump table encoding heuristics.
   1758   return TargetLowering::getJumpTableEncoding();
   1759 }
   1760 
   1761 const MCExpr *VETargetLowering::LowerCustomJumpTableEntry(
   1762     const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
   1763     unsigned Uid, MCContext &Ctx) const {
   1764   assert(isPositionIndependent());
   1765 
   1766   // Generate custom label for PIC like below.
   1767   //    .4bytes  .LBB0_2-<function name>
   1768   const auto *Value = MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
   1769   MCSymbol *Sym = Ctx.getOrCreateSymbol(MBB->getParent()->getName().data());
   1770   const auto *Base = MCSymbolRefExpr::create(Sym, Ctx);
   1771   return MCBinaryExpr::createSub(Value, Base, Ctx);
   1772 }
   1773 
   1774 SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
   1775                                                    SelectionDAG &DAG) const {
   1776   assert(isPositionIndependent());
   1777   SDLoc DL(Table);
   1778   Function *Function = &DAG.getMachineFunction().getFunction();
   1779   assert(Function != nullptr);
   1780   auto PtrTy = getPointerTy(DAG.getDataLayout(), Function->getAddressSpace());
   1781 
   1782   // In the jump table, we have following values in PIC mode.
   1783   //    .4bytes  .LBB0_2-<function name>
   1784   // We need to add this value and the address of this function to generate
   1785   // .LBB0_2 label correctly under PIC mode.  So, we want to generate following
   1786   // instructions:
   1787   //     lea %reg, fun@gotoff_lo
   1788   //     and %reg, %reg, (32)0
   1789   //     lea.sl %reg, fun@gotoff_hi(%reg, %got)
   1790   // In order to do so, we need to genarate correctly marked DAG node using
   1791   // makeHiLoPair.
   1792   SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
   1793   SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
   1794                               VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
   1795   SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
   1796   return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
   1797 }
   1798 
   1799 Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
   1800                                       MachineBasicBlock::iterator I,
   1801                                       MachineBasicBlock *TargetBB,
   1802                                       const DebugLoc &DL) const {
   1803   MachineFunction *MF = MBB.getParent();
   1804   MachineRegisterInfo &MRI = MF->getRegInfo();
   1805   const VEInstrInfo *TII = Subtarget->getInstrInfo();
   1806 
   1807   const TargetRegisterClass *RC = &VE::I64RegClass;
   1808   Register Tmp1 = MRI.createVirtualRegister(RC);
   1809   Register Tmp2 = MRI.createVirtualRegister(RC);
   1810   Register Result = MRI.createVirtualRegister(RC);
   1811 
   1812   if (isPositionIndependent()) {
   1813     // Create following instructions for local linkage PIC code.
   1814     //     lea %Tmp1, TargetBB@gotoff_lo
   1815     //     and %Tmp2, %Tmp1, (32)0
   1816     //     lea.sl %Result, TargetBB@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
   1817     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
   1818         .addImm(0)
   1819         .addImm(0)
   1820         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_LO32);
   1821     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
   1822         .addReg(Tmp1, getKillRegState(true))
   1823         .addImm(M0(32));
   1824     BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
   1825         .addReg(VE::SX15)
   1826         .addReg(Tmp2, getKillRegState(true))
   1827         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_HI32);
   1828   } else {
   1829     // Create following instructions for non-PIC code.
   1830     //     lea     %Tmp1, TargetBB@lo
   1831     //     and     %Tmp2, %Tmp1, (32)0
   1832     //     lea.sl  %Result, TargetBB@hi(%Tmp2)
   1833     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
   1834         .addImm(0)
   1835         .addImm(0)
   1836         .addMBB(TargetBB, VEMCExpr::VK_VE_LO32);
   1837     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
   1838         .addReg(Tmp1, getKillRegState(true))
   1839         .addImm(M0(32));
   1840     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
   1841         .addReg(Tmp2, getKillRegState(true))
   1842         .addImm(0)
   1843         .addMBB(TargetBB, VEMCExpr::VK_VE_HI32);
   1844   }
   1845   return Result;
   1846 }
   1847 
   1848 Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
   1849                                          MachineBasicBlock::iterator I,
   1850                                          StringRef Symbol, const DebugLoc &DL,
   1851                                          bool IsLocal = false,
   1852                                          bool IsCall = false) const {
   1853   MachineFunction *MF = MBB.getParent();
   1854   MachineRegisterInfo &MRI = MF->getRegInfo();
   1855   const VEInstrInfo *TII = Subtarget->getInstrInfo();
   1856 
   1857   const TargetRegisterClass *RC = &VE::I64RegClass;
   1858   Register Result = MRI.createVirtualRegister(RC);
   1859 
   1860   if (isPositionIndependent()) {
   1861     if (IsCall && !IsLocal) {
   1862       // Create following instructions for non-local linkage PIC code function
   1863       // calls.  These instructions uses IC and magic number -24, so we expand
   1864       // them in VEAsmPrinter.cpp from GETFUNPLT pseudo instruction.
   1865       //     lea %Reg, Symbol@plt_lo(-24)
   1866       //     and %Reg, %Reg, (32)0
   1867       //     sic %s16
   1868       //     lea.sl %Result, Symbol@plt_hi(%Reg, %s16) ; %s16 is PLT
   1869       BuildMI(MBB, I, DL, TII->get(VE::GETFUNPLT), Result)
   1870           .addExternalSymbol("abort");
   1871     } else if (IsLocal) {
   1872       Register Tmp1 = MRI.createVirtualRegister(RC);
   1873       Register Tmp2 = MRI.createVirtualRegister(RC);
   1874       // Create following instructions for local linkage PIC code.
   1875       //     lea %Tmp1, Symbol@gotoff_lo
   1876       //     and %Tmp2, %Tmp1, (32)0
   1877       //     lea.sl %Result, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
   1878       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
   1879           .addImm(0)
   1880           .addImm(0)
   1881           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_LO32);
   1882       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
   1883           .addReg(Tmp1, getKillRegState(true))
   1884           .addImm(M0(32));
   1885       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
   1886           .addReg(VE::SX15)
   1887           .addReg(Tmp2, getKillRegState(true))
   1888           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_HI32);
   1889     } else {
   1890       Register Tmp1 = MRI.createVirtualRegister(RC);
   1891       Register Tmp2 = MRI.createVirtualRegister(RC);
   1892       // Create following instructions for not local linkage PIC code.
   1893       //     lea %Tmp1, Symbol@got_lo
   1894       //     and %Tmp2, %Tmp1, (32)0
   1895       //     lea.sl %Tmp3, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
   1896       //     ld %Result, 0(%Tmp3)
   1897       Register Tmp3 = MRI.createVirtualRegister(RC);
   1898       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
   1899           .addImm(0)
   1900           .addImm(0)
   1901           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_LO32);
   1902       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
   1903           .addReg(Tmp1, getKillRegState(true))
   1904           .addImm(M0(32));
   1905       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
   1906           .addReg(VE::SX15)
   1907           .addReg(Tmp2, getKillRegState(true))
   1908           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_HI32);
   1909       BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
   1910           .addReg(Tmp3, getKillRegState(true))
   1911           .addImm(0)
   1912           .addImm(0);
   1913     }
   1914   } else {
   1915     Register Tmp1 = MRI.createVirtualRegister(RC);
   1916     Register Tmp2 = MRI.createVirtualRegister(RC);
   1917     // Create following instructions for non-PIC code.
   1918     //     lea     %Tmp1, Symbol@lo
   1919     //     and     %Tmp2, %Tmp1, (32)0
   1920     //     lea.sl  %Result, Symbol@hi(%Tmp2)
   1921     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
   1922         .addImm(0)
   1923         .addImm(0)
   1924         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_LO32);
   1925     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
   1926         .addReg(Tmp1, getKillRegState(true))
   1927         .addImm(M0(32));
   1928     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
   1929         .addReg(Tmp2, getKillRegState(true))
   1930         .addImm(0)
   1931         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_HI32);
   1932   }
   1933   return Result;
   1934 }
   1935 
   1936 void VETargetLowering::setupEntryBlockForSjLj(MachineInstr &MI,
   1937                                               MachineBasicBlock *MBB,
   1938                                               MachineBasicBlock *DispatchBB,
   1939                                               int FI, int Offset) const {
   1940   DebugLoc DL = MI.getDebugLoc();
   1941   const VEInstrInfo *TII = Subtarget->getInstrInfo();
   1942 
   1943   Register LabelReg =
   1944       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), DispatchBB, DL);
   1945 
   1946   // Store an address of DispatchBB to a given jmpbuf[1] where has next IC
   1947   // referenced by longjmp (throw) later.
   1948   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
   1949   addFrameReference(MIB, FI, Offset); // jmpbuf[1]
   1950   MIB.addReg(LabelReg, getKillRegState(true));
   1951 }
   1952 
   1953 MachineBasicBlock *
   1954 VETargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
   1955                                    MachineBasicBlock *MBB) const {
   1956   DebugLoc DL = MI.getDebugLoc();
   1957   MachineFunction *MF = MBB->getParent();
   1958   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   1959   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
   1960   MachineRegisterInfo &MRI = MF->getRegInfo();
   1961 
   1962   const BasicBlock *BB = MBB->getBasicBlock();
   1963   MachineFunction::iterator I = ++MBB->getIterator();
   1964 
   1965   // Memory Reference.
   1966   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
   1967                                            MI.memoperands_end());
   1968   Register BufReg = MI.getOperand(1).getReg();
   1969 
   1970   Register DstReg;
   1971 
   1972   DstReg = MI.getOperand(0).getReg();
   1973   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
   1974   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
   1975   (void)TRI;
   1976   Register MainDestReg = MRI.createVirtualRegister(RC);
   1977   Register RestoreDestReg = MRI.createVirtualRegister(RC);
   1978 
   1979   // For `v = call @llvm.eh.sjlj.setjmp(buf)`, we generate following
   1980   // instructions.  SP/FP must be saved in jmpbuf before `llvm.eh.sjlj.setjmp`.
   1981   //
   1982   // ThisMBB:
   1983   //   buf[3] = %s17 iff %s17 is used as BP
   1984   //   buf[1] = RestoreMBB as IC after longjmp
   1985   //   # SjLjSetup RestoreMBB
   1986   //
   1987   // MainMBB:
   1988   //   v_main = 0
   1989   //
   1990   // SinkMBB:
   1991   //   v = phi(v_main, MainMBB, v_restore, RestoreMBB)
   1992   //   ...
   1993   //
   1994   // RestoreMBB:
   1995   //   %s17 = buf[3] = iff %s17 is used as BP
   1996   //   v_restore = 1
   1997   //   goto SinkMBB
   1998 
   1999   MachineBasicBlock *ThisMBB = MBB;
   2000   MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB);
   2001   MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB);
   2002   MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB);
   2003   MF->insert(I, MainMBB);
   2004   MF->insert(I, SinkMBB);
   2005   MF->push_back(RestoreMBB);
   2006   RestoreMBB->setHasAddressTaken();
   2007 
   2008   // Transfer the remainder of BB and its successor edges to SinkMBB.
   2009   SinkMBB->splice(SinkMBB->begin(), MBB,
   2010                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
   2011   SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
   2012 
   2013   // ThisMBB:
   2014   Register LabelReg =
   2015       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), RestoreMBB, DL);
   2016 
   2017   // Store BP in buf[3] iff this function is using BP.
   2018   const VEFrameLowering *TFI = Subtarget->getFrameLowering();
   2019   if (TFI->hasBP(*MF)) {
   2020     MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
   2021     MIB.addReg(BufReg);
   2022     MIB.addImm(0);
   2023     MIB.addImm(24);
   2024     MIB.addReg(VE::SX17);
   2025     MIB.setMemRefs(MMOs);
   2026   }
   2027 
   2028   // Store IP in buf[1].
   2029   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
   2030   MIB.add(MI.getOperand(1)); // we can preserve the kill flags here.
   2031   MIB.addImm(0);
   2032   MIB.addImm(8);
   2033   MIB.addReg(LabelReg, getKillRegState(true));
   2034   MIB.setMemRefs(MMOs);
   2035 
   2036   // SP/FP are already stored in jmpbuf before `llvm.eh.sjlj.setjmp`.
   2037 
   2038   // Insert setup.
   2039   MIB =
   2040       BuildMI(*ThisMBB, MI, DL, TII->get(VE::EH_SjLj_Setup)).addMBB(RestoreMBB);
   2041 
   2042   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
   2043   MIB.addRegMask(RegInfo->getNoPreservedMask());
   2044   ThisMBB->addSuccessor(MainMBB);
   2045   ThisMBB->addSuccessor(RestoreMBB);
   2046 
   2047   // MainMBB:
   2048   BuildMI(MainMBB, DL, TII->get(VE::LEAzii), MainDestReg)
   2049       .addImm(0)
   2050       .addImm(0)
   2051       .addImm(0);
   2052   MainMBB->addSuccessor(SinkMBB);
   2053 
   2054   // SinkMBB:
   2055   BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(VE::PHI), DstReg)
   2056       .addReg(MainDestReg)
   2057       .addMBB(MainMBB)
   2058       .addReg(RestoreDestReg)
   2059       .addMBB(RestoreMBB);
   2060 
   2061   // RestoreMBB:
   2062   // Restore BP from buf[3] iff this function is using BP.  The address of
   2063   // buf is in SX10.
   2064   // FIXME: Better to not use SX10 here
   2065   if (TFI->hasBP(*MF)) {
   2066     MachineInstrBuilder MIB =
   2067         BuildMI(RestoreMBB, DL, TII->get(VE::LDrii), VE::SX17);
   2068     MIB.addReg(VE::SX10);
   2069     MIB.addImm(0);
   2070     MIB.addImm(24);
   2071     MIB.setMemRefs(MMOs);
   2072   }
   2073   BuildMI(RestoreMBB, DL, TII->get(VE::LEAzii), RestoreDestReg)
   2074       .addImm(0)
   2075       .addImm(0)
   2076       .addImm(1);
   2077   BuildMI(RestoreMBB, DL, TII->get(VE::BRCFLa_t)).addMBB(SinkMBB);
   2078   RestoreMBB->addSuccessor(SinkMBB);
   2079 
   2080   MI.eraseFromParent();
   2081   return SinkMBB;
   2082 }
   2083 
   2084 MachineBasicBlock *
   2085 VETargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
   2086                                     MachineBasicBlock *MBB) const {
   2087   DebugLoc DL = MI.getDebugLoc();
   2088   MachineFunction *MF = MBB->getParent();
   2089   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   2090   MachineRegisterInfo &MRI = MF->getRegInfo();
   2091 
   2092   // Memory Reference.
   2093   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
   2094                                            MI.memoperands_end());
   2095   Register BufReg = MI.getOperand(0).getReg();
   2096 
   2097   Register Tmp = MRI.createVirtualRegister(&VE::I64RegClass);
   2098   // Since FP is only updated here but NOT referenced, it's treated as GPR.
   2099   Register FP = VE::SX9;
   2100   Register SP = VE::SX11;
   2101 
   2102   MachineInstrBuilder MIB;
   2103 
   2104   MachineBasicBlock *ThisMBB = MBB;
   2105 
   2106   // For `call @llvm.eh.sjlj.longjmp(buf)`, we generate following instructions.
   2107   //
   2108   // ThisMBB:
   2109   //   %fp = load buf[0]
   2110   //   %jmp = load buf[1]
   2111   //   %s10 = buf        ; Store an address of buf to SX10 for RestoreMBB
   2112   //   %sp = load buf[2] ; generated by llvm.eh.sjlj.setjmp.
   2113   //   jmp %jmp
   2114 
   2115   // Reload FP.
   2116   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), FP);
   2117   MIB.addReg(BufReg);
   2118   MIB.addImm(0);
   2119   MIB.addImm(0);
   2120   MIB.setMemRefs(MMOs);
   2121 
   2122   // Reload IP.
   2123   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), Tmp);
   2124   MIB.addReg(BufReg);
   2125   MIB.addImm(0);
   2126   MIB.addImm(8);
   2127   MIB.setMemRefs(MMOs);
   2128 
   2129   // Copy BufReg to SX10 for later use in setjmp.
   2130   // FIXME: Better to not use SX10 here
   2131   BuildMI(*ThisMBB, MI, DL, TII->get(VE::ORri), VE::SX10)
   2132       .addReg(BufReg)
   2133       .addImm(0);
   2134 
   2135   // Reload SP.
   2136   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), SP);
   2137   MIB.add(MI.getOperand(0)); // we can preserve the kill flags here.
   2138   MIB.addImm(0);
   2139   MIB.addImm(16);
   2140   MIB.setMemRefs(MMOs);
   2141 
   2142   // Jump.
   2143   BuildMI(*ThisMBB, MI, DL, TII->get(VE::BCFLari_t))
   2144       .addReg(Tmp, getKillRegState(true))
   2145       .addImm(0);
   2146 
   2147   MI.eraseFromParent();
   2148   return ThisMBB;
   2149 }
   2150 
   2151 MachineBasicBlock *
   2152 VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
   2153                                         MachineBasicBlock *BB) const {
   2154   DebugLoc DL = MI.getDebugLoc();
   2155   MachineFunction *MF = BB->getParent();
   2156   MachineFrameInfo &MFI = MF->getFrameInfo();
   2157   MachineRegisterInfo &MRI = MF->getRegInfo();
   2158   const VEInstrInfo *TII = Subtarget->getInstrInfo();
   2159   int FI = MFI.getFunctionContextIndex();
   2160 
   2161   // Get a mapping of the call site numbers to all of the landing pads they're
   2162   // associated with.
   2163   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
   2164   unsigned MaxCSNum = 0;
   2165   for (auto &MBB : *MF) {
   2166     if (!MBB.isEHPad())
   2167       continue;
   2168 
   2169     MCSymbol *Sym = nullptr;
   2170     for (const auto &MI : MBB) {
   2171       if (MI.isDebugInstr())
   2172         continue;
   2173 
   2174       assert(MI.isEHLabel() && "expected EH_LABEL");
   2175       Sym = MI.getOperand(0).getMCSymbol();
   2176       break;
   2177     }
   2178 
   2179     if (!MF->hasCallSiteLandingPad(Sym))
   2180       continue;
   2181 
   2182     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
   2183       CallSiteNumToLPad[CSI].push_back(&MBB);
   2184       MaxCSNum = std::max(MaxCSNum, CSI);
   2185     }
   2186   }
   2187 
   2188   // Get an ordered list of the machine basic blocks for the jump table.
   2189   std::vector<MachineBasicBlock *> LPadList;
   2190   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
   2191   LPadList.reserve(CallSiteNumToLPad.size());
   2192 
   2193   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
   2194     for (auto &LP : CallSiteNumToLPad[CSI]) {
   2195       LPadList.push_back(LP);
   2196       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
   2197     }
   2198   }
   2199 
   2200   assert(!LPadList.empty() &&
   2201          "No landing pad destinations for the dispatch jump table!");
   2202 
   2203   // The %fn_context is allocated like below (from --print-after=sjljehprepare):
   2204   //   %fn_context = alloca { i8*, i64, [4 x i64], i8*, i8*, [5 x i8*] }
   2205   //
   2206   // This `[5 x i8*]` is jmpbuf, so jmpbuf[1] is FI+72.
   2207   // First `i64` is callsite, so callsite is FI+8.
   2208   static const int OffsetIC = 72;
   2209   static const int OffsetCS = 8;
   2210 
   2211   // Create the MBBs for the dispatch code like following:
   2212   //
   2213   // ThisMBB:
   2214   //   Prepare DispatchBB address and store it to buf[1].
   2215   //   ...
   2216   //
   2217   // DispatchBB:
   2218   //   %s15 = GETGOT iff isPositionIndependent
   2219   //   %callsite = load callsite
   2220   //   brgt.l.t #size of callsites, %callsite, DispContBB
   2221   //
   2222   // TrapBB:
   2223   //   Call abort.
   2224   //
   2225   // DispContBB:
   2226   //   %breg = address of jump table
   2227   //   %pc = load and calculate next pc from %breg and %callsite
   2228   //   jmp %pc
   2229 
   2230   // Shove the dispatch's address into the return slot in the function context.
   2231   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
   2232   DispatchBB->setIsEHPad(true);
   2233 
   2234   // Trap BB will causes trap like `assert(0)`.
   2235   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
   2236   DispatchBB->addSuccessor(TrapBB);
   2237 
   2238   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
   2239   DispatchBB->addSuccessor(DispContBB);
   2240 
   2241   // Insert MBBs.
   2242   MF->push_back(DispatchBB);
   2243   MF->push_back(DispContBB);
   2244   MF->push_back(TrapBB);
   2245 
   2246   // Insert code to call abort in the TrapBB.
   2247   Register Abort = prepareSymbol(*TrapBB, TrapBB->end(), "abort", DL,
   2248                                  /* Local */ false, /* Call */ true);
   2249   BuildMI(TrapBB, DL, TII->get(VE::BSICrii), VE::SX10)
   2250       .addReg(Abort, getKillRegState(true))
   2251       .addImm(0)
   2252       .addImm(0);
   2253 
   2254   // Insert code into the entry block that creates and registers the function
   2255   // context.
   2256   setupEntryBlockForSjLj(MI, BB, DispatchBB, FI, OffsetIC);
   2257 
   2258   // Create the jump table and associated information
   2259   unsigned JTE = getJumpTableEncoding();
   2260   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
   2261   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
   2262 
   2263   const VERegisterInfo &RI = TII->getRegisterInfo();
   2264   // Add a register mask with no preserved registers.  This results in all
   2265   // registers being marked as clobbered.
   2266   BuildMI(DispatchBB, DL, TII->get(VE::NOP))
   2267       .addRegMask(RI.getNoPreservedMask());
   2268 
   2269   if (isPositionIndependent()) {
   2270     // Force to generate GETGOT, since current implementation doesn't store GOT
   2271     // register.
   2272     BuildMI(DispatchBB, DL, TII->get(VE::GETGOT), VE::SX15);
   2273   }
   2274 
   2275   // IReg is used as an index in a memory operand and therefore can't be SP
   2276   const TargetRegisterClass *RC = &VE::I64RegClass;
   2277   Register IReg = MRI.createVirtualRegister(RC);
   2278   addFrameReference(BuildMI(DispatchBB, DL, TII->get(VE::LDLZXrii), IReg), FI,
   2279                     OffsetCS);
   2280   if (LPadList.size() < 64) {
   2281     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLir_t))
   2282         .addImm(VECC::CC_ILE)
   2283         .addImm(LPadList.size())
   2284         .addReg(IReg)
   2285         .addMBB(TrapBB);
   2286   } else {
   2287     assert(LPadList.size() <= 0x7FFFFFFF && "Too large Landing Pad!");
   2288     Register TmpReg = MRI.createVirtualRegister(RC);
   2289     BuildMI(DispatchBB, DL, TII->get(VE::LEAzii), TmpReg)
   2290         .addImm(0)
   2291         .addImm(0)
   2292         .addImm(LPadList.size());
   2293     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLrr_t))
   2294         .addImm(VECC::CC_ILE)
   2295         .addReg(TmpReg, getKillRegState(true))
   2296         .addReg(IReg)
   2297         .addMBB(TrapBB);
   2298   }
   2299 
   2300   Register BReg = MRI.createVirtualRegister(RC);
   2301   Register Tmp1 = MRI.createVirtualRegister(RC);
   2302   Register Tmp2 = MRI.createVirtualRegister(RC);
   2303 
   2304   if (isPositionIndependent()) {
   2305     // Create following instructions for local linkage PIC code.
   2306     //     lea    %Tmp1, .LJTI0_0@gotoff_lo
   2307     //     and    %Tmp2, %Tmp1, (32)0
   2308     //     lea.sl %BReg, .LJTI0_0@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
   2309     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
   2310         .addImm(0)
   2311         .addImm(0)
   2312         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_LO32);
   2313     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
   2314         .addReg(Tmp1, getKillRegState(true))
   2315         .addImm(M0(32));
   2316     BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
   2317         .addReg(VE::SX15)
   2318         .addReg(Tmp2, getKillRegState(true))
   2319         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_HI32);
   2320   } else {
   2321     // Create following instructions for non-PIC code.
   2322     //     lea     %Tmp1, .LJTI0_0@lo
   2323     //     and     %Tmp2, %Tmp1, (32)0
   2324     //     lea.sl  %BReg, .LJTI0_0@hi(%Tmp2)
   2325     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
   2326         .addImm(0)
   2327         .addImm(0)
   2328         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_LO32);
   2329     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
   2330         .addReg(Tmp1, getKillRegState(true))
   2331         .addImm(M0(32));
   2332     BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
   2333         .addReg(Tmp2, getKillRegState(true))
   2334         .addImm(0)
   2335         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_HI32);
   2336   }
   2337 
   2338   switch (JTE) {
   2339   case MachineJumpTableInfo::EK_BlockAddress: {
   2340     // Generate simple block address code for no-PIC model.
   2341     //     sll %Tmp1, %IReg, 3
   2342     //     lds %TReg, 0(%Tmp1, %BReg)
   2343     //     bcfla %TReg
   2344 
   2345     Register TReg = MRI.createVirtualRegister(RC);
   2346     Register Tmp1 = MRI.createVirtualRegister(RC);
   2347 
   2348     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
   2349         .addReg(IReg, getKillRegState(true))
   2350         .addImm(3);
   2351     BuildMI(DispContBB, DL, TII->get(VE::LDrri), TReg)
   2352         .addReg(BReg, getKillRegState(true))
   2353         .addReg(Tmp1, getKillRegState(true))
   2354         .addImm(0);
   2355     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
   2356         .addReg(TReg, getKillRegState(true))
   2357         .addImm(0);
   2358     break;
   2359   }
   2360   case MachineJumpTableInfo::EK_Custom32: {
   2361     // Generate block address code using differences from the function pointer
   2362     // for PIC model.
   2363     //     sll %Tmp1, %IReg, 2
   2364     //     ldl.zx %OReg, 0(%Tmp1, %BReg)
   2365     //     Prepare function address in BReg2.
   2366     //     adds.l %TReg, %BReg2, %OReg
   2367     //     bcfla %TReg
   2368 
   2369     assert(isPositionIndependent());
   2370     Register OReg = MRI.createVirtualRegister(RC);
   2371     Register TReg = MRI.createVirtualRegister(RC);
   2372     Register Tmp1 = MRI.createVirtualRegister(RC);
   2373 
   2374     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
   2375         .addReg(IReg, getKillRegState(true))
   2376         .addImm(2);
   2377     BuildMI(DispContBB, DL, TII->get(VE::LDLZXrri), OReg)
   2378         .addReg(BReg, getKillRegState(true))
   2379         .addReg(Tmp1, getKillRegState(true))
   2380         .addImm(0);
   2381     Register BReg2 =
   2382         prepareSymbol(*DispContBB, DispContBB->end(),
   2383                       DispContBB->getParent()->getName(), DL, /* Local */ true);
   2384     BuildMI(DispContBB, DL, TII->get(VE::ADDSLrr), TReg)
   2385         .addReg(OReg, getKillRegState(true))
   2386         .addReg(BReg2, getKillRegState(true));
   2387     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
   2388         .addReg(TReg, getKillRegState(true))
   2389         .addImm(0);
   2390     break;
   2391   }
   2392   default:
   2393     llvm_unreachable("Unexpected jump table encoding");
   2394   }
   2395 
   2396   // Add the jump table entries as successors to the MBB.
   2397   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
   2398   for (auto &LP : LPadList)
   2399     if (SeenMBBs.insert(LP).second)
   2400       DispContBB->addSuccessor(LP);
   2401 
   2402   // N.B. the order the invoke BBs are processed in doesn't matter here.
   2403   SmallVector<MachineBasicBlock *, 64> MBBLPads;
   2404   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
   2405   for (MachineBasicBlock *MBB : InvokeBBs) {
   2406     // Remove the landing pad successor from the invoke block and replace it
   2407     // with the new dispatch block.
   2408     // Keep a copy of Successors since it's modified inside the loop.
   2409     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
   2410                                                    MBB->succ_rend());
   2411     // FIXME: Avoid quadratic complexity.
   2412     for (auto MBBS : Successors) {
   2413       if (MBBS->isEHPad()) {
   2414         MBB->removeSuccessor(MBBS);
   2415         MBBLPads.push_back(MBBS);
   2416       }
   2417     }
   2418 
   2419     MBB->addSuccessor(DispatchBB);
   2420 
   2421     // Find the invoke call and mark all of the callee-saved registers as
   2422     // 'implicit defined' so that they're spilled.  This prevents code from
   2423     // moving instructions to before the EH block, where they will never be
   2424     // executed.
   2425     for (auto &II : reverse(*MBB)) {
   2426       if (!II.isCall())
   2427         continue;
   2428 
   2429       DenseMap<Register, bool> DefRegs;
   2430       for (auto &MOp : II.operands())
   2431         if (MOp.isReg())
   2432           DefRegs[MOp.getReg()] = true;
   2433 
   2434       MachineInstrBuilder MIB(*MF, &II);
   2435       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
   2436         Register Reg = SavedRegs[RI];
   2437         if (!DefRegs[Reg])
   2438           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
   2439       }
   2440 
   2441       break;
   2442     }
   2443   }
   2444 
   2445   // Mark all former landing pads as non-landing pads.  The dispatch is the only
   2446   // landing pad now.
   2447   for (auto &LP : MBBLPads)
   2448     LP->setIsEHPad(false);
   2449 
   2450   // The instruction is gone now.
   2451   MI.eraseFromParent();
   2452   return BB;
   2453 }
   2454 
   2455 MachineBasicBlock *
   2456 VETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   2457                                               MachineBasicBlock *BB) const {
   2458   switch (MI.getOpcode()) {
   2459   default:
   2460     llvm_unreachable("Unknown Custom Instruction!");
   2461   case VE::EH_SjLj_LongJmp:
   2462     return emitEHSjLjLongJmp(MI, BB);
   2463   case VE::EH_SjLj_SetJmp:
   2464     return emitEHSjLjSetJmp(MI, BB);
   2465   case VE::EH_SjLj_Setup_Dispatch:
   2466     return emitSjLjDispatchBlock(MI, BB);
   2467   }
   2468 }
   2469 
   2470 static bool isI32Insn(const SDNode *User, const SDNode *N) {
   2471   switch (User->getOpcode()) {
   2472   default:
   2473     return false;
   2474   case ISD::ADD:
   2475   case ISD::SUB:
   2476   case ISD::MUL:
   2477   case ISD::SDIV:
   2478   case ISD::UDIV:
   2479   case ISD::SETCC:
   2480   case ISD::SMIN:
   2481   case ISD::SMAX:
   2482   case ISD::SHL:
   2483   case ISD::SRA:
   2484   case ISD::BSWAP:
   2485   case ISD::SINT_TO_FP:
   2486   case ISD::UINT_TO_FP:
   2487   case ISD::BR_CC:
   2488   case ISD::BITCAST:
   2489   case ISD::ATOMIC_CMP_SWAP:
   2490   case ISD::ATOMIC_SWAP:
   2491     return true;
   2492   case ISD::SRL:
   2493     if (N->getOperand(0).getOpcode() != ISD::SRL)
   2494       return true;
   2495     // (srl (trunc (srl ...))) may be optimized by combining srl, so
   2496     // doesn't optimize trunc now.
   2497     return false;
   2498   case ISD::SELECT_CC:
   2499     if (User->getOperand(2).getNode() != N &&
   2500         User->getOperand(3).getNode() != N)
   2501       return true;
   2502     LLVM_FALLTHROUGH;
   2503   case ISD::AND:
   2504   case ISD::OR:
   2505   case ISD::XOR:
   2506   case ISD::SELECT:
   2507   case ISD::CopyToReg:
   2508     // Check all use of selections, bit operations, and copies.  If all of them
   2509     // are safe, optimize truncate to extract_subreg.
   2510     for (SDNode::use_iterator UI = User->use_begin(), UE = User->use_end();
   2511          UI != UE; ++UI) {
   2512       switch ((*UI)->getOpcode()) {
   2513       default:
   2514         // If the use is an instruction which treats the source operand as i32,
   2515         // it is safe to avoid truncate here.
   2516         if (isI32Insn(*UI, N))
   2517           continue;
   2518         break;
   2519       case ISD::ANY_EXTEND:
   2520       case ISD::SIGN_EXTEND:
   2521       case ISD::ZERO_EXTEND: {
   2522         // Special optimizations to the combination of ext and trunc.
   2523         // (ext ... (select ... (trunc ...))) is safe to avoid truncate here
   2524         // since this truncate instruction clears higher 32 bits which is filled
   2525         // by one of ext instructions later.
   2526         assert(N->getValueType(0) == MVT::i32 &&
   2527                "find truncate to not i32 integer");
   2528         if (User->getOpcode() == ISD::SELECT_CC ||
   2529             User->getOpcode() == ISD::SELECT)
   2530           continue;
   2531         break;
   2532       }
   2533       }
   2534       return false;
   2535     }
   2536     return true;
   2537   }
   2538 }
   2539 
   2540 // Optimize TRUNCATE in DAG combining.  Optimizing it in CUSTOM lower is
   2541 // sometime too early.  Optimizing it in DAG pattern matching in VEInstrInfo.td
   2542 // is sometime too late.  So, doing it at here.
   2543 SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
   2544                                           DAGCombinerInfo &DCI) const {
   2545   assert(N->getOpcode() == ISD::TRUNCATE &&
   2546          "Should be called with a TRUNCATE node");
   2547 
   2548   SelectionDAG &DAG = DCI.DAG;
   2549   SDLoc DL(N);
   2550   EVT VT = N->getValueType(0);
   2551 
   2552   // We prefer to do this when all types are legal.
   2553   if (!DCI.isAfterLegalizeDAG())
   2554     return SDValue();
   2555 
   2556   // Skip combine TRUNCATE atm if the operand of TRUNCATE might be a constant.
   2557   if (N->getOperand(0)->getOpcode() == ISD::SELECT_CC &&
   2558       isa<ConstantSDNode>(N->getOperand(0)->getOperand(0)) &&
   2559       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
   2560     return SDValue();
   2561 
   2562   // Check all use of this TRUNCATE.
   2563   for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
   2564        ++UI) {
   2565     SDNode *User = *UI;
   2566 
   2567     // Make sure that we're not going to replace TRUNCATE for non i32
   2568     // instructions.
   2569     //
   2570     // FIXME: Although we could sometimes handle this, and it does occur in
   2571     // practice that one of the condition inputs to the select is also one of
   2572     // the outputs, we currently can't deal with this.
   2573     if (isI32Insn(User, N))
   2574       continue;
   2575 
   2576     return SDValue();
   2577   }
   2578 
   2579   SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
   2580   return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT,
   2581                                     N->getOperand(0), SubI32),
   2582                  0);
   2583 }
   2584 
   2585 SDValue VETargetLowering::PerformDAGCombine(SDNode *N,
   2586                                             DAGCombinerInfo &DCI) const {
   2587   switch (N->getOpcode()) {
   2588   default:
   2589     break;
   2590   case ISD::TRUNCATE:
   2591     return combineTRUNCATE(N, DCI);
   2592   }
   2593 
   2594   return SDValue();
   2595 }
   2596 
   2597 //===----------------------------------------------------------------------===//
   2598 // VE Inline Assembly Support
   2599 //===----------------------------------------------------------------------===//
   2600 
   2601 VETargetLowering::ConstraintType
   2602 VETargetLowering::getConstraintType(StringRef Constraint) const {
   2603   if (Constraint.size() == 1) {
   2604     switch (Constraint[0]) {
   2605     default:
   2606       break;
   2607     case 'v': // vector registers
   2608       return C_RegisterClass;
   2609     }
   2610   }
   2611   return TargetLowering::getConstraintType(Constraint);
   2612 }
   2613 
   2614 std::pair<unsigned, const TargetRegisterClass *>
   2615 VETargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
   2616                                                StringRef Constraint,
   2617                                                MVT VT) const {
   2618   const TargetRegisterClass *RC = nullptr;
   2619   if (Constraint.size() == 1) {
   2620     switch (Constraint[0]) {
   2621     default:
   2622       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
   2623     case 'r':
   2624       RC = &VE::I64RegClass;
   2625       break;
   2626     case 'v':
   2627       RC = &VE::V64RegClass;
   2628       break;
   2629     }
   2630     return std::make_pair(0U, RC);
   2631   }
   2632 
   2633   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
   2634 }
   2635 
   2636 //===----------------------------------------------------------------------===//
   2637 // VE Target Optimization Support
   2638 //===----------------------------------------------------------------------===//
   2639 
   2640 unsigned VETargetLowering::getMinimumJumpTableEntries() const {
   2641   // Specify 8 for PIC model to relieve the impact of PIC load instructions.
   2642   if (isJumpTableRelative())
   2643     return 8;
   2644 
   2645   return TargetLowering::getMinimumJumpTableEntries();
   2646 }
   2647 
   2648 bool VETargetLowering::hasAndNot(SDValue Y) const {
   2649   EVT VT = Y.getValueType();
   2650 
   2651   // VE doesn't have vector and not instruction.
   2652   if (VT.isVector())
   2653     return false;
   2654 
   2655   // VE allows different immediate values for X and Y where ~X & Y.
   2656   // Only simm7 works for X, and only mimm works for Y on VE.  However, this
   2657   // function is used to check whether an immediate value is OK for and-not
   2658   // instruction as both X and Y.  Generating additional instruction to
   2659   // retrieve an immediate value is no good since the purpose of this
   2660   // function is to convert a series of 3 instructions to another series of
   2661   // 3 instructions with better parallelism.  Therefore, we return false
   2662   // for all immediate values now.
   2663   // FIXME: Change hasAndNot function to have two operands to make it work
   2664   //        correctly with Aurora VE.
   2665   if (isa<ConstantSDNode>(Y))
   2666     return false;
   2667 
   2668   // It's ok for generic registers.
   2669   return true;
   2670 }
   2671 
   2672 /// \returns the VVP_* SDNode opcode corresponsing to \p OC.
   2673 static Optional<unsigned> getVVPOpcode(unsigned Opcode) {
   2674   switch (Opcode) {
   2675 #define HANDLE_VP_TO_VVP(VPOPC, VVPNAME)                                       \
   2676   case ISD::VPOPC:                                                             \
   2677     return VEISD::VVPNAME;
   2678 #define ADD_VVP_OP(VVPNAME, SDNAME)                                            \
   2679   case VEISD::VVPNAME:                                                         \
   2680   case ISD::SDNAME:                                                            \
   2681     return VEISD::VVPNAME;
   2682 #include "VVPNodes.def"
   2683   }
   2684   return None;
   2685 }
   2686 
   2687 SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
   2688   // Can we represent this as a VVP node.
   2689   const unsigned Opcode = Op->getOpcode();
   2690   auto VVPOpcodeOpt = getVVPOpcode(Opcode);
   2691   if (!VVPOpcodeOpt.hasValue())
   2692     return SDValue();
   2693   unsigned VVPOpcode = VVPOpcodeOpt.getValue();
   2694   const bool FromVP = ISD::isVPOpcode(Opcode);
   2695 
   2696   // The representative and legalized vector type of this operation.
   2697   SDLoc DL(Op);
   2698   MVT MaskVT = MVT::v256i1; // TODO: packed mode.
   2699   EVT OpVecVT = Op.getValueType();
   2700   EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
   2701 
   2702   SDValue AVL;
   2703   SDValue Mask;
   2704 
   2705   if (FromVP) {
   2706     // All upstream VP SDNodes always have a mask and avl.
   2707     auto MaskIdx = ISD::getVPMaskIdx(Opcode).getValue();
   2708     auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode).getValue();
   2709     Mask = Op->getOperand(MaskIdx);
   2710     AVL = Op->getOperand(AVLIdx);
   2711 
   2712   } else {
   2713     // Materialize the VL parameter.
   2714     AVL = DAG.getConstant(OpVecVT.getVectorNumElements(), DL, MVT::i32);
   2715     SDValue ConstTrue = DAG.getConstant(1, DL, MVT::i32);
   2716     Mask = DAG.getNode(VEISD::VEC_BROADCAST, DL, MaskVT,
   2717                        ConstTrue); // emit a VEISD::VEC_BROADCAST here.
   2718   }
   2719 
   2720   // Categories we are interested in.
   2721   bool IsBinaryOp = false;
   2722 
   2723   switch (VVPOpcode) {
   2724 #define ADD_BINARY_VVP_OP(VVPNAME, ...)                                        \
   2725   case VEISD::VVPNAME:                                                         \
   2726     IsBinaryOp = true;                                                         \
   2727     break;
   2728 #include "VVPNodes.def"
   2729   }
   2730 
   2731   if (IsBinaryOp) {
   2732     assert(LegalVecVT.isSimple());
   2733     return DAG.getNode(VVPOpcode, DL, LegalVecVT, Op->getOperand(0),
   2734                        Op->getOperand(1), Mask, AVL);
   2735   }
   2736   llvm_unreachable("lowerToVVP called for unexpected SDNode.");
   2737 }
   2738 
   2739 SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
   2740                                                   SelectionDAG &DAG) const {
   2741   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
   2742   MVT VT = Op.getOperand(0).getSimpleValueType();
   2743 
   2744   // Special treatment for packed V64 types.
   2745   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
   2746   (void)VT;
   2747   // Example of codes:
   2748   //   %packed_v = extractelt %vr, %idx / 2
   2749   //   %v = %packed_v >> (%idx % 2 * 32)
   2750   //   %res = %v & 0xffffffff
   2751 
   2752   SDValue Vec = Op.getOperand(0);
   2753   SDValue Idx = Op.getOperand(1);
   2754   SDLoc DL(Op);
   2755   SDValue Result = Op;
   2756   if (0 /* Idx->isConstant() */) {
   2757     // TODO: optimized implementation using constant values
   2758   } else {
   2759     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
   2760     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
   2761     SDValue PackedElt =
   2762         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
   2763     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
   2764     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
   2765     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
   2766     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
   2767     PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift});
   2768     SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64);
   2769     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
   2770     SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
   2771     Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
   2772                                         MVT::i32, PackedElt, SubI32),
   2773                      0);
   2774 
   2775     if (Op.getSimpleValueType() == MVT::f32) {
   2776       Result = DAG.getBitcast(MVT::f32, Result);
   2777     } else {
   2778       assert(Op.getSimpleValueType() == MVT::i32);
   2779     }
   2780   }
   2781   return Result;
   2782 }
   2783 
   2784 SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
   2785                                                  SelectionDAG &DAG) const {
   2786   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
   2787   MVT VT = Op.getOperand(0).getSimpleValueType();
   2788 
   2789   // Special treatment for packed V64 types.
   2790   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
   2791   (void)VT;
   2792   // The v512i32 and v512f32 starts from upper bits (0..31).  This "upper
   2793   // bits" required `val << 32` from C implementation's point of view.
   2794   //
   2795   // Example of codes:
   2796   //   %packed_elt = extractelt %vr, (%idx >> 1)
   2797   //   %shift = ((%idx & 1) ^ 1) << 5
   2798   //   %packed_elt &= 0xffffffff00000000 >> shift
   2799   //   %packed_elt |= (zext %val) << shift
   2800   //   %vr = insertelt %vr, %packed_elt, (%idx >> 1)
   2801 
   2802   SDLoc DL(Op);
   2803   SDValue Vec = Op.getOperand(0);
   2804   SDValue Val = Op.getOperand(1);
   2805   SDValue Idx = Op.getOperand(2);
   2806   if (Idx.getSimpleValueType() == MVT::i32)
   2807     Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
   2808   if (Val.getSimpleValueType() == MVT::f32)
   2809     Val = DAG.getBitcast(MVT::i32, Val);
   2810   assert(Val.getSimpleValueType() == MVT::i32);
   2811   Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
   2812 
   2813   SDValue Result = Op;
   2814   if (0 /* Idx->isConstant()*/) {
   2815     // TODO: optimized implementation using constant values
   2816   } else {
   2817     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
   2818     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
   2819     SDValue PackedElt =
   2820         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
   2821     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
   2822     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
   2823     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
   2824     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
   2825     SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64);
   2826     Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift});
   2827     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
   2828     Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift});
   2829     PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val});
   2830     Result =
   2831         SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(),
   2832                                    {HalfIdx, PackedElt, Vec}),
   2833                 0);
   2834   }
   2835   return Result;
   2836 }
   2837