Home | History | Annotate | Line # | Download | only in ARM
      1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 //
      9 // This file implements the ARMSelectionDAGInfo class.
     10 //
     11 //===----------------------------------------------------------------------===//
     12 
     13 #include "ARMTargetMachine.h"
     14 #include "ARMTargetTransformInfo.h"
     15 #include "llvm/CodeGen/SelectionDAG.h"
     16 #include "llvm/IR/DerivedTypes.h"
     17 #include "llvm/Support/CommandLine.h"
     18 using namespace llvm;
     19 
     20 #define DEBUG_TYPE "arm-selectiondag-info"
     21 
     22 cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
     23     "arm-memtransfer-tploop", cl::Hidden,
     24     cl::desc("Control conversion of memcpy to "
     25              "Tail predicated loops (WLSTP)"),
     26     cl::init(TPLoop::ForceDisabled),
     27     cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
     28                           "Don't convert memcpy to TP loop."),
     29                clEnumValN(TPLoop::ForceEnabled, "force-enabled",
     30                           "Always convert memcpy to TP loop."),
     31                clEnumValN(TPLoop::Allow, "allow",
     32                           "Allow (may be subject to certain conditions) "
     33                           "conversion of memcpy to TP loop.")));
     34 
     35 // Emit, if possible, a specialized version of the given Libcall. Typically this
     36 // means selecting the appropriately aligned version, but we also convert memset
     37 // of 0 into memclr.
     38 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
     39     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     40     SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
     41   const ARMSubtarget &Subtarget =
     42       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
     43   const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
     44 
     45   // Only use a specialized AEABI function if the default version of this
     46   // Libcall is an AEABI function.
     47   if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
     48     return SDValue();
     49 
     50   // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
     51   // able to translate memset to memclr and use the value to index the function
     52   // name array.
     53   enum {
     54     AEABI_MEMCPY = 0,
     55     AEABI_MEMMOVE,
     56     AEABI_MEMSET,
     57     AEABI_MEMCLR
     58   } AEABILibcall;
     59   switch (LC) {
     60   case RTLIB::MEMCPY:
     61     AEABILibcall = AEABI_MEMCPY;
     62     break;
     63   case RTLIB::MEMMOVE:
     64     AEABILibcall = AEABI_MEMMOVE;
     65     break;
     66   case RTLIB::MEMSET:
     67     AEABILibcall = AEABI_MEMSET;
     68     if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
     69       if (ConstantSrc->getZExtValue() == 0)
     70         AEABILibcall = AEABI_MEMCLR;
     71     break;
     72   default:
     73     return SDValue();
     74   }
     75 
     76   // Choose the most-aligned libcall variant that we can
     77   enum {
     78     ALIGN1 = 0,
     79     ALIGN4,
     80     ALIGN8
     81   } AlignVariant;
     82   if ((Align & 7) == 0)
     83     AlignVariant = ALIGN8;
     84   else if ((Align & 3) == 0)
     85     AlignVariant = ALIGN4;
     86   else
     87     AlignVariant = ALIGN1;
     88 
     89   TargetLowering::ArgListTy Args;
     90   TargetLowering::ArgListEntry Entry;
     91   Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
     92   Entry.Node = Dst;
     93   Args.push_back(Entry);
     94   if (AEABILibcall == AEABI_MEMCLR) {
     95     Entry.Node = Size;
     96     Args.push_back(Entry);
     97   } else if (AEABILibcall == AEABI_MEMSET) {
     98     // Adjust parameters for memset, EABI uses format (ptr, size, value),
     99     // GNU library uses (ptr, value, size)
    100     // See RTABI section 4.3.4
    101     Entry.Node = Size;
    102     Args.push_back(Entry);
    103 
    104     // Extend or truncate the argument to be an i32 value for the call.
    105     if (Src.getValueType().bitsGT(MVT::i32))
    106       Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
    107     else if (Src.getValueType().bitsLT(MVT::i32))
    108       Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
    109 
    110     Entry.Node = Src;
    111     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
    112     Entry.IsSExt = false;
    113     Args.push_back(Entry);
    114   } else {
    115     Entry.Node = Src;
    116     Args.push_back(Entry);
    117 
    118     Entry.Node = Size;
    119     Args.push_back(Entry);
    120   }
    121 
    122   char const *FunctionNames[4][3] = {
    123     { "__aeabi_memcpy",  "__aeabi_memcpy4",  "__aeabi_memcpy8"  },
    124     { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
    125     { "__aeabi_memset",  "__aeabi_memset4",  "__aeabi_memset8"  },
    126     { "__aeabi_memclr",  "__aeabi_memclr4",  "__aeabi_memclr8"  }
    127   };
    128   TargetLowering::CallLoweringInfo CLI(DAG);
    129   CLI.setDebugLoc(dl)
    130       .setChain(Chain)
    131       .setLibCallee(
    132           TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
    133           DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
    134                                 TLI->getPointerTy(DAG.getDataLayout())),
    135           std::move(Args))
    136       .setDiscardResult();
    137   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
    138 
    139   return CallResult.second;
    140 }
    141 
    142 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
    143                                        const SelectionDAG &DAG,
    144                                        ConstantSDNode *ConstantSize,
    145                                        Align Alignment, bool IsMemcpy) {
    146   auto &F = DAG.getMachineFunction().getFunction();
    147   if (!EnableMemtransferTPLoop)
    148     return false;
    149   if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
    150     return true;
    151   // Do not generate inline TP loop if optimizations is disabled,
    152   // or if optimization for size (-Os or -Oz) is on.
    153   if (F.hasOptNone() || F.hasOptSize())
    154     return false;
    155   // If cli option is unset, for memset always generate inline TP.
    156   // For memcpy, check some conditions
    157   if (!IsMemcpy)
    158     return true;
    159   if (!ConstantSize && Alignment >= Align(4))
    160     return true;
    161   if (ConstantSize &&
    162       ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
    163       ConstantSize->getZExtValue() <
    164           Subtarget.getMaxMemcpyTPInlineSizeThreshold())
    165     return true;
    166   return false;
    167 }
    168 
    169 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
    170     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    171     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
    172     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
    173   const ARMSubtarget &Subtarget =
    174       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
    175   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
    176 
    177   if (Subtarget.hasMVEIntegerOps() &&
    178       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
    179     return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
    180                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
    181 
    182   // Do repeated 4-byte loads and stores. To be improved.
    183   // This requires 4-byte alignment.
    184   if (Alignment < Align(4))
    185     return SDValue();
    186   // This requires the copy size to be a constant, preferably
    187   // within a subtarget-specific limit.
    188   if (!ConstantSize)
    189     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
    190                                   Alignment.value(), RTLIB::MEMCPY);
    191   uint64_t SizeVal = ConstantSize->getZExtValue();
    192   if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
    193     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
    194                                   Alignment.value(), RTLIB::MEMCPY);
    195 
    196   unsigned BytesLeft = SizeVal & 3;
    197   unsigned NumMemOps = SizeVal >> 2;
    198   unsigned EmittedNumMemOps = 0;
    199   EVT VT = MVT::i32;
    200   unsigned VTSize = 4;
    201   unsigned i = 0;
    202   // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
    203   const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
    204   SDValue TFOps[6];
    205   SDValue Loads[6];
    206   uint64_t SrcOff = 0, DstOff = 0;
    207 
    208   // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
    209   // VLDM/VSTM and make this code emit it when appropriate. This would reduce
    210   // pressure on the general purpose registers. However this seems harder to map
    211   // onto the register allocator's view of the world.
    212 
    213   // The number of MEMCPY pseudo-instructions to emit. We use up to
    214   // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
    215   // later on. This is a lower bound on the number of MEMCPY operations we must
    216   // emit.
    217   unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
    218 
    219   // Code size optimisation: do not inline memcpy if expansion results in
    220   // more instructions than the libary call.
    221   if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
    222     return SDValue();
    223   }
    224 
    225   SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
    226 
    227   for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    228     // Evenly distribute registers among MEMCPY operations to reduce register
    229     // pressure.
    230     unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    231     unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
    232 
    233     Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
    234                       DAG.getConstant(NumRegs, dl, MVT::i32));
    235     Src = Dst.getValue(1);
    236     Chain = Dst.getValue(2);
    237 
    238     DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
    239     SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
    240 
    241     EmittedNumMemOps = NextEmittedNumMemOps;
    242   }
    243 
    244   if (BytesLeft == 0)
    245     return Chain;
    246 
    247   // Issue loads / stores for the trailing (1 - 3) bytes.
    248   auto getRemainingValueType = [](unsigned BytesLeft) {
    249     return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
    250   };
    251   auto getRemainingSize = [](unsigned BytesLeft) {
    252     return (BytesLeft >= 2) ? 2 : 1;
    253   };
    254 
    255   unsigned BytesLeftSave = BytesLeft;
    256   i = 0;
    257   while (BytesLeft) {
    258     VT = getRemainingValueType(BytesLeft);
    259     VTSize = getRemainingSize(BytesLeft);
    260     Loads[i] = DAG.getLoad(VT, dl, Chain,
    261                            DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
    262                                        DAG.getConstant(SrcOff, dl, MVT::i32)),
    263                            SrcPtrInfo.getWithOffset(SrcOff));
    264     TFOps[i] = Loads[i].getValue(1);
    265     ++i;
    266     SrcOff += VTSize;
    267     BytesLeft -= VTSize;
    268   }
    269   Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
    270                       makeArrayRef(TFOps, i));
    271 
    272   i = 0;
    273   BytesLeft = BytesLeftSave;
    274   while (BytesLeft) {
    275     VT = getRemainingValueType(BytesLeft);
    276     VTSize = getRemainingSize(BytesLeft);
    277     TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
    278                             DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
    279                                         DAG.getConstant(DstOff, dl, MVT::i32)),
    280                             DstPtrInfo.getWithOffset(DstOff));
    281     ++i;
    282     DstOff += VTSize;
    283     BytesLeft -= VTSize;
    284   }
    285   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
    286                      makeArrayRef(TFOps, i));
    287 }
    288 
    289 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
    290     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    291     SDValue Size, Align Alignment, bool isVolatile,
    292     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
    293   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
    294                                 Alignment.value(), RTLIB::MEMMOVE);
    295 }
    296 
    297 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
    298     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
    299     SDValue Size, Align Alignment, bool isVolatile,
    300     MachinePointerInfo DstPtrInfo) const {
    301 
    302   const ARMSubtarget &Subtarget =
    303       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
    304 
    305   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
    306 
    307   // Generate TP loop for llvm.memset
    308   if (Subtarget.hasMVEIntegerOps() &&
    309       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
    310                                  false)) {
    311     Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
    312                                   DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
    313     return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
    314                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
    315   }
    316 
    317   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
    318                                 Alignment.value(), RTLIB::MEMSET);
    319 }
    320