Home | History | Annotate | Line # | Download | only in X86
      1 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
      2 //
      3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4 // See https://llvm.org/LICENSE.txt for license information.
      5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6 //
      7 //===----------------------------------------------------------------------===//
      8 /// \file
      9 /// This file implements a TargetTransformInfo analysis pass specific to the
     10 /// X86 target machine. It uses the target's detailed information to provide
     11 /// more precise answers to certain TTI queries, while letting the target
     12 /// independent and default TTI implementations handle the rest.
     13 ///
     14 //===----------------------------------------------------------------------===//
     15 
     16 #include "X86TargetTransformInfo.h"
     17 #include "llvm/IR/IntrinsicInst.h"
     18 #include "llvm/IR/IntrinsicsX86.h"
     19 #include "llvm/Support/KnownBits.h"
     20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
     21 
     22 using namespace llvm;
     23 
     24 #define DEBUG_TYPE "x86tti"
     25 
     26 /// Return a constant boolean vector that has true elements in all positions
     27 /// where the input constant data vector has an element with the sign bit set.
     28 static Constant *getNegativeIsTrueBoolVec(Constant *V) {
     29   VectorType *IntTy = VectorType::getInteger(cast<VectorType>(V->getType()));
     30   V = ConstantExpr::getBitCast(V, IntTy);
     31   V = ConstantExpr::getICmp(CmpInst::ICMP_SGT, Constant::getNullValue(IntTy),
     32                             V);
     33   return V;
     34 }
     35 
     36 /// Convert the x86 XMM integer vector mask to a vector of bools based on
     37 /// each element's most significant bit (the sign bit).
     38 static Value *getBoolVecFromMask(Value *Mask) {
     39   // Fold Constant Mask.
     40   if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask))
     41     return getNegativeIsTrueBoolVec(ConstantMask);
     42 
     43   // Mask was extended from a boolean vector.
     44   Value *ExtMask;
     45   if (PatternMatch::match(
     46           Mask, PatternMatch::m_SExt(PatternMatch::m_Value(ExtMask))) &&
     47       ExtMask->getType()->isIntOrIntVectorTy(1))
     48     return ExtMask;
     49 
     50   return nullptr;
     51 }
     52 
     53 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
     54 // XMM register mask efficiently, we could transform all x86 masked intrinsics
     55 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
     56 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
     57   Value *Ptr = II.getOperand(0);
     58   Value *Mask = II.getOperand(1);
     59   Constant *ZeroVec = Constant::getNullValue(II.getType());
     60 
     61   // Zero Mask - masked load instruction creates a zero vector.
     62   if (isa<ConstantAggregateZero>(Mask))
     63     return IC.replaceInstUsesWith(II, ZeroVec);
     64 
     65   // The mask is constant or extended from a bool vector. Convert this x86
     66   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
     67   if (Value *BoolMask = getBoolVecFromMask(Mask)) {
     68     // First, cast the x86 intrinsic scalar pointer to a vector pointer to match
     69     // the LLVM intrinsic definition for the pointer argument.
     70     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
     71     PointerType *VecPtrTy = PointerType::get(II.getType(), AddrSpace);
     72     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
     73 
     74     // The pass-through vector for an x86 masked load is a zero vector.
     75     CallInst *NewMaskedLoad =
     76         IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
     77     return IC.replaceInstUsesWith(II, NewMaskedLoad);
     78   }
     79 
     80   return nullptr;
     81 }
     82 
     83 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
     84 // XMM register mask efficiently, we could transform all x86 masked intrinsics
     85 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
     86 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
     87   Value *Ptr = II.getOperand(0);
     88   Value *Mask = II.getOperand(1);
     89   Value *Vec = II.getOperand(2);
     90 
     91   // Zero Mask - this masked store instruction does nothing.
     92   if (isa<ConstantAggregateZero>(Mask)) {
     93     IC.eraseInstFromFunction(II);
     94     return true;
     95   }
     96 
     97   // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
     98   // anything else at this level.
     99   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
    100     return false;
    101 
    102   // The mask is constant or extended from a bool vector. Convert this x86
    103   // intrinsic to the LLVM intrinsic to allow target-independent optimizations.
    104   if (Value *BoolMask = getBoolVecFromMask(Mask)) {
    105     unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
    106     PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
    107     Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");
    108 
    109     IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
    110 
    111     // 'Replace uses' doesn't work for stores. Erase the original masked store.
    112     IC.eraseInstFromFunction(II);
    113     return true;
    114   }
    115 
    116   return false;
    117 }
    118 
    119 static Value *simplifyX86immShift(const IntrinsicInst &II,
    120                                   InstCombiner::BuilderTy &Builder) {
    121   bool LogicalShift = false;
    122   bool ShiftLeft = false;
    123   bool IsImm = false;
    124 
    125   switch (II.getIntrinsicID()) {
    126   default:
    127     llvm_unreachable("Unexpected intrinsic!");
    128   case Intrinsic::x86_sse2_psrai_d:
    129   case Intrinsic::x86_sse2_psrai_w:
    130   case Intrinsic::x86_avx2_psrai_d:
    131   case Intrinsic::x86_avx2_psrai_w:
    132   case Intrinsic::x86_avx512_psrai_q_128:
    133   case Intrinsic::x86_avx512_psrai_q_256:
    134   case Intrinsic::x86_avx512_psrai_d_512:
    135   case Intrinsic::x86_avx512_psrai_q_512:
    136   case Intrinsic::x86_avx512_psrai_w_512:
    137     IsImm = true;
    138     LLVM_FALLTHROUGH;
    139   case Intrinsic::x86_sse2_psra_d:
    140   case Intrinsic::x86_sse2_psra_w:
    141   case Intrinsic::x86_avx2_psra_d:
    142   case Intrinsic::x86_avx2_psra_w:
    143   case Intrinsic::x86_avx512_psra_q_128:
    144   case Intrinsic::x86_avx512_psra_q_256:
    145   case Intrinsic::x86_avx512_psra_d_512:
    146   case Intrinsic::x86_avx512_psra_q_512:
    147   case Intrinsic::x86_avx512_psra_w_512:
    148     LogicalShift = false;
    149     ShiftLeft = false;
    150     break;
    151   case Intrinsic::x86_sse2_psrli_d:
    152   case Intrinsic::x86_sse2_psrli_q:
    153   case Intrinsic::x86_sse2_psrli_w:
    154   case Intrinsic::x86_avx2_psrli_d:
    155   case Intrinsic::x86_avx2_psrli_q:
    156   case Intrinsic::x86_avx2_psrli_w:
    157   case Intrinsic::x86_avx512_psrli_d_512:
    158   case Intrinsic::x86_avx512_psrli_q_512:
    159   case Intrinsic::x86_avx512_psrli_w_512:
    160     IsImm = true;
    161     LLVM_FALLTHROUGH;
    162   case Intrinsic::x86_sse2_psrl_d:
    163   case Intrinsic::x86_sse2_psrl_q:
    164   case Intrinsic::x86_sse2_psrl_w:
    165   case Intrinsic::x86_avx2_psrl_d:
    166   case Intrinsic::x86_avx2_psrl_q:
    167   case Intrinsic::x86_avx2_psrl_w:
    168   case Intrinsic::x86_avx512_psrl_d_512:
    169   case Intrinsic::x86_avx512_psrl_q_512:
    170   case Intrinsic::x86_avx512_psrl_w_512:
    171     LogicalShift = true;
    172     ShiftLeft = false;
    173     break;
    174   case Intrinsic::x86_sse2_pslli_d:
    175   case Intrinsic::x86_sse2_pslli_q:
    176   case Intrinsic::x86_sse2_pslli_w:
    177   case Intrinsic::x86_avx2_pslli_d:
    178   case Intrinsic::x86_avx2_pslli_q:
    179   case Intrinsic::x86_avx2_pslli_w:
    180   case Intrinsic::x86_avx512_pslli_d_512:
    181   case Intrinsic::x86_avx512_pslli_q_512:
    182   case Intrinsic::x86_avx512_pslli_w_512:
    183     IsImm = true;
    184     LLVM_FALLTHROUGH;
    185   case Intrinsic::x86_sse2_psll_d:
    186   case Intrinsic::x86_sse2_psll_q:
    187   case Intrinsic::x86_sse2_psll_w:
    188   case Intrinsic::x86_avx2_psll_d:
    189   case Intrinsic::x86_avx2_psll_q:
    190   case Intrinsic::x86_avx2_psll_w:
    191   case Intrinsic::x86_avx512_psll_d_512:
    192   case Intrinsic::x86_avx512_psll_q_512:
    193   case Intrinsic::x86_avx512_psll_w_512:
    194     LogicalShift = true;
    195     ShiftLeft = true;
    196     break;
    197   }
    198   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
    199 
    200   auto Vec = II.getArgOperand(0);
    201   auto Amt = II.getArgOperand(1);
    202   auto VT = cast<FixedVectorType>(Vec->getType());
    203   auto SVT = VT->getElementType();
    204   auto AmtVT = Amt->getType();
    205   unsigned VWidth = VT->getNumElements();
    206   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
    207 
    208   // If the shift amount is guaranteed to be in-range we can replace it with a
    209   // generic shift. If its guaranteed to be out of range, logical shifts combine
    210   // to zero and arithmetic shifts are clamped to (BitWidth - 1).
    211   if (IsImm) {
    212     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
    213     KnownBits KnownAmtBits =
    214         llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
    215     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
    216       Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
    217       Amt = Builder.CreateVectorSplat(VWidth, Amt);
    218       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
    219                                         : Builder.CreateLShr(Vec, Amt))
    220                            : Builder.CreateAShr(Vec, Amt));
    221     }
    222     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
    223       if (LogicalShift)
    224         return ConstantAggregateZero::get(VT);
    225       Amt = ConstantInt::get(SVT, BitWidth - 1);
    226       return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
    227     }
    228   } else {
    229     // Ensure the first element has an in-range value and the rest of the
    230     // elements in the bottom 64 bits are zero.
    231     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
    232            cast<VectorType>(AmtVT)->getElementType() == SVT &&
    233            "Unexpected shift-by-scalar type");
    234     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
    235     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
    236     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
    237     KnownBits KnownLowerBits = llvm::computeKnownBits(
    238         Amt, DemandedLower, II.getModule()->getDataLayout());
    239     KnownBits KnownUpperBits = llvm::computeKnownBits(
    240         Amt, DemandedUpper, II.getModule()->getDataLayout());
    241     if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
    242         (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
    243       SmallVector<int, 16> ZeroSplat(VWidth, 0);
    244       Amt = Builder.CreateShuffleVector(Amt, ZeroSplat);
    245       return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
    246                                         : Builder.CreateLShr(Vec, Amt))
    247                            : Builder.CreateAShr(Vec, Amt));
    248     }
    249   }
    250 
    251   // Simplify if count is constant vector.
    252   auto CDV = dyn_cast<ConstantDataVector>(Amt);
    253   if (!CDV)
    254     return nullptr;
    255 
    256   // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
    257   // operand to compute the shift amount.
    258   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
    259          cast<VectorType>(AmtVT)->getElementType() == SVT &&
    260          "Unexpected shift-by-scalar type");
    261 
    262   // Concatenate the sub-elements to create the 64-bit value.
    263   APInt Count(64, 0);
    264   for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
    265     unsigned SubEltIdx = (NumSubElts - 1) - i;
    266     auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
    267     Count <<= BitWidth;
    268     Count |= SubElt->getValue().zextOrTrunc(64);
    269   }
    270 
    271   // If shift-by-zero then just return the original value.
    272   if (Count.isNullValue())
    273     return Vec;
    274 
    275   // Handle cases when Shift >= BitWidth.
    276   if (Count.uge(BitWidth)) {
    277     // If LogicalShift - just return zero.
    278     if (LogicalShift)
    279       return ConstantAggregateZero::get(VT);
    280 
    281     // If ArithmeticShift - clamp Shift to (BitWidth - 1).
    282     Count = APInt(64, BitWidth - 1);
    283   }
    284 
    285   // Get a constant vector of the same type as the first operand.
    286   auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
    287   auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);
    288 
    289   if (ShiftLeft)
    290     return Builder.CreateShl(Vec, ShiftVec);
    291 
    292   if (LogicalShift)
    293     return Builder.CreateLShr(Vec, ShiftVec);
    294 
    295   return Builder.CreateAShr(Vec, ShiftVec);
    296 }
    297 
    298 // Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
    299 // Unlike the generic IR shifts, the intrinsics have defined behaviour for out
    300 // of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
    301 static Value *simplifyX86varShift(const IntrinsicInst &II,
    302                                   InstCombiner::BuilderTy &Builder) {
    303   bool LogicalShift = false;
    304   bool ShiftLeft = false;
    305 
    306   switch (II.getIntrinsicID()) {
    307   default:
    308     llvm_unreachable("Unexpected intrinsic!");
    309   case Intrinsic::x86_avx2_psrav_d:
    310   case Intrinsic::x86_avx2_psrav_d_256:
    311   case Intrinsic::x86_avx512_psrav_q_128:
    312   case Intrinsic::x86_avx512_psrav_q_256:
    313   case Intrinsic::x86_avx512_psrav_d_512:
    314   case Intrinsic::x86_avx512_psrav_q_512:
    315   case Intrinsic::x86_avx512_psrav_w_128:
    316   case Intrinsic::x86_avx512_psrav_w_256:
    317   case Intrinsic::x86_avx512_psrav_w_512:
    318     LogicalShift = false;
    319     ShiftLeft = false;
    320     break;
    321   case Intrinsic::x86_avx2_psrlv_d:
    322   case Intrinsic::x86_avx2_psrlv_d_256:
    323   case Intrinsic::x86_avx2_psrlv_q:
    324   case Intrinsic::x86_avx2_psrlv_q_256:
    325   case Intrinsic::x86_avx512_psrlv_d_512:
    326   case Intrinsic::x86_avx512_psrlv_q_512:
    327   case Intrinsic::x86_avx512_psrlv_w_128:
    328   case Intrinsic::x86_avx512_psrlv_w_256:
    329   case Intrinsic::x86_avx512_psrlv_w_512:
    330     LogicalShift = true;
    331     ShiftLeft = false;
    332     break;
    333   case Intrinsic::x86_avx2_psllv_d:
    334   case Intrinsic::x86_avx2_psllv_d_256:
    335   case Intrinsic::x86_avx2_psllv_q:
    336   case Intrinsic::x86_avx2_psllv_q_256:
    337   case Intrinsic::x86_avx512_psllv_d_512:
    338   case Intrinsic::x86_avx512_psllv_q_512:
    339   case Intrinsic::x86_avx512_psllv_w_128:
    340   case Intrinsic::x86_avx512_psllv_w_256:
    341   case Intrinsic::x86_avx512_psllv_w_512:
    342     LogicalShift = true;
    343     ShiftLeft = true;
    344     break;
    345   }
    346   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
    347 
    348   auto Vec = II.getArgOperand(0);
    349   auto Amt = II.getArgOperand(1);
    350   auto VT = cast<FixedVectorType>(II.getType());
    351   auto SVT = VT->getElementType();
    352   int NumElts = VT->getNumElements();
    353   int BitWidth = SVT->getIntegerBitWidth();
    354 
    355   // If the shift amount is guaranteed to be in-range we can replace it with a
    356   // generic shift.
    357   APInt UpperBits =
    358       APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
    359   if (llvm::MaskedValueIsZero(Amt, UpperBits,
    360                               II.getModule()->getDataLayout())) {
    361     return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
    362                                       : Builder.CreateLShr(Vec, Amt))
    363                          : Builder.CreateAShr(Vec, Amt));
    364   }
    365 
    366   // Simplify if all shift amounts are constant/undef.
    367   auto *CShift = dyn_cast<Constant>(Amt);
    368   if (!CShift)
    369     return nullptr;
    370 
    371   // Collect each element's shift amount.
    372   // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
    373   bool AnyOutOfRange = false;
    374   SmallVector<int, 8> ShiftAmts;
    375   for (int I = 0; I < NumElts; ++I) {
    376     auto *CElt = CShift->getAggregateElement(I);
    377     if (isa_and_nonnull<UndefValue>(CElt)) {
    378       ShiftAmts.push_back(-1);
    379       continue;
    380     }
    381 
    382     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
    383     if (!COp)
    384       return nullptr;
    385 
    386     // Handle out of range shifts.
    387     // If LogicalShift - set to BitWidth (special case).
    388     // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
    389     APInt ShiftVal = COp->getValue();
    390     if (ShiftVal.uge(BitWidth)) {
    391       AnyOutOfRange = LogicalShift;
    392       ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
    393       continue;
    394     }
    395 
    396     ShiftAmts.push_back((int)ShiftVal.getZExtValue());
    397   }
    398 
    399   // If all elements out of range or UNDEF, return vector of zeros/undefs.
    400   // ArithmeticShift should only hit this if they are all UNDEF.
    401   auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
    402   if (llvm::all_of(ShiftAmts, OutOfRange)) {
    403     SmallVector<Constant *, 8> ConstantVec;
    404     for (int Idx : ShiftAmts) {
    405       if (Idx < 0) {
    406         ConstantVec.push_back(UndefValue::get(SVT));
    407       } else {
    408         assert(LogicalShift && "Logical shift expected");
    409         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
    410       }
    411     }
    412     return ConstantVector::get(ConstantVec);
    413   }
    414 
    415   // We can't handle only some out of range values with generic logical shifts.
    416   if (AnyOutOfRange)
    417     return nullptr;
    418 
    419   // Build the shift amount constant vector.
    420   SmallVector<Constant *, 8> ShiftVecAmts;
    421   for (int Idx : ShiftAmts) {
    422     if (Idx < 0)
    423       ShiftVecAmts.push_back(UndefValue::get(SVT));
    424     else
    425       ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
    426   }
    427   auto ShiftVec = ConstantVector::get(ShiftVecAmts);
    428 
    429   if (ShiftLeft)
    430     return Builder.CreateShl(Vec, ShiftVec);
    431 
    432   if (LogicalShift)
    433     return Builder.CreateLShr(Vec, ShiftVec);
    434 
    435   return Builder.CreateAShr(Vec, ShiftVec);
    436 }
    437 
    438 static Value *simplifyX86pack(IntrinsicInst &II,
    439                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
    440   Value *Arg0 = II.getArgOperand(0);
    441   Value *Arg1 = II.getArgOperand(1);
    442   Type *ResTy = II.getType();
    443 
    444   // Fast all undef handling.
    445   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
    446     return UndefValue::get(ResTy);
    447 
    448   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
    449   unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
    450   unsigned NumSrcElts = ArgTy->getNumElements();
    451   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
    452          "Unexpected packing types");
    453 
    454   unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
    455   unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
    456   unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
    457   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
    458          "Unexpected packing types");
    459 
    460   // Constant folding.
    461   if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
    462     return nullptr;
    463 
    464   // Clamp Values - signed/unsigned both use signed clamp values, but they
    465   // differ on the min/max values.
    466   APInt MinValue, MaxValue;
    467   if (IsSigned) {
    468     // PACKSS: Truncate signed value with signed saturation.
    469     // Source values less than dst minint are saturated to minint.
    470     // Source values greater than dst maxint are saturated to maxint.
    471     MinValue =
    472         APInt::getSignedMinValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    473     MaxValue =
    474         APInt::getSignedMaxValue(DstScalarSizeInBits).sext(SrcScalarSizeInBits);
    475   } else {
    476     // PACKUS: Truncate signed value with unsigned saturation.
    477     // Source values less than zero are saturated to zero.
    478     // Source values greater than dst maxuint are saturated to maxuint.
    479     MinValue = APInt::getNullValue(SrcScalarSizeInBits);
    480     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
    481   }
    482 
    483   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
    484   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
    485   Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
    486   Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
    487   Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
    488   Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
    489 
    490   // Shuffle clamped args together at the lane level.
    491   SmallVector<int, 32> PackMask;
    492   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    493     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
    494       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
    495     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
    496       PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane) + NumSrcElts);
    497   }
    498   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);
    499 
    500   // Truncate to dst size.
    501   return Builder.CreateTrunc(Shuffle, ResTy);
    502 }
    503 
    504 static Value *simplifyX86movmsk(const IntrinsicInst &II,
    505                                 InstCombiner::BuilderTy &Builder) {
    506   Value *Arg = II.getArgOperand(0);
    507   Type *ResTy = II.getType();
    508 
    509   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
    510   if (isa<UndefValue>(Arg))
    511     return Constant::getNullValue(ResTy);
    512 
    513   auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
    514   // We can't easily peek through x86_mmx types.
    515   if (!ArgTy)
    516     return nullptr;
    517 
    518   // Expand MOVMSK to compare/bitcast/zext:
    519   // e.g. PMOVMSKB(v16i8 x):
    520   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
    521   // %int = bitcast <16 x i1> %cmp to i16
    522   // %res = zext i16 %int to i32
    523   unsigned NumElts = ArgTy->getNumElements();
    524   Type *IntegerVecTy = VectorType::getInteger(ArgTy);
    525   Type *IntegerTy = Builder.getIntNTy(NumElts);
    526 
    527   Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
    528   Res = Builder.CreateICmpSLT(Res, Constant::getNullValue(IntegerVecTy));
    529   Res = Builder.CreateBitCast(Res, IntegerTy);
    530   Res = Builder.CreateZExtOrTrunc(Res, ResTy);
    531   return Res;
    532 }
    533 
    534 static Value *simplifyX86addcarry(const IntrinsicInst &II,
    535                                   InstCombiner::BuilderTy &Builder) {
    536   Value *CarryIn = II.getArgOperand(0);
    537   Value *Op1 = II.getArgOperand(1);
    538   Value *Op2 = II.getArgOperand(2);
    539   Type *RetTy = II.getType();
    540   Type *OpTy = Op1->getType();
    541   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
    542          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
    543          "Unexpected types for x86 addcarry");
    544 
    545   // If carry-in is zero, this is just an unsigned add with overflow.
    546   if (match(CarryIn, PatternMatch::m_ZeroInt())) {
    547     Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
    548                                           {Op1, Op2});
    549     // The types have to be adjusted to match the x86 call types.
    550     Value *UAddResult = Builder.CreateExtractValue(UAdd, 0);
    551     Value *UAddOV = Builder.CreateZExt(Builder.CreateExtractValue(UAdd, 1),
    552                                        Builder.getInt8Ty());
    553     Value *Res = UndefValue::get(RetTy);
    554     Res = Builder.CreateInsertValue(Res, UAddOV, 0);
    555     return Builder.CreateInsertValue(Res, UAddResult, 1);
    556   }
    557 
    558   return nullptr;
    559 }
    560 
    561 static Value *simplifyX86insertps(const IntrinsicInst &II,
    562                                   InstCombiner::BuilderTy &Builder) {
    563   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
    564   if (!CInt)
    565     return nullptr;
    566 
    567   auto *VecTy = cast<FixedVectorType>(II.getType());
    568   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");
    569 
    570   // The immediate permute control byte looks like this:
    571   //    [3:0] - zero mask for each 32-bit lane
    572   //    [5:4] - select one 32-bit destination lane
    573   //    [7:6] - select one 32-bit source lane
    574 
    575   uint8_t Imm = CInt->getZExtValue();
    576   uint8_t ZMask = Imm & 0xf;
    577   uint8_t DestLane = (Imm >> 4) & 0x3;
    578   uint8_t SourceLane = (Imm >> 6) & 0x3;
    579 
    580   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
    581 
    582   // If all zero mask bits are set, this was just a weird way to
    583   // generate a zero vector.
    584   if (ZMask == 0xf)
    585     return ZeroVector;
    586 
    587   // Initialize by passing all of the first source bits through.
    588   int ShuffleMask[4] = {0, 1, 2, 3};
    589 
    590   // We may replace the second operand with the zero vector.
    591   Value *V1 = II.getArgOperand(1);
    592 
    593   if (ZMask) {
    594     // If the zero mask is being used with a single input or the zero mask
    595     // overrides the destination lane, this is a shuffle with the zero vector.
    596     if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
    597         (ZMask & (1 << DestLane))) {
    598       V1 = ZeroVector;
    599       // We may still move 32-bits of the first source vector from one lane
    600       // to another.
    601       ShuffleMask[DestLane] = SourceLane;
    602       // The zero mask may override the previous insert operation.
    603       for (unsigned i = 0; i < 4; ++i)
    604         if ((ZMask >> i) & 0x1)
    605           ShuffleMask[i] = i + 4;
    606     } else {
    607       // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
    608       return nullptr;
    609     }
    610   } else {
    611     // Replace the selected destination lane with the selected source lane.
    612     ShuffleMask[DestLane] = SourceLane + 4;
    613   }
    614 
    615   return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
    616 }
    617 
    618 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
    619 /// or conversion to a shuffle vector.
    620 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
    621                                ConstantInt *CILength, ConstantInt *CIIndex,
    622                                InstCombiner::BuilderTy &Builder) {
    623   auto LowConstantHighUndef = [&](uint64_t Val) {
    624     Type *IntTy64 = Type::getInt64Ty(II.getContext());
    625     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
    626                         UndefValue::get(IntTy64)};
    627     return ConstantVector::get(Args);
    628   };
    629 
    630   // See if we're dealing with constant values.
    631   Constant *C0 = dyn_cast<Constant>(Op0);
    632   ConstantInt *CI0 =
    633       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
    634          : nullptr;
    635 
    636   // Attempt to constant fold.
    637   if (CILength && CIIndex) {
    638     // From AMD documentation: "The bit index and field length are each six
    639     // bits in length other bits of the field are ignored."
    640     APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
    641     APInt APLength = CILength->getValue().zextOrTrunc(6);
    642 
    643     unsigned Index = APIndex.getZExtValue();
    644 
    645     // From AMD documentation: "a value of zero in the field length is
    646     // defined as length of 64".
    647     unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
    648 
    649     // From AMD documentation: "If the sum of the bit index + length field
    650     // is greater than 64, the results are undefined".
    651     unsigned End = Index + Length;
    652 
    653     // Note that both field index and field length are 8-bit quantities.
    654     // Since variables 'Index' and 'Length' are unsigned values
    655     // obtained from zero-extending field index and field length
    656     // respectively, their sum should never wrap around.
    657     if (End > 64)
    658       return UndefValue::get(II.getType());
    659 
    660     // If we are inserting whole bytes, we can convert this to a shuffle.
    661     // Lowering can recognize EXTRQI shuffle masks.
    662     if ((Length % 8) == 0 && (Index % 8) == 0) {
    663       // Convert bit indices to byte indices.
    664       Length /= 8;
    665       Index /= 8;
    666 
    667       Type *IntTy8 = Type::getInt8Ty(II.getContext());
    668       auto *ShufTy = FixedVectorType::get(IntTy8, 16);
    669 
    670       SmallVector<int, 16> ShuffleMask;
    671       for (int i = 0; i != (int)Length; ++i)
    672         ShuffleMask.push_back(i + Index);
    673       for (int i = Length; i != 8; ++i)
    674         ShuffleMask.push_back(i + 16);
    675       for (int i = 8; i != 16; ++i)
    676         ShuffleMask.push_back(-1);
    677 
    678       Value *SV = Builder.CreateShuffleVector(
    679           Builder.CreateBitCast(Op0, ShufTy),
    680           ConstantAggregateZero::get(ShufTy), ShuffleMask);
    681       return Builder.CreateBitCast(SV, II.getType());
    682     }
    683 
    684     // Constant Fold - shift Index'th bit to lowest position and mask off
    685     // Length bits.
    686     if (CI0) {
    687       APInt Elt = CI0->getValue();
    688       Elt.lshrInPlace(Index);
    689       Elt = Elt.zextOrTrunc(Length);
    690       return LowConstantHighUndef(Elt.getZExtValue());
    691     }
    692 
    693     // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
    694     if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
    695       Value *Args[] = {Op0, CILength, CIIndex};
    696       Module *M = II.getModule();
    697       Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
    698       return Builder.CreateCall(F, Args);
    699     }
    700   }
    701 
    702   // Constant Fold - extraction from zero is always {zero, undef}.
    703   if (CI0 && CI0->isZero())
    704     return LowConstantHighUndef(0);
    705 
    706   return nullptr;
    707 }
    708 
    709 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
    710 /// folding or conversion to a shuffle vector.
    711 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
    712                                  APInt APLength, APInt APIndex,
    713                                  InstCombiner::BuilderTy &Builder) {
    714   // From AMD documentation: "The bit index and field length are each six bits
    715   // in length other bits of the field are ignored."
    716   APIndex = APIndex.zextOrTrunc(6);
    717   APLength = APLength.zextOrTrunc(6);
    718 
    719   // Attempt to constant fold.
    720   unsigned Index = APIndex.getZExtValue();
    721 
    722   // From AMD documentation: "a value of zero in the field length is
    723   // defined as length of 64".
    724   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();
    725 
    726   // From AMD documentation: "If the sum of the bit index + length field
    727   // is greater than 64, the results are undefined".
    728   unsigned End = Index + Length;
    729 
    730   // Note that both field index and field length are 8-bit quantities.
    731   // Since variables 'Index' and 'Length' are unsigned values
    732   // obtained from zero-extending field index and field length
    733   // respectively, their sum should never wrap around.
    734   if (End > 64)
    735     return UndefValue::get(II.getType());
    736 
    737   // If we are inserting whole bytes, we can convert this to a shuffle.
    738   // Lowering can recognize INSERTQI shuffle masks.
    739   if ((Length % 8) == 0 && (Index % 8) == 0) {
    740     // Convert bit indices to byte indices.
    741     Length /= 8;
    742     Index /= 8;
    743 
    744     Type *IntTy8 = Type::getInt8Ty(II.getContext());
    745     auto *ShufTy = FixedVectorType::get(IntTy8, 16);
    746 
    747     SmallVector<int, 16> ShuffleMask;
    748     for (int i = 0; i != (int)Index; ++i)
    749       ShuffleMask.push_back(i);
    750     for (int i = 0; i != (int)Length; ++i)
    751       ShuffleMask.push_back(i + 16);
    752     for (int i = Index + Length; i != 8; ++i)
    753       ShuffleMask.push_back(i);
    754     for (int i = 8; i != 16; ++i)
    755       ShuffleMask.push_back(-1);
    756 
    757     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
    758                                             Builder.CreateBitCast(Op1, ShufTy),
    759                                             ShuffleMask);
    760     return Builder.CreateBitCast(SV, II.getType());
    761   }
    762 
    763   // See if we're dealing with constant values.
    764   Constant *C0 = dyn_cast<Constant>(Op0);
    765   Constant *C1 = dyn_cast<Constant>(Op1);
    766   ConstantInt *CI00 =
    767       C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
    768          : nullptr;
    769   ConstantInt *CI10 =
    770       C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
    771          : nullptr;
    772 
    773   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
    774   if (CI00 && CI10) {
    775     APInt V00 = CI00->getValue();
    776     APInt V10 = CI10->getValue();
    777     APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
    778     V00 = V00 & ~Mask;
    779     V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
    780     APInt Val = V00 | V10;
    781     Type *IntTy64 = Type::getInt64Ty(II.getContext());
    782     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
    783                         UndefValue::get(IntTy64)};
    784     return ConstantVector::get(Args);
    785   }
    786 
    787   // If we were an INSERTQ call, we'll save demanded elements if we convert to
    788   // INSERTQI.
    789   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
    790     Type *IntTy8 = Type::getInt8Ty(II.getContext());
    791     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
    792     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
    793 
    794     Value *Args[] = {Op0, Op1, CILength, CIIndex};
    795     Module *M = II.getModule();
    796     Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
    797     return Builder.CreateCall(F, Args);
    798   }
    799 
    800   return nullptr;
    801 }
    802 
    803 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
    804 static Value *simplifyX86pshufb(const IntrinsicInst &II,
    805                                 InstCombiner::BuilderTy &Builder) {
    806   Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
    807   if (!V)
    808     return nullptr;
    809 
    810   auto *VecTy = cast<FixedVectorType>(II.getType());
    811   unsigned NumElts = VecTy->getNumElements();
    812   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
    813          "Unexpected number of elements in shuffle mask!");
    814 
    815   // Construct a shuffle mask from constant integers or UNDEFs.
    816   int Indexes[64];
    817 
    818   // Each byte in the shuffle control mask forms an index to permute the
    819   // corresponding byte in the destination operand.
    820   for (unsigned I = 0; I < NumElts; ++I) {
    821     Constant *COp = V->getAggregateElement(I);
    822     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
    823       return nullptr;
    824 
    825     if (isa<UndefValue>(COp)) {
    826       Indexes[I] = -1;
    827       continue;
    828     }
    829 
    830     int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();
    831 
    832     // If the most significant bit (bit[7]) of each byte of the shuffle
    833     // control mask is set, then zero is written in the result byte.
    834     // The zero vector is in the right-hand side of the resulting
    835     // shufflevector.
    836 
    837     // The value of each index for the high 128-bit lane is the least
    838     // significant 4 bits of the respective shuffle control byte.
    839     Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
    840     Indexes[I] = Index;
    841   }
    842 
    843   auto V1 = II.getArgOperand(0);
    844   auto V2 = Constant::getNullValue(VecTy);
    845   return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
    846 }
    847 
    848 /// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
    849 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
    850                                     InstCombiner::BuilderTy &Builder) {
    851   Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
    852   if (!V)
    853     return nullptr;
    854 
    855   auto *VecTy = cast<FixedVectorType>(II.getType());
    856   unsigned NumElts = VecTy->getNumElements();
    857   bool IsPD = VecTy->getScalarType()->isDoubleTy();
    858   unsigned NumLaneElts = IsPD ? 2 : 4;
    859   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
    860 
    861   // Construct a shuffle mask from constant integers or UNDEFs.
    862   int Indexes[16];
    863 
    864   // The intrinsics only read one or two bits, clear the rest.
    865   for (unsigned I = 0; I < NumElts; ++I) {
    866     Constant *COp = V->getAggregateElement(I);
    867     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
    868       return nullptr;
    869 
    870     if (isa<UndefValue>(COp)) {
    871       Indexes[I] = -1;
    872       continue;
    873     }
    874 
    875     APInt Index = cast<ConstantInt>(COp)->getValue();
    876     Index = Index.zextOrTrunc(32).getLoBits(2);
    877 
    878     // The PD variants uses bit 1 to select per-lane element index, so
    879     // shift down to convert to generic shuffle mask index.
    880     if (IsPD)
    881       Index.lshrInPlace(1);
    882 
    883     // The _256 variants are a bit trickier since the mask bits always index
    884     // into the corresponding 128 half. In order to convert to a generic
    885     // shuffle, we have to make that explicit.
    886     Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
    887 
    888     Indexes[I] = Index.getZExtValue();
    889   }
    890 
    891   auto V1 = II.getArgOperand(0);
    892   return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, NumElts));
    893 }
    894 
    895 /// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
    896 static Value *simplifyX86vpermv(const IntrinsicInst &II,
    897                                 InstCombiner::BuilderTy &Builder) {
    898   auto *V = dyn_cast<Constant>(II.getArgOperand(1));
    899   if (!V)
    900     return nullptr;
    901 
    902   auto *VecTy = cast<FixedVectorType>(II.getType());
    903   unsigned Size = VecTy->getNumElements();
    904   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
    905          "Unexpected shuffle mask size");
    906 
    907   // Construct a shuffle mask from constant integers or UNDEFs.
    908   int Indexes[64];
    909 
    910   for (unsigned I = 0; I < Size; ++I) {
    911     Constant *COp = V->getAggregateElement(I);
    912     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
    913       return nullptr;
    914 
    915     if (isa<UndefValue>(COp)) {
    916       Indexes[I] = -1;
    917       continue;
    918     }
    919 
    920     uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
    921     Index &= Size - 1;
    922     Indexes[I] = Index;
    923   }
    924 
    925   auto V1 = II.getArgOperand(0);
    926   return Builder.CreateShuffleVector(V1, makeArrayRef(Indexes, Size));
    927 }
    928 
    929 Optional<Instruction *>
    930 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
    931   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
    932                                              unsigned DemandedWidth) {
    933     APInt UndefElts(Width, 0);
    934     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
    935     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
    936   };
    937 
    938   Intrinsic::ID IID = II.getIntrinsicID();
    939   switch (IID) {
    940   case Intrinsic::x86_bmi_bextr_32:
    941   case Intrinsic::x86_bmi_bextr_64:
    942   case Intrinsic::x86_tbm_bextri_u32:
    943   case Intrinsic::x86_tbm_bextri_u64:
    944     // If the RHS is a constant we can try some simplifications.
    945     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
    946       uint64_t Shift = C->getZExtValue();
    947       uint64_t Length = (Shift >> 8) & 0xff;
    948       Shift &= 0xff;
    949       unsigned BitWidth = II.getType()->getIntegerBitWidth();
    950       // If the length is 0 or the shift is out of range, replace with zero.
    951       if (Length == 0 || Shift >= BitWidth) {
    952         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
    953       }
    954       // If the LHS is also a constant, we can completely constant fold this.
    955       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
    956         uint64_t Result = InC->getZExtValue() >> Shift;
    957         if (Length > BitWidth)
    958           Length = BitWidth;
    959         Result &= maskTrailingOnes<uint64_t>(Length);
    960         return IC.replaceInstUsesWith(II,
    961                                       ConstantInt::get(II.getType(), Result));
    962       }
    963       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
    964       // are only masking bits that a shift already cleared?
    965     }
    966     break;
    967 
    968   case Intrinsic::x86_bmi_bzhi_32:
    969   case Intrinsic::x86_bmi_bzhi_64:
    970     // If the RHS is a constant we can try some simplifications.
    971     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
    972       uint64_t Index = C->getZExtValue() & 0xff;
    973       unsigned BitWidth = II.getType()->getIntegerBitWidth();
    974       if (Index >= BitWidth) {
    975         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
    976       }
    977       if (Index == 0) {
    978         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
    979       }
    980       // If the LHS is also a constant, we can completely constant fold this.
    981       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
    982         uint64_t Result = InC->getZExtValue();
    983         Result &= maskTrailingOnes<uint64_t>(Index);
    984         return IC.replaceInstUsesWith(II,
    985                                       ConstantInt::get(II.getType(), Result));
    986       }
    987       // TODO should we convert this to an AND if the RHS is constant?
    988     }
    989     break;
    990   case Intrinsic::x86_bmi_pext_32:
    991   case Intrinsic::x86_bmi_pext_64:
    992     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
    993       if (MaskC->isNullValue()) {
    994         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
    995       }
    996       if (MaskC->isAllOnesValue()) {
    997         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
    998       }
    999 
   1000       if (MaskC->getValue().isShiftedMask()) {
   1001         // any single contiguous sequence of 1s anywhere in the mask simply
   1002         // describes a subset of the input bits shifted to the appropriate
   1003         // position.  Replace with the straightforward IR.
   1004         unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
   1005         Value *Input = II.getArgOperand(0);
   1006         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
   1007         Value *Shifted = IC.Builder.CreateLShr(Masked,
   1008                                                ConstantInt::get(II.getType(),
   1009                                                                 ShiftAmount));
   1010         return IC.replaceInstUsesWith(II, Shifted);
   1011       }
   1012 
   1013 
   1014       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
   1015         uint64_t Src = SrcC->getZExtValue();
   1016         uint64_t Mask = MaskC->getZExtValue();
   1017         uint64_t Result = 0;
   1018         uint64_t BitToSet = 1;
   1019 
   1020         while (Mask) {
   1021           // Isolate lowest set bit.
   1022           uint64_t BitToTest = Mask & -Mask;
   1023           if (BitToTest & Src)
   1024             Result |= BitToSet;
   1025 
   1026           BitToSet <<= 1;
   1027           // Clear lowest set bit.
   1028           Mask &= Mask - 1;
   1029         }
   1030 
   1031         return IC.replaceInstUsesWith(II,
   1032                                       ConstantInt::get(II.getType(), Result));
   1033       }
   1034     }
   1035     break;
   1036   case Intrinsic::x86_bmi_pdep_32:
   1037   case Intrinsic::x86_bmi_pdep_64:
   1038     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
   1039       if (MaskC->isNullValue()) {
   1040         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
   1041       }
   1042       if (MaskC->isAllOnesValue()) {
   1043         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
   1044       }
   1045       if (MaskC->getValue().isShiftedMask()) {
   1046         // any single contiguous sequence of 1s anywhere in the mask simply
   1047         // describes a subset of the input bits shifted to the appropriate
   1048         // position.  Replace with the straightforward IR.
   1049         unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
   1050         Value *Input = II.getArgOperand(0);
   1051         Value *Shifted = IC.Builder.CreateShl(Input,
   1052                                               ConstantInt::get(II.getType(),
   1053                                                                ShiftAmount));
   1054         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
   1055         return IC.replaceInstUsesWith(II, Masked);
   1056       }
   1057 
   1058       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
   1059         uint64_t Src = SrcC->getZExtValue();
   1060         uint64_t Mask = MaskC->getZExtValue();
   1061         uint64_t Result = 0;
   1062         uint64_t BitToTest = 1;
   1063 
   1064         while (Mask) {
   1065           // Isolate lowest set bit.
   1066           uint64_t BitToSet = Mask & -Mask;
   1067           if (BitToTest & Src)
   1068             Result |= BitToSet;
   1069 
   1070           BitToTest <<= 1;
   1071           // Clear lowest set bit.
   1072           Mask &= Mask - 1;
   1073         }
   1074 
   1075         return IC.replaceInstUsesWith(II,
   1076                                       ConstantInt::get(II.getType(), Result));
   1077       }
   1078     }
   1079     break;
   1080 
   1081   case Intrinsic::x86_sse_cvtss2si:
   1082   case Intrinsic::x86_sse_cvtss2si64:
   1083   case Intrinsic::x86_sse_cvttss2si:
   1084   case Intrinsic::x86_sse_cvttss2si64:
   1085   case Intrinsic::x86_sse2_cvtsd2si:
   1086   case Intrinsic::x86_sse2_cvtsd2si64:
   1087   case Intrinsic::x86_sse2_cvttsd2si:
   1088   case Intrinsic::x86_sse2_cvttsd2si64:
   1089   case Intrinsic::x86_avx512_vcvtss2si32:
   1090   case Intrinsic::x86_avx512_vcvtss2si64:
   1091   case Intrinsic::x86_avx512_vcvtss2usi32:
   1092   case Intrinsic::x86_avx512_vcvtss2usi64:
   1093   case Intrinsic::x86_avx512_vcvtsd2si32:
   1094   case Intrinsic::x86_avx512_vcvtsd2si64:
   1095   case Intrinsic::x86_avx512_vcvtsd2usi32:
   1096   case Intrinsic::x86_avx512_vcvtsd2usi64:
   1097   case Intrinsic::x86_avx512_cvttss2si:
   1098   case Intrinsic::x86_avx512_cvttss2si64:
   1099   case Intrinsic::x86_avx512_cvttss2usi:
   1100   case Intrinsic::x86_avx512_cvttss2usi64:
   1101   case Intrinsic::x86_avx512_cvttsd2si:
   1102   case Intrinsic::x86_avx512_cvttsd2si64:
   1103   case Intrinsic::x86_avx512_cvttsd2usi:
   1104   case Intrinsic::x86_avx512_cvttsd2usi64: {
   1105     // These intrinsics only demand the 0th element of their input vectors. If
   1106     // we can simplify the input based on that, do so now.
   1107     Value *Arg = II.getArgOperand(0);
   1108     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
   1109     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
   1110       return IC.replaceOperand(II, 0, V);
   1111     }
   1112     break;
   1113   }
   1114 
   1115   case Intrinsic::x86_mmx_pmovmskb:
   1116   case Intrinsic::x86_sse_movmsk_ps:
   1117   case Intrinsic::x86_sse2_movmsk_pd:
   1118   case Intrinsic::x86_sse2_pmovmskb_128:
   1119   case Intrinsic::x86_avx_movmsk_pd_256:
   1120   case Intrinsic::x86_avx_movmsk_ps_256:
   1121   case Intrinsic::x86_avx2_pmovmskb:
   1122     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
   1123       return IC.replaceInstUsesWith(II, V);
   1124     }
   1125     break;
   1126 
   1127   case Intrinsic::x86_sse_comieq_ss:
   1128   case Intrinsic::x86_sse_comige_ss:
   1129   case Intrinsic::x86_sse_comigt_ss:
   1130   case Intrinsic::x86_sse_comile_ss:
   1131   case Intrinsic::x86_sse_comilt_ss:
   1132   case Intrinsic::x86_sse_comineq_ss:
   1133   case Intrinsic::x86_sse_ucomieq_ss:
   1134   case Intrinsic::x86_sse_ucomige_ss:
   1135   case Intrinsic::x86_sse_ucomigt_ss:
   1136   case Intrinsic::x86_sse_ucomile_ss:
   1137   case Intrinsic::x86_sse_ucomilt_ss:
   1138   case Intrinsic::x86_sse_ucomineq_ss:
   1139   case Intrinsic::x86_sse2_comieq_sd:
   1140   case Intrinsic::x86_sse2_comige_sd:
   1141   case Intrinsic::x86_sse2_comigt_sd:
   1142   case Intrinsic::x86_sse2_comile_sd:
   1143   case Intrinsic::x86_sse2_comilt_sd:
   1144   case Intrinsic::x86_sse2_comineq_sd:
   1145   case Intrinsic::x86_sse2_ucomieq_sd:
   1146   case Intrinsic::x86_sse2_ucomige_sd:
   1147   case Intrinsic::x86_sse2_ucomigt_sd:
   1148   case Intrinsic::x86_sse2_ucomile_sd:
   1149   case Intrinsic::x86_sse2_ucomilt_sd:
   1150   case Intrinsic::x86_sse2_ucomineq_sd:
   1151   case Intrinsic::x86_avx512_vcomi_ss:
   1152   case Intrinsic::x86_avx512_vcomi_sd:
   1153   case Intrinsic::x86_avx512_mask_cmp_ss:
   1154   case Intrinsic::x86_avx512_mask_cmp_sd: {
   1155     // These intrinsics only demand the 0th element of their input vectors. If
   1156     // we can simplify the input based on that, do so now.
   1157     bool MadeChange = false;
   1158     Value *Arg0 = II.getArgOperand(0);
   1159     Value *Arg1 = II.getArgOperand(1);
   1160     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
   1161     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
   1162       IC.replaceOperand(II, 0, V);
   1163       MadeChange = true;
   1164     }
   1165     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
   1166       IC.replaceOperand(II, 1, V);
   1167       MadeChange = true;
   1168     }
   1169     if (MadeChange) {
   1170       return &II;
   1171     }
   1172     break;
   1173   }
   1174 
   1175   case Intrinsic::x86_avx512_add_ps_512:
   1176   case Intrinsic::x86_avx512_div_ps_512:
   1177   case Intrinsic::x86_avx512_mul_ps_512:
   1178   case Intrinsic::x86_avx512_sub_ps_512:
   1179   case Intrinsic::x86_avx512_add_pd_512:
   1180   case Intrinsic::x86_avx512_div_pd_512:
   1181   case Intrinsic::x86_avx512_mul_pd_512:
   1182   case Intrinsic::x86_avx512_sub_pd_512:
   1183     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
   1184     // IR operations.
   1185     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
   1186       if (R->getValue() == 4) {
   1187         Value *Arg0 = II.getArgOperand(0);
   1188         Value *Arg1 = II.getArgOperand(1);
   1189 
   1190         Value *V;
   1191         switch (IID) {
   1192         default:
   1193           llvm_unreachable("Case stmts out of sync!");
   1194         case Intrinsic::x86_avx512_add_ps_512:
   1195         case Intrinsic::x86_avx512_add_pd_512:
   1196           V = IC.Builder.CreateFAdd(Arg0, Arg1);
   1197           break;
   1198         case Intrinsic::x86_avx512_sub_ps_512:
   1199         case Intrinsic::x86_avx512_sub_pd_512:
   1200           V = IC.Builder.CreateFSub(Arg0, Arg1);
   1201           break;
   1202         case Intrinsic::x86_avx512_mul_ps_512:
   1203         case Intrinsic::x86_avx512_mul_pd_512:
   1204           V = IC.Builder.CreateFMul(Arg0, Arg1);
   1205           break;
   1206         case Intrinsic::x86_avx512_div_ps_512:
   1207         case Intrinsic::x86_avx512_div_pd_512:
   1208           V = IC.Builder.CreateFDiv(Arg0, Arg1);
   1209           break;
   1210         }
   1211 
   1212         return IC.replaceInstUsesWith(II, V);
   1213       }
   1214     }
   1215     break;
   1216 
   1217   case Intrinsic::x86_avx512_mask_add_ss_round:
   1218   case Intrinsic::x86_avx512_mask_div_ss_round:
   1219   case Intrinsic::x86_avx512_mask_mul_ss_round:
   1220   case Intrinsic::x86_avx512_mask_sub_ss_round:
   1221   case Intrinsic::x86_avx512_mask_add_sd_round:
   1222   case Intrinsic::x86_avx512_mask_div_sd_round:
   1223   case Intrinsic::x86_avx512_mask_mul_sd_round:
   1224   case Intrinsic::x86_avx512_mask_sub_sd_round:
   1225     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
   1226     // IR operations.
   1227     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
   1228       if (R->getValue() == 4) {
   1229         // Extract the element as scalars.
   1230         Value *Arg0 = II.getArgOperand(0);
   1231         Value *Arg1 = II.getArgOperand(1);
   1232         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
   1233         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);
   1234 
   1235         Value *V;
   1236         switch (IID) {
   1237         default:
   1238           llvm_unreachable("Case stmts out of sync!");
   1239         case Intrinsic::x86_avx512_mask_add_ss_round:
   1240         case Intrinsic::x86_avx512_mask_add_sd_round:
   1241           V = IC.Builder.CreateFAdd(LHS, RHS);
   1242           break;
   1243         case Intrinsic::x86_avx512_mask_sub_ss_round:
   1244         case Intrinsic::x86_avx512_mask_sub_sd_round:
   1245           V = IC.Builder.CreateFSub(LHS, RHS);
   1246           break;
   1247         case Intrinsic::x86_avx512_mask_mul_ss_round:
   1248         case Intrinsic::x86_avx512_mask_mul_sd_round:
   1249           V = IC.Builder.CreateFMul(LHS, RHS);
   1250           break;
   1251         case Intrinsic::x86_avx512_mask_div_ss_round:
   1252         case Intrinsic::x86_avx512_mask_div_sd_round:
   1253           V = IC.Builder.CreateFDiv(LHS, RHS);
   1254           break;
   1255         }
   1256 
   1257         // Handle the masking aspect of the intrinsic.
   1258         Value *Mask = II.getArgOperand(3);
   1259         auto *C = dyn_cast<ConstantInt>(Mask);
   1260         // We don't need a select if we know the mask bit is a 1.
   1261         if (!C || !C->getValue()[0]) {
   1262           // Cast the mask to an i1 vector and then extract the lowest element.
   1263           auto *MaskTy = FixedVectorType::get(
   1264               IC.Builder.getInt1Ty(),
   1265               cast<IntegerType>(Mask->getType())->getBitWidth());
   1266           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
   1267           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
   1268           // Extract the lowest element from the passthru operand.
   1269           Value *Passthru =
   1270               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
   1271           V = IC.Builder.CreateSelect(Mask, V, Passthru);
   1272         }
   1273 
   1274         // Insert the result back into the original argument 0.
   1275         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);
   1276 
   1277         return IC.replaceInstUsesWith(II, V);
   1278       }
   1279     }
   1280     break;
   1281 
   1282   // Constant fold ashr( <A x Bi>, Ci ).
   1283   // Constant fold lshr( <A x Bi>, Ci ).
   1284   // Constant fold shl( <A x Bi>, Ci ).
   1285   case Intrinsic::x86_sse2_psrai_d:
   1286   case Intrinsic::x86_sse2_psrai_w:
   1287   case Intrinsic::x86_avx2_psrai_d:
   1288   case Intrinsic::x86_avx2_psrai_w:
   1289   case Intrinsic::x86_avx512_psrai_q_128:
   1290   case Intrinsic::x86_avx512_psrai_q_256:
   1291   case Intrinsic::x86_avx512_psrai_d_512:
   1292   case Intrinsic::x86_avx512_psrai_q_512:
   1293   case Intrinsic::x86_avx512_psrai_w_512:
   1294   case Intrinsic::x86_sse2_psrli_d:
   1295   case Intrinsic::x86_sse2_psrli_q:
   1296   case Intrinsic::x86_sse2_psrli_w:
   1297   case Intrinsic::x86_avx2_psrli_d:
   1298   case Intrinsic::x86_avx2_psrli_q:
   1299   case Intrinsic::x86_avx2_psrli_w:
   1300   case Intrinsic::x86_avx512_psrli_d_512:
   1301   case Intrinsic::x86_avx512_psrli_q_512:
   1302   case Intrinsic::x86_avx512_psrli_w_512:
   1303   case Intrinsic::x86_sse2_pslli_d:
   1304   case Intrinsic::x86_sse2_pslli_q:
   1305   case Intrinsic::x86_sse2_pslli_w:
   1306   case Intrinsic::x86_avx2_pslli_d:
   1307   case Intrinsic::x86_avx2_pslli_q:
   1308   case Intrinsic::x86_avx2_pslli_w:
   1309   case Intrinsic::x86_avx512_pslli_d_512:
   1310   case Intrinsic::x86_avx512_pslli_q_512:
   1311   case Intrinsic::x86_avx512_pslli_w_512:
   1312     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
   1313       return IC.replaceInstUsesWith(II, V);
   1314     }
   1315     break;
   1316 
   1317   case Intrinsic::x86_sse2_psra_d:
   1318   case Intrinsic::x86_sse2_psra_w:
   1319   case Intrinsic::x86_avx2_psra_d:
   1320   case Intrinsic::x86_avx2_psra_w:
   1321   case Intrinsic::x86_avx512_psra_q_128:
   1322   case Intrinsic::x86_avx512_psra_q_256:
   1323   case Intrinsic::x86_avx512_psra_d_512:
   1324   case Intrinsic::x86_avx512_psra_q_512:
   1325   case Intrinsic::x86_avx512_psra_w_512:
   1326   case Intrinsic::x86_sse2_psrl_d:
   1327   case Intrinsic::x86_sse2_psrl_q:
   1328   case Intrinsic::x86_sse2_psrl_w:
   1329   case Intrinsic::x86_avx2_psrl_d:
   1330   case Intrinsic::x86_avx2_psrl_q:
   1331   case Intrinsic::x86_avx2_psrl_w:
   1332   case Intrinsic::x86_avx512_psrl_d_512:
   1333   case Intrinsic::x86_avx512_psrl_q_512:
   1334   case Intrinsic::x86_avx512_psrl_w_512:
   1335   case Intrinsic::x86_sse2_psll_d:
   1336   case Intrinsic::x86_sse2_psll_q:
   1337   case Intrinsic::x86_sse2_psll_w:
   1338   case Intrinsic::x86_avx2_psll_d:
   1339   case Intrinsic::x86_avx2_psll_q:
   1340   case Intrinsic::x86_avx2_psll_w:
   1341   case Intrinsic::x86_avx512_psll_d_512:
   1342   case Intrinsic::x86_avx512_psll_q_512:
   1343   case Intrinsic::x86_avx512_psll_w_512: {
   1344     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
   1345       return IC.replaceInstUsesWith(II, V);
   1346     }
   1347 
   1348     // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
   1349     // operand to compute the shift amount.
   1350     Value *Arg1 = II.getArgOperand(1);
   1351     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
   1352            "Unexpected packed shift size");
   1353     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();
   1354 
   1355     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
   1356       return IC.replaceOperand(II, 1, V);
   1357     }
   1358     break;
   1359   }
   1360 
   1361   case Intrinsic::x86_avx2_psllv_d:
   1362   case Intrinsic::x86_avx2_psllv_d_256:
   1363   case Intrinsic::x86_avx2_psllv_q:
   1364   case Intrinsic::x86_avx2_psllv_q_256:
   1365   case Intrinsic::x86_avx512_psllv_d_512:
   1366   case Intrinsic::x86_avx512_psllv_q_512:
   1367   case Intrinsic::x86_avx512_psllv_w_128:
   1368   case Intrinsic::x86_avx512_psllv_w_256:
   1369   case Intrinsic::x86_avx512_psllv_w_512:
   1370   case Intrinsic::x86_avx2_psrav_d:
   1371   case Intrinsic::x86_avx2_psrav_d_256:
   1372   case Intrinsic::x86_avx512_psrav_q_128:
   1373   case Intrinsic::x86_avx512_psrav_q_256:
   1374   case Intrinsic::x86_avx512_psrav_d_512:
   1375   case Intrinsic::x86_avx512_psrav_q_512:
   1376   case Intrinsic::x86_avx512_psrav_w_128:
   1377   case Intrinsic::x86_avx512_psrav_w_256:
   1378   case Intrinsic::x86_avx512_psrav_w_512:
   1379   case Intrinsic::x86_avx2_psrlv_d:
   1380   case Intrinsic::x86_avx2_psrlv_d_256:
   1381   case Intrinsic::x86_avx2_psrlv_q:
   1382   case Intrinsic::x86_avx2_psrlv_q_256:
   1383   case Intrinsic::x86_avx512_psrlv_d_512:
   1384   case Intrinsic::x86_avx512_psrlv_q_512:
   1385   case Intrinsic::x86_avx512_psrlv_w_128:
   1386   case Intrinsic::x86_avx512_psrlv_w_256:
   1387   case Intrinsic::x86_avx512_psrlv_w_512:
   1388     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
   1389       return IC.replaceInstUsesWith(II, V);
   1390     }
   1391     break;
   1392 
   1393   case Intrinsic::x86_sse2_packssdw_128:
   1394   case Intrinsic::x86_sse2_packsswb_128:
   1395   case Intrinsic::x86_avx2_packssdw:
   1396   case Intrinsic::x86_avx2_packsswb:
   1397   case Intrinsic::x86_avx512_packssdw_512:
   1398   case Intrinsic::x86_avx512_packsswb_512:
   1399     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
   1400       return IC.replaceInstUsesWith(II, V);
   1401     }
   1402     break;
   1403 
   1404   case Intrinsic::x86_sse2_packuswb_128:
   1405   case Intrinsic::x86_sse41_packusdw:
   1406   case Intrinsic::x86_avx2_packusdw:
   1407   case Intrinsic::x86_avx2_packuswb:
   1408   case Intrinsic::x86_avx512_packusdw_512:
   1409   case Intrinsic::x86_avx512_packuswb_512:
   1410     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
   1411       return IC.replaceInstUsesWith(II, V);
   1412     }
   1413     break;
   1414 
   1415   case Intrinsic::x86_pclmulqdq:
   1416   case Intrinsic::x86_pclmulqdq_256:
   1417   case Intrinsic::x86_pclmulqdq_512: {
   1418     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
   1419       unsigned Imm = C->getZExtValue();
   1420 
   1421       bool MadeChange = false;
   1422       Value *Arg0 = II.getArgOperand(0);
   1423       Value *Arg1 = II.getArgOperand(1);
   1424       unsigned VWidth =
   1425           cast<FixedVectorType>(Arg0->getType())->getNumElements();
   1426 
   1427       APInt UndefElts1(VWidth, 0);
   1428       APInt DemandedElts1 =
   1429           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
   1430       if (Value *V =
   1431               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
   1432         IC.replaceOperand(II, 0, V);
   1433         MadeChange = true;
   1434       }
   1435 
   1436       APInt UndefElts2(VWidth, 0);
   1437       APInt DemandedElts2 =
   1438           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
   1439       if (Value *V =
   1440               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
   1441         IC.replaceOperand(II, 1, V);
   1442         MadeChange = true;
   1443       }
   1444 
   1445       // If either input elements are undef, the result is zero.
   1446       if (DemandedElts1.isSubsetOf(UndefElts1) ||
   1447           DemandedElts2.isSubsetOf(UndefElts2)) {
   1448         return IC.replaceInstUsesWith(II,
   1449                                       ConstantAggregateZero::get(II.getType()));
   1450       }
   1451 
   1452       if (MadeChange) {
   1453         return &II;
   1454       }
   1455     }
   1456     break;
   1457   }
   1458 
   1459   case Intrinsic::x86_sse41_insertps:
   1460     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
   1461       return IC.replaceInstUsesWith(II, V);
   1462     }
   1463     break;
   1464 
   1465   case Intrinsic::x86_sse4a_extrq: {
   1466     Value *Op0 = II.getArgOperand(0);
   1467     Value *Op1 = II.getArgOperand(1);
   1468     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
   1469     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
   1470     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
   1471            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
   1472            VWidth1 == 16 && "Unexpected operand sizes");
   1473 
   1474     // See if we're dealing with constant values.
   1475     Constant *C1 = dyn_cast<Constant>(Op1);
   1476     ConstantInt *CILength =
   1477         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
   1478            : nullptr;
   1479     ConstantInt *CIIndex =
   1480         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
   1481            : nullptr;
   1482 
   1483     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
   1484     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
   1485       return IC.replaceInstUsesWith(II, V);
   1486     }
   1487 
   1488     // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
   1489     // operands and the lowest 16-bits of the second.
   1490     bool MadeChange = false;
   1491     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
   1492       IC.replaceOperand(II, 0, V);
   1493       MadeChange = true;
   1494     }
   1495     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
   1496       IC.replaceOperand(II, 1, V);
   1497       MadeChange = true;
   1498     }
   1499     if (MadeChange) {
   1500       return &II;
   1501     }
   1502     break;
   1503   }
   1504 
   1505   case Intrinsic::x86_sse4a_extrqi: {
   1506     // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
   1507     // bits of the lower 64-bits. The upper 64-bits are undefined.
   1508     Value *Op0 = II.getArgOperand(0);
   1509     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
   1510     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
   1511            "Unexpected operand size");
   1512 
   1513     // See if we're dealing with constant values.
   1514     ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
   1515     ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));
   1516 
   1517     // Attempt to simplify to a constant or shuffle vector.
   1518     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
   1519       return IC.replaceInstUsesWith(II, V);
   1520     }
   1521 
   1522     // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
   1523     // operand.
   1524     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
   1525       return IC.replaceOperand(II, 0, V);
   1526     }
   1527     break;
   1528   }
   1529 
   1530   case Intrinsic::x86_sse4a_insertq: {
   1531     Value *Op0 = II.getArgOperand(0);
   1532     Value *Op1 = II.getArgOperand(1);
   1533     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
   1534     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
   1535            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
   1536            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
   1537            "Unexpected operand size");
   1538 
   1539     // See if we're dealing with constant values.
   1540     Constant *C1 = dyn_cast<Constant>(Op1);
   1541     ConstantInt *CI11 =
   1542         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
   1543            : nullptr;
   1544 
   1545     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
   1546     if (CI11) {
   1547       const APInt &V11 = CI11->getValue();
   1548       APInt Len = V11.zextOrTrunc(6);
   1549       APInt Idx = V11.lshr(8).zextOrTrunc(6);
   1550       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
   1551         return IC.replaceInstUsesWith(II, V);
   1552       }
   1553     }
   1554 
   1555     // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
   1556     // operand.
   1557     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
   1558       return IC.replaceOperand(II, 0, V);
   1559     }
   1560     break;
   1561   }
   1562 
   1563   case Intrinsic::x86_sse4a_insertqi: {
   1564     // INSERTQI: Extract lowest Length bits from lower half of second source and
   1565     // insert over first source starting at Index bit. The upper 64-bits are
   1566     // undefined.
   1567     Value *Op0 = II.getArgOperand(0);
   1568     Value *Op1 = II.getArgOperand(1);
   1569     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
   1570     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
   1571     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
   1572            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
   1573            VWidth1 == 2 && "Unexpected operand sizes");
   1574 
   1575     // See if we're dealing with constant values.
   1576     ConstantInt *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
   1577     ConstantInt *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));
   1578 
   1579     // Attempt to simplify to a constant or shuffle vector.
   1580     if (CILength && CIIndex) {
   1581       APInt Len = CILength->getValue().zextOrTrunc(6);
   1582       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
   1583       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
   1584         return IC.replaceInstUsesWith(II, V);
   1585       }
   1586     }
   1587 
   1588     // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
   1589     // operands.
   1590     bool MadeChange = false;
   1591     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
   1592       IC.replaceOperand(II, 0, V);
   1593       MadeChange = true;
   1594     }
   1595     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
   1596       IC.replaceOperand(II, 1, V);
   1597       MadeChange = true;
   1598     }
   1599     if (MadeChange) {
   1600       return &II;
   1601     }
   1602     break;
   1603   }
   1604 
   1605   case Intrinsic::x86_sse41_pblendvb:
   1606   case Intrinsic::x86_sse41_blendvps:
   1607   case Intrinsic::x86_sse41_blendvpd:
   1608   case Intrinsic::x86_avx_blendv_ps_256:
   1609   case Intrinsic::x86_avx_blendv_pd_256:
   1610   case Intrinsic::x86_avx2_pblendvb: {
   1611     // fold (blend A, A, Mask) -> A
   1612     Value *Op0 = II.getArgOperand(0);
   1613     Value *Op1 = II.getArgOperand(1);
   1614     Value *Mask = II.getArgOperand(2);
   1615     if (Op0 == Op1) {
   1616       return IC.replaceInstUsesWith(II, Op0);
   1617     }
   1618 
   1619     // Zero Mask - select 1st argument.
   1620     if (isa<ConstantAggregateZero>(Mask)) {
   1621       return IC.replaceInstUsesWith(II, Op0);
   1622     }
   1623 
   1624     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
   1625     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
   1626       Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
   1627       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
   1628     }
   1629 
   1630     // Convert to a vector select if we can bypass casts and find a boolean
   1631     // vector condition value.
   1632     Value *BoolVec;
   1633     Mask = InstCombiner::peekThroughBitcast(Mask);
   1634     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
   1635         BoolVec->getType()->isVectorTy() &&
   1636         BoolVec->getType()->getScalarSizeInBits() == 1) {
   1637       assert(Mask->getType()->getPrimitiveSizeInBits() ==
   1638                  II.getType()->getPrimitiveSizeInBits() &&
   1639              "Not expecting mask and operands with different sizes");
   1640 
   1641       unsigned NumMaskElts =
   1642           cast<FixedVectorType>(Mask->getType())->getNumElements();
   1643       unsigned NumOperandElts =
   1644           cast<FixedVectorType>(II.getType())->getNumElements();
   1645       if (NumMaskElts == NumOperandElts) {
   1646         return SelectInst::Create(BoolVec, Op1, Op0);
   1647       }
   1648 
   1649       // If the mask has less elements than the operands, each mask bit maps to
   1650       // multiple elements of the operands. Bitcast back and forth.
   1651       if (NumMaskElts < NumOperandElts) {
   1652         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
   1653         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
   1654         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
   1655         return new BitCastInst(Sel, II.getType());
   1656       }
   1657     }
   1658 
   1659     break;
   1660   }
   1661 
   1662   case Intrinsic::x86_ssse3_pshuf_b_128:
   1663   case Intrinsic::x86_avx2_pshuf_b:
   1664   case Intrinsic::x86_avx512_pshuf_b_512:
   1665     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
   1666       return IC.replaceInstUsesWith(II, V);
   1667     }
   1668     break;
   1669 
   1670   case Intrinsic::x86_avx_vpermilvar_ps:
   1671   case Intrinsic::x86_avx_vpermilvar_ps_256:
   1672   case Intrinsic::x86_avx512_vpermilvar_ps_512:
   1673   case Intrinsic::x86_avx_vpermilvar_pd:
   1674   case Intrinsic::x86_avx_vpermilvar_pd_256:
   1675   case Intrinsic::x86_avx512_vpermilvar_pd_512:
   1676     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
   1677       return IC.replaceInstUsesWith(II, V);
   1678     }
   1679     break;
   1680 
   1681   case Intrinsic::x86_avx2_permd:
   1682   case Intrinsic::x86_avx2_permps:
   1683   case Intrinsic::x86_avx512_permvar_df_256:
   1684   case Intrinsic::x86_avx512_permvar_df_512:
   1685   case Intrinsic::x86_avx512_permvar_di_256:
   1686   case Intrinsic::x86_avx512_permvar_di_512:
   1687   case Intrinsic::x86_avx512_permvar_hi_128:
   1688   case Intrinsic::x86_avx512_permvar_hi_256:
   1689   case Intrinsic::x86_avx512_permvar_hi_512:
   1690   case Intrinsic::x86_avx512_permvar_qi_128:
   1691   case Intrinsic::x86_avx512_permvar_qi_256:
   1692   case Intrinsic::x86_avx512_permvar_qi_512:
   1693   case Intrinsic::x86_avx512_permvar_sf_512:
   1694   case Intrinsic::x86_avx512_permvar_si_512:
   1695     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
   1696       return IC.replaceInstUsesWith(II, V);
   1697     }
   1698     break;
   1699 
   1700   case Intrinsic::x86_avx_maskload_ps:
   1701   case Intrinsic::x86_avx_maskload_pd:
   1702   case Intrinsic::x86_avx_maskload_ps_256:
   1703   case Intrinsic::x86_avx_maskload_pd_256:
   1704   case Intrinsic::x86_avx2_maskload_d:
   1705   case Intrinsic::x86_avx2_maskload_q:
   1706   case Intrinsic::x86_avx2_maskload_d_256:
   1707   case Intrinsic::x86_avx2_maskload_q_256:
   1708     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
   1709       return I;
   1710     }
   1711     break;
   1712 
   1713   case Intrinsic::x86_sse2_maskmov_dqu:
   1714   case Intrinsic::x86_avx_maskstore_ps:
   1715   case Intrinsic::x86_avx_maskstore_pd:
   1716   case Intrinsic::x86_avx_maskstore_ps_256:
   1717   case Intrinsic::x86_avx_maskstore_pd_256:
   1718   case Intrinsic::x86_avx2_maskstore_d:
   1719   case Intrinsic::x86_avx2_maskstore_q:
   1720   case Intrinsic::x86_avx2_maskstore_d_256:
   1721   case Intrinsic::x86_avx2_maskstore_q_256:
   1722     if (simplifyX86MaskedStore(II, IC)) {
   1723       return nullptr;
   1724     }
   1725     break;
   1726 
   1727   case Intrinsic::x86_addcarry_32:
   1728   case Intrinsic::x86_addcarry_64:
   1729     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
   1730       return IC.replaceInstUsesWith(II, V);
   1731     }
   1732     break;
   1733 
   1734   default:
   1735     break;
   1736   }
   1737   return None;
   1738 }
   1739 
   1740 Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
   1741     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
   1742     bool &KnownBitsComputed) const {
   1743   switch (II.getIntrinsicID()) {
   1744   default:
   1745     break;
   1746   case Intrinsic::x86_mmx_pmovmskb:
   1747   case Intrinsic::x86_sse_movmsk_ps:
   1748   case Intrinsic::x86_sse2_movmsk_pd:
   1749   case Intrinsic::x86_sse2_pmovmskb_128:
   1750   case Intrinsic::x86_avx_movmsk_ps_256:
   1751   case Intrinsic::x86_avx_movmsk_pd_256:
   1752   case Intrinsic::x86_avx2_pmovmskb: {
   1753     // MOVMSK copies the vector elements' sign bits to the low bits
   1754     // and zeros the high bits.
   1755     unsigned ArgWidth;
   1756     if (II.getIntrinsicID() == Intrinsic::x86_mmx_pmovmskb) {
   1757       ArgWidth = 8; // Arg is x86_mmx, but treated as <8 x i8>.
   1758     } else {
   1759       auto Arg = II.getArgOperand(0);
   1760       auto ArgType = cast<FixedVectorType>(Arg->getType());
   1761       ArgWidth = ArgType->getNumElements();
   1762     }
   1763 
   1764     // If we don't need any of low bits then return zero,
   1765     // we know that DemandedMask is non-zero already.
   1766     APInt DemandedElts = DemandedMask.zextOrTrunc(ArgWidth);
   1767     Type *VTy = II.getType();
   1768     if (DemandedElts.isNullValue()) {
   1769       return ConstantInt::getNullValue(VTy);
   1770     }
   1771 
   1772     // We know that the upper bits are set to zero.
   1773     Known.Zero.setBitsFrom(ArgWidth);
   1774     KnownBitsComputed = true;
   1775     break;
   1776   }
   1777   }
   1778   return None;
   1779 }
   1780 
   1781 Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
   1782     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
   1783     APInt &UndefElts2, APInt &UndefElts3,
   1784     std::function<void(Instruction *, unsigned, APInt, APInt &)>
   1785         simplifyAndSetOp) const {
   1786   unsigned VWidth = cast<FixedVectorType>(II.getType())->getNumElements();
   1787   switch (II.getIntrinsicID()) {
   1788   default:
   1789     break;
   1790   case Intrinsic::x86_xop_vfrcz_ss:
   1791   case Intrinsic::x86_xop_vfrcz_sd:
   1792     // The instructions for these intrinsics are speced to zero upper bits not
   1793     // pass them through like other scalar intrinsics. So we shouldn't just
   1794     // use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
   1795     // Instead we should return a zero vector.
   1796     if (!DemandedElts[0]) {
   1797       IC.addToWorklist(&II);
   1798       return ConstantAggregateZero::get(II.getType());
   1799     }
   1800 
   1801     // Only the lower element is used.
   1802     DemandedElts = 1;
   1803     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
   1804 
   1805     // Only the lower element is undefined. The high elements are zero.
   1806     UndefElts = UndefElts[0];
   1807     break;
   1808 
   1809   // Unary scalar-as-vector operations that work column-wise.
   1810   case Intrinsic::x86_sse_rcp_ss:
   1811   case Intrinsic::x86_sse_rsqrt_ss:
   1812     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
   1813 
   1814     // If lowest element of a scalar op isn't used then use Arg0.
   1815     if (!DemandedElts[0]) {
   1816       IC.addToWorklist(&II);
   1817       return II.getArgOperand(0);
   1818     }
   1819     // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
   1820     // checks).
   1821     break;
   1822 
   1823   // Binary scalar-as-vector operations that work column-wise. The high
   1824   // elements come from operand 0. The low element is a function of both
   1825   // operands.
   1826   case Intrinsic::x86_sse_min_ss:
   1827   case Intrinsic::x86_sse_max_ss:
   1828   case Intrinsic::x86_sse_cmp_ss:
   1829   case Intrinsic::x86_sse2_min_sd:
   1830   case Intrinsic::x86_sse2_max_sd:
   1831   case Intrinsic::x86_sse2_cmp_sd: {
   1832     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
   1833 
   1834     // If lowest element of a scalar op isn't used then use Arg0.
   1835     if (!DemandedElts[0]) {
   1836       IC.addToWorklist(&II);
   1837       return II.getArgOperand(0);
   1838     }
   1839 
   1840     // Only lower element is used for operand 1.
   1841     DemandedElts = 1;
   1842     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
   1843 
   1844     // Lower element is undefined if both lower elements are undefined.
   1845     // Consider things like undef&0.  The result is known zero, not undef.
   1846     if (!UndefElts2[0])
   1847       UndefElts.clearBit(0);
   1848 
   1849     break;
   1850   }
   1851 
   1852   // Binary scalar-as-vector operations that work column-wise. The high
   1853   // elements come from operand 0 and the low element comes from operand 1.
   1854   case Intrinsic::x86_sse41_round_ss:
   1855   case Intrinsic::x86_sse41_round_sd: {
   1856     // Don't use the low element of operand 0.
   1857     APInt DemandedElts2 = DemandedElts;
   1858     DemandedElts2.clearBit(0);
   1859     simplifyAndSetOp(&II, 0, DemandedElts2, UndefElts);
   1860 
   1861     // If lowest element of a scalar op isn't used then use Arg0.
   1862     if (!DemandedElts[0]) {
   1863       IC.addToWorklist(&II);
   1864       return II.getArgOperand(0);
   1865     }
   1866 
   1867     // Only lower element is used for operand 1.
   1868     DemandedElts = 1;
   1869     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
   1870 
   1871     // Take the high undef elements from operand 0 and take the lower element
   1872     // from operand 1.
   1873     UndefElts.clearBit(0);
   1874     UndefElts |= UndefElts2[0];
   1875     break;
   1876   }
   1877 
   1878   // Three input scalar-as-vector operations that work column-wise. The high
   1879   // elements come from operand 0 and the low element is a function of all
   1880   // three inputs.
   1881   case Intrinsic::x86_avx512_mask_add_ss_round:
   1882   case Intrinsic::x86_avx512_mask_div_ss_round:
   1883   case Intrinsic::x86_avx512_mask_mul_ss_round:
   1884   case Intrinsic::x86_avx512_mask_sub_ss_round:
   1885   case Intrinsic::x86_avx512_mask_max_ss_round:
   1886   case Intrinsic::x86_avx512_mask_min_ss_round:
   1887   case Intrinsic::x86_avx512_mask_add_sd_round:
   1888   case Intrinsic::x86_avx512_mask_div_sd_round:
   1889   case Intrinsic::x86_avx512_mask_mul_sd_round:
   1890   case Intrinsic::x86_avx512_mask_sub_sd_round:
   1891   case Intrinsic::x86_avx512_mask_max_sd_round:
   1892   case Intrinsic::x86_avx512_mask_min_sd_round:
   1893     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
   1894 
   1895     // If lowest element of a scalar op isn't used then use Arg0.
   1896     if (!DemandedElts[0]) {
   1897       IC.addToWorklist(&II);
   1898       return II.getArgOperand(0);
   1899     }
   1900 
   1901     // Only lower element is used for operand 1 and 2.
   1902     DemandedElts = 1;
   1903     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
   1904     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);
   1905 
   1906     // Lower element is undefined if all three lower elements are undefined.
   1907     // Consider things like undef&0.  The result is known zero, not undef.
   1908     if (!UndefElts2[0] || !UndefElts3[0])
   1909       UndefElts.clearBit(0);
   1910     break;
   1911 
   1912   // TODO: Add fmaddsub support?
   1913   case Intrinsic::x86_sse3_addsub_pd:
   1914   case Intrinsic::x86_sse3_addsub_ps:
   1915   case Intrinsic::x86_avx_addsub_pd_256:
   1916   case Intrinsic::x86_avx_addsub_ps_256: {
   1917     // If none of the even or none of the odd lanes are required, turn this
   1918     // into a generic FP math instruction.
   1919     APInt SubMask = APInt::getSplat(VWidth, APInt(2, 0x1));
   1920     APInt AddMask = APInt::getSplat(VWidth, APInt(2, 0x2));
   1921     bool IsSubOnly = DemandedElts.isSubsetOf(SubMask);
   1922     bool IsAddOnly = DemandedElts.isSubsetOf(AddMask);
   1923     if (IsSubOnly || IsAddOnly) {
   1924       assert((IsSubOnly ^ IsAddOnly) && "Can't be both add-only and sub-only");
   1925       IRBuilderBase::InsertPointGuard Guard(IC.Builder);
   1926       IC.Builder.SetInsertPoint(&II);
   1927       Value *Arg0 = II.getArgOperand(0), *Arg1 = II.getArgOperand(1);
   1928       return IC.Builder.CreateBinOp(
   1929           IsSubOnly ? Instruction::FSub : Instruction::FAdd, Arg0, Arg1);
   1930     }
   1931 
   1932     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
   1933     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
   1934     UndefElts &= UndefElts2;
   1935     break;
   1936   }
   1937 
   1938   case Intrinsic::x86_sse2_packssdw_128:
   1939   case Intrinsic::x86_sse2_packsswb_128:
   1940   case Intrinsic::x86_sse2_packuswb_128:
   1941   case Intrinsic::x86_sse41_packusdw:
   1942   case Intrinsic::x86_avx2_packssdw:
   1943   case Intrinsic::x86_avx2_packsswb:
   1944   case Intrinsic::x86_avx2_packusdw:
   1945   case Intrinsic::x86_avx2_packuswb:
   1946   case Intrinsic::x86_avx512_packssdw_512:
   1947   case Intrinsic::x86_avx512_packsswb_512:
   1948   case Intrinsic::x86_avx512_packusdw_512:
   1949   case Intrinsic::x86_avx512_packuswb_512: {
   1950     auto *Ty0 = II.getArgOperand(0)->getType();
   1951     unsigned InnerVWidth = cast<FixedVectorType>(Ty0)->getNumElements();
   1952     assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
   1953 
   1954     unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
   1955     unsigned VWidthPerLane = VWidth / NumLanes;
   1956     unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
   1957 
   1958     // Per lane, pack the elements of the first input and then the second.
   1959     // e.g.
   1960     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
   1961     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
   1962     for (int OpNum = 0; OpNum != 2; ++OpNum) {
   1963       APInt OpDemandedElts(InnerVWidth, 0);
   1964       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
   1965         unsigned LaneIdx = Lane * VWidthPerLane;
   1966         for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
   1967           unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
   1968           if (DemandedElts[Idx])
   1969             OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
   1970         }
   1971       }
   1972 
   1973       // Demand elements from the operand.
   1974       APInt OpUndefElts(InnerVWidth, 0);
   1975       simplifyAndSetOp(&II, OpNum, OpDemandedElts, OpUndefElts);
   1976 
   1977       // Pack the operand's UNDEF elements, one lane at a time.
   1978       OpUndefElts = OpUndefElts.zext(VWidth);
   1979       for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
   1980         APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
   1981         LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
   1982         LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
   1983         UndefElts |= LaneElts;
   1984       }
   1985     }
   1986     break;
   1987   }
   1988 
   1989   // PSHUFB
   1990   case Intrinsic::x86_ssse3_pshuf_b_128:
   1991   case Intrinsic::x86_avx2_pshuf_b:
   1992   case Intrinsic::x86_avx512_pshuf_b_512:
   1993   // PERMILVAR
   1994   case Intrinsic::x86_avx_vpermilvar_ps:
   1995   case Intrinsic::x86_avx_vpermilvar_ps_256:
   1996   case Intrinsic::x86_avx512_vpermilvar_ps_512:
   1997   case Intrinsic::x86_avx_vpermilvar_pd:
   1998   case Intrinsic::x86_avx_vpermilvar_pd_256:
   1999   case Intrinsic::x86_avx512_vpermilvar_pd_512:
   2000   // PERMV
   2001   case Intrinsic::x86_avx2_permd:
   2002   case Intrinsic::x86_avx2_permps: {
   2003     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
   2004     break;
   2005   }
   2006 
   2007   // SSE4A instructions leave the upper 64-bits of the 128-bit result
   2008   // in an undefined state.
   2009   case Intrinsic::x86_sse4a_extrq:
   2010   case Intrinsic::x86_sse4a_extrqi:
   2011   case Intrinsic::x86_sse4a_insertq:
   2012   case Intrinsic::x86_sse4a_insertqi:
   2013     UndefElts.setHighBits(VWidth / 2);
   2014     break;
   2015   }
   2016   return None;
   2017 }
   2018