// llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp - llvm-project - Git at Google

 //===-- X86InstCombineIntrinsic.cpp - X86 specific InstCombine pass -------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 /// \file
 /// This file implements a TargetTransformInfo analysis pass specific to the
 /// X86 target machine. It uses the target's detailed information to provide
 /// more precise answers to certain TTI queries, while letting the target
 /// independent and default TTI implementations handle the rest.
 ///
 //===----------------------------------------------------------------------===//

 #include "X86TargetTransformInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"

 using namespace llvm;

 #define DEBUG_TYPE "x86tti"

 /// Build a constant bool vector from the constant vector \p V: lane I of the
 /// result is true exactly when lane I of \p V, reinterpreted as an integer of
 /// the same width, has its sign bit set (i.e. is negative).
 static Constant *getNegativeIsTrueBoolVec(Constant *V) {
   auto *SrcVecTy = cast<VectorType>(V->getType());
   VectorType *IntVecTy = VectorType::getInteger(SrcVecTy);
   Constant *AsInts = ConstantExpr::getBitCast(V, IntVecTy);

   // The signed comparison "0 > x" holds precisely for lanes with the sign bit
   // set.
   Constant *Zero = Constant::getNullValue(IntVecTy);
   return ConstantExpr::getICmp(CmpInst::ICMP_SGT, Zero, AsInts);
 }

 /// Try to express the x86 integer vector mask \p Mask, of which only each
 /// element's most significant (sign) bit matters, as a vector of bools.
 /// Returns nullptr when the mask is neither a constant data vector nor a
 /// sign-extension of an i1 (vector) value.
 static Value *getBoolVecFromMask(Value *Mask) {
   // A constant mask can be folded straight to a constant bool vector.
   if (auto *CDV = dyn_cast<ConstantDataVector>(Mask))
     return getNegativeIsTrueBoolVec(CDV);

   // Otherwise the mask has to be a sign-extended value ...
   Value *PreExt = nullptr;
   bool IsSExt = PatternMatch::match(
       Mask, PatternMatch::m_SExt(PatternMatch::m_Value(PreExt)));
   if (!IsSExt)
     return nullptr;

   // ... whose source is a bool (vector); that source already is the bool
   // mask.
   return PreExt->getType()->isIntOrIntVectorTy(1) ? PreExt : nullptr;
 }

 /// Simplify an x86 masked load intrinsic call \p II (operand 0 is the
 /// pointer, operand 1 the mask). An all-zero mask folds to a zero vector; a
 /// mask that getBoolVecFromMask() can convert turns the call into the generic
 /// LLVM masked load. Returns the result of IC.replaceInstUsesWith() on
 /// success and nullptr if nothing was changed.
 //
 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
 // XMM register mask efficiently, we could transform all x86 masked intrinsics
 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
   Type *LoadTy = II.getType();
   Value *Mask = II.getOperand(1);
   Constant *ZeroVec = Constant::getNullValue(LoadTy);

   // With a zero mask no lane is loaded, so the result is the zero vector.
   if (isa<ConstantAggregateZero>(Mask))
     return IC.replaceInstUsesWith(II, ZeroVec);

   // Only masks that are constant or extended from a bool vector can be
   // rewritten to the target-independent intrinsic.
   Value *BoolMask = getBoolVecFromMask(Mask);
   if (!BoolMask)
     return nullptr;

   // The LLVM intrinsic takes a pointer to the vector type, so cast the
   // pointer operand accordingly (keeping its address space).
   Value *Ptr = II.getOperand(0);
   unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
   PointerType *VecPtrTy = PointerType::get(LoadTy, AddrSpace);
   Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

   // x86 masked loads produce zero in the masked-off lanes, hence the zero
   // pass-through operand.
   CallInst *NewMaskedLoad = IC.Builder.CreateMaskedLoad(
       LoadTy, PtrCast, Align(1), BoolMask, ZeroVec);
   return IC.replaceInstUsesWith(II, NewMaskedLoad);
 }

 /// Simplify an x86 masked store intrinsic call \p II (operand 0 is the
 /// pointer, operand 1 the mask, operand 2 the stored vector). Returns true if
 /// \p II was erased (either because it stores nothing or because it was
 /// replaced by the generic LLVM masked store), false if it was left alone.
 //
 // TODO: If the x86 backend knew how to convert a bool vector mask back to an
 // XMM register mask efficiently, we could transform all x86 masked intrinsics
 // to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
 static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
   Value *Mask = II.getOperand(1);

   // With a zero mask nothing is written, so the call can simply go away.
   if (isa<ConstantAggregateZero>(Mask)) {
     IC.eraseInstFromFunction(II);
     return true;
   }

   // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
   // anything else at this level.
   if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
     return false;

   // Only masks that are constant or extended from a bool vector can be
   // rewritten to the target-independent intrinsic.
   Value *BoolMask = getBoolVecFromMask(Mask);
   if (!BoolMask)
     return false;

   // Cast the pointer operand to a pointer to the stored vector type, keeping
   // its address space.
   Value *Ptr = II.getOperand(0);
   Value *Vec = II.getOperand(2);
   unsigned AddrSpace = cast<PointerType>(Ptr->getType())->getAddressSpace();
   PointerType *VecPtrTy = PointerType::get(Vec->getType(), AddrSpace);
   Value *PtrCast = IC.Builder.CreateBitCast(Ptr, VecPtrTy, "castvec");

   IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);

   // A store has no uses to replace, so the original call is erased instead.
   IC.eraseInstFromFunction(II);
   return true;
 }

 /// Attempt to simplify an x86 vector shift intrinsic that shifts every
 /// element by one common amount. The amount is either an i32 operand (the
 /// "shift by immediate" variants) or taken from the low 64 bits of a 128-bit
 /// vector operand. Out-of-range amounts are well defined for the intrinsics:
 /// logical shifts produce zero and arithmetic shifts behave like a shift by
 /// (BitWidth - 1). Returns the replacement value, or nullptr if no
 /// simplification applies.
 static Value *simplifyX86immShift(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
   bool LogicalShift = false;
   bool ShiftLeft = false;
   bool IsImm = false;

   switch (II.getIntrinsicID()) {
   default:
     llvm_unreachable("Unexpected intrinsic!");
   // Arithmetic shift right, amount given as i32.
   case Intrinsic::x86_sse2_psrai_d:
   case Intrinsic::x86_sse2_psrai_w:
   case Intrinsic::x86_avx2_psrai_d:
   case Intrinsic::x86_avx2_psrai_w:
   case Intrinsic::x86_avx512_psrai_q_128:
   case Intrinsic::x86_avx512_psrai_q_256:
   case Intrinsic::x86_avx512_psrai_d_512:
   case Intrinsic::x86_avx512_psrai_q_512:
   case Intrinsic::x86_avx512_psrai_w_512:
     IsImm = true;
     LLVM_FALLTHROUGH;
   // Arithmetic shift right, amount given as vector.
   case Intrinsic::x86_sse2_psra_d:
   case Intrinsic::x86_sse2_psra_w:
   case Intrinsic::x86_avx2_psra_d:
   case Intrinsic::x86_avx2_psra_w:
   case Intrinsic::x86_avx512_psra_q_128:
   case Intrinsic::x86_avx512_psra_q_256:
   case Intrinsic::x86_avx512_psra_d_512:
   case Intrinsic::x86_avx512_psra_q_512:
   case Intrinsic::x86_avx512_psra_w_512:
     LogicalShift = false;
     ShiftLeft = false;
     break;
   // Logical shift right, amount given as i32.
   case Intrinsic::x86_sse2_psrli_d:
   case Intrinsic::x86_sse2_psrli_q:
   case Intrinsic::x86_sse2_psrli_w:
   case Intrinsic::x86_avx2_psrli_d:
   case Intrinsic::x86_avx2_psrli_q:
   case Intrinsic::x86_avx2_psrli_w:
   case Intrinsic::x86_avx512_psrli_d_512:
   case Intrinsic::x86_avx512_psrli_q_512:
   case Intrinsic::x86_avx512_psrli_w_512:
     IsImm = true;
     LLVM_FALLTHROUGH;
   // Logical shift right, amount given as vector.
   case Intrinsic::x86_sse2_psrl_d:
   case Intrinsic::x86_sse2_psrl_q:
   case Intrinsic::x86_sse2_psrl_w:
   case Intrinsic::x86_avx2_psrl_d:
   case Intrinsic::x86_avx2_psrl_q:
   case Intrinsic::x86_avx2_psrl_w:
   case Intrinsic::x86_avx512_psrl_d_512:
   case Intrinsic::x86_avx512_psrl_q_512:
   case Intrinsic::x86_avx512_psrl_w_512:
     LogicalShift = true;
     ShiftLeft = false;
     break;
   // Shift left, amount given as i32.
   case Intrinsic::x86_sse2_pslli_d:
   case Intrinsic::x86_sse2_pslli_q:
   case Intrinsic::x86_sse2_pslli_w:
   case Intrinsic::x86_avx2_pslli_d:
   case Intrinsic::x86_avx2_pslli_q:
   case Intrinsic::x86_avx2_pslli_w:
   case Intrinsic::x86_avx512_pslli_d_512:
   case Intrinsic::x86_avx512_pslli_q_512:
   case Intrinsic::x86_avx512_pslli_w_512:
     IsImm = true;
     LLVM_FALLTHROUGH;
   // Shift left, amount given as vector.
   case Intrinsic::x86_sse2_psll_d:
   case Intrinsic::x86_sse2_psll_q:
   case Intrinsic::x86_sse2_psll_w:
   case Intrinsic::x86_avx2_psll_d:
   case Intrinsic::x86_avx2_psll_q:
   case Intrinsic::x86_avx2_psll_w:
   case Intrinsic::x86_avx512_psll_d_512:
   case Intrinsic::x86_avx512_psll_q_512:
   case Intrinsic::x86_avx512_psll_w_512:
     LogicalShift = true;
     ShiftLeft = true;
     break;
   }
   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

   Value *Vec = II.getArgOperand(0);
   Value *Amt = II.getArgOperand(1);
   auto *VT = cast<FixedVectorType>(Vec->getType());
   Type *SVT = VT->getElementType();
   Type *AmtVT = Amt->getType();
   unsigned VWidth = VT->getNumElements();
   unsigned BitWidth = SVT->getPrimitiveSizeInBits();
   const auto &DL = II.getModule()->getDataLayout();

   // Emit the generic IR shift of Vec that corresponds to this intrinsic.
   // ShiftLeft implies LogicalShift (asserted above), so checking the
   // arithmetic case first cannot misclassify a left shift.
   auto EmitShift = [&](Value *ShiftAmt) -> Value * {
     if (!LogicalShift)
       return Builder.CreateAShr(Vec, ShiftAmt);
     if (ShiftLeft)
       return Builder.CreateShl(Vec, ShiftAmt);
     return Builder.CreateLShr(Vec, ShiftAmt);
   };

   // If the shift amount is guaranteed to be in-range we can replace it with a
   // generic shift. If it's guaranteed to be out of range, logical shifts
   // combine to zero and arithmetic shifts are clamped to (BitWidth - 1).
   if (IsImm) {
     assert(AmtVT->isIntegerTy(32) && "Unexpected shift-by-immediate type");
     KnownBits KnownAmtBits = llvm::computeKnownBits(Amt, DL);

     if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
       Value *EltAmt = Builder.CreateZExtOrTrunc(Amt, SVT);
       return EmitShift(Builder.CreateVectorSplat(VWidth, EltAmt));
     }

     if (KnownAmtBits.getMinValue().uge(BitWidth)) {
       if (LogicalShift)
         return ConstantAggregateZero::get(VT);
       Value *MaxAmt = ConstantInt::get(SVT, BitWidth - 1);
       return Builder.CreateAShr(Vec,
                                 Builder.CreateVectorSplat(VWidth, MaxAmt));
     }
   } else {
     // The amount is the low 64 bits of the vector: element 0 must be known
     // in-range and any further elements inside those 64 bits must be known
     // zero.
     assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
            cast<VectorType>(AmtVT)->getElementType() == SVT &&
            "Unexpected shift-by-scalar type");
     unsigned NumAmtElts = cast<FixedVectorType>(AmtVT)->getNumElements();
     APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
     APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
     KnownBits KnownLowerBits = llvm::computeKnownBits(Amt, DemandedLower, DL);
     KnownBits KnownUpperBits = llvm::computeKnownBits(Amt, DemandedUpper, DL);

     bool LowerInRange = KnownLowerBits.getMaxValue().ult(BitWidth);
     if (LowerInRange &&
         (DemandedUpper.isZero() || KnownUpperBits.isZero())) {
       // Broadcast element 0 of the amount vector to every lane of the shift.
       SmallVector<int, 16> ZeroSplat(VWidth, 0);
       return EmitShift(Builder.CreateShuffleVector(Amt, ZeroSplat));
     }
   }

   // What remains is folding a constant vector shift amount.
   auto *CDV = dyn_cast<ConstantDataVector>(Amt);
   if (!CDV)
     return nullptr;

   // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
   // operand to compute the shift amount.
   assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
          cast<VectorType>(AmtVT)->getElementType() == SVT &&
          "Unexpected shift-by-scalar type");

   // Assemble the 64-bit amount from the sub-elements, most significant
   // sub-element first so that element 0 ends up in the low bits.
   APInt Count(64, 0);
   const unsigned NumSubElts = 64 / BitWidth;
   for (unsigned SubEltIdx = NumSubElts; SubEltIdx-- != 0;) {
     auto *SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
     Count <<= BitWidth;
     Count |= SubElt->getValue().zextOrTrunc(64);
   }

   // Shifting by zero leaves the input untouched.
   if (Count.isZero())
     return Vec;

   // Out-of-range amount: logical shifts give zero, arithmetic shifts are
   // clamped to (BitWidth - 1).
   if (Count.uge(BitWidth)) {
     if (LogicalShift)
       return ConstantAggregateZero::get(VT);
     Count = APInt(64, BitWidth - 1);
   }

   // Splat the (now in-range) amount across a vector matching the first
   // operand and emit the generic shift.
   auto *ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
   return EmitShift(Builder.CreateVectorSplat(VWidth, ShiftAmt));
 }

 // Attempt to simplify the per-element (variable) shift intrinsics to a
 // generic IR shift. Unlike the generic IR shifts, the intrinsics have defined
 // behaviour for out of range shift amounts (logical - set to zero,
 // arithmetic - splat sign bit). Returns the replacement value, or nullptr if
 // no simplification applies.
 static Value *simplifyX86varShift(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
   bool LogicalShift = false;
   bool ShiftLeft = false;

   switch (II.getIntrinsicID()) {
   default:
     llvm_unreachable("Unexpected intrinsic!");
   // Arithmetic shift right.
   case Intrinsic::x86_avx2_psrav_d:
   case Intrinsic::x86_avx2_psrav_d_256:
   case Intrinsic::x86_avx512_psrav_q_128:
   case Intrinsic::x86_avx512_psrav_q_256:
   case Intrinsic::x86_avx512_psrav_d_512:
   case Intrinsic::x86_avx512_psrav_q_512:
   case Intrinsic::x86_avx512_psrav_w_128:
   case Intrinsic::x86_avx512_psrav_w_256:
   case Intrinsic::x86_avx512_psrav_w_512:
     LogicalShift = false;
     ShiftLeft = false;
     break;
   // Logical shift right.
   case Intrinsic::x86_avx2_psrlv_d:
   case Intrinsic::x86_avx2_psrlv_d_256:
   case Intrinsic::x86_avx2_psrlv_q:
   case Intrinsic::x86_avx2_psrlv_q_256:
   case Intrinsic::x86_avx512_psrlv_d_512:
   case Intrinsic::x86_avx512_psrlv_q_512:
   case Intrinsic::x86_avx512_psrlv_w_128:
   case Intrinsic::x86_avx512_psrlv_w_256:
   case Intrinsic::x86_avx512_psrlv_w_512:
     LogicalShift = true;
     ShiftLeft = false;
     break;
   // Shift left.
   case Intrinsic::x86_avx2_psllv_d:
   case Intrinsic::x86_avx2_psllv_d_256:
   case Intrinsic::x86_avx2_psllv_q:
   case Intrinsic::x86_avx2_psllv_q_256:
   case Intrinsic::x86_avx512_psllv_d_512:
   case Intrinsic::x86_avx512_psllv_q_512:
   case Intrinsic::x86_avx512_psllv_w_128:
   case Intrinsic::x86_avx512_psllv_w_256:
   case Intrinsic::x86_avx512_psllv_w_512:
     LogicalShift = true;
     ShiftLeft = true;
     break;
   }
   assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

   Value *Vec = II.getArgOperand(0);
   Value *Amt = II.getArgOperand(1);
   auto *VT = cast<FixedVectorType>(II.getType());
   Type *SVT = VT->getElementType();
   int NumElts = VT->getNumElements();
   int BitWidth = SVT->getIntegerBitWidth();

   // Emit the generic IR shift of Vec that corresponds to this intrinsic.
   // ShiftLeft implies LogicalShift (asserted above), so checking the
   // arithmetic case first cannot misclassify a left shift.
   auto EmitShift = [&](Value *ShiftAmt) -> Value * {
     if (!LogicalShift)
       return Builder.CreateAShr(Vec, ShiftAmt);
     if (ShiftLeft)
       return Builder.CreateShl(Vec, ShiftAmt);
     return Builder.CreateLShr(Vec, ShiftAmt);
   };

   // If every bit above the low log2(BitWidth) bits of the amount is known to
   // be zero, the amount is in range and a generic shift is equivalent.
   APInt UpperBits =
       APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
   if (llvm::MaskedValueIsZero(Amt, UpperBits,
                               II.getModule()->getDataLayout()))
     return EmitShift(Amt);

   // Beyond this point only constant (or undef) shift amounts are handled.
   auto *CShift = dyn_cast<Constant>(Amt);
   if (!CShift)
     return nullptr;

   // Record one shift amount per element, using two sentinels:
   //   -1       - the element is UNDEF
   //   BitWidth - out-of-range amount of a logical shift
   // Out-of-range arithmetic shifts are recorded as (BitWidth - 1), which
   // splats the sign bit.
   bool AnyOutOfRange = false;
   SmallVector<int, 8> ShiftAmts;
   for (int I = 0; I < NumElts; ++I) {
     Constant *CElt = CShift->getAggregateElement(I);
     if (isa_and_nonnull<UndefValue>(CElt)) {
       ShiftAmts.push_back(-1);
       continue;
     }

     auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
     if (!COp)
       return nullptr;

     APInt ShiftVal = COp->getValue();
     if (ShiftVal.ult(BitWidth)) {
       ShiftAmts.push_back((int)ShiftVal.getZExtValue());
       continue;
     }

     // Out of range amount.
     AnyOutOfRange = LogicalShift;
     ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
   }

   // If every element is UNDEF or out of range, the result is a constant
   // vector of undefs/zeros. An arithmetic shift can only get here when all
   // elements are UNDEF.
   auto IsUndefOrOutOfRange = [&](int Idx) {
     return (Idx < 0) || (BitWidth <= Idx);
   };
   if (llvm::all_of(ShiftAmts, IsUndefOrOutOfRange)) {
     SmallVector<Constant *, 8> ConstantVec;
     for (int Idx : ShiftAmts) {
       if (Idx < 0) {
         ConstantVec.push_back(UndefValue::get(SVT));
       } else {
         assert(LogicalShift && "Logical shift expected");
         ConstantVec.push_back(ConstantInt::getNullValue(SVT));
       }
     }
     return ConstantVector::get(ConstantVec);
   }

   // A mix of in-range and out-of-range amounts cannot be expressed with a
   // generic logical shift.
   if (AnyOutOfRange)
     return nullptr;

   // Materialise the per-element amounts as a constant vector, keeping UNDEF
   // elements undefined.
   SmallVector<Constant *, 8> ShiftVecAmts;
   for (int Idx : ShiftAmts) {
     Constant *EltAmt = Idx < 0
                            ? static_cast<Constant *>(UndefValue::get(SVT))
                            : ConstantInt::get(SVT, Idx);
     ShiftVecAmts.push_back(EltAmt);
   }

   return EmitShift(ConstantVector::get(ShiftVecAmts));
 }

 /// Attempt to fold an x86 pack intrinsic call \p II whose two operands are
 /// both constants. The operands are clamped to the destination range (signed
 /// saturation if \p IsSigned, unsigned saturation otherwise), interleaved per
 /// 128-bit lane and truncated to the result type. Returns the replacement
 /// value, or nullptr if either operand is not a constant.
 static Value *simplifyX86pack(IntrinsicInst &II,
                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
   Value *Arg0 = II.getArgOperand(0);
   Value *Arg1 = II.getArgOperand(1);
   Type *ResTy = II.getType();

   // Packing two undefs trivially yields undef.
   if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
     return UndefValue::get(ResTy);

   auto *ArgTy = cast<FixedVectorType>(Arg0->getType());
   const unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
   const unsigned NumSrcElts = ArgTy->getNumElements();
   assert(cast<FixedVectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
          "Unexpected packing types");

   const unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
   const unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
   const unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
          "Unexpected packing types");

   // Only constant folding is performed here.
   if (!(isa<Constant>(Arg0) && isa<Constant>(Arg1)))
     return nullptr;

   // Both flavours compare the source as a signed value; they differ only in
   // the bounds it is saturated to:
   //   PACKSS: [dst minint, dst maxint], sign-extended to the source width.
   //   PACKUS: [0, dst maxuint].
   const APInt MinValue =
       IsSigned
           ? APInt::getSignedMinValue(DstScalarSizeInBits)
                 .sext(SrcScalarSizeInBits)
           : APInt::getZero(SrcScalarSizeInBits);
   const APInt MaxValue =
       IsSigned
           ? APInt::getSignedMaxValue(DstScalarSizeInBits)
                 .sext(SrcScalarSizeInBits)
           : APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);

   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);

   auto ClampBelow = [&](Value *V) {
     return Builder.CreateSelect(Builder.CreateICmpSLT(V, MinC), MinC, V);
   };
   auto ClampAbove = [&](Value *V) {
     return Builder.CreateSelect(Builder.CreateICmpSGT(V, MaxC), MaxC, V);
   };
   Arg0 = ClampBelow(Arg0);
   Arg1 = ClampBelow(Arg1);
   Arg0 = ClampAbove(Arg0);
   Arg1 = ClampAbove(Arg1);

   // Within each 128-bit lane the result holds that lane's elements of the
   // first operand followed by that lane's elements of the second operand.
   SmallVector<int, 32> PackMask;
   for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
     const unsigned LaneBase = Lane * NumSrcEltsPerLane;
     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
       PackMask.push_back(Elt + LaneBase);
     for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
       PackMask.push_back(Elt + LaneBase + NumSrcElts);
   }
   auto *Shuffle = Builder.CreateShuffleVector(Arg0, Arg1, PackMask);

   // The values are already saturated, so a plain truncate finishes the pack.
   return Builder.CreateTrunc(Shuffle, ResTy);
 }

 /// Expand an x86 MOVMSK intrinsic call \p II into generic IR that gathers the
 /// sign bit of every vector element into the low bits of the integer result.
 /// Returns the replacement value, or nullptr if the operand is not a fixed
 /// vector.
 static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
   Value *Arg = II.getArgOperand(0);
   Type *ResTy = II.getType();

   // movmsk(undef) -> zero as we must ensure the upper bits are zero.
   if (isa<UndefValue>(Arg))
     return Constant::getNullValue(ResTy);

   // We can't easily peek through x86_mmx types.
   auto *ArgTy = dyn_cast<FixedVectorType>(Arg->getType());
   if (!ArgTy)
     return nullptr;

   // Expand MOVMSK to compare/bitcast/zext:
   // e.g. PMOVMSKB(v16i8 x):
   // %cmp = icmp slt <16 x i8> %x, zeroinitializer
   // %int = bitcast <16 x i1> %cmp to i16
   // %res = zext i16 %int to i32
   const unsigned NumElts = ArgTy->getNumElements();
   Type *IntegerVecTy = VectorType::getInteger(ArgTy);
   Type *IntegerTy = Builder.getIntNTy(NumElts);

   Value *AsInts = Builder.CreateBitCast(Arg, IntegerVecTy);
   Value *SignBits =
       Builder.CreateICmpSLT(AsInts, Constant::getNullValue(IntegerVecTy));
   Value *Packed = Builder.CreateBitCast(SignBits, IntegerTy);
   return Builder.CreateZExtOrTrunc(Packed, ResTy);
 }

 /// Simplify an x86 addcarry intrinsic call \p II (operand 0 is the carry-in,
 /// operands 1 and 2 the addends; the result is a struct of an i8 carry-out
 /// and the sum). With a zero carry-in the call becomes uadd.with.overflow.
 /// Returns the replacement value, or nullptr if the carry-in is not a zero
 /// integer.
 static Value *simplifyX86addcarry(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
   Value *CarryIn = II.getArgOperand(0);
   Value *Op1 = II.getArgOperand(1);
   Value *Op2 = II.getArgOperand(2);
   Type *RetTy = II.getType();
   Type *OpTy = Op1->getType();
   assert(RetTy->getStructElementType(0)->isIntegerTy(8) &&
          RetTy->getStructElementType(1) == OpTy && OpTy == Op2->getType() &&
          "Unexpected types for x86 addcarry");

   // Nothing to do unless the carry-in is zero.
   if (!match(CarryIn, PatternMatch::m_ZeroInt()))
     return nullptr;

   // Without an incoming carry this is just an unsigned add with overflow.
   Value *UAdd = Builder.CreateIntrinsic(Intrinsic::uadd_with_overflow, OpTy,
                                         {Op1, Op2});

   // Repackage {sum, i1 overflow} as the x86 result {i8 carry-out, sum}.
   Value *Sum = Builder.CreateExtractValue(UAdd, 0);
   Value *Overflow = Builder.CreateExtractValue(UAdd, 1);
   Value *CarryOut = Builder.CreateZExt(Overflow, Builder.getInt8Ty());
   Value *Res = UndefValue::get(RetTy);
   Res = Builder.CreateInsertValue(Res, CarryOut, 0);
   return Builder.CreateInsertValue(Res, Sum, 1);
 }

 /// Attempt to convert an x86 insertps intrinsic call \p II with a constant
 /// control byte into a zero vector or a shufflevector. Returns the
 /// replacement value, or nullptr if the control operand is not constant or
 /// the combination of insert and zeroing is not modelled.
 static Value *simplifyX86insertps(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
   auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
   if (!CInt)
     return nullptr;

   auto *VecTy = cast<FixedVectorType>(II.getType());
   assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

   // Layout of the immediate permute control byte:
   //    [3:0] - zero mask for each 32-bit lane
   //    [5:4] - select one 32-bit destination lane
   //    [7:6] - select one 32-bit source lane
   const uint8_t Imm = CInt->getZExtValue();
   const uint8_t ZMask = Imm & 0xf;
   const uint8_t DestLane = (Imm >> 4) & 0x3;
   const uint8_t SourceLane = (Imm >> 6) & 0x3;

   ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

   // Zeroing all four lanes is just an unusual way to produce a zero vector.
   if (ZMask == 0xf)
     return ZeroVector;

   Value *Op0 = II.getArgOperand(0);
   Value *Op1 = II.getArgOperand(1);

   // Start from the identity shuffle of the first operand.
   int ShuffleMask[4] = {0, 1, 2, 3};

   // No zeroing at all: a plain insert of the selected source lane of the
   // second operand into the selected destination lane.
   if (ZMask == 0) {
     ShuffleMask[DestLane] = SourceLane + 4;
     return Builder.CreateShuffleVector(Op0, Op1, ShuffleMask);
   }

   // With zeroing, a single shuffle suffices only if both inputs are the same
   // value or the inserted lane is itself zeroed; the second shuffle operand
   // then becomes the zero vector.
   // TODO: Model the remaining case as 2 shuffles or a 'logical and' plus
   // shuffle?
   const bool SingleInput = Op0 == Op1;
   const bool DestLaneZeroed = (ZMask & (1 << DestLane)) != 0;
   if (!SingleInput && !DestLaneZeroed)
     return nullptr;

   // We may still move 32-bits of the first source vector from one lane to
   // another ...
   ShuffleMask[DestLane] = SourceLane;
   // ... unless the zero mask overrides that lane as well.
   for (unsigned Lane = 0; Lane < 4; ++Lane)
     if ((ZMask >> Lane) & 0x1)
       ShuffleMask[Lane] = Lane + 4;

   return Builder.CreateShuffleVector(Op0, ZeroVector, ShuffleMask);
 }

 /// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
 /// or conversion to a shuffle vector. \p CILength and \p CIIndex are the
 /// field length and bit index when they are constant, nullptr otherwise.
 /// Returns the replacement value, or nullptr if nothing applies.
 static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
                                ConstantInt *CILength, ConstantInt *CIIndex,
                                InstCombiner::BuilderTy &Builder) {
   // Build the <2 x i64> constant { Val, undef }.
   auto LowConstantHighUndef = [&](uint64_t Val) {
     Type *IntTy64 = Type::getInt64Ty(II.getContext());
     Constant *Args[] = {ConstantInt::get(IntTy64, Val),
                         UndefValue::get(IntTy64)};
     return ConstantVector::get(Args);
   };

   // Element 0 of the source, if it is a constant integer.
   ConstantInt *CI0 = nullptr;
   if (auto *C0 = dyn_cast<Constant>(Op0))
     CI0 = dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0));

   // Without a constant length and index, the only fold left is that an
   // extraction from zero is always {zero, undef}.
   if (!CILength || !CIIndex) {
     if (CI0 && CI0->isZero())
       return LowConstantHighUndef(0);
     return nullptr;
   }

   // From AMD documentation: "The bit index and field length are each six
   // bits in length other bits of the field are ignored."
   APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
   APInt APLength = CILength->getValue().zextOrTrunc(6);

   unsigned Index = APIndex.getZExtValue();

   // From AMD documentation: "a value of zero in the field length is
   // defined as length of 64".
   unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

   // From AMD documentation: "If the sum of the bit index + length field
   // is greater than 64, the results are undefined".
   // Index and Length are small zero-extended values, so the sum cannot wrap.
   unsigned End = Index + Length;
   if (End > 64)
     return UndefValue::get(II.getType());

   // If we are extracting whole bytes, we can convert this to a shuffle.
   // Lowering can recognize EXTRQI shuffle masks.
   if ((Length % 8) == 0 && (Index % 8) == 0) {
     // Work in bytes instead of bits.
     const unsigned NumBytes = Length / 8;
     const unsigned FirstByte = Index / 8;

     Type *IntTy8 = Type::getInt8Ty(II.getContext());
     auto *ShufTy = FixedVectorType::get(IntTy8, 16);

     // Low 8 bytes: the extracted bytes followed by zeros (taken from the
     // second shuffle operand). High 8 bytes: undefined.
     SmallVector<int, 16> ShuffleMask;
     for (unsigned Byte = 0; Byte != 8; ++Byte)
       ShuffleMask.push_back(Byte < NumBytes ? Byte + FirstByte : Byte + 16);
     for (unsigned Byte = 8; Byte != 16; ++Byte)
       ShuffleMask.push_back(-1);

     Value *SV = Builder.CreateShuffleVector(
         Builder.CreateBitCast(Op0, ShufTy),
         ConstantAggregateZero::get(ShufTy), ShuffleMask);
     return Builder.CreateBitCast(SV, II.getType());
   }

   // Constant Fold - shift Index'th bit to lowest position and mask off
   // Length bits.
   if (CI0) {
     APInt Field = CI0->getValue();
     Field.lshrInPlace(Index);
     Field = Field.zextOrTrunc(Length);
     return LowConstantHighUndef(Field.getZExtValue());
   }

   // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
     Value *Args[] = {Op0, CILength, CIIndex};
     Module *M = II.getModule();
     Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
     return Builder.CreateCall(F, Args);
   }

   // CI0 is known to be null here, so the "extraction from zero" fold cannot
   // apply either.
   return nullptr;
 }

 /// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
 /// folding or conversion to a shuffle vector. \p APLength and \p APIndex are
 /// the (constant) field length and bit index. Returns the replacement value,
 /// or nullptr if nothing applies.
 static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
                                  APInt APLength, APInt APIndex,
                                  InstCombiner::BuilderTy &Builder) {
   // From AMD documentation: "The bit index and field length are each six bits
   // in length other bits of the field are ignored."
   APIndex = APIndex.zextOrTrunc(6);
   APLength = APLength.zextOrTrunc(6);

   const unsigned Index = APIndex.getZExtValue();

   // From AMD documentation: "a value of zero in the field length is
   // defined as length of 64".
   const unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

   // From AMD documentation: "If the sum of the bit index + length field
   // is greater than 64, the results are undefined".
   // Index and Length are small zero-extended values, so the sum cannot wrap.
   const unsigned End = Index + Length;
   if (End > 64)
     return UndefValue::get(II.getType());

   // If we are inserting whole bytes, we can convert this to a shuffle.
   // Lowering can recognize INSERTQI shuffle masks.
   if ((Length % 8) == 0 && (Index % 8) == 0) {
     // Work in bytes instead of bits.
     const unsigned NumBytes = Length / 8;
     const unsigned FirstByte = Index / 8;

     Type *IntTy8 = Type::getInt8Ty(II.getContext());
     auto *ShufTy = FixedVectorType::get(IntTy8, 16);

     // Low 8 bytes: bytes of Op0, except that the field
     // [FirstByte, FirstByte + NumBytes) is filled with the low bytes of Op1
     // (shuffle indices 16 and up). High 8 bytes: undefined.
     SmallVector<int, 16> ShuffleMask;
     for (unsigned Byte = 0; Byte != 8; ++Byte) {
       const bool InField =
           Byte >= FirstByte && Byte < FirstByte + NumBytes;
       ShuffleMask.push_back(InField ? (Byte - FirstByte) + 16 : Byte);
     }
     for (unsigned Byte = 8; Byte != 16; ++Byte)
       ShuffleMask.push_back(-1);

     Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
                                             Builder.CreateBitCast(Op1, ShufTy),
                                             ShuffleMask);
     return Builder.CreateBitCast(SV, II.getType());
   }

   // Element 0 of each operand, if it is a constant integer.
   ConstantInt *CI00 = nullptr;
   if (auto *C0 = dyn_cast<Constant>(Op0))
     CI00 = dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0));
   ConstantInt *CI10 = nullptr;
   if (auto *C1 = dyn_cast<Constant>(Op1))
     CI10 = dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0));

   // Constant Fold - insert bottom Length bits starting at the Index'th bit.
   if (CI00 && CI10) {
     APInt FieldMask = APInt::getLowBitsSet(64, Length).shl(Index);
     APInt Kept = CI00->getValue() & ~FieldMask;
     APInt Inserted =
         CI10->getValue().zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
     APInt Val = Kept | Inserted;

     Type *IntTy64 = Type::getInt64Ty(II.getContext());
     Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
                         UndefValue::get(IntTy64)};
     return ConstantVector::get(Args);
   }

   // If we were an INSERTQ call, we'll save demanded elements if we convert to
   // INSERTQI.
   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
     Type *IntTy8 = Type::getInt8Ty(II.getContext());
     Constant *CILength = ConstantInt::get(IntTy8, Length, false);
     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

     Value *Args[] = {Op0, Op1, CILength, CIIndex};
     Module *M = II.getModule();
     Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
     return Builder.CreateCall(F, Args);
   }

   return nullptr;
 }

 /// Attempt to convert pshufb* to shufflevector if the mask is constant.
 /// Returns the replacement shuffle, or nullptr if the control operand is not
 /// a constant made up solely of integer and undef elements.
 static Value *simplifyX86pshufb(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
   auto *Control = dyn_cast<Constant>(II.getArgOperand(1));
   if (!Control)
     return nullptr;

   auto *VecTy = cast<FixedVectorType>(II.getType());
   const unsigned NumElts = VecTy->getNumElements();
   assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
          "Unexpected number of elements in shuffle mask!");

   // Shuffle mask built from the control bytes (constant integers or UNDEFs).
   int Indexes[64];

   // Each control byte selects the source of the corresponding result byte.
   for (unsigned I = 0; I < NumElts; ++I) {
     Constant *COp = Control->getAggregateElement(I);
     if (!COp)
       return nullptr;

     if (isa<UndefValue>(COp)) {
       Indexes[I] = -1;
       continue;
     }

     auto *CByte = dyn_cast<ConstantInt>(COp);
     if (!CByte)
       return nullptr;

     const uint64_t Byte = CByte->getValue().getZExtValue();

     // Offset of the first element of the 128-bit lane that I belongs to;
     // selection never crosses lanes.
     const unsigned LaneBase = I & 0xF0;

     // If bit[7] of the control byte is set, zero is written to the result
     // byte: pick from the zero vector on the right-hand side of the
     // shufflevector. Otherwise the least significant 4 bits select a byte
     // within the lane.
     const bool WritesZero = (Byte & 0x80) != 0;
     const unsigned InLane =
         WritesZero ? NumElts : static_cast<unsigned>(Byte & 0x0F);
     Indexes[I] = static_cast<int>(InLane + LaneBase);
   }

   Value *Src = II.getArgOperand(0);
   Constant *Zeros = Constant::getNullValue(VecTy);
   return Builder.CreateShuffleVector(Src, Zeros,
                                      makeArrayRef(Indexes, NumElts));
 }

 /// Try to rewrite a vpermilvar* intrinsic as a single-source shufflevector.
 /// This is only done when the selector operand (operand 1) is a constant whose
 /// elements are all integers or undefs; otherwise nullptr is returned.
 static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
                                     InstCombiner::BuilderTy &Builder) {
   auto *SelVec = dyn_cast<Constant>(II.getArgOperand(1));
   if (!SelVec)
     return nullptr;

   auto *ResTy = cast<FixedVectorType>(II.getType());
   unsigned NumElts = ResTy->getNumElements();
   bool IsPD = ResTy->getScalarType()->isDoubleTy();
   // Elements per 128-bit lane: two doubles or four floats.
   unsigned NumLaneElts = IsPD ? 2 : 4;
   assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

   // Generic shuffle mask under construction; -1 marks an undef element.
   int ShuffleMask[16];

   for (unsigned I = 0; I != NumElts; ++I) {
     Constant *Elt = SelVec->getAggregateElement(I);
     if (!Elt)
       return nullptr;

     if (isa<UndefValue>(Elt)) {
       ShuffleMask[I] = -1;
       continue;
     }

     // Anything other than a plain integer constant cannot be folded.
     auto *EltInt = dyn_cast<ConstantInt>(Elt);
     if (!EltInt)
       return nullptr;

     // Only the low two bits of each selector element are looked at; drop
     // everything else.
     unsigned Sel = static_cast<unsigned>(
         EltInt->getValue().zextOrTrunc(32).getLoBits(2).getZExtValue());

     // The PD variants select with bit 1 rather than bit 0, so shift it down
     // to obtain the per-lane element index.
     if (IsPD)
       Sel >>= 1;

     // The selector always indexes within the 128-bit lane that holds element
     // I. A generic shuffle indexes the whole vector, so add the index of the
     // first element of that lane.
     Sel += (I / NumLaneElts) * NumLaneElts;

     ShuffleMask[I] = Sel;
   }

   Value *Src = II.getArgOperand(0);
   return Builder.CreateShuffleVector(Src, makeArrayRef(ShuffleMask, NumElts));
 }

 /// Try to rewrite a vpermd/vpermps style variable permute as a single-source
 /// shufflevector. This is only done when the index operand (operand 1) is a
 /// constant whose elements are all integers or undefs; otherwise nullptr is
 /// returned.
 static Value *simplifyX86vpermv(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
   auto *IdxVec = dyn_cast<Constant>(II.getArgOperand(1));
   if (!IdxVec)
     return nullptr;

   auto *ResTy = cast<FixedVectorType>(II.getType());
   unsigned Size = ResTy->getNumElements();
   assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
          "Unexpected shuffle mask size");

   // Generic shuffle mask under construction; -1 marks an undef element.
   int ShuffleMask[64];

   for (unsigned I = 0; I != Size; ++I) {
     Constant *Elt = IdxVec->getAggregateElement(I);
     if (!Elt)
       return nullptr;

     if (isa<UndefValue>(Elt)) {
       ShuffleMask[I] = -1;
       continue;
     }

     // Anything other than a plain integer constant cannot be folded.
     auto *EltInt = dyn_cast<ConstantInt>(Elt);
     if (!EltInt)
       return nullptr;

     // Keep only the bits that address an element of the vector. Size is
     // asserted to be a power of two, so masking with Size - 1 wraps the index
     // into range.
     ShuffleMask[I] = static_cast<uint32_t>(EltInt->getZExtValue()) & (Size - 1);
   }

   Value *Src = II.getArgOperand(0);
   return Builder.CreateShuffleVector(Src, makeArrayRef(ShuffleMask, Size));
 }

 Optional<Instruction *>
 X86TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   auto SimplifyDemandedVectorEltsLow = [&IC](Value *Op, unsigned Width,
                                              unsigned DemandedWidth) {
     APInt UndefElts(Width, 0);
     APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
     return IC.SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
   };

   Intrinsic::ID IID = II.getIntrinsicID();
   switch (IID) {
   case Intrinsic::x86_bmi_bextr_32:
   case Intrinsic::x86_bmi_bextr_64:
   case Intrinsic::x86_tbm_bextri_u32:
   case Intrinsic::x86_tbm_bextri_u64:
     // If the RHS is a constant we can try some simplifications.
     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
       uint64_t Shift = C->getZExtValue();
       uint64_t Length = (Shift >> 8) & 0xff;
       Shift &= 0xff;
       unsigned BitWidth = II.getType()->getIntegerBitWidth();
       // If the length is 0 or the shift is out of range, replace with zero.
       if (Length == 0 || Shift >= BitWidth) {
         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
       }
       // If the LHS is also a constant, we can completely constant fold this.
       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Result = InC->getZExtValue() >> Shift;
         if (Length > BitWidth)
           Length = BitWidth;
         Result &= maskTrailingOnes<uint64_t>(Length);
         return IC.replaceInstUsesWith(II,
                                       ConstantInt::get(II.getType(), Result));
       }
       // TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
       // are only masking bits that a shift already cleared?
     }
     break;

   case Intrinsic::x86_bmi_bzhi_32:
   case Intrinsic::x86_bmi_bzhi_64:
     // If the RHS is a constant we can try some simplifications.
     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
       uint64_t Index = C->getZExtValue() & 0xff;
       unsigned BitWidth = II.getType()->getIntegerBitWidth();
       if (Index >= BitWidth) {
         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
       }
       if (Index == 0) {
         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
       }
       // If the LHS is also a constant, we can completely constant fold this.
       if (auto *InC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Result = InC->getZExtValue();
         Result &= maskTrailingOnes<uint64_t>(Index);
         return IC.replaceInstUsesWith(II,
                                       ConstantInt::get(II.getType(), Result));
       }
       // TODO should we convert this to an AND if the RHS is constant?
     }
     break;
   case Intrinsic::x86_bmi_pext_32:
   case Intrinsic::x86_bmi_pext_64:
     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
       if (MaskC->isNullValue()) {
         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
       }
       if (MaskC->isAllOnesValue()) {
         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
       }

       if (MaskC->getValue().isShiftedMask()) {
         // any single contingous sequence of 1s anywhere in the mask simply
         // describes a subset of the input bits shifted to the appropriate
         // position.  Replace with the straight forward IR.
         unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
         Value *Input = II.getArgOperand(0);
         Value *Masked = IC.Builder.CreateAnd(Input, II.getArgOperand(1));
         Value *Shifted = IC.Builder.CreateLShr(Masked,
                                                ConstantInt::get(II.getType(),
                                                                 ShiftAmount));
         return IC.replaceInstUsesWith(II, Shifted);
       }


       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Src = SrcC->getZExtValue();
         uint64_t Mask = MaskC->getZExtValue();
         uint64_t Result = 0;
         uint64_t BitToSet = 1;

         while (Mask) {
           // Isolate lowest set bit.
           uint64_t BitToTest = Mask & -Mask;
           if (BitToTest & Src)
             Result |= BitToSet;

           BitToSet <<= 1;
           // Clear lowest set bit.
           Mask &= Mask - 1;
         }

         return IC.replaceInstUsesWith(II,
                                       ConstantInt::get(II.getType(), Result));
       }
     }
     break;
   case Intrinsic::x86_bmi_pdep_32:
   case Intrinsic::x86_bmi_pdep_64:
     if (auto *MaskC = dyn_cast<ConstantInt>(II.getArgOperand(1))) {
       if (MaskC->isNullValue()) {
         return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), 0));
       }
       if (MaskC->isAllOnesValue()) {
         return IC.replaceInstUsesWith(II, II.getArgOperand(0));
       }
       if (MaskC->getValue().isShiftedMask()) {
         // any single contingous sequence of 1s anywhere in the mask simply
         // describes a subset of the input bits shifted to the appropriate
         // position.  Replace with the straight forward IR.
         unsigned ShiftAmount = MaskC->getValue().countTrailingZeros();
         Value *Input = II.getArgOperand(0);
         Value *Shifted = IC.Builder.CreateShl(Input,
                                               ConstantInt::get(II.getType(),
                                                                ShiftAmount));
         Value *Masked = IC.Builder.CreateAnd(Shifted, II.getArgOperand(1));
         return IC.replaceInstUsesWith(II, Masked);
       }

       if (auto *SrcC = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
         uint64_t Src = SrcC->getZExtValue();
         uint64_t Mask = MaskC->getZExtValue();
         uint64_t Result = 0;
         uint64_t BitToTest = 1;

         while (Mask) {
           // Isolate lowest set bit.
           uint64_t BitToSet = Mask & -Mask;
           if (BitToTest & Src)
             Result |= BitToSet;

           BitToTest <<= 1;
           // Clear lowest set bit;
           Mask &= Mask - 1;
         }

         return IC.replaceInstUsesWith(II,
                                       ConstantInt::get(II.getType(), Result));
       }
     }
     break;

   case Intrinsic::x86_sse_cvtss2si:
   case Intrinsic::x86_sse_cvtss2si64:
   case Intrinsic::x86_sse_cvttss2si:
   case Intrinsic::x86_sse_cvttss2si64:
   case Intrinsic::x86_sse2_cvtsd2si:
   case Intrinsic::x86_sse2_cvtsd2si64:
   case Intrinsic::x86_sse2_cvttsd2si:
   case Intrinsic::x86_sse2_cvttsd2si64:
   case Intrinsic::x86_avx512_vcvtss2si32:
   case Intrinsic::x86_avx512_vcvtss2si64:
   case Intrinsic::x86_avx512_vcvtss2usi32:
   case Intrinsic::x86_avx512_vcvtss2usi64:
   case Intrinsic::x86_avx512_vcvtsd2si32:
   case Intrinsic::x86_avx512_vcvtsd2si64:
   case Intrinsic::x86_avx512_vcvtsd2usi32:
   case Intrinsic::x86_avx512_vcvtsd2usi64:
   case Intrinsic::x86_avx512_cvttss2si:
   case Intrinsic::x86_avx512_cvttss2si64:
   case Intrinsic::x86_avx512_cvttss2usi:
   case Intrinsic::x86_avx512_cvttss2usi64:
   case Intrinsic::x86_avx512_cvttsd2si:
   case Intrinsic::x86_avx512_cvttsd2si64:
   case Intrinsic::x86_avx512_cvttsd2usi:
   case Intrinsic::x86_avx512_cvttsd2usi64: {
     // These intrinsics only demand the 0th element of their input vectors. If
     // we can simplify the input based on that, do so now.
     Value *Arg = II.getArgOperand(0);
     unsigned VWidth = cast<FixedVectorType>(Arg->getType())->getNumElements();
     if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
       return IC.replaceOperand(II, 0, V);
     }
     break;
   }

   case Intrinsic::x86_mmx_pmovmskb:
   case Intrinsic::x86_sse_movmsk_ps:
   case Intrinsic::x86_sse2_movmsk_pd:
   case Intrinsic::x86_sse2_pmovmskb_128:
   case Intrinsic::x86_avx_movmsk_pd_256:
   case Intrinsic::x86_avx_movmsk_ps_256:
   case Intrinsic::x86_avx2_pmovmskb:
     if (Value *V = simplifyX86movmsk(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   case Intrinsic::x86_sse_comieq_ss:
   case Intrinsic::x86_sse_comige_ss:
   case Intrinsic::x86_sse_comigt_ss:
   case Intrinsic::x86_sse_comile_ss:
   case Intrinsic::x86_sse_comilt_ss:
   case Intrinsic::x86_sse_comineq_ss:
   case Intrinsic::x86_sse_ucomieq_ss:
   case Intrinsic::x86_sse_ucomige_ss:
   case Intrinsic::x86_sse_ucomigt_ss:
   case Intrinsic::x86_sse_ucomile_ss:
   case Intrinsic::x86_sse_ucomilt_ss:
   case Intrinsic::x86_sse_ucomineq_ss:
   case Intrinsic::x86_sse2_comieq_sd:
   case Intrinsic::x86_sse2_comige_sd:
   case Intrinsic::x86_sse2_comigt_sd:
   case Intrinsic::x86_sse2_comile_sd:
   case Intrinsic::x86_sse2_comilt_sd:
   case Intrinsic::x86_sse2_comineq_sd:
   case Intrinsic::x86_sse2_ucomieq_sd:
   case Intrinsic::x86_sse2_ucomige_sd:
   case Intrinsic::x86_sse2_ucomigt_sd:
   case Intrinsic::x86_sse2_ucomile_sd:
   case Intrinsic::x86_sse2_ucomilt_sd:
   case Intrinsic::x86_sse2_ucomineq_sd:
   case Intrinsic::x86_avx512_vcomi_ss:
   case Intrinsic::x86_avx512_vcomi_sd:
   case Intrinsic::x86_avx512_mask_cmp_ss:
   case Intrinsic::x86_avx512_mask_cmp_sd: {
     // These intrinsics only demand the 0th element of their input vectors. If
     // we can simplify the input based on that, do so now.
     bool MadeChange = false;
     Value *Arg0 = II.getArgOperand(0);
     Value *Arg1 = II.getArgOperand(1);
     unsigned VWidth = cast<FixedVectorType>(Arg0->getType())->getNumElements();
     if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
       IC.replaceOperand(II, 0, V);
       MadeChange = true;
     }
     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
       IC.replaceOperand(II, 1, V);
       MadeChange = true;
     }
     if (MadeChange) {
       return &II;
     }
     break;
   }

   case Intrinsic::x86_avx512_add_ps_512:
   case Intrinsic::x86_avx512_div_ps_512:
   case Intrinsic::x86_avx512_mul_ps_512:
   case Intrinsic::x86_avx512_sub_ps_512:
   case Intrinsic::x86_avx512_add_pd_512:
   case Intrinsic::x86_avx512_div_pd_512:
   case Intrinsic::x86_avx512_mul_pd_512:
   case Intrinsic::x86_avx512_sub_pd_512:
     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
     // IR operations.
     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
       if (R->getValue() == 4) {
         Value *Arg0 = II.getArgOperand(0);
         Value *Arg1 = II.getArgOperand(1);

         Value *V;
         switch (IID) {
         default:
           llvm_unreachable("Case stmts out of sync!");
         case Intrinsic::x86_avx512_add_ps_512:
         case Intrinsic::x86_avx512_add_pd_512:
           V = IC.Builder.CreateFAdd(Arg0, Arg1);
           break;
         case Intrinsic::x86_avx512_sub_ps_512:
         case Intrinsic::x86_avx512_sub_pd_512:
           V = IC.Builder.CreateFSub(Arg0, Arg1);
           break;
         case Intrinsic::x86_avx512_mul_ps_512:
         case Intrinsic::x86_avx512_mul_pd_512:
           V = IC.Builder.CreateFMul(Arg0, Arg1);
           break;
         case Intrinsic::x86_avx512_div_ps_512:
         case Intrinsic::x86_avx512_div_pd_512:
           V = IC.Builder.CreateFDiv(Arg0, Arg1);
           break;
         }

         return IC.replaceInstUsesWith(II, V);
       }
     }
     break;

   case Intrinsic::x86_avx512_mask_add_ss_round:
   case Intrinsic::x86_avx512_mask_div_ss_round:
   case Intrinsic::x86_avx512_mask_mul_ss_round:
   case Intrinsic::x86_avx512_mask_sub_ss_round:
   case Intrinsic::x86_avx512_mask_add_sd_round:
   case Intrinsic::x86_avx512_mask_div_sd_round:
   case Intrinsic::x86_avx512_mask_mul_sd_round:
   case Intrinsic::x86_avx512_mask_sub_sd_round:
     // If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
     // IR operations.
     if (auto *R = dyn_cast<ConstantInt>(II.getArgOperand(4))) {
       if (R->getValue() == 4) {
         // Extract the element as scalars.
         Value *Arg0 = II.getArgOperand(0);
         Value *Arg1 = II.getArgOperand(1);
         Value *LHS = IC.Builder.CreateExtractElement(Arg0, (uint64_t)0);
         Value *RHS = IC.Builder.CreateExtractElement(Arg1, (uint64_t)0);

         Value *V;
         switch (IID) {
         default:
           llvm_unreachable("Case stmts out of sync!");
         case Intrinsic::x86_avx512_mask_add_ss_round:
         case Intrinsic::x86_avx512_mask_add_sd_round:
           V = IC.Builder.CreateFAdd(LHS, RHS);
           break;
         case Intrinsic::x86_avx512_mask_sub_ss_round:
         case Intrinsic::x86_avx512_mask_sub_sd_round:
           V = IC.Builder.CreateFSub(LHS, RHS);
           break;
         case Intrinsic::x86_avx512_mask_mul_ss_round:
         case Intrinsic::x86_avx512_mask_mul_sd_round:
           V = IC.Builder.CreateFMul(LHS, RHS);
           break;
         case Intrinsic::x86_avx512_mask_div_ss_round:
         case Intrinsic::x86_avx512_mask_div_sd_round:
           V = IC.Builder.CreateFDiv(LHS, RHS);
           break;
         }

         // Handle the masking aspect of the intrinsic.
         Value *Mask = II.getArgOperand(3);
         auto *C = dyn_cast<ConstantInt>(Mask);
         // We don't need a select if we know the mask bit is a 1.
         if (!C || !C->getValue()[0]) {
           // Cast the mask to an i1 vector and then extract the lowest element.
           auto *MaskTy = FixedVectorType::get(
               IC.Builder.getInt1Ty(),
               cast<IntegerType>(Mask->getType())->getBitWidth());
           Mask = IC.Builder.CreateBitCast(Mask, MaskTy);
           Mask = IC.Builder.CreateExtractElement(Mask, (uint64_t)0);
           // Extract the lowest element from the passthru operand.
           Value *Passthru =
               IC.Builder.CreateExtractElement(II.getArgOperand(2), (uint64_t)0);
           V = IC.Builder.CreateSelect(Mask, V, Passthru);
         }

         // Insert the result back into the original argument 0.
         V = IC.Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

         return IC.replaceInstUsesWith(II, V);
       }
     }
     break;

   // Constant fold ashr( <A x Bi>, Ci ).
   // Constant fold lshr( <A x Bi>, Ci ).
   // Constant fold shl( <A x Bi>, Ci ).
   case Intrinsic::x86_sse2_psrai_d:
   case Intrinsic::x86_sse2_psrai_w:
   case Intrinsic::x86_avx2_psrai_d:
   case Intrinsic::x86_avx2_psrai_w:
   case Intrinsic::x86_avx512_psrai_q_128:
   case Intrinsic::x86_avx512_psrai_q_256:
   case Intrinsic::x86_avx512_psrai_d_512:
   case Intrinsic::x86_avx512_psrai_q_512:
   case Intrinsic::x86_avx512_psrai_w_512:
   case Intrinsic::x86_sse2_psrli_d:
   case Intrinsic::x86_sse2_psrli_q:
   case Intrinsic::x86_sse2_psrli_w:
   case Intrinsic::x86_avx2_psrli_d:
   case Intrinsic::x86_avx2_psrli_q:
   case Intrinsic::x86_avx2_psrli_w:
   case Intrinsic::x86_avx512_psrli_d_512:
   case Intrinsic::x86_avx512_psrli_q_512:
   case Intrinsic::x86_avx512_psrli_w_512:
   case Intrinsic::x86_sse2_pslli_d:
   case Intrinsic::x86_sse2_pslli_q:
   case Intrinsic::x86_sse2_pslli_w:
   case Intrinsic::x86_avx2_pslli_d:
   case Intrinsic::x86_avx2_pslli_q:
   case Intrinsic::x86_avx2_pslli_w:
   case Intrinsic::x86_avx512_pslli_d_512:
   case Intrinsic::x86_avx512_pslli_q_512:
   case Intrinsic::x86_avx512_pslli_w_512:
     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   case Intrinsic::x86_sse2_psra_d:
   case Intrinsic::x86_sse2_psra_w:
   case Intrinsic::x86_avx2_psra_d:
   case Intrinsic::x86_avx2_psra_w:
   case Intrinsic::x86_avx512_psra_q_128:
   case Intrinsic::x86_avx512_psra_q_256:
   case Intrinsic::x86_avx512_psra_d_512:
   case Intrinsic::x86_avx512_psra_q_512:
   case Intrinsic::x86_avx512_psra_w_512:
   case Intrinsic::x86_sse2_psrl_d:
   case Intrinsic::x86_sse2_psrl_q:
   case Intrinsic::x86_sse2_psrl_w:
   case Intrinsic::x86_avx2_psrl_d:
   case Intrinsic::x86_avx2_psrl_q:
   case Intrinsic::x86_avx2_psrl_w:
   case Intrinsic::x86_avx512_psrl_d_512:
   case Intrinsic::x86_avx512_psrl_q_512:
   case Intrinsic::x86_avx512_psrl_w_512:
   case Intrinsic::x86_sse2_psll_d:
   case Intrinsic::x86_sse2_psll_q:
   case Intrinsic::x86_sse2_psll_w:
   case Intrinsic::x86_avx2_psll_d:
   case Intrinsic::x86_avx2_psll_q:
   case Intrinsic::x86_avx2_psll_w:
   case Intrinsic::x86_avx512_psll_d_512:
   case Intrinsic::x86_avx512_psll_q_512:
   case Intrinsic::x86_avx512_psll_w_512: {
     if (Value *V = simplifyX86immShift(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }

     // SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
     // operand to compute the shift amount.
     Value *Arg1 = II.getArgOperand(1);
     assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
            "Unexpected packed shift size");
     unsigned VWidth = cast<FixedVectorType>(Arg1->getType())->getNumElements();

     if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
       return IC.replaceOperand(II, 1, V);
     }
     break;
   }

   case Intrinsic::x86_avx2_psllv_d:
   case Intrinsic::x86_avx2_psllv_d_256:
   case Intrinsic::x86_avx2_psllv_q:
   case Intrinsic::x86_avx2_psllv_q_256:
   case Intrinsic::x86_avx512_psllv_d_512:
   case Intrinsic::x86_avx512_psllv_q_512:
   case Intrinsic::x86_avx512_psllv_w_128:
   case Intrinsic::x86_avx512_psllv_w_256:
   case Intrinsic::x86_avx512_psllv_w_512:
   case Intrinsic::x86_avx2_psrav_d:
   case Intrinsic::x86_avx2_psrav_d_256:
   case Intrinsic::x86_avx512_psrav_q_128:
   case Intrinsic::x86_avx512_psrav_q_256:
   case Intrinsic::x86_avx512_psrav_d_512:
   case Intrinsic::x86_avx512_psrav_q_512:
   case Intrinsic::x86_avx512_psrav_w_128:
   case Intrinsic::x86_avx512_psrav_w_256:
   case Intrinsic::x86_avx512_psrav_w_512:
   case Intrinsic::x86_avx2_psrlv_d:
   case Intrinsic::x86_avx2_psrlv_d_256:
   case Intrinsic::x86_avx2_psrlv_q:
   case Intrinsic::x86_avx2_psrlv_q_256:
   case Intrinsic::x86_avx512_psrlv_d_512:
   case Intrinsic::x86_avx512_psrlv_q_512:
   case Intrinsic::x86_avx512_psrlv_w_128:
   case Intrinsic::x86_avx512_psrlv_w_256:
   case Intrinsic::x86_avx512_psrlv_w_512:
     if (Value *V = simplifyX86varShift(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   case Intrinsic::x86_sse2_packssdw_128:
   case Intrinsic::x86_sse2_packsswb_128:
   case Intrinsic::x86_avx2_packssdw:
   case Intrinsic::x86_avx2_packsswb:
   case Intrinsic::x86_avx512_packssdw_512:
   case Intrinsic::x86_avx512_packsswb_512:
     if (Value *V = simplifyX86pack(II, IC.Builder, true)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   case Intrinsic::x86_sse2_packuswb_128:
   case Intrinsic::x86_sse41_packusdw:
   case Intrinsic::x86_avx2_packusdw:
   case Intrinsic::x86_avx2_packuswb:
   case Intrinsic::x86_avx512_packusdw_512:
   case Intrinsic::x86_avx512_packuswb_512:
     if (Value *V = simplifyX86pack(II, IC.Builder, false)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   case Intrinsic::x86_pclmulqdq:
   case Intrinsic::x86_pclmulqdq_256:
   case Intrinsic::x86_pclmulqdq_512: {
     if (auto *C = dyn_cast<ConstantInt>(II.getArgOperand(2))) {
       unsigned Imm = C->getZExtValue();

       bool MadeChange = false;
       Value *Arg0 = II.getArgOperand(0);
       Value *Arg1 = II.getArgOperand(1);
       unsigned VWidth =
           cast<FixedVectorType>(Arg0->getType())->getNumElements();

       APInt UndefElts1(VWidth, 0);
       APInt DemandedElts1 =
           APInt::getSplat(VWidth, APInt(2, (Imm & 0x01) ? 2 : 1));
       if (Value *V =
               IC.SimplifyDemandedVectorElts(Arg0, DemandedElts1, UndefElts1)) {
         IC.replaceOperand(II, 0, V);
         MadeChange = true;
       }

       APInt UndefElts2(VWidth, 0);
       APInt DemandedElts2 =
           APInt::getSplat(VWidth, APInt(2, (Imm & 0x10) ? 2 : 1));
       if (Value *V =
               IC.SimplifyDemandedVectorElts(Arg1, DemandedElts2, UndefElts2)) {
         IC.replaceOperand(II, 1, V);
         MadeChange = true;
       }

       // If either input elements are undef, the result is zero.
       if (DemandedElts1.isSubsetOf(UndefElts1) ||
           DemandedElts2.isSubsetOf(UndefElts2)) {
         return IC.replaceInstUsesWith(II,
                                       ConstantAggregateZero::get(II.getType()));
       }

       if (MadeChange) {
         return &II;
       }
     }
     break;
   }

   case Intrinsic::x86_sse41_insertps:
     if (Value *V = simplifyX86insertps(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   case Intrinsic::x86_sse4a_extrq: {
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);
     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
            VWidth1 == 16 && "Unexpected operand sizes");

     // See if we're dealing with constant values.
     auto *C1 = dyn_cast<Constant>(Op1);
     auto *CILength =
         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
            : nullptr;
     auto *CIIndex =
         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
            : nullptr;

     // Attempt to simplify to a constant, shuffle vector or EXTRQI call.
     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }

     // EXTRQ only uses the lowest 64-bits of the first 128-bit vector
     // operands and the lowest 16-bits of the second.
     bool MadeChange = false;
     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
       IC.replaceOperand(II, 0, V);
       MadeChange = true;
     }
     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
       IC.replaceOperand(II, 1, V);
       MadeChange = true;
     }
     if (MadeChange) {
       return &II;
     }
     break;
   }

   case Intrinsic::x86_sse4a_extrqi: {
     // EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
     // bits of the lower 64-bits. The upper 64-bits are undefined.
     Value *Op0 = II.getArgOperand(0);
     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
            "Unexpected operand size");

     // See if we're dealing with constant values.
     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(1));
     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(2));

     // Attempt to simplify to a constant or shuffle vector.
     if (Value *V = simplifyX86extrq(II, Op0, CILength, CIIndex, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }

     // EXTRQI only uses the lowest 64-bits of the first 128-bit vector
     // operand.
     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
       return IC.replaceOperand(II, 0, V);
     }
     break;
   }

   case Intrinsic::x86_sse4a_insertq: {
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);
     unsigned VWidth = cast<FixedVectorType>(Op0->getType())->getNumElements();
     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
            cast<FixedVectorType>(Op1->getType())->getNumElements() == 2 &&
            "Unexpected operand size");

     // See if we're dealing with constant values.
     auto *C1 = dyn_cast<Constant>(Op1);
     auto *CI11 =
         C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
            : nullptr;

     // Attempt to simplify to a constant, shuffle vector or INSERTQI call.
     if (CI11) {
       const APInt &V11 = CI11->getValue();
       APInt Len = V11.zextOrTrunc(6);
       APInt Idx = V11.lshr(8).zextOrTrunc(6);
       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
         return IC.replaceInstUsesWith(II, V);
       }
     }

     // INSERTQ only uses the lowest 64-bits of the first 128-bit vector
     // operand.
     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
       return IC.replaceOperand(II, 0, V);
     }
     break;
   }

   case Intrinsic::x86_sse4a_insertqi: {
     // INSERTQI: Extract lowest Length bits from lower half of second source and
     // insert over first source starting at Index bit. The upper 64-bits are
     // undefined.
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);
     unsigned VWidth0 = cast<FixedVectorType>(Op0->getType())->getNumElements();
     unsigned VWidth1 = cast<FixedVectorType>(Op1->getType())->getNumElements();
     assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
            Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
            VWidth1 == 2 && "Unexpected operand sizes");

     // See if we're dealing with constant values.
     auto *CILength = dyn_cast<ConstantInt>(II.getArgOperand(2));
     auto *CIIndex = dyn_cast<ConstantInt>(II.getArgOperand(3));

     // Attempt to simplify to a constant or shuffle vector.
     if (CILength && CIIndex) {
       APInt Len = CILength->getValue().zextOrTrunc(6);
       APInt Idx = CIIndex->getValue().zextOrTrunc(6);
       if (Value *V = simplifyX86insertq(II, Op0, Op1, Len, Idx, IC.Builder)) {
         return IC.replaceInstUsesWith(II, V);
       }
     }

     // INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
     // operands.
     bool MadeChange = false;
     if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
       IC.replaceOperand(II, 0, V);
       MadeChange = true;
     }
     if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
       IC.replaceOperand(II, 1, V);
       MadeChange = true;
     }
     if (MadeChange) {
       return &II;
     }
     break;
   }

   case Intrinsic::x86_sse41_pblendvb:
   case Intrinsic::x86_sse41_blendvps:
   case Intrinsic::x86_sse41_blendvpd:
   case Intrinsic::x86_avx_blendv_ps_256:
   case Intrinsic::x86_avx_blendv_pd_256:
   case Intrinsic::x86_avx2_pblendvb: {
     // fold (blend A, A, Mask) -> A
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);
     Value *Mask = II.getArgOperand(2);
     if (Op0 == Op1) {
       return IC.replaceInstUsesWith(II, Op0);
     }

     // Zero Mask - select 1st argument.
     if (isa<ConstantAggregateZero>(Mask)) {
       return IC.replaceInstUsesWith(II, Op0);
     }

     // Constant Mask - select 1st/2nd argument lane based on top bit of mask.
     if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
       Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
       return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
     }

     // Convert to a vector select if we can bypass casts and find a boolean
     // vector condition value.
     Value *BoolVec;
     Mask = InstCombiner::peekThroughBitcast(Mask);
     if (match(Mask, PatternMatch::m_SExt(PatternMatch::m_Value(BoolVec))) &&
         BoolVec->getType()->isVectorTy() &&
         BoolVec->getType()->getScalarSizeInBits() == 1) {
       assert(Mask->getType()->getPrimitiveSizeInBits() ==
                  II.getType()->getPrimitiveSizeInBits() &&
              "Not expecting mask and operands with different sizes");

       unsigned NumMaskElts =
           cast<FixedVectorType>(Mask->getType())->getNumElements();
       unsigned NumOperandElts =
           cast<FixedVectorType>(II.getType())->getNumElements();
       if (NumMaskElts == NumOperandElts) {
         return SelectInst::Create(BoolVec, Op1, Op0);
       }

       // If the mask has less elements than the operands, each mask bit maps to
       // multiple elements of the operands. Bitcast back and forth.
       if (NumMaskElts < NumOperandElts) {
         Value *CastOp0 = IC.Builder.CreateBitCast(Op0, Mask->getType());
         Value *CastOp1 = IC.Builder.CreateBitCast(Op1, Mask->getType());
         Value *Sel = IC.Builder.CreateSelect(BoolVec, CastOp1, CastOp0);
         return new BitCastInst(Sel, II.getType());
       }
     }

     break;
   }

   case Intrinsic::x86_ssse3_pshuf_b_128:
   case Intrinsic::x86_avx2_pshuf_b:
   case Intrinsic::x86_avx512_pshuf_b_512:
     if (Value *V = simplifyX86pshufb(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   case Intrinsic::x86_avx_vpermilvar_ps:
   case Intrinsic::x86_avx_vpermilvar_ps_256:
   case Intrinsic::x86_avx512_vpermilvar_ps_512:
   case Intrinsic::x86_avx_vpermilvar_pd:
   case Intrinsic::x86_avx_vpermilvar_pd_256:
   case Intrinsic::x86_avx512_vpermilvar_pd_512:
     if (Value *V = simplifyX86vpermilvar(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   case Intrinsic::x86_avx2_permd:
   case Intrinsic::x86_avx2_permps:
   case Intrinsic::x86_avx512_permvar_df_256:
   case Intrinsic::x86_avx512_permvar_df_512:
   case Intrinsic::x86_avx512_permvar_di_256:
   case Intrinsic::x86_avx512_permvar_di_512:
   case Intrinsic::x86_avx512_permvar_hi_128:
   case Intrinsic::x86_avx512_permvar_hi_256:
   case Intrinsic::x86_avx512_permvar_hi_512:
   case Intrinsic::x86_avx512_permvar_qi_128:
   case Intrinsic::x86_avx512_permvar_qi_256:
   case Intrinsic::x86_avx512_permvar_qi_512:
   case Intrinsic::x86_avx512_permvar_sf_512:
   case Intrinsic::x86_avx512_permvar_si_512:
     if (Value *V = simplifyX86vpermv(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   case Intrinsic::x86_avx_maskload_ps:
   case Intrinsic::x86_avx_maskload_pd:
   case Intrinsic::x86_avx_maskload_ps_256:
   case Intrinsic::x86_avx_maskload_pd_256:
   case Intrinsic::x86_avx2_maskload_d:
   case Intrinsic::x86_avx2_maskload_q:
   case Intrinsic::x86_avx2_maskload_d_256:
   case Intrinsic::x86_avx2_maskload_q_256:
     if (Instruction *I = simplifyX86MaskedLoad(II, IC)) {
       return I;
     }
     break;

   case Intrinsic::x86_sse2_maskmov_dqu:
   case Intrinsic::x86_avx_maskstore_ps:
   case Intrinsic::x86_avx_maskstore_pd:
   case Intrinsic::x86_avx_maskstore_ps_256:
   case Intrinsic::x86_avx_maskstore_pd_256:
   case Intrinsic::x86_avx2_maskstore_d:
   case Intrinsic::x86_avx2_maskstore_q:
   case Intrinsic::x86_avx2_maskstore_d_256:
   case Intrinsic::x86_avx2_maskstore_q_256:
     if (simplifyX86MaskedStore(II, IC)) {
       return nullptr;
     }
     break;

   case Intrinsic::x86_addcarry_32:
   case Intrinsic::x86_addcarry_64:
     if (Value *V = simplifyX86addcarry(II, IC.Builder)) {
       return IC.replaceInstUsesWith(II, V);
     }
     break;

   default:
     break;
   }
   return None;
 }

 /// X86 hook for InstCombine's demanded-bits simplification of intrinsic calls.
 ///
 /// Only the MOVMSK family is handled here: those intrinsics copy the sign bit
 /// of every vector element into the low bits of the scalar result and zero
 /// all higher bits.
 ///
 /// \param IC Not used by this implementation.
 /// \param II The intrinsic call being examined.
 /// \param DemandedMask The bits of the result that are demanded.
 /// \param Known Receives the known-zero upper bits of a MOVMSK result.
 /// \param KnownBitsComputed Set to true when \p Known has been updated.
 /// \returns a null constant of the result type when none of the low
 /// (sign-bit carrying) result bits are demanded, otherwise None.
 Optional<Value *> X86TTIImpl::simplifyDemandedUseBitsIntrinsic(
     InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known,
     bool &KnownBitsComputed) const {
   const auto IID = II.getIntrinsicID();

   // Bail out immediately for everything that is not a MOVMSK variant.
   switch (IID) {
   case Intrinsic::x86_mmx_pmovmskb:
   case Intrinsic::x86_sse_movmsk_ps:
   case Intrinsic::x86_sse2_movmsk_pd:
   case Intrinsic::x86_sse2_pmovmskb_128:
   case Intrinsic::x86_avx_movmsk_ps_256:
   case Intrinsic::x86_avx_movmsk_pd_256:
   case Intrinsic::x86_avx2_pmovmskb:
     break;
   default:
     return None;
   }

   // One result bit is produced per source element. The MMX flavour takes an
   // x86_mmx operand, which is treated as <8 x i8>; every other flavour has a
   // fixed vector operand whose element count can be read directly.
   unsigned NumSignBits = 8;
   if (IID != Intrinsic::x86_mmx_pmovmskb) {
     NumSignBits = cast<FixedVectorType>(II.getArgOperand(0)->getType())
                       ->getNumElements();
   }

   // DemandedMask as a whole is already known to be non-zero, so if none of
   // the low bits are wanted only the (always zero) high bits are, and the
   // whole result folds to zero.
   const APInt DemandedLowBits = DemandedMask.zextOrTrunc(NumSignBits);
   if (DemandedLowBits.isZero()) {
     return ConstantInt::getNullValue(II.getType());
   }

   // Everything from bit NumSignBits upwards is known to be zero.
   Known.Zero.setBitsFrom(NumSignBits);
   KnownBitsComputed = true;
   return None;
 }

 /// X86 hook for InstCombine's demanded-vector-elements simplification.
 ///
 /// Based on which result lanes of \p II are demanded, this narrows what is
 /// requested from the call's operands and records which result lanes are
 /// undefined.
 ///
 /// \param IC Combiner used to re-queue \p II and to build replacement IR.
 /// \param II The intrinsic call being examined; its result type must be a
 /// fixed-width vector.
 /// \param DemandedElts Result lanes that are demanded (a local copy; several
 /// cases narrow it before querying further operands).
 /// \param UndefElts Updated with the result lanes that are undefined.
 /// \param UndefElts2 Receives the undef lanes reported for operand 1.
 /// \param UndefElts3 Receives the undef lanes reported for operand 2.
 /// \param simplifyAndSetOp Callback invoked as (instruction, operand index,
 /// demanded lanes, out undef lanes). NOTE(review): presumably it simplifies
 /// that operand in place for the given lanes - confirm against the caller.
 /// \returns a replacement value when the call can be bypassed or rewritten
 /// for the demanded lanes, otherwise None.
 Optional<Value *> X86TTIImpl::simplifyDemandedVectorEltsIntrinsic(
     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
     APInt &UndefElts2, APInt &UndefElts3,
     std::function<void(Instruction *, unsigned, APInt, APInt &)>
         simplifyAndSetOp) const {
   const unsigned NumResultElts =
       cast<FixedVectorType>(II.getType())->getNumElements();

   // Shared exit for the scalar-as-vector operations whose upper lanes are
   // taken from operand 0: once lane 0 is not demanded, operand 0 can stand in
   // for the whole call.
   auto ForwardOperand0 = [&]() -> Value * {
     IC.addToWorklist(&II);
     return II.getArgOperand(0);
   };

   switch (II.getIntrinsicID()) {
   case Intrinsic::x86_xop_vfrcz_ss:
   case Intrinsic::x86_xop_vfrcz_sd: {
     // Unlike the other scalar intrinsics, these are specified to zero the
     // upper lanes rather than pass them through from operand 0. With lane 0
     // unused the result is therefore a zero vector, not operand 0.
     if (!DemandedElts[0]) {
       IC.addToWorklist(&II);
       return ConstantAggregateZero::get(II.getType());
     }

     // Lane 0 is the only lane read from the operand.
     DemandedElts = 1;
     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

     // The upper lanes are zero, so at most lane 0 can be undefined.
     const bool LowLaneUndef = UndefElts[0];
     UndefElts = LowLaneUndef;
     break;
   }

   // Unary scalar-as-vector operations that work column-wise.
   case Intrinsic::x86_sse_rcp_ss:
   case Intrinsic::x86_sse_rsqrt_ss: {
     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

     // A dead low lane leaves nothing but the pass-through of operand 0.
     if (!DemandedElts[0])
       return ForwardOperand0();

     // TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
     // checks).
     break;
   }

   // Binary scalar-as-vector operations that work column-wise. Upper lanes are
   // taken from operand 0; lane 0 is computed from both operands.
   case Intrinsic::x86_sse_min_ss:
   case Intrinsic::x86_sse_max_ss:
   case Intrinsic::x86_sse_cmp_ss:
   case Intrinsic::x86_sse2_min_sd:
   case Intrinsic::x86_sse2_max_sd:
   case Intrinsic::x86_sse2_cmp_sd: {
     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

     // A dead low lane leaves nothing but the pass-through of operand 0.
     if (!DemandedElts[0])
       return ForwardOperand0();

     // Operand 1 contributes its low lane only.
     DemandedElts = 1;
     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

     // Lane 0 is undefined only when it is undefined in both operands.
     // Consider things like undef&0: the result is known zero, not undef.
     const bool Op1LowUndef = UndefElts2[0];
     if (!Op1LowUndef)
       UndefElts.clearBit(0);
     break;
   }

   // Binary scalar-as-vector operations that work column-wise. Upper lanes are
   // taken from operand 0 and lane 0 comes from operand 1.
   case Intrinsic::x86_sse41_round_ss:
   case Intrinsic::x86_sse41_round_sd: {
     // Lane 0 of operand 0 never reaches the result.
     APInt UpperLanesOnly = DemandedElts;
     UpperLanesOnly.clearBit(0);
     simplifyAndSetOp(&II, 0, UpperLanesOnly, UndefElts);

     // A dead low lane leaves nothing but the pass-through of operand 0.
     if (!DemandedElts[0])
       return ForwardOperand0();

     // Operand 1 contributes its low lane only.
     DemandedElts = 1;
     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);

     // Upper undef lanes are those of operand 0; the undef state of lane 0 is
     // whatever operand 1 reported for its low lane.
     UndefElts.clearBit(0);
     if (UndefElts2[0])
       UndefElts.setBit(0);
     break;
   }

   // Three input scalar-as-vector operations that work column-wise. Upper
   // lanes are taken from operand 0 and lane 0 is computed from all three
   // inputs.
   case Intrinsic::x86_avx512_mask_add_ss_round:
   case Intrinsic::x86_avx512_mask_div_ss_round:
   case Intrinsic::x86_avx512_mask_mul_ss_round:
   case Intrinsic::x86_avx512_mask_sub_ss_round:
   case Intrinsic::x86_avx512_mask_max_ss_round:
   case Intrinsic::x86_avx512_mask_min_ss_round:
   case Intrinsic::x86_avx512_mask_add_sd_round:
   case Intrinsic::x86_avx512_mask_div_sd_round:
   case Intrinsic::x86_avx512_mask_mul_sd_round:
   case Intrinsic::x86_avx512_mask_sub_sd_round:
   case Intrinsic::x86_avx512_mask_max_sd_round:
   case Intrinsic::x86_avx512_mask_min_sd_round: {
     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);

     // A dead low lane leaves nothing but the pass-through of operand 0.
     if (!DemandedElts[0])
       return ForwardOperand0();

     // Operands 1 and 2 contribute their low lane only.
     DemandedElts = 1;
     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
     simplifyAndSetOp(&II, 2, DemandedElts, UndefElts3);

     // Lane 0 is undefined only when it is undefined in all three operands.
     // Consider things like undef&0: the result is known zero, not undef.
     if (!(UndefElts2[0] && UndefElts3[0]))
       UndefElts.clearBit(0);
     break;
   }

   // TODO: Add fmaddsub support?
   case Intrinsic::x86_sse3_addsub_pd:
   case Intrinsic::x86_sse3_addsub_ps:
   case Intrinsic::x86_avx_addsub_pd_256:
   case Intrinsic::x86_avx_addsub_ps_256: {
     // Even lanes subtract and odd lanes add. When the demanded lanes all fall
     // on one side, a plain FP instruction produces everything that is needed.
     const APInt EvenLanes = APInt::getSplat(NumResultElts, APInt(2, 0x1));
     const APInt OddLanes = APInt::getSplat(NumResultElts, APInt(2, 0x2));
     const bool OnlySub = DemandedElts.isSubsetOf(EvenLanes);
     const bool OnlyAdd = DemandedElts.isSubsetOf(OddLanes);
     if (OnlySub || OnlyAdd) {
       assert((OnlySub != OnlyAdd) && "Can't be both add-only and sub-only");
       // Emit the replacement right at the call, restoring the builder's
       // previous insertion point when this scope ends.
       IRBuilderBase::InsertPointGuard Guard(IC.Builder);
       IC.Builder.SetInsertPoint(&II);
       Value *LHS = II.getArgOperand(0);
       Value *RHS = II.getArgOperand(1);
       const auto Opcode = OnlySub ? Instruction::FSub : Instruction::FAdd;
       return IC.Builder.CreateBinOp(Opcode, LHS, RHS);
     }

     // Otherwise a result lane is undefined only if it is in both operands.
     simplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts2);
     UndefElts &= UndefElts2;
     break;
   }

   case Intrinsic::x86_sse2_packssdw_128:
   case Intrinsic::x86_sse2_packsswb_128:
   case Intrinsic::x86_sse2_packuswb_128:
   case Intrinsic::x86_sse41_packusdw:
   case Intrinsic::x86_avx2_packssdw:
   case Intrinsic::x86_avx2_packsswb:
   case Intrinsic::x86_avx2_packusdw:
   case Intrinsic::x86_avx2_packuswb:
   case Intrinsic::x86_avx512_packssdw_512:
   case Intrinsic::x86_avx512_packsswb_512:
   case Intrinsic::x86_avx512_packusdw_512:
   case Intrinsic::x86_avx512_packuswb_512: {
     auto *SrcTy = II.getArgOperand(0)->getType();
     const unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
     assert(NumResultElts == (NumSrcElts * 2) && "Unexpected input size");

     const unsigned NumLanes = SrcTy->getPrimitiveSizeInBits() / 128;
     const unsigned ResultEltsPerLane = NumResultElts / NumLanes;
     const unsigned SrcEltsPerLane = NumSrcElts / NumLanes;

     // Within every 128-bit lane the result holds the packed elements of the
     // first input followed by those of the second, e.g.
     // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
     // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
     for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) {
       // Start of this operand's half within each result lane.
       const unsigned HalfOffset = SrcEltsPerLane * OpIdx;

       // Map the demanded result elements back onto this operand.
       APInt SrcDemanded(NumSrcElts, 0);
       for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
         const unsigned ResultBase = Lane * ResultEltsPerLane + HalfOffset;
         const unsigned SrcBase = Lane * SrcEltsPerLane;
         for (unsigned Elt = 0; Elt < SrcEltsPerLane; ++Elt) {
           if (DemandedElts[ResultBase + Elt])
             SrcDemanded.setBit(SrcBase + Elt);
         }
       }

       // Demand exactly those elements from the operand.
       APInt SrcUndef(NumSrcElts, 0);
       simplifyAndSetOp(&II, OpIdx, SrcDemanded, SrcUndef);

       // Scatter the operand's undef elements to their result positions, one
       // lane at a time.
       const APInt WideUndef = SrcUndef.zext(NumResultElts);
       for (unsigned Lane = 0; Lane < NumLanes; ++Lane) {
         APInt LaneUndef =
             WideUndef.lshr(SrcEltsPerLane * Lane).getLoBits(SrcEltsPerLane);
         LaneUndef <<= SrcEltsPerLane * (2 * Lane + OpIdx);
         UndefElts |= LaneUndef;
       }
     }
     break;
   }

   // PSHUFB
   case Intrinsic::x86_ssse3_pshuf_b_128:
   case Intrinsic::x86_avx2_pshuf_b:
   case Intrinsic::x86_avx512_pshuf_b_512:
   // PERMILVAR
   case Intrinsic::x86_avx_vpermilvar_ps:
   case Intrinsic::x86_avx_vpermilvar_ps_256:
   case Intrinsic::x86_avx512_vpermilvar_ps_512:
   case Intrinsic::x86_avx_vpermilvar_pd:
   case Intrinsic::x86_avx_vpermilvar_pd_256:
   case Intrinsic::x86_avx512_vpermilvar_pd_512:
   // PERMV
   case Intrinsic::x86_avx2_permd:
   case Intrinsic::x86_avx2_permps:
     // Only operand 1 is narrowed to the demanded result lanes; operand 0 is
     // left untouched.
     simplifyAndSetOp(&II, 1, DemandedElts, UndefElts);
     break;

   // SSE4A instructions leave the upper 64-bits of the 128-bit result
   // in an undefined state.
   case Intrinsic::x86_sse4a_extrq:
   case Intrinsic::x86_sse4a_extrqi:
   case Intrinsic::x86_sse4a_insertq:
   case Intrinsic::x86_sse4a_insertqi:
     UndefElts.setHighBits(NumResultElts / 2);
     break;

   default:
     break;
   }
   return None;
 }