| //===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| /// |
| /// \file |
| /// This file defines the WebAssembly-specific TargetTransformInfo |
| /// implementation. |
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #include "WebAssemblyTargetTransformInfo.h" |
| #include "llvm/IR/IntrinsicInst.h" |
| #include "llvm/IR/IntrinsicsWebAssembly.h" |
| #include "llvm/Transforms/InstCombine/InstCombiner.h" |
| |
| #include "llvm/CodeGen/CostTable.h" |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "wasmtti" |
| |
| TargetTransformInfo::PopcntSupportKind |
| WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const { |
| assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); |
| return TargetTransformInfo::PSK_FastHardware; |
| } |
| |
| unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const { |
| unsigned Result = BaseT::getNumberOfRegisters(ClassID); |
| |
| // For SIMD, use at least 16 registers, as a rough guess. |
| bool Vector = (ClassID == 1); |
| if (Vector) |
| Result = std::max(Result, 16u); |
| |
| return Result; |
| } |
| |
| TypeSize WebAssemblyTTIImpl::getRegisterBitWidth( |
| TargetTransformInfo::RegisterKind K) const { |
| switch (K) { |
| case TargetTransformInfo::RGK_Scalar: |
| return TypeSize::getFixed(64); |
| case TargetTransformInfo::RGK_FixedWidthVector: |
| return TypeSize::getFixed(getST()->hasSIMD128() ? 128 : 64); |
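  // WebAssembly has no scalable vector registers.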
| case TargetTransformInfo::RGK_ScalableVector: |
| return TypeSize::getScalable(0); |
| } |
| |
| llvm_unreachable("Unsupported register kind"); |
| } |
| |
| InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost( |
| unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
| TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
| ArrayRef<const Value *> Args, const Instruction *CxtI) const { |
| |
| if (ST->hasSIMD128()) { |
| static const CostTblEntry ArithCostTbl[]{ |
| // extmul + (maybe awkward) shuffle |
| {ISD::MUL, MVT::v8i8, 4}, |
| // 2x extmul + (okay) shuffle |
| {ISD::MUL, MVT::v16i8, 4}, |
| // extmul |
| {ISD::MUL, MVT::v4i16, 1}, |
| // extmul |
| {ISD::MUL, MVT::v2i32, 1}, |
| }; |
| EVT DstVT = TLI->getValueType(DL, Ty); |
| if (DstVT.isSimple()) { |
| int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| if (const auto *Entry = |
| CostTableLookup(ArithCostTbl, ISD, DstVT.getSimpleVT())) |
| return Entry->Cost; |
| } |
| } |
| |
| InstructionCost Cost = |
| BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost( |
| Opcode, Ty, CostKind, Op1Info, Op2Info); |
| |
| if (auto *VTy = dyn_cast<VectorType>(Ty)) { |
| switch (Opcode) { |
| case Instruction::LShr: |
| case Instruction::AShr: |
| case Instruction::Shl: |
| // SIMD128's shifts currently only accept a scalar shift count. For each |
| // element, we'll need to extract, op, insert. The following is a rough |
| // approximation. |
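      // For example, a non-uniform <4 x i32> shift is costed as
      // 4 x (extract_lane + scalar shift + replace_lane).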
| if (!Op2Info.isUniform()) |
| Cost = |
| cast<FixedVectorType>(VTy)->getNumElements() * |
| (TargetTransformInfo::TCC_Basic + |
| getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind) + |
| TargetTransformInfo::TCC_Basic); |
| break; |
| } |
| } |
| return Cost; |
| } |
| |
| InstructionCost WebAssemblyTTIImpl::getCastInstrCost( |
| unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, |
| TTI::TargetCostKind CostKind, const Instruction *I) const { |
| int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| auto SrcTy = TLI->getValueType(DL, Src); |
| auto DstTy = TLI->getValueType(DL, Dst); |
| |
| if (!SrcTy.isSimple() || !DstTy.isSimple()) { |
| return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
| } |
| |
| if (!ST->hasSIMD128()) { |
| return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
| } |
| |
| auto DstVT = DstTy.getSimpleVT(); |
| auto SrcVT = SrcTy.getSimpleVT(); |
| |
| if (I && I->hasOneUser()) { |
| auto *SingleUser = cast<Instruction>(*I->user_begin()); |
| int UserISD = TLI->InstructionOpcodeToISD(SingleUser->getOpcode()); |
| |
| // extmul_low support |
| if (UserISD == ISD::MUL && |
| (ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) { |
| // Free low extensions. |
| if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) || |
| (SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) || |
| (SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) { |
| return 0; |
| } |
| // Will require an additional extlow operation for the intermediate |
| // i16/i32 value. |
| if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) || |
| (SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) { |
| return 1; |
| } |
| } |
| } |
| |
| static constexpr TypeConversionCostTblEntry ConversionTbl[] = { |
| // extend_low |
| {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1}, |
| {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1}, |
| {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1}, |
| {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1}, |
| {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1}, |
| {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1}, |
| // 2 x extend_low |
| {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2}, |
| {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2}, |
| {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2}, |
| {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2}, |
| // extend_low, extend_high |
| {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2}, |
| {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2}, |
| {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2}, |
| {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2}, |
| {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2}, |
| {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2}, |
| // 2x extend_low, extend_high |
| {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 4}, |
| {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4}, |
| {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4}, |
| {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4}, |
| // shuffle |
| {ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2}, |
| {ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4}, |
| {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 2}, |
| {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 4}, |
| // narrow, and |
| {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2}, |
| {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2}, |
| // narrow, 2x and |
| {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3}, |
| // 3x narrow, 4x and |
| {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 7}, |
| {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7}, |
| // 7x narrow, 8x and |
| {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 15}, |
| // convert_i32x4 |
| {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, |
| {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1}, |
| {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, |
| {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1}, |
| // extend_low, convert |
| {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2}, |
| {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2}, |
| {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, |
| {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2}, |
| // extend_low x 2, convert |
| {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, |
| {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3}, |
| {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, |
| {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3}, |
| // several shuffles |
| {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, |
| {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10}, |
| {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 10}, |
      {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 10},
      // trunc_sat, const, and, 3x narrow
| {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 6}, |
| {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 6}, |
| {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 6}, |
| {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 6}, |
      // trunc_sat, const, and, narrow
| {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 4}, |
| {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 4}, |
| {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4}, |
| {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4}, |
| // 2x trunc_sat, const, 2x and, 3x narrow |
| {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 8}, |
| {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 8}, |
| // 2x trunc_sat, const, 2x and, narrow |
| {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 6}, |
| {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 6}, |
| }; |
| |
| if (const auto *Entry = |
| ConvertCostTableLookup(ConversionTbl, ISD, DstVT, SrcVT)) { |
| return Entry->Cost; |
| } |
| |
| return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); |
| } |
| |
| WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions |
| WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
| TTI::MemCmpExpansionOptions Options; |
| |
| Options.AllowOverlappingLoads = true; |
| |
| if (ST->hasSIMD128()) |
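    // With SIMD128 the expansion can also compare 16 bytes at a time using a
    // v128 load.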
| Options.LoadSizes.push_back(16); |
| |
| Options.LoadSizes.append({8, 4, 2, 1}); |
| Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
| Options.NumLoadsPerBlock = Options.MaxNumLoads; |
| |
| return Options; |
| } |
| |
| InstructionCost WebAssemblyTTIImpl::getMemoryOpCost( |
| unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace, |
| TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo, |
| const Instruction *I) const { |
| if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty)) { |
| return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, |
| CostKind); |
| } |
| |
| EVT VT = TLI->getValueType(DL, Ty, true); |
  // Type legalization can't handle structs.
| if (VT == MVT::Other) |
| return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, |
| CostKind); |
| |
| auto LT = getTypeLegalizationCost(Ty); |
| if (!LT.first.isValid()) |
| return InstructionCost::getInvalid(); |
| |
| int ISD = TLI->InstructionOpcodeToISD(Opcode); |
  unsigned Width = VT.getSizeInBits();
| if (ISD == ISD::LOAD) { |
| // 128-bit loads are a single instruction. 32-bit and 64-bit vector loads |
| // can be lowered to load32_zero and load64_zero respectively. Assume SIMD |
| // loads are twice as expensive as scalar. |
    switch (Width) {
| default: |
| break; |
| case 32: |
| case 64: |
| case 128: |
| return 2; |
| } |
| } else if (ISD == ISD::STORE) { |
| // For stores, we can use store lane operations. |
    switch (Width) {
| default: |
| break; |
| case 8: |
| case 16: |
| case 32: |
| case 64: |
| case 128: |
| return 2; |
| } |
| } |
| |
| return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind); |
| } |
| |
| InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost( |
| unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices, |
| Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
| bool UseMaskForCond, bool UseMaskForGaps) const { |
| assert(Factor >= 2 && "Invalid interleave factor"); |
| |
| auto *VecTy = cast<VectorType>(Ty); |
| if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) { |
| return InstructionCost::getInvalid(); |
| } |
| |
| if (UseMaskForCond || UseMaskForGaps) |
| return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices, |
| Alignment, AddressSpace, CostKind, |
| UseMaskForCond, UseMaskForGaps); |
| |
| constexpr unsigned MaxInterleaveFactor = 4; |
| if (Factor <= MaxInterleaveFactor) { |
| unsigned MinElts = VecTy->getElementCount().getKnownMinValue(); |
    // Ensure there are at least two elements and that they split evenly
    // across the interleave factor.
| if (MinElts < 2 || MinElts % Factor != 0) |
| return InstructionCost::getInvalid(); |
| |
| unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); |
| // Ensure the element type is legal. |
| if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) |
| return InstructionCost::getInvalid(); |
| |
| if (Factor != 2 && Factor != 4) |
| return InstructionCost::getInvalid(); |
| |
| auto *SubVecTy = |
| VectorType::get(VecTy->getElementType(), |
| VecTy->getElementCount().divideCoefficientBy(Factor)); |
| InstructionCost MemCost = |
| getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind); |
| |
| unsigned VecSize = DL.getTypeSizeInBits(SubVecTy); |
| unsigned MaxVecSize = 128; |
| unsigned NumAccesses = |
| std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize); |
| |
    // A stride of two is commonly supported via dedicated instructions, so it
    // should be relatively cheap for all element sizes. A stride of four is
    // more expensive as it will likely require more shuffles. Groups that
    // need two simd128 inputs are considered more expensive, so we mainly
    // account for the cost of shuffling two inputs (32 bytes), but we do
    // model 4 x v4i32 to enable arithmetic kernels.
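    // For example, the {2, MVT::v4i32, 2} entry below models interleaving
    // 2 x v4i32 as two shuffles, since the group spans two v128 registers.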
| static const CostTblEntry ShuffleCostTbl[] = { |
| // One reg. |
| {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 |
| {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 |
| {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 |
| {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 |
| {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 |
| {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 |
| |
| // Two regs. |
| {2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8 |
| {2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16 |
| {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 |
| |
| // One reg. |
| {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 |
| {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 |
| {4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16 |
| |
| // Two regs. |
| {4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8 |
| {4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16 |
| {4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32 |
| |
| // Four regs. |
| {4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32 |
| }; |
| |
| EVT ETy = TLI->getValueType(DL, SubVecTy); |
| if (const auto *Entry = |
| CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT())) |
| return Entry->Cost + (NumAccesses * MemCost); |
| } |
| |
| return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
| Alignment, AddressSpace, CostKind, |
| UseMaskForCond, UseMaskForGaps); |
| } |
| |
| InstructionCost WebAssemblyTTIImpl::getVectorInstrCost( |
| unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, |
| const Value *Op0, const Value *Op1) const { |
| InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost( |
| Opcode, Val, CostKind, Index, Op0, Op1); |
| |
| // SIMD128's insert/extract currently only take constant indices. |
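  // An Index of -1u means the lane index is not a compile-time constant, so
  // charge a large penalty for the dynamic-lane fallback.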
| if (Index == -1u) |
| return Cost + 25 * TargetTransformInfo::TCC_Expensive; |
| |
| return Cost; |
| } |
| |
| InstructionCost WebAssemblyTTIImpl::getPartialReductionCost( |
| unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, |
| ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, |
| TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp, |
| TTI::TargetCostKind CostKind) const { |
| InstructionCost Invalid = InstructionCost::getInvalid(); |
| if (!VF.isFixed() || !ST->hasSIMD128()) |
| return Invalid; |
| |
| if (CostKind != TTI::TCK_RecipThroughput) |
| return Invalid; |
| |
| if (Opcode != Instruction::Add) |
| return Invalid; |
| |
| EVT AccumEVT = EVT::getEVT(AccumType); |
| // TODO: Add i64 accumulator. |
| if (AccumEVT != MVT::i32) |
| return Invalid; |
| |
| // Possible options: |
| // - i16x8.extadd_pairwise_i8x16_sx |
| // - i32x4.extadd_pairwise_i16x8_sx |
| // - i32x4.dot_i16x8_s |
| // Only try to support dot, for now. |
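  // For example, a partial reduction of (sext <8 x i16>) * (sext <8 x i16>)
  // into a <4 x i32> accumulator can map onto i32x4.dot_i16x8_s.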
| |
| EVT InputEVT = EVT::getEVT(InputTypeA); |
| if (!((InputEVT == MVT::i16 && VF.getFixedValue() == 8) || |
| (InputEVT == MVT::i8 && VF.getFixedValue() == 16))) { |
| return Invalid; |
| } |
| |
| if (OpAExtend == TTI::PR_None) |
| return Invalid; |
| |
| InstructionCost Cost(TTI::TCC_Basic); |
| if (!BinOp) |
| return Cost; |
| |
| if (OpAExtend != OpBExtend) |
| return Invalid; |
| |
| if (*BinOp != Instruction::Mul) |
| return Invalid; |
| |
| if (InputTypeA != InputTypeB) |
| return Invalid; |
| |
  // Signed inputs can lower to dot.
| if (InputEVT == MVT::i16 && VF.getFixedValue() == 8) |
| return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * 2; |
| |
  // The i8 cases need extra extensions, roughly doubling the size of the
  // lowered sequence.
| if (InputEVT == MVT::i8 && VF.getFixedValue() == 16) |
| return OpAExtend == TTI::PR_SignExtend ? Cost * 2 : Cost * 4; |
| |
| return Invalid; |
| } |
| |
| TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle( |
| const IntrinsicInst *II) const { |
| |
| switch (II->getIntrinsicID()) { |
| default: |
| break; |
| case Intrinsic::vector_reduce_fadd: |
| return TTI::ReductionShuffle::Pairwise; |
| } |
| return TTI::ReductionShuffle::SplitHalf; |
| } |
| |
| void WebAssemblyTTIImpl::getUnrollingPreferences( |
| Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, |
| OptimizationRemarkEmitter *ORE) const { |
| // Scan the loop: don't unroll loops with calls. This is a standard approach |
| // for most (all?) targets. |
| for (BasicBlock *BB : L->blocks()) |
| for (Instruction &I : *BB) |
| if (isa<CallInst>(I) || isa<InvokeInst>(I)) |
| if (const Function *F = cast<CallBase>(I).getCalledFunction()) |
| if (isLoweredToCall(F)) |
| return; |
| |
| // The chosen threshold is within the range of 'LoopMicroOpBufferSize' of |
| // the various microarchitectures that use the BasicTTI implementation and |
| // has been selected through heuristics across multiple cores and runtimes. |
| UP.Partial = UP.Runtime = UP.UpperBound = true; |
| UP.PartialThreshold = 30; |
| |
| // Avoid unrolling when optimizing for size. |
| UP.OptSizeThreshold = 0; |
| UP.PartialOptSizeThreshold = 0; |
| |
| // Set number of instructions optimized when "back edge" |
| // becomes "fall through" to default value of 2. |
| UP.BEInsns = 2; |
| } |
| |
| bool WebAssemblyTTIImpl::supportsTailCalls() const { |
| return getST()->hasTailCall(); |
| } |
| |
| bool WebAssemblyTTIImpl::isProfitableToSinkOperands( |
| Instruction *I, SmallVectorImpl<Use *> &Ops) const { |
| using namespace llvm::PatternMatch; |
| |
| if (!I->getType()->isVectorTy() || !I->isShift()) |
| return false; |
| |
| Value *V = I->getOperand(1); |
  // We don't need to sink a constant splat.
| if (isa<Constant>(V)) |
| return false; |
| |
| if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()), |
| m_Value(), m_ZeroMask()))) { |
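    // V is a splat of a non-constant shift amount: an insertelement into lane
    // 0 followed by a zero-mask shuffle. Sinking both next to the shift lets
    // ISel use the scalar value directly, since SIMD128 shifts take a scalar
    // shift count.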
| // Sink insert |
| Ops.push_back(&cast<Instruction>(V)->getOperandUse(0)); |
| // Sink shuffle |
| Ops.push_back(&I->getOperandUse(1)); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /// Attempt to convert [relaxed_]swizzle to shufflevector if the mask is |
| /// constant. |
| static Value *simplifyWasmSwizzle(const IntrinsicInst &II, |
| InstCombiner::BuilderTy &Builder, |
| bool IsRelaxed) { |
| auto *V = dyn_cast<Constant>(II.getArgOperand(1)); |
| if (!V) |
| return nullptr; |
| |
| auto *VecTy = cast<FixedVectorType>(II.getType()); |
| unsigned NumElts = VecTy->getNumElements(); |
  assert(NumElts == 16 && "Expected a 16-lane swizzle mask");
| |
| // Construct a shuffle mask from constant integers or UNDEFs. |
| int Indexes[16]; |
| bool AnyOutOfBounds = false; |
| |
| for (unsigned I = 0; I < NumElts; ++I) { |
| Constant *COp = V->getAggregateElement(I); |
| if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) |
| return nullptr; |
| |
| if (isa<UndefValue>(COp)) { |
| Indexes[I] = -1; |
| continue; |
| } |
| |
| if (IsRelaxed && cast<ConstantInt>(COp)->getSExtValue() >= NumElts) { |
      // The relaxed_swizzle operation always returns 0 if the lane index is
      // less than 0 when interpreted as a signed value. For lane indices
      // above 15, however, it may return either 0 or the lane at
      // `Index % 16`, and that choice must be made consistently. As the
      // WebAssembly spec states:
| // |
| // "The result of relaxed operators are implementation-dependent, because |
| // the set of possible results may depend on properties of the host |
| // environment, such as its hardware. Technically, their behaviour is |
| // controlled by a set of global parameters to the semantics that an |
| // implementation can instantiate in different ways. These choices are |
| // fixed, that is, parameters are constant during the execution of any |
| // given program." |
| // |
| // The WebAssembly runtime may choose differently from us, so we can't |
| // optimize a relaxed swizzle with lane indices above 15. |
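      // For example, a constant lane index of 20 may yield either 0 or
      // lane 4 (20 % 16) at run time, so the intrinsic must be kept.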
| return nullptr; |
| } |
| |
| uint64_t Index = cast<ConstantInt>(COp)->getZExtValue(); |
| if (Index >= NumElts) { |
| AnyOutOfBounds = true; |
| // If there are out-of-bounds indices, the swizzle instruction returns |
| // zeroes in those lanes. We'll provide an all-zeroes vector as the |
| // second argument to shufflevector and read the first element from it. |
| Indexes[I] = NumElts; |
| continue; |
| } |
| |
| Indexes[I] = Index; |
| } |
| |
| auto *V1 = II.getArgOperand(0); |
| auto *V2 = |
| AnyOutOfBounds ? Constant::getNullValue(VecTy) : PoisonValue::get(VecTy); |
| |
| return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes, NumElts)); |
| } |
| |
| std::optional<Instruction *> |
| WebAssemblyTTIImpl::instCombineIntrinsic(InstCombiner &IC, |
| IntrinsicInst &II) const { |
| Intrinsic::ID IID = II.getIntrinsicID(); |
| switch (IID) { |
| case Intrinsic::wasm_swizzle: |
| case Intrinsic::wasm_relaxed_swizzle: |
| if (Value *V = simplifyWasmSwizzle( |
| II, IC.Builder, IID == Intrinsic::wasm_relaxed_swizzle)) { |
| return IC.replaceInstUsesWith(II, V); |
| } |
| break; |
| } |
| |
| return std::nullopt; |
| } |