//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file defines the WebAssembly-specific TargetTransformInfo
/// implementation.
///
//===----------------------------------------------------------------------===//

#include "WebAssemblyTargetTransformInfo.h"

#include "llvm/CodeGen/CostTable.h"

using namespace llvm;

#define DEBUG_TYPE "wasmtti"

TargetTransformInfo::PopcntSupportKind
WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
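  // Wasm has single-instruction popcount (i32.popcnt / i64.popcnt), so fast
  // hardware support is a reasonable answer here.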
return TargetTransformInfo::PSK_FastHardware;
}

unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
unsigned Result = BaseT::getNumberOfRegisters(ClassID);
// For SIMD, use at least 16 registers, as a rough guess.
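  // (In the default TTI register-class numbering, class 0 is scalar and
  // class 1 is vector, hence the check below.)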
bool Vector = (ClassID == 1);
if (Vector)
Result = std::max(Result, 16u);
return Result;
}

TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
TargetTransformInfo::RegisterKind K) const {
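  // Wasm scalars are at most 64 bits wide (i64 / f64), and the SIMD proposal
  // adds a single 128-bit v128 type; there are no scalable vectors.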
switch (K) {
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(64);
case TargetTransformInfo::RGK_FixedWidthVector:
return TypeSize::getFixed(getST()->hasSIMD128() ? 128 : 64);
case TargetTransformInfo::RGK_ScalableVector:
return TypeSize::getScalable(0);
}
llvm_unreachable("Unsupported register kind");
}

InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
ArrayRef<const Value *> Args, const Instruction *CxtI) const {
InstructionCost Cost =
BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
Opcode, Ty, CostKind, Op1Info, Op2Info);
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
switch (Opcode) {
case Instruction::LShr:
case Instruction::AShr:
case Instruction::Shl:
// SIMD128's shifts currently only accept a scalar shift count. For each
// element, we'll need to extract, op, insert. The following is a rough
// approximation.
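      // Illustrative expansion for a non-uniform <4 x i32> shift (the exact
      // lowering may differ): for each lane, extract the value and its shift
      // count, do a scalar i32.shl/shr, and write the result back with
      // replace_lane. That is roughly what the estimate below models.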
if (!Op2Info.isUniform())
Cost =
cast<FixedVectorType>(VTy)->getNumElements() *
(TargetTransformInfo::TCC_Basic +
getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind) +
TargetTransformInfo::TCC_Basic);
break;
}
}
return Cost;
}

InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind, const Instruction *I) const {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
auto SrcTy = TLI->getValueType(DL, Src);
auto DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple()) {
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
if (!ST->hasSIMD128()) {
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
auto DstVT = DstTy.getSimpleVT();
auto SrcVT = SrcTy.getSimpleVT();
if (I && I->hasOneUser()) {
auto *SingleUser = cast<Instruction>(*I->user_begin());
int UserISD = TLI->InstructionOpcodeToISD(SingleUser->getOpcode());
// extmul_low support
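    // If the extend's only user is a multiply, the pair can typically be
    // selected as a single extending multiply, e.g. (illustrative IR):
    //   %a = sext <4 x i16> %x to <4 x i32>
    //   %b = sext <4 x i16> %y to <4 x i32>
    //   %m = mul <4 x i32> %a, %b      ; -> i32x4.extmul_low_i16x8_s
    // so the extend itself is modeled as free (or nearly free) below.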
if (UserISD == ISD::MUL &&
(ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) {
// Free low extensions.
if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) ||
(SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) ||
(SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) {
return 0;
}
      // Will require an additional extend_low operation for the intermediate
      // i16/i32 value.
if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) ||
(SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) {
return 1;
}
}
}
// extend_low
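  // These map onto the extend_low family (e.g. i16x8.extend_low_i8x16_s,
  // i32x4.extend_low_i16x8_u); the 2-cost entries roughly model two chained
  // extends, e.g. v4i8 -> v4i16 -> v4i32.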
static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
{ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
{ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
{ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
{ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
{ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
{ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
};
if (const auto *Entry =
ConvertCostTableLookup(ConversionTbl, ISD, DstVT, SrcVT)) {
return Entry->Cost;
}
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
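  // Overlapping (and so potentially misaligned) loads are fine on wasm, where
  // alignment is only a hint, and they let the expansion cover awkward sizes
  // with fewer loads.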
Options.AllowOverlappingLoads = true;
if (ST->hasSIMD128())
Options.LoadSizes.push_back(16);
Options.LoadSizes.append({8, 4, 2, 1});
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
Options.NumLoadsPerBlock = Options.MaxNumLoads;
return Options;
}

InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
const Instruction *I) const {
if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty)) {
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
CostKind);
}
EVT VT = TLI->getValueType(DL, Ty, true);
// Type legalization can't handle structs
if (VT == MVT::Other)
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
CostKind);
auto LT = getTypeLegalizationCost(Ty);
if (!LT.first.isValid())
return InstructionCost::getInvalid();
int ISD = TLI->InstructionOpcodeToISD(Opcode);
unsigned width = VT.getSizeInBits();
if (ISD == ISD::LOAD) {
// 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
// can be lowered to load32_zero and load64_zero respectively. Assume SIMD
// loads are twice as expensive as scalar.
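    // e.g. a 64-bit vector load can be selected as v128.load64_zero, which
    // fills the low 64 bits and zeroes the upper lanes.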
switch (width) {
default:
break;
case 32:
case 64:
case 128:
return 2;
}
} else if (ISD == ISD::STORE) {
// For stores, we can use store lane operations.
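    // e.g. v128.store for the full 128 bits and v128.store{8,16,32,64}_lane
    // for the narrower widths (illustrative; exact selection may vary).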
switch (width) {
default:
break;
case 8:
case 16:
case 32:
case 64:
case 128:
return 2;
}
}
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
}

InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) const {
assert(Factor >= 2 && "Invalid interleave factor");
auto *VecTy = cast<VectorType>(Ty);
if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
return InstructionCost::getInvalid();
}
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
constexpr unsigned MaxInterleaveFactor = 4;
if (Factor <= MaxInterleaveFactor) {
unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
    // Ensure there are at least two elements and that the element count is a
    // multiple of the interleave factor.
if (MinElts < 2 || MinElts % Factor != 0)
return InstructionCost::getInvalid();
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
// Ensure the element type is legal.
if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
return InstructionCost::getInvalid();
auto *SubVecTy =
VectorType::get(VecTy->getElementType(),
VecTy->getElementCount().divideCoefficientBy(Factor));
InstructionCost MemCost =
getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);
unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
unsigned MaxVecSize = 128;
unsigned NumAccesses =
std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
    // A stride of two is commonly supported via dedicated instructions, so it
    // should be relatively cheap for all element sizes. A stride of four is
    // more expensive, as it will likely require more shuffles. Cases that
    // combine two simd128 inputs are costed mainly for shuffling those two
    // inputs (32 bytes), but 4 x v4i32 is still modeled to enable arithmetic
    // kernels.
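    // For example, interleaving two v4i16 inputs into a single v8i16 result
    // takes one i8x16.shuffle, while the two-register results (e.g. 2 x v8i16)
    // need roughly one shuffle per 128-bit output, which the table below
    // approximates.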
static const CostTblEntry ShuffleCostTbl[] = {
// One reg.
{2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8
{2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8
{2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8
{2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
{2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
{2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32
// Two regs.
{2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
{2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
{2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32
// One reg.
{4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8
{4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8
{4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16
// Two regs.
{4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
{4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
{4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
// Four regs.
{4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
};
EVT ETy = TLI->getValueType(DL, SubVecTy);
if (const auto *Entry =
CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
return Entry->Cost + (NumAccesses * MemCost);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
}

InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
const Value *Op0, const Value *Op1) const {
InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
Opcode, Val, CostKind, Index, Op0, Op1);
// SIMD128's insert/extract currently only take constant indices.
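  // A variable index therefore typically has to be lowered through a stack
  // temporary, hence the large extra penalty when the index is unknown.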
if (Index == -1u)
return Cost + 25 * TargetTransformInfo::TCC_Expensive;
return Cost;
}

InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const {
InstructionCost Invalid = InstructionCost::getInvalid();
if (!VF.isFixed() || !ST->hasSIMD128())
return Invalid;
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;
InstructionCost Cost(TTI::TCC_Basic);
// Possible options:
// - i16x8.extadd_pairwise_i8x16_sx
// - i32x4.extadd_pairwise_i16x8_sx
// - i32x4.dot_i16x8_s
// Only try to support dot, for now.
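  // The shape handled here is a partial reduction of ext(<8 x i16>) *
  // ext(<8 x i16>) with an i32 accumulator: the signed case maps onto
  // i32x4.dot_i16x8_s, while unsigned or i8 inputs are modeled below as a
  // roughly 2x-4x longer sequence.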
if (Opcode != Instruction::Add)
return Invalid;
if (!BinOp || *BinOp != Instruction::Mul)
return Invalid;
if (InputTypeA != InputTypeB)
return Invalid;
if (OpAExtend != OpBExtend)
return Invalid;
EVT InputEVT = EVT::getEVT(InputTypeA);
EVT AccumEVT = EVT::getEVT(AccumType);
// TODO: Add i64 accumulator.
if (AccumEVT != MVT::i32)
return Invalid;
// Signed inputs can lower to dot
if (InputEVT == MVT::i16 && VF.getFixedValue() == 8)
return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * 2;
// Double the size of the lowered sequence.
if (InputEVT == MVT::i8 && VF.getFixedValue() == 16)
return OpAExtend == TTI::PR_SignExtend ? Cost * 2 : Cost * 4;
return Invalid;
}

TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
const IntrinsicInst *II) const {
switch (II->getIntrinsicID()) {
default:
break;
case Intrinsic::vector_reduce_fadd:
return TTI::ReductionShuffle::Pairwise;
}
return TTI::ReductionShuffle::SplitHalf;
}

void WebAssemblyTTIImpl::getUnrollingPreferences(
Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) const {
// Scan the loop: don't unroll loops with calls. This is a standard approach
// for most (all?) targets.
for (BasicBlock *BB : L->blocks())
for (Instruction &I : *BB)
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
if (isLoweredToCall(F))
return;
// The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
// the various microarchitectures that use the BasicTTI implementation and
// has been selected through heuristics across multiple cores and runtimes.
UP.Partial = UP.Runtime = UP.UpperBound = true;
UP.PartialThreshold = 30;
// Avoid unrolling when optimizing for size.
UP.OptSizeThreshold = 0;
UP.PartialOptSizeThreshold = 0;
  // Set the number of instructions optimized away when the "back edge"
  // becomes a "fall through" to the default value of 2.
UP.BEInsns = 2;
}

bool WebAssemblyTTIImpl::supportsTailCalls() const {
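  // Wasm tail calls lower to return_call / return_call_indirect and are gated
  // on the tail-call target feature.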
return getST()->hasTailCall();
}

bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
Instruction *I, SmallVectorImpl<Use *> &Ops) const {
using namespace llvm::PatternMatch;
if (!I->getType()->isVectorTy() || !I->isShift())
return false;
Value *V = I->getOperand(1);
  // We don't need to sink a constant splat.
if (isa<Constant>(V))
return false;
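  // Match a splatted shift amount built as an insertelement into lane 0
  // followed by a zero-mask shuffle, e.g. (illustrative IR):
  //   %ins = insertelement <4 x i32> poison, i32 %amt, i32 0
  //   %cnt = shufflevector <4 x i32> %ins, <4 x i32> poison,
  //                        <4 x i32> zeroinitializer
  //   %res = shl <4 x i32> %v, %cnt
  // Sinking the insert and the shuffle next to the shift lets instruction
  // selection use the SIMD shift's scalar count operand directly.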
if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
m_Value(), m_ZeroMask()))) {
// Sink insert
Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
// Sink shuffle
Ops.push_back(&I->getOperandUse(1));
return true;
}
return false;
}