//===-- WebAssemblyTargetTransformInfo.cpp - WebAssembly-specific TTI -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file defines the WebAssembly-specific TargetTransformInfo
/// implementation.
///
//===----------------------------------------------------------------------===//

#include "WebAssemblyTargetTransformInfo.h"

#include "llvm/CodeGen/CostTable.h"

using namespace llvm;

#define DEBUG_TYPE "wasmtti"

TargetTransformInfo::PopcntSupportKind
WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
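  // Wasm has single-instruction popcount (i32.popcnt / i64.popcnt), so fast
  // hardware support is a reasonable answer here.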
return TargetTransformInfo::PSK_FastHardware;
}

unsigned WebAssemblyTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
unsigned Result = BaseT::getNumberOfRegisters(ClassID);
// For SIMD, use at least 16 registers, as a rough guess.
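  // (In the default TTI register-class numbering, class 0 is scalar and
  // class 1 is vector, hence the check below.)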
bool Vector = (ClassID == 1);
if (Vector)
Result = std::max(Result, 16u);
return Result;
}

TypeSize WebAssemblyTTIImpl::getRegisterBitWidth(
TargetTransformInfo::RegisterKind K) const {
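  // Wasm scalars are at most 64 bits wide (i64 / f64), and the SIMD proposal
  // adds a single 128-bit v128 type; there are no scalable vectors.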
switch (K) {
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(64);
case TargetTransformInfo::RGK_FixedWidthVector:
return TypeSize::getFixed(getST()->hasSIMD128() ? 128 : 64);
case TargetTransformInfo::RGK_ScalableVector:
return TypeSize::getScalable(0);
}
llvm_unreachable("Unsupported register kind");
}

InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
ArrayRef<const Value *> Args, const Instruction *CxtI) const {
InstructionCost Cost =
BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
Opcode, Ty, CostKind, Op1Info, Op2Info);
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
switch (Opcode) {
case Instruction::LShr:
case Instruction::AShr:
case Instruction::Shl:
// SIMD128's shifts currently only accept a scalar shift count. For each
// element, we'll need to extract, op, insert. The following is a rough
// approximation.
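      // Illustrative expansion for a non-uniform <4 x i32> shift (the exact
      // lowering may differ): for each lane, extract the value and its shift
      // count, do a scalar i32.shl/shr, and write the result back with
      // replace_lane. That is roughly what the estimate below models.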
if (!Op2Info.isUniform())
Cost =
cast<FixedVectorType>(VTy)->getNumElements() *
(TargetTransformInfo::TCC_Basic +
getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind) +
TargetTransformInfo::TCC_Basic);
break;
}
}
return Cost;
}

InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind, const Instruction *I) const {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
auto SrcTy = TLI->getValueType(DL, Src);
auto DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple()) {
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
if (!ST->hasSIMD128()) {
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}
auto DstVT = DstTy.getSimpleVT();
auto SrcVT = SrcTy.getSimpleVT();
if (I && I->hasOneUser()) {
auto *SingleUser = cast<Instruction>(*I->user_begin());
int UserISD = TLI->InstructionOpcodeToISD(SingleUser->getOpcode());
// extmul_low support
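    // If the extend's only user is a multiply, the pair can typically be
    // selected as a single extending multiply, e.g. (illustrative IR):
    //   %a = sext <4 x i16> %x to <4 x i32>
    //   %b = sext <4 x i16> %y to <4 x i32>
    //   %m = mul <4 x i32> %a, %b      ; -> i32x4.extmul_low_i16x8_s
    // so the extend itself is modeled as free (or nearly free) below.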
if (UserISD == ISD::MUL &&
(ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND)) {
// Free low extensions.
if ((SrcVT == MVT::v8i8 && DstVT == MVT::v8i16) ||
(SrcVT == MVT::v4i16 && DstVT == MVT::v4i32) ||
(SrcVT == MVT::v2i32 && DstVT == MVT::v2i64)) {
return 0;
}
      // Will require an additional extend_low operation for the intermediate
      // i16/i32 value.
if ((SrcVT == MVT::v4i8 && DstVT == MVT::v4i32) ||
(SrcVT == MVT::v2i16 && DstVT == MVT::v2i64)) {
return 1;
}
}
}
// extend_low
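  // These map onto the extend_low family (e.g. i16x8.extend_low_i8x16_s,
  // i32x4.extend_low_i16x8_u); the 2-cost entries roughly model two chained
  // extends, e.g. v4i8 -> v4i16 -> v4i32.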
static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
{ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
{ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
{ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
{ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
{ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
{ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
};
if (const auto *Entry =
ConvertCostTableLookup(ConversionTbl, ISD, DstVT, SrcVT)) {
return Entry->Cost;
}
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
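  // Overlapping (and so potentially misaligned) loads are fine on wasm, where
  // alignment is only a hint, and they let the expansion cover awkward sizes
  // with fewer loads.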
Options.AllowOverlappingLoads = true;
if (ST->hasSIMD128())
Options.LoadSizes.push_back(16);
Options.LoadSizes.append({8, 4, 2, 1});
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
Options.NumLoadsPerBlock = Options.MaxNumLoads;
return Options;
}

InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
const Instruction *I) const {
if (!ST->hasSIMD128() || !isa<FixedVectorType>(Ty)) {
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
CostKind);
}
EVT VT = TLI->getValueType(DL, Ty, true);
// Type legalization can't handle structs
if (VT == MVT::Other)
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
CostKind);
auto LT = getTypeLegalizationCost(Ty);
if (!LT.first.isValid())
return InstructionCost::getInvalid();
int ISD = TLI->InstructionOpcodeToISD(Opcode);
unsigned width = VT.getSizeInBits();
if (ISD == ISD::LOAD) {
// 128-bit loads are a single instruction. 32-bit and 64-bit vector loads
// can be lowered to load32_zero and load64_zero respectively. Assume SIMD
// loads are twice as expensive as scalar.
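    // e.g. a 64-bit vector load can be selected as v128.load64_zero, which
    // fills the low 64 bits and zeroes the upper lanes.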
switch (width) {
default:
break;
case 32:
case 64:
case 128:
return 2;
}
} else if (ISD == ISD::STORE) {
// For stores, we can use store lane operations.
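    // e.g. v128.store for the full 128 bits and v128.store{8,16,32,64}_lane
    // for the narrower widths (illustrative; exact selection may vary).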
switch (width) {
default:
break;
case 8:
case 16:
case 32:
case 64:
case 128:
return 2;
}
}
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind);
}

InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *Ty, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond, bool UseMaskForGaps) const {
assert(Factor >= 2 && "Invalid interleave factor");
auto *VecTy = cast<VectorType>(Ty);
if (!ST->hasSIMD128() || !isa<FixedVectorType>(VecTy)) {
return InstructionCost::getInvalid();
}
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, Ty, Factor, Indices,
Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
constexpr unsigned MaxInterleaveFactor = 4;
if (Factor <= MaxInterleaveFactor) {
unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
    // Ensure there are at least two elements and that the element count is a
    // multiple of the interleave factor.
if (MinElts < 2 || MinElts % Factor != 0)
return InstructionCost::getInvalid();
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
// Ensure the element type is legal.
if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
return InstructionCost::getInvalid();
auto *SubVecTy =
VectorType::get(VecTy->getElementType(),
VecTy->getElementCount().divideCoefficientBy(Factor));
InstructionCost MemCost =
getMemoryOpCost(Opcode, SubVecTy, Alignment, AddressSpace, CostKind);
unsigned VecSize = DL.getTypeSizeInBits(SubVecTy);
unsigned MaxVecSize = 128;
unsigned NumAccesses =
std::max<unsigned>(1, (MinElts * ElSize + MaxVecSize - 1) / VecSize);
    // A stride of two is commonly supported via dedicated instructions, so it
    // should be relatively cheap for all element sizes. A stride of four is
    // more expensive, as it will likely require more shuffles. Cases that
    // combine two simd128 inputs are costed mainly for shuffling those two
    // inputs (32 bytes), but 4 x v4i32 is still modeled to enable arithmetic
    // kernels.
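    // For example, interleaving two v4i16 inputs into a single v8i16 result
    // takes one i8x16.shuffle, while the two-register results (e.g. 2 x v8i16)
    // need roughly one shuffle per 128-bit output, which the table below
    // approximates.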
static const CostTblEntry ShuffleCostTbl[] = {
// One reg.
{2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8
{2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8
{2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8
{2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16
{2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16
{2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32
// Two regs.
{2, MVT::v16i8, 2}, // interleave 2 x 16i8 into 32i8
{2, MVT::v8i16, 2}, // interleave 2 x 8i16 into 16i16
{2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32
// One reg.
{4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8
{4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8
{4, MVT::v2i16, 4}, // interleave 4 x 2i16 into 8i16
// Two regs.
{4, MVT::v8i8, 16}, // interleave 4 x 8i8 into 32i8
{4, MVT::v4i16, 8}, // interleave 4 x 4i16 into 16i16
{4, MVT::v2i32, 4}, // interleave 4 x 2i32 into 8i32
// Four regs.
{4, MVT::v4i32, 16}, // interleave 4 x 4i32 into 16i32
};
EVT ETy = TLI->getValueType(DL, SubVecTy);
if (const auto *Entry =
CostTableLookup(ShuffleCostTbl, Factor, ETy.getSimpleVT()))
return Entry->Cost + (NumAccesses * MemCost);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
}

InstructionCost WebAssemblyTTIImpl::getVectorInstrCost(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
const Value *Op0, const Value *Op1) const {
InstructionCost Cost = BasicTTIImplBase::getVectorInstrCost(
Opcode, Val, CostKind, Index, Op0, Op1);
// SIMD128's insert/extract currently only take constant indices.
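  // A variable index therefore typically has to be lowered through a stack
  // temporary, hence the large extra penalty when the index is unknown.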
if (Index == -1u)
return Cost + 25 * TargetTransformInfo::TCC_Expensive;
return Cost;
}

InstructionCost WebAssemblyTTIImpl::getPartialReductionCost(
unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const {
InstructionCost Invalid = InstructionCost::getInvalid();
if (!VF.isFixed() || !ST->hasSIMD128())
return Invalid;
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;
InstructionCost Cost(TTI::TCC_Basic);
// Possible options:
// - i16x8.extadd_pairwise_i8x16_sx
// - i32x4.extadd_pairwise_i16x8_sx
// - i32x4.dot_i16x8_s
// Only try to support dot, for now.
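  // The shape handled here is a partial reduction of ext(<8 x i16>) *
  // ext(<8 x i16>) with an i32 accumulator: the signed case maps onto
  // i32x4.dot_i16x8_s, while unsigned or i8 inputs are modeled below as a
  // roughly 2x-4x longer sequence.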
if (Opcode != Instruction::Add)
return Invalid;
if (!BinOp || *BinOp != Instruction::Mul)
return Invalid;
if (InputTypeA != InputTypeB)
return Invalid;
if (OpAExtend != OpBExtend)
return Invalid;
EVT InputEVT = EVT::getEVT(InputTypeA);
EVT AccumEVT = EVT::getEVT(AccumType);
// TODO: Add i64 accumulator.
if (AccumEVT != MVT::i32)
return Invalid;
// Signed inputs can lower to dot
if (InputEVT == MVT::i16 && VF.getFixedValue() == 8)
return OpAExtend == TTI::PR_SignExtend ? Cost : Cost * 2;
// Double the size of the lowered sequence.
if (InputEVT == MVT::i8 && VF.getFixedValue() == 16)
return OpAExtend == TTI::PR_SignExtend ? Cost * 2 : Cost * 4;
return Invalid;
}

TTI::ReductionShuffle WebAssemblyTTIImpl::getPreferredExpandedReductionShuffle(
const IntrinsicInst *II) const {
switch (II->getIntrinsicID()) {
default:
break;
case Intrinsic::vector_reduce_fadd:
return TTI::ReductionShuffle::Pairwise;
}
return TTI::ReductionShuffle::SplitHalf;
}

void WebAssemblyTTIImpl::getUnrollingPreferences(
Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE) const {
// Scan the loop: don't unroll loops with calls. This is a standard approach
// for most (all?) targets.
for (BasicBlock *BB : L->blocks())
for (Instruction &I : *BB)
if (isa<CallInst>(I) || isa<InvokeInst>(I))
if (const Function *F = cast<CallBase>(I).getCalledFunction())
if (isLoweredToCall(F))
return;
// The chosen threshold is within the range of 'LoopMicroOpBufferSize' of
// the various microarchitectures that use the BasicTTI implementation and
// has been selected through heuristics across multiple cores and runtimes.
UP.Partial = UP.Runtime = UP.UpperBound = true;
UP.PartialThreshold = 30;
// Avoid unrolling when optimizing for size.
UP.OptSizeThreshold = 0;
UP.PartialOptSizeThreshold = 0;
  // Set the number of instructions optimized away when the "back edge"
  // becomes a "fall through" to the default value of 2.
UP.BEInsns = 2;
}

bool WebAssemblyTTIImpl::supportsTailCalls() const {
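  // Wasm tail calls lower to return_call / return_call_indirect and are gated
  // on the tail-call target feature.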
return getST()->hasTailCall();
}

bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
Instruction *I, SmallVectorImpl<Use *> &Ops) const {
using namespace llvm::PatternMatch;
if (!I->getType()->isVectorTy() || !I->isShift())
return false;
Value *V = I->getOperand(1);
  // We don't need to sink a constant splat.
if (isa<Constant>(V))
return false;
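  // Match a splatted shift amount built as an insertelement into lane 0
  // followed by a zero-mask shuffle, e.g. (illustrative IR):
  //   %ins = insertelement <4 x i32> poison, i32 %amt, i32 0
  //   %cnt = shufflevector <4 x i32> %ins, <4 x i32> poison,
  //                        <4 x i32> zeroinitializer
  //   %res = shl <4 x i32> %v, %cnt
  // Sinking the insert and the shuffle next to the shift lets instruction
  // selection use the SIMD shift's scalar count operand directly.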
if (match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
m_Value(), m_ZeroMask()))) {
// Sink insert
Ops.push_back(&cast<Instruction>(V)->getOperandUse(0));
// Sink shuffle
Ops.push_back(&I->getOperandUse(1));
return true;
}
return false;
}