//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// A note about the cost model numbers used below: they correspond to a
/// "generic" X86 CPU rather than a specific CPU model. Usually the numbers
/// correspond to the CPU where the
/// feature first appeared. For example, if we do Subtarget.hasSSE42() in
/// the lookups below the cost is based on Nehalem as that was the first CPU
/// to support that feature level and thus has most likely the worst case cost,
/// although we may discard an outlying worst cost from one CPU (e.g. Atom).
///
/// Some examples of other technologies/CPUs:
/// SSE 3 - Pentium4 / Athlon64
/// SSE 4.1 - Penryn
/// SSE 4.2 - Nehalem / Silvermont
/// AVX - Sandy Bridge / Jaguar / Bulldozer
/// AVX2 - Haswell / Ryzen
/// AVX-512 - Xeon Phi / Skylake
///
/// And some examples of instruction target dependent costs (latency)
///                    divss    sqrtss    rsqrtss
///   AMD K7           11-16    19        3
///   Piledriver        9-24    13-15     5
///   Jaguar           14       16        2
///   Pentium II,III   18       30        2
///   Nehalem           7-14     7-18     3
///   Haswell          10-13    11        5
///
/// Interpreting the 4 TargetCostKind types:
/// TCK_RecipThroughput and TCK_Latency should try to match the worst case
/// values reported by the CPU scheduler models (and llvm-mca).
/// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the
/// actual encoding size of the instruction.
/// TCK_SizeAndLatency should match the worst case micro-op counts reported by
/// the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values which are
/// often used as the cost thresholds where TCK_SizeAndLatency is requested.
//===----------------------------------------------------------------------===//
#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include <optional>
using namespace llvm;
#define DEBUG_TYPE "x86tti"
//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//
// Helper struct to store/access costs for each cost kind.
// TODO: Move this to allow other targets to use it?
struct CostKindCosts {
unsigned RecipThroughputCost = ~0U;
unsigned LatencyCost = ~0U;
unsigned CodeSizeCost = ~0U;
unsigned SizeAndLatencyCost = ~0U;
std::optional<unsigned>
operator[](TargetTransformInfo::TargetCostKind Kind) const {
unsigned Cost = ~0U;
switch (Kind) {
case TargetTransformInfo::TCK_RecipThroughput:
Cost = RecipThroughputCost;
break;
case TargetTransformInfo::TCK_Latency:
Cost = LatencyCost;
break;
case TargetTransformInfo::TCK_CodeSize:
Cost = CodeSizeCost;
break;
case TargetTransformInfo::TCK_SizeAndLatency:
Cost = SizeAndLatencyCost;
break;
}
if (Cost == ~0U)
return std::nullopt;
return Cost;
}
};
using CostKindTblEntry = CostTblEntryT<CostKindCosts>;
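// Note: the cost table entries below store per-kind costs in the order
// { RecipThroughput, Latency, CodeSize, SizeAndLatency }. Entries that list
// only a single value (e.g. { 6 }) leave the remaining kinds at their ~0U
// default, so the operator[] above returns std::nullopt for those kinds and
// the lookup falls through to the remaining tables below.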
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
// TODO: Currently the __builtin_popcount() implementation using SSE3
// instructions is inefficient. Once the problem is fixed, we should
// call ST->hasSSE3() instead of ST->hasPOPCNT().
return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}
std::optional<unsigned> X86TTIImpl::getCacheSize(
TargetTransformInfo::CacheLevel Level) const {
switch (Level) {
case TargetTransformInfo::CacheLevel::L1D:
// - Penryn
// - Nehalem
// - Westmere
// - Sandy Bridge
// - Ivy Bridge
// - Haswell
// - Broadwell
// - Skylake
// - Kabylake
return 32 * 1024; // 32 KByte
case TargetTransformInfo::CacheLevel::L2D:
// - Penryn
// - Nehalem
// - Westmere
// - Sandy Bridge
// - Ivy Bridge
// - Haswell
// - Broadwell
// - Skylake
// - Kabylake
return 256 * 1024; // 256 KByte
}
llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
std::optional<unsigned> X86TTIImpl::getCacheAssociativity(
TargetTransformInfo::CacheLevel Level) const {
// - Penryn
// - Nehalem
// - Westmere
// - Sandy Bridge
// - Ivy Bridge
// - Haswell
// - Broadwell
// - Skylake
// - Kabylake
switch (Level) {
case TargetTransformInfo::CacheLevel::L1D:
[[fallthrough]];
case TargetTransformInfo::CacheLevel::L2D:
return 8;
}
llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
bool Vector = (ClassID == 1);
if (Vector && !ST->hasSSE1())
return 0;
if (ST->is64Bit()) {
if (Vector && ST->hasAVX512())
return 32;
return 16;
}
return 8;
}
TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
unsigned PreferVectorWidth = ST->getPreferVectorWidth();
switch (K) {
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
case TargetTransformInfo::RGK_FixedWidthVector:
if (ST->hasAVX512() && PreferVectorWidth >= 512)
return TypeSize::getFixed(512);
if (ST->hasAVX() && PreferVectorWidth >= 256)
return TypeSize::getFixed(256);
if (ST->hasSSE1() && PreferVectorWidth >= 128)
return TypeSize::getFixed(128);
return TypeSize::getFixed(0);
case TargetTransformInfo::RGK_ScalableVector:
return TypeSize::getScalable(0);
}
llvm_unreachable("Unsupported register kind");
}
unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
.getFixedValue();
}
unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
// If the loop will not be vectorized, don't interleave the loop.
// Let regular unrolling handle the loop, which saves the overflow
// check and memory check cost.
if (VF.isScalar())
return 1;
if (ST->isAtom())
return 1;
// Sandybridge and Haswell have multiple execution ports and pipelined
// vector units.
if (ST->hasAVX())
return 4;
return 2;
}
InstructionCost X86TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
// vXi8 multiplications are always promoted to vXi16.
// Sub-128-bit types can be extended/packed more efficiently.
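// e.g. a <8 x i8> multiply is costed below as a zext to <8 x i16>, an
// <8 x i16> multiply, and a trunc back to <8 x i8>.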
if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) {
Type *WideVecTy =
VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
TargetTransformInfo::CastContextHint::None,
CostKind) +
getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
TargetTransformInfo::CastContextHint::None,
CostKind) +
getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info);
}
// Legalize the type.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
(LT.second.getScalarType() == MVT::i32 ||
LT.second.getScalarType() == MVT::i64)) {
// Check if the operands can be represented as a smaller datatype.
bool Op1Signed = false, Op2Signed = false;
unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
bool SignedMode = Op1Signed || Op2Signed;
// If both vXi32 are representable as i15 and at least one is constant,
// zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
// can treat this as PMADDWD which has the same costs as a vXi16 multiply.
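// e.g. a qualifying v4i32 multiply is costed as if it were a v8i16 multiply.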
if (OpMinSize <= 15 && !ST->isPMADDWDSlow() &&
LT.second.getScalarType() == MVT::i32) {
bool Op1Constant =
isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
bool Op2Constant =
isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
bool Op1Sext = isa<SExtInst>(Args[0]) &&
(Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
bool Op2Sext = isa<SExtInst>(Args[1]) &&
(Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
bool IsZeroExtended = !Op1Signed || !Op2Signed;
bool IsConstant = Op1Constant || Op2Constant;
bool IsSext = Op1Sext || Op2Sext;
if (IsConstant || IsZeroExtended || IsSext)
LT.second =
MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
}
// Check if the vXi32 operands can be shrunk into a smaller datatype.
// This should match the codegen from reduceVMULWidth.
// TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()).
if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) {
if (OpMinSize <= 7)
return LT.first * 3; // pmullw/sext
if (!SignedMode && OpMinSize <= 8)
return LT.first * 3; // pmullw/zext
if (OpMinSize <= 15)
return LT.first * 5; // pmullw/pmulhw/pshuf
if (!SignedMode && OpMinSize <= 16)
return LT.first * 5; // pmullw/pmulhw/pshuf
}
// If both vXi64 are representable as (unsigned) i32, then we can perform
// the multiply with a single PMULUDQ instruction.
// TODO: Add (SSE41+) PMULDQ handling for signed extensions.
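// e.g. a v2i64 multiply with both operands zero-extended from i32 is costed
// via the X86ISD::PMULUDQ entries in the tables below (a single pmuludq).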
if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64)
ISD = X86ISD::PMULUDQ;
}
// Vector multiply by pow2 will be simplified to shifts.
// Vector multiply by -pow2 will be simplified to shifts/negates.
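// e.g. x * 8 is costed as x << 3, and x * -8 as 0 - (x << 3).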
if (ISD == ISD::MUL && Op2Info.isConstant() &&
(Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) {
InstructionCost Cost =
getArithmeticInstrCost(Instruction::Shl, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
if (Op2Info.isNegatedPowerOf2())
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
return Cost;
}
// On X86, vector signed division by a constant power-of-two is normally
// expanded to the sequence SRA + SRL + ADD + SRA.
// The OperandValue properties may not be the same as that of the previous
// operation; conservatively assume OP_None.
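// e.g. for a 32-bit x, x s/ 4 expands to:
//   t = x >>s 31; t = t >>u 30; t = x + t; result = t >>s 2
// hence the 2*AShr + LShr + Add costs charged below.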
if ((ISD == ISD::SDIV || ISD == ISD::SREM) &&
Op2Info.isConstant() && Op2Info.isPowerOf2()) {
InstructionCost Cost =
2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
if (ISD == ISD::SREM) {
// For SREM: (X % C) is the equivalent of (X - (X/C)*C)
Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(),
Op2Info.getNoProps());
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info.getNoProps(),
Op2Info.getNoProps());
}
return Cost;
}
// Vector unsigned division/remainder will be simplified to shifts/masks.
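// e.g. x u/ 8 becomes x >> 3, and x u% 8 becomes x & 7.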
if ((ISD == ISD::UDIV || ISD == ISD::UREM) &&
Op2Info.isConstant() && Op2Info.isPowerOf2()) {
if (ISD == ISD::UDIV)
return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
// UREM
return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
}
static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
{ ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand.
{ ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand.
{ ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand.
{ ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand.
{ ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw
{ ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw
{ ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psraw
{ ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw
{ ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw
{ ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psraw
};
if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI())
if (const auto *Entry =
CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX512UniformConstCostTable[] = {
{ ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand.
{ ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand.
{ ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split.
{ ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split.
{ ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split.
{ ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld
{ ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld
{ ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad
{ ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld
{ ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld
{ ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad
{ ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq
{ ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq
{ ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq
{ ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq
{ ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq
{ ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq
{ ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq
{ ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence
{ ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence
{ ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence
};
if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512())
if (const auto *Entry =
CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX2UniformConstCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand.
{ ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand.
{ ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand.
{ ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw
{ ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw
{ ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw
{ ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw
{ ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw
{ ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw
{ ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
{ ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld
{ ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad
{ ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld
{ ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld
{ ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad
{ ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq
{ ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq
{ ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
{ ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq
{ ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq
{ ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split.
{ ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence
{ ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence
{ ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence
};
if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2())
if (const auto *Entry =
CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVXUniformConstCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand.
{ ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split.
{ ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split.
{ ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split.
{ ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw.
{ ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw.
{ ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw.
{ ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split.
{ ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split.
{ ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split.
{ ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld.
{ ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld.
{ ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad.
{ ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split.
{ ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split.
{ ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split.
{ ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq.
{ ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq.
{ ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle.
{ ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split.
{ ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psrlq + split.
{ ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split.
{ ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split.
{ ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split.
{ ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split.
};
// XOP has faster vXi8 shifts.
if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() &&
(!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
if (const auto *Entry =
CostTableLookup(AVXUniformConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry SSE2UniformConstCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
{ ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw.
{ ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw.
{ ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw.
{ ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld
{ ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld.
{ ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad.
{ ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq.
{ ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq.
{ ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence
{ ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence
{ ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence
};
// XOP has faster vXi8 shifts.
if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() &&
(!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
if (const auto *Entry =
CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX512BWConstCostTable[] = {
{ ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
{ ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence
{ ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence
{ ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence
{ ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence
};
if (Op2Info.isConstant() && ST->hasBWI())
if (const auto *Entry =
CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX512ConstCostTable[] = {
{ ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
{ ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence
{ ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence
{ ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence
{ ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence
{ ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence
{ ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence
{ ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence
};
if (Op2Info.isConstant() && ST->hasAVX512())
if (const auto *Entry =
CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX2ConstCostTable[] = {
{ ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
{ ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence
{ ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence
{ ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence
{ ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence
{ ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence
{ ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence
};
if (Op2Info.isConstant() && ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVXConstCostTable[] = {
{ ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
{ ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
{ ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split.
{ ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split.
{ ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split.
{ ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split.
{ ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split.
{ ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split.
{ ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence
{ ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split.
{ ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split.
};
if (Op2Info.isConstant() && ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry SSE41ConstCostTable[] = {
{ ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence
{ ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence
};
if (Op2Info.isConstant() && ST->hasSSE41())
if (const auto *Entry =
CostTableLookup(SSE41ConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry SSE2ConstCostTable[] = {
{ ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
{ ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence
{ ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence
{ ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence
{ ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence
{ ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence
{ ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence
};
if (Op2Info.isConstant() && ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX512BWUniformCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand.
{ ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
{ ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
{ ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand.
{ ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand.
{ ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw
{ ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw
{ ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psraw
};
if (ST->hasBWI() && Op2Info.isUniform())
if (const auto *Entry =
CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX512UniformCostTable[] = {
{ ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split.
{ ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split.
{ ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split.
{ ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld
{ ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld
{ ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad
{ ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq
{ ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq
{ ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq
{ ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq
{ ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq
{ ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq
{ ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq
};
if (ST->hasAVX512() && Op2Info.isUniform())
if (const auto *Entry =
CostTableLookup(AVX512UniformCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX2UniformCostTable[] = {
// Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand.
{ ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand.
{ ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand.
{ ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw.
{ ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw.
{ ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw.
{ ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw.
{ ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw.
{ ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw.
{ ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld
{ ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld
{ ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad
{ ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld
{ ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld
{ ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad
{ ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq
{ ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq
{ ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle.
{ ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq
{ ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq
{ ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle.
};
if (ST->hasAVX2() && Op2Info.isUniform())
if (const auto *Entry =
CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVXUniformCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand.
{ ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split.
{ ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split.
{ ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split.
{ ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw.
{ ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw.
{ ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw.
{ ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split.
{ ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split.
{ ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split.
{ ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld.
{ ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld.
{ ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad.
{ ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split.
{ ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split.
{ ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split.
{ ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq.
{ ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq.
{ ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle.
{ ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split.
{ ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split.
{ ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split.
};
// XOP has faster vXi8 shifts.
if (ST->hasAVX() && Op2Info.isUniform() &&
(!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
if (const auto *Entry =
CostTableLookup(AVXUniformCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry SSE2UniformCostTable[] = {
// Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand.
{ ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence.
{ ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw.
{ ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw.
{ ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw.
{ ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld
{ ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld.
{ ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad.
{ ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq.
{ ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq.
{ ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub.
};
if (ST->hasSSE2() && Op2Info.isUniform() &&
(!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8))
if (const auto *Entry =
CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX512DQCostTable[] = {
{ ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq
{ ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq
{ ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq
};
// Look for AVX512DQ lowering tricks for custom cases.
if (ST->hasDQI())
if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX512BWCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence.
{ ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence.
{ ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence.
{ ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence.
{ ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence.
{ ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence.
{ ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence.
{ ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence.
{ ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence.
{ ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw
{ ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw
{ ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw
{ ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw
{ ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw
{ ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw
{ ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw
{ ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw
{ ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw
{ ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb
{ ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw
{ ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb
{ ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw
{ ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd
{ ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq
{ ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb
{ ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw
{ ISD::MUL, MVT::v64i8, { 5, 10,10,11 } },
{ ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw
{ ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb
{ ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw
{ ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd
{ ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq
};
// Look for AVX512BW lowering tricks for custom cases.
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX512CostTable[] = {
{ ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence.
{ ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence.
{ ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence.
{ ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence.
{ ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } },
{ ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } },
{ ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } },
{ ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } },
{ ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } },
{ ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } },
{ ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } },
{ ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } },
{ ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } },
{ ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } },
{ ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } },
{ ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } },
{ ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } },
{ ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } },
{ ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } },
{ ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } },
{ ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } },
{ ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } },
{ ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split
{ ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split
{ ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split
{ ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split
{ ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } },
{ ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } },
{ ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } },
{ ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } },
{ ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } },
{ ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } },
{ ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } },
{ ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } },
{ ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } },
{ ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } },
{ ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } },
{ ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } },
{ ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add
{ ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/
{ X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } },
{ ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/
{ ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/
};
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX2ShiftCostTable[] = {
// Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
// custom in order to detect the cases where the shift amount is a scalar.
{ ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org)
{ ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
{ ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org)
{ ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org)
{ ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org)
{ ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org)
{ ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org)
{ ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org)
{ ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org)
{ ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org)
};
if (ST->hasAVX512()) {
if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant())
// On AVX512, a packed v32i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
}
// Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
Op2Info.isConstant())
// On AVX2, a packed v16i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
}
static const CostKindTblEntry XOPShiftCostTable[] = {
// 128bit shifts take 1cy, but right shifts require negation beforehand.
{ ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } },
{ ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } },
{ ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } },
{ ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } },
{ ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } },
{ ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } },
{ ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } },
{ ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } },
{ ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } },
{ ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } },
{ ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } },
{ ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } },
// 256bit shifts require splitting if AVX2 didn't catch them above.
{ ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } },
{ ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } },
{ ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } },
{ ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } },
{ ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } },
{ ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } },
{ ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } },
{ ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } },
{ ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } },
{ ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } },
{ ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } },
{ ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } },
};
// Look for XOP lowering tricks.
if (ST->hasXOP()) {
// If the right shift is constant then we'll fold the negation so
// it's as cheap as a left shift.
int ShiftISD = ISD;
if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant())
ShiftISD = ISD::SHL;
if (const auto *Entry =
CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
}
if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) {
MVT VT = LT.second;
// Vector shift left by a non-uniform constant can be lowered into a
// vector multiply.
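// e.g. shl <4 x i32> %x, <2, 3, 4, 5> is costed as a multiply by
// <4, 8, 16, 32>.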
if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
ISD = ISD::MUL;
}
static const CostKindTblEntry GLMCostTable[] = {
{ ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss
{ ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps
{ ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd
{ ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd
};
if (ST->useGLMDivSqrtCosts())
if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry SLMCostTable[] = {
{ ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld
{ ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw
{ ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd
{ ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss
{ ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd
{ ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps
{ ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss
{ ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps
{ ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd
{ ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd
{ ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd
{ ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd
// v2i64/v4i64 mul is custom lowered as a series of long
// multiplies(3), shifts(3) and adds(2).
// SLM muldq throughput is 2 and addq throughput is 4,
// thus: 3x2 (muldq throughput) + 3x1 (shift throughput) +
// 2x4 (addq throughput) = 17.
{ ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } },
// SLM addq/subq throughput is 4.
{ ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } },
{ ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } },
};
if (ST->useSLMArithCosts())
if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX2CostTable[] = {
{ ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence.
{ ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence.
{ ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence.
{ ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
{ ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence.
{ ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence.
{ ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence.
{ ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence.
{ ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence.
{ ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence.
{ ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence.
{ ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence.
{ ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb
{ ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb
{ ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw
{ ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw
{ ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd
{ ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd
{ ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq
{ ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq
{ ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack
{ ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw
{ ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw
{ ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld
{ ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld
{ ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add
{ ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add
{ X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } },
{ ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd
{ ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps
{ ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd
{ ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss
{ ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd
{ ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps
{ ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd
{ ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps
{ ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd
{ ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss
{ ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd
{ ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps
{ ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd
{ ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps
{ ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd
{ ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss
{ ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd
{ ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps
{ ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd
{ ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps
{ ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss
{ ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps
{ ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps
{ ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd
{ ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd
{ ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd
};
// Look for AVX2 lowering tricks for custom cases.
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry AVX1CostTable[] = {
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
{ ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split
{ ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split
{ ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split
{ ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
{ ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
{ ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
{ ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
{ ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
{ ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps
{ ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps
{ ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps
{ ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps
{ ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps
{ ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps
{ ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps
{ ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps
{ ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps
{ ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split
{ ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split
{ ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split
{ ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split
{ ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split
{ ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split
{ ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split
{ ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split
{ ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq
{ ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq
{ ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence.
{ ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split.
{ ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence.
{ ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split.
{ ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld
{ ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split
{ ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
{ ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
{ ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence.
{ ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split.
{ ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
{ ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
{ ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
{ ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
{ ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend.
{ ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split.
{ ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence.
{ ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split.
{ ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence.
{ ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split.
{ ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend.
{ ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split.
{ ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend.
{ ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split.
{ ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
{ ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/
{ ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
{ ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
{ ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
{ ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
{ ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
{ ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
{ ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
{ ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
{ ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
{ ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/
{ ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
{ ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/
{ ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
{ ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
{ ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
{ ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/
{ ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
{ ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/
{ ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/
};
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry SSE42CostTable[] = {
{ ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/
{ ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add
};
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry SSE41CostTable[] = {
{ ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence.
{ ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence.
{ ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld
{ ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence.
{ ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
{ ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
{ ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
{ ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence.
{ ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence.
{ ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence.
{ ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack
{ ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org)
};
if (ST->hasSSE41())
if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
{ ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence.
{ ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence.
{ ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq.
{ ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
{ ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence.
{ ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
{ ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
{ ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence.
{ ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence.
{ ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence.
{ ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence.
{ ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand
{ ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand
{ ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand
{ ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand
{ ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por
{ ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por
{ ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por
{ ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por
{ ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor
{ ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor
{ ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor
{ ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor
{ ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq
{ ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq
{ ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack
{ ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw
{ ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle
{ ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add
{ X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } },
{ ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
{ ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/
};
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry SSE1CostTable[] = {
{ ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/
{ ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
{ ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/
{ ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
{ ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
{ ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
{ ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/
{ ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
{ ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/
};
if (ST->hasSSE1())
if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
{ ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/
{ ISD::MUL, MVT::i64, { 2, 6, 1, 2 } },
};
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
{ ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
{ ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
{ ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/
{ ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/
{ ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/
{ ISD::MUL, MVT::i8, { 3, 4, 1, 1 } },
{ ISD::MUL, MVT::i16, { 2, 4, 1, 1 } },
{ ISD::MUL, MVT::i32, { 1, 4, 1, 1 } },
{ ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87)
{ ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
{ ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87)
{ ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87)
{ ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87)
};
if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spill regular
// registers. The overhead of division is going to dominate most kernels
// anyway, so try hard to prevent vectorization of division - it is
// generally a bad idea. Assume somewhat arbitrarily that we have to be able
// to hide "20 cycles" for each lane.
if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() &&
(ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV ||
ISD == ISD::UREM)) {
InstructionCost ScalarCost =
getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
}
// Handle some basic single instruction code size cases.
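// These all typically lower to one instruction per legalized register, so
// LT.first (the number of legalized registers) is a reasonable code-size
// estimate.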
if (CostKind == TTI::TCK_CodeSize) {
switch (ISD) {
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
case ISD::FDIV:
case ISD::FNEG:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
return LT.first;
}
}
// Fallback to the default implementation.
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
Args, CxtI);
}
InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *BaseTp,
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind,
int Index, VectorType *SubTp,
ArrayRef<const Value *> Args) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(BaseTp);
Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
if (Kind == TTI::SK_Transpose)
Kind = TTI::SK_PermuteTwoSrc;
// For Broadcasts we are splatting the first element from the first input
// register, so we only need to reference that input and all the output
// registers are the same.
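// E.g. an illegal <16 x i32> broadcast that legalizes to two <8 x i32>
// registers is charged for a single splat of the legalized type rather than
// one per destination register, since every destination holds the same value.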
if (Kind == TTI::SK_Broadcast)
LT.first = 1;
// Subvector extractions are free if they start at the beginning of a
// vector and cheap if the subvectors are aligned.
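// E.g. under SSE an <8 x float> source legalizes to two <4 x float>
// registers, so extracting a <4 x float> subvector at index 0 or index 4
// just selects one of the legalized registers and is treated as free.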
if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
int NumElts = LT.second.getVectorNumElements();
if ((Index % NumElts) == 0)
return 0;
std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
if (SubLT.second.isVector()) {
int NumSubElts = SubLT.second.getVectorNumElements();
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
return SubLT.first;
// Handle some cases for widening legalization. For now we only handle
// cases where the original subvector was naturally aligned and fits
// evenly in its legalized subvector type.
// FIXME: Remove some of the alignment restrictions.
// FIXME: We can use permq for 64-bit or larger extracts from 256-bit
// vectors.
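// E.g. (illustrative) extracting an illegal <2 x i32> subvector, which the
// legalizer widens to <4 x i32>, from an i32 source vector: we cost the
// lane-aligned <4 x i32> extract from the legalized source and add a single
// shuffle (pshufd, or pshufb with SSSE3) to move the wanted lanes into place.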
int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
(NumSubElts % OrigSubElts) == 0 &&
LT.second.getVectorElementType() ==
SubLT.second.getVectorElementType() &&
LT.second.getVectorElementType().getSizeInBits() ==
BaseTp->getElementType()->getPrimitiveSizeInBits()) {
assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
"Unexpected number of elements!");
auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
LT.second.getVectorNumElements());
auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
SubLT.second.getVectorNumElements());
int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
InstructionCost ExtractCost =
getShuffleCost(TTI::SK_ExtractSubvector, VecTy, std::nullopt,
CostKind, ExtractIndex, SubTy);
// If the original size is 32 bits or more, we can use pshufd. Otherwise,
// if we have SSSE3, we can use pshufb.
if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
return ExtractCost + 1; // pshufd or pshufb
assert(SubTp->getPrimitiveSizeInBits() == 16 &&
"Unexpected vector size");
return ExtractCost + 2; // worst case pshufhw + pshufd
}
}
}
// Subvector insertions are cheap if the subvectors are aligned.
// Note that in general, the insertion starting at the beginning of a vector
// isn't free, because we need to preserve the rest of the wide vector.
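// E.g. inserting a <4 x i32> subvector into an <8 x i32> vector at index 4
// is lane-aligned and costs only SubLT.first, whereas an insertion at index
// 2 is not aligned and is costed as a two-source permute below.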
if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
int NumElts = LT.second.getVectorNumElements();
std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
if (SubLT.second.isVector()) {
int NumSubElts = SubLT.second.getVectorNumElements();
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
return SubLT.first;
}
// If the insertion isn't aligned, treat it like a 2-op shuffle.
Kind = TTI::SK_PermuteTwoSrc;
}
// Handle some common (illegal) sub-vector types as they are often very cheap
// to shuffle even on targets without PSHUFB.
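// E.g. reversing a <4 x i16> only needs a single pshuflw, which the table
// below reflects.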
EVT VT = TLI->getValueType(DL, BaseTp);
if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
!ST->hasSSSE3()) {
static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
{TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
{TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
{TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
{TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
{TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
{TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
{TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
{TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
{TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq
{TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq
{TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq
{TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq
{TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
{TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
{TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
{TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
{TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
{TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
{TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
{TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
{TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
{TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
};
if (ST->hasSSE2())
if (const auto *Entry =
CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
return Entry->Cost;
}
// We are going to permute multiple sources and the result will be in multiple
// destinations. We provide an accurate cost only for splits where the element
// type remains the same.
if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
MVT LegalVT = LT.second;
if (LegalVT.isVector() &&
LegalVT.getVectorElementType().getSizeInBits() ==
BaseTp->getElementType()->getPrimitiveSizeInBits() &&
LegalVT.getVectorNumElements() <
cast<FixedVectorType>(BaseTp)->getNumElements()) {
unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
unsigned LegalVTSize = LegalVT.getStoreSize();
// Number of source vectors after legalization:
unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Number of destination vectors after legalization:
InstructionCost NumOfDests = LT.first;
auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
LegalVT.getVectorNumElements());
if (!Mask.empty() && NumOfDests.isValid()) {
// Try to perform better estimation of the permutation.
// 1. Split the source/destination vectors into real registers.
// 2. Do the mask analysis to identify which real registers are
// permuted. If more than one source register is used to build a
// destination register, the cost for this destination register is
// (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
// source register is used, build the mask and calculate the cost as a
// cost of PermuteSingleSrc.
// Also, for the single register permute we try to identify if the
// destination register is just a copy of the source register or the
// copy of the previous destination register (the cost is
// TTI::TCC_Basic). If the source register is just reused, the cost for
// this operation is 0.
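// Illustrative example: a <16 x i32> single-source permute split into two
// <8 x i32> registers. A destination register built only from one source
// register is charged one <8 x i32> PermuteSingleSrc (or TTI::TCC_Basic if
// it is merely a copy, or nothing if the source register is reused as-is),
// while a destination register that mixes lanes from several source
// registers is charged via PermuteTwoSrc.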
NumOfDests =
getTypeLegalizationCost(
FixedVectorType::get(BaseTp->getElementType(), Mask.size()))
.first;
unsigned E = *NumOfDests.getValue();
unsigned NormalizedVF =
LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
copy(Mask, NormalizedMask.begin());
unsigned PrevSrcReg = 0;
ArrayRef<int> PrevRegMask;
InstructionCost Cost = 0;
processShuffleMasks(
NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
[this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask,
&Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
if (!ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size())) {
// Check if the previous register can be just copied to the next
// one.
if (PrevRegMask.empty() || PrevSrcReg != SrcReg ||
PrevRegMask != RegMask)
Cost += getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
RegMask, CostKind, 0, nullptr);
else
// Just a copy of previous destination register.
Cost += TTI::TCC_Basic;
return;
}
if (SrcReg != DestReg &&
any_of(RegMask, [](int I) { return I != PoisonMaskElem; })) {
// Just a copy of the source register.
Cost += TTI::TCC_Basic;
}
PrevSrcReg = SrcReg;
PrevRegMask = RegMask;
},
[this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask,
unsigned /*Unused*/,
unsigned /*Unused*/) {
Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, RegMask,
CostKind, 0, nullptr);
});
return Cost;
}
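// Without a concrete mask, conservatively assume every destination register
// has to be merged from all the source registers, i.e. (NumOfSrcs - 1)
// two-source shuffles per destination register.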
InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
std::nullopt, CostKind, 0, nullptr);
}
return BaseT::getShuffleCost(Kind, BaseTp, Mask, CostKind, Index, SubTp);
}
// For 2-input shuffles, we must account for splitting the 2 inputs into many.
if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
// We assume that source and destination have the same vector type.
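// E.g. (illustrative) a <16 x i32> two-source shuffle under AVX2 legalizes
// to LT.first = 2 destination registers, each potentially drawing from any
// of the 2 * 2 source registers, giving 2 * (2 * 2 - 1) = 6 per-register
// shuffles to cost below.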
InstructionCost NumOfDests = LT.first;
InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
LT.first = NumOfDests * NumOfShufflesPerDest;
}
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
{TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
{TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
{TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
};
if (ST->hasVBMI())
if (const auto *Entry =
CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
{TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
{TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
{TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr
{TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr
{TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr
};
if (ST->hasBWI())
if (const auto *Entry =
CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostKindTblEntry AVX512ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd
{TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss
{TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw