blob: 869762b3519610ed8d4b3b017b2d6a8ab5c8d8dd [file] [log] [blame]
//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// About Cost Model numbers used below it's necessary to say the following:
/// the numbers correspond to some "generic" X86 CPU instead of usage of
/// concrete CPU model. Usually the numbers correspond to CPU where the feature
/// apeared at the first time. For example, if we do Subtarget.hasSSE42() in
/// the lookups below the cost is based on Nehalem as that was the first CPU
/// to support that feature level and thus has most likely the worst case cost.
/// Some examples of other technologies/CPUs:
/// SSE 3 - Pentium4 / Athlon64
/// SSE 4.1 - Penryn
/// SSE 4.2 - Nehalem
/// AVX - Sandy Bridge
/// AVX2 - Haswell
/// AVX-512 - Xeon Phi / Skylake
/// And some examples of instruction target dependent costs (latency)
/// divss sqrtss rsqrtss
/// AMD K7 11-16 19 3
/// Piledriver 9-24 13-15 5
/// Jaguar 14 16 2
/// Pentium II,III 18 30 2
/// Nehalem 7-14 7-18 3
/// Haswell 10-13 11 5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//
#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
#define DEBUG_TYPE "x86tti"
//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//
TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
// TODO: Currently the __builtin_popcount() implementation using SSE3
// instructions is inefficient. Once the problem is fixed, we should
// call ST->hasSSE3() instead of ST->hasPOPCNT().
return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}
llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
TargetTransformInfo::CacheLevel Level) const {
switch (Level) {
case TargetTransformInfo::CacheLevel::L1D:
// - Penryn
// - Nehalem
// - Westmere
// - Sandy Bridge
// - Ivy Bridge
// - Haswell
// - Broadwell
// - Skylake
// - Kabylake
return 32 * 1024; // 32 KByte
case TargetTransformInfo::CacheLevel::L2D:
// - Penryn
// - Nehalem
// - Westmere
// - Sandy Bridge
// - Ivy Bridge
// - Haswell
// - Broadwell
// - Skylake
// - Kabylake
return 256 * 1024; // 256 KByte
}
llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
TargetTransformInfo::CacheLevel Level) const {
// - Penryn
// - Nehalem
// - Westmere
// - Sandy Bridge
// - Ivy Bridge
// - Haswell
// - Broadwell
// - Skylake
// - Kabylake
switch (Level) {
case TargetTransformInfo::CacheLevel::L1D:
LLVM_FALLTHROUGH;
case TargetTransformInfo::CacheLevel::L2D:
return 8;
}
llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}
unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
bool Vector = (ClassID == 1);
if (Vector && !ST->hasSSE1())
return 0;
if (ST->is64Bit()) {
if (Vector && ST->hasAVX512())
return 32;
return 16;
}
return 8;
}
TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
unsigned PreferVectorWidth = ST->getPreferVectorWidth();
switch (K) {
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
case TargetTransformInfo::RGK_FixedWidthVector:
if (ST->hasAVX512() && PreferVectorWidth >= 512)
return TypeSize::getFixed(512);
if (ST->hasAVX() && PreferVectorWidth >= 256)
return TypeSize::getFixed(256);
if (ST->hasSSE1() && PreferVectorWidth >= 128)
return TypeSize::getFixed(128);
return TypeSize::getFixed(0);
case TargetTransformInfo::RGK_ScalableVector:
return TypeSize::getScalable(0);
}
llvm_unreachable("Unsupported register kind");
}
unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
.getFixedSize();
}
unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
// If the loop will not be vectorized, don't interleave the loop.
// Let regular unroll to unroll the loop, which saves the overflow
// check and memory check cost.
if (VF == 1)
return 1;
if (ST->isAtom())
return 1;
// Sandybridge and Haswell have multiple execution ports and pipelined
// vector units.
if (ST->hasAVX())
return 4;
return 2;
}
InstructionCost X86TTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
// TODO: Handle more cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
Op2Info, Opd1PropInfo,
Opd2PropInfo, Args, CxtI);
// vXi8 multiplications are always promoted to vXi16.
if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
Ty->getScalarSizeInBits() == 8) {
Type *WideVecTy =
VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
TargetTransformInfo::CastContextHint::None,
CostKind) +
getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
TargetTransformInfo::CastContextHint::None,
CostKind) +
getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
Opd1PropInfo, Opd2PropInfo);
}
// Legalize the type.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
LT.second.getScalarType() == MVT::i32) {
// Check if the operands can be represented as a smaller datatype.
bool Op1Signed = false, Op2Signed = false;
unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
// If both are representable as i15 and at least one is constant,
// zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we
// can treat this as PMADDWD which has the same costs as a vXi16 multiply.
if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
bool Op1Constant =
isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
bool Op2Constant =
isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
bool Op1Sext = isa<SExtInst>(Args[0]) &&
(Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
bool Op2Sext = isa<SExtInst>(Args[1]) &&
(Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));
bool IsZeroExtended = !Op1Signed || !Op2Signed;
bool IsConstant = Op1Constant || Op2Constant;
bool IsSext = Op1Sext || Op2Sext;
if (IsConstant || IsZeroExtended || IsSext)
LT.second =
MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
}
}
if ((ISD == ISD::MUL || ISD == ISD::SDIV || ISD == ISD::SREM ||
ISD == ISD::UDIV || ISD == ISD::UREM) &&
(Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
// Vector multiply by pow2 will be simplified to shifts.
if (ISD == ISD::MUL) {
InstructionCost Cost = getArithmeticInstrCost(
Instruction::Shl, Ty, CostKind, Op1Info, Op2Info,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
return Cost;
}
if (ISD == ISD::SDIV || ISD == ISD::SREM) {
// On X86, vector signed division by constants power-of-two are
// normally expanded to the sequence SRA + SRL + ADD + SRA.
// The OperandValue properties may not be the same as that of the previous
// operation; conservatively assume OP_None.
InstructionCost Cost =
2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
Op2Info, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
Op2Info, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
Op2Info, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
if (ISD == ISD::SREM) {
// For SREM: (X % C) is the equivalent of (X - (X/C)*C)
Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
Op2Info);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
Op2Info);
}
return Cost;
}
// Vector unsigned division/remainder will be simplified to shifts/masks.
if (ISD == ISD::UDIV)
return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
Op2Info, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
// UREM
return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
Op2Info, TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
}
static const CostTblEntry GLMCostTable[] = {
{ ISD::FDIV, MVT::f32, 18 }, // divss
{ ISD::FDIV, MVT::v4f32, 35 }, // divps
{ ISD::FDIV, MVT::f64, 33 }, // divsd
{ ISD::FDIV, MVT::v2f64, 65 }, // divpd
};
if (ST->useGLMDivSqrtCosts())
if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SLMCostTable[] = {
{ ISD::MUL, MVT::v4i32, 11 }, // pmulld
{ ISD::MUL, MVT::v8i16, 2 }, // pmullw
{ ISD::FMUL, MVT::f64, 2 }, // mulsd
{ ISD::FMUL, MVT::v2f64, 4 }, // mulpd
{ ISD::FMUL, MVT::v4f32, 2 }, // mulps
{ ISD::FDIV, MVT::f32, 17 }, // divss
{ ISD::FDIV, MVT::v4f32, 39 }, // divps
{ ISD::FDIV, MVT::f64, 32 }, // divsd
{ ISD::FDIV, MVT::v2f64, 69 }, // divpd
{ ISD::FADD, MVT::v2f64, 2 }, // addpd
{ ISD::FSUB, MVT::v2f64, 2 }, // subpd
// v2i64/v4i64 mul is custom lowered as a series of long:
// multiplies(3), shifts(3) and adds(2)
// slm muldq version throughput is 2 and addq throughput 4
// thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
// 3X4 (addq throughput) = 17
{ ISD::MUL, MVT::v2i64, 17 },
// slm addq\subq throughput is 4
{ ISD::ADD, MVT::v2i64, 4 },
{ ISD::SUB, MVT::v2i64, 4 },
};
if (ST->useSLMArithCosts()) {
if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
// Check if the operands can be shrinked into a smaller datatype.
// TODO: Merge this into generiic vXi32 MUL patterns above.
bool Op1Signed = false;
unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
bool Op2Signed = false;
unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
bool SignedMode = Op1Signed || Op2Signed;
unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
if (OpMinSize <= 7)
return LT.first * 3; // pmullw/sext
if (!SignedMode && OpMinSize <= 8)
return LT.first * 3; // pmullw/zext
if (OpMinSize <= 15)
return LT.first * 5; // pmullw/pmulhw/pshuf
if (!SignedMode && OpMinSize <= 16)
return LT.first * 5; // pmullw/pmulhw/pshuf
}
if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
LT.second)) {
return LT.first * Entry->Cost;
}
}
static const CostTblEntry AVX512BWUniformConstCostTable[] = {
{ ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
{ ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
{ ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasBWI()) {
if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry AVX512UniformConstCostTable[] = {
{ ISD::SRA, MVT::v2i64, 1 },
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
{ ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
{ ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
{ ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
{ ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
{ ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
{ ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasAVX512()) {
if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry AVX2UniformConstCostTable[] = {
{ ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
{ ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
{ ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.
{ ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
{ ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
{ ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasAVX2()) {
if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry SSE2UniformConstCostTable[] = {
{ ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
{ ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.
{ ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
{ ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
{ ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.
{ ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
{ ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
{ ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
{ ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
{ ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
};
// XOP has faster vXi8 shifts.
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
ST->hasSSE2() && !ST->hasXOP()) {
if (const auto *Entry =
CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry AVX512BWConstCostTable[] = {
{ ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
{ ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
{ ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
{ ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
{ ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
};
if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
ST->hasBWI()) {
if (const auto *Entry =
CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry AVX512ConstCostTable[] = {
{ ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
{ ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
{ ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
{ ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
{ ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
{ ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
{ ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
{ ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
{ ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
};
if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
ST->hasAVX512()) {
if (const auto *Entry =
CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry AVX2ConstCostTable[] = {
{ ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
{ ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
{ ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
{ ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
{ ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
{ ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
{ ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
};
if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
ST->hasAVX2()) {
if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry SSE2ConstCostTable[] = {
{ ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
{ ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
{ ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
{ ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
{ ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
{ ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
{ ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
{ ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
{ ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
{ ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
{ ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
{ ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
{ ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
{ ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
{ ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
{ ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
{ ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
{ ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
{ ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
{ ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
{ ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
{ ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
};
if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
ST->hasSSE2()) {
// pmuldq sequence.
if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
return LT.first * 32;
if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
return LT.first * 38;
if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 15;
if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
return LT.first * 20;
if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry AVX512BWShiftCostTable[] = {
{ ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence.
{ ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence.
{ ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence.
{ ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence.
{ ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence.
{ ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence.
{ ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence.
{ ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence.
{ ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.
{ ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v8i16, 1 }, // vpsravw
{ ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v16i16, 1 }, // vpsravw
{ ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
{ ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
{ ISD::SRA, MVT::v32i16, 1 }, // vpsravw
};
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX2UniformCostTable[] = {
// Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i16, 1 }, // psllw.
{ ISD::SRL, MVT::v16i16, 1 }, // psrlw.
{ ISD::SRA, MVT::v16i16, 1 }, // psraw.
{ ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
{ ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
{ ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
{ ISD::SHL, MVT::v8i32, 1 }, // pslld
{ ISD::SRL, MVT::v8i32, 1 }, // psrld
{ ISD::SRA, MVT::v8i32, 1 }, // psrad
{ ISD::SHL, MVT::v4i64, 1 }, // psllq
{ ISD::SRL, MVT::v4i64, 1 }, // psrlq
};
if (ST->hasAVX2() &&
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
if (const auto *Entry =
CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry SSE2UniformCostTable[] = {
// Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v8i16, 1 }, // psllw.
{ ISD::SHL, MVT::v4i32, 1 }, // pslld
{ ISD::SHL, MVT::v2i64, 1 }, // psllq.
{ ISD::SRL, MVT::v8i16, 1 }, // psrlw.
{ ISD::SRL, MVT::v4i32, 1 }, // psrld.
{ ISD::SRL, MVT::v2i64, 1 }, // psrlq.
{ ISD::SRA, MVT::v8i16, 1 }, // psraw.
{ ISD::SRA, MVT::v4i32, 1 }, // psrad.
};
if (ST->hasSSE2() &&
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
if (const auto *Entry =
CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry AVX512DQCostTable[] = {
{ ISD::MUL, MVT::v2i64, 2 }, // pmullq
{ ISD::MUL, MVT::v4i64, 2 }, // pmullq
{ ISD::MUL, MVT::v8i64, 2 } // pmullq
};
// Look for AVX512DQ lowering tricks for custom cases.
if (ST->hasDQI())
if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
{ ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
};
// Look for AVX512BW lowering tricks for custom cases.
if (ST->hasBWI())
if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX512CostTable[] = {
{ ISD::SHL, MVT::v4i32, 1 },
{ ISD::SRL, MVT::v4i32, 1 },
{ ISD::SRA, MVT::v4i32, 1 },
{ ISD::SHL, MVT::v8i32, 1 },
{ ISD::SRL, MVT::v8i32, 1 },
{ ISD::SRA, MVT::v8i32, 1 },
{ ISD::SHL, MVT::v16i32, 1 },
{ ISD::SRL, MVT::v16i32, 1 },
{ ISD::SRA, MVT::v16i32, 1 },
{ ISD::SHL, MVT::v2i64, 1 },
{ ISD::SRL, MVT::v2i64, 1 },
{ ISD::SHL, MVT::v4i64, 1 },
{ ISD::SRL, MVT::v4i64, 1 },
{ ISD::SHL, MVT::v8i64, 1 },
{ ISD::SRL, MVT::v8i64, 1 },
{ ISD::SRA, MVT::v2i64, 1 },
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
{ ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add
{ ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/
{ ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/
{ ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
};
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX2ShiftCostTable[] = {
// Shifts on vXi64/vXi32 on AVX2 is legal even though we declare to
// customize them to detect the cases where shift amount is a scalar one.
{ ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
{ ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
{ ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
{ ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
{ ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
{ ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
{ ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
{ ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
{ ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
{ ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
};
if (ST->hasAVX512()) {
if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
(Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
// On AVX512, a packed v32i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
}
// Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
(Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
// On AVX2, a packed v16i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry XOPShiftCostTable[] = {
// 128bit shifts take 1cy, but right shifts require negation beforehand.
{ ISD::SHL, MVT::v16i8, 1 },
{ ISD::SRL, MVT::v16i8, 2 },
{ ISD::SRA, MVT::v16i8, 2 },
{ ISD::SHL, MVT::v8i16, 1 },
{ ISD::SRL, MVT::v8i16, 2 },
{ ISD::SRA, MVT::v8i16, 2 },
{ ISD::SHL, MVT::v4i32, 1 },
{ ISD::SRL, MVT::v4i32, 2 },
{ ISD::SRA, MVT::v4i32, 2 },
{ ISD::SHL, MVT::v2i64, 1 },
{ ISD::SRL, MVT::v2i64, 2 },
{ ISD::SRA, MVT::v2i64, 2 },
// 256bit shifts require splitting if AVX2 didn't catch them above.
{ ISD::SHL, MVT::v32i8, 2+2 },
{ ISD::SRL, MVT::v32i8, 4+2 },
{ ISD::SRA, MVT::v32i8, 4+2 },
{ ISD::SHL, MVT::v16i16, 2+2 },
{ ISD::SRL, MVT::v16i16, 4+2 },
{ ISD::SRA, MVT::v16i16, 4+2 },
{ ISD::SHL, MVT::v8i32, 2+2 },
{ ISD::SRL, MVT::v8i32, 4+2 },
{ ISD::SRA, MVT::v8i32, 4+2 },
{ ISD::SHL, MVT::v4i64, 2+2 },
{ ISD::SRL, MVT::v4i64, 4+2 },
{ ISD::SRA, MVT::v4i64, 4+2 },
};
// Look for XOP lowering tricks.
if (ST->hasXOP()) {
// If the right shift is constant then we'll fold the negation so
// it's as cheap as a left shift.
int ShiftISD = ISD;
if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
(Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
ShiftISD = ISD::SHL;
if (const auto *Entry =
CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
return LT.first * Entry->Cost;
}
static const CostTblEntry SSE2UniformShiftCostTable[] = {
// Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
{ ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
{ ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.
{ ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
{ ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
{ ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.
{ ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
{ ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
{ ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
{ ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
};
if (ST->hasSSE2() &&
((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
(Op2Info == TargetTransformInfo::OK_UniformValue))) {
// Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
return LT.first * 4; // 2*psrad + shuffle.
if (const auto *Entry =
CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
return LT.first * Entry->Cost;
}
if (ISD == ISD::SHL &&
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
MVT VT = LT.second;
// Vector shift left by non uniform constant can be lowered
// into vector multiply.
if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
ISD = ISD::MUL;
}
static const CostTblEntry AVX2CostTable[] = {
{ ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
{ ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
{ ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
{ ISD::SHL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
{ ISD::SHL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
{ ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
{ ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
{ ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
{ ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
{ ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
{ ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
{ ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
{ ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.
{ ISD::SUB, MVT::v32i8, 1 }, // psubb
{ ISD::ADD, MVT::v32i8, 1 }, // paddb
{ ISD::SUB, MVT::v16i16, 1 }, // psubw
{ ISD::ADD, MVT::v16i16, 1 }, // paddw
{ ISD::SUB, MVT::v8i32, 1 }, // psubd
{ ISD::ADD, MVT::v8i32, 1 }, // paddd
{ ISD::SUB, MVT::v4i64, 1 }, // psubq
{ ISD::ADD, MVT::v4i64, 1 }, // paddq
{ ISD::MUL, MVT::v16i16, 1 }, // pmullw
{ ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
{ ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add
{ ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
{ ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
{ ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
{ ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
{ ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
{ ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
};
// Look for AVX2 lowering tricks for custom cases.
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX1CostTable[] = {
// We don't have to scalarize unsupported ops. We can issue two half-sized
// operations and we only need to extract the upper YMM half.
// Two ops + 1 extract + 1 insert = 4.
{ ISD::MUL, MVT::v16i16, 4 },
{ ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
{ ISD::MUL, MVT::v4i64, 12 },
{ ISD::SUB, MVT::v32i8, 4 },
{ ISD::ADD, MVT::v32i8, 4 },
{ ISD::SUB, MVT::v16i16, 4 },
{ ISD::ADD, MVT::v16i16, 4 },
{ ISD::SUB, MVT::v8i32, 4 },
{ ISD::ADD, MVT::v8i32, 4 },
{ ISD::SUB, MVT::v4i64, 4 },
{ ISD::ADD, MVT::v4i64, 4 },
{ ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
{ ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
{ ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
{ ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
{ ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
{ ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
{ ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
{ ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
{ ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
{ ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
{ ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
{ ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
{ ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.
{ ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
{ ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
{ ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
{ ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.
{ ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
{ ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/
{ ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
{ ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
{ ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/
{ ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
{ ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
};
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSE42CostTable[] = {
{ ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FSUB, MVT::f32 , 1 }, // Nehalem from http://www.agner.org/
{ ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/
{ ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
};
if (ST->hasSSE42())
if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSE41CostTable[] = {
{ ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
{ ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
{ ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld
{ ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
{ ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
{ ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
{ ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.
{ ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
};
if (ST->hasSSE41())
if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSE2CostTable[] = {
// We don't correctly identify costs of casts because they are marked as
// custom.
{ ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence.
{ ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence.
{ ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
{ ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.
{ ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence.
{ ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence.
{ ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.
{ ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
{ ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence.
{ ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence.
{ ISD::MUL, MVT::v8i16, 1 }, // pmullw
{ ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
{ ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add
{ ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
{ ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/
{ ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
{ ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
{ ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
{ ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/
{ ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
{ ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
{ ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
{ ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
};
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSE1CostTable[] = {
{ ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
{ ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/
{ ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
{ ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
{ ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
{ ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
{ ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
{ ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
};
if (ST->hasSSE1())
if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
{ ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
{ ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/
};
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
{ ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
{ ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
{ ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
{ ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
{ ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
{ ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
};
if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
return LT.first * Entry->Cost;
// It is not a good idea to vectorize division. We have to scalarize it and
// in the process we will often end up having to spilling regular
// registers. The overhead of division is going to dominate most kernels
// anyways so try hard to prevent vectorization of division - it is
// generally a bad idea. Assume somewhat arbitrarily that we have to be able
// to hide "20 cycles" for each lane.
if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
ISD == ISD::UDIV || ISD == ISD::UREM)) {
InstructionCost ScalarCost = getArithmeticInstrCost(
Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
}
// Fallback to the default implementation.
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}
InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *BaseTp,
ArrayRef<int> Mask, int Index,
VectorType *SubTp) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
Kind = improveShuffleKindFromMask(Kind, Mask);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
if (Kind == TTI::SK_Transpose)
Kind = TTI::SK_PermuteTwoSrc;
// For Broadcasts we are splatting the first element from the first input
// register, so only need to reference that input and all the output
// registers are the same.
if (Kind == TTI::SK_Broadcast)
LT.first = 1;
// Subvector extractions are free if they start at the beginning of a
// vector and cheap if the subvectors are aligned.
if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
int NumElts = LT.second.getVectorNumElements();
if ((Index % NumElts) == 0)
return 0;
std::pair<InstructionCost, MVT> SubLT =
TLI->getTypeLegalizationCost(DL, SubTp);
if (SubLT.second.isVector()) {
int NumSubElts = SubLT.second.getVectorNumElements();
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
return SubLT.first;
// Handle some cases for widening legalization. For now we only handle
// cases where the original subvector was naturally aligned and evenly
// fit in its legalized subvector type.
// FIXME: Remove some of the alignment restrictions.
// FIXME: We can use permq for 64-bit or larger extracts from 256-bit
// vectors.
int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
(NumSubElts % OrigSubElts) == 0 &&
LT.second.getVectorElementType() ==
SubLT.second.getVectorElementType() &&
LT.second.getVectorElementType().getSizeInBits() ==
BaseTp->getElementType()->getPrimitiveSizeInBits()) {
assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
"Unexpected number of elements!");
auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
LT.second.getVectorNumElements());
auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
SubLT.second.getVectorNumElements());
int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
InstructionCost ExtractCost = getShuffleCost(
TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);
// If the original size is 32-bits or more, we can use pshufd. Otherwise
// if we have SSSE3 we can use pshufb.
if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
return ExtractCost + 1; // pshufd or pshufb
assert(SubTp->getPrimitiveSizeInBits() == 16 &&
"Unexpected vector size");
return ExtractCost + 2; // worst case pshufhw + pshufd
}
}
}
// Subvector insertions are cheap if the subvectors are aligned.
// Note that in general, the insertion starting at the beginning of a vector
// isn't free, because we need to preserve the rest of the wide vector.
if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
int NumElts = LT.second.getVectorNumElements();
std::pair<InstructionCost, MVT> SubLT =
TLI->getTypeLegalizationCost(DL, SubTp);
if (SubLT.second.isVector()) {
int NumSubElts = SubLT.second.getVectorNumElements();
if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
return SubLT.first;
}
// If the insertion isn't aligned, treat it like a 2-op shuffle.
Kind = TTI::SK_PermuteTwoSrc;
}
// Handle some common (illegal) sub-vector types as they are often very cheap
// to shuffle even on targets without PSHUFB.
EVT VT = TLI->getValueType(DL, BaseTp);
if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
!ST->hasSSSE3()) {
static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
{TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
{TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
{TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
{TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
{TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
{TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
{TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
{TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
{TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
{TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
{TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
{TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
{TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
{TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
{TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
{TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
{TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
{TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
};
if (ST->hasSSE2())
if (const auto *Entry =
CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
return Entry->Cost;
}
// We are going to permute multiple sources and the result will be in multiple
// destinations. Providing an accurate cost only for splits where the element
// type remains the same.
if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
MVT LegalVT = LT.second;
if (LegalVT.isVector() &&
LegalVT.getVectorElementType().getSizeInBits() ==
BaseTp->getElementType()->getPrimitiveSizeInBits() &&
LegalVT.getVectorNumElements() <
cast<FixedVectorType>(BaseTp)->getNumElements()) {
unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
unsigned LegalVTSize = LegalVT.getStoreSize();
// Number of source vectors after legalization:
unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Number of destination vectors after legalization:
InstructionCost NumOfDests = LT.first;
auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
LegalVT.getVectorNumElements());
InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
None, 0, nullptr);
}
return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
}
// For 2-input shuffles, we must account for splitting the 2 inputs into many.
if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
// We assume that source and destination have the same vector type.
InstructionCost NumOfDests = LT.first;
InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1;
LT.first = NumOfDests * NumOfShufflesPerDest;
}
static const CostTblEntry AVX512FP16ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw
{TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb
{TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb
{TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w
};
if (!ST->useSoftFloat() && ST->hasFP16())
if (const auto *Entry =
CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
{TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
{TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
{TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
};
if (ST->hasVBMI())
if (const auto *Entry =
CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
{TTI::SK_Select, MVT::v32i16, 1}, // vblendmw
{TTI::SK_Select, MVT::v64i8, 1}, // vblendmb
};
if (ST->hasBWI())
if (const auto *Entry =
CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX512ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd
{TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
{TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
{TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
{TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
{TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
{TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
{TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
{TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd
{TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps
{TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
{TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps
{TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq
{TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
{TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq
{TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
{TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd
{TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
{TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd
{TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps
{TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q
{TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d
{TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd
{TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps
{TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d
{TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
{TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
{TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
{TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
// FIXME: This just applies the type legalization cost rules above
// assuming these completely split.
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
{TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
{TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
{TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
{TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
{TTI::SK_Select, MVT::v8i64, 1}, // vblendmq
{TTI::SK_Select, MVT::v16i32, 1}, // vblendmd
};
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX2ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
{TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
{TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
{TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
{TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
{TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
{TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
{TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
{TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
{TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
{TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
{TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
{TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
{TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
};
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry XOPShuffleTbl[] = {
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
{TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
{TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
// + vinsertf128
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
// + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
// + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
// + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
};
if (ST->hasXOP())
if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry AVX1ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
{TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
{TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
{TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
{TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
{TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
{TTI::SK_Select, MVT::v8i32, 1}, // vblendps
{TTI::SK_Select, MVT::v8f32, 1}, // vblendps
{TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
{TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
{TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
{TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
{TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
{TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
};
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSE41ShuffleTbl[] = {
{TTI::SK_Select, MVT::v2i64, 1}, // pblendw
{TTI::SK_Select, MVT::v2f64, 1}, // movsd
{TTI::SK_Select, MVT::v4i32, 1}, // pblendw
{TTI::SK_Select, MVT::v4f32, 1}, // blendps
{TTI::SK_Select, MVT::v8i16, 1}, // pblendw
{TTI::SK_Select, MVT::v16i8, 1} // pblendvb
};
if (ST->hasSSE41())
if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSSE3ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
{TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
{TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
{TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
{TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
{TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
{TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
{TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
};
if (ST->hasSSSE3())
if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSE2ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
{TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
{TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
{TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
{TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
{TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
{TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
{TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
{TTI::SK_Select, MVT::v2i64, 1}, // movsd
{TTI::SK_Select, MVT::v2f64, 1}, // movsd
{TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
{TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
{TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
{TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
{TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
{TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
// + pshufd/unpck
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + 2*packus
{ TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
{ TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
{ TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
};
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
static const CostTblEntry SSE1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
{ TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
{ TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
};
if (ST->hasSSE1())
if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
}
InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
Type *Src,
TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// TODO: Allow non-throughput costs that aren't binary.
auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
if (CostKind != TTI::TCK_RecipThroughput)
return Cost == 0 ? 0 : 1;
return Cost;
};
// The cost tables include both specific, custom (non-legal) src/dst type
// conversions and generic, legalized types. We test for customs first, before
// falling back to legalization.
// FIXME: Need a better design of the cost table to handle non-simple types of
// potential massive combinations (elem_num x src_type x dst_type).
static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },
// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
{ ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
{ ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },
{ ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
};
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },
// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
};
// TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
// 256-bit wide vectors.
static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
{ ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
{ ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
{ ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
{ ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
{ ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },
// Sign extend is zmm vpternlogd+vptruncdb.
// Zero extend is zmm broadcast load+vptruncdw.
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
// Sign extend is zmm vpternlogd+vptruncdw.
// Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
{ ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
{ ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
{ ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
{ ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
{ ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
{ ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
{ ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
{ ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
{ ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
{ ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
{ ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
{ ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
};
static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },
// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
{ ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
{ ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
{ ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
};
static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
};
static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb
// sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
// sign extend is vpcmpeq+maskedmove+vpmovdw
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
{ ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
{ ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },