//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements a TargetTransformInfo analysis pass specific to the
/// X86 target machine. It uses the target's detailed information to provide
/// more precise answers to certain TTI queries, while letting the target
/// independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
/// About the cost model numbers used below it's necessary to say the
/// following: the numbers correspond to some "generic" X86 CPU rather than to
/// a concrete CPU model. Usually the numbers correspond to the CPU where the
/// feature first appeared. For example, if we check Subtarget.hasSSE42() in
/// the lookups below, the cost is based on Nehalem, as that was the first CPU
/// to support that feature level and thus most likely has the worst case cost.
/// Some examples of other technologies/CPUs:
///   SSE 3   - Pentium4 / Athlon64
///   SSE 4.1 - Penryn
///   SSE 4.2 - Nehalem
///   AVX     - Sandy Bridge
///   AVX2    - Haswell
///   AVX-512 - Xeon Phi / Skylake
/// And some examples of target-dependent instruction costs (latency):
///                   divss   sqrtss   rsqrtss
///   AMD K7          11-16   19       3
///   Piledriver      9-24    13-15    5
///   Jaguar          14      16       2
///   Pentium II,III  18      30       2
///   Nehalem         7-14    7-18     3
///   Haswell         10-13   11       5
/// TODO: Develop and implement the target dependent cost model and
/// specialize cost numbers for different Cost Model Targets such as throughput,
/// code size, latency and uop count.
//===----------------------------------------------------------------------===//

#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "x86tti"

//===----------------------------------------------------------------------===//
//
// X86 cost model.
//
//===----------------------------------------------------------------------===//

TargetTransformInfo::PopcntSupportKind
X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
  // TODO: Currently the __builtin_popcount() implementation using SSE3
  // instructions is inefficient. Once the problem is fixed, we should
  // call ST->hasSSE3() instead of ST->hasPOPCNT().
  return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}

llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
    TargetTransformInfo::CacheLevel Level) const {
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    // - Penryn
    // - Nehalem
    // - Westmere
    // - Sandy Bridge
    // - Ivy Bridge
    // - Haswell
    // - Broadwell
    // - Skylake
    // - Kabylake
    return 32 * 1024; // 32 KByte
  case TargetTransformInfo::CacheLevel::L2D:
    // - Penryn
    // - Nehalem
    // - Westmere
    // - Sandy Bridge
    // - Ivy Bridge
    // - Haswell
    // - Broadwell
    // - Skylake
    // - Kabylake
    return 256 * 1024; // 256 KByte
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
    TargetTransformInfo::CacheLevel Level) const {
  // - Penryn
  // - Nehalem
  // - Westmere
  // - Sandy Bridge
  // - Ivy Bridge
  // - Haswell
  // - Broadwell
  // - Skylake
  // - Kabylake
  switch (Level) {
  case TargetTransformInfo::CacheLevel::L1D:
    LLVM_FALLTHROUGH;
  case TargetTransformInfo::CacheLevel::L2D:
    return 8;
  }

  llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
}

unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector && !ST->hasSSE1())
    return 0;

  if (ST->is64Bit()) {
    if (Vector && ST->hasAVX512())
      return 32;
    return 16;
  }
  return 8;
}

TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  unsigned PreferVectorWidth = ST->getPreferVectorWidth();
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasAVX512() && PreferVectorWidth >= 512)
      return TypeSize::getFixed(512);
    if (ST->hasAVX() && PreferVectorWidth >= 256)
      return TypeSize::getFixed(256);
    if (ST->hasSSE1() && PreferVectorWidth >= 128)
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }

  llvm_unreachable("Unsupported register kind");
}

unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const {
  return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
      .getFixedSize();
}

unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // If the loop will not be vectorized, don't interleave the loop.
  // Let the regular unroller unroll the loop instead, which saves the
  // overflow check and memory check cost.
  if (VF == 1)
    return 1;

  if (ST->isAtom())
    return 1;

  // Sandybridge and Haswell have multiple execution ports and pipelined
  // vector units.
  if (ST->hasAVX())
    return 4;

  return 2;
}

InstructionCost X86TTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
    TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
    const Instruction *CxtI) {
  // TODO: Handle more cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
                                         Op2Info, Opd1PropInfo,
                                         Opd2PropInfo, Args, CxtI);

  // vXi8 multiplications are always promoted to vXi16.
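  // e.g., the cost of a <16 x i8> mul is modeled below as
  //   cost(zext to <16 x i16>) + cost(mul <16 x i16>) + cost(trunc to <16 x i8>)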
  if (Opcode == Instruction::Mul && Ty->isVectorTy() &&
      Ty->getScalarSizeInBits() == 8) {
    Type *WideVecTy =
        VectorType::getExtendedElementVectorType(cast<VectorType>(Ty));
    return getCastInstrCost(Instruction::ZExt, WideVecTy, Ty,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
                            TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticInstrCost(Opcode, WideVecTy, CostKind, Op1Info, Op2Info,
                                  Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode");

  if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() &&
      LT.second.getScalarType() == MVT::i32) {
    // Check if the operands can be represented as a smaller datatype.
    bool Op1Signed = false, Op2Signed = false;
    unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
    unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
    unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

    // If both are representable as i15 and at least one is constant,
    // zero-extended, or sign-extended from vXi16 (or less pre-SSE41), then we
    // can treat this as PMADDWD which has the same cost as a vXi16 multiply.
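    // e.g., a v4i32 multiply of two values sign-extended from v4i16 is costed
    // below as a v8i16 multiply (the legalized type is rewritten to twice as
    // many i16 lanes).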
    if (OpMinSize <= 15 && !ST->isPMADDWDSlow()) {
      bool Op1Constant =
          isa<ConstantDataVector>(Args[0]) || isa<ConstantVector>(Args[0]);
      bool Op2Constant =
          isa<ConstantDataVector>(Args[1]) || isa<ConstantVector>(Args[1]);
      bool Op1Sext = isa<SExtInst>(Args[0]) &&
                     (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41()));
      bool Op2Sext = isa<SExtInst>(Args[1]) &&
                     (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41()));

      bool IsZeroExtended = !Op1Signed || !Op2Signed;
      bool IsConstant = Op1Constant || Op2Constant;
      bool IsSext = Op1Sext || Op2Sext;
      if (IsConstant || IsZeroExtended || IsSext)
        LT.second =
            MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements());
    }
  }

  if ((ISD == ISD::MUL || ISD == ISD::SDIV || ISD == ISD::SREM ||
       ISD == ISD::UDIV || ISD == ISD::UREM) &&
      (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
    // Vector multiply by pow2 will be simplified to shifts.
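    // e.g., (mul X, 8) is costed as (shl X, 3).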
    if (ISD == ISD::MUL) {
      InstructionCost Cost = getArithmeticInstrCost(
          Instruction::Shl, Ty, CostKind, Op1Info, Op2Info,
          TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
      return Cost;
    }

    if (ISD == ISD::SDIV || ISD == ISD::SREM) {
      // On X86, vector signed division by a constant power-of-two is
      // normally expanded to the sequence SRA + SRL + ADD + SRA.
      // The OperandValue properties may not be the same as that of the
      // previous operation; conservatively assume OP_None.
      InstructionCost Cost =
          2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);
      Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
                                     Op2Info, TargetTransformInfo::OP_None,
                                     TargetTransformInfo::OP_None);

      if (ISD == ISD::SREM) {
        // For SREM: (X % C) is the equivalent of (X - (X/C)*C)
        Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
                                       Op2Info);
        Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
                                       Op2Info);
      }

      return Cost;
    }

    // Vector unsigned division/remainder will be simplified to shifts/masks.
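    // e.g., (udiv X, 16) becomes (lshr X, 4) and (urem X, 16) becomes
    // (and X, 15).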
    if (ISD == ISD::UDIV)
      return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
                                    Op2Info, TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
    // UREM
    return getArithmeticInstrCost(Instruction::And, Ty, CostKind, Op1Info,
                                  Op2Info, TargetTransformInfo::OP_None,
                                  TargetTransformInfo::OP_None);
  }

  static const CostTblEntry GLMCostTable[] = {
    { ISD::FDIV, MVT::f32, 18 }, // divss
    { ISD::FDIV, MVT::v4f32, 35 }, // divps
    { ISD::FDIV, MVT::f64, 33 }, // divsd
    { ISD::FDIV, MVT::v2f64, 65 }, // divpd
  };

  if (ST->useGLMDivSqrtCosts())
    if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SLMCostTable[] = {
    { ISD::MUL, MVT::v4i32, 11 }, // pmulld
    { ISD::MUL, MVT::v8i16, 2 }, // pmullw
    { ISD::FMUL, MVT::f64, 2 }, // mulsd
    { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
    { ISD::FMUL, MVT::v4f32, 2 }, // mulps
    { ISD::FDIV, MVT::f32, 17 }, // divss
    { ISD::FDIV, MVT::v4f32, 39 }, // divps
    { ISD::FDIV, MVT::f64, 32 }, // divsd
    { ISD::FDIV, MVT::v2f64, 69 }, // divpd
    { ISD::FADD, MVT::v2f64, 2 }, // addpd
    { ISD::FSUB, MVT::v2f64, 2 }, // subpd
    // v2i64/v4i64 mul is custom lowered as a series of long
    // multiplies (3), shifts (3) and adds (2).
    // SLM pmuludq throughput is 2 and paddq throughput is 4,
    // thus: 3 * 2 (pmuludq throughput) + 3 * 1 (shift throughput) +
    // 2 * 4 (paddq throughput) = 17.
    { ISD::MUL, MVT::v2i64, 17 },
    // SLM paddq/psubq throughput is 4.
    { ISD::ADD, MVT::v2i64, 4 },
    { ISD::SUB, MVT::v2i64, 4 },
  };

  if (ST->useSLMArithCosts()) {
    if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
      // Check if the operands can be shrunk into a smaller datatype.
      // TODO: Merge this into the generic vXi32 MUL patterns above.
      bool Op1Signed = false;
      unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
      bool Op2Signed = false;
      unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);

      bool SignedMode = Op1Signed || Op2Signed;
      unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);

      if (OpMinSize <= 7)
        return LT.first * 3; // pmullw/sext
      if (!SignedMode && OpMinSize <= 8)
        return LT.first * 3; // pmullw/zext
      if (OpMinSize <= 15)
        return LT.first * 5; // pmullw/pmulhw/pshuf
      if (!SignedMode && OpMinSize <= 16)
        return LT.first * 5; // pmullw/pmulhw/pshuf
    }

    if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
                                            LT.second)) {
      return LT.first * Entry->Cost;
    }
  }

  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
    { ISD::SHL, MVT::v64i8, 2 }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, 2 }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, 4 }, // psrlw, pand, pxor, psubb.
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasBWI()) {
    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512UniformConstCostTable[] = {
    { ISD::SRA, MVT::v2i64, 1 },
    { ISD::SRA, MVT::v4i64, 1 },
    { ISD::SRA, MVT::v8i64, 1 },

    { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
    { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
    { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.

    { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX512()) {
    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v32i8, 2 }, // psllw + pand.
    { ISD::SRL, MVT::v32i8, 2 }, // psrlw + pand.
    { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb.

    { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle.

    { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence
  };

  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD,
                                            LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformConstCostTable[] = {
    { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand.
    { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand.
    { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb.

    { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split.
    { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split.
    { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split.

    { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence
  };

  // XOP has faster vXi8 shifts.
  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
      ST->hasSSE2() && !ST->hasXOP()) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWConstCostTable[] = {
    { ISD::SDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, 14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v64i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 8 }, // vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasBWI()) {
    if (const auto *Entry =
            CostTableLookup(AVX512BWConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512ConstCostTable[] = {
    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
    { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
    { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
    { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
    { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
    { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX512()) {
    if (const auto *Entry =
            CostTableLookup(AVX512ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, 14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v32i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence
    { ISD::SREM, MVT::v16i16, 8 }, // vpmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 6 }, // vpmulhuw sequence
    { ISD::UREM, MVT::v16i16, 8 }, // vpmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32, 15 }, // vpmuldq sequence
    { ISD::SREM, MVT::v8i32, 19 }, // vpmuldq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 15 }, // vpmuludq sequence
    { ISD::UREM, MVT::v8i32, 19 }, // vpmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasAVX2()) {
    if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2ConstCostTable[] = {
    { ISD::SDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::SREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
    { ISD::SREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v32i8, 28+2 }, // 4*ext+4*pmulhw sequence + split.
    { ISD::UREM, MVT::v32i8, 32+2 }, // 4*ext+4*pmulhw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v16i8, 14 }, // 2*ext+2*pmulhw sequence
    { ISD::UREM, MVT::v16i8, 16 }, // 2*ext+2*pmulhw+mul+sub sequence
    { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split.
    { ISD::SREM, MVT::v16i16, 16+2 }, // 2*pmulhw+mul+sub sequence + split.
    { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence
    { ISD::SREM, MVT::v8i16, 8 }, // pmulhw+mul+sub sequence
    { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split.
    { ISD::UREM, MVT::v16i16, 16+2 }, // 2*pmulhuw+mul+sub sequence + split.
    { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence
    { ISD::UREM, MVT::v8i16, 8 }, // pmulhuw+mul+sub sequence
    { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split.
    { ISD::SREM, MVT::v8i32, 48+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
    { ISD::SREM, MVT::v4i32, 24 }, // pmuludq+mul+sub sequence
    { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split.
    { ISD::UREM, MVT::v8i32, 40+2 }, // 2*pmuludq+mul+sub sequence + split.
    { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
    { ISD::UREM, MVT::v4i32, 20 }, // pmuludq+mul+sub sequence
  };

  if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) &&
      ST->hasSSE2()) {
    // pmuldq sequence.
    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 32;
    if (ISD == ISD::SREM && LT.second == MVT::v8i32 && ST->hasAVX())
      return LT.first * 38;
    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 15;
    if (ISD == ISD::SREM && LT.second == MVT::v4i32 && ST->hasSSE41())
      return LT.first * 20;

    if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512BWShiftCostTable[] = {
    { ISD::SHL, MVT::v16i8, 4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v16i8, 4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v16i8, 4 }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v32i8, 4 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v32i8, 4 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v32i8, 6 }, // extend/vpsravw/pack sequence.
    { ISD::SHL, MVT::v64i8, 6 }, // extend/vpsllvw/pack sequence.
    { ISD::SRL, MVT::v64i8, 7 }, // extend/vpsrlvw/pack sequence.
    { ISD::SRA, MVT::v64i8, 15 }, // extend/vpsravw/pack sequence.

    { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
    { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
    { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
    { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
    { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
    { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
    { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
    { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
    { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
  };

  if (ST->hasBWI())
    if (const auto *Entry =
            CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 1 }, // psllw.
    { ISD::SRL, MVT::v16i16, 1 }, // psrlw.
    { ISD::SRA, MVT::v16i16, 1 }, // psraw.
    { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
    { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
    { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.

    { ISD::SHL, MVT::v8i32, 1 }, // pslld
    { ISD::SRL, MVT::v8i32, 1 }, // psrld
    { ISD::SRA, MVT::v8i32, 1 }, // psrad
    { ISD::SHL, MVT::v4i64, 1 }, // psllq
    { ISD::SRL, MVT::v4i64, 1 }, // psrlq
  };

  if (ST->hasAVX2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(AVX2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v8i16, 1 }, // psllw.
    { ISD::SHL, MVT::v4i32, 1 }, // pslld
    { ISD::SHL, MVT::v2i64, 1 }, // psllq.

    { ISD::SRL, MVT::v8i16, 1 }, // psrlw.
    { ISD::SRL, MVT::v4i32, 1 }, // psrld.
    { ISD::SRL, MVT::v2i64, 1 }, // psrlq.

    { ISD::SRA, MVT::v8i16, 1 }, // psraw.
    { ISD::SRA, MVT::v4i32, 1 }, // psrad.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry AVX512DQCostTable[] = {
    { ISD::MUL, MVT::v2i64, 2 }, // pmullq
    { ISD::MUL, MVT::v4i64, 2 }, // pmullq
    { ISD::MUL, MVT::v8i64, 2 } // pmullq
  };

  // Look for AVX512DQ lowering tricks for custom cases.
  if (ST->hasDQI())
    if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512BWCostTable[] = {
    { ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
  };

  // Look for AVX512BW lowering tricks for custom cases.
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX512CostTable[] = {
    { ISD::SHL, MVT::v4i32, 1 },
    { ISD::SRL, MVT::v4i32, 1 },
    { ISD::SRA, MVT::v4i32, 1 },
    { ISD::SHL, MVT::v8i32, 1 },
    { ISD::SRL, MVT::v8i32, 1 },
    { ISD::SRA, MVT::v8i32, 1 },
    { ISD::SHL, MVT::v16i32, 1 },
    { ISD::SRL, MVT::v16i32, 1 },
    { ISD::SRA, MVT::v16i32, 1 },

    { ISD::SHL, MVT::v2i64, 1 },
    { ISD::SRL, MVT::v2i64, 1 },
    { ISD::SHL, MVT::v4i64, 1 },
    { ISD::SRL, MVT::v4i64, 1 },
    { ISD::SHL, MVT::v8i64, 1 },
    { ISD::SRL, MVT::v8i64, 1 },

    { ISD::SRA, MVT::v2i64, 1 },
    { ISD::SRA, MVT::v4i64, 1 },
    { ISD::SRA, MVT::v8i64, 1 },

    { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
    { ISD::MUL, MVT::v8i64, 6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v8f64, 1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f64, 4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 4 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, 8 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f64, 16 }, // Skylake from http://www.agner.org/

    { ISD::FNEG, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
    { ISD::FADD, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
    { ISD::FSUB, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
    { ISD::FMUL, MVT::v16f32, 1 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::f32, 3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 3 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, 5 }, // Skylake from http://www.agner.org/
    { ISD::FDIV, MVT::v16f32, 10 }, // Skylake from http://www.agner.org/
  };

  if (ST->hasAVX512())
    if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX2ShiftCostTable[] = {
    // Shifts on vXi64/vXi32 are legal on AVX2, even though we declare them
    // custom in order to detect cases where the shift amount is a scalar.
    { ISD::SHL, MVT::v4i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v4i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v4i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v8i32, 2 }, // vpsllvd (Haswell from agner.org)
    { ISD::SRL, MVT::v8i32, 2 }, // vpsrlvd (Haswell from agner.org)
    { ISD::SRA, MVT::v8i32, 2 }, // vpsravd (Haswell from agner.org)
    { ISD::SHL, MVT::v2i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v2i64, 1 }, // vpsrlvq (Haswell from agner.org)
    { ISD::SHL, MVT::v4i64, 1 }, // vpsllvq (Haswell from agner.org)
    { ISD::SRL, MVT::v4i64, 1 }, // vpsrlvq (Haswell from agner.org)
  };

  if (ST->hasAVX512()) {
    if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX512, a packed v32i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);
  }

  // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts).
  if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) {
    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      // On AVX2, a packed v16i16 shift left by a constant build_vector
      // is lowered into a vector multiply (vpmullw).
      return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
                                    Op1Info, Op2Info,
                                    TargetTransformInfo::OP_None,
                                    TargetTransformInfo::OP_None);

    if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry XOPShiftCostTable[] = {
    // 128bit shifts take 1cy, but right shifts require negation beforehand.
    { ISD::SHL, MVT::v16i8, 1 },
    { ISD::SRL, MVT::v16i8, 2 },
    { ISD::SRA, MVT::v16i8, 2 },
    { ISD::SHL, MVT::v8i16, 1 },
    { ISD::SRL, MVT::v8i16, 2 },
    { ISD::SRA, MVT::v8i16, 2 },
    { ISD::SHL, MVT::v4i32, 1 },
    { ISD::SRL, MVT::v4i32, 2 },
    { ISD::SRA, MVT::v4i32, 2 },
    { ISD::SHL, MVT::v2i64, 1 },
    { ISD::SRL, MVT::v2i64, 2 },
    { ISD::SRA, MVT::v2i64, 2 },
    // 256bit shifts require splitting if AVX2 didn't catch them above.
    { ISD::SHL, MVT::v32i8, 2+2 },
    { ISD::SRL, MVT::v32i8, 4+2 },
    { ISD::SRA, MVT::v32i8, 4+2 },
    { ISD::SHL, MVT::v16i16, 2+2 },
    { ISD::SRL, MVT::v16i16, 4+2 },
    { ISD::SRA, MVT::v16i16, 4+2 },
    { ISD::SHL, MVT::v8i32, 2+2 },
    { ISD::SRL, MVT::v8i32, 4+2 },
    { ISD::SRA, MVT::v8i32, 4+2 },
    { ISD::SHL, MVT::v4i64, 2+2 },
    { ISD::SRL, MVT::v4i64, 4+2 },
    { ISD::SRA, MVT::v4i64, 4+2 },
  };

  // Look for XOP lowering tricks.
  if (ST->hasXOP()) {
    // If the right shift is constant then we'll fold the negation so
    // it's as cheap as a left shift.
    int ShiftISD = ISD;
    if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) &&
        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
      ShiftISD = ISD::SHL;
    if (const auto *Entry =
            CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second))
      return LT.first * Entry->Cost;
  }

  static const CostTblEntry SSE2UniformShiftCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
    { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split.
    { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split.
    { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split.

    { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split.
    { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split.
    { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split.

    { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split.
    { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split.
    { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle.
    { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split.
  };

  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {

    // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table.
    if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2())
      return LT.first * 4; // 2*psrad + shuffle.

    if (const auto *Entry =
            CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
  }

  if (ISD == ISD::SHL &&
      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
    MVT VT = LT.second;
    // A vector shift left by a non-uniform constant can be lowered into a
    // vector multiply.
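    // e.g., (shl X, <1, 2, 3, 4>) is costed as (mul X, <2, 4, 8, 16>).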
    if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) ||
        ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX()))
      ISD = ISD::MUL;
  }

  static const CostTblEntry AVX2CostTable[] = {
    { ISD::SHL, MVT::v16i8, 6 }, // vpblendvb sequence.
    { ISD::SHL, MVT::v32i8, 6 }, // vpblendvb sequence.
    { ISD::SHL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
    { ISD::SHL, MVT::v8i16, 5 }, // extend/vpsllvd/pack sequence.
    { ISD::SHL, MVT::v16i16, 7 }, // extend/vpsllvd/pack sequence.
    { ISD::SHL, MVT::v32i16, 14 }, // 2*extend/vpsllvd/pack sequence.

    { ISD::SRL, MVT::v16i8, 6 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v32i8, 6 }, // vpblendvb sequence.
    { ISD::SRL, MVT::v64i8, 12 }, // 2*vpblendvb sequence.
    { ISD::SRL, MVT::v8i16, 5 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v16i16, 7 }, // extend/vpsrlvd/pack sequence.
    { ISD::SRL, MVT::v32i16, 14 }, // 2*extend/vpsrlvd/pack sequence.

    { ISD::SRA, MVT::v16i8, 17 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v32i8, 17 }, // vpblendvb sequence.
    { ISD::SRA, MVT::v64i8, 34 }, // 2*vpblendvb sequence.
    { ISD::SRA, MVT::v8i16, 5 }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v16i16, 7 }, // extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v32i16, 14 }, // 2*extend/vpsravd/pack sequence.
    { ISD::SRA, MVT::v2i64, 2 }, // srl/xor/sub sequence.
    { ISD::SRA, MVT::v4i64, 2 }, // srl/xor/sub sequence.

    { ISD::SUB, MVT::v32i8, 1 }, // psubb
    { ISD::ADD, MVT::v32i8, 1 }, // paddb
    { ISD::SUB, MVT::v16i16, 1 }, // psubw
    { ISD::ADD, MVT::v16i16, 1 }, // paddw
    { ISD::SUB, MVT::v8i32, 1 }, // psubd
    { ISD::ADD, MVT::v8i32, 1 }, // paddd
    { ISD::SUB, MVT::v4i64, 1 }, // psubq
    { ISD::ADD, MVT::v4i64, 1 }, // paddq

    { ISD::MUL, MVT::v16i16, 1 }, // pmullw
    { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
    { ISD::MUL, MVT::v4i64, 6 }, // 3*pmuludq/3*shift/2*add

    { ISD::FNEG, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
    { ISD::FADD, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
    { ISD::FSUB, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::f64, 1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, 1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, 1 }, // Haswell from http://www.agner.org/
    { ISD::FMUL, MVT::v8f32, 1 }, // Haswell from http://www.agner.org/

    { ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 7 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, 14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::f64, 14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 14 }, // Haswell from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, 28 }, // Haswell from http://www.agner.org/
  };

  // Look for AVX2 lowering tricks for custom cases.
  if (ST->hasAVX2())
    if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry AVX1CostTable[] = {
    // We don't have to scalarize unsupported ops. We can issue two half-sized
    // operations and we only need to extract the upper YMM half.
    // Two ops + 1 extract + 1 insert = 4.
    { ISD::MUL, MVT::v16i16, 4 },
    { ISD::MUL, MVT::v8i32, 5 }, // BTVER2 from http://www.agner.org/
    { ISD::MUL, MVT::v4i64, 12 },

    { ISD::SUB, MVT::v32i8, 4 },
    { ISD::ADD, MVT::v32i8, 4 },
    { ISD::SUB, MVT::v16i16, 4 },
    { ISD::ADD, MVT::v16i16, 4 },
    { ISD::SUB, MVT::v8i32, 4 },
    { ISD::ADD, MVT::v8i32, 4 },
    { ISD::SUB, MVT::v4i64, 4 },
    { ISD::ADD, MVT::v4i64, 4 },

    { ISD::SHL, MVT::v32i8, 22 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v8i16, 6 }, // pblendvb sequence.
    { ISD::SHL, MVT::v16i16, 13 }, // pblendvb sequence + split.
    { ISD::SHL, MVT::v4i32, 3 }, // pslld/paddd/cvttps2dq/pmulld
    { ISD::SHL, MVT::v8i32, 9 }, // pslld/paddd/cvttps2dq/pmulld + split
    { ISD::SHL, MVT::v2i64, 2 }, // Shift each lane + blend.
    { ISD::SHL, MVT::v4i64, 6 }, // Shift each lane + blend + split.

    { ISD::SRL, MVT::v32i8, 23 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRL, MVT::v4i32, 6 }, // Shift each lane + blend.
    { ISD::SRL, MVT::v8i32, 14 }, // Shift each lane + blend + split.
    { ISD::SRL, MVT::v2i64, 2 }, // Shift each lane + blend.
    { ISD::SRL, MVT::v4i64, 6 }, // Shift each lane + blend + split.

    { ISD::SRA, MVT::v32i8, 44 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v16i16, 28 }, // pblendvb sequence + split.
    { ISD::SRA, MVT::v4i32, 6 }, // Shift each lane + blend.
    { ISD::SRA, MVT::v8i32, 14 }, // Shift each lane + blend + split.
    { ISD::SRA, MVT::v2i64, 5 }, // Shift each lane + blend.
    { ISD::SRA, MVT::v4i64, 12 }, // Shift each lane + blend + split.

    { ISD::FNEG, MVT::v4f64, 2 }, // BTVER2 from http://www.agner.org/
    { ISD::FNEG, MVT::v8f32, 2 }, // BTVER2 from http://www.agner.org/

    { ISD::FMUL, MVT::f64, 2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, 2 }, // BTVER2 from http://www.agner.org/
    { ISD::FMUL, MVT::v4f64, 4 }, // BTVER2 from http://www.agner.org/

    { ISD::FDIV, MVT::f32, 14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::f64, 22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // SNB from http://www.agner.org/
    { ISD::FDIV, MVT::v4f64, 44 }, // SNB from http://www.agner.org/
  };

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE42CostTable[] = {
    { ISD::FADD, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/

    { ISD::FSUB, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/

    { ISD::FMUL, MVT::f64, 1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::f32, 1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v2f64, 1 }, // Nehalem from http://www.agner.org/
    { ISD::FMUL, MVT::v4f32, 1 }, // Nehalem from http://www.agner.org/

    { ISD::FDIV, MVT::f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 14 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::f64, 22 }, // Nehalem from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 22 }, // Nehalem from http://www.agner.org/

    { ISD::MUL, MVT::v2i64, 6 } // 3*pmuludq/3*shift/2*add
  };

  if (ST->hasSSE42())
    if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE41CostTable[] = {
    { ISD::SHL, MVT::v16i8, 10 }, // pblendvb sequence.
    { ISD::SHL, MVT::v8i16, 11 }, // pblendvb sequence.
    { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld

    { ISD::SRL, MVT::v16i8, 11 }, // pblendvb sequence.
    { ISD::SRL, MVT::v8i16, 13 }, // pblendvb sequence.
    { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend.

    { ISD::SRA, MVT::v16i8, 21 }, // pblendvb sequence.
    { ISD::SRA, MVT::v8i16, 13 }, // pblendvb sequence.

    { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
  };

  if (ST->hasSSE41())
    if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE2CostTable[] = {
    // We don't correctly identify costs of casts because they are marked as
    // custom.
    { ISD::SHL, MVT::v16i8, 13 }, // cmpgtb sequence.
    { ISD::SHL, MVT::v8i16, 25 }, // cmpgtw sequence.
    { ISD::SHL, MVT::v4i32, 16 }, // pslld/paddd/cvttps2dq/pmuludq.
    { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence.

    { ISD::SRL, MVT::v16i8, 14 }, // cmpgtb sequence.
    { ISD::SRL, MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRL, MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence.

    { ISD::SRA, MVT::v16i8, 27 }, // unpacked cmpgtb sequence.
    { ISD::SRA, MVT::v8i16, 16 }, // cmpgtw sequence.
    { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
    { ISD::SRA, MVT::v2i64, 8 }, // srl/xor/sub splat+shuffle sequence.

    { ISD::MUL, MVT::v8i16, 1 }, // pmullw
    { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle
    { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add

    { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/
    { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/

    { ISD::FNEG, MVT::f32, 1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::f64, 1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32, 1 }, // Pentium IV from http://www.agner.org/
    { ISD::FNEG, MVT::v2f64, 1 }, // Pentium IV from http://www.agner.org/

    { ISD::FADD, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
    { ISD::FADD, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/

    { ISD::FSUB, MVT::f32, 2 }, // Pentium IV from http://www.agner.org/
    { ISD::FSUB, MVT::f64, 2 }, // Pentium IV from http://www.agner.org/
  };

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry SSE1CostTable[] = {
    { ISD::FDIV, MVT::f32, 17 }, // Pentium III from http://www.agner.org/
    { ISD::FDIV, MVT::v4f32, 34 }, // Pentium III from http://www.agner.org/

    { ISD::FNEG, MVT::f32, 2 }, // Pentium III from http://www.agner.org/
    { ISD::FNEG, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/

    { ISD::FADD, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
    { ISD::FADD, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/

    { ISD::FSUB, MVT::f32, 1 }, // Pentium III from http://www.agner.org/
    { ISD::FSUB, MVT::v4f32, 2 }, // Pentium III from http://www.agner.org/
  };

  if (ST->hasSSE1())
    if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ADD, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
    { ISD::SUB, MVT::i64, 1 }, // Core (Merom) from http://www.agner.org/
    { ISD::MUL, MVT::i64, 2 }, // Nehalem from http://www.agner.org/
  };

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second))
      return LT.first * Entry->Cost;

  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ADD, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::ADD, MVT::i32, 1 }, // Pentium III from http://www.agner.org/

    { ISD::SUB, MVT::i8, 1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i16, 1 }, // Pentium III from http://www.agner.org/
    { ISD::SUB, MVT::i32, 1 }, // Pentium III from http://www.agner.org/
  };

  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second))
    return LT.first * Entry->Cost;

  // It is not a good idea to vectorize division. We have to scalarize it and
  // in the process we will often end up having to spill regular registers.
  // The overhead of division is going to dominate most kernels anyway, so try
  // hard to prevent vectorization of division - it is generally a bad idea.
  // Assume somewhat arbitrarily that we have to be able to hide "20 cycles"
  // for each lane.
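  // e.g., for a legal <4 x i32> sdiv this charges 20 * 1 * 4 times the scalar
  // i32 sdiv cost.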
  if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
                               ISD == ISD::UDIV || ISD == ISD::UREM)) {
    InstructionCost ScalarCost = getArithmeticInstrCost(
        Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
        TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
    return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
  }

  // Fallback to the default implementation.
  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}

InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
                                           VectorType *BaseTp,
                                           ArrayRef<int> Mask, int Index,
                                           VectorType *SubTp) {
  // 64-bit packed float vectors (v2f32) are widened to type v4f32.
  // 64-bit packed integer vectors (v2i32) are widened to type v4i32.
  std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);

  Kind = improveShuffleKindFromMask(Kind, Mask);
  // Treat Transpose as 2-op shuffles - there's no difference in lowering.
  if (Kind == TTI::SK_Transpose)
    Kind = TTI::SK_PermuteTwoSrc;

  // For Broadcasts we are splatting the first element from the first input
  // register, so we only need to reference that input and all the output
  // registers are the same.
  if (Kind == TTI::SK_Broadcast)
    LT.first = 1;

  // Subvector extractions are free if they start at the beginning of a
  // vector and cheap if the subvectors are aligned.
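  // e.g., extracting a subvector that starts at element 0 of the legalized
  // source vector is returned as cost 0 below, since it is just a use of the
  // low part of the register.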
  if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    if ((Index % NumElts) == 0)
      return 0;
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
      // Handle some cases for widening legalization. For now we only handle
      // cases where the original subvector was naturally aligned and evenly
      // fit in its legalized subvector type.
      // FIXME: Remove some of the alignment restrictions.
      // FIXME: We can use permq for 64-bit or larger extracts from 256-bit
      // vectors.
      int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
      if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
          (NumSubElts % OrigSubElts) == 0 &&
          LT.second.getVectorElementType() ==
              SubLT.second.getVectorElementType() &&
          LT.second.getVectorElementType().getSizeInBits() ==
              BaseTp->getElementType()->getPrimitiveSizeInBits()) {
        assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
               "Unexpected number of elements!");
        auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
                                           LT.second.getVectorNumElements());
        auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
                                           SubLT.second.getVectorNumElements());
        int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
        InstructionCost ExtractCost = getShuffleCost(
            TTI::SK_ExtractSubvector, VecTy, None, ExtractIndex, SubTy);

        // If the original size is 32 bits or more, we can use pshufd.
        // Otherwise, if we have SSSE3, we can use pshufb.
        if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3())
          return ExtractCost + 1; // pshufd or pshufb

        assert(SubTp->getPrimitiveSizeInBits() == 16 &&
               "Unexpected vector size");

        return ExtractCost + 2; // worst case pshufhw + pshufd
      }
    }
  }

  // Subvector insertions are cheap if the subvectors are aligned.
  // Note that in general, the insertion starting at the beginning of a vector
  // isn't free, because we need to preserve the rest of the wide vector.
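  // e.g., inserting a <4 x i32> at element 0 of an <8 x i32> still has to
  // blend in the untouched upper half, so it is charged SubLT.first below
  // rather than 0.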
  if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) {
    int NumElts = LT.second.getVectorNumElements();
    std::pair<InstructionCost, MVT> SubLT =
        TLI->getTypeLegalizationCost(DL, SubTp);
    if (SubLT.second.isVector()) {
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
        return SubLT.first;
    }

    // If the insertion isn't aligned, treat it like a 2-op shuffle.
    Kind = TTI::SK_PermuteTwoSrc;
  }

  // Handle some common (illegal) sub-vector types as they are often very cheap
  // to shuffle even on targets without PSHUFB.
  EVT VT = TLI->getValueType(DL, BaseTp);
  if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
      !ST->hasSSSE3()) {
    static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
      {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
      {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
      {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck

      {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
      {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
      {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
      {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck

      {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
      {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck

      {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
      {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
    };

    if (ST->hasSSE2())
      if (const auto *Entry =
              CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
        return Entry->Cost;
  }

  // We are going to permute multiple sources and the result will be in
  // multiple destinations. We provide an accurate cost only for splits where
  // the element type remains the same.
  if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) {
    MVT LegalVT = LT.second;
    if (LegalVT.isVector() &&
        LegalVT.getVectorElementType().getSizeInBits() ==
            BaseTp->getElementType()->getPrimitiveSizeInBits() &&
        LegalVT.getVectorNumElements() <
            cast<FixedVectorType>(BaseTp)->getNumElements()) {

      unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
      unsigned LegalVTSize = LegalVT.getStoreSize();
      // Number of source vectors after legalization:
      unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
      // Number of destination vectors after legalization:
      InstructionCost NumOfDests = LT.first;
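      // e.g., with SSE2 a <16 x i32> single-source permute legalizes to
      // 4 x v4i32, so NumOfSrcs == NumOfDests == 4 and we cost
      // (4 - 1) * 4 == 12 two-source shuffles below.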

      auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
                                              LegalVT.getVectorNumElements());

      InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
      return NumOfShuffles * getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
                                            None, 0, nullptr);
    }

    return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp);
  }

  // For 2-input shuffles, we must account for splitting the 2 inputs into
  // many.
  if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) {
    // We assume that source and destination have the same vector type.
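    // e.g., if LT.first == 2, each of the 2 destinations needs 2 * 2 - 1 == 3
    // two-source shuffles, 6 in total.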
| InstructionCost NumOfDests = LT.first; |
| InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; |
| LT.first = NumOfDests * NumOfShufflesPerDest; |
| } |
| |
| static const CostTblEntry AVX512FP16ShuffleTbl[] = { |
| {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw |
| {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw |
| {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw |
| |
| {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw |
| {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw |
| {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb |
| |
| {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw |
| {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw |
| {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb |
| |
| {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w |
| {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w |
| {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w |
| }; |
| |
| if (!ST->useSoftFloat() && ST->hasFP16()) |
| if (const auto *Entry = |
| CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry AVX512VBMIShuffleTbl[] = { |
| {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb |
| {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb |
| |
| {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb |
| {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb |
| |
| {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b |
| {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b |
| {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b |
| }; |
| |
| if (ST->hasVBMI()) |
| if (const auto *Entry = |
| CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry AVX512BWShuffleTbl[] = { |
| {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw |
| {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb |
| |
| {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw |
| {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw |
| {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 |
| |
| {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw |
| {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw |
| {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 |
| |
| {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w |
| {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w |
| {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w |
| {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 |
| |
| {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw |
| {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb |
| }; |
| |
| if (ST->hasBWI()) |
| if (const auto *Entry = |
| CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry AVX512ShuffleTbl[] = { |
| {TTI::SK_Broadcast, MVT::v8f64, 1}, // vbroadcastpd |
| {TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps |
| {TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq |
| {TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd |
| {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw |
| {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb |
| |
| {TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd |
| {TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps |
| {TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq |
| {TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd |
| {TTI::SK_Reverse, MVT::v32i16, 7}, // per mca |
| {TTI::SK_Reverse, MVT::v64i8, 7}, // per mca |
| |
| {TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd |
| {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd |
| {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // vpermpd |
| {TTI::SK_PermuteSingleSrc, MVT::v16f32, 1}, // vpermps |
| {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps |
| {TTI::SK_PermuteSingleSrc, MVT::v4f32, 1}, // vpermps |
| {TTI::SK_PermuteSingleSrc, MVT::v8i64, 1}, // vpermq |
| {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq |
| {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // vpermq |
| {TTI::SK_PermuteSingleSrc, MVT::v16i32, 1}, // vpermd |
| {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd |
| {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // vpermd |
| {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb |
| |
| {TTI::SK_PermuteTwoSrc, MVT::v8f64, 1}, // vpermt2pd |
| {TTI::SK_PermuteTwoSrc, MVT::v16f32, 1}, // vpermt2ps |
| {TTI::SK_PermuteTwoSrc, MVT::v8i64, 1}, // vpermt2q |
| {TTI::SK_PermuteTwoSrc, MVT::v16i32, 1}, // vpermt2d |
| {TTI::SK_PermuteTwoSrc, MVT::v4f64, 1}, // vpermt2pd |
| {TTI::SK_PermuteTwoSrc, MVT::v8f32, 1}, // vpermt2ps |
| {TTI::SK_PermuteTwoSrc, MVT::v4i64, 1}, // vpermt2q |
| {TTI::SK_PermuteTwoSrc, MVT::v8i32, 1}, // vpermt2d |
| {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd |
| {TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps |
| {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q |
| {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d |
| |
| // FIXME: These entries just apply the type legalization cost rules |
| // above, assuming the shuffles completely split. |
| {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14}, |
| {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14}, |
| {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42}, |
| {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42}, |
| |
| {TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq |
| {TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq |
| {TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd |
| {TTI::SK_Select, MVT::v16f32, 1}, // vblendmps |
| {TTI::SK_Select, MVT::v8i64, 1}, // vpblendmq |
| {TTI::SK_Select, MVT::v16i32, 1}, // vpblendmd |
| }; |
| |
| if (ST->hasAVX512()) |
| if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry AVX2ShuffleTbl[] = { |
| {TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastsd |
| {TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastss |
| {TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq |
| {TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd |
| {TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw |
| {TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb |
| |
| {TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd |
| {TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps |
| {TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq |
| {TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd |
| {TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb |
| {TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb |
| |
| {TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb |
| {TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb |
| |
| {TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd |
| {TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps |
| {TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq |
| {TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd |
| {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb |
| // + vpblendvb |
| {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb |
| // + vpblendvb |
| |
| {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd |
| {TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps |
| {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd |
| {TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd |
| {TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb |
| // + vpblendvb |
| {TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb |
| // + vpblendvb |
| }; |
| |
| if (ST->hasAVX2()) |
| if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry XOPShuffleTbl[] = { |
| {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd |
| {TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps |
| {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd |
| {TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps |
| {TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm |
| // + vinsertf128 |
| {TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm |
| // + vinsertf128 |
| |
| {TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm |
| // + vinsertf128 |
| {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm |
| {TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm |
| // + vinsertf128 |
| {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm |
| }; |
| |
| if (ST->hasXOP()) |
| if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry AVX1ShuffleTbl[] = { |
| {TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd |
| {TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps |
| {TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd |
| {TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps |
| {TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128 |
| {TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128 |
| |
| {TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd |
| {TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps |
| {TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd |
| {TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps |
| {TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb |
| // + vinsertf128 |
| {TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb |
| // + vinsertf128 |
| |
| {TTI::SK_Select, MVT::v4i64, 1}, // vblendpd |
| {TTI::SK_Select, MVT::v4f64, 1}, // vblendpd |
| {TTI::SK_Select, MVT::v8i32, 1}, // vblendps |
| {TTI::SK_Select, MVT::v8f32, 1}, // vblendps |
| {TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor |
| {TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor |
| |
| {TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd |
| {TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd |
| {TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps |
| {TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps |
| {TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb |
| // + 2*por + vinsertf128 |
| {TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb |
| // + 2*por + vinsertf128 |
| |
| {TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd |
| {TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd |
| {TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps |
| {TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps |
| {TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb |
| // + 4*por + vinsertf128 |
| {TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb |
| // + 4*por + vinsertf128 |
| }; |
| |
| if (ST->hasAVX()) |
| if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry SSE41ShuffleTbl[] = { |
| {TTI::SK_Select, MVT::v2i64, 1}, // pblendw |
| {TTI::SK_Select, MVT::v2f64, 1}, // movsd |
| {TTI::SK_Select, MVT::v4i32, 1}, // pblendw |
| {TTI::SK_Select, MVT::v4f32, 1}, // blendps |
| {TTI::SK_Select, MVT::v8i16, 1}, // pblendw |
| {TTI::SK_Select, MVT::v16i8, 1} // pblendvb |
| }; |
| |
| if (ST->hasSSE41()) |
| if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry SSSE3ShuffleTbl[] = { |
| {TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb |
| {TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb |
| |
| {TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb |
| {TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb |
| |
| {TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por |
| {TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por |
| |
| {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb |
| {TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb |
| |
| {TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por |
| {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por |
| }; |
| |
| if (ST->hasSSSE3()) |
| if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry SSE2ShuffleTbl[] = { |
| {TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd |
| {TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd |
| {TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd |
| {TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd |
| {TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd |
| |
| {TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd |
| {TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd |
| {TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd |
| {TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd |
| {TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw |
| // + 2*pshufd + 2*unpck + packus |
| |
| {TTI::SK_Select, MVT::v2i64, 1}, // movsd |
| {TTI::SK_Select, MVT::v2f64, 1}, // movsd |
| {TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps |
| {TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por |
| {TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por |
| |
| {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd |
| {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd |
| {TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd |
| {TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw |
| // + pshufd/unpck |
| {TTI::SK_PermuteSingleSrc, MVT::v16i8, 10}, // 2*pshuflw + 2*pshufhw |
| // + 2*pshufd + 2*unpck + 2*packus |
| |
| {TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // shufpd |
| {TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // shufpd |
| {TTI::SK_PermuteTwoSrc, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd} |
| {TTI::SK_PermuteTwoSrc, MVT::v8i16, 8}, // blend+permute |
| {TTI::SK_PermuteTwoSrc, MVT::v16i8, 13}, // blend+permute |
| }; |
| |
| if (ST->hasSSE2()) |
| if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
| static const CostTblEntry SSE1ShuffleTbl[] = { |
| { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps |
| { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps |
| { TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps |
| { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps |
| { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps |
| }; |
| |
| if (ST->hasSSE1()) |
| if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) |
| return LT.first * Entry->Cost; |
| |
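| // No subtarget-specific entry matched this (kind, type) pair; fall back |
| // to the target-independent estimate. |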
| return BaseT::getShuffleCost(Kind, BaseTp, Mask, Index, SubTp); |
| } |
| |
| InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
| Type *Src, |
| TTI::CastContextHint CCH, |
| TTI::TargetCostKind CostKind, |
| const Instruction *I) { |
| int ISD = TLI->InstructionOpcodeToISD(Opcode); |
| assert(ISD && "Invalid opcode"); |
| |
| // TODO: Allow non-throughput costs that aren't binary. |
| auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { |
| if (CostKind != TTI::TCK_RecipThroughput) |
| return Cost == 0 ? 0 : 1; |
| return Cost; |
| }; |
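| // E.g. for TCK_CodeSize or TCK_Latency queries this collapses every |
| // non-zero table cost to 1, so those cost kinds only distinguish free |
| // from non-free conversions. |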
| |
| // The cost tables include both specific, custom (non-legal) src/dst type |
| // conversions and generic, legalized types. We test for the custom cases |
| // first, before falling back to the legalized ones. |
| // FIXME: Need a better design of the cost table to handle non-simple types |
| // with potentially massive numbers of combinations (elem_num x src_type x |
| // dst_type). |
| static const TypeConversionCostTblEntry AVX512BWConversionTbl[] { |
| { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 }, |
| |
| // Mask sign extend has an instruction. |
| { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 }, |
| |
| // Mask zero extend is a sext + shift. |
| { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 }, |
| |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, |
| { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, |
| { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 }, |
| { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 }, |
| { ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 }, |
| |
| { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 }, |
| { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm |
| { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb |
| { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb |
| { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb |
| }; |
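| |
| // Sketch of how these tables are consumed (assuming AVX512BW): entries |
| // are keyed on (opcode, dst type, src type), e.g. |
| //   %e = sext <32 x i8> %v to <32 x i16> |
| // hits the first entry above and costs LT.first * 1 (one vpmovsxbw). |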
| |
| static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { |
| // Mask sign extend has an instruction. |
| { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, |
| |
| // Mask zero extend is a sext + shift. |
| { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, |
| |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 }, |
| |
| { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, |
| { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, |
| |
| { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, |
| { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, |
| |
| { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 }, |
| { ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 }, |
| |
| { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, |
| { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, |
| }; |
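| |
| // AVX512DQ provides the direct 64-bit integer <-> FP conversions |
| // (vcvtqq2ps/vcvtqq2pd, vcvtuqq2ps/vcvtuqq2pd, and the vcvttps2qq / |
| // vcvttpd2qq / *2uqq truncating forms), hence the uniform cost of 1. |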
| |
| // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and |
| // 256-bit wide vectors. |
| |
| static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { |
| { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, |
| { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, |
| { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, |
| |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb |
| { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb |
| { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb |
| { ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb |
| { ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb |
| { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw |
| { ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw |
| { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb |
| { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb |
| { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb |
| { ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb |
| { ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb |
| { ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb |
| { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw |
| { ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw |
| { ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw |
| { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd |
| { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd |
| { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 }, // 2*vpmovqd+concat+vpmovdb |
| |
| { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32 |
| { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 }, |
| { ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 }, |
| |
| // Sign extend is zmm vpternlogd+vpmovdb. |
| // Zero extend is zmm broadcast load+vpmovdb. |
| { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 }, |
| { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 }, |
| { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 }, |
| { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 }, |
| { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 }, |
| { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 }, |
| |
| // Sign extend is zmm vpternlogd+vpmovdw. |
| // Zero extend is zmm vpternlogd+vpmovdw+vpsrlw. |
| { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 }, |
| { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, |
| { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 }, |
| { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, |
| { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 }, |
| { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, |
| { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 }, |
| { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 }, |
| |
| { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd |
| { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld |
| { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd |
| { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld |
| { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd |
| { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld |
| { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq |
| { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq |
| { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq |
| { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq |
| |
| { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd |
| { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld |
| { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq |
| { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq |
| |
| { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, |
| |
| { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right |
| { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right |
| |
| { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, |
| { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, |
| { ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, |
| { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, |
| { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, |
| { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, |
| { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, |
| { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, |
| |
| { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, |
| { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, |
| { ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 }, |
| { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 }, |
| { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, |
| { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 }, |
| { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, |
| { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, |
| { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 }, |
| { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 }, |
| |
| { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, |
| { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 }, |
| { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64, 15 }, |
| { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32, 11 }, |
| { ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64, 31 }, |
| { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 }, |
| { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 }, |
| { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 }, |
| { ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64, 15 }, |
| { ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 }, |
| { ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 }, |
| |
| { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, |
| { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 }, |
| { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 }, |
| { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, |
| { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 }, |
| { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 }, |
| }; |
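| |
| // Contrast with the AVX512DQ table above: plain AVX512F has no direct |
| // i64 <-> fp instructions, so e.g. the v8i64 -> v8f32 UINT_TO_FP entry |
| // reflects a long expansion (cost 26). |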
| |
| static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] { |
| // Mask sign extend has an instruction. |
| { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 }, |
| |
| // Mask zero extend is a sext + shift. |
| { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 }, |
| |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 }, |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, |
| { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, |
| { ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 }, |
| { ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 }, |
| { ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 }, |
| |
| { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, |
| }; |
| |
| static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = { |
| // Mask sign extend has an instruction. |
| { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, |
| |
| // Mask zero extend is a sext + shift. |
| { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, |
| { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, |
| |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 }, |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 }, |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, |
| |
| { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, |
| { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, |
| { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, |
| { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, |
| |
| { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, |
| { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, |
| { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, |
| { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, |
| |
| { ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 }, |
| { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 }, |
| { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, |
| { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 }, |
| |
| { ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 }, |
| { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, |
| { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, |
| { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, |
| }; |
| |
| static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = { |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8 |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16 |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd |
| { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq |
| { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd |
| { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb |
| { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw |
| { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb |
| |
| // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb |
| // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb |
| { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 }, |
| { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 }, |
| { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 }, |
| { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 }, |
| { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 }, |
| { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 }, |
| { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 }, |
| { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 }, |
| |
| // sign extend is vpcmpeq+maskedmove+vpmovdw |
| // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw |
| { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 }, |
| { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 }, |
| { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 }, |
| { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 }, |
| { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 }, |
| { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 }, |
| { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 }, |
| { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 }, |
| |
| { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd |
| { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld |
| { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd |
| { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld |
| { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd |
| { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld |
| { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq |
| { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq |
| { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq |
| { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq |
| |
| { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, |
| { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, |
| { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, |
| |
| { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, |
| { ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, |
| { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, |
| { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, |
| |
| { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 }, |
| { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 }, |
| { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 }, |
| { ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 }, |
| { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 }, |
| { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 }, |
| { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, |
| { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, |
| { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, |
| { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, |
| { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 }, |
| { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, |
| { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 }, |
| |
| { ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 }, |
| { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 }, |
| { ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 }, |
| |
| { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 }, |
| { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 }, |
| { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, |
| { ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 }, |
| { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 }, |
| { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, |
| { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 }, |
| |