//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AArch64TargetTransformInfo.h"
#include "AArch64ExpandImm.h"
#include "AArch64PerfectShuffle.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64tti"
static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
cl::init(true), cl::Hidden);
static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
cl::Hidden);
static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
cl::init(10), cl::Hidden);
static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
cl::init(15), cl::Hidden);
static cl::opt<unsigned>
NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
cl::Hidden);
namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
// when setting the -sve-tail-folding option. This option should always be of
// the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
// InitialBits is one of (disabled|all|simple). EnableBits represents
// additional flags we're enabling, and DisableBits for those flags we're
// disabling. The default flag is tracked in the variable NeedsDefault, since
// at the time of setting the option we may not know what the default value
// for the CPU is.
TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
// This value needs to be initialised to true in case the user does not
// explicitly set the -sve-tail-folding option.
bool NeedsDefault = true;
void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
void setNeedsDefault(bool V) { NeedsDefault = V; }
void setEnableBit(TailFoldingOpts Bit) {
EnableBits |= Bit;
DisableBits &= ~Bit;
}
void setDisableBit(TailFoldingOpts Bit) {
EnableBits &= ~Bit;
DisableBits |= Bit;
}
TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
TailFoldingOpts Bits = TailFoldingOpts::Disabled;
assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
"Initial bits should only include one of "
"(disabled|all|simple|default)");
Bits = NeedsDefault ? DefaultBits : InitialBits;
Bits |= EnableBits;
Bits &= ~DisableBits;
return Bits;
}
void reportError(std::string Opt) {
errs() << "invalid argument '" << Opt
<< "' to -sve-tail-folding=; the option should be of the form\n"
" (disabled|all|default|simple)[+(reductions|recurrences"
"|reverse|noreductions|norecurrences|noreverse)]\n";
report_fatal_error("Unrecognised tail-folding option");
}
public:
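// Parse a -sve-tail-folding= string of the form described above, e.g.
// "all+noreverse" enables every legal loop type except those needing reversed
// predicates, and "default+reductions" adds reductions on top of the target
// CPU's default setting.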
void operator=(const std::string &Val) {
// If the user explicitly sets -sve-tail-folding= then treat as an error.
if (Val.empty()) {
reportError("");
return;
}
// Since the user is explicitly setting the option we don't automatically
// need the default unless they require it.
setNeedsDefault(false);
SmallVector<StringRef, 4> TailFoldTypes;
StringRef(Val).split(TailFoldTypes, '+', -1, false);
unsigned StartIdx = 1;
if (TailFoldTypes[0] == "disabled")
setInitialBits(TailFoldingOpts::Disabled);
else if (TailFoldTypes[0] == "all")
setInitialBits(TailFoldingOpts::All);
else if (TailFoldTypes[0] == "default")
setNeedsDefault(true);
else if (TailFoldTypes[0] == "simple")
setInitialBits(TailFoldingOpts::Simple);
else {
StartIdx = 0;
setInitialBits(TailFoldingOpts::Disabled);
}
for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
if (TailFoldTypes[I] == "reductions")
setEnableBit(TailFoldingOpts::Reductions);
else if (TailFoldTypes[I] == "recurrences")
setEnableBit(TailFoldingOpts::Recurrences);
else if (TailFoldTypes[I] == "reverse")
setEnableBit(TailFoldingOpts::Reverse);
else if (TailFoldTypes[I] == "noreductions")
setDisableBit(TailFoldingOpts::Reductions);
else if (TailFoldTypes[I] == "norecurrences")
setDisableBit(TailFoldingOpts::Recurrences);
else if (TailFoldTypes[I] == "noreverse")
setDisableBit(TailFoldingOpts::Reverse);
else
reportError(Val);
}
}
bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
return (getBits(DefaultBits) & Required) == Required;
}
};
} // namespace
TailFoldingOption TailFoldingOptionLoc;
cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding(
"sve-tail-folding",
cl::desc(
"Control the use of vectorisation using tail-folding for SVE where the"
" option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
"\ndisabled (Initial) No loop types will vectorize using "
"tail-folding"
"\ndefault (Initial) Uses the default tail-folding settings for "
"the target CPU"
"\nall (Initial) All legal loop types will vectorize using "
"tail-folding"
"\nsimple (Initial) Use tail-folding for simple loops (not "
"reductions or recurrences)"
"\nreductions Use tail-folding for loops containing reductions"
"\nnoreductions Inverse of above"
"\nrecurrences Use tail-folding for loops containing fixed order "
"recurrences"
"\nnorecurrences Inverse of above"
"\nreverse Use tail-folding for loops requiring reversed "
"predicates"
"\nnoreverse Inverse of above"),
cl::location(TailFoldingOptionLoc));
// Experimental option that will only be fully functional when the
// code-generator is changed to use SVE instead of NEON for all fixed-width
// operations.
static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode(
"enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
// Experimental option that will only be fully functional when the cost-model
// and code-generator have been changed to avoid using scalable vector
// instructions that are not legal in streaming SVE mode.
static cl::opt<bool> EnableScalableAutovecInStreamingMode(
"enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
SMEAttrs CallerAttrs(*Caller);
SMEAttrs CalleeAttrs(*Callee);
if (CallerAttrs.requiresSMChange(CalleeAttrs,
/*BodyOverridesInterface=*/true) ||
CallerAttrs.requiresLazySave(CalleeAttrs) ||
CalleeAttrs.hasNewZABody())
return false;
const TargetMachine &TM = getTLI()->getTargetMachine();
const FeatureBitset &CallerBits =
TM.getSubtargetImpl(*Caller)->getFeatureBits();
const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();
// Inline a callee if its target-features are a subset of the caller's
// target-features.
return (CallerBits & CalleeBits) == CalleeBits;
}
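// Only maximize vector bandwidth for fixed-width vectorization, and only when
// NEON is available.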
bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
TargetTransformInfo::RegisterKind K) const {
assert(K != TargetTransformInfo::RGK_Scalar);
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
ST->isNeonAvailable());
}
/// Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
// Check if the immediate can be encoded within an instruction.
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
return 0;
if (Val < 0)
Val = ~Val;
// Calculate how many moves we will need to materialize this constant.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(Val, 64, Insn);
return Insn.size();
}
/// Calculate the cost of materializing the given constant.
InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0)
return ~0U;
// Sign-extend all constants to a multiple of 64-bit.
APInt ImmVal = Imm;
if (BitSize & 0x3f)
ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
// Split the constant into 64-bit chunks and calculate the cost for each
// chunk.
InstructionCost Cost = 0;
for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
int64_t Val = Tmp.getSExtValue();
Cost += getIntImmCost(Val);
}
// We need at least one instruction to materialize the constant.
return std::max<InstructionCost>(1, Cost);
}
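/// Calculate the cost of materializing the immediate Imm when it is used as
/// operand Idx of the given instruction. Cheap immediates in operand positions
/// that can typically fold them are reported as TCC_Free so that constant
/// hoisting leaves them alone.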
InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind,
Instruction *Inst) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
// There is no cost model for constants with a bit size of 0. Return TCC_Free
// here, so that constant hoisting will ignore this constant.
if (BitSize == 0)
return TTI::TCC_Free;
unsigned ImmIdx = ~0U;
switch (Opcode) {
default:
return TTI::TCC_Free;
case Instruction::GetElementPtr:
// Always hoist the base address of a GetElementPtr.
if (Idx == 0)
return 2 * TTI::TCC_Basic;
return TTI::TCC_Free;
case Instruction::Store:
ImmIdx = 0;
break;
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
case Instruction::ICmp:
ImmIdx = 1;
break;
// Always return TCC_Free for the shift value of a shift instruction.
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
if (Idx == 1)
return TTI::TCC_Free;
break;
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::IntToPtr:
case Instruction::PtrToInt:
case Instruction::BitCast:
case Instruction::PHI:
case Instruction::Call:
case Instruction::Select:
case Instruction::Ret:
case Instruction::Load:
break;
}
if (Idx == ImmIdx) {
int NumConstants = (BitSize + 63) / 64;
InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
}
return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
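/// As above, but for an immediate used as operand Idx of an intrinsic call
/// (overflow intrinsics, stackmaps, patchpoints and statepoints).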
InstructionCost
AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
// There is no cost model for constants with a bit size of 0. Return TCC_Free
// here, so that constant hoisting will ignore this constant.
if (BitSize == 0)
return TTI::TCC_Free;
// Most (all?) AArch64 intrinsics do not support folding immediates into the
// selected instruction, so we compute the materialization cost for the
// immediate directly.
if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
switch (IID) {
default:
return TTI::TCC_Free;
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow:
if (Idx == 1) {
int NumConstants = (BitSize + 63) / 64;
InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
}
break;
case Intrinsic::experimental_stackmap:
if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
return TTI::TCC_Free;
break;
case Intrinsic::experimental_patchpoint_void:
case Intrinsic::experimental_patchpoint_i64:
if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
return TTI::TCC_Free;
break;
case Intrinsic::experimental_gc_statepoint:
if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
return TTI::TCC_Free;
break;
}
return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
TargetTransformInfo::PopcntSupportKind
AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
if (TyWidth == 32 || TyWidth == 64)
return TTI::PSK_FastHardware;
// TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
return TTI::PSK_Software;
}
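/// AArch64-specific intrinsic costs (min/max, saturating arithmetic, abs,
/// bswap, stepvector, bitreverse, ctpop, the overflow intrinsics,
/// fptosi_sat/fptoui_sat and funnel shifts); anything else falls through to
/// the base implementation.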
InstructionCost
AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind) {
auto *RetTy = ICA.getReturnType();
switch (ICA.getID()) {
case Intrinsic::umin:
case Intrinsic::umax:
case Intrinsic::smin:
case Intrinsic::smax: {
static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
MVT::v8i16, MVT::v2i32, MVT::v4i32,
MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
MVT::nxv2i64};
auto LT = getTypeLegalizationCost(RetTy);
// v2i64 types get converted to cmp+bif, hence the cost of 2.
if (LT.second == MVT::v2i64)
return LT.first * 2;
if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
return LT.first;
break;
}
case Intrinsic::sadd_sat:
case Intrinsic::ssub_sat:
case Intrinsic::uadd_sat:
case Intrinsic::usub_sat: {
static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
MVT::v8i16, MVT::v2i32, MVT::v4i32,
MVT::v2i64};
auto LT = getTypeLegalizationCost(RetTy);
// This is a base cost of 1 for the vadd, plus 3 extra shifts if we
// need to extend the type, as it uses shr(qadd(shl, shl)).
unsigned Instrs =
LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
return LT.first * Instrs;
break;
}
case Intrinsic::abs: {
static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
MVT::v8i16, MVT::v2i32, MVT::v4i32,
MVT::v2i64};
auto LT = getTypeLegalizationCost(RetTy);
if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
return LT.first;
break;
}
case Intrinsic::bswap: {
static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
MVT::v4i32, MVT::v2i64};
auto LT = getTypeLegalizationCost(RetTy);
if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
return LT.first;
break;
}
case Intrinsic::experimental_stepvector: {
InstructionCost Cost = 1; // Cost of the `index' instruction
auto LT = getTypeLegalizationCost(RetTy);
// Legalisation of illegal vectors involves an `index' instruction plus
// (LT.first - 1) vector adds.
if (LT.first > 1) {
Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
InstructionCost AddCost =
getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
Cost += AddCost * (LT.first - 1);
}
return Cost;
}
case Intrinsic::bitreverse: {
static const CostTblEntry BitreverseTbl[] = {
{Intrinsic::bitreverse, MVT::i32, 1},
{Intrinsic::bitreverse, MVT::i64, 1},
{Intrinsic::bitreverse, MVT::v8i8, 1},
{Intrinsic::bitreverse, MVT::v16i8, 1},
{Intrinsic::bitreverse, MVT::v4i16, 2},
{Intrinsic::bitreverse, MVT::v8i16, 2},
{Intrinsic::bitreverse, MVT::v2i32, 2},
{Intrinsic::bitreverse, MVT::v4i32, 2},
{Intrinsic::bitreverse, MVT::v1i64, 2},
{Intrinsic::bitreverse, MVT::v2i64, 2},
};
const auto LegalisationCost = getTypeLegalizationCost(RetTy);
const auto *Entry =
CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
if (Entry) {
// The cost model uses the legal type (i32) that i8 and i16 are promoted
// to; add 1 so that we match the actual lowering cost.
if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
TLI->getValueType(DL, RetTy, true) == MVT::i16)
return LegalisationCost.first * Entry->Cost + 1;
return LegalisationCost.first * Entry->Cost;
}
break;
}
case Intrinsic::ctpop: {
if (!ST->hasNEON()) {
// 32-bit or 64-bit ctpop without NEON is 12 instructions.
return getTypeLegalizationCost(RetTy).first * 12;
}
static const CostTblEntry CtpopCostTbl[] = {
{ISD::CTPOP, MVT::v2i64, 4},
{ISD::CTPOP, MVT::v4i32, 3},
{ISD::CTPOP, MVT::v8i16, 2},
{ISD::CTPOP, MVT::v16i8, 1},
{ISD::CTPOP, MVT::i64, 4},
{ISD::CTPOP, MVT::v2i32, 3},
{ISD::CTPOP, MVT::v4i16, 2},
{ISD::CTPOP, MVT::v8i8, 1},
{ISD::CTPOP, MVT::i32, 5},
};
auto LT = getTypeLegalizationCost(RetTy);
MVT MTy = LT.second;
if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
// Extra cost of +1 when illegal vector types are legalized by promoting
// the integer type.
int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
RetTy->getScalarSizeInBits()
? 1
: 0;
return LT.first * Entry->Cost + ExtraCost;
}
break;
}
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
case Intrinsic::ssub_with_overflow:
case Intrinsic::usub_with_overflow:
case Intrinsic::smul_with_overflow:
case Intrinsic::umul_with_overflow: {
static const CostTblEntry WithOverflowCostTbl[] = {
{Intrinsic::sadd_with_overflow, MVT::i8, 3},
{Intrinsic::uadd_with_overflow, MVT::i8, 3},
{Intrinsic::sadd_with_overflow, MVT::i16, 3},
{Intrinsic::uadd_with_overflow, MVT::i16, 3},
{Intrinsic::sadd_with_overflow, MVT::i32, 1},
{Intrinsic::uadd_with_overflow, MVT::i32, 1},
{Intrinsic::sadd_with_overflow, MVT::i64, 1},
{Intrinsic::uadd_with_overflow, MVT::i64, 1},
{Intrinsic::ssub_with_overflow, MVT::i8, 3},
{Intrinsic::usub_with_overflow, MVT::i8, 3},
{Intrinsic::ssub_with_overflow, MVT::i16, 3},
{Intrinsic::usub_with_overflow, MVT::i16, 3},
{Intrinsic::ssub_with_overflow, MVT::i32, 1},
{Intrinsic::usub_with_overflow, MVT::i32, 1},
{Intrinsic::ssub_with_overflow, MVT::i64, 1},
{Intrinsic::usub_with_overflow, MVT::i64, 1},
{Intrinsic::smul_with_overflow, MVT::i8, 5},
{Intrinsic::umul_with_overflow, MVT::i8, 4},
{Intrinsic::smul_with_overflow, MVT::i16, 5},
{Intrinsic::umul_with_overflow, MVT::i16, 4},
{Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
{Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
{Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
{Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
};
EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
if (MTy.isSimple())
if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
MTy.getSimpleVT()))
return Entry->Cost;
break;
}
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat: {
if (ICA.getArgTypes().empty())
break;
bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
EVT MTy = TLI->getValueType(DL, RetTy);
// Check for the legal types, i.e. where the size of the input and the
// output are the same, or we are using cvt f64->i32 or f32->i64.
if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
LT.second == MVT::v2f64) &&
(LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
(LT.second == MVT::f64 && MTy == MVT::i32) ||
(LT.second == MVT::f32 && MTy == MVT::i64)))
return LT.first;
// Similarly for fp16 sizes
if (ST->hasFullFP16() &&
((LT.second == MVT::f16 && MTy == MVT::i32) ||
((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
(LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
return LT.first;
// Otherwise we use a legal convert followed by a min+max
if ((LT.second.getScalarType() == MVT::f32 ||
LT.second.getScalarType() == MVT::f64 ||
(ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
Type *LegalTy =
Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
if (LT.second.isVector())
LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
InstructionCost Cost = 1;
IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
LegalTy, {LegalTy, LegalTy});
Cost += getIntrinsicInstrCost(Attrs1, CostKind);
IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
LegalTy, {LegalTy, LegalTy});
Cost += getIntrinsicInstrCost(Attrs2, CostKind);
return LT.first * Cost;
}
break;
}
case Intrinsic::fshl:
case Intrinsic::fshr: {
if (ICA.getArgs().empty())
break;
// TODO: Add handling for fshl where third argument is not a constant.
const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
if (!OpInfoZ.isConstant())
break;
const auto LegalisationCost = getTypeLegalizationCost(RetTy);
if (OpInfoZ.isUniform()) {
// FIXME: The costs could be lower if the codegen is better.
static const CostTblEntry FshlTbl[] = {
{Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
{Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
{Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
{Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
// Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
// to avoid having to duplicate the costs.
const auto *Entry =
CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
if (Entry)
return LegalisationCost.first * Entry->Cost;
}
auto TyL = getTypeLegalizationCost(RetTy);
if (!RetTy->isIntegerTy())
break;
// Estimate cost manually, as types like i8 and i16 will get promoted to
// i32 and CostTableLookup will ignore the extra conversion cost.
bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
RetTy->getScalarSizeInBits() < 64) ||
(RetTy->getScalarSizeInBits() % 64 != 0);
unsigned ExtraCost = HigherCost ? 1 : 0;
if (RetTy->getScalarSizeInBits() == 32 ||
RetTy->getScalarSizeInBits() == 64)
ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
// extr instruction.
else if (HigherCost)
ExtraCost = 1;
else
break;
return TyL.first + ExtraCost;
}
default:
break;
}
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
/// The function removes redundant reinterpret (to/from svbool) casts in the
/// presence of control flow.
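// e.g. from_svbool(phi [ to_svbool(a), to_svbool(b) ]) --> phi [ a, b ],
// provided a and b already have the type produced by the outer reinterpret.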
static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
IntrinsicInst &II) {
SmallVector<Instruction *, 32> Worklist;
auto RequiredType = II.getType();
auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
assert(PN && "Expected Phi Node!");
// Don't create a new Phi unless we can remove the old one.
if (!PN->hasOneUse())
return std::nullopt;
for (Value *IncValPhi : PN->incoming_values()) {
auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
if (!Reinterpret ||
Reinterpret->getIntrinsicID() !=
Intrinsic::aarch64_sve_convert_to_svbool ||
RequiredType != Reinterpret->getArgOperand(0)->getType())
return std::nullopt;
}
// Create the new Phi
IC.Builder.SetInsertPoint(PN);
PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
Worklist.push_back(PN);
for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
Worklist.push_back(Reinterpret);
}
// Cleanup Phi Node and reinterprets
return IC.replaceInstUsesWith(II, NPN);
}
// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
// => (binop (pred) (from_svbool _) (from_svbool _))
//
// The above transformation eliminates a `to_svbool` in the predicate
// operand of bitwise operation `binop` by narrowing the vector width of
// the operation. For example, it would convert a `<vscale x 16 x i1>
// and` into a `<vscale x 4 x i1> and`. This is profitable because
// to_svbool must zero the new lanes during widening, whereas
// from_svbool is free.
static std::optional<Instruction *>
tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
if (!BinOp)
return std::nullopt;
auto IntrinsicID = BinOp->getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::aarch64_sve_and_z:
case Intrinsic::aarch64_sve_bic_z:
case Intrinsic::aarch64_sve_eor_z:
case Intrinsic::aarch64_sve_nand_z:
case Intrinsic::aarch64_sve_nor_z:
case Intrinsic::aarch64_sve_orn_z:
case Intrinsic::aarch64_sve_orr_z:
break;
default:
return std::nullopt;
}
auto BinOpPred = BinOp->getOperand(0);
auto BinOpOp1 = BinOp->getOperand(1);
auto BinOpOp2 = BinOp->getOperand(2);
auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
if (!PredIntr ||
PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
return std::nullopt;
auto PredOp = PredIntr->getOperand(0);
auto PredOpTy = cast<VectorType>(PredOp->getType());
if (PredOpTy != II.getType())
return std::nullopt;
SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
if (BinOpOp1 == BinOpOp2)
NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
else
NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
auto NarrowedBinOp =
IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
return IC.replaceInstUsesWith(II, NarrowedBinOp);
}
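// Simplify convert.from.svbool(X) by walking back through a chain of svbool
// conversions looking for an earlier value that already has the required type
// and whose lanes cannot have been zeroed along the way.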
static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
// If the reinterpret instruction operand is a PHI Node
if (isa<PHINode>(II.getArgOperand(0)))
return processPhiNode(IC, II);
if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
return BinOpCombine;
// Ignore converts to/from svcount_t.
if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
isa<TargetExtType>(II.getType()))
return std::nullopt;
SmallVector<Instruction *, 32> CandidatesForRemoval;
Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
const auto *IVTy = cast<VectorType>(II.getType());
// Walk the chain of conversions.
while (Cursor) {
// If the type of the cursor has fewer lanes than the final result, zeroing
// must take place, which breaks the equivalence chain.
const auto *CursorVTy = cast<VectorType>(Cursor->getType());
if (CursorVTy->getElementCount().getKnownMinValue() <
IVTy->getElementCount().getKnownMinValue())
break;
// If the cursor has the same type as II, it is a viable replacement.
if (Cursor->getType() == IVTy)
EarliestReplacement = Cursor;
auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
// If this is not an SVE conversion intrinsic, this is the end of the chain.
if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
Intrinsic::aarch64_sve_convert_to_svbool ||
IntrinsicCursor->getIntrinsicID() ==
Intrinsic::aarch64_sve_convert_from_svbool))
break;
CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
Cursor = IntrinsicCursor->getOperand(0);
}
// If no viable replacement in the conversion chain was found, there is
// nothing to do.
if (!EarliestReplacement)
return std::nullopt;
return IC.replaceInstUsesWith(II, EarliestReplacement);
}
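// sve.sel is just a vector select, so emit the equivalent IR instruction.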
static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
IntrinsicInst &II) {
auto Select = IC.Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
II.getOperand(2));
return IC.replaceInstUsesWith(II, Select);
}
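// A dup whose governing predicate is ptrue(vl1) only writes lane 0, so it can
// be replaced with an insertelement into lane 0 of the passthru operand.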
static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
IntrinsicInst &II) {
IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
if (!Pg)
return std::nullopt;
if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
return std::nullopt;
const auto PTruePattern =
cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
if (PTruePattern != AArch64SVEPredPattern::vl1)
return std::nullopt;
// The intrinsic is inserting into lane zero so use an insert instead.
auto *IdxTy = Type::getInt64Ty(II.getContext());
auto *Insert = InsertElementInst::Create(
II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
Insert->insertBefore(&II);
Insert->takeName(&II);
return IC.replaceInstUsesWith(II, Insert);
}
static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
IntrinsicInst &II) {
// Replace DupX with a regular IR splat.
auto *RetTy = cast<ScalableVectorType>(II.getType());
Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
II.getArgOperand(0));
Splat->takeName(&II);
return IC.replaceInstUsesWith(II, Splat);
}
static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
IntrinsicInst &II) {
LLVMContext &Ctx = II.getContext();
// Check that the predicate is all active
auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
return std::nullopt;
const auto PTruePattern =
cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
if (PTruePattern != AArch64SVEPredPattern::all)
return std::nullopt;
// Check that we have a compare of zero..
auto *SplatValue =
dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
if (!SplatValue || !SplatValue->isZero())
return std::nullopt;
// ..against a dupq
auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
if (!DupQLane ||
DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
return std::nullopt;
// Where the dupq is a lane 0 replicate of a vector insert
if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
return std::nullopt;
auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
return std::nullopt;
// Where the vector insert is a fixed constant vector insert into undef at
// index zero
if (!isa<UndefValue>(VecIns->getArgOperand(0)))
return std::nullopt;
if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
return std::nullopt;
auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
if (!ConstVec)
return std::nullopt;
auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
return std::nullopt;
unsigned NumElts = VecTy->getNumElements();
unsigned PredicateBits = 0;
// Expand intrinsic operands to a 16-bit byte level predicate
for (unsigned I = 0; I < NumElts; ++I) {
auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
if (!Arg)
return std::nullopt;
if (!Arg->isZero())
PredicateBits |= 1 << (I * (16 / NumElts));
}
// If all bits are zero bail early with an empty predicate
if (PredicateBits == 0) {
auto *PFalse = Constant::getNullValue(II.getType());
PFalse->takeName(&II);
return IC.replaceInstUsesWith(II, PFalse);
}
// Calculate largest predicate type used (where byte predicate is largest)
unsigned Mask = 8;
for (unsigned I = 0; I < 16; ++I)
if ((PredicateBits & (1 << I)) != 0)
Mask |= (I % 8);
unsigned PredSize = Mask & -Mask;
auto *PredType = ScalableVectorType::get(
Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
// Ensure all relevant bits are set
for (unsigned I = 0; I < 16; I += PredSize)
if ((PredicateBits & (1 << I)) == 0)
return std::nullopt;
auto *PTruePat =
ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
{PredType}, {PTruePat});
auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
auto *ConvertFromSVBool =
IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
{II.getType()}, {ConvertToSVBool});
ConvertFromSVBool->takeName(&II);
return IC.replaceInstUsesWith(II, ConvertFromSVBool);
}
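// Simplify lasta/lastb: lastX(splat(X)) is X, the extract distributes over a
// single-use binop with a splat operand, and a known predicate pattern lets us
// use a plain extractelement of a fixed lane.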
static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
IntrinsicInst &II) {
Value *Pg = II.getArgOperand(0);
Value *Vec = II.getArgOperand(1);
auto IntrinsicID = II.getIntrinsicID();
bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
// lastX(splat(X)) --> X
if (auto *SplatVal = getSplatValue(Vec))
return IC.replaceInstUsesWith(II, SplatVal);
// If x and/or y is a splat value then:
// lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
Value *LHS, *RHS;
if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
if (isSplatValue(LHS) || isSplatValue(RHS)) {
auto *OldBinOp = cast<BinaryOperator>(Vec);
auto OpC = OldBinOp->getOpcode();
auto *NewLHS =
IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
auto *NewRHS =
IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
return IC.replaceInstUsesWith(II, NewBinOp);
}
}
auto *C = dyn_cast<Constant>(Pg);
if (IsAfter && C && C->isNullValue()) {
// The intrinsic is extracting lane 0 so use an extract instead.
auto *IdxTy = Type::getInt64Ty(II.getContext());
auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
Extract->insertBefore(&II);
Extract->takeName(&II);
return IC.replaceInstUsesWith(II, Extract);
}
auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
if (!IntrPG)
return std::nullopt;
if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
return std::nullopt;
const auto PTruePattern =
cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
// Can the intrinsic's predicate be converted to a known constant index?
unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
if (!MinNumElts)
return std::nullopt;
unsigned Idx = MinNumElts - 1;
// Increment the index if extracting the element after the last active
// predicate element.
if (IsAfter)
++Idx;
// Ignore extracts whose index is larger than the known minimum vector
// length. NOTE: This is an artificial constraint where we prefer to
// maintain what the user asked for until an alternative is proven faster.
auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
if (Idx >= PgVTy->getMinNumElements())
return std::nullopt;
// The intrinsic is extracting a fixed lane so use an extract instead.
auto *IdxTy = Type::getInt64Ty(II.getContext());
auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
Extract->insertBefore(&II);
Extract->takeName(&II);
return IC.replaceInstUsesWith(II, Extract);
}
static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
IntrinsicInst &II) {
// The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
// integer variant across a variety of micro-architectures. Replace scalar
// integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
// bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
// depending on the micro-architecture, but has been observed as generally
// being faster, particularly when the CLAST[AB] op is a loop-carried
// dependency.
Value *Pg = II.getArgOperand(0);
Value *Fallback = II.getArgOperand(1);
Value *Vec = II.getArgOperand(2);
Type *Ty = II.getType();
if (!Ty->isIntegerTy())
return std::nullopt;
Type *FPTy;
switch (cast<IntegerType>(Ty)->getBitWidth()) {
default:
return std::nullopt;
case 16:
FPTy = IC.Builder.getHalfTy();
break;
case 32:
FPTy = IC.Builder.getFloatTy();
break;
case 64:
FPTy = IC.Builder.getDoubleTy();
break;
}
Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
auto *FPVTy = VectorType::get(
FPTy, cast<VectorType>(Vec->getType())->getElementCount());
Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
auto *FPII = IC.Builder.CreateIntrinsic(
II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
return IC.replaceInstUsesWith(II, FPIItoInt);
}
static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
IntrinsicInst &II) {
LLVMContext &Ctx = II.getContext();
// Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
// can work with RDFFR_PP for ptest elimination.
auto *AllPat =
ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
{II.getType()}, {AllPat});
auto *RDFFR =
IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
RDFFR->takeName(&II);
return IC.replaceInstUsesWith(II, RDFFR);
}
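// cntb/cnth/cntw/cntd with the 'all' pattern count every element, i.e.
// vscale * NumElts (e.g. cntw(all) --> vscale * 4). Fixed-length patterns fold
// to a constant when the requested count fits within the known minimum vector
// length.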
static std::optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
if (Pattern == AArch64SVEPredPattern::all) {
Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
auto *VScale = IC.Builder.CreateVScale(StepVal);
VScale->takeName(&II);
return IC.replaceInstUsesWith(II, VScale);
}
unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
return MinNumElts && NumElts >= MinNumElts
? std::optional<Instruction *>(IC.replaceInstUsesWith(
II, ConstantInt::get(II.getType(), MinNumElts)))
: std::nullopt;
}
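// Simplify ptest intrinsics: ptest_first/last(X, X) becomes ptest_any(X, X),
// matching svbool casts on both operands are stripped, and ptest_any(X, X)
// where X is a flag-setting operation on PG is rewritten as ptest_any(PG, X).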
static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
IntrinsicInst &II) {
Value *PgVal = II.getArgOperand(0);
Value *OpVal = II.getArgOperand(1);
// PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
// Later optimizations prefer this form.
if (PgVal == OpVal &&
(II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
Value *Ops[] = {PgVal, OpVal};
Type *Tys[] = {PgVal->getType()};
auto *PTest =
IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
PTest->takeName(&II);
return IC.replaceInstUsesWith(II, PTest);
}
IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
if (!Pg || !Op)
return std::nullopt;
Intrinsic::ID OpIID = Op->getIntrinsicID();
if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
Type *Tys[] = {Pg->getArgOperand(0)->getType()};
auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
PTest->takeName(&II);
return IC.replaceInstUsesWith(II, PTest);
}
// Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X).
// Later optimizations may rewrite sequence to use the flag-setting variant
// of instruction X to remove PTEST.
if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
((OpIID == Intrinsic::aarch64_sve_brka_z) ||
(OpIID == Intrinsic::aarch64_sve_brkb_z) ||
(OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
(OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
(OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
(OpIID == Intrinsic::aarch64_sve_and_z) ||
(OpIID == Intrinsic::aarch64_sve_bic_z) ||
(OpIID == Intrinsic::aarch64_sve_eor_z) ||
(OpIID == Intrinsic::aarch64_sve_nand_z) ||
(OpIID == Intrinsic::aarch64_sve_nor_z) ||
(OpIID == Intrinsic::aarch64_sve_orn_z) ||
(OpIID == Intrinsic::aarch64_sve_orr_z))) {
Value *Ops[] = {Pg->getArgOperand(0), Pg};
Type *Tys[] = {Pg->getType()};
auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
PTest->takeName(&II);
return IC.replaceInstUsesWith(II, PTest);
}
return std::nullopt;
}
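// Fuse a predicated multiply (MulOpc) feeding this add/sub intrinsic into the
// corresponding multiply-accumulate intrinsic (FuseOpc), provided the multiply
// has a single use, both share the same predicate and, for floating point, the
// fast-math flags match and allow contraction.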
template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
bool MergeIntoAddendOp) {
Value *P = II.getOperand(0);
Value *MulOp0, *MulOp1, *AddendOp, *Mul;
if (MergeIntoAddendOp) {
AddendOp = II.getOperand(1);
Mul = II.getOperand(2);
} else {
AddendOp = II.getOperand(2);
Mul = II.getOperand(1);
}
if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
m_Value(MulOp1))))
return std::nullopt;
if (!Mul->hasOneUse())
return std::nullopt;
Instruction *FMFSource = nullptr;
if (II.getType()->isFPOrFPVectorTy()) {
llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
// Stop the combine when the flags on the inputs differ in case dropping
// flags would lead to us missing out on more beneficial optimizations.
if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
return std::nullopt;
if (!FAddFlags.allowContract())
return std::nullopt;
FMFSource = &II;
}
CallInst *Res;
if (MergeIntoAddendOp)
Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
{P, AddendOp, MulOp0, MulOp1}, FMFSource);
else
Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
{P, MulOp0, MulOp1, AddendOp}, FMFSource);
return IC.replaceInstUsesWith(II, Res);
}
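// Return true if Pred is known to be an all-active ptrue, looking through a
// from/to svbool round-trip when the cast cannot have zeroed any lanes.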
static bool isAllActivePredicate(Value *Pred) {
// Look through a convert.from.svbool(convert.to.svbool(...)) chain.
Value *UncastedPred;
if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
m_Value(UncastedPred)))))
// If the predicate has the same or fewer lanes than the uncasted
// predicate then we know the casting has no effect.
if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
Pred = UncastedPred;
return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
m_ConstantInt<AArch64SVEPredPattern::all>()));
}
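// Lower sve.ld1 to a plain load when the predicate is all active, and to an
// equivalent masked.load with a zero passthru otherwise.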
static std::optional<Instruction *>
instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Value *Pred = II.getOperand(0);
Value *PtrOp = II.getOperand(1);
Type *VecTy = II.getType();
if (isAllActivePredicate(Pred)) {
LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
Load->copyMetadata(II);
return IC.replaceInstUsesWith(II, Load);
}
CallInst *MaskedLoad =
IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
Pred, ConstantAggregateZero::get(VecTy));
MaskedLoad->copyMetadata(II);
return IC.replaceInstUsesWith(II, MaskedLoad);
}
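// Likewise for sve.st1: a plain store when the predicate is all active,
// otherwise an equivalent masked.store.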
static std::optional<Instruction *>
instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
Value *VecOp = II.getOperand(0);
Value *Pred = II.getOperand(1);
Value *PtrOp = II.getOperand(2);
if (isAllActivePredicate(Pred)) {
StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
Store->copyMetadata(II);
return IC.eraseInstFromFunction(II);
}
CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
MaskedStore->copyMetadata(II);
return IC.eraseInstFromFunction(II);
}
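// Map the unpredicated (_u) SVE floating-point binop intrinsics to their IR
// opcodes; anything else maps to BinaryOpsEnd.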
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
switch (Intrinsic) {
case Intrinsic::aarch64_sve_fmul_u:
return Instruction::BinaryOps::FMul;
case Intrinsic::aarch64_sve_fadd_u:
return Instruction::BinaryOps::FAdd;
case Intrinsic::aarch64_sve_fsub_u:
return Instruction::BinaryOps::FSub;
default:
return Instruction::BinaryOpsEnd;
}
}
static std::optional<Instruction *>
instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
// Bail due to missing support for ISD::STRICT_ scalable vector operations.
if (II.isStrictFP())
return std::nullopt;
auto *OpPredicate = II.getOperand(0);
auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
if (BinOpCode == Instruction::BinaryOpsEnd ||
!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
m_ConstantInt<AArch64SVEPredPattern::all>())))
return std::nullopt;
IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
IC.Builder.setFastMathFlags(II.getFastMathFlags());
auto BinOp =
IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
return IC.replaceInstUsesWith(II, BinOp);
}
// Canonicalise operations that take an all active predicate (e.g. sve.add ->
// sve.add_u).
static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
Intrinsic::ID IID) {
auto *OpPredicate = II.getOperand(0);
if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
m_ConstantInt<AArch64SVEPredPattern::all>())))
return std::nullopt;
auto *Mod = II.getModule();
auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
II.setCalledFunction(NewDecl);
return &II;
}
static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
IntrinsicInst &II) {
if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_add_u))
return II_U;
if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
Intrinsic::aarch64_sve_mla>(
IC, II, true))
return MLA;
if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
Intrinsic::aarch64_sve_mad>(
IC, II, false))
return MAD;
return std::nullopt;
}
static std::optional<Instruction *>
instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fadd_u))
return II_U;
if (auto FMLA =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fmla>(IC, II,
true))
return FMLA;
if (auto FMAD =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fmad>(IC, II,
false))
return FMAD;
if (auto FMLA =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
Intrinsic::aarch64_sve_fmla>(IC, II,
true))
return FMLA;
return std::nullopt;
}
static std::optional<Instruction *>
instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
if (auto FMLA =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fmla>(IC, II,
true))
return FMLA;
if (auto FMAD =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fmad>(IC, II,
false))
return FMAD;
if (auto FMLA_U =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
Intrinsic::aarch64_sve_fmla_u>(
IC, II, true))
return FMLA_U;
return instCombineSVEVectorBinOp(IC, II);
}
static std::optional<Instruction *>
instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fsub_u))
return II_U;
if (auto FMLS =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fmls>(IC, II,
true))
return FMLS;
if (auto FMSB =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fnmsb>(
IC, II, false))
return FMSB;
if (auto FMLS =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
Intrinsic::aarch64_sve_fmls>(IC, II,
true))
return FMLS;
return std::nullopt;
}
static std::optional<Instruction *>
instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
if (auto FMLS =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fmls>(IC, II,
true))
return FMLS;
if (auto FMSB =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
Intrinsic::aarch64_sve_fnmsb>(
IC, II, false))
return FMSB;
if (auto FMLS_U =
instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
Intrinsic::aarch64_sve_fmls_u>(
IC, II, true))
return FMLS_U;
return instCombineSVEVectorBinOp(IC, II);
}
static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
IntrinsicInst &II) {
if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sub_u))
return II_U;
if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
Intrinsic::aarch64_sve_mls>(
IC, II, true))
return MLS;
return std::nullopt;
}
static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
IntrinsicInst &II,
Intrinsic::ID IID) {
auto *OpPredicate = II.getOperand(0);
auto *OpMultiplicand = II.getOperand(1);
auto *OpMultiplier = II.getOperand(2);
// Canonicalise a non _u intrinsic only.
if (II.getIntrinsicID() != IID)
if (auto II_U = instCombineSVEAllActive(II, IID))
return II_U;
// Return true if a given instruction is a unit splat value, false otherwise.
auto IsUnitSplat = [](auto *I) {
auto *SplatValue = getSplatValue(I);
if (!SplatValue)
return false;
return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
};
// Return true if a given instruction is an aarch64_sve_dup intrinsic call
// with a unit splat value, false otherwise.
auto IsUnitDup = [](auto *I) {
auto *IntrI = dyn_cast<IntrinsicInst>(I);
if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
return false;
auto *SplatValue = IntrI->getOperand(2);
return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
};
if (IsUnitSplat(OpMultiplier)) {
// [f]mul pg %n, (dupx 1) => %n
OpMultiplicand->takeName(&II);
return IC.replaceInstUsesWith(II, OpMultiplicand);
} else if (IsUnitDup(OpMultiplier)) {
// [f]mul pg %n, (dup pg 1) => %n
auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
auto *DupPg = DupInst->getOperand(1);
// TODO: this is naive. The optimization is still valid if DupPg
// 'encompasses' OpPredicate, not only if they're the same predicate.
if (OpPredicate == DupPg) {
OpMultiplicand->takeName(&II);
return IC.replaceInstUsesWith(II, OpMultiplicand);
}
}
return instCombineSVEVectorBinOp(IC, II);
}
static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
IntrinsicInst &II) {
Value *UnpackArg = II.getArgOperand(0);
auto *RetTy = cast<ScalableVectorType>(II.getType());
bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
// Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
// Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
if (auto *ScalarArg = getSplatValue(UnpackArg)) {
ScalarArg =
IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
Value *NewVal =
IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
NewVal->takeName(&II);
return IC.replaceInstUsesWith(II, NewVal);
}
return std::nullopt;
}
static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
IntrinsicInst &II) {
auto *OpVal = II.getOperand(0);
auto *OpIndices = II.getOperand(1);
VectorType *VTy = cast<VectorType>(II.getType());
// Check whether OpIndices is a constant splat value < minimal element count
// of result.
auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
if (!SplatValue ||
SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
return std::nullopt;
// Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to
// splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
auto *VectorSplat =
IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
VectorSplat->takeName(&II);
return IC.replaceInstUsesWith(II, VectorSplat);
}
static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
IntrinsicInst &II) {
// zip1(uzp1(A, B), uzp2(A, B)) --> A
// zip2(uzp1(A, B), uzp2(A, B)) --> B
Value *A, *B;
if (match(II.getArgOperand(0),
m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
m_Specific(A), m_Specific(B))))
return IC.replaceInstUsesWith(
II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
return std::nullopt;
}
static std::optional<Instruction *>
instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
Value *Mask = II.getOperand(0);
Value *BasePtr = II.getOperand(1);
Value *Index = II.getOperand(2);
Type *Ty = II.getType();
Value *PassThru = ConstantAggregateZero::get(Ty);
// Contiguous gather => masked load.
// (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
// => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
Value *IndexBase;
if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
m_Value(IndexBase), m_SpecificInt(1)))) {
Align Alignment =
BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
Type *VecPtrTy = PointerType::getUnqual(Ty);
Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
BasePtr, IndexBase);
Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
CallInst *MaskedLoad =
IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
MaskedLoad->takeName(&II);
return IC.replaceInstUsesWith(II, MaskedLoad);
}
return std::nullopt;
}
static std::optional<Instruction *>
instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
Value *Val = II.getOperand(0);
Value *Mask = II.getOperand(1);
Value *BasePtr = II.getOperand(2);
Value *Index = II.getOperand(3);
Type *Ty = Val->getType();
// Contiguous scatter => masked store.
// (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
// => (masked.store Value (gep BasePtr IndexBase) Align Mask)
Value *IndexBase;
if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
m_Value(IndexBase), m_SpecificInt(1)))) {
Align Alignment =
BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
BasePtr, IndexBase);
Type *VecPtrTy = PointerType::getUnqual(Ty);
Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
(void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
return IC.eraseInstFromFunction(II);
}
return std::nullopt;
}
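// A signed divide by a splatted (possibly negated) power of two can use the
// ASRD instruction: sdiv(pg, x, splat(2^k)) --> asrd(pg, x, k), with an
// additional predicated negate for negative divisors.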
static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
IntrinsicInst &II) {
Type *Int32Ty = IC.Builder.getInt32Ty();
Value *Pred = II.getOperand(0);
Value *Vec = II.getOperand(1);
Value *DivVec = II.getOperand(2);
Value *SplatValue = getSplatValue(DivVec);
ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
if (!SplatConstantInt)
return std::nullopt;
APInt Divisor = SplatConstantInt->getValue();
if (Divisor.isPowerOf2()) {
Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
auto ASRD = IC.Builder.CreateIntrinsic(
Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
return IC.replaceInstUsesWith(II, ASRD);
}
if (Divisor.isNegatedPowerOf2()) {
Divisor.negate();
Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
auto ASRD = IC.Builder.CreateIntrinsic(
Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
auto NEG = IC.Builder.CreateIntrinsic(
Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
return IC.replaceInstUsesWith(II, NEG);
}
return std::nullopt;
}
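// Collapse a repeated value pattern to the values that actually repeat, e.g.
// (a, b, a, b) --> (a, b), shrinking Vec in place. Null entries stand for
// poison lanes and may be filled from their counterpart when AllowPoison is
// set. Returns true on success.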
bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
size_t VecSize = Vec.size();
if (VecSize == 1)
return true;
if (!isPowerOf2_64(VecSize))
return false;
size_t HalfVecSize = VecSize / 2;
for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
RHS != Vec.end(); LHS++, RHS++) {
if (*LHS != nullptr && *RHS != nullptr) {
if (*LHS == *RHS)
continue;
else
return false;
}
if (!AllowPoison)
return false;
if (*LHS == nullptr && *RHS != nullptr)
*LHS = *RHS;
}
Vec.resize(HalfVecSize);
SimplifyValuePattern(Vec, AllowPoison);
return true;
}
// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
// to dupqlane(f64(C)) where C is A concatenated with B
static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
IntrinsicInst &II) {
Value *CurrentInsertElt = nullptr, *Default = nullptr;
if (!match(II.getOperand(0),
m_Intrinsic<Intrinsic::vector_insert>(
m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
!isa<FixedVectorType>(CurrentInsertElt->getType()))
return std::nullopt;
auto IIScalableTy = cast<ScalableVectorType>(II.getType());
// Insert the scalars into a container ordered by InsertElement index
SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
CurrentInsertElt = InsertElt->getOperand(0);
}
bool AllowPoison =
isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
if (!SimplifyValuePattern(Elts, AllowPoison))
return std::nullopt;
// Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
for (size_t I = 0; I < Elts.size(); I++) {
if (Elts[I] == nullptr)
continue;
InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
IC.Builder.getInt64(I));
}
if (InsertEltChain == nullptr)
return std::nullopt;
// Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
// value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
// be bitcast to a type wide enough to fit the sequence, be splatted, and then
// be narrowed back to the original type.
unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
IIScalableTy->getMinNumElements() /
PatternWidth;
IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
auto *WideShuffleMaskTy =
ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
auto InsertSubvector = IC.Builder.CreateInsertVector(
II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
auto WideBitcast =
IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
auto WideShuffle = IC.Builder.CreateShuffleVector(
WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
auto NarrowBitcast =
IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
return IC.replaceInstUsesWith(II, NarrowBitcast);
}
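// fmaxnm/fminnm with two identical operands simply returns that operand, so
// fold the call to its first argument.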
static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
IntrinsicInst &II) {
Value *A = II.getArgOperand(0);
Value *B = II.getArgOperand(1);
if (A == B)
return IC.replaceInstUsesWith(II, A);
return std::nullopt;
}
static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
IntrinsicInst &II) {
Value *Pred = II.getOperand(0);
Value *Vec = II.getOperand(1);
Value *Shift = II.getOperand(2);
// Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
Value *AbsPred, *MergedValue;
if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
m_Value(MergedValue), m_Value(AbsPred), m_Value())))
return std::nullopt;
// Transform is valid if any of the following are true:
// * The ABS merge value is an undef or non-negative
// * The ABS predicate is all active
// * The ABS predicate and the SRSHL predicates are the same
if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
AbsPred != Pred && !isAllActivePredicate(AbsPred))
return std::nullopt;
// Only valid when the shift amount is non-negative, otherwise the rounding
// behaviour of SRSHL cannot be ignored.
if (!match(Shift, m_NonNegative()))
return std::nullopt;
auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
{II.getType()}, {Pred, Vec, Shift});
return IC.replaceInstUsesWith(II, LSL);
}
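// Dispatch AArch64-specific InstCombine folds based on the intrinsic ID;
// returns std::nullopt when no fold applies and the intrinsic is left as-is.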
std::optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
switch (IID) {
default:
break;
case Intrinsic::aarch64_neon_fmaxnm:
case Intrinsic::aarch64_neon_fminnm:
return instCombineMaxMinNM(IC, II);
case Intrinsic::aarch64_sve_convert_from_svbool:
return instCombineConvertFromSVBool(IC, II);
case Intrinsic::aarch64_sve_dup:
return instCombineSVEDup(IC, II);
case Intrinsic::aarch64_sve_dup_x:
return instCombineSVEDupX(IC, II);
case Intrinsic::aarch64_sve_cmpne:
case Intrinsic::aarch64_sve_cmpne_wide:
return instCombineSVECmpNE(IC, II);
case Intrinsic::aarch64_sve_rdffr:
return instCombineRDFFR(IC, II);
case Intrinsic::aarch64_sve_lasta:
case Intrinsic::aarch64_sve_lastb:
return instCombineSVELast(IC, II);
case Intrinsic::aarch64_sve_clasta_n:
case Intrinsic::aarch64_sve_clastb_n:
return instCombineSVECondLast(IC, II);
case Intrinsic::aarch64_sve_cntd:
return instCombineSVECntElts(IC, II, 2);
case Intrinsic::aarch64_sve_cntw:
return instCombineSVECntElts(IC, II, 4);
case Intrinsic::aarch64_sve_cnth:
return instCombineSVECntElts(IC, II, 8);
case Intrinsic::aarch64_sve_cntb:
return instCombineSVECntElts(IC, II, 16);
case Intrinsic::aarch64_sve_ptest_any:
case Intrinsic::aarch64_sve_ptest_first:
case Intrinsic::aarch64_sve_ptest_last:
return instCombineSVEPTest(IC, II);
case Intrinsic::aarch64_sve_fabd:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fabd_u);
case Intrinsic::aarch64_sve_fadd:
return instCombineSVEVectorFAdd(IC, II);
case Intrinsic::aarch64_sve_fadd_u:
return instCombineSVEVectorFAddU(IC, II);
case Intrinsic::aarch64_sve_fdiv:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fdiv_u);
case Intrinsic::aarch64_sve_fmax:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmax_u);
case Intrinsic::aarch64_sve_fmaxnm:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmaxnm_u);
case Intrinsic::aarch64_sve_fmin:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmin_u);
case Intrinsic::aarch64_sve_fminnm:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fminnm_u);
case Intrinsic::aarch64_sve_fmla:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmla_u);
case Intrinsic::aarch64_sve_fmls:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmls_u);
case Intrinsic::aarch64_sve_fmul:
case Intrinsic::aarch64_sve_fmul_u:
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
case Intrinsic::aarch64_sve_fmulx:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmulx_u);
case Intrinsic::aarch64_sve_fnmla:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmla_u);
case Intrinsic::aarch64_sve_fnmls:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmls_u);
case Intrinsic::aarch64_sve_fsub:
return instCombineSVEVectorFSub(IC, II);
case Intrinsic::aarch64_sve_fsub_u:
return instCombineSVEVectorFSubU(IC, II);
case Intrinsic::aarch64_sve_add:
return instCombineSVEVectorAdd(IC, II);
case Intrinsic::aarch64_sve_add_u:
return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
Intrinsic::aarch64_sve_mla_u>(
IC, II, true);
case Intrinsic::aarch64_sve_mla:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mla_u);
case Intrinsic::aarch64_sve_mls:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mls_u);
case Intrinsic::aarch64_sve_mul:
case Intrinsic::aarch64_sve_mul_u:
return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
case Intrinsic::aarch64_sve_sabd:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sabd_u);
case Intrinsic::aarch64_sve_smax:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smax_u);
case Intrinsic::aarch64_sve_smin:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smin_u);
case Intrinsic::aarch64_sve_smulh:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smulh_u);
case Intrinsic::aarch64_sve_sub:
return instCombineSVEVectorSub(IC, II);
case Intrinsic::aarch64_sve_sub_u:
return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
Intrinsic::aarch64_sve_mls_u>(
IC, II, true);
case Intrinsic::aarch64_sve_uabd:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uabd_u);
case Intrinsic::aarch64_sve_umax:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umax_u);
case Intrinsic::aarch64_sve_umin:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umin_u);
case Intrinsic::aarch64_sve_umulh:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umulh_u);
case Intrinsic::aarch64_sve_asr:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_asr_u);
case Intrinsic::aarch64_sve_lsl:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsl_u);
case Intrinsic::aarch64_sve_lsr:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsr_u);
case Intrinsic::aarch64_sve_and:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_and_u);
case Intrinsic::aarch64_sve_bic:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_bic_u);
case Intrinsic::aarch64_sve_eor:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_eor_u);
case Intrinsic::aarch64_sve_orr:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_orr_u);
case Intrinsic::aarch64_sve_sqsub:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sqsub_u);
case Intrinsic::aarch64_sve_uqsub:
return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uqsub_u);
case Intrinsic::aarch64_sve_tbl:
return instCombineSVETBL(IC, II);
case Intrinsic::aarch64_sve_uunpkhi:
case Intrinsic::aarch64_sve_uunpklo:
case Intrinsic::aarch64_sve_sunpkhi:
case Intrinsic::aarch64_sve_sunpklo:
return instCombineSVEUnpack(IC, II);
case Intrinsic::aarch64_sve_zip1:
case Intrinsic::aarch64_sve_zip2:
return instCombineSVEZip(IC, II);
case Intrinsic::aarch64_sve_ld1_gather_index:
return instCombineLD1GatherIndex(IC, II);
case Intrinsic::aarch64_sve_st1_scatter_index:
return instCombineST1ScatterIndex(IC, II);
case Intrinsic::aarch64_sve_ld1:
return instCombineSVELD1(IC, II, DL);
case Intrinsic::aarch64_sve_st1:
return instCombineSVEST1(IC, II, DL);
case Intrinsic::aarch64_sve_sdiv:
return instCombineSVESDIV(IC, II);
case Intrinsic::aarch64_sve_sel:
return instCombineSVESel(IC, II);
case Intrinsic::aarch64_sve_srshl:
return instCombineSVESrshl(IC, II);
case Intrinsic::aarch64_sve_dupq_lane:
return instCombineSVEDupqLane(IC, II);
}
return std::nullopt;
}
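// For the NEON narrowing intrinsics handled below, each result lane is
// computed solely from the corresponding lane of operand 0, so the demanded
// elements of the result are forwarded directly to that operand.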
std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const {
switch (II.getIntrinsicID()) {
default:
break;
case Intrinsic::aarch64_neon_fcvtxn:
case Intrinsic::aarch64_neon_rshrn:
case Intrinsic::aarch64_neon_sqrshrn:
case Intrinsic::aarch64_neon_sqrshrun:
case Intrinsic::aarch64_neon_sqshrn:
case Intrinsic::aarch64_neon_sqshrun:
case Intrinsic::aarch64_neon_sqxtn:
case Intrinsic::aarch64_neon_sqxtun:
case Intrinsic::aarch64_neon_uqrshrn:
case Intrinsic::aarch64_neon_uqshrn:
case Intrinsic::aarch64_neon_uqxtn:
SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
break;
}
return std::nullopt;
}
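// Scalar registers are 64 bits wide. Fixed-width vector registers are 128
// bits (or the configured minimum SVE vector length, if larger), and
// scalable vectors are reported in 128-bit granules, subject to the
// streaming-mode checks below.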
TypeSize
AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
switch (K) {
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(64);
case TargetTransformInfo::RGK_FixedWidthVector:
if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode)
return TypeSize::getFixed(0);
if (ST->hasSVE())
return TypeSize::getFixed(
std::max(ST->getMinSVEVectorSizeInBits(), 128u));
return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
case TargetTransformInfo::RGK_ScalableVector:
if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode)
return TypeSize::getScalable(0);
return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
}
llvm_unreachable("Unsupported register kind");
}
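// Return true if an add/sub/mul with the given destination type and operands
// can be lowered to a NEON widening instruction (e.g. uaddl, usubw, smull),
// in which case the feeding extend is considered free.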
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
ArrayRef<const Value *> Args,
Type *SrcOverrideTy) {
// A helper that returns a vector type whose element type is the scalar type
// of ArgTy and whose element count matches that of DstTy.
auto toVectorTy = [&](Type *ArgTy) {
return VectorType::get(ArgTy->getScalarType(),
cast<VectorType>(DstTy)->getElementCount());
};
// Exit early if DstTy is not a vector type whose elements are one of [i16,
// i32, i64]. SVE doesn't generally have the same set of instructions to
// perform an extend with the add/sub/mul. There are SMULLB style
// instructions, but they operate on top/bottom, requiring some sort of lane
// interleaving to be used with zext/sext.
unsigned DstEltSize = DstTy->getScalarSizeInBits();
if (!useNeonVector(DstTy) || Args.size() != 2 ||
(DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
return false;
// Determine if the operation has a widening variant. We consider both the
// "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
// instructions.
//
// TODO: Add additional widening operations (e.g., shl, etc.) once we
// verify that their extending operands are eliminated during code
// generation.
Type *SrcTy = SrcOverrideTy;
switch (Opcode) {
case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
// The second operand needs to be an extend
if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
if (!SrcTy)
SrcTy =
toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
} else
return false;
break;
case Instruction::Mul: { // SMULL(2), UMULL(2)
// Both operands need to be extends of the same type.
if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
(isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
if (!SrcTy)
SrcTy =
toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
} else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
// If one of the operands is a Zext and the other has enough zero bits to
// be treated as unsigned, we can still generate a umull, meaning the zext
// is free.
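// That is, the possibly non-zero bits of the non-extended operand must fit
// in the low half of the destination element type.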
KnownBits Known =
computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
if (Args[0]->getType()->getScalarSizeInBits() -
Known.Zero.countLeadingOnes() >
DstTy->getScalarSizeInBits() / 2)
return false;
if (!SrcTy)
SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
DstTy->getScalarSizeInBits() / 2));
} else
return false;
break;
}
default:
return false;
}
// Legalize the destination type and ensure it can be used in a widening
// operation.
auto DstTyL = getTypeLegalizationCost(DstTy);
if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
return false;
// Legalize the source type and ensure it can be used in a widening
// operation.
assert(SrcTy && "Expected some SrcTy");
auto SrcTyL = getTypeLegalizationCost(SrcTy);
unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
return false;
// Get the total number of vector elements in the legalized types.
InstructionCost NumDstEls =
DstTyL.first * DstTyL.second.getVectorMinNumElements();
InstructionCost NumSrcEls =
SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
// Return true if the legalized types have the same number of vector elements
// and the destination element type size is twice that of the source type.
return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
}
// s/urhadd instructions implement the following pattern, making the
// extends free:
// %x = add ((zext i8 -> i16), 1)
// %y = (zext i8 -> i16)
// trunc i16 (lshr (add %x, %y), 1) -> i8
//
bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
Type *Src) {
// The source should be a legal vector type.
if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
(Src->isScalableTy() && !ST->hasSVE2()))
return false;
if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
return false;
// Look for the add -> lshr -> trunc use chain before trying to match the
// whole pattern.
const Instruction *Add = ExtUser;
auto *AddUser =
dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
if (AddUser && AddUser->getOpcode() == Instruction::Add)
Add = AddUser;
auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
if (!Shr || Shr->getOpcode() != Instruction::LShr)
return false;
auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
Src->getScalarSizeInBits() !=
cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
return false;
// Try to match the whole pattern. Ext could be either the first or second
// m_ZExtOrSExt matched.
Instruction *Ex1, *Ex2;
if (!(match(Add, m_c_Add(m_Instruction(Ex1),
m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
return false;
// Ensure both extends are of the same type
if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
Ex1->getOpcode() == Ex2->getOpcode())
return true;
return false;
}
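// Cost model for casts: extends feeding widening instructions or the
// s/urhadd pattern above are free; other conversions are looked up in the
// table below, falling back to the generic cost otherwise.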
InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
Type *Src,
TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
// If the cast is observable, and it is used by a widening instruction (e.g.,
// uaddl, saddw, etc.), it may be free.
if (I && I->hasOneUser()) {
auto *SingleUser = cast<Instruction>(*I->user_begin());
SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
// For adds, if the two operands are extends of different kinds, only count
// the second operand as free (i.e. both operands are not free in
// add(sext, zext)); if both extends are of the same kind, either is free.
if (SingleUser->getOpcode() == Instruction::Add) {
if (I == SingleUser->getOperand(1) ||
(isa<CastInst>(SingleUser->getOperand(1)) &&
cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
return 0;
} else // Others are free so long as isWideningInstruction returned true.
return 0;
}
// The cast will be free for the s/urhadd instructions
if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
isExtPartOfAvgExpr(SingleUser, Dst, Src))
return 0;
}
// TODO: Allow non-throughput costs that aren't binary.
auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
if (CostKind != TTI::TCK_RecipThroughput)
return Cost == 0 ? 0 : 1;
return Cost;
};
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
return AdjustCost(
BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
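// Each entry below is {ISD opcode, destination type, source type, cost}.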
static const TypeConversionCostTblEntry
ConversionTbl[] = {
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
{ ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
{ ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
{ ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
// Truncations on nxvmiN
{ ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
{ ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
{ ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
{ ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
{ ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
{ ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
{ ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
{ ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
{ ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
{ ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
{ ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
{ ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
{ ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
{ ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
{ ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
{ ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
// The number of shll instructions for the extension.
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
// LowerVectorINT_TO_FP:
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
// Complex: to v2f32
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
// Complex: to v4f32
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
// Complex: to v8f32
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
// Complex: to v16f32
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
// Complex: to v2f64
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
// Complex: to v4f64
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
// LowerVectorFP_TO_INT
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
// Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
// Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
{ ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
{ ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
{ ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
// Complex, from nxv2f32.
{ ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
{ ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
{ ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
{ ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
// Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
{ ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
// Complex, from nxv2f64.
{ ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
{ ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
{ ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
{ ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
// Complex, from nxv4f32.
{ ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
{ ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
{ ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
{ ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
{ ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
{ ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
{ ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
{ ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
// Complex, from nxv8f64. Illegal -> illegal conversions not required.
{ ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
{ ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
{ ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
{ ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
// Complex, from nxv4f64. Illegal -> illegal conversions not required.
{ ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
{ ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
{ ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
{ ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
{ ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
{ ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
// Complex, from nxv8f32. Illegal -> illegal conversions not required.
{ ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
{ ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
{ ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
{ ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
// Complex, from nxv8f16.
{ ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
{ ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
{ ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
{ ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
{ ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
{ ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
{ ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
{ ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
// Complex, from nxv4f16.
{ ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
{ ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
{ ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
{ ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
{ ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
{ ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
{ ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
{ ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
// Complex, from nxv2f16.
{ ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
{ ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
{ ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
{ ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
{ ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
// Truncate from nxvmf32 to nxvmf16.
{ ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
{ ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
{ ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
// Truncate from nxvmf64 to nxvmf16.
{ ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
{ ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
{ ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
// Truncate from nxvmf64 to nxvmf32.
{ ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
{ ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
{ ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
// Extend from nxvmf16 to nxvmf32.
{ ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
{ ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
{ ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
// Extend from nxvmf16 to nxvmf64.
{ ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
{ ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
{ ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
// Extend from nxvmf32 to nxvmf64.
{ ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
{ ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
{ ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
// Bitcasts from integer to float
{ ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
{ ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
{ ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
// Bitcasts from float to integer
{ ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
{ ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
{ ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
// Add cost for extending to illegal (too wide) scalable vectors.
// Zero/sign extends are implemented by multiple unpack operations,
// where each operation has a cost of 1.
{ ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},