// blob: c7716020da13c5b2835935bd4405f5a79ce0a2c0 [file] [log] [blame] [edit]
//===- AArch64TargetTransformInfo.h - AArch64 specific TTI ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file a TargetTransformInfoImplBase conforming object specific to the
/// AArch64 target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H
#include "AArch64.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/FMF.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/InstructionCost.h"
#include <cstdint>
#include <optional>
namespace llvm {
class APInt;
class Instruction;
class IntrinsicInst;
class Loop;
class SCEV;
class ScalarEvolution;
class Type;
class Value;
class VectorType;
/// AArch64-specific implementation of the TargetTransformInfo interface.
/// Provides cost-model answers and legality queries tuned for the AArch64
/// target (NEON, SVE, streaming SVE); queries it does not specialize fall
/// through to BasicTTIImplBase.
class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
  using BaseT = BasicTTIImplBase<AArch64TTIImpl>;
  using TTI = TargetTransformInfo;

  // The CRTP base class calls getST()/getTLI() below, so it needs access to
  // our private members.
  friend BaseT;

  const AArch64Subtarget *ST;
  const AArch64TargetLowering *TLI;

  // NOTE(review): defined in the corresponding .cpp; presumably the set of
  // subtarget features whose handling is inverted when checking inline
  // compatibility in areInlineCompatible() — confirm against the definition.
  static const FeatureBitset InlineInverseFeatures;

  // Accessors used by BasicTTIImplBase (CRTP).
  const AArch64Subtarget *getST() const { return ST; }
  const AArch64TargetLowering *getTLI() const { return TLI; }

  // Interleave factor of a structured vector load/store memory intrinsic
  // (two-, three- or four-element variants).
  enum MemIntrinsicType {
    VECTOR_LDST_TWO_ELEMENTS,
    VECTOR_LDST_THREE_ELEMENTS,
    VECTOR_LDST_FOUR_ELEMENTS
  };

  /// Given an add/sub/mul operation, detect a widening addl/subl/mull pattern
  /// where both operands can be treated like extends. Returns the minimal type
  /// needed to compute the operation.
  Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
                                    ArrayRef<const Value *> Args,
                                    Type *SrcOverrideTy = nullptr) const;

  /// Given an add/sub operation with a single extend operand, detect a
  /// widening addw/subw pattern.
  bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy,
                                      ArrayRef<const Value *> Args,
                                      Type *SrcOverrideTy = nullptr) const;

  /// A helper function called by 'getVectorInstrCost'.
  ///
  /// 'Val' and 'Index' are forwarded from 'getVectorInstrCost';
  /// \param ScalarUserAndIdx encodes the information about extracts from a
  /// vector with 'Scalar' being the value being extracted,'User' being the user
  /// of the extract(nullptr if user is not known before vectorization) and
  /// 'Idx' being the extract lane.
  InstructionCost getVectorInstrCostHelper(
      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
      const Instruction *I = nullptr, Value *Scalar = nullptr,
      ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {},
      TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None) const;

public:
  explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
      : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
        TLI(ST->getTargetLowering()) {}

  /// Inlining compatibility and cost queries.
  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

  bool areTypesABICompatible(const Function *Caller, const Function *Callee,
                             ArrayRef<Type *> Types) const override;

  unsigned getInlineCallPenalty(const Function *F, const CallBase &Call,
                                unsigned DefaultCallPenalty) const override;

  /// Function-multiversioning support (target_version / FMV) queries.
  APInt getFeatureMask(const Function &F) const override;

  APInt getPriorityMask(const Function &F) const override;

  bool isMultiversionedFunction(const Function &F) const override;

  /// \name Scalar TTI Implementations
  /// @{

  using BaseT::getIntImmCost;
  InstructionCost getIntImmCost(int64_t Val) const;
  InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
                                TTI::TargetCostKind CostKind) const override;
  InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind,
                                    Instruction *Inst = nullptr) const override;
  InstructionCost
  getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                      Type *Ty, TTI::TargetCostKind CostKind) const override;
  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  bool enableInterleavedAccessVectorization() const override { return true; }

  // Masked interleaved accesses require SVE predication.
  bool enableMaskedInterleavedAccessVectorization() const override {
    return ST->hasSVE();
  }

  unsigned getNumberOfRegisters(unsigned ClassID) const override {
    // ClassID == 1 selects the vector register class.
    bool Vector = (ClassID == 1);
    if (Vector) {
      // 32 vector registers when NEON is available, otherwise none.
      if (ST->hasNEON())
        return 32;
      return 0;
    }
    // 31 scalar registers (AArch64 general-purpose registers X0-X30).
    return 31;
  }

  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;

  /// InstCombine hooks for AArch64 intrinsics.
  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;

  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override;

  unsigned getMinVectorRegisterBitWidth() const override {
    return ST->getMinVectorRegisterBitWidth();
  }

  std::optional<unsigned> getVScaleForTuning() const override {
    return ST->getVScaleForTuning();
  }

  bool shouldMaximizeVectorBandwidth(
      TargetTransformInfo::RegisterKind K) const override;

  /// Try to return an estimate cost factor that can be used as a multiplier
  /// when scalarizing an operation for a vector with ElementCount \p VF.
  /// For scalable vectors this currently takes the most pessimistic view based
  /// upon the maximum possible value for vscale.
  unsigned getMaxNumElements(ElementCount VF) const {
    if (!VF.isScalable())
      return VF.getFixedValue();
    return VF.getKnownMinValue() * ST->getVScaleForTuning();
  }

  unsigned getMaxInterleaveFactor(ElementCount VF) const override;

  bool prefersVectorizedAddressing() const override;

  /// Check whether Opcode1 has less throughput according to the scheduling
  /// model than Opcode2.
  bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1,
                                                  unsigned Opcode2) const;

  /// Memory-intrinsic cost queries (plain, masked, and gather/scatter forms).
  InstructionCost
  getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
                           TTI::TargetCostKind CostKind) const override;

  InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
                                        TTI::TargetCostKind CostKind) const;

  InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
                                         TTI::TargetCostKind CostKind) const;

  /// Returns whether \p ExtUser's extend feeds an averaging (avg) expression.
  bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
                          Type *Src) const;

  InstructionCost
  getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                   TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
                   const Instruction *I = nullptr) const override;

  InstructionCost
  getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                           unsigned Index,
                           TTI::TargetCostKind CostKind) const override;

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  InstructionCost
  getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
                     unsigned Index, const Value *Op0, const Value *Op1,
                     TTI::VectorInstrContext VIC =
                         TTI::VectorInstrContext::None) const override;

  /// \param ScalarUserAndIdx encodes the information about extracts from a
  /// vector with 'Scalar' being the value being extracted,'User' being the user
  /// of the extract(nullptr if user is not known before vectorization) and
  /// 'Idx' being the extract lane.
  InstructionCost getVectorInstrCost(
      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
      Value *Scalar,
      ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
      TTI::VectorInstrContext VIC =
          TTI::VectorInstrContext::None) const override;

  InstructionCost
  getVectorInstrCost(const Instruction &I, Type *Val,
                     TTI::TargetCostKind CostKind, unsigned Index,
                     TTI::VectorInstrContext VIC =
                         TTI::VectorInstrContext::None) const override;

  InstructionCost
  getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val,
                                   TTI::TargetCostKind CostKind,
                                   unsigned Index) const override;

  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  /// SVE-specific arithmetic-reduction cost, used when the reduction is
  /// lowered with SVE instructions.
  InstructionCost
  getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy,
                                TTI::TargetCostKind CostKind) const;

  InstructionCost getSpliceCost(VectorType *Tp, int Index,
                                TTI::TargetCostKind CostKind) const;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost
  getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr,
                            TTI::TargetCostKind CostKind) const override;

  InstructionCost getCmpSelInstrCost(
      unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  TTI::MemCmpExpansionOptions
  enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;

  /// Returns whether \p Ty should be handled as a NEON vector type.
  bool useNeonVector(const Type *Ty) const;

  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  InstructionCost
  getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const override;

  /// Loop-transform tuning hooks.
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;

  Value *
  getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType,
                                    bool CanCreate = true) const override;

  bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                          MemIntrinsicInfo &Info) const override;

  /// Element types supported in scalable vectors: pointers, bf16 (when the
  /// subtarget has BF16), f16/f32/f64, and i1/i8/i16/i32/i64.
  bool isElementTypeLegalForScalableVector(Type *Ty) const override {
    if (Ty->isPointerTy())
      return true;

    if (Ty->isBFloatTy() && ST->hasBF16())
      return true;

    if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
      return true;

    if (Ty->isIntegerTy(1) || Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
        Ty->isIntegerTy(32) || Ty->isIntegerTy(64))
      return true;

    return false;
  }

  /// Common legality check for masked loads and stores; requires SVE (or
  /// streaming SVE) and a legal scalable-vector element type.
  bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) const {
    if (!ST->isSVEorStreamingSVEAvailable())
      return false;

    // For fixed vectors, avoid scalarization if using SVE for them.
    if (isa<FixedVectorType>(DataType) && !ST->useSVEForFixedLengthVectors() &&
        DataType->getPrimitiveSizeInBits() != 128)
      return false; // Fall back to scalarization of masked operations.

    return isElementTypeLegalForScalableVector(DataType->getScalarType());
  }

  bool isLegalMaskedLoad(Type *DataType, Align Alignment,
                         unsigned /*AddressSpace*/,
                         TTI::MaskKind /*MaskKind*/) const override {
    return isLegalMaskedLoadStore(DataType, Alignment);
  }

  bool isLegalMaskedStore(Type *DataType, Align Alignment,
                          unsigned /*AddressSpace*/,
                          TTI::MaskKind /*MaskKind*/) const override {
    return isLegalMaskedLoadStore(DataType, Alignment);
  }

  // Compress-store supports a narrower element set than general masked ops:
  // only 32- and 64-bit integer/FP elements.
  bool isElementTypeLegalForCompressStore(Type *Ty) const {
    return Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isIntegerTy(32) ||
           Ty->isIntegerTy(64);
  }

  bool isLegalMaskedCompressStore(Type *DataType,
                                  Align Alignment) const override {
    // Note: requires full SVE, not streaming SVE.
    if (!ST->isSVEAvailable())
      return false;

    // Reject fixed vectors narrower than 128 bits.
    if (isa<FixedVectorType>(DataType) &&
        DataType->getPrimitiveSizeInBits() < 128)
      return false;

    return isElementTypeLegalForCompressStore(DataType->getScalarType());
  }

  bool isLegalMaskedGatherScatter(Type *DataType) const {
    // Gathers/scatters require full SVE (not streaming SVE).
    if (!ST->isSVEAvailable())
      return false;

    // For fixed vectors, scalarize if not using SVE for them.
    auto *DataTypeFVTy = dyn_cast<FixedVectorType>(DataType);
    if (DataTypeFVTy && (!ST->useSVEForFixedLengthVectors() ||
                         DataTypeFVTy->getNumElements() < 2))
      return false;

    return isElementTypeLegalForScalableVector(DataType->getScalarType());
  }

  bool isLegalMaskedGather(Type *DataType, Align Alignment) const override {
    return isLegalMaskedGatherScatter(DataType);
  }

  bool isLegalMaskedScatter(Type *DataType, Align Alignment) const override {
    return isLegalMaskedGatherScatter(DataType);
  }

  bool isLegalBroadcastLoad(Type *ElementTy,
                            ElementCount NumElements) const override {
    // Return true if we can generate a `ld1r` splat load instruction.
    if (!ST->hasNEON() || NumElements.isScalable())
      return false;
    switch (unsigned ElementBits = ElementTy->getScalarSizeInBits()) {
    case 8:
    case 16:
    case 32:
    case 64: {
      // We accept bit-widths >= 64bits and elements {8,16,32,64} bits.
      unsigned VectorBits = NumElements.getFixedValue() * ElementBits;
      return VectorBits >= 64;
    }
    }
    return false;
  }

  /// Shared legality logic for non-temporal loads and stores. Returns
  /// std::nullopt when the decision should fall back to the base class.
  std::optional<bool> isLegalNTStoreLoad(Type *DataType,
                                         Align Alignment) const {
    // Currently we only support NT load and store lowering for little-endian
    // targets.
    //
    // Coordinated with LDNP and STNP constraints in
    // `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
    // `AArch64ISelLowering.cpp`
    if (!ST->isLittleEndian())
      return false;

    // NOTE: The logic below is mostly geared towards LV, which calls it with
    //       vectors with 2 elements. We might want to improve that, if other
    //       users show up.
    // Nontemporal vector loads/stores can be directly lowered to LDNP/STNP, if
    // the vector can be halved so that each half fits into a register. That's
    // the case if the element type fits into a register and the number of
    // elements is a power of 2 > 1.
    if (auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType)) {
      unsigned NumElements = DataTypeTy->getNumElements();
      unsigned EltSize = DataTypeTy->getElementType()->getScalarSizeInBits();
      return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 &&
             EltSize <= 128 && isPowerOf2_64(EltSize);
    }
    return std::nullopt;
  }

  bool isLegalNTStore(Type *DataType, Align Alignment) const override {
    if (auto Result = isLegalNTStoreLoad(DataType, Alignment))
      return *Result;

    // Fallback to target independent logic
    return BaseT::isLegalNTStore(DataType, Alignment);
  }

  bool isLegalNTLoad(Type *DataType, Align Alignment) const override {
    if (auto Result = isLegalNTStoreLoad(DataType, Alignment))
      return *Result;

    // Fallback to target independent logic
    return BaseT::isLegalNTLoad(DataType, Alignment);
  }

  InstructionCost getPartialReductionCost(
      unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
      ElementCount VF, TTI::PartialReductionExtendKind OpAExtend,
      TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
      TTI::TargetCostKind CostKind,
      std::optional<FastMathFlags> FMF) const override;

  bool enableOrderedReductions() const override { return true; }

  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) const override;

  bool shouldConsiderAddressTypePromotion(
      const Instruction &I,
      bool &AllowPromotionWithoutCommonHeader) const override;

  bool shouldExpandReduction(const IntrinsicInst *II) const override {
    return false;
  }

  unsigned getGISelRematGlobalCost() const override { return 2; }

  // Only enable the min-trip-count tail-folding threshold when SVE is
  // available (predicated vectorization); otherwise disabled (0).
  unsigned getMinTripCountTailFoldingThreshold() const override {
    return ST->hasSVE() ? 5 : 0;
  }

  TailFoldingStyle getPreferredTailFoldingStyle() const override {
    return ST->hasSVE() ? TailFoldingStyle::DataAndControlFlow
                        : TailFoldingStyle::DataWithoutLaneMask;
  }

  bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override;

  unsigned getEpilogueVectorizationMinVF() const override;

  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override;

  bool supportsScalableVectors() const override {
    return ST->isSVEorStreamingSVEAvailable();
  }

  bool enableScalableVectorization() const override;

  bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
                                   ElementCount VF) const override;

  bool preferPredicatedReductionSelect() const override { return ST->hasSVE(); }

  /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the
  /// architecture features are not present.
  std::optional<InstructionCost> getFP16BF16PromoteCost(
      Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
      TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
      std::function<InstructionCost(Type *)> InstCost) const;

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy,
                           VectorType *ValTy, std::optional<FastMathFlags> FMF,
                           TTI::TargetCostKind CostKind) const override;

  InstructionCost getMulAccReductionCost(
      bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const override;

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  InstructionCost
  getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
                           bool Insert, bool Extract,
                           TTI::TargetCostKind CostKind,
                           bool ForPoisonSrc = true, ArrayRef<Value *> VL = {},
                           TTI::VectorInstrContext VIC =
                               TTI::VectorInstrContext::None) const override;

  /// Return the cost of the scaling factor used in the addressing
  /// mode represented by AM for this target, for a load/store
  /// of the specified type.
  /// If the AM is supported, the return value must be >= 0.
  /// If the AM is not supported, it returns an invalid cost.
  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       StackOffset BaseOffset, bool HasBaseReg,
                                       int64_t Scale,
                                       unsigned AddrSpace) const override;

  bool enableSelectOptimize() const override {
    return ST->enableSelectOptimize();
  }

  bool shouldTreatInstructionLikeSelect(const Instruction *I) const override;

  unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                             Type *ScalarValTy) const override {
    // We can vectorize store v4i8.
    if (ScalarMemTy->isIntegerTy(8) && isPowerOf2_32(VF) && VF >= 4)
      return 4;

    return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
  }

  std::optional<unsigned> getMinPageSize() const override { return 4096; }

  bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                     const TargetTransformInfo::LSRCost &C2) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  bool enableAggressiveInterleaving(bool) const override {
    return ST->enableAggressiveInterleaving();
  }
  /// @}
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AARCH64_AARCH64TARGETTRANSFORMINFO_H