//===- ARMTargetTransformInfo.h - ARM specific TTI --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file defines a TargetTransformInfo::Concept conforming object specific
/// to the ARM target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
///
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
#include "ARM.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Function.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include <optional>
namespace llvm {
class APInt;
class ARMTargetLowering;
class Instruction;
class Loop;
class SCEV;
class ScalarEvolution;
class Type;
class Value;
namespace TailPredication {
enum Mode {
  Disabled = 0,
  EnabledNoReductions,
  Enabled,
  ForceEnabledNoReductions,
  ForceEnabled
};
} // namespace TailPredication

// For controlling conversion of memcpy into Tail Predicated loop.
namespace TPLoop {
enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
} // namespace TPLoop
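// Note (assumption, for context): the TailPredication and TPLoop enumerators
// above back ARM-specific command-line options handled in
// ARMTargetTransformInfo.cpp (e.g. -tail-predication).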
class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
using BaseT = BasicTTIImplBase<ARMTTIImpl>;
using TTI = TargetTransformInfo;
friend BaseT;
const ARMSubtarget *ST;
const ARMTargetLowering *TLI;
// Currently the following features are excluded from InlineFeaturesAllowed.
// ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32
// Depending on whether they are set or unset, different
// instructions/registers are available. For example, inlining a callee with
// -thumb-mode in a caller with +thumb-mode may cause the assembler to
// fail if the callee uses ARM only instructions, e.g. in inline asm.
const FeatureBitset InlineFeaturesAllowed = {
ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2,
ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8,
ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb,
ARM::FeatureHWDivARM, ARM::FeatureDB, ARM::FeatureV7Clrex,
ARM::FeatureAcquireRelease, ARM::FeatureSlowFPBrcc,
ARM::FeaturePerfMon, ARM::FeatureTrustZone, ARM::Feature8MSecExt,
ARM::FeatureCrypto, ARM::FeatureCRC, ARM::FeatureRAS,
ARM::FeatureFPAO, ARM::FeatureFuseAES, ARM::FeatureZCZeroing,
ARM::FeatureProfUnpredicate, ARM::FeatureSlowVGETLNi32,
ARM::FeatureSlowVDUP32, ARM::FeaturePreferVMOVSR,
ARM::FeaturePrefISHSTBarrier, ARM::FeatureMuxedUnits,
ARM::FeatureSlowOddRegister, ARM::FeatureSlowLoadDSubreg,
ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx,
ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs,
ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign,
ARM::FeatureHasSlowFPVMLx, ARM::FeatureHasSlowFPVFMx,
ARM::FeatureVMLxForwarding, ARM::FeaturePref32BitThumb,
ARM::FeatureAvoidPartialCPSR, ARM::FeatureCheapPredicableCPSR,
ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack,
ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP,
ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass,
ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign,
ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9,
ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates};
const ARMSubtarget *getST() const { return ST; }
const ARMTargetLowering *getTLI() const { return TLI; }

public:
explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
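// Illustrative only (a sketch, not part of this header): the ARM target
// machine constructs one of these per function when the optimizer queries
// TTI, along the lines of:
//   TargetTransformInfo
//   ARMBaseTargetMachine::getTargetTransformInfo(const Function &F) const {
//     return TargetTransformInfo(ARMTTIImpl(this, F));
//   }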
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
bool enableInterleavedAccessVectorization() { return true; }
TTI::AddressingModeKind
getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const;
/// Floating-point computation using ARMv8 AArch32 Advanced
/// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
/// and Arm MVE are IEEE-754 compliant.
bool isFPVectorizationPotentiallyUnsafe() {
  return !ST->isTargetDarwin() && !ST->hasMVEFloatOps();
}
std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
APInt &UndefElts2, APInt &UndefElts3,
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const;
/// \name Scalar TTI Implementations
/// @{
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
const APInt &Imm, Type *Ty);
using BaseT::getIntImmCost;
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind);
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind,
Instruction *Inst = nullptr);
/// @}
/// \name Vector TTI Implementations
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const {
  bool Vector = (ClassID == 1);
  if (Vector) {
    // NEON exposes sixteen 128-bit Q registers; MVE exposes eight.
    if (ST->hasNEON())
      return 16;
    if (ST->hasMVEIntegerOps())
      return 8;
    return 0;
  }

  // Thumb1 can only freely address the low registers r0-r7; otherwise
  // r0-r12 are usable as general-purpose registers.
  if (ST->isThumb1Only())
    return 8;
  return 13;
}
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
  switch (K) {
  case TargetTransformInfo::RGK_Scalar:
    return TypeSize::getFixed(32);
  case TargetTransformInfo::RGK_FixedWidthVector:
    if (ST->hasNEON())
      return TypeSize::getFixed(128);
    if (ST->hasMVEIntegerOps())
      return TypeSize::getFixed(128);
    return TypeSize::getFixed(0);
  case TargetTransformInfo::RGK_ScalableVector:
    return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}
unsigned getMaxInterleaveFactor(ElementCount VF) {
  return ST->getMaxInterleaveFactor();
}
bool isProfitableLSRChainElement(Instruction *I);
bool isLegalMaskedLoad(Type *DataTy, Align Alignment);
bool isLegalMaskedStore(Type *DataTy, Align Alignment) {
  return isLegalMaskedLoad(DataTy, Alignment);
}
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
// For MVE, we have a custom lowering pass that will already have custom
// legalised any gathers that we can lower to MVE intrinsics, and want to
// expand all the rest. The pass runs before the masked intrinsic lowering
// pass.
  return true;
}
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
  return forceScalarizeMaskedGather(VTy, Alignment);
}
bool isLegalMaskedGather(Type *Ty, Align Alignment);
bool isLegalMaskedScatter(Type *Ty, Align Alignment) {
  return isLegalMaskedGather(Ty, Alignment);
}
InstructionCost getMemcpyCost(const Instruction *I);
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const {
  return ST->getMaxInlineSizeThreshold();
}
int getNumMemOps(const IntrinsicInst *I) const;
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
ArrayRef<int> Mask,
TTI::TargetCostKind CostKind, int Index,
VectorType *SubTp,
ArrayRef<const Value *> Args = std::nullopt);
bool preferInLoopReduction(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
CmpInst::Predicate VecPred,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
TTI::TargetCostKind CostKind,
unsigned Index, Value *Op0, Value *Op1);
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE,
const SCEV *Ptr);
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
InstructionCost
getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
const Instruction *I = nullptr);
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind);
InstructionCost getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
bool UseMaskForCond = false, bool UseMaskForGaps = false);
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
const Value *Ptr, bool VariableMask,
Align Alignment,
TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
std::optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind);
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
Type *ResTy, VectorType *ValTy,
FastMathFlags FMF,
TTI::TargetCostKind CostKind);
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
VectorType *ValTy,
TTI::TargetCostKind CostKind);
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
FastMathFlags FMF,
TTI::TargetCostKind CostKind);
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
/// getScalingFactorCost - Return the cost of the scaling used in
/// addressing mode represented by AM.
/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, the return value must be negative.
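/// For example (illustrative): an ARM-mode load lowered to [r1, r2, lsl #2]
/// corresponds to Scale == 4 with HasBaseReg set; whether that form is free
/// depends on the subtarget and on Ty.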
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
int64_t BaseOffset, bool HasBaseReg,
int64_t Scale, unsigned AddrSpace) const;
bool maybeLoweredToCall(Instruction &I);
bool isLoweredToCall(const Function *F);
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo);
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE);
TailFoldingStyle
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const;
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
bool shouldBuildLookupTablesForConstant(Constant *C) const {
// In the ROPI and RWPI relocation models we can't have pointers to global
// variables or functions in constant data, so don't convert switches to
// lookup tables if any of the values would need relocation.
  if (ST->isROPI() || ST->isRWPI())
    return !C->needsDynamicRelocation();
  return true;
}
bool hasArmWideBranch(bool Thumb) const;
/// @}
};
/// isVREVMask - Check if a vector shuffle corresponds to a VREV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
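///
/// For example, for an <8 x i8> shuffle with BlockSize == 64, the mask
/// <7, 6, 5, 4, 3, 2, 1, 0> is a VREV64 mask: the single 64-bit block is
/// reversed element-wise.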
inline bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for VREV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz != 8 && EltSz != 16 && EltSz != 32)
    return false;

  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0, e = M.size(); i < e; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] !=
        (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}
} // end namespace llvm

#endif // LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H