//===- ARMTargetTransformInfo.h - ARM specific TTI --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file defines a TargetTransformInfoImplBase conforming object specific
/// to the ARM target machine. It uses the target's detailed information to
/// provide more precise answers to certain TTI queries, while letting the
/// target independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H

#include "ARM.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Function.h"
#include "llvm/TargetParser/SubtargetFeature.h"
#include <optional>

namespace llvm {

class APInt;
class ARMTargetLowering;
class Instruction;
class Loop;
class SCEV;
class ScalarEvolution;
class Type;
class Value;

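// Controls whether, and how aggressively, loops are converted into MVE
// tail-predicated hardware loops, and whether loops containing reductions
// are eligible.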
namespace TailPredication {
enum Mode {
  Disabled = 0,
  EnabledNoReductions,
  Enabled,
  ForceEnabledNoReductions,
  ForceEnabled
};
} // namespace TailPredication

// Controls the conversion of memcpy into a tail-predicated loop.
namespace TPLoop {
enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
} // namespace TPLoop

class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> {
  using BaseT = BasicTTIImplBase<ARMTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const ARMSubtarget *ST;
  const ARMTargetLowering *TLI;

  // Currently the following features are excluded from InlineFeaturesAllowed:
  // ModeThumb, FeatureNoARM, ModeSoftFloat.
  // Depending on whether they are set or unset, different
  // instructions/registers are available. For example, inlining a callee with
  // -thumb-mode into a caller with +thumb-mode may cause the assembler to
  // fail if the callee uses ARM-only instructions, e.g. in inline asm.
  const FeatureBitset InlineFeaturesAllowed = {
      ARM::Feature8MSecExt,
      ARM::FeatureAClass,
      ARM::FeatureAES,
      ARM::FeatureAcquireRelease,
      ARM::FeatureAvoidMOVsShOp,
      ARM::FeatureAvoidMULS,
      ARM::FeatureAvoidPartialCPSR,
      ARM::FeatureBF16,
      ARM::FeatureCRC,
      ARM::FeatureCheapPredicableCPSR,
      ARM::FeatureCheckVLDnAlign,
      ARM::FeatureCrypto,
      ARM::FeatureD32,
      ARM::FeatureDB,
      ARM::FeatureDFB,
      ARM::FeatureDSP,
      ARM::FeatureDontWidenVMOVS,
      ARM::FeatureDotProd,
      ARM::FeatureExecuteOnly,
      ARM::FeatureExpandMLx,
      ARM::FeatureFP16,
      ARM::FeatureFP16FML,
      ARM::FeatureFP64,
      ARM::FeatureFPAO,
      ARM::FeatureFPARMv8,
      ARM::FeatureFPARMv8_D16,
      ARM::FeatureFPARMv8_D16_SP,
      ARM::FeatureFPARMv8_SP,
      ARM::FeatureFPRegs,
      ARM::FeatureFPRegs16,
      ARM::FeatureFPRegs64,
      ARM::FeatureFullFP16,
      ARM::FeatureFuseAES,
      ARM::FeatureFuseLiterals,
      ARM::FeatureHWDivARM,
      ARM::FeatureHWDivThumb,
      ARM::FeatureHasNoBranchPredictor,
      ARM::FeatureHasRetAddrStack,
      ARM::FeatureHasSlowFPVFMx,
      ARM::FeatureHasSlowFPVMLx,
      ARM::FeatureHasVMLxHazards,
      ARM::FeatureLOB,
      ARM::FeatureLongCalls,
      ARM::FeatureMClass,
      ARM::FeatureMP,
      ARM::FeatureMVEVectorCostFactor1,
      ARM::FeatureMVEVectorCostFactor2,
      ARM::FeatureMVEVectorCostFactor4,
      ARM::FeatureMatMulInt8,
      ARM::FeatureMuxedUnits,
      ARM::FeatureNEON,
      ARM::FeatureNEONForFP,
      ARM::FeatureNEONForFPMovs,
      ARM::FeatureNoMovt,
      ARM::FeatureNoNegativeImmediates,
      ARM::FeatureNoPostRASched,
      ARM::FeaturePerfMon,
      ARM::FeaturePref32BitThumb,
      ARM::FeaturePrefISHSTBarrier,
      ARM::FeaturePreferBranchAlign32,
      ARM::FeaturePreferBranchAlign64,
      ARM::FeaturePreferVMOVSR,
      ARM::FeatureProfUnpredicate,
      ARM::FeatureRAS,
      ARM::FeatureRClass,
      ARM::FeatureReserveR9,
      ARM::FeatureSB,
      ARM::FeatureSHA2,
      ARM::FeatureSlowFPBrcc,
      ARM::FeatureSlowLoadDSubreg,
      ARM::FeatureSlowOddRegister,
      ARM::FeatureSlowVDUP32,
      ARM::FeatureSlowVGETLNi32,
      ARM::FeatureSplatVFPToNeon,
      ARM::FeatureStrictAlign,
      ARM::FeatureThumb2,
      ARM::FeatureTrustZone,
      ARM::FeatureUseMIPipeliner,
      ARM::FeatureUseMISched,
      ARM::FeatureUseWideStrideVFP,
      ARM::FeatureV7Clrex,
      ARM::FeatureVFP2,
      ARM::FeatureVFP2_SP,
      ARM::FeatureVFP3,
      ARM::FeatureVFP3_D16,
      ARM::FeatureVFP3_D16_SP,
      ARM::FeatureVFP3_SP,
      ARM::FeatureVFP4,
      ARM::FeatureVFP4_D16,
      ARM::FeatureVFP4_D16_SP,
      ARM::FeatureVFP4_SP,
      ARM::FeatureVMLxForwarding,
      ARM::FeatureVirtualization,
      ARM::FeatureZCZeroing,
      ARM::HasMVEFloatOps,
      ARM::HasMVEIntegerOps,
      ARM::HasV5TEOps,
      ARM::HasV5TOps,
      ARM::HasV6KOps,
      ARM::HasV6MOps,
      ARM::HasV6Ops,
      ARM::HasV6T2Ops,
      ARM::HasV7Ops,
      ARM::HasV8MBaselineOps,
      ARM::HasV8MMainlineOps,
      ARM::HasV8Ops,
      ARM::HasV8_1MMainlineOps,
      ARM::HasV8_1aOps,
      ARM::HasV8_2aOps,
      ARM::HasV8_3aOps,
      ARM::HasV8_4aOps,
      ARM::HasV8_5aOps,
      ARM::HasV8_6aOps,
      ARM::HasV8_7aOps,
      ARM::HasV8_8aOps,
      ARM::HasV8_9aOps,
      ARM::HasV9_0aOps,
      ARM::HasV9_1aOps,
      ARM::HasV9_2aOps,
      ARM::HasV9_3aOps,
      ARM::HasV9_4aOps,
      ARM::HasV9_5aOps,
      ARM::HasV9_6aOps,
      ARM::HasV9_7aOps};

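  // Accessors used by the BasicTTIImplBase CRTP base class.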
  const ARMSubtarget *getST() const { return ST; }
  const ARMTargetLowering *getTLI() const { return TLI; }

public:
  explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F)
      : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)),
        TLI(ST->getTargetLowering()) {}

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const override;

  bool enableInterleavedAccessVectorization() const override { return true; }

  TTI::AddressingModeKind
  getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override;

  /// Floating-point computation using ARMv8 AArch32 Advanced
  /// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
  /// and Arm MVE are IEEE-754 compliant.
  bool isFPVectorizationPotentiallyUnsafe() const override {
    return !ST->isTargetDarwin() && !ST->hasMVEFloatOps();
  }

  std::optional<Instruction *>
  instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override;
  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const override;

  /// \name Scalar TTI Implementations
  /// @{

  InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
                                        const APInt &Imm,
                                        Type *Ty) const override;

  using BaseT::getIntImmCost;
  InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
                                TTI::TargetCostKind CostKind) const override;

  InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                    const APInt &Imm, Type *Ty,
                                    TTI::TargetCostKind CostKind,
                                    Instruction *Inst = nullptr) const override;

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getNumberOfRegisters(unsigned ClassID) const override {
    bool Vector = (ClassID == 1);
    if (Vector) {
      // NEON provides sixteen 128-bit Q registers (Q0-Q15).
      if (ST->hasNEON())
        return 16;
      // MVE provides eight 128-bit Q registers (Q0-Q7).
      if (ST->hasMVEIntegerOps())
        return 8;
      return 0;
    }

    // Thumb1 can only freely use the low registers R0-R7; otherwise R0-R12
    // are available as general-purpose registers.
    if (ST->isThumb1Only())
      return 8;
    return 13;
  }

  TypeSize
  getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override {
    switch (K) {
    case TargetTransformInfo::RGK_Scalar:
      return TypeSize::getFixed(32);
    case TargetTransformInfo::RGK_FixedWidthVector:
      if (ST->hasNEON())
        return TypeSize::getFixed(128);
      if (ST->hasMVEIntegerOps())
        return TypeSize::getFixed(128);
      return TypeSize::getFixed(0);
    case TargetTransformInfo::RGK_ScalableVector:
      return TypeSize::getScalable(0);
    }
    llvm_unreachable("Unsupported register kind");
  }

  unsigned getMaxInterleaveFactor(ElementCount VF) const override {
    return ST->getMaxInterleaveFactor();
  }

  bool isProfitableLSRChainElement(Instruction *I) const override;

  bool
  isLegalMaskedLoad(Type *DataTy, Align Alignment, unsigned AddressSpace,
                    TTI::MaskKind MaskKind =
                        TTI::MaskKind::VariableOrConstantMask) const override;

  bool
  isLegalMaskedStore(Type *DataTy, Align Alignment, unsigned AddressSpace,
                     TTI::MaskKind MaskKind =
                         TTI::MaskKind::VariableOrConstantMask) const override {
    return isLegalMaskedLoad(DataTy, Alignment, AddressSpace, MaskKind);
  }

  bool forceScalarizeMaskedGather(VectorType *VTy,
                                  Align Alignment) const override {
    // For MVE, a custom lowering pass will already have legalised any gathers
    // that can be lowered to MVE intrinsics, and we want to expand all the
    // rest. That pass runs before the masked intrinsic lowering pass.
    return true;
  }

  bool forceScalarizeMaskedScatter(VectorType *VTy,
                                   Align Alignment) const override {
    return forceScalarizeMaskedGather(VTy, Alignment);
  }

  bool isLegalMaskedGather(Type *Ty, Align Alignment) const override;

  bool isLegalMaskedScatter(Type *Ty, Align Alignment) const override {
    return isLegalMaskedGather(Ty, Alignment);
  }

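  /// Return the cost of a memcpy, based on the number of memory operations
  /// it is expected to be lowered to (see getNumMemOps).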
  InstructionCost getMemcpyCost(const Instruction *I) const override;

  uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override {
    return ST->getMaxInlineSizeThreshold();
  }

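  /// Given a memcpy/memset/memmove intrinsic, return the number of memory
  /// operations it will be lowered to, or -1 if a library call is used
  /// instead.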
  int getNumMemOps(const IntrinsicInst *I) const;

  InstructionCost
  getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy,
                 ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index,
                 VectorType *SubTp, ArrayRef<const Value *> Args = {},
                 const Instruction *CxtI = nullptr) const override;

  bool preferInLoopReduction(RecurKind Kind, Type *Ty) const override;

  bool preferPredicatedReductionSelect() const override;

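  // Vector reduction intrinsics are not expanded in IR; the backend lowers
  // them directly.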
  bool shouldExpandReduction(const IntrinsicInst *II) const override {
    return false;
  }

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) const override;

  InstructionCost
  getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                   TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
                   const Instruction *I = nullptr) const override;

  InstructionCost getCmpSelInstrCost(
      unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  using BaseT::getVectorInstrCost;
  InstructionCost
  getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
                     unsigned Index, const Value *Op0, const Value *Op1,
                     TTI::VectorInstrContext VIC =
                         TTI::VectorInstrContext::None) const override;

  InstructionCost
  getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr,
                            TTI::TargetCostKind CostKind) const override;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = {},
      const Instruction *CxtI = nullptr) const override;

  InstructionCost getMemoryOpCost(
      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
      TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
      const Instruction *I = nullptr) const override;

  InstructionCost
  getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA,
                           TTI::TargetCostKind CostKind) const override;

  InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA,
                                        TTI::TargetCostKind CostKind) const;

  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) const override;

  InstructionCost
  getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA,
                         TTI::TargetCostKind CostKind) const;

  InstructionCost
  getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                             std::optional<FastMathFlags> FMF,
                             TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy,
                           VectorType *ValTy, std::optional<FastMathFlags> FMF,
                           TTI::TargetCostKind CostKind) const override;
  InstructionCost
  getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy,
                         VectorType *ValTy,
                         TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF,
                         TTI::TargetCostKind CostKind) const override;

  InstructionCost
  getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                        TTI::TargetCostKind CostKind) const override;

  /// getScalingFactorCost - Return the cost of the scaling factor used in the
  /// addressing mode represented by the given base, offset and scale.
  /// If the addressing mode is supported, the return value must be >= 0.
  /// If it is not supported, the return value is an invalid cost.
  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       StackOffset BaseOffset, bool HasBaseReg,
                                       int64_t Scale,
                                       unsigned AddrSpace) const override;

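  /// Return true if instruction \p I may end up being lowered to a call.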
  bool maybeLoweredToCall(Instruction &I) const;
  bool isLoweredToCall(const Function *F) const override;
  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC, TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) const override;
  bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const override;
  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE) const override;

  TailFoldingStyle
  getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) const override;

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) const override;
  bool shouldBuildLookupTablesForConstant(Constant *C) const override {
    // In the ROPI and RWPI relocation models we can't have pointers to global
    // variables or functions in constant data, so don't convert switches to
    // lookup tables if any of the values would need relocation.
    if (ST->isROPI() || ST->isRWPI())
      return !C->needsDynamicRelocation();

    return true;
  }

  bool hasArmWideBranch(bool Thumb) const override;

  bool isProfitableToSinkOperands(Instruction *I,
                                  SmallVectorImpl<Use *> &Ops) const override;

  unsigned getNumBytesToPadGlobalArray(unsigned Size,
                                       Type *ArrayType) const override;

  /// @}
};

/// isVREVMask - Check if a vector shuffle corresponds to a VREV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
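/// For example, with a block size of 64, an <8 x i16> shuffle mask of
/// <3,2,1,0, 7,6,5,4> corresponds to a VREV64.16.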
inline bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
         "Only possible block sizes for VREV are: 16, 32, 64");

  unsigned EltSz = VT.getScalarSizeInBits();
  if (EltSz != 8 && EltSz != 16 && EltSz != 32)
    return false;

  unsigned BlockElts = M[0] + 1;
  // If the first shuffle index is UNDEF, be optimistic.
  if (M[0] < 0)
    BlockElts = BlockSize / EltSz;

  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
    return false;

  for (unsigned i = 0, e = M.size(); i < e; ++i) {
    if (M[i] < 0)
      continue; // ignore UNDEF indices
    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
      return false;
  }

  return true;
}

} // end namespace llvm

#endif // LLVM_LIB_TARGET_ARM_ARMTARGETTRANSFORMINFO_H