| //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //==-----------------------------------------------------------------------===// |
| // |
| /// \file |
| /// AMD GCN specific subclass of TargetSubtarget. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H |
| #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H |
| |
| #include "AMDGPUCallLowering.h" |
| #include "AMDGPUSubtarget.h" |
| #include "SIFrameLowering.h" |
| #include "SIISelLowering.h" |
| #include "SIInstrInfo.h" |
| #include "llvm/CodeGen/SelectionDAGTargetInfo.h" |
| |
| #define GET_SUBTARGETINFO_HEADER |
| #include "AMDGPUGenSubtargetInfo.inc" |
| |
| namespace llvm { |
| |
| class GCNTargetMachine; |
| |
| /// Subtarget description for GCN targets. |
| /// |
| /// Derives from both AMDGPUGenSubtargetInfo (declared through |
| /// AMDGPUGenSubtargetInfo.inc) and the shared AMDGPUSubtarget base. The |
| /// object owns the SIInstrInfo, SITargetLowering and SIFrameLowering |
| /// instances by value and holds the GlobalISel helper objects through |
| /// std::unique_ptr members. Most of the public interface consists of cheap |
| /// predicates that either return a stored feature bit or compare the |
| /// subtarget generation against a fixed threshold. |
| class GCNSubtarget final : public AMDGPUGenSubtargetInfo, |
| public AMDGPUSubtarget { |
| |
| // Declared before the first access specifier, so the name is introduced in |
| // the class's default (private) section. |
| using AMDGPUSubtarget::getMaxWavesPerEU; |
| |
| public: |
| // Following 2 enums are documented at: |
| // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi |
| enum class TrapHandlerAbi { |
| NONE = 0x00, |
| AMDHSA = 0x01, |
| }; |
| |
| enum class TrapID { |
| LLVMAMDHSATrap = 0x02, |
| LLVMAMDHSADebugTrap = 0x03, |
| }; |
| |
| private: |
| /// GlobalISel related APIs. |
| std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; |
| std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; |
| std::unique_ptr<InstructionSelector> InstSelector; |
| std::unique_ptr<LegalizerInfo> Legalizer; |
| std::unique_ptr<RegisterBankInfo> RegBankInfo; |
| |
| protected: |
| // Basic subtarget description. |
| Triple TargetTriple; |
| AMDGPU::IsaInfo::AMDGPUTargetID TargetID; |
| unsigned Gen; |
| InstrItineraryData InstrItins; |
| int LDSBankCount; |
| unsigned MaxPrivateElementSize; |
| |
| // Possibly statically set by tablegen, but may want to be overridden. |
| bool FastFMAF32; |
| bool FastDenormalF32; |
| bool HalfRate64Ops; |
| bool FullRate64Ops; |
| |
| // Dynamically set bits that enable features. |
| bool FlatForGlobal; |
| bool AutoWaitcntBeforeBarrier; |
| bool UnalignedScratchAccess; |
| bool UnalignedAccessMode; |
| bool HasApertureRegs; |
| bool SupportsXNACK; |
| |
| // This should not be used directly. 'TargetID' tracks the dynamic settings |
| // for XNACK. |
| bool EnableXNACK; |
| |
| bool EnableTgSplit; |
| bool EnableCuMode; |
| bool TrapHandler; |
| |
| // Used as options. |
| bool EnableLoadStoreOpt; |
| bool EnableUnsafeDSOffsetFolding; |
| bool EnableSIScheduler; |
| bool EnableDS128; |
| bool EnablePRTStrictNull; |
| bool DumpCode; |
| |
| // Subtarget properties statically set by tablegen. |
| bool FP64; |
| bool FMA; |
| bool MIMG_R128; |
| bool CIInsts; |
| bool GFX8Insts; |
| bool GFX9Insts; |
| bool GFX90AInsts; |
| bool GFX10Insts; |
| bool GFX10_3Insts; |
| bool GFX7GFX8GFX9Insts; |
| bool SGPRInitBug; |
| bool NegativeScratchOffsetBug; |
| bool NegativeUnalignedScratchOffsetBug; |
| bool HasSMemRealTime; |
| bool HasIntClamp; |
| bool HasFmaMixInsts; |
| bool HasMovrel; |
| bool HasVGPRIndexMode; |
| bool HasScalarStores; |
| bool HasScalarAtomics; |
| bool HasSDWAOmod; |
| bool HasSDWAScalar; |
| bool HasSDWASdst; |
| bool HasSDWAMac; |
| bool HasSDWAOutModsVOPC; |
| bool HasDPP; |
| bool HasDPP8; |
| bool Has64BitDPP; |
| bool HasPackedFP32Ops; |
| bool HasExtendedImageInsts; |
| bool HasR128A16; |
| bool HasGFX10A16; |
| bool HasG16; |
| bool HasNSAEncoding; |
| unsigned NSAMaxSize; |
| bool GFX10_AEncoding; |
| bool GFX10_BEncoding; |
| bool HasDLInsts; |
| bool HasDot1Insts; |
| bool HasDot2Insts; |
| bool HasDot3Insts; |
| bool HasDot4Insts; |
| bool HasDot5Insts; |
| bool HasDot6Insts; |
| bool HasDot7Insts; |
| bool HasMAIInsts; |
| bool HasPkFmacF16Inst; |
| bool HasAtomicFaddInsts; |
| bool SupportsSRAMECC; |
| |
| // This should not be used directly. 'TargetID' tracks the dynamic settings |
| // for SRAMECC. |
| bool EnableSRAMECC; |
| |
| bool HasNoSdstCMPX; |
| bool HasVscnt; |
| bool HasGetWaveIdInst; |
| bool HasSMemTimeInst; |
| bool HasShaderCyclesRegister; |
| bool HasRegisterBanking; |
| bool HasVOP3Literal; |
| bool HasNoDataDepHazard; |
| bool FlatAddressSpace; |
| bool FlatInstOffsets; |
| bool FlatGlobalInsts; |
| bool FlatScratchInsts; |
| bool ScalarFlatScratchInsts; |
| bool HasArchitectedFlatScratch; |
| bool AddNoCarryInsts; |
| bool HasUnpackedD16VMem; |
| bool LDSMisalignedBug; |
| bool HasMFMAInlineLiteralBug; |
| bool UnalignedBufferAccess; |
| bool UnalignedDSAccess; |
| bool HasPackedTID; |
| bool ScalarizeGlobal; |
| |
| bool HasVcmpxPermlaneHazard; |
| bool HasVMEMtoScalarWriteHazard; |
| bool HasSMEMtoVectorWriteHazard; |
| bool HasInstFwdPrefetchBug; |
| bool HasVcmpxExecWARHazard; |
| bool HasLdsBranchVmemWARHazard; |
| bool HasNSAtoVMEMBug; |
| bool HasNSAClauseBug; |
| bool HasOffset3fBug; |
| bool HasFlatSegmentOffsetBug; |
| bool HasImageStoreD16Bug; |
| bool HasImageGather4D16Bug; |
| |
| // Dummy feature to use for assembler in tablegen. |
| bool FeatureDisable; |
| |
| SelectionDAGTargetInfo TSInfo; |
| private: |
| // Codegen helper objects owned by value; exposed through the getters below. |
| SIInstrInfo InstrInfo; |
| SITargetLowering TLInfo; |
| SIFrameLowering FrameLowering; |
| |
| public: |
| // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword. |
| // The value is (256 * 4) multiplied by the largest value that fits in 13 |
| // bits. |
| static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1); |
| |
| GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, |
| const GCNTargetMachine &TM); |
| ~GCNSubtarget() override; |
| |
| GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, |
| StringRef GPU, StringRef FS); |
| |
| /// \returns the SIInstrInfo owned by this subtarget. |
| const SIInstrInfo *getInstrInfo() const override { |
| return &InstrInfo; |
| } |
| |
| /// \returns the SIFrameLowering owned by this subtarget. |
| const SIFrameLowering *getFrameLowering() const override { |
| return &FrameLowering; |
| } |
| |
| /// \returns the SITargetLowering owned by this subtarget. |
| const SITargetLowering *getTargetLowering() const override { |
| return &TLInfo; |
| } |
| |
| /// \returns the register info obtained from the owned SIInstrInfo. |
| const SIRegisterInfo *getRegisterInfo() const override { |
| return &InstrInfo.getRegisterInfo(); |
| } |
| |
| // GlobalISel accessors. Each one returns the raw pointer currently held by |
| // the corresponding std::unique_ptr member. |
| const CallLowering *getCallLowering() const override { |
| return CallLoweringInfo.get(); |
| } |
| |
| const InlineAsmLowering *getInlineAsmLowering() const override { |
| return InlineAsmLoweringInfo.get(); |
| } |
| |
| InstructionSelector *getInstructionSelector() const override { |
| return InstSelector.get(); |
| } |
| |
| const LegalizerInfo *getLegalizerInfo() const override { |
| return Legalizer.get(); |
| } |
| |
| const RegisterBankInfo *getRegBankInfo() const override { |
| return RegBankInfo.get(); |
| } |
| |
| /// \returns the target ID object stored in this subtarget. |
| const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { |
| return TargetID; |
| } |
| |
| // Nothing implemented, just prevent crashes on use. |
| const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { |
| return &TSInfo; |
| } |
| |
| /// \returns the instruction itinerary data stored in this subtarget. |
| const InstrItineraryData *getInstrItineraryData() const override { |
| return &InstrItins; |
| } |
| |
| void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); |
| |
| /// \returns the stored Gen value converted to the Generation enumeration. |
| Generation getGeneration() const { |
| return (Generation)Gen; |
| } |
| |
| /// Return the number of high bits known to be zero for a frame index. |
| unsigned getKnownHighZeroBitsForFrameIndex() const { |
| return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); |
| } |
| |
| /// \returns the stored LDS bank count. |
| int getLDSBankCount() const { |
| return LDSBankCount; |
| } |
| |
| /// \returns MaxPrivateElementSize when \p ForBufferRSrc is set or flat |
| /// scratch is not enabled; otherwise 16. |
| unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { |
| return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; |
| } |
| |
| unsigned getConstantBusLimit(unsigned Opcode) const; |
| |
| /// Returns if the result of this instruction with a 16-bit result returned in |
| /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve |
| /// the original value. |
| bool zeroesHigh16BitsOfDest(unsigned Opcode) const; |
| |
| bool hasIntClamp() const { |
| return HasIntClamp; |
| } |
| |
| bool hasFP64() const { |
| return FP64; |
| } |
| |
| bool hasMIMG_R128() const { |
| return MIMG_R128; |
| } |
| |
| /// Reports the same FP64 bit as hasFP64(). |
| bool hasHWFP64() const { |
| return FP64; |
| } |
| |
| bool hasFastFMAF32() const { |
| return FastFMAF32; |
| } |
| |
| bool hasHalfRate64Ops() const { |
| return HalfRate64Ops; |
| } |
| |
| bool hasFullRate64Ops() const { |
| return FullRate64Ops; |
| } |
| |
| /// True for generations before VOLCANIC_ISLANDS. |
| bool hasAddr64() const { |
| return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); |
| } |
| |
| /// True for generations after SOUTHERN_ISLANDS. |
| bool hasFlat() const { |
| return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); |
| } |
| |
| // Return true if the target only has the reverse operand versions of VALU |
| // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). |
| bool hasOnlyRevVALUShifts() const { |
| return getGeneration() >= VOLCANIC_ISLANDS; |
| } |
| |
| /// True only for the SOUTHERN_ISLANDS generation. |
| bool hasFractBug() const { |
| return getGeneration() == SOUTHERN_ISLANDS; |
| } |
| |
| // The following bit-manipulation queries are unconditionally true for this |
| // subtarget class; hasBFM() defers to hasBFE(). |
| bool hasBFE() const { |
| return true; |
| } |
| |
| bool hasBFI() const { |
| return true; |
| } |
| |
| bool hasBFM() const { |
| return hasBFE(); |
| } |
| |
| // The Size argument is not consulted; the answer is always true. |
| bool hasBCNT(unsigned Size) const { |
| return true; |
| } |
| |
| bool hasFFBL() const { |
| return true; |
| } |
| |
| bool hasFFBH() const { |
| return true; |
| } |
| |
| /// True for GFX9 and later generations. |
| bool hasMed3_16() const { |
| return getGeneration() >= AMDGPUSubtarget::GFX9; |
| } |
| |
| /// True for GFX9 and later generations. |
| bool hasMin3Max3_16() const { |
| return getGeneration() >= AMDGPUSubtarget::GFX9; |
| } |
| |
| bool hasFmaMixInsts() const { |
| return HasFmaMixInsts; |
| } |
| |
| bool hasCARRY() const { |
| return true; |
| } |
| |
| bool hasFMA() const { |
| return FMA; |
| } |
| |
| // The next three queries are all tied to the GFX9Insts feature bit. |
| bool hasSwap() const { |
| return GFX9Insts; |
| } |
| |
| bool hasScalarPackInsts() const { |
| return GFX9Insts; |
| } |
| |
| bool hasScalarMulHiInsts() const { |
| return GFX9Insts; |
| } |
| |
| /// \returns TrapHandlerAbi::AMDHSA when isAmdHsaOS() is true and |
| /// TrapHandlerAbi::NONE otherwise. |
| TrapHandlerAbi getTrapHandlerAbi() const { |
| return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; |
| } |
| |
| bool supportsGetDoorbellID() const { |
| // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. |
| return getGeneration() >= GFX9; |
| } |
| |
| /// True if the offset field of DS instructions works as expected. On SI, the |
| /// offset uses a 16-bit adder and does not always wrap properly. |
| bool hasUsableDSOffset() const { |
| return getGeneration() >= SEA_ISLANDS; |
| } |
| |
| bool unsafeDSOffsetFoldingEnabled() const { |
| return EnableUnsafeDSOffsetFolding; |
| } |
| |
| /// Condition output from div_scale is usable. |
| bool hasUsableDivScaleConditionOutput() const { |
| return getGeneration() != SOUTHERN_ISLANDS; |
| } |
| |
| /// Extra wait hazard is needed in some cases before |
| /// s_cbranch_vccnz/s_cbranch_vccz. |
| bool hasReadVCCZBug() const { |
| return getGeneration() <= SEA_ISLANDS; |
| } |
| |
| /// Writes to VCC_LO/VCC_HI update the VCCZ flag. |
| bool partialVCCWritesUpdateVCCZ() const { |
| return getGeneration() >= GFX10; |
| } |
| |
| /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR |
| /// was written by a VALU instruction. |
| bool hasSMRDReadVALUDefHazard() const { |
| return getGeneration() == SOUTHERN_ISLANDS; |
| } |
| |
| /// A read of an SGPR by a VMEM instruction requires 5 wait states when the |
| /// SGPR was written by a VALU Instruction. |
| bool hasVMEMReadSGPRVALUDefHazard() const { |
| return getGeneration() >= VOLCANIC_ISLANDS; |
| } |
| |
| /// True for VOLCANIC_ISLANDS and later generations. |
| bool hasRFEHazards() const { |
| return getGeneration() >= VOLCANIC_ISLANDS; |
| } |
| |
| /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. |
| unsigned getSetRegWaitStates() const { |
| return getGeneration() <= SEA_ISLANDS ? 1 : 2; |
| } |
| |
| bool dumpCode() const { |
| return DumpCode; |
| } |
| |
| /// Return the amount of LDS that can be used that will not restrict the |
| /// occupancy lower than WaveCount. |
| unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, |
| const Function &) const; |
| |
| /// True for GFX9 and later generations. |
| bool supportsMinMaxDenormModes() const { |
| return getGeneration() >= AMDGPUSubtarget::GFX9; |
| } |
| |
| /// \returns If target supports S_DENORM_MODE. |
| bool hasDenormModeInst() const { |
| return getGeneration() >= AMDGPUSubtarget::GFX10; |
| } |
| |
| bool useFlatForGlobal() const { |
| return FlatForGlobal; |
| } |
| |
| /// \returns If target supports ds_read/write_b128 and user enables generation |
| /// of ds_read/write_b128. |
| bool useDS128() const { |
| return CIInsts && EnableDS128; |
| } |
| |
| /// \return If target supports ds_read/write_b96/128. |
| bool hasDS96AndDS128() const { |
| return CIInsts; |
| } |
| |
| /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 |
| bool haveRoundOpsF64() const { |
| return CIInsts; |
| } |
| |
| /// \returns If MUBUF instructions always perform range checking, even for |
| /// buffer resources used for private memory access. |
| bool privateMemoryResourceIsRangeChecked() const { |
| return getGeneration() < AMDGPUSubtarget::GFX9; |
| } |
| |
| /// \returns If target requires PRT Strict NULL support (zero result registers |
| /// for sparse texture support). |
| bool usePRTStrictNull() const { |
| return EnablePRTStrictNull; |
| } |
| |
| bool hasAutoWaitcntBeforeBarrier() const { |
| return AutoWaitcntBeforeBarrier; |
| } |
| |
| bool hasUnalignedBufferAccess() const { |
| return UnalignedBufferAccess; |
| } |
| |
| /// True only when both UnalignedBufferAccess and UnalignedAccessMode are set. |
| bool hasUnalignedBufferAccessEnabled() const { |
| return UnalignedBufferAccess && UnalignedAccessMode; |
| } |
| |
| bool hasUnalignedDSAccess() const { |
| return UnalignedDSAccess; |
| } |
| |
| /// True only when both UnalignedDSAccess and UnalignedAccessMode are set. |
| bool hasUnalignedDSAccessEnabled() const { |
| return UnalignedDSAccess && UnalignedAccessMode; |
| } |
| |
| bool hasUnalignedScratchAccess() const { |
| return UnalignedScratchAccess; |
| } |
| |
| bool hasUnalignedAccessMode() const { |
| return UnalignedAccessMode; |
| } |
| |
| bool hasApertureRegs() const { |
| return HasApertureRegs; |
| } |
| |
| bool isTrapHandlerEnabled() const { |
| return TrapHandler; |
| } |
| |
| /// Queries TargetID rather than the EnableXNACK member. |
| bool isXNACKEnabled() const { |
| return TargetID.isXnackOnOrAny(); |
| } |
| |
| bool isTgSplitEnabled() const { |
| return EnableTgSplit; |
| } |
| |
| bool isCuModeEnabled() const { |
| return EnableCuMode; |
| } |
| |
| bool hasFlatAddressSpace() const { |
| return FlatAddressSpace; |
| } |
| |
| /// Defers to hasFlatAddressSpace(). |
| bool hasFlatScrRegister() const { |
| return hasFlatAddressSpace(); |
| } |
| |
| bool hasFlatInstOffsets() const { |
| return FlatInstOffsets; |
| } |
| |
| bool hasFlatGlobalInsts() const { |
| return FlatGlobalInsts; |
| } |
| |
| bool hasFlatScratchInsts() const { |
| return FlatScratchInsts; |
| } |
| |
| // Check if target supports ST addressing mode with FLAT scratch instructions. |
| // The ST addressing mode means no registers are used, either VGPR or SGPR, |
| // but only immediate offset is swizzled and added to the FLAT scratch base. |
| bool hasFlatScratchSTMode() const { |
| return hasFlatScratchInsts() && hasGFX10_3Insts(); |
| } |
| |
| bool hasScalarFlatScratchInsts() const { |
| return ScalarFlatScratchInsts; |
| } |
| |
| /// Tied to the GFX10_BEncoding feature bit. |
| bool hasGlobalAddTidInsts() const { |
| return GFX10_BEncoding; |
| } |
| |
| /// Tied to the GFX10_BEncoding feature bit. |
| bool hasAtomicCSub() const { |
| return GFX10_BEncoding; |
| } |
| |
| /// True for GFX9 and later generations. |
| bool hasMultiDwordFlatScratchAddressing() const { |
| return getGeneration() >= GFX9; |
| } |
| |
| bool hasFlatSegmentOffsetBug() const { |
| return HasFlatSegmentOffsetBug; |
| } |
| |
| /// True for generations strictly after GFX9. |
| bool hasFlatLgkmVMemCountInOrder() const { |
| return getGeneration() > GFX9; |
| } |
| |
| /// True for GFX9 and later generations. |
| bool hasD16LoadStore() const { |
| return getGeneration() >= GFX9; |
| } |
| |
| /// True when D16 load/store is available and TargetID does not report |
| /// SRAMECC as on or any. |
| bool d16PreservesUnusedBits() const { |
| return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); |
| } |
| |
| /// True for VOLCANIC_ISLANDS and later generations. |
| bool hasD16Images() const { |
| return getGeneration() >= VOLCANIC_ISLANDS; |
| } |
| |
| /// Return if most LDS instructions have an m0 use that require m0 to be |
| /// initialized. |
| bool ldsRequiresM0Init() const { |
| return getGeneration() < GFX9; |
| } |
| |
| // True if the hardware rewinds and replays GWS operations if a wave is |
| // preempted. |
| // |
| // If this is false, a GWS operation requires testing if a nack set the |
| // MEM_VIOL bit, and repeating if so. |
| bool hasGWSAutoReplay() const { |
| return getGeneration() >= GFX9; |
| } |
| |
| /// \returns if target has ds_gws_sema_release_all instruction. |
| bool hasGWSSemaReleaseAll() const { |
| return CIInsts; |
| } |
| |
| /// \returns true if the target has integer add/sub instructions that do not |
| /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, |
| /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier |
| /// for saturation. |
| bool hasAddNoCarry() const { |
| return AddNoCarryInsts; |
| } |
| |
| bool hasUnpackedD16VMem() const { |
| return HasUnpackedD16VMem; |
| } |
| |
| // Covers VS/PS/CS graphics shaders |
| bool isMesaGfxShader(const Function &F) const { |
| return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); |
| } |
| |
| /// True for SEA_ISLANDS and later generations. |
| bool hasMad64_32() const { |
| return getGeneration() >= SEA_ISLANDS; |
| } |
| |
| // Plain feature-bit getters: each returns the member of the matching name. |
| bool hasSDWAOmod() const { |
| return HasSDWAOmod; |
| } |
| |
| bool hasSDWAScalar() const { |
| return HasSDWAScalar; |
| } |
| |
| bool hasSDWASdst() const { |
| return HasSDWASdst; |
| } |
| |
| bool hasSDWAMac() const { |
| return HasSDWAMac; |
| } |
| |
| bool hasSDWAOutModsVOPC() const { |
| return HasSDWAOutModsVOPC; |
| } |
| |
| bool hasDLInsts() const { |
| return HasDLInsts; |
| } |
| |
| bool hasDot1Insts() const { |
| return HasDot1Insts; |
| } |
| |
| bool hasDot2Insts() const { |
| return HasDot2Insts; |
| } |
| |
| bool hasDot3Insts() const { |
| return HasDot3Insts; |
| } |
| |
| bool hasDot4Insts() const { |
| return HasDot4Insts; |
| } |
| |
| bool hasDot5Insts() const { |
| return HasDot5Insts; |
| } |
| |
| bool hasDot6Insts() const { |
| return HasDot6Insts; |
| } |
| |
| bool hasDot7Insts() const { |
| return HasDot7Insts; |
| } |
| |
| bool hasMAIInsts() const { |
| return HasMAIInsts; |
| } |
| |
| bool hasPkFmacF16Inst() const { |
| return HasPkFmacF16Inst; |
| } |
| |
| bool hasAtomicFaddInsts() const { |
| return HasAtomicFaddInsts; |
| } |
| |
| bool hasNoSdstCMPX() const { |
| return HasNoSdstCMPX; |
| } |
| |
| bool hasVscnt() const { |
| return HasVscnt; |
| } |
| |
| bool hasGetWaveIdInst() const { |
| return HasGetWaveIdInst; |
| } |
| |
| bool hasSMemTimeInst() const { |
| return HasSMemTimeInst; |
| } |
| |
| bool hasShaderCyclesRegister() const { |
| return HasShaderCyclesRegister; |
| } |
| |
| bool hasRegisterBanking() const { |
| return HasRegisterBanking; |
| } |
| |
| bool hasVOP3Literal() const { |
| return HasVOP3Literal; |
| } |
| |
| bool hasNoDataDepHazard() const { |
| return HasNoDataDepHazard; |
| } |
| |
| /// True for generations before SEA_ISLANDS. |
| bool vmemWriteNeedsExpWaitcnt() const { |
| return getGeneration() < SEA_ISLANDS; |
| } |
| |
| // Scratch is allocated in 256 dword per wave blocks for the entire |
| // wavefront. When viewed from the perspective of an arbitrary workitem, this |
| // is 4-byte aligned. |
| // |
| // Only 4-byte alignment is really needed to access anything. Transformations |
| // on the pointer value itself may rely on the alignment / known low bits of |
| // the pointer. Set this to something above the minimum to avoid needing |
| // dynamic realignment in common cases. |
| Align getStackAlignment() const { return Align(16); } |
| |
| bool enableMachineScheduler() const override { |
| return true; |
| } |
| |
| bool useAA() const override; |
| |
| bool enableSubRegLiveness() const override { |
| return true; |
| } |
| |
| // Setter/getter pair for the ScalarizeGlobal flag. |
| void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } |
| bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } |
| |
| // static wrappers |
| static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); |
| |
| // XXX - Why is this here if it isn't in the default pass set? |
| bool enableEarlyIfConversion() const override { |
| return true; |
| } |
| |
| bool enableFlatScratch() const; |
| |
| void overrideSchedPolicy(MachineSchedPolicy &Policy, |
| unsigned NumRegionInstrs) const override; |
| |
| /// \returns the fixed limit of 16 user SGPRs. |
| unsigned getMaxNumUserSGPRs() const { |
| return 16; |
| } |
| |
| bool hasSMemRealTime() const { |
| return HasSMemRealTime; |
| } |
| |
| bool hasMovrel() const { |
| return HasMovrel; |
| } |
| |
| bool hasVGPRIndexMode() const { |
| return HasVGPRIndexMode; |
| } |
| |
| bool useVGPRIndexMode() const; |
| |
| /// True for VOLCANIC_ISLANDS and later generations. |
| bool hasScalarCompareEq64() const { |
| return getGeneration() >= VOLCANIC_ISLANDS; |
| } |
| |
| bool hasScalarStores() const { |
| return HasScalarStores; |
| } |
| |
| bool hasScalarAtomics() const { |
| return HasScalarAtomics; |
| } |
| |
| /// Tied to the GFX8Insts feature bit. |
| bool hasLDSFPAtomicAdd() const { return GFX8Insts; } |
| |
| /// \returns true if the subtarget has the v_permlanex16_b32 instruction. |
| bool hasPermLaneX16() const { return getGeneration() >= GFX10; } |
| |
| bool hasDPP() const { |
| return HasDPP; |
| } |
| |
| /// Requires HasDPP and a generation before GFX10. |
| bool hasDPPBroadcasts() const { |
| return HasDPP && getGeneration() < GFX10; |
| } |
| |
| /// Requires HasDPP and a generation before GFX10. |
| bool hasDPPWavefrontShifts() const { |
| return HasDPP && getGeneration() < GFX10; |
| } |
| |
| bool hasDPP8() const { |
| return HasDPP8; |
| } |
| |
| bool has64BitDPP() const { |
| return Has64BitDPP; |
| } |
| |
| bool hasPackedFP32Ops() const { |
| return HasPackedFP32Ops; |
| } |
| |
| /// True for GFX10 and later generations. |
| bool hasFmaakFmamkF32Insts() const { |
| return getGeneration() >= GFX10; |
| } |
| |
| bool hasExtendedImageInsts() const { |
| return HasExtendedImageInsts; |
| } |
| |
| bool hasR128A16() const { |
| return HasR128A16; |
| } |
| |
| bool hasGFX10A16() const { |
| return HasGFX10A16; |
| } |
| |
| /// True if either hasR128A16() or hasGFX10A16() holds. |
| bool hasA16() const { return hasR128A16() || hasGFX10A16(); } |
| |
| bool hasG16() const { return HasG16; } |
| |
| bool hasOffset3fBug() const { |
| return HasOffset3fBug; |
| } |
| |
| bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } |
| |
| bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } |
| |
| bool hasNSAEncoding() const { return HasNSAEncoding; } |
| |
| unsigned getNSAMaxSize() const { return NSAMaxSize; } |
| |
| bool hasGFX10_AEncoding() const { |
| return GFX10_AEncoding; |
| } |
| |
| bool hasGFX10_BEncoding() const { |
| return GFX10_BEncoding; |
| } |
| |
| bool hasGFX10_3Insts() const { |
| return GFX10_3Insts; |
| } |
| |
| bool hasMadF16() const; |
| |
| bool enableSIScheduler() const { |
| return EnableSIScheduler; |
| } |
| |
| bool loadStoreOptEnabled() const { |
| return EnableLoadStoreOpt; |
| } |
| |
| bool hasSGPRInitBug() const { |
| return SGPRInitBug; |
| } |
| |
| bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } |
| |
| bool hasNegativeUnalignedScratchOffsetBug() const { |
| return NegativeUnalignedScratchOffsetBug; |
| } |
| |
| bool hasMFMAInlineLiteralBug() const { |
| return HasMFMAInlineLiteralBug; |
| } |
| |
| /// True on every generation except SOUTHERN_ISLANDS. |
| bool has12DWordStoreHazard() const { |
| return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; |
| } |
| |
| /// \returns true if the subtarget supports DWORDX3 load/store instructions. |
| bool hasDwordx3LoadStores() const { |
| return CIInsts; |
| } |
| |
| /// True only for the GFX9 generation. |
| bool hasReadM0MovRelInterpHazard() const { |
| return getGeneration() == AMDGPUSubtarget::GFX9; |
| } |
| |
| /// True from VOLCANIC_ISLANDS through GFX9 inclusive. |
| bool hasReadM0SendMsgHazard() const { |
| return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && |
| getGeneration() <= AMDGPUSubtarget::GFX9; |
| } |
| |
| bool hasVcmpxPermlaneHazard() const { |
| return HasVcmpxPermlaneHazard; |
| } |
| |
| bool hasVMEMtoScalarWriteHazard() const { |
| return HasVMEMtoScalarWriteHazard; |
| } |
| |
| bool hasSMEMtoVectorWriteHazard() const { |
| return HasSMEMtoVectorWriteHazard; |
| } |
| |
| /// The bug is reported only when CU mode is not enabled. |
| bool hasLDSMisalignedBug() const { |
| return LDSMisalignedBug && !EnableCuMode; |
| } |
| |
| bool hasInstFwdPrefetchBug() const { |
| return HasInstFwdPrefetchBug; |
| } |
| |
| bool hasVcmpxExecWARHazard() const { |
| return HasVcmpxExecWARHazard; |
| } |
| |
| bool hasLdsBranchVmemWARHazard() const { |
| return HasLdsBranchVmemWARHazard; |
| } |
| |
| bool hasNSAtoVMEMBug() const { |
| return HasNSAtoVMEMBug; |
| } |
| |
| bool hasNSAClauseBug() const { return HasNSAClauseBug; } |
| |
| /// True for GFX10 and later generations. |
| bool hasHardClauses() const { return getGeneration() >= GFX10; } |
| |
| bool hasGFX90AInsts() const { return GFX90AInsts; } |
| |
| /// Return if operations acting on VGPR tuples require even alignment. |
| bool needsAlignedVGPRs() const { return GFX90AInsts; } |
| |
| bool hasPackedTID() const { return HasPackedTID; } |
| |
| /// Return the maximum number of waves per SIMD for kernels using \p SGPRs |
| /// SGPRs |
| unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; |
| |
| /// Return the maximum number of waves per SIMD for kernels using \p VGPRs |
| /// VGPRs |
| unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; |
| |
| /// Return occupancy for the given function. The LDS size and the register |
| /// counts are taken as optional arguments that default to 0. |
| /// Note, occupancy can be affected by the scratch allocation as well, but |
| /// we do not have enough information to compute it. |
| unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, |
| unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; |
| |
| /// \returns true if the flat_scratch register should be initialized with the |
| /// pointer to the wave's scratch memory rather than a size and offset. |
| bool flatScratchIsPointer() const { |
| return getGeneration() >= AMDGPUSubtarget::GFX9; |
| } |
| |
| /// \returns true if the flat_scratch register is initialized by the HW. |
| /// In this case it is readonly. |
| bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } |
| |
| /// \returns true if the machine has merged shaders in which s0-s7 are |
| /// reserved by the hardware and user SGPRs start at s8 |
| bool hasMergedShaders() const { |
| return getGeneration() >= GFX9; |
| } |
| |
| /// \returns SGPR allocation granularity supported by the subtarget. |
| unsigned getSGPRAllocGranule() const { |
| return AMDGPU::IsaInfo::getSGPRAllocGranule(this); |
| } |
| |
| /// \returns SGPR encoding granularity supported by the subtarget. |
| unsigned getSGPREncodingGranule() const { |
| return AMDGPU::IsaInfo::getSGPREncodingGranule(this); |
| } |
| |
| /// \returns Total number of SGPRs supported by the subtarget. |
| unsigned getTotalNumSGPRs() const { |
| return AMDGPU::IsaInfo::getTotalNumSGPRs(this); |
| } |
| |
| /// \returns Addressable number of SGPRs supported by the subtarget. |
| unsigned getAddressableNumSGPRs() const { |
| return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); |
| } |
| |
| /// \returns Minimum number of SGPRs that meets the given number of waves per |
| /// execution unit requirement supported by the subtarget. |
| unsigned getMinNumSGPRs(unsigned WavesPerEU) const { |
| return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); |
| } |
| |
| /// \returns Maximum number of SGPRs that meets the given number of waves per |
| /// execution unit requirement supported by the subtarget. |
| unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { |
| return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); |
| } |
| |
| /// \returns Reserved number of SGPRs. This is common |
| /// utility function called by MachineFunction and |
| /// Function variants of getReservedNumSGPRs. |
| unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const; |
| /// \returns Reserved number of SGPRs for given machine function \p MF. |
| unsigned getReservedNumSGPRs(const MachineFunction &MF) const; |
| |
| /// \returns Reserved number of SGPRs for given function \p F. |
| unsigned getReservedNumSGPRs(const Function &F) const; |
| |
| /// \returns max num SGPRs. This is the common utility |
| /// function called by MachineFunction and Function |
| /// variants of getMaxNumSGPRs. |
| unsigned getBaseMaxNumSGPRs(const Function &F, |
| std::pair<unsigned, unsigned> WavesPerEU, |
| unsigned PreloadedSGPRs, |
| unsigned ReservedNumSGPRs) const; |
| |
| /// \returns Maximum number of SGPRs that meets number of waves per execution |
| /// unit requirement for function \p MF, or number of SGPRs explicitly |
| /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. |
| /// |
| /// \returns Value that meets number of waves per execution unit requirement |
| /// if explicitly requested value cannot be converted to integer, violates |
| /// subtarget's specifications, or does not meet number of waves per execution |
| /// unit requirement. |
| unsigned getMaxNumSGPRs(const MachineFunction &MF) const; |
| |
| /// \returns Maximum number of SGPRs that meets number of waves per execution |
| /// unit requirement for function \p F, or number of SGPRs explicitly |
| /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. |
| /// |
| /// \returns Value that meets number of waves per execution unit requirement |
| /// if explicitly requested value cannot be converted to integer, violates |
| /// subtarget's specifications, or does not meet number of waves per execution |
| /// unit requirement. |
| unsigned getMaxNumSGPRs(const Function &F) const; |
| |
| /// \returns VGPR allocation granularity supported by the subtarget. |
| unsigned getVGPRAllocGranule() const { |
| return AMDGPU::IsaInfo::getVGPRAllocGranule(this); |
| } |
| |
| /// \returns VGPR encoding granularity supported by the subtarget. |
| unsigned getVGPREncodingGranule() const { |
| return AMDGPU::IsaInfo::getVGPREncodingGranule(this); |
| } |
| |
| /// \returns Total number of VGPRs supported by the subtarget. |
| unsigned getTotalNumVGPRs() const { |
| return AMDGPU::IsaInfo::getTotalNumVGPRs(this); |
| } |
| |
| /// \returns Addressable number of VGPRs supported by the subtarget. |
| unsigned getAddressableNumVGPRs() const { |
| return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); |
| } |
| |
| /// \returns Minimum number of VGPRs that meets given number of waves per |
| /// execution unit requirement supported by the subtarget. |
| unsigned getMinNumVGPRs(unsigned WavesPerEU) const { |
| return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); |
| } |
| |
| /// \returns Maximum number of VGPRs that meets given number of waves per |
| /// execution unit requirement supported by the subtarget. |
| unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { |
| return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); |
| } |
| |
| /// \returns max num VGPRs. This is the common utility function |
| /// called by MachineFunction and Function variants of getMaxNumVGPRs. |
| unsigned getBaseMaxNumVGPRs(const Function &F, |
| std::pair<unsigned, unsigned> WavesPerEU) const; |
| /// \returns Maximum number of VGPRs that meets number of waves per execution |
| /// unit requirement for function \p F, or number of VGPRs explicitly |
| /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. |
| /// |
| /// \returns Value that meets number of waves per execution unit requirement |
| /// if explicitly requested value cannot be converted to integer, violates |
| /// subtarget's specifications, or does not meet number of waves per execution |
| /// unit requirement. |
| unsigned getMaxNumVGPRs(const Function &F) const; |
| |
| /// \returns Maximum number of VGPRs that meets number of waves per execution |
| /// unit requirement for function \p MF, or number of VGPRs explicitly |
| /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. |
| /// |
| /// \returns Value that meets number of waves per execution unit requirement |
| /// if explicitly requested value cannot be converted to integer, violates |
| /// subtarget's specifications, or does not meet number of waves per execution |
| /// unit requirement. |
| unsigned getMaxNumVGPRs(const MachineFunction &MF) const; |
| |
| // Scheduling hooks; both are defined out of line. |
| void getPostRAMutations( |
| std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) |
| const override; |
| |
| std::unique_ptr<ScheduleDAGMutation> |
| createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; |
| |
| /// True when the wavefront size is exactly 32. |
| bool isWave32() const { |
| return getWavefrontSize() == 32; |
| } |
| |
| /// True when the wavefront size is exactly 64. |
| bool isWave64() const { |
| return getWavefrontSize() == 64; |
| } |
| |
| /// \returns the boolean register class reported by the register info. |
| const TargetRegisterClass *getBoolRC() const { |
| return getRegisterInfo()->getBoolRC(); |
| } |
| |
| /// \returns Maximum number of work groups per compute unit supported by the |
| /// subtarget and limited by given \p FlatWorkGroupSize. |
| unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { |
| return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); |
| } |
| |
| /// \returns Minimum flat work group size supported by the subtarget. |
| unsigned getMinFlatWorkGroupSize() const override { |
| return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); |
| } |
| |
| /// \returns Maximum flat work group size supported by the subtarget. |
| unsigned getMaxFlatWorkGroupSize() const override { |
| return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); |
| } |
| |
| /// \returns Number of waves per execution unit required to support the given |
| /// \p FlatWorkGroupSize. |
| unsigned |
| getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { |
| return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); |
| } |
| |
| /// \returns Minimum number of waves per execution unit supported by the |
| /// subtarget. |
| unsigned getMinWavesPerEU() const override { |
| return AMDGPU::IsaInfo::getMinWavesPerEU(this); |
| } |
| |
| void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, |
| SDep &Dep) const override; |
| }; |
| |
| } // end namespace llvm |
| |
| #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H |