//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// AMD GCN specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H

#include "AMDGPUCallLowering.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Support/ErrorHandling.h"

#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"

namespace llvm {

class GCNTargetMachine;

class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                           public AMDGPUSubtarget {
public:
  using AMDGPUSubtarget::getMaxWavesPerEU;

  // The following two enums are documented at:
  // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  enum class TrapHandlerAbi {
    NONE = 0x00,
    AMDHSA = 0x01,
  };

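  // TrapID values are used as the immediate operand of S_TRAP; for example,
  // llvm.trap lowers to "s_trap 2" when the AMDHSA trap handler is enabled.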
  enum class TrapID {
    LLVMAMDHSATrap = 0x02,
    LLVMAMDHSADebugTrap = 0x03,
  };

private:
  /// SelectionDAGISel related APIs.
  std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

  /// GlobalISel related APIs.
  std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
  std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
  std::unique_ptr<InstructionSelector> InstSelector;
  std::unique_ptr<LegalizerInfo> Legalizer;
  std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;

protected:
  // Basic subtarget description.
  AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
  unsigned Gen = INVALID;
  InstrItineraryData InstrItins;
  int LDSBankCount = 0;
  unsigned MaxPrivateElementSize = 0;

  // Dynamically set bits that enable features.
  bool DynamicVGPR = false;
  bool DynamicVGPRBlockSize32 = false;
  bool ScalarizeGlobal = false;

  /// The maximum number of instructions that may be placed within an S_CLAUSE,
  /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
  /// indicates a lack of S_CLAUSE support.
  unsigned MaxHardClauseLength = 0;

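  // Expands to one boolean member per TableGen-defined subtarget feature.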
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
  bool ATTRIBUTE = DEFAULT;
#include "AMDGPUGenSubtargetInfo.inc"

private:
  SIInstrInfo InstrInfo;
  SITargetLowering TLInfo;
  SIFrameLowering FrameLowering;

public:
  GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
               const GCNTargetMachine &TM);
  ~GCNSubtarget() override;

  GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
                                                StringRef GPU, StringRef FS);

  /// Diagnose inconsistent subtarget features before attempting to codegen
  /// function \p F.
  void checkSubtargetFeatures(const Function &F) const;

  const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }

  const SIFrameLowering *getFrameLowering() const override {
    return &FrameLowering;
  }

  const SITargetLowering *getTargetLowering() const override { return &TLInfo; }

  const SIRegisterInfo *getRegisterInfo() const override {
    return &InstrInfo.getRegisterInfo();
  }

  const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;

  const CallLowering *getCallLowering() const override {
    return CallLoweringInfo.get();
  }

  const InlineAsmLowering *getInlineAsmLowering() const override {
    return InlineAsmLoweringInfo.get();
  }

  InstructionSelector *getInstructionSelector() const override {
    return InstSelector.get();
  }

  const LegalizerInfo *getLegalizerInfo() const override {
    return Legalizer.get();
  }

  const AMDGPURegisterBankInfo *getRegBankInfo() const override {
    return RegBankInfo.get();
  }

  const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
    return TargetID;
  }

  const InstrItineraryData *getInstrItineraryData() const override {
    return &InstrItins;
  }

  void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

  Generation getGeneration() const { return (Generation)Gen; }

  bool isGFX11Plus() const { return getGeneration() >= GFX11; }

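  // Expands to a getter override for each TableGen-defined subtarget feature.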
#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
  bool GETTER() const override { return ATTRIBUTE; }
#include "AMDGPUGenSubtargetInfo.inc"

  unsigned getMaxWaveScratchSize() const {
    // See COMPUTE_TMPRING_SIZE.WAVESIZE.
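    // For example, on GFX12 this is (64 * 4) * ((1 << 18) - 1) = 67,108,608
    // bytes of scratch per wave.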
    if (getGeneration() >= GFX12) {
      // 18-bit field in units of 64-dword.
      return (64 * 4) * ((1 << 18) - 1);
    }
    if (getGeneration() == GFX11) {
      // 15-bit field in units of 64-dword.
      return (64 * 4) * ((1 << 15) - 1);
    }
    // 13-bit field in units of 256-dword.
    return (256 * 4) * ((1 << 13) - 1);
  }

  /// Return the number of high bits known to be zero for a frame index.
  unsigned getKnownHighZeroBitsForFrameIndex() const {
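    // A frame index addresses a single lane's slice of scratch, which is at
    // most getMaxWaveScratchSize() / wavefront size bytes, so countl_zero of
    // the per-wave limit plus log2(wavefront size) high bits must be zero.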
    return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
  }

  int getLDSBankCount() const { return LDSBankCount; }

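  /// \returns the maximum size in bytes of a single private memory access.
  /// With flat scratch enabled (and no buffer resource descriptor needed),
  /// accesses of up to 16 bytes are supported regardless of the configured
  /// MaxPrivateElementSize.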
  unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
    return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
                                                       : 16;
  }

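  /// \returns the maximum number of scalar operands (SGPRs or a literal) that
  /// instruction \p Opcode may read through the constant bus.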
  unsigned getConstantBusLimit(unsigned Opcode) const;

  /// Returns true if an instruction with a 16-bit result returned in a 32-bit
  /// register implicitly zeroes the high 16 bits, rather than preserving the
  /// original value.
  bool zeroesHigh16BitsOfDest(unsigned Opcode) const;

  bool supportsWGP() const {
    if (HasGFX1250Insts)
      return false;
    return getGeneration() >= GFX10;
  }

  bool hasHWFP64() const { return HasFP64; }

  bool hasAddr64() const {
    return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
  }

  bool hasFlat() const {
    return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
  }

  // Return true if the target only has the reverse operand versions of VALU
  // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
  bool hasOnlyRevVALUShifts() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }

  bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }

  bool hasMin3Max3_16() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  bool hasSwap() const { return HasGFX9Insts; }

  bool hasScalarPackInsts() const { return HasGFX9Insts; }

  bool hasScalarMulHiInsts() const { return HasGFX9Insts; }

  bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }

  TrapHandlerAbi getTrapHandlerAbi() const {
    return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
  }

  bool supportsGetDoorbellID() const {
    // The S_GETREG DOORBELL_ID is supported by all targets from GFX9 onward.
    return getGeneration() >= GFX9;
  }

  /// True if the offset field of DS instructions works as expected. On SI, the
  /// offset uses a 16-bit adder and does not always wrap properly.
  bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }

  bool unsafeDSOffsetFoldingEnabled() const {
    return EnableUnsafeDSOffsetFolding;
  }

  /// Condition output from div_scale is usable.
  bool hasUsableDivScaleConditionOutput() const {
    return getGeneration() != SOUTHERN_ISLANDS;
  }

  /// Extra wait hazard is needed in some cases before
  /// s_cbranch_vccnz/s_cbranch_vccz.
  bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }

  /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
  bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }

  /// A read of an SGPR by an SMRD instruction requires 4 wait states when the
  /// SGPR was written by a VALU instruction.
  bool hasSMRDReadVALUDefHazard() const {
    return getGeneration() == SOUTHERN_ISLANDS;
  }

  /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
  /// SGPR was written by a VALU instruction.
  bool hasVMEMReadSGPRVALUDefHazard() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }

  /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
  unsigned getSetRegWaitStates() const {
    return getGeneration() <= SEA_ISLANDS ? 1 : 2;
  }

  /// Return the amount of LDS that can be used without restricting the
  /// occupancy to fewer than \p WaveCount waves.
  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
                                           const Function &) const;

  bool supportsMinMaxDenormModes() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  /// \returns true if the target supports S_DENORM_MODE.
  bool hasDenormModeInst() const {
    return getGeneration() >= AMDGPUSubtarget::GFX10;
  }

  /// \returns true if the target supports ds_read/write_b128 and the user has
  /// enabled generation of ds_read/write_b128.
  bool useDS128() const { return HasCIInsts && EnableDS128; }

  /// \returns true if the target supports ds_read/write_b96/128.
  bool hasDS96AndDS128() const { return HasCIInsts; }

  /// \returns true if the target has v_trunc_f64, v_ceil_f64 and v_rndne_f64.
  bool haveRoundOpsF64() const { return HasCIInsts; }

  /// \returns true if MUBUF instructions always perform range checking, even
  /// for buffer resources used for private memory access.
  bool privateMemoryResourceIsRangeChecked() const {
    return getGeneration() < AMDGPUSubtarget::GFX9;
  }

  /// \returns true if the target requires PRT struct NULL support (zero result
  /// registers for sparse texture support).
  bool usePRTStrictNull() const { return EnablePRTStrictNull; }

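  // The hasUnaligned*AccessEnabled queries below are true only when both the
  // hardware capability and the unaligned access mode are enabled.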
  bool hasUnalignedBufferAccessEnabled() const {
    return HasUnalignedBufferAccess && HasUnalignedAccessMode;
  }

  bool hasUnalignedDSAccessEnabled() const {
    return HasUnalignedDSAccess && HasUnalignedAccessMode;
  }

  bool hasUnalignedScratchAccessEnabled() const {
    return HasUnalignedScratchAccess && HasUnalignedAccessMode;
  }

  bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }

  bool isTgSplitEnabled() const { return EnableTgSplit; }

  bool isCuModeEnabled() const { return EnableCuMode; }

  bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }

  bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }

  // Check if the target supports the ST addressing mode with FLAT scratch
  // instructions. In ST addressing mode no registers are used, neither VGPR
  // nor SGPR; only the immediate offset is swizzled and added to the FLAT
  // scratch base.
  bool hasFlatScratchSTMode() const {
    return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
  }

  bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }

  bool hasFlatScratchEnabled() const {
    return hasArchitectedFlatScratch() ||
           (EnableFlatScratch && hasFlatScratchInsts());
  }

  bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }

  bool hasAtomicCSub() const { return HasGFX10_BEncoding; }

  bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }

  bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }

  bool hasExportInsts() const {
    return !hasGFX940Insts() && !hasGFX1250Insts();
  }

  bool hasVINTERPEncoding() const {
    return HasGFX11Insts && !hasGFX1250Insts();
  }

  // DS_ADD_F64/DS_ADD_RTN_F64
  bool hasLdsAtomicAddF64() const {
    return hasGFX90AInsts() || hasGFX1250Insts();
  }

  bool hasMultiDwordFlatScratchAddressing() const {
    return getGeneration() >= GFX9;
  }

  bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }

  bool hasD16LoadStore() const { return getGeneration() >= GFX9; }

  bool d16PreservesUnusedBits() const {
    return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
  }

  bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }

  /// Return true if most LDS instructions have an m0 use that requires m0 to
  /// be initialized.
  bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }

  // True if the hardware rewinds and replays GWS operations if a wave is
  // preempted.
  //
  // If this is false, a GWS operation requires testing if a nack set the
  // MEM_VIOL bit, and repeating if so.
  bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }

  /// \returns true if the target has the ds_gws_sema_release_all instruction.
  bool hasGWSSemaReleaseAll() const { return HasCIInsts; }

  bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }

  bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }

  // Covers VS/PS/CS graphics shaders
  bool isMesaGfxShader(const Function &F) const {
    return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
  }

  bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }

  bool hasAtomicFaddInsts() const {
    return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
  }

  bool vmemWriteNeedsExpWaitcnt() const {
    return getGeneration() < SEA_ISLANDS;
  }

  bool hasInstPrefetch() const {
    return getGeneration() == GFX10 || getGeneration() == GFX11;
  }

  bool hasPrefetch() const { return HasGFX12Insts; }

  // Has s_cmpk_* instructions.
  bool hasSCmpK() const { return getGeneration() < GFX12; }

  // Scratch is allocated in blocks of 256 dwords per wave for the entire
  // wavefront. When viewed from the perspective of an arbitrary workitem, this
  // is 4-byte aligned.
  //
  // Only 4-byte alignment is really needed to access anything. Transformations
  // on the pointer value itself may rely on the alignment / known low bits of
  // the pointer. Set this to something above the minimum to avoid needing
  // dynamic realignment in common cases.
  Align getStackAlignment() const { return Align(16); }

  bool enableMachineScheduler() const override { return true; }

  bool useAA() const override;

  bool enableSubRegLiveness() const override { return true; }

  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }

  // static wrappers
  static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);

  // XXX - Why is this here if it isn't in the default pass set?
  bool enableEarlyIfConversion() const override { return true; }

  void overrideSchedPolicy(MachineSchedPolicy &Policy,
                           const SchedRegion &Region) const override;

  void overridePostRASchedPolicy(MachineSchedPolicy &Policy,
                                 const SchedRegion &Region) const override;

  void mirFileLoaded(MachineFunction &MF) const override;

  unsigned getMaxNumUserSGPRs() const {
    return AMDGPU::getMaxNumUserSGPRs(*this);
  }

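  // Return true if indirect VGPR addressing should use VGPR-indexing mode
  // rather than v_movrel instructions.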
  bool useVGPRIndexMode() const;

  bool hasScalarCompareEq64() const {
    return getGeneration() >= VOLCANIC_ISLANDS;
  }

  bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
  bool hasLDSFPAtomicAddF64() const {
    return HasGFX90AInsts || HasGFX1250Insts;
  }

  /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
  bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

  /// \returns true if the subtarget has the v_permlane64_b32 instruction.
  bool hasPermLane64() const { return getGeneration() >= GFX11; }

  bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; }

  bool hasDPPWavefrontShifts() const {
    return HasDPP && getGeneration() < GFX10;
  }

  // Has V_PK_MOV_B32 opcode
  bool hasPkMovB32() const { return HasGFX90AInsts; }

  bool hasFmaakFmamkF32Insts() const {
    return getGeneration() >= GFX10 || hasGFX940Insts();
  }

  bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }

  bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }

  unsigned getNSAMaxSize(bool HasSampler = false) const {
    return AMDGPU::getNSAMaxSize(*this, HasSampler);
  }

  bool hasMadF16() const;

  bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; }

  // Scalar and global loads support scale_offset bit.
  bool hasScaleOffset() const { return HasGFX1250Insts; }

  // FLAT GLOBAL VOffset is signed
  bool hasSignedGVSOffset() const { return HasGFX1250Insts; }

  bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }

  bool hasUserSGPRInit16BugInWave32() const {
    return HasUserSGPRInit16Bug && isWave32();
  }

  bool has12DWordStoreHazard() const {
    return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // \returns true if the subtarget supports DWORDX3 load/store instructions.
  bool hasDwordx3LoadStores() const { return HasCIInsts; }

  bool hasReadM0MovRelInterpHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0SendMsgHazard() const {
    return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
           getGeneration() <= AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0LdsDmaHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasReadM0LdsDirectHazard() const {
    return getGeneration() == AMDGPUSubtarget::GFX9;
  }

  bool hasLDSMisalignedBugInWGPMode() const {
    return HasLDSMisalignedBug && !EnableCuMode;
  }

  // The shift amount of a 64-bit shift cannot be the highest allocated
  // register if it is also at the end of the allocation block.
  bool hasShift64HighRegBug() const {
    return HasGFX90AInsts && !HasGFX940Insts;
  }

  // Has a one-cycle hazard when a transcendental instruction feeds a
  // non-transcendental VALU instruction.
  bool hasTransForwardingHazard() const { return HasGFX940Insts; }

  // Has a one-cycle hazard when a VALU instruction partially writing its dst
  // with a shift of the result bits feeds another VALU instruction.
  bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }

  // Cannot use op_sel with v_dot instructions.
  bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }

  // Does not have HW interlocks for VALU writing and then reading SGPRs.
  bool hasVDecCoExecHazard() const { return HasGFX940Insts; }

  bool hasHardClauses() const { return MaxHardClauseLength > 0; }

  bool hasFPAtomicToDenormModeHazard() const {
    return getGeneration() == GFX10;
  }

  bool hasVOP3DPP() const { return getGeneration() >= GFX11; }

  bool hasLdsDirect() const { return getGeneration() >= GFX11; }

  bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }

  bool hasVALUPartialForwardingHazard() const {
    return getGeneration() == GFX11;
  }

  bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }

  bool requiresCodeObjectV6() const { return RequiresCOV6; }

  bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }

  bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }

  bool hasVALUReadSGPRHazard() const {
    return HasGFX12Insts && !HasGFX1250Insts;
  }

  bool setRegModeNeedsVNOPs() const {
    return HasGFX1250Insts && getGeneration() == GFX12;
  }

  /// Return true if operations acting on VGPR tuples require even alignment.
  bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }

  /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
  bool hasSPackHL() const { return HasGFX11Insts; }

  /// Return true if the target's EXP instruction has the COMPR flag, which
  /// affects the meaning of the EN (enable) bits.
  bool hasCompressedExport() const { return !HasGFX11Insts; }

  /// Return true if the target's EXP instruction supports the NULL export
  /// target.
  bool hasNullExportTarget() const { return !HasGFX11Insts; }

  bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }

  /// Return true if the target has the S_DELAY_ALU instruction.
  bool hasDelayAlu() const { return HasGFX11Insts; }

  /// Returns true if the target supports
  /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
  /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
  bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }

  /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
  /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
  bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }

  /// \returns true if inline constants are not supported for F16 pseudo
  /// scalar transcendentals.
  bool hasNoF16PseudoScalarTransInlineConstants() const {
    return getGeneration() == GFX12;
  }

  /// \returns true if the target has packed f32 instructions that only read 32
  /// bits from a scalar operand (SGPR or literal) and replicate the bits to
  /// both channels.
  bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const {
    return getGeneration() == GFX12 && HasGFX1250Insts;
  }

  bool hasAddPC64Inst() const { return HasGFX1250Insts; }

  /// \returns true if the target supports expert scheduling mode 2 which relies
  /// on the compiler to insert waits to avoid hazards between VMEM and VALU
  /// instructions in some instances.
  bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }

  /// \returns The maximum number of instructions that can be enclosed in an
  /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
  /// instruction.
  unsigned maxHardClauseLength() const { return MaxHardClauseLength; }

  /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
  /// SGPRs
  unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;

  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
  /// VGPRs
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
                                    unsigned DynamicVGPRBlockSize) const;

  /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
  /// be achieved when the only function running on a CU is \p F, each workgroup
  /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
  /// NumVGPRs VGPRs. The flat workgroup sizes associated with the function are a
  /// range, so this returns a range as well.
  ///
  /// Note that occupancy can be affected by the scratch allocation as well, but
  /// we do not have enough information to compute it.
  std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
                                                 unsigned LDSSize = 0,
                                                 unsigned NumSGPRs = 0,
                                                 unsigned NumVGPRs = 0) const;

  /// \returns true if the flat_scratch register should be initialized with the
  /// pointer to the wave's scratch memory rather than a size and offset.
  bool flatScratchIsPointer() const {
    return getGeneration() >= AMDGPUSubtarget::GFX9;
  }

  /// \returns true if the machine has merged shaders in which s0-s7 are
  /// reserved by the hardware and user SGPRs start at s8
  bool hasMergedShaders() const { return getGeneration() >= GFX9; }

  // \returns true if the target supports the pre-NGG legacy geometry path.
  bool hasLegacyGeometry() const { return getGeneration() < GFX11; }

  // \returns true if the target has split barriers feature
  bool hasSplitBarriers() const { return getGeneration() >= GFX12; }

  // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
  bool hasDX10ClampMode() const { return getGeneration() < GFX12; }

  // \returns true if the target has IEEE kernel descriptor mode bit
  bool hasIEEEMode() const { return getGeneration() < GFX12; }

  // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
  bool hasRrWGMode() const { return getGeneration() >= GFX12; }

  /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
  /// values.
  bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }

  bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }

  bool hasVOPD3() const { return HasGFX1250Insts; }

  // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
  bool hasVectorMulU64() const { return HasGFX1250Insts; }

  // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
  // instructions.
  bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; }

  // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
  bool hasIntMinMax64() const { return HasGFX1250Insts; }

  // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
  bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }

  // \returns true if the target has the S_GET_SHADER_CYCLES_U64 instruction.
  bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }

  // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
  // of sign-extending. Note that GFX1250 has not only fixed the bug but also
  // extended VA to 57 bits.
  bool hasGetPCZeroExtension() const {
    return HasGFX12Insts && !HasGFX1250Insts;
  }

  // \returns true if the target needs to create a prolog for backward
  // compatibility when preloading kernel arguments.
  bool needsKernArgPreloadProlog() const {
    return hasKernargPreload() && !HasGFX1250Insts;
  }

  bool hasCondSubInsts() const { return HasGFX12Insts; }

  bool hasSubClampInsts() const { return hasGFX10_3Insts(); }

  /// \returns SGPR allocation granularity supported by the subtarget.
  unsigned getSGPRAllocGranule() const {
    return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
  }

  /// \returns SGPR encoding granularity supported by the subtarget.
  unsigned getSGPREncodingGranule() const {
    return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
  }

  /// \returns Total number of SGPRs supported by the subtarget.
  unsigned getTotalNumSGPRs() const {
    return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
  }

  /// \returns Addressable number of SGPRs supported by the subtarget.
  unsigned getAddressableNumSGPRs() const {
    return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
  }

  /// \returns Minimum number of SGPRs that meets the given number of waves per
  /// execution unit requirement supported by the subtarget.
  unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
    return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
  }

  /// \returns Maximum number of SGPRs that meets the given number of waves per
  /// execution unit requirement supported by the subtarget.
  unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
    return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
  }

  /// \returns Reserved number of SGPRs. This is a common utility function
  /// called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs.
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;

  /// \returns Reserved number of SGPRs for given machine function \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;

  /// \returns Reserved number of SGPRs for given function \p F.
  unsigned getReservedNumSGPRs(const Function &F) const;

  /// \returns Maximum number of preloaded SGPRs for the subtarget.
  unsigned getMaxNumPreloadedSGPRs() const;

  /// \returns max num SGPRs. This is the common utility
  /// function called by MachineFunction and Function
  /// variants of getMaxNumSGPRs.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;

  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumSGPRs(const Function &F) const;

  /// \returns VGPR allocation granularity supported by the subtarget.
  unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
    return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
  }

  /// \returns VGPR encoding granularity supported by the subtarget.
  unsigned getVGPREncodingGranule() const {
    return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
  }

  /// \returns Total number of VGPRs supported by the subtarget.
  unsigned getTotalNumVGPRs() const {
    return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
  }

  /// \returns Addressable number of architectural VGPRs supported by the
  /// subtarget.
  unsigned getAddressableNumArchVGPRs() const {
    return AMDGPU::IsaInfo::getAddressableNumArchVGPRs(this);
  }

  /// \returns Addressable number of VGPRs supported by the subtarget.
  unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
    return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
  }

  /// \returns the minimum number of VGPRs that will prevent achieving more than
  /// the specified number of waves \p WavesPerEU.
  unsigned getMinNumVGPRs(unsigned WavesPerEU,
                          unsigned DynamicVGPRBlockSize) const {
    return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
                                           DynamicVGPRBlockSize);
  }

  /// \returns the maximum number of VGPRs that can be used while still
  /// achieving at least the specified number of waves \p WavesPerEU.
  unsigned getMaxNumVGPRs(unsigned WavesPerEU,
                          unsigned DynamicVGPRBlockSize) const {
    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
                                           DynamicVGPRBlockSize);
  }

  /// \returns max num VGPRs. This is the common utility function
  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
  unsigned
  getBaseMaxNumVGPRs(const Function &F,
                     std::pair<unsigned, unsigned> NumVGPRBounds) const;

  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumVGPRs(const Function &F) const;

  unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }

  /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
  /// of waves per execution unit required for the function \p F.
  std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;

  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
  ///
  /// \returns Value that meets number of waves per execution unit requirement
  /// if explicitly requested value cannot be converted to integer, violates
  /// subtarget's specifications, or does not meet number of waves per execution
  /// unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;

  bool supportsWave32() const { return getGeneration() >= GFX10; }

  bool supportsWave64() const { return !hasGFX1250Insts(); }

  bool isWave32() const { return getWavefrontSize() == 32; }

  bool isWave64() const { return getWavefrontSize() == 64; }

  /// Returns true if the wave size of this subtarget is known to be reliable.
  /// This is false only for a default target-cpu that does not have an explicit
  /// +wavefrontsize target feature.
  bool isWaveSizeKnown() const {
    return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
           hasFeature(AMDGPU::FeatureWavefrontSize64);
  }

  const TargetRegisterClass *getBoolRC() const {
    return getRegisterInfo()->getBoolRC();
  }

  /// \returns Maximum number of work groups per compute unit supported by the
  /// subtarget and limited by the given \p FlatWorkGroupSize.
  unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
    return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
  }

  /// \returns Minimum flat work group size supported by the subtarget.
  unsigned getMinFlatWorkGroupSize() const override {
    return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
  }

  /// \returns Maximum flat work group size supported by the subtarget.
  unsigned getMaxFlatWorkGroupSize() const override {
    return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
  }

  /// \returns Number of waves per execution unit required to support the given
  /// \p FlatWorkGroupSize.
  unsigned
  getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
    return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
  }

  /// \returns Minimum number of waves per execution unit supported by the
  /// subtarget.
  unsigned getMinWavesPerEU() const override {
    return AMDGPU::IsaInfo::getMinWavesPerEU(this);
  }

  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep,
                             const TargetSchedModel *SchedModel) const override;

  // \returns true if it's beneficial on this subtarget for the scheduler to
  // cluster stores as well as loads.
  bool shouldClusterStores() const { return getGeneration() >= GFX11; }

  // \returns the number of address arguments from which to enable MIMG NSA
  // on supported architectures.
  unsigned getNSAThreshold(const MachineFunction &MF) const;

  // \returns true if the subtarget has a hazard requiring an "s_nop 0"
  // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
  bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }

  // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
  // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
  bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }

  bool isDynamicVGPREnabled() const { return DynamicVGPR; }
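  // In dynamic VGPR mode, VGPRs are allocated in blocks of 16 or 32 registers.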
  unsigned getDynamicVGPRBlockSize() const {
    return DynamicVGPRBlockSize32 ? 32 : 16;
  }

  bool requiresDisjointEarlyClobberAndUndef() const override {
    // AMDGPU doesn't care if early-clobber and undef operands are allocated
    // to the same register.
    return false;
  }

  // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything and
  // must be surrounded by S_WAIT_ALU(0xFFE3).
  bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const {
    return getGeneration() == GFX12;
  }

  // Requires s_wait_alu(0) between a write of s102/s103 and a read of
  // src_flat_scratch_base.
  bool hasScratchBaseForwardingHazard() const {
    return HasGFX1250Insts && getGeneration() == GFX12;
  }

  // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
  // result.
  bool hasFlatScratchHiInB64InstHazard() const {
    return HasGFX1250Insts && getGeneration() == GFX12;
  }

  /// \returns true if the subtarget requires a wait for xcnt before VMEM
  /// accesses that must never be repeated in the event of a page fault/re-try.
  /// Atomic stores/RMWs and all volatile accesses fall under this criterion.
  bool requiresWaitXCntForSingleAccessInstructions() const {
    return HasGFX1250Insts;
  }

  /// \returns the number of significant bits in the immediate field of the
  /// S_NOP instruction.
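  /// Since S_NOP waits for (imm + 1) cycles, this bounds the delay a single
  /// S_NOP can encode.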
  unsigned getSNopBits() const {
    if (getGeneration() >= AMDGPUSubtarget::GFX12)
      return 7;
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 4;
    return 3;
  }

  bool supportsBPermute() const {
    return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
  }

  bool supportsWaveWideBPermute() const {
    return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
            getGeneration() == AMDGPUSubtarget::GFX12) ||
           isWave32();
  }

  /// Return true if real (non-fake) variants of True16 instructions using
  /// 16-bit registers should be code-generated. Fake True16 instructions are
  /// identical to non-fake ones except that they take 32-bit registers as
  /// operands and always use their low halves.
  // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
  // supported and the support for fake True16 instructions is removed.
  bool useRealTrue16Insts() const {
    return hasTrue16BitInsts() && EnableRealTrue16Insts;
  }

  bool requiresWaitOnWorkgroupReleaseFence() const {
    return getGeneration() >= GFX10 || isTgSplitEnabled();
  }
};

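/// Tracks which user SGPR inputs a function requires and how many user SGPRs
/// remain free, e.g. for kernel argument preloading.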
class GCNUserSGPRUsageInfo {
public:
  bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }

  bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }

  bool hasDispatchPtr() const { return DispatchPtr; }

  bool hasQueuePtr() const { return QueuePtr; }

  bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }

  bool hasDispatchID() const { return DispatchID; }

  bool hasFlatScratchInit() const { return FlatScratchInit; }

  bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }

  unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }

  unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }

  unsigned getNumFreeUserSGPRs();

  void allocKernargPreloadSGPRs(unsigned NumSGPRs);

  enum UserSGPRID : unsigned {
    ImplicitBufferPtrID = 0,
    PrivateSegmentBufferID = 1,
    DispatchPtrID = 2,
    QueuePtrID = 3,
    KernargSegmentPtrID = 4,
    DispatchIdID = 5,
    FlatScratchInitID = 6,
    PrivateSegmentSizeID = 7
  };

  // Returns the size, in number of SGPRs, of the given preloaded user SGPR
  // field.
  static unsigned getNumUserSGPRForField(UserSGPRID ID) {
    switch (ID) {
    case ImplicitBufferPtrID:
      return 2;
    case PrivateSegmentBufferID:
      return 4;
    case DispatchPtrID:
      return 2;
    case QueuePtrID:
      return 2;
    case KernargSegmentPtrID:
      return 2;
    case DispatchIdID:
      return 2;
    case FlatScratchInitID:
      return 2;
    case PrivateSegmentSizeID:
      return 1;
    }
    llvm_unreachable("Unknown UserSGPRID.");
  }

  GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);

private:
  const GCNSubtarget &ST;

  // Private memory buffer.
  // Compute shaders receive it directly in sgpr[0:1]; other shaders reach it
  // indirectly through the 64 bits at sgpr[0:1].
  bool ImplicitBufferPtr = false;

  bool PrivateSegmentBuffer = false;

  bool DispatchPtr = false;

  bool QueuePtr = false;

  bool KernargSegmentPtr = false;

  bool DispatchID = false;

  bool FlatScratchInit = false;

  bool PrivateSegmentSize = false;

  unsigned NumKernargPreloadSGPRs = 0;

  unsigned NumUsedUserSGPRs = 0;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H