| //=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file declares the NVPTX specific subclass of TargetSubtarget. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H |
| #define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H |
| |
| #include "NVPTX.h" |
| #include "NVPTXFrameLowering.h" |
| #include "NVPTXISelLowering.h" |
| #include "NVPTXInstrInfo.h" |
| #include "NVPTXRegisterInfo.h" |
| #include "llvm/CodeGen/TargetSubtargetInfo.h" |
| #include "llvm/IR/DataLayout.h" |
| #include "llvm/IR/NVVMIntrinsicUtils.h" |
| #include "llvm/Support/NVPTXAddrSpace.h" |
| #include <string> |
| |
| #define GET_SUBTARGETINFO_HEADER |
| #include "NVPTXGenSubtargetInfo.inc" |
| |
| namespace llvm { |
| |
| // FullSmVersion encoding: SM * 10 + ArchSuffixOffset |
| // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a') |
| // e.g. sm_100 -> 1000, sm_100f -> 1002, sm_100a -> 1003 |
| |
| class NVPTXSubtarget : public NVPTXGenSubtargetInfo { |
| virtual void anchor(); |
| std::string TargetName; |
| |
| // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31 |
| unsigned PTXVersion; |
| |
| // FullSmVersion encoding: SM * 10 + ArchSuffixOffset |
| // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a') |
| // e.g. sm_30 -> 300, sm_90a -> 903, sm_100f -> 1002 |
| unsigned int FullSmVersion; |
| |
| // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from |
| // FullSmVersion. |
| unsigned int SmVersion; |
| |
| NVPTXInstrInfo InstrInfo; |
| NVPTXTargetLowering TLInfo; |
| std::unique_ptr<const SelectionDAGTargetInfo> TSInfo; |
| |
| // NVPTX does not have any call stack frame, but need a NVPTX specific |
| // FrameLowering class because TargetFrameLowering is abstract. |
| NVPTXFrameLowering FrameLowering; |
| |
| public: |
| /// This constructor initializes the data members to match that |
| /// of the specified module. |
| /// |
| NVPTXSubtarget(const Triple &TT, const std::string &CPU, |
| const std::string &FS, const NVPTXTargetMachine &TM); |
| |
| ~NVPTXSubtarget() override; |
| |
| const TargetFrameLowering *getFrameLowering() const override { |
| return &FrameLowering; |
| } |
| const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; } |
| const NVPTXRegisterInfo *getRegisterInfo() const override { |
| return &InstrInfo.getRegisterInfo(); |
| } |
| const NVPTXTargetLowering *getTargetLowering() const override { |
| return &TLInfo; |
| } |
| |
| const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; |
| |
| // Checks PTX version and family-specific and architecture-specific SM |
| // versions. For example, sm_100{f/a} and any future variants in the same |
| // family will match for any PTX version greater than or equal to |
| // `PTXVersion`. |
| bool hasPTXWithFamilySMs(unsigned PTXVersion, |
| ArrayRef<unsigned> SMVersions) const; |
| // Checks PTX version and architecture-specific SM versions. |
| // For example, sm_100{a} will match for any PTX version greater than or equal |
| // to `PTXVersion`. |
| bool hasPTXWithAccelSMs(unsigned PTXVersion, |
| ArrayRef<unsigned> SMVersions) const; |
| |
| bool has256BitVectorLoadStore(unsigned AS) const { |
| return SmVersion >= 100 && PTXVersion >= 88 && |
| AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; |
| } |
| bool hasUsedBytesMaskPragma() const { |
| return SmVersion >= 50 && PTXVersion >= 83; |
| } |
| bool hasAtomAddF64() const { return SmVersion >= 60; } |
| bool hasAtomScope() const { return SmVersion >= 60; } |
| bool hasAtomBitwise64() const { return SmVersion >= 32; } |
| bool hasAtomMinMax64() const { return SmVersion >= 32; } |
| bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; } |
| bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; } |
| bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; } |
| bool hasLDG() const { return SmVersion >= 32; } |
| bool hasHWROT32() const { return SmVersion >= 32; } |
| bool hasBrx() const { return SmVersion >= 30 && PTXVersion >= 60; } |
| bool hasFP16Math() const { return SmVersion >= 53; } |
| bool hasBF16Math() const { return SmVersion >= 80; } |
| bool allowFP16Math() const; |
| bool hasMaskOperator() const { return PTXVersion >= 71; } |
| bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; } |
| // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire, |
| // release, acq_rel, sc) ? |
| bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; } |
| // Does SM & PTX support .acquire and .release qualifiers for fence? |
| bool hasSplitAcquireAndReleaseFences() const { |
| return SmVersion >= 90 && PTXVersion >= 86; |
| } |
| // Does SM & PTX support atomic relaxed MMIO operations ? |
| bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; } |
| bool hasDotInstructions() const { |
| return SmVersion >= 61 && PTXVersion >= 50; |
| } |
| |
| // Checks following instructions support: |
| // - tcgen05.ld/st |
| // - tcgen05.alloc/dealloc/relinquish |
| // - tcgen05.cp |
| // - tcgen05.fence/wait |
| // - tcgen05.commit |
| // - tcgen05.mma |
| bool hasTcgen05InstSupport() const { |
| // sm_101 renamed to sm_110 in PTX 9.0 |
| return hasPTXWithFamilySMs(90, {100, 110}) || |
| hasPTXWithFamilySMs(88, {100, 101}) || |
| hasPTXWithAccelSMs(86, {100, 101}); |
| } |
| |
| // Checks tcgen05.shift instruction support. |
| bool hasTcgen05ShiftSupport() const { |
| // sm_101 renamed to sm_110 in PTX 9.0 |
| return hasPTXWithAccelSMs(90, {100, 110, 103}) || |
| hasPTXWithAccelSMs(88, {100, 101, 103}) || |
| hasPTXWithAccelSMs(86, {100, 101}); |
| } |
| |
| bool hasTcgen05MMAScaleInputDImm() const { |
| return hasPTXWithFamilySMs(88, {100}) || hasPTXWithAccelSMs(86, {100}); |
| } |
| |
| bool hasTcgen05MMAI8Kind() const { |
| return hasPTXWithAccelSMs(90, {100, 110}) || |
| hasPTXWithAccelSMs(86, {100, 101}); |
| } |
| |
| bool hasTcgen05MMASparseMxf4nvf4() const { |
| return hasPTXWithAccelSMs(90, {100, 110, 103}) || |
| hasPTXWithAccelSMs(87, {100, 101, 103}); |
| } |
| |
| bool hasTcgen05MMASparseMxf4() const { |
| return hasPTXWithAccelSMs(90, {100, 110, 103}) || |
| hasPTXWithAccelSMs(86, {100, 101, 103}); |
| } |
| |
| bool hasTcgen05LdRedSupport() const { |
| return hasPTXWithFamilySMs(90, {110, 103}) || |
| hasPTXWithFamilySMs(88, {101, 103}); |
| } |
| |
| bool hasReduxSyncF32() const { |
| return hasPTXWithFamilySMs(88, {100}) || hasPTXWithAccelSMs(86, {100}); |
| } |
| |
| bool hasMMABlockScale() const { |
| return hasPTXWithFamilySMs(88, {120}) || hasPTXWithAccelSMs(87, {120}); |
| } |
| |
| bool hasMMASparseBlockScaleF4() const { |
| return hasPTXWithAccelSMs(87, {120, 121}); |
| } |
| |
| bool hasMMAWithMXF4NVF4Scale4xE8M0() const { |
| return hasPTXWithFamilySMs(91, {120}); |
| } |
| |
| bool hasMMASparseWithMXF4NVF4Scale4xE8M0() const { |
| return hasPTXWithAccelSMs(91, {120, 121}); |
| } |
| |
| // f32x2 instructions in Blackwell family |
| bool hasF32x2Instructions() const; |
| |
| // Checks support for following in TMA: |
| // - cta_group::1/2 support |
| // - im2col_w/w_128 mode support |
| // - tile_gather4 mode support |
| // - tile_scatter4 mode support |
| bool hasTMABlackwellSupport() const { |
| return hasPTXWithFamilySMs(90, {100, 110}) || |
| hasPTXWithFamilySMs(88, {100, 101}) || |
| hasPTXWithAccelSMs(86, {100, 101}); |
| } |
| |
| // Checks support for conversions involving e4m3x2 and e5m2x2. |
| bool hasFP8ConversionSupport() const { |
| if (PTXVersion >= 81) |
| return SmVersion >= 89; |
| |
| if (PTXVersion >= 78) |
| return SmVersion >= 90; |
| |
| return false; |
| } |
| |
| // Checks support for conversions involving the following types: |
| // - e2m3x2/e3m2x2 |
| // - e2m1x2 |
| // - ue8m0x2 |
| bool hasNarrowFPConversionSupport() const { |
| return hasPTXWithFamilySMs(90, {100, 110, 120}) || |
| hasPTXWithFamilySMs(88, {100, 101, 120}) || |
| hasPTXWithAccelSMs(86, {100, 101, 120}); |
| } |
| |
| // Checks support for conversions involving the following types: |
| // - bf16x2 -> f8x2 |
| // - f16x2 -> f6x2 |
| // - bf16x2 -> f6x2 |
| // - f16x2 -> f4x2 |
| // - bf16x2 -> f4x2 |
| bool hasFP16X2ToNarrowFPConversionSupport() const { |
| return hasPTXWithFamilySMs(91, {100, 110, 120}); |
| } |
| |
| bool hasS2F6X2ConversionSupport() const { |
| return hasPTXWithAccelSMs(91, {100, 103, 110, 120, 121}); |
| } |
| |
| // Checks support for conversions from narrow FP types to bf16x2. |
| bool hasNarrowFPToBF16x2ConversionSupport() const { |
| return hasPTXWithFamilySMs(92, {100, 110, 120}); |
| } |
| |
| bool hasTensormapReplaceSupport() const { |
| return hasPTXWithFamilySMs(90, {90, 100, 110, 120}) || |
| hasPTXWithFamilySMs(88, {90, 100, 101, 120}) || |
| hasPTXWithAccelSMs(83, {90, 100, 101, 120}); |
| } |
| |
| bool hasTensormapReplaceElemtypeSupport(unsigned value) const { |
| if (value >= static_cast<unsigned>(nvvm::TensormapElemType::B4x16)) |
| return hasPTXWithFamilySMs(90, {100, 110, 120}) || |
| hasPTXWithFamilySMs(88, {100, 101, 120}) || |
| hasPTXWithAccelSMs(87, {100, 101, 120}); |
| |
| return hasTensormapReplaceSupport(); |
| } |
| |
| bool hasTensormapReplaceSwizzleAtomicitySupport() const { |
| return hasPTXWithFamilySMs(90, {100, 110, 120}) || |
| hasPTXWithFamilySMs(88, {100, 101, 120}) || |
| hasPTXWithAccelSMs(87, {100, 101, 120}); |
| } |
| |
| bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const { |
| if (value == static_cast<unsigned>(nvvm::TensormapSwizzleMode::SWIZZLE_96B)) |
| return hasPTXWithAccelSMs(88, {103}); |
| |
| return hasTensormapReplaceSupport(); |
| } |
| |
| bool hasClusterLaunchControlTryCancelMulticastSupport() const { |
| return hasPTXWithFamilySMs(90, {100, 110, 120}) || |
| hasPTXWithFamilySMs(88, {100, 101, 120}) || |
| hasPTXWithAccelSMs(86, {100, 101, 120}); |
| } |
| |
| bool hasSetMaxNRegSupport() const { |
| return hasPTXWithFamilySMs(90, {100, 110, 120}) || |
| hasPTXWithFamilySMs(88, {100, 101, 120}) || |
| hasPTXWithAccelSMs(86, {100, 101, 120}) || |
| hasPTXWithAccelSMs(80, {90}); |
| } |
| |
| bool hasLdStmatrixBlackwellSupport() const { |
| return hasPTXWithFamilySMs(90, {100, 110, 120}) || |
| hasPTXWithFamilySMs(88, {100, 101, 120}) || |
| hasPTXWithAccelSMs(86, {100, 101, 120}); |
| } |
| |
| // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction |
| // terminates a basic block. Instead, it would assume that control flow |
| // continued to the next instruction. The next instruction could be in the |
| // block that's lexically below it. This would lead to a phantom CFG edges |
| // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when |
| // PTX ISA versions 8.3+ we can confidently say that the bug will not be |
| // present. |
| bool hasPTXASUnreachableBug() const { return PTXVersion < 83; } |
| bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } |
| bool hasConvertWithStochasticRounding() const { |
| return hasPTXWithAccelSMs(87, {100, 103}); |
| } |
| unsigned int getFullSmVersion() const { return FullSmVersion; } |
| unsigned int getSmVersion() const { return getFullSmVersion() / 10; } |
| unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; } |
| // GPUs with "a" suffix have architecture-accelerated features that are |
| // supported on the specified architecture only, hence such targets do not |
| // follow the onion layer model. hasArchAccelFeatures() allows distinguishing |
| // such GPU variants from the base GPU architecture. |
| // - false represents non-accelerated architecture. |
| // - true represents architecture-accelerated variant. |
| bool hasArchAccelFeatures() const { |
| return (getFullSmVersion() & 1) && PTXVersion >= 80; |
| } |
| // GPUs with 'f' suffix have architecture-accelerated features which are |
| // portable across all future architectures under same SM major. For example, |
| // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures. |
| // - false represents non-family-specific architecture. |
| // - true represents family-specific variant. |
| bool hasFamilySpecificFeatures() const { |
| return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88 |
| : hasArchAccelFeatures(); |
| } |
| // If the user did not provide a target we default to the `sm_75` target. |
| std::string getTargetName() const { |
| return TargetName.empty() ? "sm_75" : TargetName; |
| } |
| bool hasTargetName() const { return !TargetName.empty(); } |
| |
| bool hasNativeBF16Support(int Opcode) const; |
| |
| // Get maximum value of required alignments among the supported data types. |
| // From the PTX ISA doc, section 8.2.3: |
| // The memory consistency model relates operations executed on memory |
| // locations with scalar data-types, which have a maximum size and alignment |
| // of 64 bits. Memory operations with a vector data-type are modelled as a |
| // set of equivalent memory operations with a scalar data-type, executed in |
| // an unspecified order on the elements in the vector. |
| unsigned getMaxRequiredAlignment() const { return 8; } |
| // Get the smallest cmpxchg word size that the hardware supports. |
| unsigned getMinCmpXchgSizeInBits() const { return 32; } |
| |
| unsigned getPTXVersion() const { return PTXVersion; } |
| |
| NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS); |
| void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); |
| |
| void failIfClustersUnsupported(std::string const &FailureMessage) const; |
| }; |
| |
| } // End llvm namespace |
| |
| #endif |