lib/Target/NVPTX/NVPTXSubtarget.h - llvm-project/llvm - Git at Google

 //=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file declares the NVPTX specific subclass of TargetSubtarget.
 //
 //===----------------------------------------------------------------------===//

 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H

 #include "NVPTX.h"
 #include "NVPTXFrameLowering.h"
 #include "NVPTXISelLowering.h"
 #include "NVPTXInstrInfo.h"
 #include "NVPTXRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/NVVMIntrinsicUtils.h"
 #include "llvm/Support/NVPTXAddrSpace.h"
 #include <string>

 #define GET_SUBTARGETINFO_HEADER
 #include "NVPTXGenSubtargetInfo.inc"

 namespace llvm {

 // FullSmVersion encoding: SM * 10 + ArchSuffixOffset
 // ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
 // e.g. sm_100 -> 1000, sm_100f -> 1002, sm_100a -> 1003

 /// NVPTX-specific subtarget. Stores the PTX version and SM version together
 /// with the per-target lowering objects, and exposes feature predicates that
 /// are derived from those two versions.
 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   virtual void anchor();

   // Target name; left empty when the user did not provide one (see
   // getTargetName() for the fallback).
   std::string TargetName;

   // PTX version, stored as 10 * major + minor (PTX 3.1 is 31).
   unsigned PTXVersion;

   // Full SM version including the architecture suffix:
   //   FullSmVersion = SM * 10 + ArchSuffixOffset
   // where ArchSuffixOffset is 0 (base), 2 ('f') or 3 ('a').
   // Examples: sm_30 -> 300, sm_90a -> 903, sm_100f -> 1002.
   unsigned int FullSmVersion;

   // SM version without the suffix, stored as 10 * major + minor (SM 3.1 is
   // 31). Derived from FullSmVersion.
   unsigned int SmVersion;

   NVPTXInstrInfo InstrInfo;
   NVPTXTargetLowering TLInfo;
   std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

   // NVPTX has no call stack frame, but an NVPTX-specific FrameLowering class
   // is still needed because TargetFrameLowering is abstract.
   NVPTXFrameLowering FrameLowering;

   // True when SmVersion is at least MinSm and PTXVersion is at least MinPTX.
   // Both arguments use the same encoding as the corresponding members.
   bool hasSmAndPTXAtLeast(unsigned MinSm, unsigned MinPTX) const {
     return SmVersion >= MinSm && PTXVersion >= MinPTX;
   }

 public:
   /// Initializes the data members so that they match the specified module.
   NVPTXSubtarget(const Triple &TT, const std::string &CPU,
                  const std::string &FS, const NVPTXTargetMachine &TM);

   ~NVPTXSubtarget() override;

   // Accessors for the lowering objects owned by this subtarget.
   const TargetFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
   const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
   const NVPTXRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
   const NVPTXTargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }

   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;

   // Checks the PTX version together with family-specific and
   // architecture-specific SM versions. For example, sm_100{f/a} and any
   // future variants in the same family match for every PTX version greater
   // than or equal to `PTXVersion`.
   bool hasPTXWithFamilySMs(unsigned PTXVersion,
                            ArrayRef<unsigned> SMVersions) const;
   // Checks the PTX version together with architecture-specific SM versions.
   // For example, sm_100{a} matches for every PTX version greater than or
   // equal to `PTXVersion`.
   bool hasPTXWithAccelSMs(unsigned PTXVersion,
                           ArrayRef<unsigned> SMVersions) const;

   // Only the global address space qualifies; any other AS yields false.
   bool has256BitVectorLoadStore(unsigned AS) const {
     if (AS != NVPTXAS::ADDRESS_SPACE_GLOBAL)
       return false;
     return hasSmAndPTXAtLeast(100, 88);
   }
   bool hasUsedBytesMaskPragma() const { return hasSmAndPTXAtLeast(50, 83); }

   // Predicates gated on a minimum SM version and/or a minimum PTX version.
   bool hasAtomAddF64() const { return SmVersion >= 60; }
   bool hasAtomScope() const { return SmVersion >= 60; }
   bool hasAtomBitwise64() const { return SmVersion >= 32; }
   bool hasAtomMinMax64() const { return SmVersion >= 32; }
   bool hasAtomCas16() const { return hasSmAndPTXAtLeast(70, 63); }
   bool hasAtomSwap128() const { return hasSmAndPTXAtLeast(90, 83); }
   bool hasClusters() const { return hasSmAndPTXAtLeast(90, 78); }
   bool hasLDG() const { return SmVersion >= 32; }
   bool hasHWROT32() const { return SmVersion >= 32; }
   bool hasBrx() const { return hasSmAndPTXAtLeast(30, 60); }
   bool hasFP16Math() const { return SmVersion >= 53; }
   bool hasBF16Math() const { return SmVersion >= 80; }
   bool allowFP16Math() const;
   bool hasMaskOperator() const { return PTXVersion >= 71; }
   bool hasNoReturn() const { return hasSmAndPTXAtLeast(30, 64); }
   // Do SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
   // release, acq_rel, sc)?
   bool hasMemoryOrdering() const { return hasSmAndPTXAtLeast(70, 60); }
   // Do SM & PTX support the .acquire and .release qualifiers for fence?
   bool hasSplitAcquireAndReleaseFences() const {
     return hasSmAndPTXAtLeast(90, 86);
   }
   // Do SM & PTX support atomic relaxed MMIO operations?
   bool hasRelaxedMMIO() const { return hasSmAndPTXAtLeast(70, 82); }
   bool hasDotInstructions() const { return hasSmAndPTXAtLeast(61, 50); }

   // Checks support for the following instructions:
   // - tcgen05.ld/st
   // - tcgen05.alloc/dealloc/relinquish
   // - tcgen05.cp
   // - tcgen05.fence/wait
   // - tcgen05.commit
   // - tcgen05.mma
   bool hasTcgen05InstSupport() const {
     // sm_101 was renamed to sm_110 in PTX 9.0, hence the separate lists.
     return hasPTXWithFamilySMs(90, {100, 110}) ||
            hasPTXWithFamilySMs(88, {100, 101}) ||
            hasPTXWithAccelSMs(86, {100, 101});
   }

   // Checks support for the tcgen05.shift instruction.
   bool hasTcgen05ShiftSupport() const {
     // sm_101 was renamed to sm_110 in PTX 9.0, hence the separate lists.
     return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
            hasPTXWithAccelSMs(88, {100, 101, 103}) ||
            hasPTXWithAccelSMs(86, {100, 101});
   }

   // The predicates below combine family-specific and arch-accelerated
   // checks; each alternative pairs a minimum PTX version with the SM
   // versions accepted from that PTX version on.
   bool hasTcgen05MMAScaleInputDImm() const {
     return hasPTXWithFamilySMs(88, {100}) || hasPTXWithAccelSMs(86, {100});
   }

   bool hasTcgen05MMAI8Kind() const {
     return hasPTXWithAccelSMs(90, {100, 110}) ||
            hasPTXWithAccelSMs(86, {100, 101});
   }

   bool hasTcgen05MMASparseMxf4nvf4() const {
     return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
            hasPTXWithAccelSMs(87, {100, 101, 103});
   }

   bool hasTcgen05MMASparseMxf4() const {
     return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
            hasPTXWithAccelSMs(86, {100, 101, 103});
   }

   bool hasTcgen05LdRedSupport() const {
     return hasPTXWithFamilySMs(90, {110, 103}) ||
            hasPTXWithFamilySMs(88, {101, 103});
   }

   bool hasReduxSyncF32() const {
     return hasPTXWithFamilySMs(88, {100}) || hasPTXWithAccelSMs(86, {100});
   }

   bool hasMMABlockScale() const {
     return hasPTXWithFamilySMs(88, {120}) || hasPTXWithAccelSMs(87, {120});
   }

   bool hasMMASparseBlockScaleF4() const {
     return hasPTXWithAccelSMs(87, {120, 121});
   }

   bool hasMMAWithMXF4NVF4Scale4xE8M0() const {
     return hasPTXWithFamilySMs(91, {120});
   }

   bool hasMMASparseWithMXF4NVF4Scale4xE8M0() const {
     return hasPTXWithAccelSMs(91, {120, 121});
   }

   // f32x2 instructions in the Blackwell family.
   bool hasF32x2Instructions() const;

   // Checks support for the following in TMA:
   //  - cta_group::1/2
   //  - im2col_w/w_128 mode
   //  - tile_gather4 mode
   //  - tile_scatter4 mode
   bool hasTMABlackwellSupport() const {
     return hasPTXWithFamilySMs(90, {100, 110}) ||
            hasPTXWithFamilySMs(88, {100, 101}) ||
            hasPTXWithAccelSMs(86, {100, 101});
   }

   // Checks support for conversions involving e4m3x2 and e5m2x2.
   // The minimum SM version is 89 from PTX 8.1 on and 90 for PTX 7.8 - 8.0;
   // older PTX versions have no support.
   bool hasFP8ConversionSupport() const {
     const unsigned MinSm = PTXVersion >= 81 ? 89 : 90;
     return PTXVersion >= 78 && SmVersion >= MinSm;
   }

   // Checks support for conversions involving the following types:
   // - e2m3x2/e3m2x2
   // - e2m1x2
   // - ue8m0x2
   bool hasNarrowFPConversionSupport() const {
     return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
            hasPTXWithFamilySMs(88, {100, 101, 120}) ||
            hasPTXWithAccelSMs(86, {100, 101, 120});
   }

   // Checks support for the following conversions:
   // - bf16x2 -> f8x2
   // - f16x2 -> f6x2
   // - bf16x2 -> f6x2
   // - f16x2 -> f4x2
   // - bf16x2 -> f4x2
   bool hasFP16X2ToNarrowFPConversionSupport() const {
     return hasPTXWithFamilySMs(91, {100, 110, 120});
   }

   bool hasS2F6X2ConversionSupport() const {
     return hasPTXWithAccelSMs(91, {100, 103, 110, 120, 121});
   }

   // Checks support for conversions from narrow FP types to bf16x2.
   bool hasNarrowFPToBF16x2ConversionSupport() const {
     return hasPTXWithFamilySMs(92, {100, 110, 120});
   }

   bool hasTensormapReplaceSupport() const {
     return hasPTXWithFamilySMs(90, {90, 100, 110, 120}) ||
            hasPTXWithFamilySMs(88, {90, 100, 101, 120}) ||
            hasPTXWithAccelSMs(83, {90, 100, 101, 120});
   }

   // Element types numbered below TensormapElemType::B4x16 only need the
   // generic tensormap.replace support; B4x16 and above use a stricter list.
   bool hasTensormapReplaceElemtypeSupport(unsigned value) const {
     const unsigned B4x16 =
         static_cast<unsigned>(nvvm::TensormapElemType::B4x16);
     if (value < B4x16)
       return hasTensormapReplaceSupport();

     return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
            hasPTXWithFamilySMs(88, {100, 101, 120}) ||
            hasPTXWithAccelSMs(87, {100, 101, 120});
   }

   bool hasTensormapReplaceSwizzleAtomicitySupport() const {
     return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
            hasPTXWithFamilySMs(88, {100, 101, 120}) ||
            hasPTXWithAccelSMs(87, {100, 101, 120});
   }

   // Every swizzle mode except SWIZZLE_96B only needs the generic
   // tensormap.replace support.
   bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const {
     const unsigned Swizzle96B =
         static_cast<unsigned>(nvvm::TensormapSwizzleMode::SWIZZLE_96B);
     if (value != Swizzle96B)
       return hasTensormapReplaceSupport();

     return hasPTXWithAccelSMs(88, {103});
   }

   bool hasClusterLaunchControlTryCancelMulticastSupport() const {
     return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
            hasPTXWithFamilySMs(88, {100, 101, 120}) ||
            hasPTXWithAccelSMs(86, {100, 101, 120});
   }

   bool hasSetMaxNRegSupport() const {
     return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
            hasPTXWithFamilySMs(88, {100, 101, 120}) ||
            hasPTXWithAccelSMs(86, {100, 101, 120}) ||
            hasPTXWithAccelSMs(80, {90});
   }

   bool hasLdStmatrixBlackwellSupport() const {
     return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
            hasPTXWithFamilySMs(88, {100, 101, 120}) ||
            hasPTXWithAccelSMs(86, {100, 101, 120});
   }

   // Prior to CUDA 12.3, ptxas did not recognize that the trap instruction
   // terminates a basic block. Instead, it assumed that control flow continued
   // to the next instruction, which could be in the block lexically below it.
   // This led to phantom CFG edges being created within ptxas. The issue was
   // fixed in CUDA 12.3, so for PTX ISA versions 8.3+ we can confidently say
   // that the bug is not present.
   bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
   bool hasCvtaParam() const { return hasSmAndPTXAtLeast(70, 77); }
   bool hasConvertWithStochasticRounding() const {
     return hasPTXWithAccelSMs(87, {100, 103});
   }

   // Version accessors; the SM and family versions are obtained by dropping
   // the last one or two decimal digits of FullSmVersion respectively.
   unsigned int getFullSmVersion() const { return FullSmVersion; }
   unsigned int getSmVersion() const { return FullSmVersion / 10; }
   unsigned int getSmFamilyVersion() const { return FullSmVersion / 100; }

   // GPUs with the "a" suffix have architecture-accelerated features that are
   // supported on the specified architecture only, so such targets do not
   // follow the onion layer model. hasArchAccelFeatures() distinguishes these
   // GPU variants from the base GPU architecture.
   // - false: non-accelerated architecture.
   // - true: architecture-accelerated variant.
   bool hasArchAccelFeatures() const {
     const bool LowBitSet = (FullSmVersion & 1) != 0;
     return LowBitSet && PTXVersion >= 80;
   }
   // GPUs with the 'f' suffix have architecture-accelerated features that are
   // portable across all future architectures under the same SM major. For
   // example, sm_100f features will work for sm_10Xf/sm_10Xa future
   // architectures.
   // - false: non-family-specific architecture.
   // - true: family-specific variant.
   bool hasFamilySpecificFeatures() const {
     const bool HasFamilySuffix = FullSmVersion % 10 == 2;
     if (HasFamilySuffix)
       return PTXVersion >= 88;
     return hasArchAccelFeatures();
   }
   // Falls back to the `sm_75` target when the user did not provide one.
   std::string getTargetName() const {
     if (TargetName.empty())
       return "sm_75";
     return TargetName;
   }
   bool hasTargetName() const { return !TargetName.empty(); }

   bool hasNativeBF16Support(int Opcode) const;

   // Maximum required alignment among the supported data types.
   // From the PTX ISA doc, section 8.2.3:
   //  The memory consistency model relates operations executed on memory
   //  locations with scalar data-types, which have a maximum size and alignment
   //  of 64 bits. Memory operations with a vector data-type are modelled as a
   //  set of equivalent memory operations with a scalar data-type, executed in
   //  an unspecified order on the elements in the vector.
   unsigned getMaxRequiredAlignment() const { return 8; }
   // Smallest cmpxchg word size, in bits, that the hardware supports.
   unsigned getMinCmpXchgSizeInBits() const { return 32; }

   unsigned getPTXVersion() const { return PTXVersion; }

   NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

   void failIfClustersUnsupported(std::string const &FailureMessage) const;
 };

 } // End llvm namespace

 #endif
	//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// This file declares the NVPTX specific subclass of TargetSubtarget.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
	#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H

	#include "NVPTX.h"
	#include "NVPTXFrameLowering.h"
	#include "NVPTXISelLowering.h"
	#include "NVPTXInstrInfo.h"
	#include "NVPTXRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/NVVMIntrinsicUtils.h"
	#include "llvm/Support/NVPTXAddrSpace.h"
	#include <string>

	#define GET_SUBTARGETINFO_HEADER
	#include "NVPTXGenSubtargetInfo.inc"

	namespace llvm {

	// FullSmVersion encoding: SM * 10 + ArchSuffixOffset
	// ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
	// e.g. sm_100 -> 1000, sm_100f -> 1002, sm_100a -> 1003

	class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
	virtual void anchor();
	std::string TargetName;

	// PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
	unsigned PTXVersion;

	// FullSmVersion encoding: SM * 10 + ArchSuffixOffset
	// ArchSuffixOffset: 0 (base), 2 ('f'), 3 ('a')
	// e.g. sm_30 -> 300, sm_90a -> 903, sm_100f -> 1002
	unsigned int FullSmVersion;

	// SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
	// FullSmVersion.
	unsigned int SmVersion;

	NVPTXInstrInfo InstrInfo;
	NVPTXTargetLowering TLInfo;
	std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;

	// NVPTX does not have any call stack frame, but need a NVPTX specific
	// FrameLowering class because TargetFrameLowering is abstract.
	NVPTXFrameLowering FrameLowering;

	public:
	/// This constructor initializes the data members to match that
	/// of the specified module.
	///
	NVPTXSubtarget(const Triple &TT, const std::string &CPU,
	const std::string &FS, const NVPTXTargetMachine &TM);

	~NVPTXSubtarget() override;

	const TargetFrameLowering *getFrameLowering() const override {
	return &FrameLowering;
	}
	const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
	const NVPTXRegisterInfo *getRegisterInfo() const override {
	return &InstrInfo.getRegisterInfo();
	}
	const NVPTXTargetLowering *getTargetLowering() const override {
	return &TLInfo;
	}

	const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;

	// Checks PTX version and family-specific and architecture-specific SM
	// versions. For example, sm_100{f/a} and any future variants in the same
	// family will match for any PTX version greater than or equal to
	// `PTXVersion`.
	bool hasPTXWithFamilySMs(unsigned PTXVersion,
	ArrayRef<unsigned> SMVersions) const;
	// Checks PTX version and architecture-specific SM versions.
	// For example, sm_100{a} will match for any PTX version greater than or equal
	// to `PTXVersion`.
	bool hasPTXWithAccelSMs(unsigned PTXVersion,
	ArrayRef<unsigned> SMVersions) const;

	bool has256BitVectorLoadStore(unsigned AS) const {
	return SmVersion >= 100 && PTXVersion >= 88 &&
	AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
	}
	bool hasUsedBytesMaskPragma() const {
	return SmVersion >= 50 && PTXVersion >= 83;
	}
	bool hasAtomAddF64() const { return SmVersion >= 60; }
	bool hasAtomScope() const { return SmVersion >= 60; }
	bool hasAtomBitwise64() const { return SmVersion >= 32; }
	bool hasAtomMinMax64() const { return SmVersion >= 32; }
	bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
	bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
	bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
	bool hasLDG() const { return SmVersion >= 32; }
	bool hasHWROT32() const { return SmVersion >= 32; }
	bool hasBrx() const { return SmVersion >= 30 && PTXVersion >= 60; }
	bool hasFP16Math() const { return SmVersion >= 53; }
	bool hasBF16Math() const { return SmVersion >= 80; }
	bool allowFP16Math() const;
	bool hasMaskOperator() const { return PTXVersion >= 71; }
	bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
	// Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
	// release, acq_rel, sc) ?
	bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
	// Does SM & PTX support .acquire and .release qualifiers for fence?
	bool hasSplitAcquireAndReleaseFences() const {
	return SmVersion >= 90 && PTXVersion >= 86;
	}
	// Does SM & PTX support atomic relaxed MMIO operations ?
	bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
	bool hasDotInstructions() const {
	return SmVersion >= 61 && PTXVersion >= 50;
	}

	// Checks following instructions support:
	// - tcgen05.ld/st
	// - tcgen05.alloc/dealloc/relinquish
	// - tcgen05.cp
	// - tcgen05.fence/wait
	// - tcgen05.commit
	// - tcgen05.mma
	bool hasTcgen05InstSupport() const {
	// sm_101 renamed to sm_110 in PTX 9.0
	return hasPTXWithFamilySMs(90, {100, 110}) \|\|
	hasPTXWithFamilySMs(88, {100, 101}) \|\|
	hasPTXWithAccelSMs(86, {100, 101});
	}

	// Checks tcgen05.shift instruction support.
	bool hasTcgen05ShiftSupport() const {
	// sm_101 renamed to sm_110 in PTX 9.0
	return hasPTXWithAccelSMs(90, {100, 110, 103}) \|\|
	hasPTXWithAccelSMs(88, {100, 101, 103}) \|\|
	hasPTXWithAccelSMs(86, {100, 101});
	}

	bool hasTcgen05MMAScaleInputDImm() const {
	return hasPTXWithFamilySMs(88, {100}) \|\| hasPTXWithAccelSMs(86, {100});
	}

	bool hasTcgen05MMAI8Kind() const {
	return hasPTXWithAccelSMs(90, {100, 110}) \|\|
	hasPTXWithAccelSMs(86, {100, 101});
	}

	bool hasTcgen05MMASparseMxf4nvf4() const {
	return hasPTXWithAccelSMs(90, {100, 110, 103}) \|\|
	hasPTXWithAccelSMs(87, {100, 101, 103});
	}

	bool hasTcgen05MMASparseMxf4() const {
	return hasPTXWithAccelSMs(90, {100, 110, 103}) \|\|
	hasPTXWithAccelSMs(86, {100, 101, 103});
	}

	bool hasTcgen05LdRedSupport() const {
	return hasPTXWithFamilySMs(90, {110, 103}) \|\|
	hasPTXWithFamilySMs(88, {101, 103});
	}

	bool hasReduxSyncF32() const {
	return hasPTXWithFamilySMs(88, {100}) \|\| hasPTXWithAccelSMs(86, {100});
	}

	bool hasMMABlockScale() const {
	return hasPTXWithFamilySMs(88, {120}) \|\| hasPTXWithAccelSMs(87, {120});
	}

	bool hasMMASparseBlockScaleF4() const {
	return hasPTXWithAccelSMs(87, {120, 121});
	}

	bool hasMMAWithMXF4NVF4Scale4xE8M0() const {
	return hasPTXWithFamilySMs(91, {120});
	}

	bool hasMMASparseWithMXF4NVF4Scale4xE8M0() const {
	return hasPTXWithAccelSMs(91, {120, 121});
	}

	// f32x2 instructions in Blackwell family
	bool hasF32x2Instructions() const;

	// Checks support for following in TMA:
	// - cta_group::1/2 support
	// - im2col_w/w_128 mode support
	// - tile_gather4 mode support
	// - tile_scatter4 mode support
	bool hasTMABlackwellSupport() const {
	return hasPTXWithFamilySMs(90, {100, 110}) \|\|
	hasPTXWithFamilySMs(88, {100, 101}) \|\|
	hasPTXWithAccelSMs(86, {100, 101});
	}

	// Checks support for conversions involving e4m3x2 and e5m2x2.
	bool hasFP8ConversionSupport() const {
	if (PTXVersion >= 81)
	return SmVersion >= 89;

	if (PTXVersion >= 78)
	return SmVersion >= 90;

	return false;
	}

	// Checks support for conversions involving the following types:
	// - e2m3x2/e3m2x2
	// - e2m1x2
	// - ue8m0x2
	bool hasNarrowFPConversionSupport() const {
	return hasPTXWithFamilySMs(90, {100, 110, 120}) \|\|
	hasPTXWithFamilySMs(88, {100, 101, 120}) \|\|
	hasPTXWithAccelSMs(86, {100, 101, 120});
	}

	// Checks support for conversions involving the following types:
	// - bf16x2 -> f8x2
	// - f16x2 -> f6x2
	// - bf16x2 -> f6x2
	// - f16x2 -> f4x2
	// - bf16x2 -> f4x2
	bool hasFP16X2ToNarrowFPConversionSupport() const {
	return hasPTXWithFamilySMs(91, {100, 110, 120});
	}

	bool hasS2F6X2ConversionSupport() const {
	return hasPTXWithAccelSMs(91, {100, 103, 110, 120, 121});
	}

	// Checks support for conversions from narrow FP types to bf16x2.
	bool hasNarrowFPToBF16x2ConversionSupport() const {
	return hasPTXWithFamilySMs(92, {100, 110, 120});
	}

	bool hasTensormapReplaceSupport() const {
	return hasPTXWithFamilySMs(90, {90, 100, 110, 120}) \|\|
	hasPTXWithFamilySMs(88, {90, 100, 101, 120}) \|\|
	hasPTXWithAccelSMs(83, {90, 100, 101, 120});
	}

	bool hasTensormapReplaceElemtypeSupport(unsigned value) const {
	if (value >= static_cast<unsigned>(nvvm::TensormapElemType::B4x16))
	return hasPTXWithFamilySMs(90, {100, 110, 120}) \|\|
	hasPTXWithFamilySMs(88, {100, 101, 120}) \|\|
	hasPTXWithAccelSMs(87, {100, 101, 120});

	return hasTensormapReplaceSupport();
	}

	bool hasTensormapReplaceSwizzleAtomicitySupport() const {
	return hasPTXWithFamilySMs(90, {100, 110, 120}) \|\|
	hasPTXWithFamilySMs(88, {100, 101, 120}) \|\|
	hasPTXWithAccelSMs(87, {100, 101, 120});
	}

	bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const {
	if (value == static_cast<unsigned>(nvvm::TensormapSwizzleMode::SWIZZLE_96B))
	return hasPTXWithAccelSMs(88, {103});

	return hasTensormapReplaceSupport();
	}

	bool hasClusterLaunchControlTryCancelMulticastSupport() const {
	return hasPTXWithFamilySMs(90, {100, 110, 120}) \|\|
	hasPTXWithFamilySMs(88, {100, 101, 120}) \|\|
	hasPTXWithAccelSMs(86, {100, 101, 120});
	}

	bool hasSetMaxNRegSupport() const {
	return hasPTXWithFamilySMs(90, {100, 110, 120}) \|\|
	hasPTXWithFamilySMs(88, {100, 101, 120}) \|\|
	hasPTXWithAccelSMs(86, {100, 101, 120}) \|\|
	hasPTXWithAccelSMs(80, {90});
	}

	bool hasLdStmatrixBlackwellSupport() const {
	return hasPTXWithFamilySMs(90, {100, 110, 120}) \|\|
	hasPTXWithFamilySMs(88, {100, 101, 120}) \|\|
	hasPTXWithAccelSMs(86, {100, 101, 120});
	}

	// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
	// terminates a basic block. Instead, it would assume that control flow
	// continued to the next instruction. The next instruction could be in the
	// block that's lexically below it. This would lead to a phantom CFG edges
	// being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
	// PTX ISA versions 8.3+ we can confidently say that the bug will not be
	// present.
	bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
	bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
	bool hasConvertWithStochasticRounding() const {
	return hasPTXWithAccelSMs(87, {100, 103});
	}
	unsigned int getFullSmVersion() const { return FullSmVersion; }
	unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
	unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
	// GPUs with "a" suffix have architecture-accelerated features that are
	// supported on the specified architecture only, hence such targets do not
	// follow the onion layer model. hasArchAccelFeatures() allows distinguishing
	// such GPU variants from the base GPU architecture.
	// - false represents non-accelerated architecture.
	// - true represents architecture-accelerated variant.
	bool hasArchAccelFeatures() const {
	return (getFullSmVersion() & 1) && PTXVersion >= 80;
	}
	// GPUs with 'f' suffix have architecture-accelerated features which are
	// portable across all future architectures under same SM major. For example,
	// sm_100f features will work for sm_10Xf/sm_10Xa future architectures.
	// - false represents non-family-specific architecture.
	// - true represents family-specific variant.
	bool hasFamilySpecificFeatures() const {
	return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
	: hasArchAccelFeatures();
	}
	// If the user did not provide a target we default to the `sm_75` target.
	std::string getTargetName() const {
	return TargetName.empty() ? "sm_75" : TargetName;
	}
	bool hasTargetName() const { return !TargetName.empty(); }

	bool hasNativeBF16Support(int Opcode) const;

	// Get maximum value of required alignments among the supported data types.
	// From the PTX ISA doc, section 8.2.3:
	// The memory consistency model relates operations executed on memory
	// locations with scalar data-types, which have a maximum size and alignment
	// of 64 bits. Memory operations with a vector data-type are modelled as a
	// set of equivalent memory operations with a scalar data-type, executed in
	// an unspecified order on the elements in the vector.
	unsigned getMaxRequiredAlignment() const { return 8; }
	// Get the smallest cmpxchg word size that the hardware supports.
	unsigned getMinCmpXchgSizeInBits() const { return 32; }

	unsigned getPTXVersion() const { return PTXVersion; }

	NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
	void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);

	void failIfClustersUnsupported(std::string const &FailureMessage) const;
	};

	} // End llvm namespace

	#endif