//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the NVPTX specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H

#include "NVPTX.h"
#include "NVPTXFrameLowering.h"
#include "NVPTXISelLowering.h"
#include "NVPTXInstrInfo.h"
#include "NVPTXRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include <string>

#define GET_SUBTARGETINFO_HEADER
#include "NVPTXGenSubtargetInfo.inc"

namespace llvm {

class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
virtual void anchor();
std::string TargetName;
// PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
unsigned PTXVersion;
// Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
// sm_90a == 901
unsigned int FullSmVersion;
// SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
// FullSmVersion.
unsigned int SmVersion;
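// Illustrative readings of this encoding, using values that appear elsewhere
// in this file: sm_90a is FullSmVersion 901 (SmVersion 90), and sm_100a is
// FullSmVersion 1003 (SmVersion 100).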
NVPTXInstrInfo InstrInfo;
NVPTXTargetLowering TLInfo;
std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
// NVPTX does not have any call stack frame, but we need an NVPTX-specific
// FrameLowering class because TargetFrameLowering is abstract.
NVPTXFrameLowering FrameLowering;

public:
/// This constructor initializes the data members to match that
/// of the specified module.
///
NVPTXSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const NVPTXTargetMachine &TM);
~NVPTXSubtarget() override;
const TargetFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const NVPTXRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
}
const NVPTXTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
// Checks PTX version and family-specific and architecture-specific SM
// versions. For example, sm_100{f/a} and any future variants in the same
// family will match for any PTX version greater than or equal to
// `PTXVersion`.
bool hasPTXWithFamilySMs(unsigned PTXVersion,
ArrayRef<unsigned> SMVersions) const;
// Checks PTX version and architecture-specific SM versions.
// For example, sm_100{a} will match for any PTX version greater than or equal
// to `PTXVersion`.
bool hasPTXWithAccelSMs(unsigned PTXVersion,
ArrayRef<unsigned> SMVersions) const;
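// Illustrative usage (mirroring hasTcgen05InstSupport below):
// hasPTXWithAccelSMs(86, {100, 101}) holds when targeting sm_100a or sm_101a
// with PTX >= 8.6, while hasPTXWithFamilySMs(88, {100, 101}) additionally
// accepts the family-specific sm_100f/sm_101f variants and future in-family
// variants.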
bool has256BitVectorLoadStore(unsigned AS) const {
return SmVersion >= 100 && PTXVersion >= 88 &&
AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
}
bool hasAtomAddF64() const { return SmVersion >= 60; }
bool hasAtomScope() const { return SmVersion >= 60; }
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
bool hasLDG() const { return SmVersion >= 32; }
bool hasHWROT32() const { return SmVersion >= 32; }
bool hasFP16Math() const { return SmVersion >= 53; }
bool hasBF16Math() const { return SmVersion >= 80; }
bool allowFP16Math() const;
bool hasMaskOperator() const { return PTXVersion >= 71; }
bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
// Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
// release, acq_rel, sc)?
bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
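// A minimal usage sketch (hypothetical caller, not part of this header),
// assuming a MachineFunction MF for an NVPTX target:
//   const auto &STI = MF.getSubtarget<NVPTXSubtarget>();
//   if (STI.hasMemoryOrdering()) {
//     // lower to ld.acquire / st.release forms
//   } else {
//     // fall back to pre-sm_70 lowering
//   }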
// Does SM & PTX support .acquire and .release qualifiers for fence?
bool hasSplitAcquireAndReleaseFences() const {
return SmVersion >= 90 && PTXVersion >= 86;
}
// Does SM & PTX support atomic relaxed MMIO operations?
bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
// Tcgen05 instructions in Blackwell family
bool hasTcgen05Instructions() const {
bool HasTcgen05 = false;
unsigned MinPTXVersion = 86;
switch (FullSmVersion) {
default:
break;
case 1003: // sm_100a
case 1013: // sm_101a
HasTcgen05 = true;
break;
case 1103: // sm_110a
HasTcgen05 = true;
MinPTXVersion = 90;
break;
case 1033: // sm_103a
HasTcgen05 = true;
MinPTXVersion = 88;
break;
}
return HasTcgen05 && PTXVersion >= MinPTXVersion;
}
// Checks following instructions support:
// - tcgen05.ld/st
// - tcgen05.alloc/dealloc/relinquish
// - tcgen05.cp
// - tcgen05.fence/wait
// - tcgen05.commit
bool hasTcgen05InstSupport() const {
// sm_101 renamed to sm_110 in PTX 9.0
return hasPTXWithFamilySMs(90, {100, 110}) ||
hasPTXWithFamilySMs(88, {100, 101}) ||
hasPTXWithAccelSMs(86, {100, 101});
}
// Checks tcgen05.shift instruction support.
bool hasTcgen05ShiftSupport() const {
// sm_101 renamed to sm_110 in PTX 9.0
return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
hasPTXWithAccelSMs(88, {100, 101, 103}) ||
hasPTXWithAccelSMs(86, {100, 101});
}
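// In the encoding above, FullSmVersion 1003 corresponds to sm_100a.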
bool hasTcgen05MMAScaleInputDImm() const {
return FullSmVersion == 1003 && PTXVersion >= 86;
}
// f32x2 instructions in Blackwell family
bool hasF32x2Instructions() const;
// TMA G2S copy with cta_group::1/2 support
bool hasCpAsyncBulkTensorCTAGroupSupport() const {
// TODO: Update/tidy-up after the family-conditional support arrives
switch (FullSmVersion) {
case 1003:
case 1013:
return PTXVersion >= 86;
case 1033:
return PTXVersion >= 88;
default:
return false;
}
}
// Prior to CUDA 12.3, ptxas did not recognize that the trap instruction
// terminates a basic block. Instead, it assumed that control flow continued
// to the next instruction, which could be in the block that lexically follows
// it. This led to phantom CFG edges being created within ptxas. The issue was
// fixed in CUDA 12.3, so with PTX ISA versions 8.3+ we can confidently say
// that the bug is not present.
bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
// GPUs with "a" suffix have architecture-accelerated features that are
// supported on the specified architecture only, hence such targets do not
// follow the onion layer model. hasArchAccelFeatures() allows distinguishing
// such GPU variants from the base GPU architecture.
// - false represents non-accelerated architecture.
// - true represents architecture-accelerated variant.
bool hasArchAccelFeatures() const {
return (getFullSmVersion() & 1) && PTXVersion >= 80;
}
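// For example, sm_100a (FullSmVersion 1003, odd) reports true here with
// PTX >= 8.0, while the base sm_100 architecture does not.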
// GPUs with an 'f' suffix have architecture-accelerated features that are
// portable across all future architectures with the same SM major version.
// For example, sm_100f features will work on future sm_10X*f*/sm_10X*a*
// architectures.
// - false represents a non-family-specific architecture.
// - true represents a family-specific variant.
bool hasFamilySpecificFeatures() const {
return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
: hasArchAccelFeatures();
}
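// For example, an 'f' variant (FullSmVersion ending in 2) needs PTX >= 8.8,
// while an 'a' variant falls back to the hasArchAccelFeatures() check above.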
// If the user did not provide a target, we default to `sm_30`.
std::string getTargetName() const {
return TargetName.empty() ? "sm_30" : TargetName;
}
bool hasTargetName() const { return !TargetName.empty(); }
bool hasNativeBF16Support(int Opcode) const;
// Get the maximum value, in bytes, of the required alignments among the
// supported data types.
// From the PTX ISA doc, section 8.2.3:
// The memory consistency model relates operations executed on memory
// locations with scalar data-types, which have a maximum size and alignment
// of 64 bits. Memory operations with a vector data-type are modelled as a
// set of equivalent memory operations with a scalar data-type, executed in
// an unspecified order on the elements in the vector.
unsigned getMaxRequiredAlignment() const { return 8; }
// Get the smallest cmpxchg word size that the hardware supports.
unsigned getMinCmpXchgSizeInBits() const { return 32; }
unsigned getPTXVersion() const { return PTXVersion; }
NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
void failIfClustersUnsupported(std::string const &FailureMessage) const;
};
} // End llvm namespace
#endif