//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the NVPTX specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H

#include "NVPTX.h"
#include "NVPTXFrameLowering.h"
#include "NVPTXISelLowering.h"
#include "NVPTXInstrInfo.h"
#include "NVPTXRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include <string>

#define GET_SUBTARGETINFO_HEADER
#include "NVPTXGenSubtargetInfo.inc"

namespace llvm {

class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
virtual void anchor();
std::string TargetName;
// PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
unsigned PTXVersion;
// Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
// sm_90a == 901
unsigned int FullSmVersion;
// SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
// FullSmVersion.
unsigned int SmVersion;
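// Illustrative readings of this encoding, using values that appear elsewhere
// in this file: sm_90a is FullSmVersion 901 (SmVersion 90), and sm_100a is
// FullSmVersion 1003 (SmVersion 100).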
NVPTXInstrInfo InstrInfo;
NVPTXTargetLowering TLInfo;
std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
// NVPTX does not have any call stack frame, but we need an NVPTX-specific
// FrameLowering class because TargetFrameLowering is abstract.
NVPTXFrameLowering FrameLowering;

public:
/// This constructor initializes the data members to match that
/// of the specified module.
///
NVPTXSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const NVPTXTargetMachine &TM);
~NVPTXSubtarget() override;
const TargetFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
const NVPTXRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
}
const NVPTXTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
// Checks PTX version and family-specific and architecture-specific SM
// versions. For example, sm_100{f/a} and any future variants in the same
// family will match for any PTX version greater than or equal to
// `PTXVersion`.
bool hasPTXWithFamilySMs(unsigned PTXVersion,
ArrayRef<unsigned> SMVersions) const;
// Checks PTX version and architecture-specific SM versions.
// For example, sm_100{a} will match for any PTX version greater than or equal
// to `PTXVersion`.
bool hasPTXWithAccelSMs(unsigned PTXVersion,
ArrayRef<unsigned> SMVersions) const;
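// Illustrative usage (mirroring hasTcgen05InstSupport below):
// hasPTXWithAccelSMs(86, {100, 101}) holds when targeting sm_100a or sm_101a
// with PTX >= 8.6, while hasPTXWithFamilySMs(88, {100, 101}) additionally
// accepts the family-specific sm_100f/sm_101f variants and future in-family
// variants.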
bool has256BitVectorLoadStore(unsigned AS) const {
return SmVersion >= 100 && PTXVersion >= 88 &&
AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
}
bool hasAtomAddF64() const { return SmVersion >= 60; }
bool hasAtomScope() const { return SmVersion >= 60; }
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
bool hasLDG() const { return SmVersion >= 32; }
bool hasHWROT32() const { return SmVersion >= 32; }
bool hasFP16Math() const { return SmVersion >= 53; }
bool hasBF16Math() const { return SmVersion >= 80; }
bool allowFP16Math() const;
bool hasMaskOperator() const { return PTXVersion >= 71; }
bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
// Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
// release, acq_rel, sc)?
bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
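// A minimal usage sketch (hypothetical caller, not part of this header),
// assuming a MachineFunction MF for an NVPTX target:
//   const auto &STI = MF.getSubtarget<NVPTXSubtarget>();
//   if (STI.hasMemoryOrdering()) {
//     // lower to ld.acquire / st.release forms
//   } else {
//     // fall back to pre-sm_70 lowering
//   }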
// Does SM & PTX support .acquire and .release qualifiers for fence?
bool hasSplitAcquireAndReleaseFences() const {
return SmVersion >= 90 && PTXVersion >= 86;
}
// Does SM & PTX support atomic relaxed MMIO operations?
bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
// Tcgen05 instructions in Blackwell family
bool hasTcgen05Instructions() const {
bool HasTcgen05 = false;
unsigned MinPTXVersion = 86;
switch (FullSmVersion) {
default:
break;
case 1003: // sm_100a
case 1013: // sm_101a
HasTcgen05 = true;
break;
case 1103: // sm_110a
HasTcgen05 = true;
MinPTXVersion = 90;
break;
case 1033: // sm_103a
HasTcgen05 = true;
MinPTXVersion = 88;
break;
}
return HasTcgen05 && PTXVersion >= MinPTXVersion;
}
// Checks following instructions support:
// - tcgen05.ld/st
// - tcgen05.alloc/dealloc/relinquish
// - tcgen05.cp
// - tcgen05.fence/wait
// - tcgen05.commit
bool hasTcgen05InstSupport() const {
// sm_101 renamed to sm_110 in PTX 9.0
return hasPTXWithFamilySMs(90, {100, 110}) ||
hasPTXWithFamilySMs(88, {100, 101}) ||
hasPTXWithAccelSMs(86, {100, 101});
}
// Checks tcgen05.shift instruction support.
bool hasTcgen05ShiftSupport() const {
// sm_101 renamed to sm_110 in PTX 9.0
return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
hasPTXWithAccelSMs(88, {100, 101, 103}) ||
hasPTXWithAccelSMs(86, {100, 101});
}
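// In the encoding above, FullSmVersion 1003 corresponds to sm_100a.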
bool hasTcgen05MMAScaleInputDImm() const {
return FullSmVersion == 1003 && PTXVersion >= 86;
}
// f32x2 instructions in Blackwell family
bool hasF32x2Instructions() const;
// TMA G2S copy with cta_group::1/2 support
bool hasCpAsyncBulkTensorCTAGroupSupport() const {
// TODO: Update/tidy-up after the family-conditional support arrives
switch (FullSmVersion) {
case 1003:
case 1013:
return PTXVersion >= 86;
case 1033:
return PTXVersion >= 88;
default:
return false;
}
}
// Prior to CUDA 12.3, ptxas did not recognize that the trap instruction
// terminates a basic block. Instead, it assumed that control flow continued
// to the next instruction, which could be in the block that lexically follows
// it. This led to phantom CFG edges being created within ptxas. The issue was
// fixed in CUDA 12.3, so with PTX ISA versions 8.3+ we can confidently say
// that the bug is not present.
bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
// GPUs with "a" suffix have architecture-accelerated features that are
// supported on the specified architecture only, hence such targets do not
// follow the onion layer model. hasArchAccelFeatures() allows distinguishing
// such GPU variants from the base GPU architecture.
// - false represents non-accelerated architecture.
// - true represents architecture-accelerated variant.
bool hasArchAccelFeatures() const {
return (getFullSmVersion() & 1) && PTXVersion >= 80;
}
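// For example, sm_100a (FullSmVersion 1003, odd) reports true here with
// PTX >= 8.0, while the base sm_100 architecture does not.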
// GPUs with an 'f' suffix have architecture-accelerated features that are
// portable across all future architectures with the same SM major version.
// For example, sm_100f features will work on future sm_10X*f*/sm_10X*a*
// architectures.
// - false represents a non-family-specific architecture.
// - true represents a family-specific variant.
bool hasFamilySpecificFeatures() const {
return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
: hasArchAccelFeatures();
}
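// For example, an 'f' variant (FullSmVersion ending in 2) needs PTX >= 8.8,
// while an 'a' variant falls back to the hasArchAccelFeatures() check above.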
// If the user did not provide a target, we default to `sm_30`.
std::string getTargetName() const {
return TargetName.empty() ? "sm_30" : TargetName;
}
bool hasTargetName() const { return !TargetName.empty(); }
bool hasNativeBF16Support(int Opcode) const;
// Get the maximum value, in bytes, of the required alignments among the
// supported data types.
// From the PTX ISA doc, section 8.2.3:
// The memory consistency model relates operations executed on memory
// locations with scalar data-types, which have a maximum size and alignment
// of 64 bits. Memory operations with a vector data-type are modelled as a
// set of equivalent memory operations with a scalar data-type, executed in
// an unspecified order on the elements in the vector.
unsigned getMaxRequiredAlignment() const { return 8; }
// Get the smallest cmpxchg word size that the hardware supports.
unsigned getMinCmpXchgSizeInBits() const { return 32; }
unsigned getPTXVersion() const { return PTXVersion; }
NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
void failIfClustersUnsupported(std::string const &FailureMessage) const;
};
} // End llvm namespace
#endif