blob: ee1d507a08c9b6881582948d355d1ed070ecfd43 [file]
//===- AMDGPUHWEvents.cpp ---------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "AMDGPUHWEvents.h"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace AMDGPU {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Write this event set, followed by a newline, to the stream returned by
/// dbgs(). Only compiled when NDEBUG is undefined or LLVM_ENABLE_DUMP is
/// defined.
LLVM_DUMP_METHOD void HWEvents::dump() const {
  raw_ostream &DbgOS = dbgs();
  DbgOS << *this;
  DbgOS << "\n";
}
#endif
/// Classify \p Inst into the additional event that getEventsFor adds when its
/// IsExpertMode flag is set.
///
/// \returns a VGPR_*_WRITE event for a VALU instruction that is not an LDS
/// DMA, a VGPR_*_READ event for a FLAT, DS or VMEM-like instruction, and
/// HWEvents::NONE for everything else.
static HWEvents getExpertSchedulingEventType(const MachineInstr &Inst,
                                             const SIInstrInfo &TII) {
  const bool IsNonDMAVALU =
      TII.isVALU(Inst, /*AllowLDSDMA=*/true) && !SIInstrInfo::isLDSDMA(Inst);

  if (!IsNonDMAVALU) {
    // FLAT and LDS instructions may read their VGPR sources out-of-order
    // with respect to each other and all other VMEM instructions, so
    // each of these has a separate event.
    if (TII.isFLAT(Inst))
      return HWEvents::VGPR_FLAT_READ;

    if (TII.isDS(Inst))
      return HWEvents::VGPR_LDS_READ;

    const bool IsVMEMLike =
        TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst);
    if (IsVMEMLike)
      return HWEvents::VGPR_VMEM_READ;

    // Otherwise, no hazard.
    return HWEvents::NONE;
  }

  // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
  // out-of-order with respect to each other, so each of these classes
  // has its own event. The checks run in this fixed order; Core/Side-MACC is
  // the fallback when none of the more specific classes match.
  if (TII.isXDL(Inst))
    return HWEvents::VGPR_XDL_WRITE;

  if (TII.isTRANS(Inst))
    return HWEvents::VGPR_TRANS_WRITE;

  if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))
    return HWEvents::VGPR_DPMACC_WRITE;

  return HWEvents::VGPR_CSMACC_WRITE;
}
/// Select the memory-access event for \p Inst.
///
/// GLOBAL_INV, GLOBAL_WB and GLOBAL_WBINV are classified purely by opcode.
/// Any other instruction is asserted to be VMEM and is classified from the
/// subtarget's counter features together with the instruction's load, store
/// and image properties.
///
/// \param Inst the instruction to classify.
/// \param ST   subtarget queried for hasVscnt() / hasExtendedWaitCounts().
/// \param TII  instruction info used for the scratch-access query.
static HWEvents getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
                               const SIInstrInfo &TII) {
  const unsigned Opc = Inst.getOpcode();

  // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
  // GLOBAL_INV is tracked using loadcnt, but doesn't write VGPRs.
  if (Opc == AMDGPU::GLOBAL_INV)
    return HWEvents::GLOBAL_INV_ACCESS;

  // GLOBAL_WB and GLOBAL_WBINV are tracked using storecnt.
  if (Opc == AMDGPU::GLOBAL_WB || Opc == AMDGPU::GLOBAL_WBINV)
    return HWEvents::VMEM_WRITE_ACCESS;

  assert(SIInstrInfo::isVMEM(Inst));

  // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
  // these should use VM_CNT.
  if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
    return HWEvents::VMEM_ACCESS;

  // Stores that do not also load, and atomics without a return value, are
  // reported as write accesses; scratch writes get their own event.
  const bool IsWriteOnly =
      Inst.mayStore() && (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst));
  if (IsWriteOnly) {
    if (TII.mayAccessScratch(Inst))
      return HWEvents::SCRATCH_WRITE_ACCESS;
    return HWEvents::VMEM_WRITE_ACCESS;
  }

  // Only non-FLAT image instructions on subtargets with extended wait counts
  // are split into finer read categories below.
  if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst) ||
      !SIInstrInfo::isImage(Inst))
    return HWEvents::VMEM_ACCESS;

  const AMDGPU::MIMGInfo *ImgInfo = AMDGPU::getMIMGInfo(Inst.getOpcode());
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpInfo =
      AMDGPU::getMIMGBaseOpcodeInfo(ImgInfo->BaseOpcode);

  if (BaseOpInfo->BVH)
    return HWEvents::VMEM_BVH_READ_ACCESS;

  // We have to make an additional check for isVSAMPLE here since some
  // instructions don't have a sampler, but are still classified as sampler
  // instructions for the purposes of e.g. waitcnt.
  const bool CountsAsSampler = BaseOpInfo->Sampler || BaseOpInfo->MSAA ||
                               SIInstrInfo::isVSAMPLE(Inst);
  if (CountsAsSampler)
    return HWEvents::VMEM_SAMPLER_READ_ACCESS;

  return HWEvents::VMEM_ACCESS;
}
/// Compute the events generated by \p Inst, without the extra event that
/// getEventsFor adds in expert mode.
///
/// The instruction is tested against the following categories in order, and
/// the first one that matches determines the result: DS instructions that use
/// LGKM_CNT, FLAT, VMEM, SMRD, LDSDIR, EXP, opcodes for which
/// isSBarrierSCCWrite() holds, and finally a fixed list of scalar opcodes.
///
/// \returns the union of matching events, or HWEvents::NONE if no category
/// applies.
static HWEvents getEventsForImpl(const MachineInstr &Inst,
                                 const GCNSubtarget &ST,
                                 const SIInstrInfo &TII) {
  const unsigned Opc = Inst.getOpcode();

  // DS instructions counted by LGKM_CNT are either GDS or LDS accesses.
  if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
    const bool IsGDS = TII.isAlwaysGDS(Opc) ||
                       TII.hasModifiersSet(Inst, AMDGPU::OpName::gds);
    if (!IsGDS)
      return HWEvents::LDS_ACCESS;
    return HWEvents::GDS_ACCESS | HWEvents::GDS_GPR_LOCK;
  }

  if (TII.isFLAT(Inst)) {
    // Cache invalidate / write-back opcodes are classified by opcode alone.
    if (SIInstrInfo::isGFX12CacheInvOrWBInst(Opc))
      return getVmemHWEvent(Inst, ST, TII);

    assert(Inst.mayLoadOrStore());

    // A FLAT access may reach VMEM, LDS, or both, so accumulate the events
    // of every address space it may touch.
    HWEvents FlatEvents = HWEvents::NONE;
    if (TII.mayAccessVMEMThroughFlat(Inst)) {
      if (ST.hasWaitXcnt())
        FlatEvents |= HWEvents::VMEM_GROUP;
      FlatEvents |= getVmemHWEvent(Inst, ST, TII);
    }
    if (TII.mayAccessLDSThroughFlat(Inst))
      FlatEvents |= HWEvents::LDS_ACCESS;
    return FlatEvents;
  }

  // BUFFER_WBL2 is included here because, unlike the invalidates, it has to
  // be followed by "S_WAITCNT vmcnt(0)" to ensure the writeback has
  // completed.
  const bool IsTrackedVMEM =
      SIInstrInfo::isVMEM(Inst) &&
      (!AMDGPU::getMUBUFIsBufferInv(Opc) || Opc == AMDGPU::BUFFER_WBL2);
  if (IsTrackedVMEM) {
    HWEvents VmemEvents = getVmemHWEvent(Inst, ST, TII);
    if (ST.hasWaitXcnt())
      VmemEvents |= HWEvents::VMEM_GROUP;

    const bool NeedsGprLock =
        ST.vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst));
    if (NeedsGprLock)
      VmemEvents |= HWEvents::VMW_GPR_LOCK;
    return VmemEvents;
  }

  if (TII.isSMRD(Inst)) {
    if (!ST.hasWaitXcnt())
      return HWEvents::SMEM_ACCESS;
    return HWEvents::SMEM_GROUP | HWEvents::SMEM_ACCESS;
  }

  if (SIInstrInfo::isLDSDIR(Inst))
    return HWEvents::EXP_LDS_ACCESS;

  if (SIInstrInfo::isEXP(Inst)) {
    // The export target operand selects between parameter, position and
    // all remaining exports.
    const unsigned Tgt =
        TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();

    const bool IsParamTgt =
        Tgt >= AMDGPU::Exp::ET_PARAM0 && Tgt <= AMDGPU::Exp::ET_PARAM31;
    if (IsParamTgt)
      return HWEvents::EXP_PARAM_ACCESS;

    const bool IsPosTgt =
        Tgt >= AMDGPU::Exp::ET_POS0 && Tgt <= AMDGPU::Exp::ET_POS_LAST;
    if (IsPosTgt)
      return HWEvents::EXP_POS_ACCESS;

    return HWEvents::EXP_GPR_LOCK;
  }

  if (SIInstrInfo::isSBarrierSCCWrite(Opc))
    return HWEvents::SCC_WRITE;

  // Remaining scalar opcodes that are matched individually.
  switch (Opc) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSG_RTN_B32:
  case AMDGPU::S_SENDMSG_RTN_B64:
  case AMDGPU::S_SENDMSGHALT:
    return HWEvents::SQ_MESSAGE;
  case AMDGPU::S_MEMTIME:
  case AMDGPU::S_MEMREALTIME:
  case AMDGPU::S_GET_BARRIER_STATE_M0:
  case AMDGPU::S_GET_BARRIER_STATE_IMM:
    return HWEvents::SMEM_ACCESS;
  default:
    break;
  }

  return HWEvents::NONE;
}
/// Return the hardware events that \p Inst generates on subtarget \p ST.
///
/// The baseline events come from getEventsForImpl. When \p IsExpertMode is
/// true, the event from getExpertSchedulingEventType is OR'ed into them.
HWEvents getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
                      bool IsExpertMode) {
  const SIInstrInfo &TII = *ST.getInstrInfo();

  HWEvents Events = getEventsForImpl(Inst, ST, TII);
  if (IsExpertMode)
    Events |= getExpertSchedulingEventType(Inst, TII);
  return Events;
}
} // namespace AMDGPU
/// Print the name of every event set in \p Events to \p OS, with " | "
/// between consecutive names. Nothing is printed when no event listed in
/// AMDGPUHWEvents.def is set.
///
/// \returns \p OS, to allow chaining.
raw_ostream &operator<<(raw_ostream &OS, AMDGPU::HWEvents Events) {
  // ListSeparator emits nothing the first time it is streamed and " | " on
  // every later use, so it already provides all spacing between names. No
  // extra padding is appended after a name: doing so would print a doubled
  // space before each separator and leave trailing whitespace at the end.
  ListSeparator LS(" | ");
#define AMDGPU_HW_EVENT(E, V)                                                  \
  if (Events & AMDGPU::HWEvents::E)                                            \
    OS << LS << #E;
#include "AMDGPUHWEvents.def"
// Keep the X-macro from leaking past this function; this is a no-op if the
// .def file already undefines it.
#undef AMDGPU_HW_EVENT
  return OS;
}
} // namespace llvm