//===- AMDGPUWaitSGPRHazards.cpp - Insert waits for SGPR read hazards -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert s_wait_alu instructions to mitigate SGPR read hazards on GFX12.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUWaitSGPRHazards.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SetVector.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-wait-sgpr-hazards"
static cl::opt<bool> GlobalEnableSGPRHazardWaits(
"amdgpu-sgpr-hazard-wait", cl::init(true), cl::Hidden,
cl::desc("Enable required s_wait_alu on SGPR hazards"));
static cl::opt<bool> GlobalCullSGPRHazardsOnFunctionBoundary(
"amdgpu-sgpr-hazard-boundary-cull", cl::init(false), cl::Hidden,
cl::desc("Cull hazards on function boundaries"));
static cl::opt<bool>
GlobalCullSGPRHazardsAtMemWait("amdgpu-sgpr-hazard-mem-wait-cull",
cl::init(false), cl::Hidden,
cl::desc("Cull hazards on memory waits"));
static cl::opt<unsigned> GlobalCullSGPRHazardsMemWaitThreshold(
"amdgpu-sgpr-hazard-mem-wait-cull-threshold", cl::init(8), cl::Hidden,
cl::desc("Number of tracked SGPRs before initiating hazard cull on memory "
"wait"));
namespace {
class AMDGPUWaitSGPRHazards {
public:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
unsigned DsNopCount;
bool EnableSGPRHazardWaits;
bool CullSGPRHazardsOnFunctionBoundary;
bool CullSGPRHazardsAtMemWait;
unsigned CullSGPRHazardsMemWaitThreshold;
AMDGPUWaitSGPRHazards() {}
// Return the numeric ID 0-127 for a given SGPR, or std::nullopt for registers
// that are not tracked (M0, EXEC, null, or indices above 127).
static std::optional<unsigned> sgprNumber(Register Reg,
const SIRegisterInfo &TRI) {
switch (Reg) {
case AMDGPU::M0:
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
case AMDGPU::EXEC_HI:
case AMDGPU::SGPR_NULL:
case AMDGPU::SGPR_NULL64:
return {};
default:
break;
}
unsigned RegN = TRI.getHWRegIndex(Reg);
if (RegN > 127)
return {};
return RegN;
}
static inline bool isVCC(Register Reg) {
return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
}
// Adjust global offsets for instructions bundled with S_GETPC_B64 after
// insertion of a new instruction.
static void updateGetPCBundle(MachineInstr *NewMI) {
if (!NewMI->isBundled())
return;
// Find start of bundle.
auto I = NewMI->getIterator();
while (I->isBundledWithPred())
I--;
if (I->isBundle())
I++;
// Bail if this is not an S_GETPC bundle.
if (I->getOpcode() != AMDGPU::S_GETPC_B64)
return;
// Update offsets of any references in the bundle.
const unsigned NewBytes = 4;
assert(NewMI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
"Unexpected instruction insertion in bundle");
auto NextMI = std::next(NewMI->getIterator());
auto End = NewMI->getParent()->end();
while (NextMI != End && NextMI->isBundledWithPred()) {
for (auto &Operand : NextMI->operands()) {
if (Operand.isGlobal())
Operand.setOffset(Operand.getOffset() + NewBytes);
}
NextMI++;
}
}
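// Hazard state propagated through the function by a forward dataflow pass:
// which SGPR pairs have ever been read by a VALU, and which SGPRs currently
// hold writes whose hazard has not yet been resolved.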
struct HazardState {
static constexpr unsigned None = 0;
static constexpr unsigned SALU = (1 << 0);
static constexpr unsigned VALU = (1 << 1);
std::bitset<64> Tracked; // SGPR banks ever read by VALU
std::bitset<128> SALUHazards; // SGPRs with uncommitted values from SALU
std::bitset<128> VALUHazards; // SGPRs with uncommitted values from VALU
unsigned VCCHazard = None; // Source of current VCC writes
bool ActiveFlat = false; // Has unwaited flat instructions
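// Join the state from a predecessor block (bitwise OR of every field) and
// report whether anything changed; this drives the fixed-point iteration.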
bool merge(const HazardState &RHS) {
HazardState Orig(*this);
*this |= RHS;
return (*this != Orig);
}
bool operator==(const HazardState &RHS) const {
return Tracked == RHS.Tracked && SALUHazards == RHS.SALUHazards &&
VALUHazards == RHS.VALUHazards && VCCHazard == RHS.VCCHazard &&
ActiveFlat == RHS.ActiveFlat;
}
bool operator!=(const HazardState &RHS) const { return !(*this == RHS); }
void operator|=(const HazardState &RHS) {
Tracked |= RHS.Tracked;
SALUHazards |= RHS.SALUHazards;
VALUHazards |= RHS.VALUHazards;
VCCHazard |= RHS.VCCHazard;
ActiveFlat |= RHS.ActiveFlat;
}
};
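// Hazard state at block entry (In) and block exit (Out), as computed by the
// fixed-point analysis in run().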
struct BlockHazardState {
HazardState In;
HazardState Out;
};
DenseMap<const MachineBasicBlock *, BlockHazardState> BlockState;
static constexpr unsigned WAVE32_NOPS = 4;
static constexpr unsigned WAVE64_NOPS = 8;
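// Insert DsNopCount consecutive DS_NOPs before MI; a full run of DS_NOPs
// allows the tracked SGPR set to be cleared again (see the DS_NOP handling
// in runOnMachineBasicBlock).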
void insertHazardCull(MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator &MI) {
assert(!MI->isBundled());
unsigned Count = DsNopCount;
while (Count--)
BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::DS_NOP));
}
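// Combine two S_WAITCNT_DEPCTR masks field by field. Lower field values
// encode stricter waits, so taking the minimum of each field produces a mask
// at least as strong as both inputs.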
unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
unsigned Mask = 0xffff;
Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaVcc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVcc(Mask1),
AMDGPU::DepCtr::decodeFieldVaVcc(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVmVsrc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVmVsrc(Mask1),
AMDGPU::DepCtr::decodeFieldVmVsrc(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaSdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSdst(Mask1),
AMDGPU::DepCtr::decodeFieldVaSdst(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaVdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaVdst(Mask1),
AMDGPU::DepCtr::decodeFieldVaVdst(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldHoldCnt(
Mask, std::min(AMDGPU::DepCtr::decodeFieldHoldCnt(Mask1),
AMDGPU::DepCtr::decodeFieldHoldCnt(Mask2)));
Mask = AMDGPU::DepCtr::encodeFieldVaSsrc(
Mask, std::min(AMDGPU::DepCtr::decodeFieldVaSsrc(Mask1),
AMDGPU::DepCtr::decodeFieldVaSsrc(Mask2)));
return Mask;
}
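// If the previous non-debug instruction is already an S_WAITCNT_DEPCTR, fold
// Mask into its immediate instead of emitting a second wait; returns true if
// the masks were merged.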
bool mergeConsecutiveWaitAlus(MachineBasicBlock::instr_iterator &MI,
unsigned Mask) {
auto MBB = MI->getParent();
if (MI == MBB->instr_begin())
return false;
auto It = prev_nodbg(MI, MBB->instr_begin());
if (It->getOpcode() != AMDGPU::S_WAITCNT_DEPCTR)
return false;
It->getOperand(0).setImm(mergeMasks(Mask, It->getOperand(0).getImm()));
return true;
}
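// Scan one block, updating the hazard state instruction by instruction.
// With Emit == false this is the analysis phase and the return value is
// whether the block's outgoing state changed; with Emit == true the required
// s_wait_alu / DS_NOP instructions are inserted and the return value is
// whether anything was emitted.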
bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
enum { WA_VALU = 0x1, WA_SALU = 0x2, WA_VCC = 0x4 };
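// Wait bits accumulated per instruction; each corresponds to a DEPCTR field
// that must be forced to zero (va_sdst, sa_sdst, va_vcc respectively).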
HazardState State = BlockState[&MBB].In;
SmallSet<Register, 8> SeenRegs;
bool Emitted = false;
unsigned DsNops = 0;
for (MachineBasicBlock::instr_iterator MI = MBB.instr_begin(),
E = MBB.instr_end();
MI != E; ++MI) {
if (MI->isMetaInstruction())
continue;
// Clear tracked SGPRs if sufficient DS_NOPs occur
if (MI->getOpcode() == AMDGPU::DS_NOP) {
if (++DsNops >= DsNopCount)
State.Tracked.reset();
continue;
}
DsNops = 0;
// Snoop FLAT instructions to avoid adding culls before scratch/LDS loads;
// a cull could be disproportionately expensive relative to the load time.
if (SIInstrInfo::isFLAT(*MI) && !SIInstrInfo::isFLATGlobal(*MI))
State.ActiveFlat = true;
// SMEM or VMEM clears hazards
if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSMRD(*MI)) {
State.VCCHazard = HazardState::None;
State.SALUHazards.reset();
State.VALUHazards.reset();
continue;
}
// Existing S_WAITALU can clear hazards
if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) {
unsigned Mask = MI->getOperand(0).getImm();
if (AMDGPU::DepCtr::decodeFieldVaVcc(Mask) == 0)
State.VCCHazard &= ~HazardState::VALU;
if (AMDGPU::DepCtr::decodeFieldSaSdst(Mask) == 0) {
State.SALUHazards.reset();
State.VCCHazard &= ~HazardState::SALU;
}
if (AMDGPU::DepCtr::decodeFieldVaSdst(Mask) == 0)
State.VALUHazards.reset();
continue;
}
// Snoop counter waits to insert culls
if (CullSGPRHazardsAtMemWait &&
(MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT ||
MI->getOpcode() == AMDGPU::S_WAIT_SAMPLECNT ||
MI->getOpcode() == AMDGPU::S_WAIT_BVHCNT) &&
(MI->getOperand(0).isImm() && MI->getOperand(0).getImm() == 0) &&
(State.Tracked.count() >= CullSGPRHazardsMemWaitThreshold)) {
if (MI->getOpcode() == AMDGPU::S_WAIT_LOADCNT && State.ActiveFlat) {
State.ActiveFlat = false;
} else {
State.Tracked.reset();
if (Emit)
insertHazardCull(MBB, MI);
continue;
}
}
// Process only VALUs and SALUs
bool IsVALU = SIInstrInfo::isVALU(*MI);
bool IsSALU = SIInstrInfo::isSALU(*MI);
if (!IsVALU && !IsSALU)
continue;
unsigned Wait = 0;
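// Examine a single register operand: for uses, accumulate the required wait
// bits in Wait (and let SALU reads discharge VALU hazards); for defs, record
// new hazards on the written SGPRs.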
auto processOperand = [&](const MachineOperand &Op, bool IsUse) {
if (!Op.isReg())
return;
Register Reg = Op.getReg();
assert(!Op.getSubReg());
if (!TRI->isSGPRReg(*MRI, Reg))
return;
// Only visit each register once
if (!SeenRegs.insert(Reg).second)
return;
auto RegNumber = sgprNumber(Reg, *TRI);
if (!RegNumber)
return;
// Track SGPRs by pair -- numeric ID of a 64b SGPR pair,
// e.g. SGPR0 = SGPR0_SGPR1 = 0, SGPR3 = SGPR2_SGPR3 = 1, etc.
unsigned RegN = *RegNumber;
unsigned PairN = (RegN >> 1) & 0x3f;
// A read or write of an untracked register is safe, but any new VALU reads
// must be recorded.
if (!State.Tracked[PairN]) {
if (IsVALU && IsUse)
State.Tracked.set(PairN);
return;
}
uint8_t SGPRCount =
AMDGPU::getRegBitWidth(*TRI->getRegClassForReg(*MRI, Reg)) / 32;
if (IsUse) {
// SALU reading SGPR clears VALU hazards
if (IsSALU) {
if (isVCC(Reg)) {
if (State.VCCHazard & HazardState::VALU)
State.VCCHazard = HazardState::None;
} else {
State.VALUHazards.reset();
}
}
// Compute required waits
for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
Wait |= State.SALUHazards[RegN + RegIdx] ? WA_SALU : 0;
Wait |= IsVALU && State.VALUHazards[RegN + RegIdx] ? WA_VALU : 0;
}
if (isVCC(Reg) && State.VCCHazard) {
// Note: it's possible for both SALU and VALU to exist if VCC
// was updated differently by merged predecessors.
if (State.VCCHazard & HazardState::SALU)
Wait |= WA_SALU;
if (State.VCCHazard & HazardState::VALU)
Wait |= WA_VCC;
}
} else {
// Update hazards
if (isVCC(Reg)) {
State.VCCHazard = IsSALU ? HazardState::SALU : HazardState::VALU;
} else {
for (uint8_t RegIdx = 0; RegIdx < SGPRCount; ++RegIdx) {
if (IsSALU)
State.SALUHazards.set(RegN + RegIdx);
else
State.VALUHazards.set(RegN + RegIdx);
}
}
}
};
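// Calls, returns, and indirect branches transfer control away from the
// current hazard chain; S_ENDPGM(_SAVED) is excluded since the wave
// terminates instead of branching.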
const bool IsSetPC =
(MI->isCall() || MI->isReturn() || MI->isIndirectBranch()) &&
MI->getOpcode() != AMDGPU::S_ENDPGM &&
MI->getOpcode() != AMDGPU::S_ENDPGM_SAVED;
// Only consider implicit VCC operands that are specified by the instruction
// descriptor; other implicit operands are skipped when scanning uses/defs.
const bool HasImplicitVCC =
llvm::any_of(MI->getDesc().implicit_uses(),
[](MCPhysReg Reg) { return isVCC(Reg); }) ||
llvm::any_of(MI->getDesc().implicit_defs(),
[](MCPhysReg Reg) { return isVCC(Reg); });
if (IsSetPC) {
// All SGPR writes before a call/return must be flushed as the
// callee/caller will not see the hazard chain.
if (State.VCCHazard & HazardState::VALU)
Wait |= WA_VCC;
if (State.SALUHazards.any() || (State.VCCHazard & HazardState::SALU))
Wait |= WA_SALU;
if (State.VALUHazards.any())
Wait |= WA_VALU;
if (CullSGPRHazardsOnFunctionBoundary && State.Tracked.any()) {
State.Tracked.reset();
if (Emit)
insertHazardCull(MBB, MI);
}
} else {
// Process uses to determine required wait.
SeenRegs.clear();
for (const MachineOperand &Op : MI->all_uses()) {
if (Op.isImplicit() &&
(!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
continue;
processOperand(Op, true);
}
}
// Apply wait
if (Wait) {
unsigned Mask = 0xffff;
if (Wait & WA_VCC) {
State.VCCHazard &= ~HazardState::VALU;
Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
}
if (Wait & WA_SALU) {
State.SALUHazards.reset();
State.VCCHazard &= ~HazardState::SALU;
Mask = AMDGPU::DepCtr::encodeFieldSaSdst(Mask, 0);
}
if (Wait & WA_VALU) {
State.VALUHazards.reset();
Mask = AMDGPU::DepCtr::encodeFieldVaSdst(Mask, 0);
}
if (Emit) {
if (!mergeConsecutiveWaitAlus(MI, Mask)) {
auto NewMI = BuildMI(MBB, MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(Mask);
updateGetPCBundle(NewMI);
}
Emitted = true;
}
}
// On return from a call the SGPR state is unknown, so treat all SGPRs as
// potential hazards.
if (MI->isCall() && !CullSGPRHazardsOnFunctionBoundary)
State.Tracked.set();
// Update hazards based on defs.
SeenRegs.clear();
for (const MachineOperand &Op : MI->all_defs()) {
if (Op.isImplicit() &&
(!HasImplicitVCC || !Op.isReg() || !isVCC(Op.getReg())))
continue;
processOperand(Op, false);
}
}
BlockHazardState &BS = BlockState[&MBB];
bool Changed = State != BS.Out;
if (Emit) {
assert(!Changed && "Hazard state should not change on emit pass");
return Emitted;
}
if (Changed)
BS.Out = State;
return Changed;
}
bool run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.hasVALUReadSGPRHazard())
return false;
// Parse settings
EnableSGPRHazardWaits = GlobalEnableSGPRHazardWaits;
CullSGPRHazardsOnFunctionBoundary = GlobalCullSGPRHazardsOnFunctionBoundary;
CullSGPRHazardsAtMemWait = GlobalCullSGPRHazardsAtMemWait;
CullSGPRHazardsMemWaitThreshold = GlobalCullSGPRHazardsMemWaitThreshold;
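// Explicit command-line flags take precedence; otherwise per-function
// attributes may override the defaults.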
if (!GlobalEnableSGPRHazardWaits.getNumOccurrences())
EnableSGPRHazardWaits = MF.getFunction().getFnAttributeAsParsedInteger(
"amdgpu-sgpr-hazard-wait", EnableSGPRHazardWaits);
if (!GlobalCullSGPRHazardsOnFunctionBoundary.getNumOccurrences())
CullSGPRHazardsOnFunctionBoundary =
MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-boundary-cull");
if (!GlobalCullSGPRHazardsAtMemWait.getNumOccurrences())
CullSGPRHazardsAtMemWait =
MF.getFunction().hasFnAttribute("amdgpu-sgpr-hazard-mem-wait-cull");
if (!GlobalCullSGPRHazardsMemWaitThreshold.getNumOccurrences())
CullSGPRHazardsMemWaitThreshold =
MF.getFunction().getFnAttributeAsParsedInteger(
"amdgpu-sgpr-hazard-mem-wait-cull-threshold",
CullSGPRHazardsMemWaitThreshold);
// Bail if disabled
if (!EnableSGPRHazardWaits)
return false;
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
MRI = &MF.getRegInfo();
DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
auto CallingConv = MF.getFunction().getCallingConv();
if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
!CullSGPRHazardsOnFunctionBoundary) {
// Callee must consider all SGPRs as tracked.
LLVM_DEBUG(dbgs() << "Is called function, track all SGPRs.\n");
MachineBasicBlock &EntryBlock = MF.front();
BlockState[&EntryBlock].In.Tracked.set();
}
// Calculate the hazard state for each basic block.
// Iterate until a fixed point is reached.
// A fixed point is guaranteed because the merge function only ever grows
// the hazard set, and every backedge causes a merge.
//
// Note: we have to take care of the entry block as this technically
// has an edge from outside the function. Failure to treat this as
// a merge could prevent fixed point being reached.
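// Seed every block; inserting in reverse layout order means pop_back_val
// initially visits blocks in layout order, starting from the entry block.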
SetVector<MachineBasicBlock *> Worklist;
for (auto &MBB : reverse(MF))
Worklist.insert(&MBB);
while (!Worklist.empty()) {
auto &MBB = *Worklist.pop_back_val();
bool Changed = runOnMachineBasicBlock(MBB, false);
if (Changed) {
// Note: take a copy of the state here since inserting successors into the
// map below may reallocate it and invalidate references.
HazardState NewState = BlockState[&MBB].Out;
// Propagate to all successor blocks
for (auto Succ : MBB.successors()) {
// We only need to merge hazards at CFG merge points.
auto &SuccState = BlockState[Succ];
if (Succ->getSinglePredecessor() && !Succ->isEntryBlock()) {
if (SuccState.In != NewState) {
SuccState.In = NewState;
Worklist.insert(Succ);
}
} else if (SuccState.In.merge(NewState)) {
Worklist.insert(Succ);
}
}
}
}
LLVM_DEBUG(dbgs() << "Emit s_wait_alu instructions\n");
// Final pass over the function to emit the wait instructions.
bool Changed = false;
for (auto &MBB : MF)
Changed |= runOnMachineBasicBlock(MBB, true);
BlockState.clear();
return Changed;
}
};
class AMDGPUWaitSGPRHazardsLegacy : public MachineFunctionPass {
public:
static char ID;
AMDGPUWaitSGPRHazardsLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
return AMDGPUWaitSGPRHazards().run(MF);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // namespace
char AMDGPUWaitSGPRHazardsLegacy::ID = 0;
char &llvm::AMDGPUWaitSGPRHazardsLegacyID = AMDGPUWaitSGPRHazardsLegacy::ID;
INITIALIZE_PASS(AMDGPUWaitSGPRHazardsLegacy, DEBUG_TYPE,
"AMDGPU Insert waits for SGPR read hazards", false, false)
PreservedAnalyses
AMDGPUWaitSGPRHazardsPass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
if (AMDGPUWaitSGPRHazards().run(MF))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
}