//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode (strict or non-strict)
/// for pixel shaders, and strict whole wavefront mode for all programs.
///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may be disabled by control flow decisions.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
/// S_MOV_B64 LiveMask, EXEC
/// S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
/// ...
/// S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires strict whole
/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
///
/// S_OR_SAVEEXEC_B64 Tmp, -1
/// ...
/// S_MOV_B64 EXEC, Tmp
///
/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
/// we use a similar save and restore mechanism and force whole quad mode for
/// those instructions:
///
/// S_MOV_B64 Tmp, EXEC
/// S_WQM_B64 EXEC, EXEC
/// ...
/// S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
/// (1) at the top level (outside of control flow statements, and as long as
/// kill hasn't been used), one SGPR can be saved by recovering WQM from
/// the LiveMask (this is implemented for the entry block).
///
/// (2) when entire regions (e.g. if-else blocks or entire loops) only
/// consist of exact and don't-care instructions, the switch only has to
/// be done at the entry and exit points rather than potentially in each
/// block of the region.
///
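/// As a simplified illustration (not the exact output of this pass), a pixel
/// shader that samples an image and then stores the result could be
/// transformed roughly as follows:
///
///    S_MOV_B64 LiveMask, EXEC
///    S_WQM_B64 EXEC, EXEC
///    IMAGE_SAMPLE ...                 ; runs in WQM (needs derivatives)
///    S_AND_B64 EXEC, EXEC, LiveMask
///    BUFFER_STORE_DWORD ...           ; runs in Exact (side effect)
///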
//===----------------------------------------------------------------------===//
#include "SIWholeQuadMode.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "si-wqm"
namespace {
enum {
StateWQM = 0x1,
StateStrictWWM = 0x2,
StateStrictWQM = 0x4,
StateExact = 0x8,
StateStrict = StateStrictWWM | StateStrictWQM,
};
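// The state flags are combined as bitmasks. In InstrInfo::Needs they record
// which states an instruction requires, whereas the per-instruction "Needs"
// value computed in processBlock records the set of acceptable states, e.g.
// (StateExact | StateWQM) means either mode is fine.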
struct PrintState {
public:
int State;
explicit PrintState(int State) : State(State) {}
};
#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
static const std::pair<char, const char *> Mapping[] = {
std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
char State = PS.State;
for (auto M : Mapping) {
if (State & M.first) {
OS << M.second;
State &= ~M.first;
if (State)
OS << '|';
}
}
assert(State == 0);
return OS;
}
#endif
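/// Per-instruction analysis results.
/// Needs: states this instruction must execute in.
/// Disabled: states this instruction must not execute in.
/// OutNeeds: states required somewhere on a path after this instruction.
/// MarkedStates: every state ever requested for this instruction, including
/// requests that were suppressed by Disabled.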
struct InstrInfo {
char Needs = 0;
char Disabled = 0;
char OutNeeds = 0;
char MarkedStates = 0;
};
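/// Per-block analysis results.
/// Needs: states required by instructions within the block.
/// InNeeds/OutNeeds: states required on entry to / exit from the block.
/// InitialState: execution state at the top of the block, recorded by
/// processBlock and consumed by lowerBlock.
/// NeedsLowering: the block contains kill/demote or V_SET_INACTIVE
/// instructions that must be rewritten by lowerBlock.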
struct BlockInfo {
char Needs = 0;
char InNeeds = 0;
char OutNeeds = 0;
char InitialState = 0;
bool NeedsLowering = false;
};
struct WorkItem {
MachineBasicBlock *MBB = nullptr;
MachineInstr *MI = nullptr;
WorkItem() = default;
WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
WorkItem(MachineInstr *MI) : MI(MI) {}
};
class SIWholeQuadMode {
public:
SIWholeQuadMode(MachineFunction &MF, LiveIntervals *LIS,
MachineDominatorTree *MDT, MachinePostDominatorTree *PDT)
: ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
TRI(&TII->getRegisterInfo()), MRI(&MF.getRegInfo()), LIS(LIS), MDT(MDT),
PDT(PDT) {}
bool run(MachineFunction &MF);
private:
const GCNSubtarget *ST;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
MachineDominatorTree *MDT;
MachinePostDominatorTree *PDT;
unsigned AndOpc;
unsigned AndTermOpc;
unsigned AndN2Opc;
unsigned XorOpc;
unsigned AndSaveExecOpc;
unsigned AndSaveExecTermOpc;
unsigned WQMOpc;
Register Exec;
Register LiveMaskReg;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
MapVector<MachineBasicBlock *, BlockInfo> Blocks;
// Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
DenseMap<const MachineInstr *, char> StateTransition;
SmallVector<MachineInstr *, 2> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallSetVector<MachineInstr *, 4> LowerToCopyInstrs;
SmallVector<MachineInstr *, 4> KillInstrs;
SmallVector<MachineInstr *, 4> InitExecInstrs;
SmallVector<MachineInstr *, 4> SetInactiveInstrs;
void printInfo();
void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
std::vector<WorkItem> &Worklist);
void markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
char analyzeFunction(MachineFunction &MF);
MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before);
MachineBasicBlock::iterator
prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
MachineBasicBlock::iterator Last, bool PreferLast,
bool SaveSCC);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
Register SaveWQM);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
Register SavedWQM);
void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
Register SaveOrig, char StrictStateNeeded);
void fromStrictMode(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before, Register SavedOrig,
char NonStrictState, char CurrentStrictState);
void splitBlock(MachineInstr *TermMI);
MachineInstr *lowerKillI1(MachineInstr &MI, bool IsWQM);
MachineInstr *lowerKillF32(MachineInstr &MI);
void lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI);
void processBlock(MachineBasicBlock &MBB, BlockInfo &BI, bool IsEntry);
bool lowerLiveMaskQueries();
bool lowerCopyInstrs();
bool lowerKillInstrs(bool IsWQM);
void lowerInitExec(MachineInstr &MI);
MachineBasicBlock::iterator lowerInitExecInstrs(MachineBasicBlock &Entry,
bool &Changed);
};
class SIWholeQuadModeLegacy : public MachineFunctionPass {
public:
static char ID;
SIWholeQuadModeLegacy() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override { return "SI Whole Quad Mode"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervalsWrapperPass>();
AU.addPreserved<SlotIndexesWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.addPreserved<MachinePostDominatorTreeWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
MachineFunctionProperties getClearedProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::IsSSA);
}
};
} // end anonymous namespace
char SIWholeQuadModeLegacy::ID = 0;
INITIALIZE_PASS_BEGIN(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SIWholeQuadModeLegacy, DEBUG_TYPE, "SI Whole Quad Mode",
false, false)
char &llvm::SIWholeQuadModeID = SIWholeQuadModeLegacy::ID;
FunctionPass *llvm::createSIWholeQuadModeLegacyPass() {
return new SIWholeQuadModeLegacy;
}
#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
for (const auto &BII : Blocks) {
dbgs() << "\n"
<< printMBBReference(*BII.first) << ":\n"
<< " InNeeds = " << PrintState(BII.second.InNeeds)
<< ", Needs = " << PrintState(BII.second.Needs)
<< ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
for (const MachineInstr &MI : *BII.first) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
<< ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
}
}
}
}
#endif
void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];
assert(!(Flag & StateExact) && Flag != 0);
// Capture all states requested in marking including disabled ones.
II.MarkedStates |= Flag;
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by an instruction that requires WQM, where
// ignoring the request for WQM is correct as per the relevant specs.
Flag &= ~II.Disabled;
// Ignore if the flag is already encompassed by the existing needs, or we
// just disabled everything.
if ((II.Needs & Flag) == Flag)
return;
LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
II.Needs |= Flag;
Worklist.emplace_back(&MI);
}
/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
Register Reg, unsigned SubReg, char Flag,
std::vector<WorkItem> &Worklist) {
LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
const VNInfo *Value = UseLRQ.valueIn();
if (!Value)
return;
// Note: this code assumes that lane masks on AMDGPU completely
// cover registers.
const LaneBitmask UseLanes =
SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
: (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
: LaneBitmask::getNone());
// Perform a depth-first iteration of the LiveRange graph marking defs.
// Stop processing of a given branch when all use lanes have been defined.
// The first definition stops processing for a physical register.
struct PhiEntry {
const VNInfo *Phi;
unsigned PredIdx;
LaneBitmask DefinedLanes;
PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
: Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
};
using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
SmallVector<PhiEntry, 2> PhiStack;
SmallSet<VisitKey, 4> Visited;
LaneBitmask DefinedLanes;
unsigned NextPredIdx = 0; // Only used for processing phi nodes
do {
const VNInfo *NextValue = nullptr;
const VisitKey Key(Value, DefinedLanes);
if (Visited.insert(Key).second) {
// On the first visit to a phi, start processing from the first predecessor
NextPredIdx = 0;
}
if (Value->isPHIDef()) {
// Each predecessor node in the phi must be processed as a subgraph
const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
assert(MBB && "Phi-def has no defining MBB");
// Find next predecessor to process
unsigned Idx = NextPredIdx;
const auto *PI = MBB->pred_begin() + Idx;
const auto *PE = MBB->pred_end();
for (; PI != PE && !NextValue; ++PI, ++Idx) {
if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
if (!Visited.count(VisitKey(VN, DefinedLanes)))
NextValue = VN;
}
}
// If there are more predecessors to process, add the phi to the stack
if (PI != PE)
PhiStack.emplace_back(Value, Idx, DefinedLanes);
} else {
MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
assert(MI && "Def has no defining instruction");
if (Reg.isVirtual()) {
// Iterate over all operands to find relevant definitions
bool HasDef = false;
for (const MachineOperand &Op : MI->all_defs()) {
if (Op.getReg() != Reg)
continue;
// Compute lanes defined and overlap with use
LaneBitmask OpLanes =
Op.isUndef() ? LaneBitmask::getAll()
: TRI->getSubRegIndexLaneMask(Op.getSubReg());
LaneBitmask Overlap = (UseLanes & OpLanes);
// Record if this instruction defined any lanes of the use
HasDef |= Overlap.any();
// Mark any lanes defined
DefinedLanes |= OpLanes;
}
// Check if all lanes of use have been defined
if ((DefinedLanes & UseLanes) != UseLanes) {
// Definition not complete; need to process input value
LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
if (const VNInfo *VN = LRQ.valueIn()) {
if (!Visited.count(VisitKey(VN, DefinedLanes)))
NextValue = VN;
}
}
// Only mark the instruction if it defines some part of the use
if (HasDef)
markInstruction(*MI, Flag, Worklist);
} else {
// For physical registers simply mark the defining instruction
markInstruction(*MI, Flag, Worklist);
}
}
if (!NextValue && !PhiStack.empty()) {
// Reached the end of a chain; revert to processing the last phi
PhiEntry &Entry = PhiStack.back();
NextValue = Entry.Phi;
NextPredIdx = Entry.PredIdx;
DefinedLanes = Entry.DefinedLanes;
PhiStack.pop_back();
}
Value = NextValue;
} while (Value);
}
void SIWholeQuadMode::markOperand(const MachineInstr &MI,
const MachineOperand &Op, char Flag,
std::vector<WorkItem> &Worklist) {
assert(Op.isReg());
Register Reg = Op.getReg();
// Ignore some hardware registers
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
return;
default:
break;
}
LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
<< " for " << MI);
if (Reg.isVirtual()) {
LiveRange &LR = LIS->getInterval(Reg);
markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
} else {
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
LiveRange &LR = LIS->getRegUnit(Unit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
if (Value)
markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
}
}
}
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {
LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
<< MI);
for (const MachineOperand &Use : MI.all_uses())
markOperand(MI, Use, Flag, Worklist);
}
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
SmallVector<MachineInstr *, 4> SoftWQMInstrs;
bool HasImplicitDerivatives =
MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
// We need to visit the basic blocks in reverse post-order so that we visit
// defs before uses, in particular so that we don't accidentally mark an
// instruction as needing e.g. WQM before visiting it and realizing it needs
// WQM disabled.
ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
for (MachineBasicBlock *MBB : RPOT) {
BlockInfo &BBI = Blocks[MBB];
for (MachineInstr &MI : *MBB) {
InstrInfo &III = Instructions[&MI];
unsigned Opcode = MI.getOpcode();
char Flags = 0;
if (TII->isWQM(Opcode)) {
// If LOD is not supported, WQM is not needed.
// Only generate implicit WQM if implicit derivatives are required.
// This avoids inserting unintended WQM if a shader type without
// implicit derivatives uses an image sampling instruction.
if (ST->hasExtendedImageInsts() && HasImplicitDerivatives) {
// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.
markInstructionUses(MI, StateWQM, Worklist);
GlobalFlags |= StateWQM;
}
} else if (Opcode == AMDGPU::WQM) {
// The WQM intrinsic requires its output to have all the helper lanes
// correct, so we need it to be in WQM.
Flags = StateWQM;
LowerToCopyInstrs.insert(&MI);
} else if (Opcode == AMDGPU::SOFT_WQM) {
LowerToCopyInstrs.insert(&MI);
SoftWQMInstrs.push_back(&MI);
} else if (Opcode == AMDGPU::STRICT_WWM) {
// The STRICT_WWM intrinsic doesn't make the same guarantee; in addition,
// it needs to be executed in WQM or Exact so that its copy doesn't
// clobber inactive lanes.
markInstructionUses(MI, StateStrictWWM, Worklist);
GlobalFlags |= StateStrictWWM;
LowerToMovInstrs.push_back(&MI);
} else if (Opcode == AMDGPU::STRICT_WQM ||
TII->isDualSourceBlendEXP(MI)) {
// STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
// threads of the wave, it enables all threads in quads that have at
// least one active thread.
markInstructionUses(MI, StateStrictWQM, Worklist);
GlobalFlags |= StateStrictWQM;
if (Opcode == AMDGPU::STRICT_WQM) {
LowerToMovInstrs.push_back(&MI);
} else {
// A dual source blend export acts as implicit strict-wqm: its sources
// need to be shuffled in strict WQM, but the export itself needs to
// run in exact mode.
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
BBI.InNeeds |= StateExact;
Worklist.emplace_back(MBB);
}
GlobalFlags |= StateExact;
III.Disabled = StateWQM | StateStrict;
}
} else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
Opcode == AMDGPU::DS_PARAM_LOAD ||
Opcode == AMDGPU::LDS_DIRECT_LOAD ||
Opcode == AMDGPU::DS_DIRECT_LOAD) {
// Mark these as StrictWQM, but only for the instruction, not its operands.
// This avoids unnecessarily marking M0 as requiring WQM.
III.Needs |= StateStrictWQM;
GlobalFlags |= StateStrictWQM;
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
// Disable strict states; StrictWQM will be added as required later.
III.Disabled = StateStrict;
MachineOperand &Inactive = MI.getOperand(4);
if (Inactive.isReg()) {
if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
LowerToCopyInstrs.insert(&MI);
else
markOperand(MI, Inactive, StateStrictWWM, Worklist);
}
SetInactiveInstrs.push_back(&MI);
BBI.NeedsLowering = true;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
BBI.InNeeds |= StateExact;
Worklist.emplace_back(MBB);
}
GlobalFlags |= StateExact;
III.Disabled = StateWQM | StateStrict;
} else if (Opcode == AMDGPU::SI_PS_LIVE ||
Opcode == AMDGPU::SI_LIVE_MASK) {
LiveMaskQueries.push_back(&MI);
} else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
Opcode == AMDGPU::SI_DEMOTE_I1) {
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are
// only used, outputs are only defined.
// FIXME: is this still valid?
for (const MachineOperand &MO : MI.defs()) {
Register Reg = MO.getReg();
if (Reg.isPhysical() &&
TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
Flags = StateWQM;
break;
}
}
}
if (Flags) {
markInstruction(MI, Flags, Worklist);
GlobalFlags |= Flags;
}
}
}
// Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
// ever used anywhere in the function. This implements the corresponding
// semantics of @llvm.amdgcn.set.inactive.
// Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
if (GlobalFlags & StateWQM) {
for (MachineInstr *MI : SetInactiveInstrs)
markInstruction(*MI, StateWQM, Worklist);
for (MachineInstr *MI : SoftWQMInstrs)
markInstruction(*MI, StateWQM, Worklist);
}
return GlobalFlags;
}
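/// Propagate the state requirements of a single instruction: into the
/// containing block, backwards to the preceding instruction, and to the
/// instructions that define its uses.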
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
std::vector<WorkItem>& Worklist) {
MachineBasicBlock *MBB = MI.getParent();
InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
BlockInfo &BI = Blocks[MBB];
// Control flow-type instructions and stores to temporary memory that are
// followed by WQM computations must themselves be in WQM.
if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
(MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
Instructions[&MI].Needs = StateWQM;
II.Needs = StateWQM;
}
// Propagate to block level
if (II.Needs & StateWQM) {
BI.Needs |= StateWQM;
if (!(BI.InNeeds & StateWQM)) {
BI.InNeeds |= StateWQM;
Worklist.emplace_back(MBB);
}
}
// Propagate backwards within block
if (MachineInstr *PrevMI = MI.getPrevNode()) {
char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
PrevII.OutNeeds |= InNeeds;
Worklist.emplace_back(PrevMI);
}
}
}
// Propagate WQM flag to instruction inputs
assert(!(II.Needs & StateExact));
if (II.Needs != 0)
markInstructionUses(MI, II.Needs, Worklist);
// Ensure we process a block containing StrictWWM/StrictWQM, even if it does
// not require any WQM transitions.
if (II.Needs & StateStrictWWM)
BI.Needs |= StateStrictWWM;
if (II.Needs & StateStrictWQM)
BI.Needs |= StateStrictWQM;
}
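/// Propagate a block's requirements across the CFG: into the block's last
/// instruction, to predecessors (which must provide this block's InNeeds on
/// exit) and to successors (which must accept this block's OutNeeds on entry).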
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
std::vector<WorkItem>& Worklist) {
BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
// Propagate through instructions
if (!MBB.empty()) {
MachineInstr *LastMI = &*MBB.rbegin();
InstrInfo &LastII = Instructions[LastMI];
if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
LastII.OutNeeds |= BI.OutNeeds;
Worklist.emplace_back(LastMI);
}
}
// Predecessor blocks must provide for our WQM/Exact needs.
for (MachineBasicBlock *Pred : MBB.predecessors()) {
BlockInfo &PredBI = Blocks[Pred];
if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
continue;
PredBI.OutNeeds |= BI.InNeeds;
PredBI.InNeeds |= BI.InNeeds;
Worklist.emplace_back(Pred);
}
// All successors must be prepared to accept the same set of WQM/Exact data.
for (MachineBasicBlock *Succ : MBB.successors()) {
BlockInfo &SuccBI = Blocks[Succ];
if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
continue;
SuccBI.InNeeds |= BI.OutNeeds;
Worklist.emplace_back(Succ);
}
}
char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
std::vector<WorkItem> Worklist;
char GlobalFlags = scanInstructions(MF, Worklist);
while (!Worklist.empty()) {
WorkItem WI = Worklist.back();
Worklist.pop_back();
if (WI.MI)
propagateInstruction(*WI.MI, Worklist);
else
propagateBlock(*WI.MBB, Worklist);
}
return GlobalFlags;
}
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before) {
Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MachineInstr *Save =
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
.addReg(AMDGPU::SCC);
MachineInstr *Restore =
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
.addReg(SaveReg);
LIS->InsertMachineInstrInMaps(*Save);
LIS->InsertMachineInstrInMaps(*Restore);
LIS->createAndComputeVirtRegInterval(SaveReg);
return Restore;
}
void SIWholeQuadMode::splitBlock(MachineInstr *TermMI) {
MachineBasicBlock *BB = TermMI->getParent();
LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
<< *TermMI << "\n");
MachineBasicBlock *SplitBB =
BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
// Convert last instruction in block to a terminator.
// Note: this only covers the expected patterns
unsigned NewOpcode = 0;
switch (TermMI->getOpcode()) {
case AMDGPU::S_AND_B32:
NewOpcode = AMDGPU::S_AND_B32_term;
break;
case AMDGPU::S_AND_B64:
NewOpcode = AMDGPU::S_AND_B64_term;
break;
case AMDGPU::S_MOV_B32:
NewOpcode = AMDGPU::S_MOV_B32_term;
break;
case AMDGPU::S_MOV_B64:
NewOpcode = AMDGPU::S_MOV_B64_term;
break;
case AMDGPU::S_ANDN2_B32:
NewOpcode = AMDGPU::S_ANDN2_B32_term;
break;
case AMDGPU::S_ANDN2_B64:
NewOpcode = AMDGPU::S_ANDN2_B64_term;
break;
default:
llvm_unreachable("Unexpected instruction");
}
// These terminators fall through to the next block, so there is no need to
// add an unconditional branch to the next block (SplitBB).
TermMI->setDesc(TII->get(NewOpcode));
if (SplitBB != BB) {
// Update dominator trees
using DomTreeT = DomTreeBase<MachineBasicBlock>;
SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
for (MachineBasicBlock *Succ : SplitBB->successors()) {
DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
}
DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
if (MDT)
MDT->applyUpdates(DTUpdates);
if (PDT)
PDT->applyUpdates(DTUpdates);
}
}
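/// Lower SI_KILL_F32_COND_IMM_TERMINATOR: compute the killed lanes in VCC
/// using an inverted comparison, clear them from the live mask and from EXEC,
/// and emit an early-terminate check for the case where no lanes remain live.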
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineInstr &MI) {
assert(LiveMaskReg.isVirtual());
const DebugLoc &DL = MI.getDebugLoc();
unsigned Opcode = 0;
assert(MI.getOperand(0).isReg());
// Comparison is for live lanes; however here we compute the inverse
// (killed lanes). This is because VCMP will always generate 0 bits
// for inactive lanes so a mask of live lanes would not be correct
// inside control flow.
// Invert the comparison by swapping the operands and adjusting
// the comparison codes.
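// For example, a kill whose live condition is SETLT (lanes with Op0 < Op1
// stay alive) is lowered to V_CMP_NGT_F32 applied to (Op1, Op0): with the
// operands swapped, NGT computes !(Op1 > Op0), which is true exactly for the
// lanes where Op0 < Op1 does not hold (including NaN inputs), i.e. the
// killed lanes.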
switch (MI.getOperand(2).getImm()) {
case ISD::SETUEQ:
Opcode = AMDGPU::V_CMP_LG_F32_e64;
break;
case ISD::SETUGT:
Opcode = AMDGPU::V_CMP_GE_F32_e64;
break;
case ISD::SETUGE:
Opcode = AMDGPU::V_CMP_GT_F32_e64;
break;
case ISD::SETULT:
Opcode = AMDGPU::V_CMP_LE_F32_e64;
break;
case ISD::SETULE:
Opcode = AMDGPU::V_CMP_LT_F32_e64;
break;
case ISD::SETUNE:
Opcode = AMDGPU::V_CMP_EQ_F32_e64;
break;
case ISD::SETO:
Opcode = AMDGPU::V_CMP_O_F32_e64;
break;
case ISD::SETUO:
Opcode = AMDGPU::V_CMP_U_F32_e64;
break;
case ISD::SETOEQ:
case ISD::SETEQ:
Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
break;
case ISD::SETOGT:
case ISD::SETGT:
Opcode = AMDGPU::V_CMP_NLT_F32_e64;
break;
case ISD::SETOGE:
case ISD::SETGE:
Opcode = AMDGPU::V_CMP_NLE_F32_e64;
break;
case ISD::SETOLT:
case ISD::SETLT:
Opcode = AMDGPU::V_CMP_NGT_F32_e64;
break;
case ISD::SETOLE:
case ISD::SETLE:
Opcode = AMDGPU::V_CMP_NGE_F32_e64;
break;
case ISD::SETONE:
case ISD::SETNE:
Opcode = AMDGPU::V_CMP_NLG_F32_e64;
break;
default:
llvm_unreachable("invalid ISD:SET cond code");
}
MachineBasicBlock &MBB = *MI.getParent();
// Pick opcode based on comparison type.
MachineInstr *VcmpMI;
const MachineOperand &Op0 = MI.getOperand(0);
const MachineOperand &Op1 = MI.getOperand(1);
// VCC represents lanes killed.
Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
if (TRI->isVGPR(*MRI, Op0.getReg())) {
Opcode = AMDGPU::getVOPe32(Opcode);
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
} else {
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
.addReg(VCC, RegState::Define)
.addImm(0) // src0 modifiers
.add(Op1)
.addImm(0) // src1 modifiers
.add(Op0)
.addImm(0); // omod
}
MachineInstr *MaskUpdateMI =
BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
.addReg(VCC);
// The state of SCC represents whether any lanes are live in the mask;
// if SCC is 0 then no lanes will be alive anymore.
MachineInstr *EarlyTermMI =
BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
MachineInstr *ExecMaskMI =
BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
assert(MBB.succ_size() == 1);
// Update live intervals
LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
MBB.remove(&MI);
LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
LIS->InsertMachineInstrInMaps(*EarlyTermMI);
LIS->InsertMachineInstrInMaps(*ExecMaskMI);
return ExecMaskMI;
}
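/// Lower SI_KILL_I1_TERMINATOR and SI_DEMOTE_I1: update the live mask
/// according to the kill condition, emit an early-terminate check, and
/// rewrite EXEC to deactivate the killed lanes (for demote, quads that only
/// contain helper lanes).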
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineInstr &MI, bool IsWQM) {
assert(LiveMaskReg.isVirtual());
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineInstr *MaskUpdateMI = nullptr;
const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
const MachineOperand &Op = MI.getOperand(0);
int64_t KillVal = MI.getOperand(1).getImm();
MachineInstr *ComputeKilledMaskMI = nullptr;
Register CndReg = !Op.isImm() ? Op.getReg() : Register();
Register TmpReg;
// Is this a static or dynamic kill?
if (Op.isImm()) {
if (Op.getImm() == KillVal) {
// Static: all active lanes are killed
MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
.addReg(Exec);
} else {
// Static: kill does nothing
bool IsLastTerminator = std::next(MI.getIterator()) == MBB.end();
if (!IsLastTerminator) {
LIS->RemoveMachineInstrFromMaps(MI);
} else {
assert(MBB.succ_size() == 1 && MI.getOpcode() != AMDGPU::SI_DEMOTE_I1);
MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
.addMBB(*MBB.succ_begin());
LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
}
MBB.remove(&MI);
return nullptr;
}
} else {
if (!KillVal) {
// Op represents live lanes after kill,
// so exec mask needs to be factored in.
TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
ComputeKilledMaskMI =
BuildMI(MBB, MI, DL, TII->get(AndN2Opc), TmpReg).addReg(Exec).add(Op);
MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
.addReg(TmpReg);
} else {
// Op represents lanes to kill
MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
.add(Op);
}
}
// The state of SCC represents whether any lanes are live in the mask;
// if SCC is 0 then no lanes will be alive anymore.
MachineInstr *EarlyTermMI =
BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
// If we got this far, some lanes are still live;
// update EXEC to deactivate lanes as appropriate.
MachineInstr *NewTerm;
MachineInstr *WQMMaskMI = nullptr;
Register LiveMaskWQM;
if (IsDemote) {
// Demote - deactivate quads with only helper lanes
LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
WQMMaskMI =
BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
.addReg(Exec)
.addReg(LiveMaskWQM);
} else {
// Kill - deactivate lanes no longer in live mask
if (Op.isImm()) {
unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
} else if (!IsWQM) {
NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
.addReg(Exec)
.addReg(LiveMaskReg);
} else {
unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
NewTerm =
BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
}
}
// Update live intervals
LIS->RemoveMachineInstrFromMaps(MI);
MBB.remove(&MI);
assert(EarlyTermMI);
assert(MaskUpdateMI);
assert(NewTerm);
if (ComputeKilledMaskMI)
LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
LIS->InsertMachineInstrInMaps(*EarlyTermMI);
if (WQMMaskMI)
LIS->InsertMachineInstrInMaps(*WQMMaskMI);
LIS->InsertMachineInstrInMaps(*NewTerm);
if (CndReg) {
LIS->removeInterval(CndReg);
LIS->createAndComputeVirtRegInterval(CndReg);
}
if (TmpReg)
LIS->createAndComputeVirtRegInterval(TmpReg);
if (LiveMaskWQM)
LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
return NewTerm;
}
// Replace (or supplement) instructions accessing the live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB, BlockInfo &BI) {
if (!BI.NeedsLowering)
return;
LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
SmallVector<MachineInstr *, 4> SplitPoints;
Register ActiveLanesReg = 0;
char State = BI.InitialState;
for (MachineInstr &MI : llvm::make_early_inc_range(
llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
auto MIState = StateTransition.find(&MI);
if (MIState != StateTransition.end())
State = MIState->second;
MachineInstr *SplitPoint = nullptr;
switch (MI.getOpcode()) {
case AMDGPU::SI_DEMOTE_I1:
case AMDGPU::SI_KILL_I1_TERMINATOR:
SplitPoint = lowerKillI1(MI, State == StateWQM);
break;
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(MI);
break;
case AMDGPU::ENTER_STRICT_WWM:
ActiveLanesReg = MI.getOperand(0).getReg();
break;
case AMDGPU::EXIT_STRICT_WWM:
ActiveLanesReg = 0;
break;
case AMDGPU::V_SET_INACTIVE_B32:
if (ActiveLanesReg) {
LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
MRI->constrainRegClass(ActiveLanesReg, TRI->getWaveMaskRegClass());
MI.getOperand(5).setReg(ActiveLanesReg);
LIS->shrinkToUses(&LI);
} else {
assert(State == StateExact || State == StateWQM);
}
break;
default:
break;
}
if (SplitPoint)
SplitPoints.push_back(SplitPoint);
}
// Perform splitting after instruction scan to simplify iteration.
for (MachineInstr *MI : SplitPoints)
splitBlock(MI);
}
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
if (!SaveSCC)
return PreferLast ? Last : First;
LiveRange &LR =
LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
auto MBBE = MBB.end();
SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
: LIS->getMBBEndIdx(&MBB);
SlotIndex LastIdx =
Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
const LiveRange::Segment *S;
for (;;) {
S = LR.getSegmentContaining(Idx);
if (!S)
break;
if (PreferLast) {
SlotIndex Next = S->start.getBaseIndex();
if (Next < FirstIdx)
break;
Idx = Next;
} else {
MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
assert(EndMI && "Segment does not end on valid instruction");
auto NextI = std::next(EndMI->getIterator());
if (NextI == MBB.end())
break;
SlotIndex Next = LIS->getInstructionIndex(*NextI);
if (Next > LastIdx)
break;
Idx = Next;
}
}
MachineBasicBlock::iterator MBBI;
if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
MBBI = MI;
else {
assert(Idx == LIS->getMBBEndIdx(&MBB));
MBBI = MBB.end();
}
// Move insertion point past any operations modifying EXEC.
// This assumes that the value of SCC defined by any of these operations
// does not need to be preserved.
while (MBBI != Last) {
bool IsExecDef = false;
for (const MachineOperand &MO : MBBI->all_defs()) {
IsExecDef |=
MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
}
if (!IsExecDef)
break;
MBBI++;
S = nullptr;
}
if (S)
MBBI = saveSCC(MBB, MBBI);
return MBBI;
}
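/// Switch to Exact mode by ANDing EXEC with the live mask, optionally saving
/// the current (WQM) EXEC into \p SaveWQM so it can be restored later.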
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
Register SaveWQM) {
assert(LiveMaskReg.isVirtual());
bool IsTerminator = Before == MBB.end();
if (!IsTerminator) {
auto FirstTerm = MBB.getFirstTerminator();
if (FirstTerm != MBB.end()) {
SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
IsTerminator = BeforeIdx > FirstTermIdx;
}
}
MachineInstr *MI;
if (SaveWQM) {
unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
.addReg(LiveMaskReg);
} else {
unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
.addReg(Exec)
.addReg(LiveMaskReg);
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = StateExact;
}
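/// Switch to WQM, either by restoring a previously saved EXEC (\p SavedWQM)
/// or by recomputing whole quad mode from the current EXEC.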
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
Register SavedWQM) {
MachineInstr *MI;
if (SavedWQM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
.addReg(SavedWQM);
} else {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = StateWQM;
}
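/// Enter StrictWWM or StrictWQM, saving the current EXEC into \p SaveOrig via
/// the corresponding ENTER_STRICT_* pseudo instruction.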
void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
Register SaveOrig, char StrictStateNeeded) {
MachineInstr *MI;
assert(SaveOrig);
assert(StrictStateNeeded == StateStrictWWM ||
StrictStateNeeded == StateStrictWQM);
if (StrictStateNeeded == StateStrictWWM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
SaveOrig)
.addImm(-1);
} else {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
SaveOrig)
.addImm(-1);
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = StrictStateNeeded;
}
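/// Leave StrictWWM/StrictWQM, restoring EXEC from \p SavedOrig via the
/// corresponding EXIT_STRICT_* pseudo and recording the resulting non-strict
/// state.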
void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
Register SavedOrig, char NonStrictState,
char CurrentStrictState) {
MachineInstr *MI;
assert(SavedOrig);
assert(CurrentStrictState == StateStrictWWM ||
CurrentStrictState == StateStrictWQM);
if (CurrentStrictState == StateStrictWWM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
Exec)
.addReg(SavedOrig);
} else {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
Exec)
.addReg(SavedOrig);
}
LIS->InsertMachineInstrInMaps(*MI);
StateTransition[MI] = NonStrictState;
}
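/// Insert the EXEC manipulation needed so that every instruction in \p MBB
/// executes in one of its allowed states. Walks the block once, tracking the
/// current state and the earliest safe point at which a transition can be
/// inserted.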
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, BlockInfo &BI,
bool IsEntry) {
// This is a non-entry block that is WQM throughout, so no need to do
// anything.
if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
BI.InitialState = StateWQM;
return;
}
LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
<< ":\n");
Register SavedWQMReg;
Register SavedNonStrictReg;
bool WQMFromExec = IsEntry;
char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
char NonStrictState = 0;
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (IsEntry) {
// Skip the instruction that saves LiveMask
if (II != IE && II->getOpcode() == AMDGPU::COPY &&
II->getOperand(1).getReg() == TRI->getExec())
++II;
}
// This stores the first instruction where it's safe to switch from WQM to
// Exact or vice versa.
MachineBasicBlock::iterator FirstWQM = IE;
// This stores the first instruction where it's safe to switch from Strict
// mode to Exact/WQM or to switch to Strict mode. It must always be the same
// as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
// be safe to switch to/from WQM as well.
MachineBasicBlock::iterator FirstStrict = IE;
// Record the initial state in the block information.
BI.InitialState = State;
for (unsigned Idx = 0;; ++Idx) {
MachineBasicBlock::iterator Next = II;
char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
char OutNeeds = 0;
if (FirstWQM == IE)
FirstWQM = II;
if (FirstStrict == IE)
FirstStrict = II;
// Adjust needs if this is the first instruction of a WQM-requiring shader.
if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
Needs = StateWQM;
// First, figure out the allowed states (Needs) based on the propagated
// flags.
if (II != IE) {
MachineInstr &MI = *II;
if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
if (III->second.Needs & StateStrictWWM)
Needs = StateStrictWWM;
else if (III->second.Needs & StateStrictWQM)
Needs = StateStrictWQM;
else if (III->second.Needs & StateWQM)
Needs = StateWQM;
else
Needs &= ~III->second.Disabled;
OutNeeds = III->second.OutNeeds;
}
} else {
// If the instruction doesn't actually need a correct EXEC, then we can
// safely leave Strict mode enabled.
Needs = StateExact | StateWQM | StateStrict;
}
// Exact mode exit can occur in terminators, but must be before branches.
if (MI.isBranch() && OutNeeds == StateExact)
Needs = StateExact;
++Next;
} else {
// End of basic block
if (BI.OutNeeds & StateWQM)
Needs = StateWQM;
else if (BI.OutNeeds == StateExact)
Needs = StateExact;
else
Needs = StateWQM | StateExact;
}
// Now, transition if necessary.
if (!(Needs & State)) {
MachineBasicBlock::iterator First;
if (State == StateStrictWWM || Needs == StateStrictWWM ||
State == StateStrictWQM || Needs == StateStrictWQM) {
// We must switch to or from Strict mode.
First = FirstStrict;
} else {
// We only need to switch to/from WQM, so we can use FirstWQM.
First = FirstWQM;
}
// Whether we need to save SCC depends on start and end states.
bool SaveSCC = false;
switch (State) {
case StateExact:
case StateStrictWWM:
case StateStrictWQM:
// Exact/Strict -> Strict: save SCC
// Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
// Exact/Strict -> Exact: no save
SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
break;
case StateWQM:
// WQM -> Exact/Strict: save SCC
SaveSCC = !(Needs & StateWQM);
break;
default:
llvm_unreachable("Unknown state");
break;
}
char StartState = State & StateStrict ? NonStrictState : State;
bool WQMToExact =
StartState == StateWQM && (Needs & StateExact) && !(Needs & StateWQM);
bool ExactToWQM = StartState == StateExact && (Needs & StateWQM) &&
!(Needs & StateExact);
bool PreferLast = Needs == StateWQM;
// Exact regions in divergent control flow may run at EXEC=0, so try to
// exclude instructions with unexpected effects from them.
// FIXME: ideally we would branch over these when EXEC=0,
// but this requires updating implicit values, live intervals and CFG.
if ((WQMToExact && (OutNeeds & StateWQM)) || ExactToWQM) {
for (MachineBasicBlock::iterator I = First; I != II; ++I) {
if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) {
PreferLast = WQMToExact;
break;
}
}
}
MachineBasicBlock::iterator Before =
prepareInsertion(MBB, First, II, PreferLast, SaveSCC);
if (State & StateStrict) {
assert(State == StateStrictWWM || State == StateStrictWQM);
assert(SavedNonStrictReg);
fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
SavedNonStrictReg = 0;
State = NonStrictState;
}
if (Needs & StateStrict) {
NonStrictState = State;
assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
assert(!SavedNonStrictReg);
SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
State = Needs;
} else {
if (WQMToExact) {
if (!WQMFromExec && (OutNeeds & StateWQM)) {
assert(!SavedWQMReg);
SavedWQMReg = MRI->createVirtualRegister(BoolRC);
}
toExact(MBB, Before, SavedWQMReg);
State = StateExact;
} else if (ExactToWQM) {
assert(WQMFromExec == (SavedWQMReg == 0));
toWQM(MBB, Before, SavedWQMReg);
if (SavedWQMReg) {
LIS->createAndComputeVirtRegInterval(SavedWQMReg);
SavedWQMReg = 0;
}
State = StateWQM;
} else {
// We can get here if we transitioned from StrictWWM to a
// non-StrictWWM state that already matches our needs, but we
// shouldn't need to do anything.
assert(Needs & State);
}
}
}
if (Needs != (StateExact | StateWQM | StateStrict)) {
if (Needs != (StateExact | StateWQM))
FirstWQM = IE;
FirstStrict = IE;
}
if (II == IE)
break;
II = Next;
}
assert(!SavedWQMReg);
assert(!SavedNonStrictReg);
}
bool SIWholeQuadMode::lowerLiveMaskQueries() {
for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();
Register Dest = MI->getOperand(0).getReg();
MachineInstr *Copy =
BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
.addReg(LiveMaskReg);
LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
MI->eraseFromParent();
}
return !LiveMaskQueries.empty();
}
bool SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToMovInstrs) {
assert(MI->getNumExplicitOperands() == 2);
const Register Reg = MI->getOperand(0).getReg();
const TargetRegisterClass *regClass =
TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
if (TRI->isVGPRClass(regClass)) {
const unsigned MovOp = TII->getMovOpcode(regClass);
MI->setDesc(TII->get(MovOp));
// Check that it already implicitly depends on exec (like all VALU movs
// should do).
assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
}));
} else {
// Remove early-clobber and exec dependency from simple SGPR copies.
// This allows some to be eliminated during/post RA.
LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
if (MI->getOperand(0).isEarlyClobber()) {
LIS->removeInterval(Reg);
MI->getOperand(0).setIsEarlyClobber(false);
LIS->createAndComputeVirtRegInterval(Reg);
}
int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
while (Index >= 0) {
MI->removeOperand(Index);
Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC, /*TRI=*/nullptr);
}
MI->setDesc(TII->get(AMDGPU::COPY));
LLVM_DEBUG(dbgs() << " -> " << *MI);
}
}
for (MachineInstr *MI : LowerToCopyInstrs) {
LLVM_DEBUG(dbgs() << "simplify: " << *MI);
if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32) {
assert(MI->getNumExplicitOperands() == 6);
LiveInterval *RecomputeLI = nullptr;
if (MI->getOperand(4).isReg())
RecomputeLI = &LIS->getInterval(MI->getOperand(4).getReg());
MI->removeOperand(5);
MI->removeOperand(4);
MI->removeOperand(3);
MI->removeOperand(1);
if (RecomputeLI)
LIS->shrinkToUses(RecomputeLI);
} else {
assert(MI->getNumExplicitOperands() == 2);
}
unsigned CopyOp = MI->getOperand(1).isReg()
? (unsigned)AMDGPU::COPY
: TII->getMovOpcode(TRI->getRegClassForOperandReg(
*MRI, MI->getOperand(0)));
MI->setDesc(TII->get(CopyOp));
LLVM_DEBUG(dbgs() << " -> " << *MI);
}
return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty();
}
bool SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
for (MachineInstr *MI : KillInstrs) {
MachineInstr *SplitPoint = nullptr;
switch (MI->getOpcode()) {
case AMDGPU::SI_DEMOTE_I1:
case AMDGPU::SI_KILL_I1_TERMINATOR:
SplitPoint = lowerKillI1(*MI, IsWQM);
break;
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
SplitPoint = lowerKillF32(*MI);
break;
}
if (SplitPoint)
splitBlock(SplitPoint);
}
return !KillInstrs.empty();
}
void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getParent();
bool IsWave32 = ST->isWave32();
if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
assert(MBB == &MBB->getParent()->front() &&
"init whole wave not in entry block");
Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
MachineInstr *SaveExec =
BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64),
EntryExec)
.addImm(-1);
// Replace all uses of MI's destination reg with EntryExec.
MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
if (LIS) {
LIS->RemoveMachineInstrFromMaps(MI);
}
MI.eraseFromParent();
if (LIS) {
LIS->InsertMachineInstrInMaps(*SaveExec);
LIS->createAndComputeVirtRegInterval(EntryExec);
}
return;
}
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
// This should be before all vector instructions.
MachineInstr *InitMI =
BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
Exec)
.addImm(MI.getOperand(0).getImm());
if (LIS) {
LIS->RemoveMachineInstrFromMaps(MI);
LIS->InsertMachineInstrInMaps(*InitMI);
}
MI.eraseFromParent();
return;
}
// Extract the thread count from an SGPR input and set EXEC accordingly.
// Since BFM can't shift by 64, handle that case with CMP + CMOV.
//
// S_BFE_U32 count, input, {shift, 7}
// S_BFM_B64 exec, count, 0
// S_CMP_EQ_U32 count, 64
// S_CMOV_B64 exec, -1
Register InputReg = MI.getOperand(0).getReg();
MachineInstr *FirstMI = &*MBB->begin();
if (InputReg.isVirtual()) {
MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
assert(DefInstr && DefInstr->isCopy());
if (DefInstr->getParent() == MBB) {
if (DefInstr != FirstMI) {
// If the `InputReg` is defined in the current block, we also need to
// move that instruction to the beginning of the block.
DefInstr->removeFromParent();
MBB->insert(FirstMI, DefInstr);
if (LIS)
LIS->handleMove(*DefInstr);
} else {
// If the first instruction is the definition, then move the pointer after it.
FirstMI = &*std::next(FirstMI->getIterator());
}
}
}
// Insert instruction sequence at block beginning (before vector operations).
const DebugLoc DL = MI.getDebugLoc();
const unsigned WavefrontSize = ST->getWavefrontSize();
const unsigned Mask = (WavefrontSize << 1) - 1;
Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
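// Note: src1 of S_BFE packs the offset (shift amount) in its low bits and the
// field width in bits [22:16]; 0x70000 selects a 7-bit wide field, enough to
// hold any lane count up to the wavefront size.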
auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
.addReg(InputReg)
.addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
auto BfmMI =
BuildMI(*MBB, FirstMI, DL,
TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
.addReg(CountReg)
.addImm(0);
auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
.addReg(CountReg, RegState::Kill)
.addImm(WavefrontSize);
auto CmovMI =
BuildMI(*MBB, FirstMI, DL,
TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
Exec)
.addImm(-1);
if (!LIS) {
MI.eraseFromParent();
return;
}
LIS->RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();
LIS->InsertMachineInstrInMaps(*BfeMI);
LIS->InsertMachineInstrInMaps(*BfmMI);
LIS->InsertMachineInstrInMaps(*CmpMI);
LIS->InsertMachineInstrInMaps(*CmovMI);
LIS->removeInterval(InputReg);
LIS->createAndComputeVirtRegInterval(InputReg);
LIS->createAndComputeVirtRegInterval(CountReg);
}
/// Lower INIT_EXEC instructions. Return a suitable insert point in \p Entry
/// for instructions that depend on EXEC.
MachineBasicBlock::iterator
SIWholeQuadMode::lowerInitExecInstrs(MachineBasicBlock &Entry, bool &Changed) {
MachineBasicBlock::iterator InsertPt = Entry.getFirstNonPHI();
for (MachineInstr *MI : InitExecInstrs) {
// Try to handle undefined cases gracefully:
// - multiple INIT_EXEC instructions
// - INIT_EXEC instructions not in the entry block
if (MI->getParent() == &Entry)
InsertPt = std::next(MI->getIterator());
lowerInitExec(*MI);
Changed = true;
}
return InsertPt;
}
bool SIWholeQuadMode::run(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
<< " ------------- \n");
LLVM_DEBUG(MF.dump(););
Instructions.clear();
Blocks.clear();
LiveMaskQueries.clear();
LowerToCopyInstrs.clear();
LowerToMovInstrs.clear();
KillInstrs.clear();
InitExecInstrs.clear();
SetInactiveInstrs.clear();
StateTransition.clear();
if (ST->isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
AndTermOpc = AMDGPU::S_AND_B32_term;
AndN2Opc = AMDGPU::S_ANDN2_B32;
XorOpc = AMDGPU::S_XOR_B32;
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
WQMOpc = AMDGPU::S_WQM_B32;
Exec = AMDGPU::EXEC_LO;
} else {
AndOpc = AMDGPU::S_AND_B64;
AndTermOpc = AMDGPU::S_AND_B64_term;
AndN2Opc = AMDGPU::S_ANDN2_B64;
XorOpc = AMDGPU::S_XOR_B64;
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
WQMOpc = AMDGPU::S_WQM_B64;
Exec = AMDGPU::EXEC;
}
const char GlobalFlags = analyzeFunction(MF);
bool Changed = false;
LiveMaskReg = Exec;
MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = lowerInitExecInstrs(Entry, Changed);
// Store a copy of the original live mask when required
const bool HasLiveMaskQueries = !LiveMaskQueries.empty();
const bool HasWaveModes = GlobalFlags & ~StateExact;
const bool HasKills = !KillInstrs.empty();
const bool UsesWQM = GlobalFlags & StateWQM;
if (HasKills || UsesWQM || (HasWaveModes && HasLiveMaskQueries)) {
LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
MachineInstr *MI =
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
.addReg(Exec);
LIS->InsertMachineInstrInMaps(*MI);
Changed = true;
}
// Check if V_SET_INACTIVE was touched by a strict state mode.
// If so, promote to WWM; otherwise lower to COPY.
for (MachineInstr *MI : SetInactiveInstrs) {
if (LowerToCopyInstrs.contains(MI))
continue;
auto &Info = Instructions[MI];
if (Info.MarkedStates & StateStrict) {
Info.Needs |= StateStrictWWM;
Info.Disabled &= ~StateStrictWWM;
Blocks[MI->getParent()].Needs |= StateStrictWWM;
} else {
LLVM_DEBUG(dbgs() << "Has no WWM marking: " << *MI);
LowerToCopyInstrs.insert(MI);
}
}
LLVM_DEBUG(printInfo());
Changed |= lowerLiveMaskQueries();
Changed |= lowerCopyInstrs();
if (!HasWaveModes) {
// No wave mode execution
Changed |= lowerKillInstrs(false);
} else if (GlobalFlags == StateWQM) {
// Shader only needs WQM
auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
.addReg(Exec);
LIS->InsertMachineInstrInMaps(*MI);
lowerKillInstrs(true);
Changed = true;
} else {
// Mark entry for WQM if required.
if (GlobalFlags & StateWQM)
Blocks[&Entry].InNeeds |= StateWQM;
// Wave mode switching requires full lowering pass.
for (auto &BII : Blocks)
processBlock(*BII.first, BII.second, BII.first == &Entry);
// Lowering blocks causes block splitting so perform as a second pass.
for (auto &BII : Blocks)
lowerBlock(*BII.first, BII.second);
Changed = true;
}
// Compute live range for live mask
if (LiveMaskReg != Exec)
LIS->createAndComputeVirtRegInterval(LiveMaskReg);
// Physical registers like SCC aren't tracked by default anyway, so just
// removing the ranges we computed is the simplest option for maintaining
// the analysis results.
LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
// If we performed any kills or EXEC initialization then recompute EXEC
if (!KillInstrs.empty() || !InitExecInstrs.empty())
LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
return Changed;
}
bool SIWholeQuadModeLegacy::runOnMachineFunction(MachineFunction &MF) {
LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
auto *PDTWrapper =
getAnalysisIfAvailable<MachinePostDominatorTreeWrapperPass>();
MachinePostDominatorTree *PDT =
PDTWrapper ? &PDTWrapper->getPostDomTree() : nullptr;
SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
return Impl.run(MF);
}
PreservedAnalyses
SIWholeQuadModePass::run(MachineFunction &MF,
MachineFunctionAnalysisManager &MFAM) {
MFPropsModifier _(*this, MF);
LiveIntervals *LIS = &MFAM.getResult<LiveIntervalsAnalysis>(MF);
MachineDominatorTree *MDT =
MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
MachinePostDominatorTree *PDT =
MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);
SIWholeQuadMode Impl(MF, LIS, MDT, PDT);
bool Changed = Impl.run(MF);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
PA.preserve<SlotIndexesAnalysis>();
PA.preserve<LiveIntervalsAnalysis>();
PA.preserve<MachineDominatorTreeAnalysis>();
PA.preserve<MachinePostDominatorTreeAnalysis>();
return PA;
}