blob: 932b9e8f52b2749494ab5fdffb4c918421d4fb89 [file] [log] [blame] [edit]
//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//
#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineCycleAnalysis.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "machine-scheduler"
using namespace llvm;
static cl::opt<bool> DisableUnclusterHighRP(
"amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
cl::desc("Disable unclustered high register pressure "
"reduction scheduling stage."),
cl::init(false));
static cl::opt<bool> DisableClusteredLowOccupancy(
"amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
cl::desc("Disable clustered low occupancy "
"rescheduling for ILP scheduling stage."),
cl::init(false));
static cl::opt<unsigned> ScheduleMetricBias(
"amdgpu-schedule-metric-bias", cl::Hidden,
cl::desc(
"Sets the bias which adds weight to occupancy vs latency. Set it to "
"100 to chase the occupancy only."),
cl::init(10));
static cl::opt<bool>
RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
cl::desc("Relax occupancy targets for kernels which are memory "
"bound (amdgpu-membound-threshold), or "
"Wave Limited (amdgpu-limit-wave-threshold)."),
cl::init(false));
static cl::opt<bool> GCNTrackers(
"amdgpu-use-amdgpu-trackers", cl::Hidden,
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
static cl::opt<unsigned> PendingQueueLimit(
"amdgpu-scheduler-pending-queue-limit", cl::Hidden,
cl::desc(
"Max (Available+Pending) size to inspect pending queue (0 disables)"),
cl::init(256));
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
#define DUMP_MAX_REG_PRESSURE
static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
"amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
cl::desc("Print a list of live registers along with their def/uses at the "
"point of maximum register pressure before scheduling."),
cl::init(false));
static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
"amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
cl::desc("Print a list of live registers along with their def/uses at the "
"point of maximum register pressure after scheduling."),
cl::init(false));
#endif
static cl::opt<bool> DisableRewriteMFMAFormSchedStage(
"amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true));
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
}
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
MF = &DAG->MF;
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
SGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
VGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
// Set the initial TargetOccupnacy to the maximum occupancy that we can
// achieve for this function. This effectively sets a lower bound on the
// 'Critical' register limits in the scheduler.
// Allow for lower occupancy targets if kernel is wave limited or memory
// bound, and using the relaxed occupancy feature.
TargetOccupancy =
RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
SGPRCriticalLimit =
std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
if (!KnownExcessRP) {
VGPRCriticalLimit = std::min(
ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
VGPRExcessLimit);
} else {
// This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
// returns a reasonably small number for targets with lots of VGPRs, such
// as GFX10 and GFX11.
LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
"VGPRCriticalLimit calculation method.\n");
unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
unsigned Granule =
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
unsigned Addressable =
AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
VGPRBudget = std::max(VGPRBudget, Granule);
VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
}
// Subtract error margin and bias from register limits and avoid overflow.
SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
<< ", VGPRExcessLimit = " << VGPRExcessLimit
<< ", SGPRCriticalLimit = " << SGPRCriticalLimit
<< ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}
/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
/// current register pressure.
///
/// This works for the common case, but it has a few exceptions that have been
/// observed through trial and error:
/// - Explicit physical register operands
/// - Subregister definitions
///
/// In both of those cases, PressureDiff doesn't represent the actual pressure,
/// and querying LiveIntervals through the RegPressureTracker is needed to get
/// an accurate value.
///
/// We should eventually only use PressureDiff for maximum performance, but this
/// already allows 80% of SUs to take the fast path without changing scheduling
/// at all. Further changes would either change scheduling, or require a lot
/// more logic to recover an accurate pressure estimate from the PressureDiffs.
static bool canUsePressureDiffs(const SUnit &SU) {
if (!SU.isInstr())
return false;
// Cannot use pressure diffs for subregister defs or with physregs, it's
// imprecise in both cases.
for (const auto &Op : SU.getInstr()->operands()) {
if (!Op.isReg() || Op.isImplicit())
continue;
if (Op.getReg().isPhysical() ||
(Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
return false;
}
return true;
}
static void getRegisterPressures(
bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
if (!GCNTrackers) {
AtTop
? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
: TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
return;
}
// GCNTrackers
Pressure.resize(4, 0);
MachineInstr *MI = SU->getInstr();
GCNRegPressure NewPressure;
if (AtTop) {
GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
} else {
GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
TempUpwardTracker.recede(*MI);
NewPressure = TempUpwardTracker.getPressure();
}
Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
NewPressure.getArchVGPRNum();
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp) {
Cand.SU = SU;
Cand.AtTop = AtTop;
if (!DAG->isTrackingPressure())
return;
Pressure.clear();
MaxPressure.clear();
// We try to use the cached PressureDiffs in the ScheduleDAG whenever
// possible over querying the RegPressureTracker.
//
// RegPressureTracker will make a lot of LIS queries which are very
// expensive, it is considered a slow function in this context.
//
// PressureDiffs are precomputed and cached, and getPressureDiff is just a
// trivial lookup into an array. It is pretty much free.
//
// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
// PressureDiffs.
if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
DownwardTracker, UpwardTracker, DAG, SRI);
} else {
// Reserve 4 slots.
Pressure.resize(4, 0);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
for (const auto &Diff : DAG->getPressureDiff(SU)) {
if (!Diff.isValid())
continue;
// PressureDiffs is always bottom-up so if we're working top-down we need
// to invert its sign.
Pressure[Diff.getPSet()] +=
(IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
}
#ifdef EXPENSIVE_CHECKS
std::vector<unsigned> CheckPressure, CheckMaxPressure;
getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
DownwardTracker, UpwardTracker, DAG, SRI);
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
errs() << "Register Pressure is inaccurate when calculated through "
"PressureDiff\n"
<< "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
<< ", expected "
<< CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
<< "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
<< ", expected "
<< CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
report_fatal_error("inaccurate register pressure calculation");
}
#endif
}
unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
// instruction that increases the set with the least amount of registers,
// which in our case would be SGPRs. This is rarely what we want, so
// when we report excess/critical register pressure, we do it either
// only for VGPRs or only for SGPRs.
// FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
const unsigned MaxVGPRPressureInc = 16;
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
// FIXME: We have to enter REG-EXCESS before we reach the actual threshold
// to increase the likelihood we don't go over the limits. We should improve
// the analysis to look through dependencies to find the path with the least
// register pressure.
// We only need to update the RPDelta for instructions that increase register
// pressure. Instructions that decrease or keep reg pressure the same will be
// marked as RegExcess in tryCandidate() when they are compared with
// instructions that increase the register pressure.
if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
HasHighPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
}
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
HasHighPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
}
// Register pressure is considered 'CRITICAL' if it is approaching a value
// that would reduce the wave occupancy for the execution unit. When
// register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
// has the same cost, so we don't need to prefer one over the other.
int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
if (SGPRDelta >= 0 || VGPRDelta >= 0) {
HasHighPressure = true;
if (SGPRDelta > VGPRDelta) {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
} else {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
}
}
}
static bool shouldCheckPending(SchedBoundary &Zone,
const TargetSchedModel *SchedModel) {
bool HasBufferedModel =
SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
unsigned Combined = Zone.Available.size() + Zone.Pending.size();
return Combined <= PendingQueueLimit && HasBufferedModel;
}
static SUnit *pickOnlyChoice(SchedBoundary &Zone,
const TargetSchedModel *SchedModel) {
// pickOnlyChoice() releases pending instructions and checks for new hazards.
SUnit *OnlyChoice = Zone.pickOnlyChoice();
if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
return OnlyChoice;
return nullptr;
}
void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current,
const SchedCandidate &Preferred) {
LLVM_DEBUG({
dbgs() << "Prefer:\t\t";
DAG->dumpNode(*Preferred.SU);
if (Current.SU) {
dbgs() << "Not:\t";
DAG->dumpNode(*Current.SU);
}
dbgs() << "Reason:\t\t";
traceCandidate(Preferred);
});
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand, bool &IsPending,
bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
IsPending = false;
if (DAG->isTrackingPressure()) {
if (!GCNTrackers) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
} else {
GCNRPTracker *T = IsBottomUp
? static_cast<GCNRPTracker *>(&UpwardTracker)
: static_cast<GCNRPTracker *>(&DownwardTracker);
SGPRPressure = T->getPressure().getSGPRNum();
VGPRPressure = T->getPressure().getArchVGPRNum();
}
}
LLVM_DEBUG(dbgs() << "Available Q:\n");
ReadyQueue &AQ = Zone.Available;
for (SUnit *SU : AQ) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
VGPRPressure, IsBottomUp);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryCandidate(Cand, TryCand, ZoneArg);
if (TryCand.Reason != NoCand) {
// Initialize resource delta if needed in case future heuristics query it.
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
Cand.setBest(TryCand);
} else {
printCandidateDecision(TryCand, Cand);
}
}
if (!shouldCheckPending(Zone, SchedModel))
return;
LLVM_DEBUG(dbgs() << "Pending Q:\n");
ReadyQueue &PQ = Zone.Pending;
for (SUnit *SU : PQ) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
VGPRPressure, IsBottomUp);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryPendingCandidate(Cand, TryCand, ZoneArg);
if (TryCand.Reason != NoCand) {
// Initialize resource delta if needed in case future heuristics query it.
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
IsPending = true;
Cand.setBest(TryCand);
} else {
printCandidateDecision(TryCand, Cand);
}
}
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode,
bool &PickedPending) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
IsTopNode = false;
return SU;
}
if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
IsTopNode = true;
return SU;
}
// Set the bottom-up policy based on the state of the current bottom zone
// and the instructions outside the zone, including the top zone.
CandPolicy BotPolicy;
setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
// Set the top-down policy based on the state of the current top zone and
// the instructions outside the zone, including the bottom zone.
CandPolicy TopPolicy;
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
bool BotPending = false;
// See if BotCand is still valid (because we previously scheduled from Top).
LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
BotPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
if (VerifyScheduling) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
BotPending,
/*IsBottomUp=*/true);
assert(TCand.SU == BotCand.SU &&
"Last pick result should correspond to re-picking right now");
}
#endif
}
bool TopPending = false;
// Check if the top Q has a better candidate.
LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
TopPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
if (VerifyScheduling) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
TopPending,
/*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
}
#endif
}
// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
SchedCandidate Cand = BotPending ? TopCand : BotCand;
SchedCandidate TryCand = BotPending ? BotCand : TopCand;
PickedPending = BotPending && TopPending;
TryCand.Reason = NoCand;
if (BotPending || TopPending) {
PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
} else {
tryCandidate(Cand, TryCand, nullptr);
}
if (TryCand.Reason != NoCand) {
Cand.setBest(TryCand);
}
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
return Cand.SU;
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
if (DAG->top() == DAG->bottom()) {
assert(Top.Available.empty() && Top.Pending.empty() &&
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
bool PickedPending;
SUnit *SU;
do {
PickedPending = false;
if (RegionPolicy.OnlyTopDown) {
SU = pickOnlyChoice(Top, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
PickedPending,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
IsTopNode = true;
} else if (RegionPolicy.OnlyBottomUp) {
SU = pickOnlyChoice(Bot, SchedModel);
if (!SU) {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
PickedPending,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find a candidate");
SU = BotCand.SU;
}
IsTopNode = false;
} else {
SU = pickNodeBidirectional(IsTopNode, PickedPending);
}
} while (SU->isScheduled);
if (PickedPending) {
unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
SchedBoundary &Zone = IsTopNode ? Top : Bot;
unsigned CurrentCycle = Zone.getCurrCycle();
if (ReadyCycle > CurrentCycle)
Zone.bumpCycle(ReadyCycle);
// FIXME: checkHazard() doesn't give information about which cycle the
// hazard will resolve so just keep bumping the cycle by 1. This could be
// made more efficient if checkHazard() returned more details.
while (Zone.checkHazard(SU))
Zone.bumpCycle(Zone.getCurrCycle() + 1);
Zone.releasePending();
}
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
Bot.removeReady(SU);
LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
<< *SU->getInstr());
return SU;
}
void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (GCNTrackers) {
MachineInstr *MI = SU->getInstr();
IsTopNode ? (void)DownwardTracker.advance(MI, false)
: UpwardTracker.recede(*MI);
}
return GenericScheduler::schedNode(SU, IsTopNode);
}
GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
assert(CurrentStage && CurrentStage != SchedStages.end());
return *CurrentStage;
}
bool GCNSchedStrategy::advanceStage() {
assert(CurrentStage != SchedStages.end());
if (!CurrentStage)
CurrentStage = SchedStages.begin();
else
CurrentStage++;
return CurrentStage != SchedStages.end();
}
bool GCNSchedStrategy::hasNextStage() const {
assert(CurrentStage);
return std::next(CurrentStage) != SchedStages.end();
}
GCNSchedStageID GCNSchedStrategy::getNextStage() const {
assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
return *std::next(CurrentStage);
}
bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return true;
}
// Bias PhysReg Defs and copies to their uses and defined respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
// Avoid exceeding the target's limit.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Avoid increasing the max critical pressure in the scheduled region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
}
return false;
}
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
if (!DisableRewriteMFMAFormSchedStage)
SchedStages.push_back(GCNSchedStageID::RewriteMFMAForm);
SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
GCNTrackers = GCNTrackers & !IsLegacyScheduler;
}
GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
}
bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return true;
}
// Avoid spilling by exceeding the register limit.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Bias PhysReg Defs and copies to their uses and defined respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return TryCand.Reason != NoCand;
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
// Unconditionally try to reduce latency.
if (tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
return TryCand.Reason != NoCand;
}
// Keep clustered nodes together to encourage downstream peephole
// optimizations which may reduce resource requirements.
//
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
bool CandIsClusterSucc =
isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
bool TryCandIsClusterSucc =
isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
// Avoid increasing the max critical pressure in the scheduled region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
TryCand.Reason = NodeOrder;
return true;
}
}
return false;
}
GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
const MachineSchedContext *C)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule);
}
/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
/// much as possible. This is achieved by:
// 1. Prioritize clustered operations before stall latency heuristic.
// 2. Prioritize long-latency-load before stall latency heuristic.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending, or nullptr
/// if Cand is from a different zone than TryCand.
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return true;
}
// Bias PhysReg Defs and copies to their uses and defined respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
if (DAG->isTrackingPressure()) {
// Avoid exceeding the target's limit.
if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Avoid increasing the max critical pressure in the scheduled region.
if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;
}
// MaxMemoryClause-specific: We prioritize clustered instructions as we would
// get more benefit from clausing these memory instructions.
unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
bool CandIsClusterSucc =
isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
bool TryCandIsClusterSucc =
isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
Cluster))
return TryCand.Reason != NoCand;
// We only compare a subset of features when comparing nodes between
// Top and Bottom boundary. Some properties are simply incomparable, in many
// other instances we should only override the other boundary if something
// is a clear good pick on one boundary. Skip heuristics that are more
// "tie-breaking" in nature.
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
// For loops that are acyclic path limited, aggressively schedule for
// latency. Within an single cycle, whenever CurrMOps > 0, allow normal
// heuristics to take precedence.
if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// MaxMemoryClause-specific: Prioritize long latency memory load
// instructions in top-bottom order to hide more latency. The mayLoad check
// is used to exclude store-like instructions, which we do not want to
// scheduler them too early.
bool TryMayLoad =
TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
if (TryMayLoad || CandMayLoad) {
bool TryLongLatency =
TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
bool CandLongLatency =
10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
Cand, Stall))
return TryCand.Reason != NoCand;
}
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return TryCand.Reason != NoCand;
}
if (SameBoundary) {
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
return TryCand.Reason != NoCand;
}
// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
// Avoid serializing long latency dependence chains.
// For acyclic path limited loops, latency was already checked above.
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Fall through to original instruction order.
if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
TryCand.Reason = NodeOrder;
return true;
}
}
return false;
}
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
RegionLiveOuts(this, /*IsLiveOut=*/true) {
// We want regions with a single MI to be scheduled so that we can reason
// about them correctly during scheduling stages that move MIs between regions
// (e.g., rematerialization).
ScheduleSingleMIRegions = true;
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
if (RelaxedOcc) {
MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
if (MinOccupancy != StartingOccupancy)
LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
<< ".\n");
}
}
std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
switch (SchedStageID) {
case GCNSchedStageID::OccInitialSchedule:
return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
case GCNSchedStageID::RewriteMFMAForm:
return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
case GCNSchedStageID::UnclusteredHighRPReschedule:
return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
case GCNSchedStageID::ClusteredLowOccupancyReschedule:
return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
case GCNSchedStageID::PreRARematerialize:
return std::make_unique<PreRARematStage>(SchedStageID, *this);
case GCNSchedStageID::ILPInitialSchedule:
return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
case GCNSchedStageID::MemoryClauseInitialSchedule:
return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
*this);
}
llvm_unreachable("Unknown SchedStageID.");
}
void GCNScheduleDAGMILive::schedule() {
// Collect all scheduling regions. The actual scheduling is performed in
// GCNScheduleDAGMILive::finalizeSchedule.
Regions.push_back(std::pair(RegionBegin, RegionEnd));
}
GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
if (Regions[RegionIdx].first == Regions[RegionIdx].second)
return llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
GCNDownwardRPTracker RPTracker(*LIS);
RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
&LiveIns[RegionIdx]);
return RPTracker.moveMaxPressure();
}
static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
MachineBasicBlock::iterator RegionEnd) {
assert(RegionBegin != RegionEnd && "Region must not be empty");
return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
}
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
const MachineBasicBlock *MBB) {
GCNDownwardRPTracker RPTracker(*LIS);
// If the block has the only successor then live-ins of that successor are
// live-outs of the current block. We can reuse calculated live set if the
// successor will be sent to scheduling past current block.
// However, due to the bug in LiveInterval analysis it may happen that two
// predecessors of the same successor block have different lane bitmasks for
// a live-out register. Workaround that by sticking to one-to-one relationship
// i.e. one predecessor with one successor block.
const MachineBasicBlock *OnlySucc = nullptr;
if (MBB->succ_size() == 1) {
auto *Candidate = *MBB->succ_begin();
if (!Candidate->empty() && Candidate->pred_size() == 1) {
SlotIndexes *Ind = LIS->getSlotIndexes();
if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
OnlySucc = Candidate;
}
}
// Scheduler sends regions from the end of the block upwards.
size_t CurRegion = RegionIdx;
for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
if (Regions[CurRegion].first->getParent() != MBB)
break;
--CurRegion;
auto I = MBB->begin();
auto LiveInIt = MBBLiveIns.find(MBB);
auto &Rgn = Regions[CurRegion];
auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
if (LiveInIt != MBBLiveIns.end()) {
auto LiveIn = std::move(LiveInIt->second);
RPTracker.reset(*MBB->begin(), &LiveIn);
MBBLiveIns.erase(LiveInIt);
} else {
I = Rgn.first;
auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
RPTracker.reset(*I, &LRS);
}
for (;;) {
I = RPTracker.getNext();
if (Regions[CurRegion].first == I || NonDbgMI == I) {
LiveIns[CurRegion] = RPTracker.getLiveRegs();
RPTracker.clearMaxPressure();
}
if (Regions[CurRegion].second == I) {
Pressure[CurRegion] = RPTracker.moveMaxPressure();
if (CurRegion-- == RegionIdx)
break;
auto &Rgn = Regions[CurRegion];
NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
}
RPTracker.advanceBeforeNext();
RPTracker.advanceToNext();
}
if (OnlySucc) {
if (I != MBB->end()) {
RPTracker.advanceBeforeNext();
RPTracker.advanceToNext();
RPTracker.advance(MBB->end());
}
MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
}
}
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getRegionLiveInMap() const {
assert(!Regions.empty());
std::vector<MachineInstr *> RegionFirstMIs;
RegionFirstMIs.reserve(Regions.size());
for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
RegionFirstMIs.push_back(
&*skipDebugInstructionsForward(RegionBegin, RegionEnd));
return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getRegionLiveOutMap() const {
assert(!Regions.empty());
std::vector<MachineInstr *> RegionLastMIs;
RegionLastMIs.reserve(Regions.size());
for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
// Skip empty regions.
if (RegionBegin == RegionEnd)
continue;
RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
}
return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
}
void RegionPressureMap::buildLiveRegMap() {
IdxToInstruction.clear();
RegionLiveRegMap =
IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
for (unsigned I = 0; I < DAG->Regions.size(); I++) {
auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
// Skip empty regions.
if (RegionBegin == RegionEnd)
continue;
MachineInstr *RegionKey =
IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
IdxToInstruction[I] = RegionKey;
}
}
void GCNScheduleDAGMILive::finalizeSchedule() {
// Start actual scheduling here. This function is called by the base
// MachineScheduler after all regions have been recorded by
// GCNScheduleDAGMILive::schedule().
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
RegionsWithHighRP.resize(Regions.size());
RegionsWithExcessRP.resize(Regions.size());
RegionsWithIGLPInstrs.resize(Regions.size());
RegionsWithHighRP.reset();
RegionsWithExcessRP.reset();
RegionsWithIGLPInstrs.reset();
runSchedStages();
}
void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
if (!Regions.empty()) {
BBLiveInMap = getRegionLiveInMap();
if (GCNTrackers)
RegionLiveOuts.buildLiveRegMap();
}
#ifdef DUMP_MAX_REG_PRESSURE
if (PrintMaxRPRegUsageBeforeScheduler) {
dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
LIS->dump();
}
#endif
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
if (!Stage->initGCNSchedStage())
continue;
for (auto Region : Regions) {
RegionBegin = Region.first;
RegionEnd = Region.second;
// Setup for scheduling the region and check whether it should be skipped.
if (!Stage->initGCNRegion()) {
Stage->advanceRegion();
exitRegion();
continue;
}
if (GCNTrackers) {
GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
GCNRPTracker::LiveRegSet *RegionLiveIns =
&LiveIns[Stage->getRegionIdx()];
reinterpret_cast<GCNRPTracker *>(DownwardTracker)
->reset(MRI, *RegionLiveIns);
reinterpret_cast<GCNRPTracker *>(UpwardTracker)
->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
Stage->getRegionIdx()));
}
ScheduleDAGMILive::schedule();
Stage->finalizeGCNRegion();
Stage->advanceRegion();
exitRegion();
}
Stage->finalizeGCNSchedStage();
}
#ifdef DUMP_MAX_REG_PRESSURE
if (PrintMaxRPRegUsageAfterScheduler) {
dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
LIS->dump();
}
#endif
}
#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
switch (StageID) {
case GCNSchedStageID::OccInitialSchedule:
OS << "Max Occupancy Initial Schedule";
break;
case GCNSchedStageID::RewriteMFMAForm:
OS << "Instruction Rewriting Reschedule";
break;
case GCNSchedStageID::UnclusteredHighRPReschedule:
OS << "Unclustered High Register Pressure Reschedule";
break;
case GCNSchedStageID::ClusteredLowOccupancyReschedule:
OS << "Clustered Low Occupancy Reschedule";
break;
case GCNSchedStageID::PreRARematerialize:
OS << "Pre-RA Rematerialize";
break;
case GCNSchedStageID::ILPInitialSchedule:
OS << "Max ILP Initial Schedule";
break;
case GCNSchedStageID::MemoryClauseInitialSchedule:
OS << "Max memory clause Initial Schedule";
break;
}
return OS;
}
#endif
GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
bool GCNSchedStage::initGCNSchedStage() {
if (!DAG.LIS)
return false;
LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
return true;
}
void RewriteMFMAFormStage::findReachingDefs(
MachineOperand &UseMO, LiveIntervals *LIS,
SmallVectorImpl<SlotIndex> &DefIdxs) {
MachineInstr *UseMI = UseMO.getParent();
LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
// If the def is not a PHI, then it must be the only reaching def.
if (!VNI->isPHIDef()) {
DefIdxs.push_back(VNI->def);
return;
}
SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
SmallVector<MachineBasicBlock *, 8> Worklist;
// Mark the predecessor blocks for traversal
for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
Worklist.push_back(PredMBB);
Visited.insert(PredMBB);
}
while (!Worklist.empty()) {
MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
// If there is a def in this block, then add it to the list. This is the
// reaching def of this path.
if (!VNI->isPHIDef()) {
DefIdxs.push_back(VNI->def);
continue;
}
for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
if (Visited.insert(PredMBB).second)
Worklist.push_back(PredMBB);
}
}
}
void RewriteMFMAFormStage::findReachingUses(
MachineInstr *DefMI, LiveIntervals *LIS,
SmallVectorImpl<MachineOperand *> &ReachingUses) {
SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
for (MachineOperand &UseMO :
DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
SmallVector<SlotIndex, 8> ReachingDefIndexes;
findReachingDefs(UseMO, LIS, ReachingDefIndexes);
// If we find a use that contains this DefMI in its reachingDefs, then it is
// a reaching use.
if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
return SlotIndex::isSameInstr(RDIdx, DefIdx);
}))
ReachingUses.push_back(&UseMO);
}
}
bool RewriteMFMAFormStage::initGCNSchedStage() {
// We only need to run this pass if the architecture supports AGPRs.
// Additionally, we don't use AGPRs at occupancy levels above 1 so there
// is no need for this pass in that case, either.
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
return false;
RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
RegionsWithExcessArchVGPR.reset();
for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
GCNRegPressure PressureBefore = DAG.Pressure[Region];
if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
RegionsWithExcessArchVGPR[Region] = true;
}
if (RegionsWithExcessArchVGPR.none())
return false;
TII = ST.getInstrInfo();
SRI = ST.getRegisterInfo();
std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse;
SmallPtrSet<MachineInstr *, 8> CopyForDef;
if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
return false;
int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
// If we haven't found the beneficial conditions, prefer the VGPR form which
// may result in less cross RC copies.
if (Cost > 0)
return false;
return rewrite(RewriteCands);
}
bool UnclusteredHighRPStage::initGCNSchedStage() {
if (DisableUnclusterHighRP)
return false;
if (!GCNSchedStage::initGCNSchedStage())
return false;
if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
return false;
SavedMutations.swap(DAG.Mutations);
DAG.addMutation(
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
InitialOccupancy = DAG.MinOccupancy;
// Aggressively try to reduce register pressure in the unclustered high RP
// stage. Temporarily increase occupancy target in the region.
TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
? InitialOccupancy + 1
: InitialOccupancy;
IsAnyRegionScheduled = false;
S.SGPRLimitBias = S.HighRPSGPRBias;
S.VGPRLimitBias = S.HighRPVGPRBias;
LLVM_DEBUG(
dbgs()
<< "Retrying function scheduling without clustering. "
"Aggressively try to reduce register pressure to achieve occupancy "
<< TempTargetOccupancy << ".\n");
return true;
}
bool ClusteredLowOccStage::initGCNSchedStage() {
if (DisableClusteredLowOccupancy)
return false;
if (!GCNSchedStage::initGCNSchedStage())
return false;
// Don't bother trying to improve ILP in lower RP regions if occupancy has not
// been dropped. All regions will have already been scheduled with the ideal
// occupancy targets.
if (DAG.StartingOccupancy <= DAG.MinOccupancy)
return false;
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with lowest recorded occupancy "
<< DAG.MinOccupancy << ".\n");
return true;
}
/// Allows to easily filter for this stage's debug output.
#define REMAT_PREFIX "[PreRARemat] "
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Printable PreRARematStage::ScoredRemat::print() const {
return Printable([&](raw_ostream &OS) {
OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
});
}
#endif
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
// regions inbetween the defs and region we sinked the def to. Will need to be
// fixed if there is another pass after this pass.
assert(!S.hasNextStage());
if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
return false;
// Maps all MIs (except lone terminators, which are not part of any region) to
// their parent region. Non-lone terminators are considered part of the region
// they delimitate.
DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
// Before performing any IR modification record the parent region of each MI
// and the parent MBB of each region.
const unsigned NumRegions = DAG.Regions.size();
for (unsigned I = 0; I < NumRegions; ++I) {
RegionBoundaries Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI)
MIRegion.insert({&*MI, I});
MachineBasicBlock *ParentMBB = Region.first->getParent();
if (Region.second != ParentMBB->end())
MIRegion.insert({&*Region.second, I});
RegionBB.push_back(ParentMBB);
}
#ifndef NDEBUG
auto PrintTargetRegions = [&]() -> void {
if (TargetRegions.none()) {
dbgs() << REMAT_PREFIX << "No target regions\n";
return;
}
dbgs() << REMAT_PREFIX << "Target regions:\n";
for (unsigned I : TargetRegions.set_bits())
dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
};
auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
return Printable([&, Remat](raw_ostream &OS) {
// Concatenate all region numbers in which the register is unused and
// live-through.
bool HasLiveThroughRegion = false;
OS << '[' << Remat.DefRegion << " -";
for (unsigned I = 0; I < NumRegions; ++I) {
if (Remat.isUnusedLiveThrough(I)) {
if (HasLiveThroughRegion) {
OS << ',';
} else {
OS << "- ";
HasLiveThroughRegion = true;
}
OS << I;
}
}
if (HasLiveThroughRegion)
OS << " -";
OS << "-> " << Remat.UseRegion << "] ";
Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
/*SkipDebugLoc=*/false, /*AddNewLine=*/false);
});
};
#endif
// Set an objective for the stage based on current RP in each region.
REMAT_DEBUG({
dbgs() << "Analyzing ";
MF.getFunction().printAsOperand(dbgs(), false);
dbgs() << ": ";
});
if (!setObjective()) {
LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
<< MFI.getMaxWavesPerEU() << '\n');
return false;
}
LLVM_DEBUG({
if (TargetOcc) {
dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
} else {
dbgs() << "reduce spilling (minimum target occupancy is "
<< MFI.getMinWavesPerEU() << ")\n";
}
PrintTargetRegions();
});
if (!collectRematRegs(MIRegion)) {
REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
return false;
}
const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
REMAT_DEBUG({
dbgs() << "Rematerializable registers:\n";
for (const RematReg &Remat : RematRegs)
dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
dbgs() << REMAT_PREFIX << "Region frequencies\n";
for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
dbgs() << REMAT_PREFIX << " [" << I << "] ";
if (Freq)
dbgs() << Freq;
else
dbgs() << "unknown ";
dbgs() << " | " << *DAG.Regions[I].first;
}
});
SmallVector<ScoredRemat> ScoredRemats;
for (RematReg &Remat : RematRegs)
ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);
// Rematerialize registers in successive rounds until all RP targets are
// satisifed or until we run out of rematerialization candidates.
#ifndef NDEBUG
unsigned RoundNum = 0;
#endif
BitVector RecomputeRP(NumRegions);
do {
assert(!ScoredRemats.empty() && "no more remat candidates");
// (Re-)Score and (re-)sort all remats in increasing score order.
for (ScoredRemat &Remat : ScoredRemats)
Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
sort(ScoredRemats);
REMAT_DEBUG({
dbgs() << "==== ROUND " << RoundNum++ << " ====\n"
<< REMAT_PREFIX
<< "Candidates with non-null score, in rematerialization order:\n";
for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
if (RematDecision.hasNullScore())
break;
dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
<< *RematDecision.Remat->DefMI;
}
PrintTargetRegions();
});
RecomputeRP.reset();
unsigned RematIdx = ScoredRemats.size();
// Rematerialize registers in decreasing score order until we estimate
// that all RP targets are satisfied or until rematerialization candidates
// are no longer useful to decrease RP.
for (; RematIdx && TargetRegions.any(); --RematIdx) {
const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
// Stop rematerializing on encountering a null score. Since scores
// monotonically decrease as we rematerialize, we know there is nothing
// useful left to do in such cases, even if we were to re-score.
if (Candidate.hasNullScore()) {
RematIdx = 0;
break;
}
RematReg &Remat = *Candidate.Remat;
// When previous rematerializations in this round have already satisfied
// RP targets in all regions this rematerialization can impact, we have a
// good indication that our scores have diverged significantly from
// reality, in which case we interrupt this round and re-score. This also
// ensures that every rematerialization we perform is possibly impactful
// in at least one target region.
if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
break;
REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
MachineInstr *RematMI =
Candidate.rematerialize(RecomputeRP, RPTargets, DAG);
RescheduleRegions |= Remat.Live;
// Every rematerialization we do here is likely to move the instruction
// into a higher frequency region, increasing the total sum latency of the
// instruction itself. This is acceptable if we are eliminating a spill in
// the process, but when the goal is increasing occupancy we get nothing
// out of rematerialization if occupancy is not increased in the end; in
// such cases we want to roll back the rematerialization.
if (TargetOcc) {
RollbackInfo &Rollback = Rollbacks.emplace_back(&Remat);
Rollback.RematMI = RematMI;
// Make the original MI a debug value so that it does not influence
// scheduling and replace all read registers with a sentinel register to
// prevent operands to appear in use-lists of other MIs during LIS
// updates. Store mappings between operand indices and original
// registers for potential rollback.
Remat.DefMI->setDesc(DAG.TII->get(TargetOpcode::DBG_VALUE));
for (auto [Idx, MO] : enumerate(Remat.DefMI->operands())) {
if (MO.isReg() && MO.readsReg()) {
Rollback.RegMap.insert({Idx, MO.getReg()});
MO.setReg(Register());
}
}
} else {
// Just delete the original instruction if it cannot be rolled back.
DAG.deleteMI(Remat.DefRegion, Remat.DefMI);
}
unsetSatisfiedRPTargets(Remat.Live);
}
REMAT_DEBUG({
if (!TargetRegions.any()) {
dbgs() << "** Interrupt round on all targets achieved\n";
} else if (RematIdx) {
dbgs() << "** Interrupt round on stale score for "
<< *ScoredRemats[RematIdx - 1].Remat->DefMI;
} else {
dbgs() << "** Stop on exhausted rematerialization candidates\n";
}
});
// Peel off registers we already rematerialized from the vector's tail.
ScoredRemats.truncate(RematIdx);
} while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
!ScoredRemats.empty());
if (RescheduleRegions.none())
return false;
// Commit all pressure changes to the DAG and compute minimum achieved
// occupancy in impacted regions.
REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
for (unsigned I : RescheduleRegions.set_bits()) {
DAG.Pressure[I] = RPTargets[I].getCurrentRP();
REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
<< DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
<< " (" << RPTargets[I] << ")\n");
}
AchievedOcc = MFI.getMaxWavesPerEU();
for (const GCNRegPressure &RP : DAG.Pressure) {
AchievedOcc =
std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
REMAT_DEBUG({
dbgs() << "Retrying function scheduling with new min. occupancy of "
<< AchievedOcc << " from rematerializing (original was "
<< DAG.MinOccupancy;
if (TargetOcc)
dbgs() << ", target was " << *TargetOcc;
dbgs() << ")\n";
});
DAG.setTargetOccupancy(getStageTargetOccupancy());
return true;
}
void GCNSchedStage::finalizeGCNSchedStage() {
DAG.finishBlock();
LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}
void UnclusteredHighRPStage::finalizeGCNSchedStage() {
SavedMutations.swap(DAG.Mutations);
S.SGPRLimitBias = S.VGPRLimitBias = 0;
if (DAG.MinOccupancy > InitialOccupancy) {
assert(IsAnyRegionScheduled);
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
<< DAG.MinOccupancy << '\n');
} else if (!IsAnyRegionScheduled) {
assert(DAG.MinOccupancy == InitialOccupancy);
LLVM_DEBUG(dbgs() << StageID
<< ": No regions scheduled, min occupancy stays at "
<< DAG.MinOccupancy << ", MFI occupancy stays at "
<< MFI.getOccupancy() << ".\n");
}
GCNSchedStage::finalizeGCNSchedStage();
}
bool GCNSchedStage::initGCNRegion() {
// Skip empty scheduling region.
if (DAG.begin() == DAG.end())
return false;
// Check whether this new region is also a new block.
if (DAG.RegionBegin->getParent() != CurrentMBB)
setupNewBlock();
unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
// Skip regions with 1 schedulable instruction.
if (DAG.begin() == std::prev(DAG.end()))
return false;
LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
<< " " << CurrentMBB->getName()
<< "\n From: " << *DAG.begin() << " To: ";
if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
else dbgs() << "End";
dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
// Save original instruction order before scheduling for possible revert.
Unsched.clear();
Unsched.reserve(DAG.NumRegionInstrs);
if (StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule) {
const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
for (auto &I : DAG) {
Unsched.push_back(&I);
if (SII->isIGLPMutationOnly(I.getOpcode()))
DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
}
} else {
for (auto &I : DAG)
Unsched.push_back(&I);
}
PressureBefore = DAG.Pressure[RegionIdx];
LLVM_DEBUG(
dbgs() << "Pressure before scheduling:\nRegion live-ins:"
<< print(DAG.LiveIns[RegionIdx], DAG.MRI)
<< "Region live-in pressure: "
<< print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
<< "Region register pressure: " << print(PressureBefore));
S.HasHighPressure = false;
S.KnownExcessRP = isRegionWithExcessRP();
if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
SavedMutations.clear();
SavedMutations.swap(DAG.Mutations);
bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule;
DAG.addMutation(createIGroupLPDAGMutation(
IsInitialStage ? AMDGPU::SchedulingPhase::Initial
: AMDGPU::SchedulingPhase::PreRAReentry));
}
return true;
}
bool UnclusteredHighRPStage::initGCNRegion() {
// Only reschedule regions that have excess register pressure (i.e. spilling)
// or had minimum occupancy at the beginning of the stage (as long as
// rescheduling of previous regions did not make occupancy drop back down to
// the initial minimum).
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
// If no region has been scheduled yet, the DAG has not yet been updated with
// the occupancy target. So retrieve it from the temporary.
unsigned CurrentTargetOccupancy =
IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
if (!DAG.RegionsWithExcessRP[RegionIdx] &&
(CurrentTargetOccupancy <= InitialOccupancy ||
DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
InitialOccupancy))
return false;
bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
// If this is the first region scheduled during this stage, make the target
// occupancy changes in the DAG and MFI.
if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
IsAnyRegionScheduled = true;
if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
DAG.setTargetOccupancy(TempTargetOccupancy);
}
return IsSchedulingThisRegion;
}
bool ClusteredLowOccStage::initGCNRegion() {
// We may need to reschedule this region if it wasn't rescheduled in the last
// stage, or if we found it was testing critical register pressure limits in
// the unclustered reschedule stage. The later is because we may not have been
// able to raise the min occupancy in the previous stage so the region may be
// overly constrained even if it was already rescheduled.
if (!DAG.RegionsWithHighRP[RegionIdx])
return false;
return GCNSchedStage::initGCNRegion();
}
bool PreRARematStage::initGCNRegion() {
return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
}
void GCNSchedStage::setupNewBlock() {
if (CurrentMBB)
DAG.finishBlock();
CurrentMBB = DAG.RegionBegin->getParent();
DAG.startBlock(CurrentMBB);
// Get real RP for the region if it hasn't be calculated before. After the
// initial schedule stage real RP will be collected after scheduling.
if (StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule ||
StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}
void GCNSchedStage::finalizeGCNRegion() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
if (S.HasHighPressure)
DAG.RegionsWithHighRP[RegionIdx] = true;
// Revert scheduling if we have dropped occupancy or there is some other
// reason that the original schedule is better.
checkScheduling();
if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
SavedMutations.swap(DAG.Mutations);
}
void PreRARematStage::finalizeGCNRegion() {
GCNSchedStage::finalizeGCNRegion();
// When the goal is to increase occupancy, all regions must reach the target
// occupancy for rematerializations to be possibly useful, otherwise we will
// just hurt latency for no benefit. If minimum occupancy drops below the
// target there is no point in trying to re-schedule further regions.
if (!TargetOcc)
return;
RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore);
if (DAG.MinOccupancy < *TargetOcc) {
REMAT_DEBUG(dbgs() << "Region " << RegionIdx
<< " cannot meet occupancy target, interrupting "
"re-scheduling in all regions\n");
RescheduleRegions.reset();
}
}
void GCNSchedStage::checkScheduling() {
// Check the results of scheduling.
PressureAfter = DAG.getRealRegPressure(RegionIdx);
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
DAG.Pressure[RegionIdx] = PressureAfter;
// Early out if we have achieved the occupancy target.
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
unsigned TargetOccupancy = std::min(
S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
unsigned WavesAfter = std::min(
TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
unsigned WavesBefore = std::min(
TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n");
// We may not be able to keep the current target occupancy because of the just
// scheduled region. We might still be able to revert scheduling if the
// occupancy before was higher, or if the current schedule has register
// pressure higher than the excess limits which could lead to more spilling.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
// Allow memory bound functions to drop to 4 waves if not limited by an
// attribute.
if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
WavesAfter >= MFI.getMinAllowedOccupancy()) {
LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
<< MFI.getMinAllowedOccupancy() << " waves\n");
NewOccupancy = WavesAfter;
}
if (NewOccupancy < DAG.MinOccupancy) {
DAG.MinOccupancy = NewOccupancy;
MFI.limitOccupancy(DAG.MinOccupancy);
LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
<< DAG.MinOccupancy << ".\n");
}
// The maximum number of arch VGPR on non-unified register file, or the
// maximum VGPR + AGPR in the unified register file case.
unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
// The maximum number of arch VGPR for both unified and non-unified register
// file.
unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
PressureAfter.getAGPRNum() > MaxArchVGPRs ||
PressureAfter.getSGPRNum() > MaxSGPRs) {
DAG.RegionsWithHighRP[RegionIdx] = true;
DAG.RegionsWithExcessRP[RegionIdx] = true;
}
// Revert if this region's schedule would cause a drop in occupancy or
// spilling.
if (shouldRevertScheduling(WavesAfter)) {
modifyRegionSchedule(RegionIdx, DAG.BB, Unsched);
std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx];
} else {
DAG.Pressure[RegionIdx] = PressureAfter;
}
}
unsigned
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
DenseMap<unsigned, unsigned> &ReadyCycles,
const TargetSchedModel &SM) {
unsigned ReadyCycle = CurrCycle;
for (auto &D : SU.Preds) {
if (D.isAssignedRegDep()) {
MachineInstr *DefMI = D.getSUnit()->getInstr();
unsigned Latency = SM.computeInstrLatency(DefMI);
unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
}
}
ReadyCycles[SU.NodeNum] = ReadyCycle;
return ReadyCycle;
}
#ifndef NDEBUG
struct EarlierIssuingCycle {
bool operator()(std::pair<MachineInstr *, unsigned> A,
std::pair<MachineInstr *, unsigned> B) const {
return A.second < B.second;
}
};
static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
EarlierIssuingCycle> &ReadyCycles) {
if (ReadyCycles.empty())
return;
unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
<< " ##################\n# Cycle #\t\t\tInstruction "
" "
" \n";
unsigned IPrev = 1;
for (auto &I : ReadyCycles) {
if (I.second > IPrev + 1)
dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
<< " CYCLES DETECTED ******************************\n\n";
dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
IPrev = I.second;
}
}
#endif
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
ReadyCyclesSorted;
#endif
const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
unsigned SumBubbles = 0;
DenseMap<unsigned, unsigned> ReadyCycles;
unsigned CurrCycle = 0;
for (auto &SU : InputSchedule) {
unsigned ReadyCycle =
computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
#endif
CurrCycle = ++ReadyCycle;
}
#ifndef NDEBUG
LLVM_DEBUG(
printScheduleModel(ReadyCyclesSorted);
dbgs() << "\n\t"
<< "Metric: "
<< (SumBubbles
? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
: 1)
<< "\n\n");
#endif
return ScheduleMetrics(CurrCycle, SumBubbles);
}
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
ReadyCyclesSorted;
#endif
const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
unsigned SumBubbles = 0;
DenseMap<unsigned, unsigned> ReadyCycles;
unsigned CurrCycle = 0;
for (auto &MI : DAG) {
SUnit *SU = DAG.getSUnit(&MI);
if (!SU)
continue;
unsigned ReadyCycle =
computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
CurrCycle = ++ReadyCycle;
}
#ifndef NDEBUG
LLVM_DEBUG(
printScheduleModel(ReadyCyclesSorted);
dbgs() << "\n\t"
<< "Metric: "
<< (SumBubbles
? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
: 1)
<< "\n\n");
#endif
return ScheduleMetrics(CurrCycle, SumBubbles);
}
bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
if (WavesAfter < DAG.MinOccupancy)
return true;
// For dynamic VGPR mode, we don't want to waste any VGPR blocks.
if (DAG.MFI.isDynamicVGPREnabled()) {
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
PressureBefore.getVGPRNum(false));
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
PressureAfter.getVGPRNum(false));
if (BlocksAfter > BlocksBefore)
return true;
}
return false;
}
bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
if (PressureAfter == PressureBefore)
return false;
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;
if (mayCauseSpilling(WavesAfter))
return true;
return false;
}
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
// If RP is not reduced in the unclustered reschedule stage, revert to the
// old schedule.
if ((WavesAfter <=
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
mayCauseSpilling(WavesAfter)) ||
GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
return true;
}
// Do not attempt to relax schedule even more if we are already spilling.
if (isRegionWithExcessRP())
return false;
LLVM_DEBUG(
dbgs()
<< "\n\t *** In shouldRevertScheduling ***\n"
<< " *********** BEFORE UnclusteredHighRPStage ***********\n");
ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
LLVM_DEBUG(
dbgs()
<< "\n *********** AFTER UnclusteredHighRPStage ***********\n");
ScheduleMetrics MAfter = getScheduleMetrics(DAG);
unsigned OldMetric = MBefore.getMetric();
unsigned NewMetric = MAfter.getMetric();
unsigned WavesBefore = std::min(
S.getTargetOccupancy(),
PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
unsigned Profit =
((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
NewMetric) /
ScheduleMetrics::ScaleFactor;
LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
<< MAfter << "Profit: " << Profit << "\n");
return Profit < ScheduleMetrics::ScaleFactor;
}
bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
if (PressureAfter == PressureBefore)
return false;
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;
if (mayCauseSpilling(WavesAfter))
return true;
return false;
}
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
// When trying to increase occupancy (TargetOcc == true) the stage manages
// region reverts globally (all or none), so we always return false here.
return !TargetOcc && mayCauseSpilling(WavesAfter);
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
if (mayCauseSpilling(WavesAfter))
return true;
return false;
}
bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
unsigned WavesAfter) {
return mayCauseSpilling(WavesAfter);
}
bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
!PressureAfter.less(MF, PressureBefore)) {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
return true;
}
return false;
}
void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx,
MachineBasicBlock *MBB,
ArrayRef<MachineInstr *> MIOrder) {
assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
DAG.Regions[RegionIdx].second)) ==
MIOrder.size() &&
"instruction number mismatch");
if (MIOrder.empty())
return;
LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
// Reconstruct MI sequence by moving instructions in desired order before
// the current region's start.
MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
for (MachineInstr *MI : MIOrder) {
// Either move the next MI in order before the end of the region or move the
// region end past the MI if it is at the correct position.
MachineBasicBlock::iterator MII = MI->getIterator();
if (MII != RegionEnd) {
// Will subsequent splice move MI up past a non-debug instruction?
bool NonDebugReordered =
!MI->isDebugInstr() &&
skipDebugInstructionsForward(RegionEnd, MII) != MII;
MBB->splice(RegionEnd, MBB, MI);
// Only update LiveIntervals information if non-debug instructions are
// reordered. Otherwise debug instructions could cause code generation to
// change.
if (NonDebugReordered)
DAG.LIS->handleMove(*MI, true);
} else {
++RegionEnd;
}
if (MI->isDebugInstr()) {
LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
continue;
}
// Reset read-undef flags and update them later.
for (MachineOperand &Op : MI->all_defs())
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
if (DAG.ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
} else {
// Adjust for missing dead-def flags.
RegOpers.detectDeadDefs(*MI, *DAG.LIS);
}
LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
}
// The region end doesn't change throughout scheduling since it itself is
// outside the region (whether that is a MBB end or a terminator MI).
assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
DAG.Regions[RegionIdx].first = MIOrder.front();
}
bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
return false;
return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
}
bool RewriteMFMAFormStage::initHeuristics(
std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
bool Changed = false;
// Prepare for the heuristics
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
if (!isRewriteCandidate(&MI))
continue;
int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
assert(ReplacementOp != -1);
RewriteCands.push_back({&MI, MI.getOpcode()});
MI.setDesc(TII->get(ReplacementOp));
MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (Src2->isReg()) {
SmallVector<SlotIndex, 8> Src2ReachingDefs;
findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
// For any definition of the src2 register which is non-MFMA, we
// insert a copy.
for (SlotIndex RDIdx : Src2ReachingDefs) {
MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
if (!TII->isMAI(*RD))
CopyForDef.insert(RD);
}
}
MachineOperand &Dst = MI.getOperand(0);
SmallVector<MachineOperand *, 8> DstReachingUses;
findReachingUses(&MI, DAG.LIS, DstReachingUses);
for (MachineOperand *RUOp : DstReachingUses) {
if (TII->isMAI(*RUOp->getParent()))
continue;
// For any user of the result of the MFMA which is not an MFMA, we
// insert a copy. For a given register, we will only insert one copy
// per user block.
CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());
SmallVector<SlotIndex, 8> DstUsesReachingDefs;
findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
for (SlotIndex RDIndex : DstUsesReachingDefs) {
MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
if (TII->isMAI(*RD))
continue;
// For any definition of the user of the MFMA which is not an MFMA,
// we insert a copy. We do this to transform all the reaching defs
// of this use to AGPR. By doing this, we can insert a copy from
// AGPR to VGPR at the user rather than after the MFMA.
CopyForDef.insert(RD);
}
}
// Do the rewrite to allow for updated RP calculation.
const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Dst.getReg());
const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(VDefRC);
DAG.MRI.setRegClass(Dst.getReg(), ADefRC);
if (Src2->isReg()) {
// Have to get src types separately since subregs may cause C and D
// registers to be different types even though the actual operand is
// the same size.
const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Src2->getReg());
const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(VUseRC);
DAG.MRI.setRegClass(Src2->getReg(), AUseRC);
}
Changed = true;
}
}
return Changed;
}
int64_t RewriteMFMAFormStage::getRewriteCost(
const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
int64_t BestSpillCost = 0;
int64_t Cost = 0;
uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
std::pair<unsigned, unsigned> MaxVectorRegs =
ST.getMaxNumVectorRegs(MF.getFunction());
unsigned ArchVGPRThreshold = MaxVectorRegs.first;
unsigned AGPRThreshold = MaxVectorRegs.second;
unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
if (!RegionsWithExcessArchVGPR[Region])
continue;
GCNRegPressure &PressureBefore = DAG.Pressure[Region];
unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
// For the cases we care about (i.e. ArchVGPR usage is greater than the
// addressable limit), rewriting alone should bring pressure to manageable
// level. If we find any such region, then the rewrite is potentially
// beneficial.
GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
uint64_t BlockFreq =
MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
.getFrequency();
bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
uint64_t RelativeFreq = EntryFreq && BlockFreq
? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
: BlockFreq / EntryFreq)
: 1;
// This assumes perfect spilling / splitting -- using one spill / copy
// instruction and one restoreFrom / copy for each excess register,
int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
// Also account for the block frequency.
if (RelativeFreqIsDenom)
SpillCost /= (int64_t)RelativeFreq;
else
SpillCost *= (int64_t)RelativeFreq;
// If we have increased spilling in any block, just bail.
if (SpillCost > 0)
return SpillCost;
if (SpillCost < BestSpillCost)
BestSpillCost = SpillCost;
}
// Set the cost to the largest decrease in spill cost in order to not double
// count spill reductions.
Cost = BestSpillCost;
assert(Cost <= 0);
unsigned CopyCost = 0;
// For each CopyForDef, increase the cost by the register size while
// accounting for block frequency.
for (MachineInstr *DefMI : CopyForDef) {
Register DefReg = DefMI->getOperand(0).getReg();
uint64_t DefFreq =
EntryFreq
? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
: 1;
const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
CopyCost += RC->getCopyCost() * DefFreq;
}
// Account for CopyForUse copies in each block that the register is used.
for (auto &[UseBlock, UseRegs] : CopyForUse) {
uint64_t UseFreq =
EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
for (Register UseReg : UseRegs) {
const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
CopyCost += RC->getCopyCost() * UseFreq;
}
}
// Reset the classes that were changed to AGPR for better RB analysis.
// We must do rewriting after copy-insertion, as some defs of the register
// may require VGPR. Additionally, if we bail out and don't perform the
// rewrite then these need to be restored anyway.
for (auto &[MI, OriginalOpcode] : RewriteCands) {
assert(TII->isMAI(*MI));
const TargetRegisterClass *ADefRC =
DAG.MRI.getRegClass(MI->getOperand(0).getReg());
const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(ADefRC);
DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VDefRC);
MI->setDesc(TII->get(OriginalOpcode));
MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
assert(Src2);
if (!Src2->isReg())
continue;
// Have to get src types separately since subregs may cause C and D
// registers to be different types even though the actual operand is
// the same size.
const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Src2->getReg());
const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(AUseRC);
DAG.MRI.setRegClass(Src2->getReg(), VUseRC);
}
return Cost + CopyCost;
}
bool RewriteMFMAFormStage::rewrite(
const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
DenseMap<MachineInstr *, unsigned> LastMIToRegion;
for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
RegionBoundaries Entry = DAG.Regions[Region];
if (Entry.first == Entry.second)
continue;
FirstMIToRegion[&*Entry.first] = Region;
if (Entry.second != Entry.first->getParent()->end())
LastMIToRegion[&*Entry.second] = Region;
}
// Rewrite the MFMAs to AGPR, and insert any copies as needed.
// The general assumption of the algorithm (and the previous cost calculation)
// is that it is better to insert the copies in the MBB of the def of the src2
// operands, and in the MBB of the user of the dest operands. This is based on
// the assumption that the MFMAs are likely to appear in loop bodies, while
// the src2 and dest operands are live-in / live-out of the loop. Due to this
// design, the algorithm for finding copy insertion points is more
// complicated.
//
// There are three main cases to handle: 1. the reaching defs of the src2
// operands, 2. the reaching uses of the dst operands, and 3. the reaching
// defs of the reaching uses of the dst operand.
//
// In the first case, we simply insert copies after each of the reaching
// definitions. In the second case, we collect all the uses of a given dest
// and organize them by MBB. Then, we insert 1 copy for each MBB before the
// earliest use. Since the use may have multiple reaching defs, and since we
// want to replace the register it is using with the result of the copy, we
// must handle case 3. In the third case, we simply insert a copy after each
// of the reaching defs to connect to the copy of the reaching uses of the dst
// reg. This allows us to avoid inserting copies next to the MFMAs.
//
// While inserting the copies, we maintain a map of operands which will use
// different regs (i.e. the result of the copies). For example, a case 1 src2
// operand will use the register result of the copies after the reaching defs,
// as opposed to the original register. Now that we have completed our copy
// analysis and placement, we can bulk update the registers. We do this
// separately as to avoid complicating the reachingDef and reachingUse
// queries.
//
// While inserting the copies, we also maintain a list or registers which we
// will want to reclassify as AGPR. After doing the copy insertion and the
// register replacement, we can finally do the reclassification. This uses the
// redef map, as the registers we are interested in reclassifying may be
// replaced by the result of a copy. We must do this after the copy analysis
// and placement as we must have an accurate redef map -- otherwise we may end
// up creating illegal instructions.
// The original registers of the MFMA that need to be reclassified as AGPR.
DenseSet<Register> RewriteRegs;
// The map of an original register in the MFMA to a new register (result of a
// copy) that it should be replaced with.
DenseMap<Register, Register> RedefMap;
// The map of the original MFMA registers to the relevant MFMA operands.
DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
// The map of reaching defs for a given register -- to avoid duplicate copies.
DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
// The map of reaching uses for a given register by basic block -- to avoid
// duplicate copies and to calculate per MBB insert pts.
DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
ReachingUseTracker;
for (auto &[MI, OriginalOpcode] : RewriteCands) {
int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
if (ReplacementOp == -1)
continue;
MI->setDesc(TII->get(ReplacementOp));
// Case 1: insert copies for the reaching defs of the Src2Reg.
MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
if (Src2->isReg()) {
Register Src2Reg = Src2->getReg();
if (!Src2Reg.isVirtual())
return false;
Register MappedReg = Src2->getReg();
SmallVector<SlotIndex, 8> Src2ReachingDefs;
findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
for (SlotIndex RDIndex : Src2ReachingDefs) {
MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
if (TII->isMAI(*RD))
continue;
// If there is a non mai reaching def, then we need a copy.
Src2DefsReplace.insert(RD);
}
if (!Src2DefsReplace.empty()) {
DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg);
if (RI != RedefMap.end()) {
MappedReg = RI->second;
} else {
assert(!ReachingDefCopyMap.contains(Src2Reg));
const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
const TargetRegisterClass *VGPRRC =
SRI->getEquivalentVGPRClass(Src2RC);
// Track the mapping of the original register to the new register.
MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
RedefMap[Src2Reg] = MappedReg;
}
// If none exists, create a copy from this reaching def.
// We may have inserted a copy already in an earlier iteration.
for (MachineInstr *RD : Src2DefsReplace) {
// Do not create redundant copies.
if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
MachineInstrBuilder VGPRCopy =
BuildMI(*RD->getParent(), std::next(RD->getIterator()),
RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
.addDef(MappedReg, {}, 0)
.addUse(Src2Reg, {}, 0);
DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
// If this reaching def was the last MI in the region, update the
// region boundaries.
if (LastMIToRegion.contains(RD)) {
unsigned UpdateRegion = LastMIToRegion[RD];
DAG.Regions[UpdateRegion].second = VGPRCopy;
LastMIToRegion.erase(RD);
}
}
}
}
// Track the register for reclassification
RewriteRegs.insert(Src2Reg);
// Always insert the operand for replacement. If this corresponds with a
// chain of tied-def we may not see the VGPR requirement until later.
ReplaceMap[Src2Reg].insert(Src2);
}
// Case 2 and Case 3: insert copies before the reaching uses of the dsts,
// and after the reaching defs of the reaching uses of the dsts.
MachineOperand *Dst = &MI->getOperand(0);
Register DstReg = Dst->getReg();
if (!DstReg.isVirtual())
return false;
Register MappedReg = DstReg;
SmallVector<MachineOperand *, 8> DstReachingUses;
SmallVector<MachineOperand *, 8> DstReachingUseCopies;
SmallVector<MachineInstr *, 8> DstUseDefsReplace;
findReachingUses(MI, DAG.LIS, DstReachingUses);
for (MachineOperand *RUOp : DstReachingUses) {
if (TII->isMAI(*RUOp->getParent()))
continue;
// If there is a non mai reaching use, then we need a copy.
if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
DstReachingUseCopies.push_back(RUOp);
SmallVector<SlotIndex, 8> DstUsesReachingDefs;
findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
for (SlotIndex RDIndex : DstUsesReachingDefs) {
MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
if (TII->isMAI(*RD))
continue;
// If there is a non mai reaching def of this reaching use, then we will
// need a copy.
if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
DstUseDefsReplace.push_back(RD);
}
}
if (!DstUseDefsReplace.empty()) {
DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg);
if (RI != RedefMap.end()) {
MappedReg = RI->second;
} else {
assert(!ReachingDefCopyMap.contains(DstReg));
const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
// Track the mapping of the original register to the new register.
MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
RedefMap[DstReg] = MappedReg;
}
// If none exists, create a copy from this reaching def.
// We may have inserted a copy already in an earlier iteration.
for (MachineInstr *RD : DstUseDefsReplace) {
// Do not create reundant copies.
if (ReachingDefCopyMap[DstReg].insert(RD).second) {
MachineInstrBuilder VGPRCopy =
BuildMI(*RD->getParent(), std::next(RD->getIterator()),
RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
.addDef(MappedReg, {}, 0)
.addUse(DstReg, {}, 0);
DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
// If this reaching def was the last MI in the region, update the
// region boundaries.
DenseMap<MachineInstr *, unsigned>::iterator LMI =
LastMIToRegion.find(RD);
if (LMI != LastMIToRegion.end()) {
unsigned UpdateRegion = LMI->second;
DAG.Regions[UpdateRegion].second = VGPRCopy;
LastMIToRegion.erase(RD);
}
}
}
}
DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
for (MachineOperand *RU : DstReachingUseCopies) {
MachineBasicBlock *RUBlock = RU->getParent()->getParent();
// Just keep track of the reaching use of this register by block. After we
// have scanned all the MFMAs we can find optimal insert pts.
if (RUBlock != MI->getParent()) {
ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
continue;
}
// Special case, the use is in the same block as the MFMA. Insert the copy
// just before the use.
const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
MachineInstr *UseInst = RU->getParent();
MachineInstrBuilder VGPRCopy =
BuildMI(*UseInst->getParent(), UseInst->getIterator(),
UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
.addDef(NewUseReg, {}, 0)
.addUse(DstReg, {}, 0);
DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
// Since we know this use has only one reaching def, we can replace the
// use reg.
RU->setReg(NewUseReg);
// Track the copy source operand for r eplacement.
DstRegSet.insert(&VGPRCopy->getOperand(1));
}
// Track the register for reclassification
RewriteRegs.insert(DstReg);
// Insert the dst operand for replacement. If this dst is in a chain of
// tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
// all the correspond operands need to be replaced.
DstRegSet.insert(Dst);
}
// Handle the copies for dst uses.
using RUBType =
std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
for (RUBType RUBlockEntry : ReachingUseTracker) {
using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
for (RUDType RUDst : RUBlockEntry.second) {
MachineOperand *OpBegin = *RUDst.second.begin();
SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
// Find the earliest use in this block.
for (MachineOperand *User : RUDst.second) {
SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
InstPt = NewInstPt;
}
const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
MachineInstrBuilder VGPRCopy =
BuildMI(*UseInst->getParent(), UseInst->getIterator(),
UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
.addDef(NewUseReg, {}, 0)
.addUse(RUDst.first, {}, 0);
DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
// If this UseInst was the first MI in the region, update the region
// boundaries.
DenseMap<MachineInstr *, unsigned>::iterator FI =
FirstMIToRegion.find(UseInst);
if (FI != FirstMIToRegion.end()) {
unsigned UpdateRegion = FI->second;
DAG.Regions[UpdateRegion].first = VGPRCopy;
FirstMIToRegion.erase(UseInst);
}
// Replace the operand for all users.
for (MachineOperand *User : RUDst.second) {
User->setReg(NewUseReg);
}
// Track the copy source operand for replacement.
ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
}
}
// We may have needed to insert copies after the reaching defs of the MFMAs.
// Replace the original register with the result of the copy for all relevant
// operands.
for (std::pair<Register, Register> NewDef : RedefMap) {
Register OldReg = NewDef.first;
Register NewReg = NewDef.second;
// Replace the register for any associated operand in the MFMA chain.
for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
ReplaceOp->setReg(NewReg);
}
// Finally, do the reclassification of the MFMA registers.
for (Register RewriteReg : RewriteRegs) {
Register RegToRewrite = RewriteReg;
// Be sure to update the replacement register and not the original.
DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg);
if (RI != RedefMap.end())
RegToRewrite = RI->second;
const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
}
// Bulk update the LIS.
DAG.LIS->reanalyze(DAG.MF);
// Liveins may have been modified for cross RC copies
RegionPressureMap LiveInUpdater(&DAG, false);
LiveInUpdater.buildLiveRegMap();
for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
return true;
}
unsigned PreRARematStage::getStageTargetOccupancy() const {
return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
}
bool PreRARematStage::setObjective() {
const Function &F = MF.getFunction();
// Set up "spilling targets" for all regions.
unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
bool HasVectorRegisterExcess = false;
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
const GCNRegPressure &RP = DAG.Pressure[I];
GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP);
if (!Target.satisfied())
TargetRegions.set(I);
HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
}
if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
// In addition to register usage being above addressable limits, occupancy
// below the minimum is considered like "spilling" as well.
TargetOcc = std::nullopt;
} else {
// There is no spilling and room to improve occupancy; set up "increased
// occupancy targets" for all regions.
TargetOcc = DAG.MinOccupancy + 1;
const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
for (auto [I, Target] : enumerate(RPTargets)) {
Target.setTarget(MaxSGPRs, MaxVGPRs);
if (!Target.satisfied())
TargetRegions.set(I);
}
}
return TargetRegions.any();
}
bool PreRARematStage::collectRematRegs(
const DenseMap<MachineInstr *, unsigned> &MIRegion) {
// We need up-to-date live-out info. to query live-out register masks in
// regions containing rematerializable instructions.
DAG.RegionLiveOuts.buildLiveRegMap();
// Set of registers already marked for potential remterialization; used to
// avoid rematerialization chains.
SmallSet<Register, 4> MarkedRegs;
auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
return MO.isReg() && MarkedRegs.contains(MO.getReg());
};
// Identify rematerializable instructions in the function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
RegionBoundaries Bounds = DAG.Regions[I];
for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
// The instruction must be rematerializable.
MachineInstr &DefMI = *MI;
if (!isReMaterializable(DefMI))
continue;
// We only support rematerializing virtual registers with one
// definition.
Register Reg = DefMI.getOperand(0).getReg();
if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
continue;
// We only care to rematerialize the instruction if it has a single
// non-debug user in a different region.
// FIXME: Allow rematerializations with multiple uses. This should be
// relatively easy to support using the current cost model.
MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
if (!UseMI)
continue;
auto UseRegion = MIRegion.find(UseMI);
if (UseRegion == MIRegion.end() || UseRegion->second == I)
continue;
// Do not rematerialize an instruction if it uses or is used by an
// instruction that we have designated for rematerialization.
// FIXME: Allow for rematerialization chains: this requires 1. updating
// remat points to account for uses that are rematerialized, and 2.
// either rematerializing the candidates in careful ordering, or
// deferring the MBB RP walk until the entire chain has been
// rematerialized.
const MachineOperand &UseMO = UseMI->getOperand(0);
if (IsMarkedForRemat(UseMO) ||
llvm::any_of(DefMI.operands(), IsMarkedForRemat))
continue;
// Do not rematerialize an instruction it it uses registers that aren't
// available at its use. This ensures that we are not extending any live
// range while rematerializing.
SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
*DAG.TII))
continue;
// Add the instruction to the rematerializable list.
MarkedRegs.insert(Reg);
RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion);
}
}
return !RematRegs.empty();
}
PreRARematStage::RematReg::RematReg(
MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
const DenseMap<MachineInstr *, unsigned> &MIRegion)
: DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) {
// Mark regions in which the rematerializable register is live.
Register Reg = getReg();
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto LiveInIt = DAG.LiveIns[I].find(Reg);
if (LiveInIt != DAG.LiveIns[I].end())
LiveIn.set(I);
const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I);
if (auto LiveOutIt = LiveOuts.find(Reg); LiveOutIt != LiveOuts.end())
LiveOut.set(I);
}
Live |= LiveIn;
Live |= LiveOut;
Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(DefRegion).at(Reg);
}
bool PreRARematStage::RematReg::maybeBeneficial(
const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
Register Reg = getReg();
for (unsigned I : TargetRegions.set_bits()) {
if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
return true;
}
return false;
}
void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
MachineInstr *RematMI,
GCNScheduleDAGMILive &DAG) const {
RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI)))
Bounds.first = RematMI;
DAG.LIS->InsertMachineInstrInMaps(*RematMI);
DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg());
}
PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
MachineFunction &MF, const GCNScheduleDAGMILive &DAG) {
assert(DAG.MLI && "MLI not defined in DAG");
MachineBranchProbabilityInfo MBPI;
MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
const unsigned NumRegions = DAG.Regions.size();
MinFreq = MBFI.getEntryFreq().getFrequency();
MaxFreq = 0;
Regions.reserve(NumRegions);
for (unsigned I = 0; I < NumRegions; ++I) {
MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
Regions.push_back(BlockFreq);
if (BlockFreq && BlockFreq < MinFreq)
MinFreq = BlockFreq;
else if (BlockFreq > MaxFreq)
MaxFreq = BlockFreq;
}
if (!MinFreq)
return;
// Scale everything down if frequencies are high.
if (MinFreq >= ScaleFactor * ScaleFactor) {
for (uint64_t &Freq : Regions)
Freq /= ScaleFactor;
MinFreq /= ScaleFactor;
MaxFreq /= ScaleFactor;
}
}
PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
const GCNScheduleDAGMILive &DAG)
: Remat(Remat), FreqDiff(getFreqDiff(Freq)) {
RPSave.inc(Remat->getReg(), LaneBitmask::getNone(), Remat->Mask, DAG.MRI);
}
int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
// Get frequencies of defining and using regions. A rematerialization from the
// least frequent region to the most frequent region will yield the greatest
// latency penalty and therefore should get minimum score. Reciprocally, a
// rematerialization in the other direction should get maximum score. Default
// to values that will yield the worst possible score given known frequencies
// in order to penalize rematerializations from or into regions whose
// frequency is unknown.
int64_t DefOrMin = std::max(Freq.Regions[Remat->DefRegion], Freq.MinFreq);
int64_t UseOrMax = Freq.Regions[Remat->UseRegion];
if (!UseOrMax)
UseOrMax = Freq.MaxFreq;
return DefOrMin - UseOrMax;
}
void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
ArrayRef<GCNRPTarget> RPTargets,
const FreqInfo &FreqInfo,
bool ReduceSpill) {
MaxFreq = 0;
RegionImpact = 0;
for (unsigned I : TargetRegions.set_bits()) {
if (!Remat->Live[I])
continue;
// The rematerialization must contribute positively in at least one
// register class with usage above the RP target for this region to
// contribute to the score.
const GCNRPTarget &RegionTarget = RPTargets[I];
const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(RPSave);
if (!NumRegsBenefit)
continue;
bool UnusedLT = Remat->isUnusedLiveThrough(I);
// Regions in which RP is guaranteed to decrease have more weight.
RegionImpact += (UnusedLT ? 2 : 1) * NumRegsBenefit;
if (ReduceSpill) {
uint64_t Freq = FreqInfo.Regions[I];
if (!UnusedLT) {
// Apply a frequency penalty in regions in which we are not sure that RP
// will decrease.
Freq /= 2;
}
MaxFreq = std::max(MaxFreq, Freq);
}
}
}
MachineInstr *PreRARematStage::ScoredRemat::rematerialize(
BitVector &RecomputeRP, SmallVectorImpl<GCNRPTarget> &RPTargets,
GCNScheduleDAGMILive &DAG) const {
const SIInstrInfo *TII = DAG.MF.getSubtarget<GCNSubtarget>().getInstrInfo();
MachineInstr &DefMI = *Remat->DefMI;
Register Reg = DefMI.getOperand(0).getReg();
Register NewReg = DAG.MRI.cloneVirtualRegister(Reg);
// Rematerialize the register in the region where it is used.
MachineBasicBlock::iterator InsertPos = Remat->UseMI;
TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI);
MachineInstr *RematMI = &*std::prev(InsertPos);
Remat->UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI);
Remat->insertMI(Remat->UseRegion, RematMI, DAG);
#ifdef EXPENSIVE_CHECKS
// All uses are known to be available / live at the remat point. Thus,
// the uses should already be live in to the using region.
for (MachineOperand &MO : DefMI.operands()) {
if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
continue;
Register UseReg = MO.getReg();
if (!UseReg.isVirtual())
continue;
LiveInterval &LI = DAG.LIS->getInterval(UseReg);
LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
if (LI.hasSubRanges() && MO.getSubReg())
LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
LaneBitmask LiveInMask = DAG.LiveIns[Remat->UseRegion].at(UseReg);
LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
// If this register has lanes not covered by the LiveIns, be sure they
// do not map to any subrange. ref:
// machine-scheduler-sink-trivial-remats.mir::omitted_subrange
if (UncoveredLanes.any()) {
assert(LI.hasSubRanges());
for (LiveInterval::SubRange &SR : LI.subranges())
assert((SR.LaneMask & UncoveredLanes).none());
}
}
#endif
// Remove the register from all regions where it is a live-in or live-out
// and adjust RP targets. The save is guaranteed in regions in which the
// register is live-through and unused but optimistic in all other regions
// where the register is live.
for (unsigned I : Remat->Live.set_bits()) {
RPTargets[I].saveRP(RPSave);
DAG.LiveIns[I].erase(Reg);
DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
if (!Remat->isUnusedLiveThrough(I))
RecomputeRP.set(I);
}
return RematMI;
}
void PreRARematStage::commitRematerializations() const {
REMAT_DEBUG(dbgs() << "Commiting all rematerializations\n");
for (const RollbackInfo &Rollback : Rollbacks)
DAG.deleteMI(Rollback.Remat->DefRegion, Rollback.Remat->DefMI);
}
void PreRARematStage::unsetSatisfiedRPTargets(const BitVector &Regions) {
for (unsigned I : Regions.set_bits()) {
if (TargetRegions[I] && RPTargets[I].satisfied()) {
REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
TargetRegions.reset(I);
}
}
}
bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
bool TooOptimistic = false;
for (unsigned I : Regions.set_bits()) {
GCNRPTarget &Target = RPTargets[I];
Target.setRP(DAG.getRealRegPressure(I));
// Since we were optimistic in assessing RP decreases in these regions, we
// may need to remark the target as a target region if RP didn't decrease
// as expected.
if (!TargetRegions[I] && !Target.satisfied()) {
REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
TooOptimistic = true;
TargetRegions.set(I);
}
}
return TooOptimistic;
}
// Copied from MachineLICM
bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
if (!DAG.TII->isReMaterializable(MI))
return false;
for (const MachineOperand &MO : MI.all_uses()) {
// We can't remat physreg uses, unless it is a constant or an ignorable
// use (e.g. implicit exec use on VALU instructions)
if (MO.getReg().isPhysical()) {
if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
continue;
return false;
}
}
return true;
}
void PreRARematStage::finalizeGCNSchedStage() {
// We consider that reducing spilling is always beneficial so we never
// rollback rematerializations or revert scheduling in such cases.
if (!TargetOcc)
return;
// When increasing occupancy, it is possible that re-scheduling is not able to
// achieve the target occupancy in all regions, in which case re-scheduling in
// all regions should be reverted.
if (DAG.MinOccupancy >= *TargetOcc) {
commitRematerializations();
return;
}
// It is possible that re-scheduling lowers occupancy over the one achieved
// just through rematerializations, in which case we revert re-scheduling in
// all regions but do not roll back rematerializations.
const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc;
// When we both need to revert re-scheduling and rollback rematerializations,
// restore rematerialized MIs' original state before reverting so that they
// are treated as non-debug instructions by the revert logic.
if (ShouldRollbackRemats) {
for (const RollbackInfo &Rollback : Rollbacks) {
const auto &[Remat, RematMI, RegMap] = Rollback;
Remat->DefMI->setDesc(DAG.TII->get(RematMI->getOpcode()));
for (const auto &[MOIdx, Reg] : RegMap)
Remat->DefMI->getOperand(MOIdx).setReg(Reg);
}
}
// Revert re-scheduling in all affected regions.
for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
<< '\n');
DAG.Pressure[RegionIdx] = MaxPressure;
modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder);
}
if (!ShouldRollbackRemats) {
commitRematerializations();
DAG.setTargetOccupancy(AchievedOcc);
return;
}
// Reset the target occupancy to what it was pre-rematerialization.
DAG.setTargetOccupancy(*TargetOcc - 1);
// Finish rolling back rematerializations, then recompute pressure in all
// affected regions.
REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
BitVector RecomputeRP(DAG.Regions.size());
DenseSet<Register> RecomputeLI;
for (const RollbackInfo &Rollback : Rollbacks) {
const auto &[Remat, RematMI, RegMap] = Rollback;
// Switch back to using the original register and delete the
// rematerialization.
Register Reg = RematMI->getOperand(0).getReg();
Register OriginalReg = Remat->DefMI->getOperand(0).getReg();
Remat->UseMI->substituteRegister(Reg, OriginalReg, 0, *DAG.TRI);
REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion
<< "] Deleting rematerialization " << *RematMI);
DAG.deleteMI(Remat->UseRegion, RematMI);
// Re-add the defined register as a live-in/live-out in all regions it used
// to be one in.
std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask);
for (unsigned I : Remat->LiveIn.set_bits())
DAG.LiveIns[I].insert(LiveReg);
for (unsigned I : Remat->LiveOut.set_bits())
DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg);
RecomputeRP |= Rollback.Remat->Live;
// Regenerate intervals for all register operands of rematerialized MIs as
// slot indices may have changed slightly from before re-scheduling.
for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) {
if (MO.isReg() && MO.getReg().isVirtual())
RecomputeLI.insert(MO.getReg());
}
}
for (Register Reg : RecomputeLI) {
DAG.LIS->removeInterval(Reg);
DAG.LIS->createAndComputeVirtRegInterval(Reg);
}
#ifdef EXPENSIVE_CHECKS
// In particular, we want to check for coherent MI/slot order in regions in
// which reverts and/or rollbacks may have happened.
MF.verify();
#endif
for (unsigned I : RecomputeRP.set_bits())
DAG.Pressure[I] = DAG.getRealRegPressure(I);
GCNSchedStage::finalizeGCNSchedStage();
}
void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
// It's not possible for the deleted instruction to be upper region boundary
// since we don't delete region terminators.
if (Regions[RegionIdx].first == MI)
Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI));
LIS->removeInterval(MI->getOperand(0).getReg());
LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
}
void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
MinOccupancy = TargetOccupancy;
if (MFI.getOccupancy() < TargetOccupancy)
MFI.increaseOccupancy(MF, MinOccupancy);
else
MFI.limitOccupancy(MinOccupancy);
}
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
return SII->isIGLPMutationOnly(MI->getOpcode());
});
}
GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
bool RemoveKillFlags)
: ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
void GCNPostScheduleDAGMILive::schedule() {
HasIGLPInstrs = hasIGLPInstrs(this);
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
}
ScheduleDAGMI::schedule();
}
void GCNPostScheduleDAGMILive::finalizeSchedule() {
if (HasIGLPInstrs)
SavedMutations.swap(Mutations);
ScheduleDAGMI::finalizeSchedule();
}