blob: 1684690cd829eb78f183c21e86fd6227cd72cb1d [file] [log] [blame] [edit]
//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Coexecution-focused scheduling strategy for AMDGPU.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
#include "GCNSchedStrategy.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
namespace AMDGPU {
//===----------------------------------------------------------------------===//
// Instruction Flavor Classification
//===----------------------------------------------------------------------===//
/// Coarse classification of an instruction by the hardware unit it occupies.
/// Used to model per-region hardware resource pressure (see HardwareUnitInfo).
enum class InstructionFlavor : uint8_t {
  WMMA,            // WMMA/MFMA matrix operations
  SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
  TRANS,           // Transcendental ops (v_exp, v_log, etc.)
  MultiCycleVALU,  // VALU instructions with repeat rate > 1
  VMEM,            // FLAT/GLOBAL memory operations
  DS,              // LDS/GDS operations
  SALU,            // Scalar ALU
  DMA,             // Tensor DMA operations
  Fence,           // Fences and waits
  Other,           // Everything else
  NUM_FLAVORS      // Sentinel: count of flavors; not a real flavor.
};
/// \returns a human-readable name for \p F, for use in debug output.
inline StringRef getFlavorName(InstructionFlavor F) {
  // Table indexed by enumerator value; order must match the declaration
  // order of InstructionFlavor (NUM_FLAVORS maps to "???").
  static const StringRef Names[] = {"WMMA", "VALU(1c)", "TRANS", "VALU(Nc)",
                                    "VMEM", "DS",       "SALU",  "DMA",
                                    "Fence", "Other",   "???"};
  unsigned Idx = static_cast<unsigned>(F);
  if (Idx < sizeof(Names) / sizeof(Names[0]))
    return Names[Idx];
  llvm_unreachable("Unknown InstructionFlavor");
}
/// \returns a one-character mnemonic for \p F, for compact debug dumps.
inline StringRef getFlavorShortName(InstructionFlavor F) {
  // Table indexed by enumerator value; order must match the declaration
  // order of InstructionFlavor (NUM_FLAVORS maps to "?").
  static const StringRef ShortNames[] = {"W", "V", "T", "C", "M", "D",
                                         "S", "X", "F", "O", "?"};
  unsigned Idx = static_cast<unsigned>(F);
  if (Idx < sizeof(ShortNames) / sizeof(ShortNames[0]))
    return ShortNames[Idx];
  llvm_unreachable("Unknown InstructionFlavor");
}
InstructionFlavor classifyFlavor(const MachineInstr &MI,
const SIInstrInfo &SII);
using FlavorGroup = SmallVector<InstructionFlavor, 4>;
/// Convenience constructors for commonly-used groups of flavors.
namespace FlavorGroups {
/// \returns the group of every VALU-executing flavor (single-cycle,
/// transcendental, and multi-cycle VALU).
inline FlavorGroup allVALU() {
  FlavorGroup Group;
  Group.push_back(InstructionFlavor::SingleCycleVALU);
  Group.push_back(InstructionFlavor::TRANS);
  Group.push_back(InstructionFlavor::MultiCycleVALU);
  return Group;
}
/// \returns the group of every memory flavor (VMEM, DS, and tensor DMA).
inline FlavorGroup allMem() {
  FlavorGroup Group;
  Group.push_back(InstructionFlavor::VMEM);
  Group.push_back(InstructionFlavor::DS);
  Group.push_back(InstructionFlavor::DMA);
  return Group;
}
/// \returns a group containing only the single flavor \p F.
inline FlavorGroup individual(InstructionFlavor F) {
  FlavorGroup Group;
  Group.push_back(F);
  return Group;
}
/// \returns a group containing every flavor, in declaration order.
inline FlavorGroup all() {
  FlavorGroup Group;
  unsigned NumFlavors = static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS);
  for (unsigned Idx = 0; Idx != NumFlavors; ++Idx)
    Group.push_back(static_cast<InstructionFlavor>(Idx));
  return Group;
}
} // namespace FlavorGroups
/// AMDGPU-specific scheduling decision reasons. These provide more granularity
/// than the generic CandReason enum for debugging purposes.
enum class AMDGPUSchedReason : uint8_t {
  None,                // No AMDGPU-specific reason recorded.
  CritResourceBalance, // tryCriticalResource chose based on resource pressure
  CritResourceDep,     // tryCriticalResourceDependency chose based on enabling
  NUM_REASONS          // Sentinel: count of reasons; not a real reason.
};
/// \returns a human-readable name for \p R, for use in debug output.
inline StringRef getReasonName(AMDGPUSchedReason R) {
  // Table indexed by enumerator value; order must match the declaration
  // order of AMDGPUSchedReason (NUM_REASONS maps to "???").
  static const StringRef ReasonNames[] = {"None", "CritResource",
                                          "CritResourceDep", "???"};
  unsigned Idx = static_cast<unsigned>(R);
  if (Idx < sizeof(ReasonNames) / sizeof(ReasonNames[0]))
    return ReasonNames[Idx];
  llvm_unreachable("Unknown AMDGPUSchedReason");
}
} // End namespace AMDGPU
//===----------------------------------------------------------------------===//
// Hardware Unit Information
//===----------------------------------------------------------------------===//
/// HardwareUnitInfo is a wrapper class which maps to some real hardware
/// resource. This is used to model hardware resource pressure per region, and
/// guide scheduling heuristics.
class HardwareUnitInfo {
private:
  /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
  /// for this HardwareUnit. This is used for agreement between
  /// tryCriticalResourceDependency and tryCriticalResource: we schedule the
  /// dependencies for a SU on critical resource, then schedule that same SU on
  /// the critical resource. This agreement results in shorter live ranges and
  /// more regular HardwareUnit access patterns. SUs are prioritized based on
  /// depth for top-down scheduling.
  SmallSetVector<SUnit *, 16> PrioritySUs;

  /// All the SUs in the region that consume this resource.
  SmallSetVector<SUnit *, 16> AllSUs;

  /// The total number of busy cycles for this HardwareUnit for a given region.
  unsigned TotalCycles = 0;

  /// The InstructionFlavor this HardwareUnit models. In-class initialized so a
  /// default-constructed object matches the state established by reset();
  /// previously this member was left indeterminate by the default constructor.
  AMDGPU::InstructionFlavor Type = AMDGPU::InstructionFlavor::Other;

  /// Whether or not instructions on this HardwareUnit may produce a window in
  /// which instructions in other HardwareUnits can coexecute. For example, WMMA
  /// / MFMA instructions may take multiple cycles, which may be overlapped with
  /// instructions on other HardwareUnits.
  bool ProducesCoexecWindow = false;

public:
  HardwareUnitInfo() = default;

  /// \returns the number of SUs in the region that consume this HardwareUnit.
  unsigned size() const { return AllSUs.size(); }

  /// \returns the accumulated busy cycles for this HardwareUnit in the region.
  unsigned getTotalCycles() const { return TotalCycles; }

  /// Set the InstructionFlavor this HardwareUnit models. \p TheType must be a
  /// valid flavor value (strictly less than NUM_FLAVORS).
  void setType(unsigned TheType) {
    assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS);
    Type = (AMDGPU::InstructionFlavor)(TheType);
  }

  AMDGPU::InstructionFlavor getType() const { return Type; }

  bool producesCoexecWindow() const { return ProducesCoexecWindow; }
  void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }

  /// \returns true if \p SU consumes this HardwareUnit.
  bool contains(SUnit *SU) const { return AllSUs.contains(SU); }

  /// \returns the SUnit with higher priority between \p SU and \p Other, or
  /// nullptr if there is no difference in priority. This method looks through
  /// the PrioritySUs to determine if one SU is more prioritized than the
  /// other. If neither is in the PrioritySUs list, then neither has priority
  /// over the other.
  SUnit *getHigherPriority(SUnit *SU, SUnit *Other) const {
    // Earlier entries in PrioritySUs have higher priority, so the first of the
    // two we encounter wins.
    for (auto *SUOrder : PrioritySUs) {
      if (SUOrder == SU)
        return SU;
      if (SUOrder == Other)
        return Other;
    }
    return nullptr;
  }

  /// Restore the default-constructed state (also used between regions).
  void reset() {
    AllSUs.clear();
    PrioritySUs.clear();
    TotalCycles = 0;
    Type = AMDGPU::InstructionFlavor::Other;
    ProducesCoexecWindow = false;
  }

  /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
  /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
  /// ready) to AllSUs to attempt to find a target SU. When looking through
  /// AllSUs we pick the target SU by minimal depth for top-down
  /// scheduling. getNextTargetSU is useful for determining which SU on this
  /// HardwareUnit we are trying to schedule - this info helps us determine
  /// which dependencies to schedule. LookDeep is useful if the dependencies are
  /// long latency (e.g. memory instructions). If we have many long latency
  /// dependencies, it is beneficial to enable SUs multiple levels ahead.
  SUnit *getNextTargetSU(bool LookDeep = false) const;

  /// Insert the \p SU into the AllSUs and account its \p BlockingCycles into
  /// the TotalCycles. This maintains the list of PrioritySUs.
  void insert(SUnit *SU, unsigned BlockingCycles);

  /// Update the state for \p SU being scheduled by removing it from the AllSUs
  /// and reducing its \p BlockingCycles from the TotalCycles. This maintains
  /// the list of PrioritySUs.
  void markScheduled(SUnit *SU, unsigned BlockingCycles);
};
//===----------------------------------------------------------------------===//
// Candidate Heuristics
//===----------------------------------------------------------------------===//
/// CandidateHeuristics contains state and implementations to facilitate making
/// per instruction scheduling decisions; it contains methods used in
/// tryCandidate to decide which instruction to schedule next.
class CandidateHeuristics {
protected:
  // Non-owning pointers into scheduler state; set by initialize(). In-class
  // null-initialized so a default-constructed object is in a well-defined
  // state (previously these were left indeterminate).
  ScheduleDAGMI *DAG = nullptr;
  const SIInstrInfo *SII = nullptr;
  const SIRegisterInfo *SRI = nullptr;
  const TargetSchedModel *SchedModel = nullptr;

  /// Per-HardwareUnit pressure/priority state for the current region.
  SmallVector<HardwareUnitInfo, 8> HWUInfo;

  /// Walk over the region and collect total usage per HardwareUnit.
  void collectHWUIPressure();

  /// Compute the blocking cycles for the appropriate HardwareUnit given an \p
  /// SU.
  unsigned getHWUICyclesForInst(SUnit *SU);

  /// Given a \p Flavor, find the corresponding HardwareUnit. \returns the
  /// mapped HardwareUnit.
  HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);

public:
  CandidateHeuristics() = default;

  /// Bind this object to \p DAG / \p SchedModel / \p TRI for the current
  /// scheduling run.
  void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel,
                  const TargetRegisterInfo *TRI);

  /// Update the state to reflect that \p SU is going to be scheduled.
  void updateForScheduling(SUnit *SU);

  /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
  /// priority are first. Priority is determined by maximizing coexecution and
  /// keeping the critical HardwareUnit busy.
  void sortHWUIResources();

  /// Check for critical resource consumption. Prefer the candidate that uses
  /// the most prioritized HardwareUnit. If both candidates use the same
  /// HardwareUnit, prefer the candidate with higher priority on that
  /// HardwareUnit.
  bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand,
                           GenericSchedulerBase::SchedCandidate &Cand,
                           SchedBoundary *Zone) const;

  /// Check for dependencies of instructions that use prioritized HardwareUnits.
  /// Prefer the candidate that is a dependency of an instruction that uses the
  /// most prioritized HardwareUnit. If both candidates enable the same
  /// HardwareUnit, prefer the candidate that enables the higher priority
  /// instruction on that HardwareUnit.
  bool
  tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand,
                                GenericSchedulerBase::SchedCandidate &Cand,
                                SchedBoundary *Zone) const;

  /// Print a per-region summary of HardwareUnit state (debug aid).
  void dumpRegionSummary();
};
/// Coexecution-focused scheduling strategy; extends GCNSchedStrategy with the
/// HardwareUnit-pressure heuristics implemented by CandidateHeuristics.
class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
protected:
  /// Compare \p Cand and \p TryCand with respect to stalling in \p Zone.
  /// NOTE(review): exact tie-break semantics live in the .cpp - confirm there.
  bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
                         SchedBoundary &Zone) const;

  /// The AMDGPU-specific reason behind the most recent pick, recorded for
  /// debug output (see getReasonName).
  AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;

  /// Per-region HardwareUnit pressure state driving the coexec heuristics.
  CandidateHeuristics Heurs;

#ifndef NDEBUG
  /// Debug-only dump of the chosen node and the reason it won.
  void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
#endif

  /// Coexecution-aware replacement for the generic tryCandidate comparison.
  bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand,
                          SchedBoundary *Zone);

  /// Pick the best candidate from \p Zone's ready queue under \p ZonePolicy.
  void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
                         const RegPressureTracker &RPTracker,
                         SchedCandidate &Cand, bool &PickedPending,
                         bool IsBottomUp);

public:
  AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);

  // MachineSchedStrategy interface overrides (via GCNSchedStrategy).
  void initPolicy(MachineBasicBlock::iterator Begin,
                  MachineBasicBlock::iterator End,
                  unsigned NumRegionInstrs) override;
  void initialize(ScheduleDAGMI *DAG) override;
  SUnit *pickNode(bool &IsTopNode) override;
  void schedNode(SUnit *SU, bool IsTopNode) override;
};
ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);
} // End namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H