// lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h - llvm-project/llvm - Git at Google

 //===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file
 /// Coexecution-focused scheduling strategy for AMDGPU.
 //
 //===----------------------------------------------------------------------===//

 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H

 #include "GCNSchedStrategy.h"
 #include "llvm/CodeGen/MachineScheduler.h"

 namespace llvm {

 namespace AMDGPU {

 //===----------------------------------------------------------------------===//
 // Instruction Flavor Classification
 //===----------------------------------------------------------------------===//

 /// Coarse instruction categories. Each HardwareUnitInfo is tagged with one of
 /// these (see HardwareUnitInfo::getType). Enumerators carry no explicit
 /// values, so they are numbered consecutively from 0 and NUM_FLAVORS equals
 /// the number of real flavors.
 enum class InstructionFlavor : uint8_t {
   WMMA,            // WMMA/MFMA matrix operations
   SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
   TRANS,           // Transcendental ops (v_exp, v_log, etc.)
   MultiCycleVALU,  // VALU instructions with repeat rate > 1
   VMEM,            // FLAT/GLOBAL memory operations
   DS,              // LDS/GDS operations
   SALU,            // Scalar ALU
   DMA,             // Tensor DMA operations
   Fence,           // Fences and waits
   Other,           // Everything else
   NUM_FLAVORS      // Sentinel count, not a real flavor; must stay last
 };

 /// \returns the full display label for \p F. The NUM_FLAVORS sentinel yields
 /// "???"; a value outside the enum is treated as unreachable.
 inline StringRef getFlavorName(InstructionFlavor F) {
   using IF = InstructionFlavor;
   // Deliberately a fully covered switch with no default label, so a newly
   // added enumerator is not silently absorbed here.
   switch (F) {
   case IF::WMMA:            return "WMMA";
   case IF::SingleCycleVALU: return "VALU(1c)";
   case IF::TRANS:           return "TRANS";
   case IF::MultiCycleVALU:  return "VALU(Nc)";
   case IF::VMEM:            return "VMEM";
   case IF::DS:              return "DS";
   case IF::SALU:            return "SALU";
   case IF::DMA:             return "DMA";
   case IF::Fence:           return "Fence";
   case IF::Other:           return "Other";
   case IF::NUM_FLAVORS:     return "???";
   }
   llvm_unreachable("Unknown InstructionFlavor");
 }

 /// \returns a one-character code for \p F. The NUM_FLAVORS sentinel yields
 /// "?"; a value outside the enum is treated as unreachable.
 inline StringRef getFlavorShortName(InstructionFlavor F) {
   using IF = InstructionFlavor;
   // Fully covered switch, no default label (mirrors getFlavorName).
   switch (F) {
   case IF::WMMA:            return "W";
   case IF::SingleCycleVALU: return "V";
   case IF::TRANS:           return "T";
   case IF::MultiCycleVALU:  return "C";
   case IF::VMEM:            return "M";
   case IF::DS:              return "D";
   case IF::SALU:            return "S";
   case IF::DMA:             return "X";
   case IF::Fence:           return "F";
   case IF::Other:           return "O";
   case IF::NUM_FLAVORS:     return "?";
   }
   llvm_unreachable("Unknown InstructionFlavor");
 }

 /// Produce the InstructionFlavor for \p MI, given the target instruction info
 /// \p SII. Only declared here; the classification rules live in the
 /// out-of-line definition.
 InstructionFlavor classifyFlavor(const MachineInstr &MI,
                                  const SIInstrInfo &SII);

 /// A list of flavors, with inline storage for 4 entries.
 using FlavorGroup = SmallVector<InstructionFlavor, 4>;

 /// Helpers that build commonly used FlavorGroup values.
 namespace FlavorGroups {
 /// \returns the VALU flavors: SingleCycleVALU, TRANS and MultiCycleVALU.
 inline FlavorGroup allVALU() {
   return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS,
           InstructionFlavor::MultiCycleVALU};
 }
 /// \returns the memory flavors: VMEM, DS and DMA.
 inline FlavorGroup allMem() {
   return {InstructionFlavor::VMEM, InstructionFlavor::DS,
           InstructionFlavor::DMA};
 }
 /// \returns a group holding only \p F.
 inline FlavorGroup individual(InstructionFlavor F) { return {F}; }
 /// \returns every real flavor in declaration order; the NUM_FLAVORS sentinel
 /// itself is not included.
 inline FlavorGroup all() {
   constexpr unsigned NumFlavors =
       static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS);
   FlavorGroup G;
   // FlavorGroup has inline room for only 4 entries, fewer than the number of
   // flavors, so size the storage once instead of growing while pushing.
   G.reserve(NumFlavors);
   for (unsigned I = 0; I != NumFlavors; ++I)
     G.push_back(static_cast<InstructionFlavor>(I));
   return G;
 }
 } // namespace FlavorGroups

 /// AMDGPU-specific scheduling decision reasons. These provide more granularity
 /// than the generic CandReason enum for debugging purposes. Enumerators are
 /// numbered consecutively from 0, so NUM_REASONS equals the number of real
 /// reasons. Note that getReasonName prints CritResourceBalance as
 /// "CritResource".
 enum class AMDGPUSchedReason : uint8_t {
   None,
   CritResourceBalance, // tryCriticalResource chose based on resource pressure
   // NOTE(review): the comment below appears truncated ("enabling" what?);
   // presumably "enabling an SU on a critical resource" -- confirm.
   CritResourceDep,     // tryCriticalResourceDependency chose based on enabling
   NUM_REASONS          // Sentinel count, not a real reason; must stay last
 };

 /// \returns the display label for \p R. The NUM_REASONS sentinel yields
 /// "???"; a value outside the enum is treated as unreachable.
 inline StringRef getReasonName(AMDGPUSchedReason R) {
   using Reason = AMDGPUSchedReason;
   // Fully covered switch, no default label.
   switch (R) {
   case Reason::None:                return "None";
   case Reason::CritResourceBalance: return "CritResource";
   case Reason::CritResourceDep:     return "CritResourceDep";
   case Reason::NUM_REASONS:         return "???";
   }
   llvm_unreachable("Unknown AMDGPUSchedReason");
 }

 } // End namespace AMDGPU

 //===----------------------------------------------------------------------===//
 // Hardware Unit Information
 //===----------------------------------------------------------------------===//

 /// HardwareUnitInfo is a wrapper class which maps to some real hardware
 /// resource. This is used to model hardware resource pressure per region, and
 /// guide scheduling heuristics.
 class HardwareUnitInfo {
 private:
   /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
   /// for this HardwareUnit. This is used for agreement between
   /// tryCriticalResourceDependency and tryCriticalResource: we schedule the
   /// dependencies for a SU on critical resource, then schedule that same SU on
   /// the critical resource. This agreement results in shorter live ranges and
   /// more regular HardwareUnit access patterns. SUs are prioritized based on
   /// depth for top-down scheduling.
   SmallSetVector<SUnit *, 16> PrioritySUs;
   /// All the SUs in the region that consume this resource
   SmallSetVector<SUnit *, 16> AllSUs;
   /// The total number of busy cycles for this HardwareUnit for a given region.
   unsigned TotalCycles = 0;
   // InstructionFlavor mapping
   AMDGPU::InstructionFlavor Type;
   // Whether or not instructions on this HardwareUnit may produce a window in
   // which instructions in other HardwareUnits can coexecute. For example, WMMA
   // / MFMA instructions may take multiple cycles, which may be overlapped with
   // instructions on other HardwareUnits
   bool ProducesCoexecWindow = false;

 public:
   HardwareUnitInfo() {}

   unsigned size() { return AllSUs.size(); }

   unsigned getTotalCycles() { return TotalCycles; }

   void setType(unsigned TheType) {
     assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS);
     Type = (AMDGPU::InstructionFlavor)(TheType);
   }

   AMDGPU::InstructionFlavor getType() const { return Type; }

   bool producesCoexecWindow() const { return ProducesCoexecWindow; }

   void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }

   bool contains(SUnit *SU) const { return AllSUs.contains(SU); }

   /// \returns true if there is a difference in priority between \p SU and \p
   /// Other. If so, \returns the SUnit with higher priority. This
   /// method looks through the PrioritySUs to determine if one SU is more
   /// prioritized than the other. If neither are in the PrioritySUs list, then
   /// neither have priority over each other.
   SUnit *getHigherPriority(SUnit *SU, SUnit *Other) const {
     for (auto *SUOrder : PrioritySUs) {
       if (SUOrder == SU)
         return SU;

       if (SUOrder == Other)
         return Other;
     }
     return nullptr;
   }

   void reset() {
     AllSUs.clear();
     PrioritySUs.clear();
     TotalCycles = 0;
     Type = AMDGPU::InstructionFlavor::Other;
     ProducesCoexecWindow = false;
   }

   /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
   /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
   /// ready) to AllSUs to attempt to find a target SU. When looking through
   /// AllSUs we sort pick the target SU by minimal depth for top-down
   /// scheduling. getNextTargetSU is useful for determining which SU on this
   /// HardwareUnit we are trying to schedule - this info helps us determine
   /// which dependencies to schedule. LookDeep is useful if the dependencies are
   /// long latency (e.g. memory instructions). If we have many long latency
   /// dependencies, it is beneficial to enable SUs multiple levels ahead.
   SUnit *getNextTargetSU(bool LookDeep = false) const;
   /// Insert the \p SU into the AllSUs and account its \p BlockingCycles into
   /// the TotalCycles. This maintains the list of PrioritySUs.
   void insert(SUnit *SU, unsigned BlockingCycles);
   /// Update the state for \p SU being scheduled by removing it from the AllSus
   /// and reducing its \p BlockingCycles from the TotalCycles. This maintains
   /// the list of PrioritySUS.
   void markScheduled(SUnit *SU, unsigned BlockingCycles);
 };

 //===----------------------------------------------------------------------===//
 // Candidate Heuristics
 //===----------------------------------------------------------------------===//

 /// CandidateHeuristics contains state and implementations to facilitate making
 /// per instruction scheduling decisions; it contains methods used in
 /// tryCandidate to decide which instruction to schedule next.
 class CandidateHeuristics {
 protected:
   // The object is default-constructed and populated later via initialize(), so
   // the non-owning pointers start out null instead of indeterminate.
   ScheduleDAGMI *DAG = nullptr;
   const SIInstrInfo *SII = nullptr;
   const SIRegisterInfo *SRI = nullptr;
   const TargetSchedModel *SchedModel = nullptr;
   /// Per-HardwareUnit bookkeeping; reordered by sortHWUIResources().
   SmallVector<HardwareUnitInfo, 8> HWUInfo;

   /// Walk over the region and collect total usage per HardwareUnit.
   void collectHWUIPressure();

   /// Compute the blocking cycles for the appropriate HardwareUnit given an \p
   /// SU.
   unsigned getHWUICyclesForInst(SUnit *SU);

   /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
   /// mapped HardwareUnit.
   HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);

 public:
   CandidateHeuristics() = default;

   /// Bind this object to \p DAG, \p SchedModel and \p TRI. Defined out of
   /// line.
   void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel,
                   const TargetRegisterInfo *TRI);

   /// Update the state to reflect that \p SU is going to be scheduled.
   void updateForScheduling(SUnit *SU);

   /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
   /// priority are first. Priority is determined by maximizing coexecution and
   /// keeping the critical HardwareUnit busy.
   void sortHWUIResources();

   /// Check for critical resource consumption. Prefer the candidate that uses
   /// the most prioritized HardwareUnit. If both candidates use the same
   /// HardwareUnit, prefer the candidate with higher priority on that
   /// HardwareUnit.
   bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand,
                            GenericSchedulerBase::SchedCandidate &Cand,
                            SchedBoundary *Zone) const;

   /// Check for dependencies of instructions that use prioritized HardwareUnits.
   /// Prefer the candidate that is a dependency of an instruction that uses the
   /// most prioritized HardwareUnit. If both candidates enable the same
   /// HardwareUnit, prefer the candidate that enables the higher priority
   /// instruction on that HardwareUnit.
   bool
   tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand,
                                 GenericSchedulerBase::SchedCandidate &Cand,
                                 SchedBoundary *Zone) const;

   /// Dump a summary of the region. Defined out of line.
   void dumpRegionSummary();
 };

 /// Scheduling strategy derived from GCNSchedStrategy. It overrides initPolicy,
 /// initialize, pickNode and schedNode, and owns a CandidateHeuristics instance
 /// (Heurs). All member functions are defined out of line.
 class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
 protected:
   // Note the parameter order here is (Cand, TryCand), whereas
   // CandidateHeuristics::tryCriticalResource takes (TryCand, Cand).
   bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
                          SchedBoundary &Zone) const;
   // Starts as None. NOTE(review): presumably records which AMDGPU-specific
   // heuristic made the most recent decision -- confirm in the .cpp.
   AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;
   // Heuristic state used by this strategy.
   CandidateHeuristics Heurs;

   // Only declared when NDEBUG is not defined.
 #ifndef NDEBUG
   void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
 #endif

   // Parameter order is (Cand, TryCand), matching tryEffectiveStall.
   bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand,
                           SchedBoundary *Zone);
   // \p Cand and \p PickedPending are non-const references, i.e. available to
   // the implementation as outputs.
   void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
                          const RegPressureTracker &RPTracker,
                          SchedCandidate &Cand, bool &PickedPending,
                          bool IsBottomUp);

 public:
   AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);

   // Overrides of the base-class scheduling hooks.
   void initPolicy(MachineBasicBlock::iterator Begin,
                   MachineBasicBlock::iterator End,
                   unsigned NumRegionInstrs) override;
   void initialize(ScheduleDAGMI *DAG) override;
   SUnit *pickNode(bool &IsTopNode) override;
   void schedNode(SUnit *SU, bool IsTopNode) override;
 };

 /// Factory functions taking the machine-scheduler context \p C and returning
 /// a ScheduleDAGInstrs. Only declared here.
 /// NOTE(review): ownership of the returned object is presumably transferred
 /// to the caller, and the "Noop" variant presumably performs no reordering --
 /// confirm both against the definitions.
 ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
 ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);

 } // End namespace llvm

 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
	//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// Coexecution-focused scheduling strategy for AMDGPU.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
	#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H

	#include "GCNSchedStrategy.h"
	#include "llvm/CodeGen/MachineScheduler.h"

	namespace llvm {

	namespace AMDGPU {

	//===----------------------------------------------------------------------===//
	// Instruction Flavor Classification
	//===----------------------------------------------------------------------===//

	/// Coarse instruction categories. Each HardwareUnitInfo is tagged with one of
	/// these (see HardwareUnitInfo::getType). Enumerators carry no explicit
	/// values, so they are numbered consecutively from 0 and NUM_FLAVORS equals
	/// the number of real flavors.
	enum class InstructionFlavor : uint8_t {
	WMMA, // WMMA/MFMA matrix operations
	SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
	TRANS, // Transcendental ops (v_exp, v_log, etc.)
	MultiCycleVALU, // VALU instructions with repeat rate > 1
	VMEM, // FLAT/GLOBAL memory operations
	DS, // LDS/GDS operations
	SALU, // Scalar ALU
	DMA, // Tensor DMA operations
	Fence, // Fences and waits
	Other, // Everything else
	NUM_FLAVORS // Sentinel count, not a real flavor; must stay last
	};

	/// \returns the full display label for \p F. The NUM_FLAVORS sentinel yields
	/// "???"; a value outside the enum is treated as unreachable.
	inline StringRef getFlavorName(InstructionFlavor F) {
	  using IF = InstructionFlavor;
	  // Deliberately a fully covered switch with no default label, so a newly
	  // added enumerator is not silently absorbed here.
	  switch (F) {
	  case IF::WMMA:            return "WMMA";
	  case IF::SingleCycleVALU: return "VALU(1c)";
	  case IF::TRANS:           return "TRANS";
	  case IF::MultiCycleVALU:  return "VALU(Nc)";
	  case IF::VMEM:            return "VMEM";
	  case IF::DS:              return "DS";
	  case IF::SALU:            return "SALU";
	  case IF::DMA:             return "DMA";
	  case IF::Fence:           return "Fence";
	  case IF::Other:           return "Other";
	  case IF::NUM_FLAVORS:     return "???";
	  }
	  llvm_unreachable("Unknown InstructionFlavor");
	}

	/// \returns a one-character code for \p F. The NUM_FLAVORS sentinel yields
	/// "?"; a value outside the enum is treated as unreachable.
	inline StringRef getFlavorShortName(InstructionFlavor F) {
	  using IF = InstructionFlavor;
	  // Fully covered switch, no default label (mirrors getFlavorName).
	  switch (F) {
	  case IF::WMMA:            return "W";
	  case IF::SingleCycleVALU: return "V";
	  case IF::TRANS:           return "T";
	  case IF::MultiCycleVALU:  return "C";
	  case IF::VMEM:            return "M";
	  case IF::DS:              return "D";
	  case IF::SALU:            return "S";
	  case IF::DMA:             return "X";
	  case IF::Fence:           return "F";
	  case IF::Other:           return "O";
	  case IF::NUM_FLAVORS:     return "?";
	  }
	  llvm_unreachable("Unknown InstructionFlavor");
	}

	/// Produce the InstructionFlavor for \p MI, given the target instruction info
	/// \p SII. Only declared here; the classification rules live in the
	/// out-of-line definition.
	InstructionFlavor classifyFlavor(const MachineInstr &MI,
	const SIInstrInfo &SII);

	/// A list of flavors, with inline storage for 4 entries.
	using FlavorGroup = SmallVector<InstructionFlavor, 4>;

	/// Helpers that build commonly used FlavorGroup values.
	namespace FlavorGroups {
	/// \returns the VALU flavors: SingleCycleVALU, TRANS and MultiCycleVALU.
	inline FlavorGroup allVALU() {
	  return {InstructionFlavor::SingleCycleVALU, InstructionFlavor::TRANS,
	          InstructionFlavor::MultiCycleVALU};
	}
	/// \returns the memory flavors: VMEM, DS and DMA.
	inline FlavorGroup allMem() {
	  return {InstructionFlavor::VMEM, InstructionFlavor::DS,
	          InstructionFlavor::DMA};
	}
	/// \returns a group holding only \p F.
	inline FlavorGroup individual(InstructionFlavor F) { return {F}; }
	/// \returns every real flavor in declaration order; the NUM_FLAVORS sentinel
	/// itself is not included.
	inline FlavorGroup all() {
	  constexpr unsigned NumFlavors =
	      static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS);
	  FlavorGroup G;
	  // FlavorGroup has inline room for only 4 entries, fewer than the number of
	  // flavors, so size the storage once instead of growing while pushing.
	  G.reserve(NumFlavors);
	  for (unsigned I = 0; I != NumFlavors; ++I)
	    G.push_back(static_cast<InstructionFlavor>(I));
	  return G;
	}
	} // namespace FlavorGroups

	/// AMDGPU-specific scheduling decision reasons. These provide more granularity
	/// than the generic CandReason enum for debugging purposes. Enumerators are
	/// numbered consecutively from 0, so NUM_REASONS equals the number of real
	/// reasons. Note that getReasonName prints CritResourceBalance as
	/// "CritResource".
	enum class AMDGPUSchedReason : uint8_t {
	None,
	CritResourceBalance, // tryCriticalResource chose based on resource pressure
	// NOTE(review): the comment below appears truncated ("enabling" what?);
	// presumably "enabling an SU on a critical resource" -- confirm.
	CritResourceDep, // tryCriticalResourceDependency chose based on enabling
	NUM_REASONS // Sentinel count, not a real reason; must stay last
	};

	/// \returns the display label for \p R. The NUM_REASONS sentinel yields
	/// "???"; a value outside the enum is treated as unreachable.
	inline StringRef getReasonName(AMDGPUSchedReason R) {
	  using Reason = AMDGPUSchedReason;
	  // Fully covered switch, no default label.
	  switch (R) {
	  case Reason::None:                return "None";
	  case Reason::CritResourceBalance: return "CritResource";
	  case Reason::CritResourceDep:     return "CritResourceDep";
	  case Reason::NUM_REASONS:         return "???";
	  }
	  llvm_unreachable("Unknown AMDGPUSchedReason");
	}

	} // End namespace AMDGPU

	//===----------------------------------------------------------------------===//
	// Hardware Unit Information
	//===----------------------------------------------------------------------===//

	/// HardwareUnitInfo is a wrapper class which maps to some real hardware
	/// resource. This is used to model hardware resource pressure per region, and
	/// guide scheduling heuristics.
	class HardwareUnitInfo {
	private:
	/// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
	/// for this HardwareUnit. This is used for agreement between
	/// tryCriticalResourceDependency and tryCriticalResource: we schedule the
	/// dependencies for a SU on critical resource, then schedule that same SU on
	/// the critical resource. This agreement results in shorter live ranges and
	/// more regular HardwareUnit access patterns. SUs are prioritized based on
	/// depth for top-down scheduling.
	SmallSetVector<SUnit *, 16> PrioritySUs;
	/// All the SUs in the region that consume this resource
	SmallSetVector<SUnit *, 16> AllSUs;
	/// The total number of busy cycles for this HardwareUnit for a given region.
	unsigned TotalCycles = 0;
	// InstructionFlavor mapping
	AMDGPU::InstructionFlavor Type;
	// Whether or not instructions on this HardwareUnit may produce a window in
	// which instructions in other HardwareUnits can coexecute. For example, WMMA
	// / MFMA instructions may take multiple cycles, which may be overlapped with
	// instructions on other HardwareUnits
	bool ProducesCoexecWindow = false;

	public:
	HardwareUnitInfo() {}

	unsigned size() { return AllSUs.size(); }

	unsigned getTotalCycles() { return TotalCycles; }

	void setType(unsigned TheType) {
	assert(TheType < (unsigned)AMDGPU::InstructionFlavor::NUM_FLAVORS);
	Type = (AMDGPU::InstructionFlavor)(TheType);
	}

	AMDGPU::InstructionFlavor getType() const { return Type; }

	bool producesCoexecWindow() const { return ProducesCoexecWindow; }

	void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }

	bool contains(SUnit *SU) const { return AllSUs.contains(SU); }

	/// \returns true if there is a difference in priority between \p SU and \p
	/// Other. If so, \returns the SUnit with higher priority. This
	/// method looks through the PrioritySUs to determine if one SU is more
	/// prioritized than the other. If neither are in the PrioritySUs list, then
	/// neither have priority over each other.
	SUnit getHigherPriority(SUnit SU, SUnit *Other) const {
	for (auto *SUOrder : PrioritySUs) {
	if (SUOrder == SU)
	return SU;

	if (SUOrder == Other)
	return Other;
	}
	return nullptr;
	}

	void reset() {
	AllSUs.clear();
	PrioritySUs.clear();
	TotalCycles = 0;
	Type = AMDGPU::InstructionFlavor::Other;
	ProducesCoexecWindow = false;
	}

	/// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
	/// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
	/// ready) to AllSUs to attempt to find a target SU. When looking through
	/// AllSUs we sort pick the target SU by minimal depth for top-down
	/// scheduling. getNextTargetSU is useful for determining which SU on this
	/// HardwareUnit we are trying to schedule - this info helps us determine
	/// which dependencies to schedule. LookDeep is useful if the dependencies are
	/// long latency (e.g. memory instructions). If we have many long latency
	/// dependencies, it is beneficial to enable SUs multiple levels ahead.
	SUnit *getNextTargetSU(bool LookDeep = false) const;
	/// Insert the \p SU into the AllSUs and account its \p BlockingCycles into
	/// the TotalCycles. This maintains the list of PrioritySUs.
	void insert(SUnit *SU, unsigned BlockingCycles);
	/// Update the state for \p SU being scheduled by removing it from the AllSus
	/// and reducing its \p BlockingCycles from the TotalCycles. This maintains
	/// the list of PrioritySUS.
	void markScheduled(SUnit *SU, unsigned BlockingCycles);
	};

	//===----------------------------------------------------------------------===//
	// Candidate Heuristics
	//===----------------------------------------------------------------------===//

	/// CandidateHeuristics contains state and implementations to facilitate making
	/// per instruction scheduling decisions; it contains methods used in
	/// tryCandidate to decide which instruction to schedule next.
	class CandidateHeuristics {
	protected:
	  // The object is default-constructed and populated later via initialize(), so
	  // the non-owning pointers start out null instead of indeterminate.
	  ScheduleDAGMI *DAG = nullptr;
	  const SIInstrInfo *SII = nullptr;
	  const SIRegisterInfo *SRI = nullptr;
	  const TargetSchedModel *SchedModel = nullptr;
	  /// Per-HardwareUnit bookkeeping; reordered by sortHWUIResources().
	  SmallVector<HardwareUnitInfo, 8> HWUInfo;

	  /// Walk over the region and collect total usage per HardwareUnit.
	  void collectHWUIPressure();

	  /// Compute the blocking cycles for the appropriate HardwareUnit given an \p
	  /// SU.
	  unsigned getHWUICyclesForInst(SUnit *SU);

	  /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
	  /// mapped HardwareUnit.
	  HardwareUnitInfo *getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor);

	public:
	  CandidateHeuristics() = default;

	  /// Bind this object to \p DAG, \p SchedModel and \p TRI. Defined out of
	  /// line. The arguments are pointers, matching the pointer members they
	  /// correspond to.
	  void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel,
	                  const TargetRegisterInfo *TRI);

	  /// Update the state to reflect that \p SU is going to be scheduled.
	  void updateForScheduling(SUnit *SU);

	  /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
	  /// priority are first. Priority is determined by maximizing coexecution and
	  /// keeping the critical HardwareUnit busy.
	  void sortHWUIResources();

	  /// Check for critical resource consumption. Prefer the candidate that uses
	  /// the most prioritized HardwareUnit. If both candidates use the same
	  /// HardwareUnit, prefer the candidate with higher priority on that
	  /// HardwareUnit.
	  bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand,
	                           GenericSchedulerBase::SchedCandidate &Cand,
	                           SchedBoundary *Zone) const;

	  /// Check for dependencies of instructions that use prioritized HardwareUnits.
	  /// Prefer the candidate that is a dependency of an instruction that uses the
	  /// most prioritized HardwareUnit. If both candidates enable the same
	  /// HardwareUnit, prefer the candidate that enables the higher priority
	  /// instruction on that HardwareUnit.
	  bool
	  tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand,
	                                GenericSchedulerBase::SchedCandidate &Cand,
	                                SchedBoundary *Zone) const;

	  /// Dump a summary of the region. Defined out of line.
	  void dumpRegionSummary();
	};

	/// Scheduling strategy derived from GCNSchedStrategy. It overrides initPolicy,
	/// initialize, pickNode and schedNode, and owns a CandidateHeuristics instance
	/// (Heurs). All member functions are defined out of line.
	class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
	protected:
	// Note the parameter order here is (Cand, TryCand), whereas
	// CandidateHeuristics::tryCriticalResource takes (TryCand, Cand).
	bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
	SchedBoundary &Zone) const;
	// Starts as None. NOTE(review): presumably records which AMDGPU-specific
	// heuristic made the most recent decision -- confirm in the .cpp.
	AMDGPU::AMDGPUSchedReason LastAMDGPUReason = AMDGPU::AMDGPUSchedReason::None;
	// Heuristic state used by this strategy.
	CandidateHeuristics Heurs;

	// Only declared when NDEBUG is not defined.
	#ifndef NDEBUG
	void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
	#endif

	// Parameter order is (Cand, TryCand), matching tryEffectiveStall.
	bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand,
	SchedBoundary *Zone);
	// \p Cand and \p PickedPending are non-const references, i.e. available to
	// the implementation as outputs.
	void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
	const RegPressureTracker &RPTracker,
	SchedCandidate &Cand, bool &PickedPending,
	bool IsBottomUp);

	public:
	AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);

	// Overrides of the base-class scheduling hooks.
	void initPolicy(MachineBasicBlock::iterator Begin,
	MachineBasicBlock::iterator End,
	unsigned NumRegionInstrs) override;
	void initialize(ScheduleDAGMI *DAG) override;
	SUnit *pickNode(bool &IsTopNode) override;
	void schedNode(SUnit *SU, bool IsTopNode) override;
	};

	/// Factory functions taking the machine-scheduler context \p C and returning
	/// a ScheduleDAGInstrs. Only declared here. Both the argument and the result
	/// are pointers, so these polymorphic types are never passed or returned by
	/// value.
	/// NOTE(review): ownership of the returned object is presumably transferred
	/// to the caller -- confirm against the definitions.
	ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
	ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);

	} // End namespace llvm

	#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H