// llvm/lib/Target/AMDGPU/SIMachineScheduler.h - llvm-project - Git at Google

 //===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 /// \file
 /// SI Machine Scheduler interface
 //
 //===----------------------------------------------------------------------===//

 #ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
 #define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H

 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include <cstdint>
 #include <set>
 #include <vector>

 namespace llvm {

 class SIInstrInfo;
 class SIRegisterInfo;
 class SIScheduleDAGMI;
 class SIScheduleBlockCreator;

 /// Reasons recorded on a scheduling candidate. The enumerator value is also
 /// used as a bit index into SISchedulerCandidate::RepeatReasonSet.
 enum SIScheduleCandReason {
   NoCand,
   RegUsage,
   Latency,
   Successor,
   Depth,
   NodeOrder
 };

 /// State common to the SUnit-level and block-level candidate structures.
 struct SISchedulerCandidate {
   // The reason for this candidate.
   SIScheduleCandReason Reason = NoCand;

   // Set of reasons that apply to multiple candidates.
   // Bit (1 << R) is set once reason R has been passed to setRepeat().
   uint32_t RepeatReasonSet = 0;

   SISchedulerCandidate() = default;

   /// Returns true if \p R has been recorded with setRepeat().
   /// Const-qualified so it can be queried through a const reference.
   bool isRepeat(SIScheduleCandReason R) const {
     // Build the mask in unsigned arithmetic to match RepeatReasonSet.
     return (RepeatReasonSet & (uint32_t(1) << R)) != 0;
   }

   /// Records \p R in RepeatReasonSet.
   void setRepeat(SIScheduleCandReason R) {
     RepeatReasonSet |= (uint32_t(1) << R);
   }
 };

 /// Kind of link stored alongside each successor of a SIScheduleBlock.
 // NOTE(review): presumably Data marks a link that carries a register value
 // and NoData an ordering-only link -- confirm against the implementation.
 enum SIScheduleBlockLinkKind { NoData = 0, Data = 1 };

 class SIScheduleBlock {
   SIScheduleDAGMI *DAG;
   SIScheduleBlockCreator *BC;

   std::vector<SUnit*> SUnits;
   std::map<unsigned, unsigned> NodeNum2Index;
   std::vector<SUnit*> TopReadySUs;
   std::vector<SUnit*> ScheduledSUnits;

   /// The top of the unscheduled zone.
   IntervalPressure TopPressure;
   RegPressureTracker TopRPTracker;

   // Pressure: number of said class of registers needed to
   // store the live virtual and real registers.
   // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
   // Pressure of additional registers required inside the block.
   std::vector<unsigned> InternalAdditionalPressure;
   // Pressure of input and output registers
   std::vector<unsigned> LiveInPressure;
   std::vector<unsigned> LiveOutPressure;
   // Registers required by the block, and outputs.
   // We do track only virtual registers.
   // Note that some registers are not 32 bits,
   // and thus the pressure is not equal
   // to the number of live registers.
   std::set<unsigned> LiveInRegs;
   std::set<unsigned> LiveOutRegs;

   bool Scheduled = false;
   bool HighLatencyBlock = false;

   std::vector<unsigned> HasLowLatencyNonWaitedParent;

   // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
   unsigned ID;

   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
   // All blocks successors, and the kind of link
   std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
   unsigned NumHighLatencySuccessors = 0;

 public:
   SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
                   unsigned ID):
     DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}

   ~SIScheduleBlock() = default;

   unsigned getID() const { return ID; }

   /// Functions for Block construction.
   void addUnit(SUnit *SU);

   // When all SUs have been added.
   void finalizeUnits();

   // Add block pred, which has instruction predecessor of SU.
   void addPred(SIScheduleBlock *Pred);
   void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);

   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
   ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
     getSuccs() const { return Succs; }

   unsigned Height;  // Maximum topdown path length to block without outputs
   unsigned Depth;   // Maximum bottomup path length to block without inputs

   unsigned getNumHighLatencySuccessors() const {
     return NumHighLatencySuccessors;
   }

   bool isHighLatencyBlock() { return HighLatencyBlock; }

   // This is approximative.
   // Ideally should take into accounts some instructions (rcp, etc)
   // are 4 times slower.
   int getCost() { return SUnits.size(); }

   // The block Predecessors and Successors must be all registered
   // before fastSchedule().
   // Fast schedule with no particular requirement.
   void fastSchedule();

   std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }

   // Complete schedule that will try to minimize reg pressure and
   // low latencies, and will fill liveins and liveouts.
   // Needs all MIs to be grouped between BeginBlock and EndBlock.
   // The MIs can be moved after the scheduling,
   // it is just used to allow correct track of live registers.
   void schedule(MachineBasicBlock::iterator BeginBlock,
                 MachineBasicBlock::iterator EndBlock);

   bool isScheduled() { return Scheduled; }

   // Needs the block to be scheduled inside
   // TODO: find a way to compute it.
   std::vector<unsigned> &getInternalAdditionalRegUsage() {
     return InternalAdditionalPressure;
   }

   std::set<unsigned> &getInRegs() { return LiveInRegs; }
   std::set<unsigned> &getOutRegs() { return LiveOutRegs; }

   void printDebug(bool Full);

 private:
   struct SISchedCandidate : SISchedulerCandidate {
     // The best SUnit candidate.
     SUnit *SU = nullptr;

     unsigned SGPRUsage;
     unsigned VGPRUsage;
     bool IsLowLatency;
     unsigned LowLatencyOffset;
     bool HasLowLatencyNonWaitedParent;

     SISchedCandidate() = default;

     bool isValid() const { return SU; }

     // Copy the status of another candidate without changing policy.
     void setBest(SISchedCandidate &Best) {
       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
       SU = Best.SU;
       Reason = Best.Reason;
       SGPRUsage = Best.SGPRUsage;
       VGPRUsage = Best.VGPRUsage;
       IsLowLatency = Best.IsLowLatency;
       LowLatencyOffset = Best.LowLatencyOffset;
       HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
     }
   };

   void undoSchedule();

   void undoReleaseSucc(SUnit *SU, SDep *SuccEdge);
   void releaseSucc(SUnit *SU, SDep *SuccEdge);
   // InOrOutBlock: restrict to links pointing inside the block (true),
   // or restrict to links pointing outside the block (false).
   void releaseSuccessors(SUnit *SU, bool InOrOutBlock);

   void nodeScheduled(SUnit *SU);
   void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
   void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
   SUnit* pickNode();
   void traceCandidate(const SISchedCandidate &Cand);
   void initRegPressure(MachineBasicBlock::iterator BeginBlock,
                        MachineBasicBlock::iterator EndBlock);
 };

 /// Bundle returned by SIScheduleBlockCreator::getBlocks(): the block pointers
 /// plus two index tables relating blocks to their top-down positions.
 struct SIScheduleBlocks {
   // Non-owning block pointers.
   std::vector<SIScheduleBlock *> Blocks;
   // Top-down index -> block.
   std::vector<int> TopDownIndex2Block;
   // Block -> top-down index.
   std::vector<int> TopDownBlock2Index;
 };

 /// Block-creation variants accepted by SIScheduleBlockCreator::getBlocks().
 enum SISchedulerBlockCreatorVariant {
   LatenciesAlone = 0,
   LatenciesGrouped = 1,
   LatenciesAlonePlusConsecutive = 2
 };

 /// Produces SIScheduleBlocks for a requested variant (see getBlocks()).
 /// The private color* helpers assign group ids ("colors") to SUs, as
 /// described by the comment on each of them.
 class SIScheduleBlockCreator {
   SIScheduleDAGMI *DAG;
   // Owning storage for the blocks; unique_ptr frees them for us.
   std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
   std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks> Blocks;
   std::vector<SIScheduleBlock *> CurrentBlocks;
   std::vector<int> Node2CurrentBlock;

   // Topological sort.
   // Maps topological index to the node number.
   std::vector<int> TopDownIndex2Block;
   std::vector<int> TopDownBlock2Index;
   std::vector<int> BottomUpIndex2Block;

   // Color numbering:
   //   0                    -> color not given.
   //   1 to SUnits.size()   -> Reserved group (only add elements to them).
   //   above                -> other groups.
   int NextReservedID;
   int NextNonReservedID;
   std::vector<int> CurrentColoring;
   std::vector<int> CurrentTopDownReservedDependencyColoring;
   std::vector<int> CurrentBottomUpReservedDependencyColoring;

 public:
   SIScheduleBlockCreator(SIScheduleDAGMI *DAG);

   /// Returns the blocks for \p BlockVariant (by value).
   SIScheduleBlocks getBlocks(SISchedulerBlockCreatorVariant BlockVariant);

   bool isSUInBlock(SUnit *SU, unsigned ID);

 private:
   // Every high latency gets its own Reserved color.
   void colorHighLatenciesAlone();

   // High latencies are gathered into groups, each with a Reserved color.
   void colorHighLatenciesGroups();

   // Computes the top-down and bottom-up colorings, where the color depends
   // on the dependencies on Reserved colors.
   void colorComputeReservedDependencies();

   // Colors every SU that has no color yet, according to its dependencies on
   // Reserved groups.
   void colorAccordingToReservedDependencies();

   // Divides Blocks that have no bottom-up or top-down dependencies on
   // Reserved groups. The new colors are computed according to the
   // dependencies on the other blocks formed with
   // colorAccordingToReservedDependencies.
   void colorEndsAccordingToDependencies();

   // Cuts groups into groups whose SUs are in consecutive order
   // (except for Reserved groups).
   void colorForceConsecutiveOrderInGroup();

   // Merges constant loads whose users are all in another group into that
   // group.
   // (TODO: else if all their users depend on the same group, put them there)
   void colorMergeConstantLoadsNextGroup();

   // Merges SUs whose users are all in another group into that group.
   void colorMergeIfPossibleNextGroup();

   // Same merge as above, but only for Reserved groups.
   void colorMergeIfPossibleNextGroupOnlyForReserved();

   // Same merge as above, but only if the group is no more than a few SUs.
   void colorMergeIfPossibleSmallGroupsToNextGroup();

   // Divides Blocks of important size.
   // Idea of implementation: attribute new colors depending on top-down and
   // bottom-up links to other blocks.
   void cutHugeBlocks();

   // Puts all instructions with no users in this scheduling region into one
   // group (we'd want these groups to be at the end).
   void regroupNoUserInstructions();

   // Gives a Reserved color to export instructions.
   void colorExports();

   void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);

   void topologicalSort();

   void scheduleInsideBlocks();

   void fillStats();
 };

 /// Block-scheduling variants accepted by SIScheduleBlockScheduler.
 enum SISchedulerBlockSchedulerVariant {
   BlockLatencyRegUsage = 0,
   BlockRegUsageLatency = 1,
   BlockRegUsage = 2
 };

 class SIScheduleBlockScheduler {
   SIScheduleDAGMI *DAG;
   SISchedulerBlockSchedulerVariant Variant;
   std::vector<SIScheduleBlock*> Blocks;

   std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
   std::set<unsigned> LiveRegs;
   // Num of schedulable unscheduled blocks reading the register.
   std::map<unsigned, unsigned> LiveRegsConsumers;

   std::vector<unsigned> LastPosHighLatencyParentScheduled;
   int LastPosWaitedHighLatency;

   std::vector<SIScheduleBlock*> BlocksScheduled;
   unsigned NumBlockScheduled;
   std::vector<SIScheduleBlock*> ReadyBlocks;

   unsigned VregCurrentUsage;
   unsigned SregCurrentUsage;

   // Currently is only approximation.
   unsigned maxVregUsage;
   unsigned maxSregUsage;

   std::vector<unsigned> BlockNumPredsLeft;
   std::vector<unsigned> BlockNumSuccsLeft;

 public:
   SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                            SISchedulerBlockSchedulerVariant Variant,
                            SIScheduleBlocks BlocksStruct);
   ~SIScheduleBlockScheduler() = default;

   std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }

   unsigned getVGPRUsage() { return maxVregUsage; }
   unsigned getSGPRUsage() { return maxSregUsage; }

 private:
   struct SIBlockSchedCandidate : SISchedulerCandidate {
     // The best Block candidate.
     SIScheduleBlock *Block = nullptr;

     bool IsHighLatency;
     int VGPRUsageDiff;
     unsigned NumSuccessors;
     unsigned NumHighLatencySuccessors;
     unsigned LastPosHighLatParentScheduled;
     unsigned Height;

     SIBlockSchedCandidate() = default;

     bool isValid() const { return Block; }

     // Copy the status of another candidate without changing policy.
     void setBest(SIBlockSchedCandidate &Best) {
       assert(Best.Reason != NoCand && "uninitialized Sched candidate");
       Block = Best.Block;
       Reason = Best.Reason;
       IsHighLatency = Best.IsHighLatency;
       VGPRUsageDiff = Best.VGPRUsageDiff;
       NumSuccessors = Best.NumSuccessors;
       NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
       LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
       Height = Best.Height;
     }
   };

   bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
                            SIBlockSchedCandidate &TryCand);
   bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                             SIBlockSchedCandidate &TryCand);
   SIScheduleBlock *pickBlock();

   void addLiveRegs(std::set<unsigned> &Regs);
   void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
   void releaseBlockSuccs(SIScheduleBlock *Parent);
   void blockScheduled(SIScheduleBlock *Block);

   // Check register pressure change
   // by scheduling a block with these LiveIn and LiveOut.
   std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
                                        std::set<unsigned> &OutRegs);

   void schedule();
 };

 /// Result of SIScheduler::scheduleVariant().
 // NOTE(review): SUs presumably holds SUnit node numbers in scheduled order --
 // confirm against the implementation.
 struct SIScheduleBlockResult {
   std::vector<unsigned> SUs;
   // Zero-initialised so a default-constructed result never carries
   // indeterminate usage values.
   unsigned MaxSGPRUsage = 0;
   unsigned MaxVGPRUsage = 0;
 };

 /// Pairs a SIScheduleDAGMI with the SIScheduleBlockCreator built from it and
 /// exposes scheduleVariant(), which takes a block-creation variant and a
 /// block-scheduling variant and returns a SIScheduleBlockResult.
 class SIScheduler {
   SIScheduleDAGMI *DAG;
   SIScheduleBlockCreator BlockCreator;

 public:
   // The block creator is constructed from the same DAG pointer.
   SIScheduler(SIScheduleDAGMI *DAG)
       : DAG(DAG), BlockCreator(DAG) {}

   ~SIScheduler() = default;

   struct SIScheduleBlockResult scheduleVariant(
       SISchedulerBlockCreatorVariant BlockVariant,
       SISchedulerBlockSchedulerVariant ScheduleVariant);
 };

 /// ScheduleDAGMILive specialisation used by the SI scheduler. Besides the
 /// schedule() override it exposes accessors to inherited scheduling state
 /// and public per-SU tables used when scheduling inside blocks.
 class SIScheduleDAGMI final : public ScheduleDAGMILive {
   const SIInstrInfo *SITII;
   const SIRegisterInfo *SITRI;

   std::vector<SUnit> SUnitsLinksBackup;

   // For moveLowLatencies. After all Scheduling variants are tested.
   std::vector<unsigned> ScheduledSUnits;
   std::vector<unsigned> ScheduledSUnitsInv;

 public:
   SIScheduleDAGMI(MachineSchedContext *C);

   ~SIScheduleDAGMI() override;

   // Entry point for the schedule.
   void schedule() override;

   // To init Block's RPTracker: initialises \p RPTracker with this DAG's
   // MF, RegClassInfo, LIS, BB and RegionBegin.
   // NOTE(review): the two trailing 'false' arguments are presumably the
   // TrackLaneMasks / TrackUntiedDefs flags of RegPressureTracker::init --
   // confirm against RegisterPressure.h.
   void initRPTracker(RegPressureTracker &RPTracker) {
     RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
   }

   // Accessors to inherited scheduling state.
   MachineBasicBlock *getBB() { return BB; }
   MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
   MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
   LiveIntervals *getLIS() { return LIS; }
   MachineRegisterInfo *getMRI() { return &MRI; }
   const TargetRegisterInfo *getTRI() { return TRI; }
   ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
   SUnit &getEntrySU() { return EntrySU; }
   SUnit &getExitSU() { return ExitSU; }

   void restoreSULinksLeft();

   // The template parameter is named IteratorT because identifiers that begin
   // with an underscore followed by an uppercase letter are reserved.
   template <typename IteratorT>
   void fillVgprSgprCost(IteratorT First, IteratorT End,
                         unsigned &VgprUsage, unsigned &SgprUsage);

   /// Returns the RegUnit of every entry in RPTracker's LiveInRegs as a set.
   std::set<unsigned> getInRegs() {
     std::set<unsigned> InRegs;
     for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
       InRegs.insert(RegMaskPair.RegUnit);
     }
     return InRegs;
   }

   /// Returns the RegUnit of every entry in RPTracker's LiveOutRegs as a set.
   std::set<unsigned> getOutRegs() {
     std::set<unsigned> OutRegs;
     for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
       OutRegs.insert(RegMaskPair.RegUnit);
     }
     return OutRegs;
   }

 private:
   void topologicalSort();
   // After scheduling is done, improve low latency placements.
   void moveLowLatencies();

 public:
   // Some stats for scheduling inside blocks.
   std::vector<unsigned> IsLowLatencySU;
   std::vector<unsigned> LowLatencyOffset;
   std::vector<unsigned> IsHighLatencySU;
   // Topological sort
   // Maps topological index to the node number.
   std::vector<int> TopDownIndex2SU;
   std::vector<int> BottomUpIndex2SU;
 };

 } // end namespace llvm

 #endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
	//===-- SIMachineScheduler.h - SI Scheduler Interface -----------*- C++ -*-===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// SI Machine Scheduler interface
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
	#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H

	#include "llvm/CodeGen/MachineScheduler.h"
	#include "llvm/CodeGen/RegisterPressure.h"
	#include "llvm/CodeGen/ScheduleDAG.h"
	#include <cstdint>
	#include <set>
	#include <vector>

	namespace llvm {

	class SIInstrInfo;
	class SIRegisterInfo;
	class SIScheduleDAGMI;
	class SIScheduleBlockCreator;

	/// Reasons recorded on a scheduling candidate. The enumerator value is also
	/// used as a bit index into SISchedulerCandidate::RepeatReasonSet.
	enum SIScheduleCandReason {
	  NoCand,
	  RegUsage,
	  Latency,
	  Successor,
	  Depth,
	  NodeOrder
	};

	/// State common to the SUnit-level and block-level candidate structures.
	struct SISchedulerCandidate {
	  // The reason for this candidate.
	  SIScheduleCandReason Reason = NoCand;

	  // Set of reasons that apply to multiple candidates.
	  // Bit (1 << R) is set once reason R has been passed to setRepeat().
	  uint32_t RepeatReasonSet = 0;

	  SISchedulerCandidate() = default;

	  /// Returns true if \p R has been recorded with setRepeat().
	  /// Const-qualified so it can be queried through a const reference.
	  bool isRepeat(SIScheduleCandReason R) const {
	    // Build the mask in unsigned arithmetic to match RepeatReasonSet.
	    return (RepeatReasonSet & (uint32_t(1) << R)) != 0;
	  }

	  /// Records \p R in RepeatReasonSet.
	  void setRepeat(SIScheduleCandReason R) {
	    RepeatReasonSet |= (uint32_t(1) << R);
	  }
	};

	/// Kind of link stored alongside each successor of a SIScheduleBlock.
	// NOTE(review): presumably Data marks a link that carries a register value
	// and NoData an ordering-only link -- confirm against the implementation.
	enum SIScheduleBlockLinkKind { NoData = 0, Data = 1 };

	class SIScheduleBlock {
	SIScheduleDAGMI *DAG;
	SIScheduleBlockCreator *BC;

	std::vector<SUnit*> SUnits;
	std::map<unsigned, unsigned> NodeNum2Index;
	std::vector<SUnit*> TopReadySUs;
	std::vector<SUnit*> ScheduledSUnits;

	/// The top of the unscheduled zone.
	IntervalPressure TopPressure;
	RegPressureTracker TopRPTracker;

	// Pressure: number of said class of registers needed to
	// store the live virtual and real registers.
	// We do care only of SGPR32 and VGPR32 and do track only virtual registers.
	// Pressure of additional registers required inside the block.
	std::vector<unsigned> InternalAdditionalPressure;
	// Pressure of input and output registers
	std::vector<unsigned> LiveInPressure;
	std::vector<unsigned> LiveOutPressure;
	// Registers required by the block, and outputs.
	// We do track only virtual registers.
	// Note that some registers are not 32 bits,
	// and thus the pressure is not equal
	// to the number of live registers.
	std::set<unsigned> LiveInRegs;
	std::set<unsigned> LiveOutRegs;

	bool Scheduled = false;
	bool HighLatencyBlock = false;

	std::vector<unsigned> HasLowLatencyNonWaitedParent;

	// Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
	unsigned ID;

	std::vector<SIScheduleBlock*> Preds; // All blocks predecessors.
	// All blocks successors, and the kind of link
	std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
	unsigned NumHighLatencySuccessors = 0;

	public:
	SIScheduleBlock(SIScheduleDAGMI DAG, SIScheduleBlockCreator BC,
	unsigned ID):
	DAG(DAG), BC(BC), TopRPTracker(TopPressure), ID(ID) {}

	~SIScheduleBlock() = default;

	unsigned getID() const { return ID; }

	/// Functions for Block construction.
	void addUnit(SUnit *SU);

	// When all SUs have been added.
	void finalizeUnits();

	// Add block pred, which has instruction predecessor of SU.
	void addPred(SIScheduleBlock *Pred);
	void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);

	const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
	ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
	getSuccs() const { return Succs; }

	unsigned Height; // Maximum topdown path length to block without outputs
	unsigned Depth; // Maximum bottomup path length to block without inputs

	unsigned getNumHighLatencySuccessors() const {
	return NumHighLatencySuccessors;
	}

	bool isHighLatencyBlock() { return HighLatencyBlock; }

	// This is approximative.
	// Ideally should take into accounts some instructions (rcp, etc)
	// are 4 times slower.
	int getCost() { return SUnits.size(); }

	// The block Predecessors and Successors must be all registered
	// before fastSchedule().
	// Fast schedule with no particular requirement.
	void fastSchedule();

	std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }

	// Complete schedule that will try to minimize reg pressure and
	// low latencies, and will fill liveins and liveouts.
	// Needs all MIs to be grouped between BeginBlock and EndBlock.
	// The MIs can be moved after the scheduling,
	// it is just used to allow correct track of live registers.
	void schedule(MachineBasicBlock::iterator BeginBlock,
	MachineBasicBlock::iterator EndBlock);

	bool isScheduled() { return Scheduled; }

	// Needs the block to be scheduled inside
	// TODO: find a way to compute it.
	std::vector<unsigned> &getInternalAdditionalRegUsage() {
	return InternalAdditionalPressure;
	}

	std::set<unsigned> &getInRegs() { return LiveInRegs; }
	std::set<unsigned> &getOutRegs() { return LiveOutRegs; }

	void printDebug(bool Full);

	private:
	struct SISchedCandidate : SISchedulerCandidate {
	// The best SUnit candidate.
	SUnit *SU = nullptr;

	unsigned SGPRUsage;
	unsigned VGPRUsage;
	bool IsLowLatency;
	unsigned LowLatencyOffset;
	bool HasLowLatencyNonWaitedParent;

	SISchedCandidate() = default;

	bool isValid() const { return SU; }

	// Copy the status of another candidate without changing policy.
	void setBest(SISchedCandidate &Best) {
	assert(Best.Reason != NoCand && "uninitialized Sched candidate");
	SU = Best.SU;
	Reason = Best.Reason;
	SGPRUsage = Best.SGPRUsage;
	VGPRUsage = Best.VGPRUsage;
	IsLowLatency = Best.IsLowLatency;
	LowLatencyOffset = Best.LowLatencyOffset;
	HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
	}
	};

	void undoSchedule();

	void undoReleaseSucc(SUnit SU, SDep SuccEdge);
	void releaseSucc(SUnit SU, SDep SuccEdge);
	// InOrOutBlock: restrict to links pointing inside the block (true),
	// or restrict to links pointing outside the block (false).
	void releaseSuccessors(SUnit *SU, bool InOrOutBlock);

	void nodeScheduled(SUnit *SU);
	void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
	void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
	SUnit* pickNode();
	void traceCandidate(const SISchedCandidate &Cand);
	void initRegPressure(MachineBasicBlock::iterator BeginBlock,
	MachineBasicBlock::iterator EndBlock);
	};

	/// Bundle returned by SIScheduleBlockCreator::getBlocks(): the block pointers
	/// plus two index tables relating blocks to their top-down positions.
	struct SIScheduleBlocks {
	  // Non-owning block pointers.
	  std::vector<SIScheduleBlock *> Blocks;
	  // Top-down index -> block.
	  std::vector<int> TopDownIndex2Block;
	  // Block -> top-down index.
	  std::vector<int> TopDownBlock2Index;
	};

	/// Block-creation variants accepted by SIScheduleBlockCreator::getBlocks().
	enum SISchedulerBlockCreatorVariant {
	  LatenciesAlone = 0,
	  LatenciesGrouped = 1,
	  LatenciesAlonePlusConsecutive = 2
	};

	/// Produces SIScheduleBlocks for a requested variant (see getBlocks()).
	/// The private color* helpers assign group ids ("colors") to SUs, as
	/// described by the comment on each of them.
	class SIScheduleBlockCreator {
	  SIScheduleDAGMI *DAG;
	  // Owning storage for the blocks; unique_ptr frees them for us.
	  std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
	  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks> Blocks;
	  std::vector<SIScheduleBlock *> CurrentBlocks;
	  std::vector<int> Node2CurrentBlock;

	  // Topological sort.
	  // Maps topological index to the node number.
	  std::vector<int> TopDownIndex2Block;
	  std::vector<int> TopDownBlock2Index;
	  std::vector<int> BottomUpIndex2Block;

	  // Color numbering:
	  //   0                    -> color not given.
	  //   1 to SUnits.size()   -> Reserved group (only add elements to them).
	  //   above                -> other groups.
	  int NextReservedID;
	  int NextNonReservedID;
	  std::vector<int> CurrentColoring;
	  std::vector<int> CurrentTopDownReservedDependencyColoring;
	  std::vector<int> CurrentBottomUpReservedDependencyColoring;

	public:
	  SIScheduleBlockCreator(SIScheduleDAGMI *DAG);

	  /// Returns the blocks for \p BlockVariant (by value).
	  SIScheduleBlocks getBlocks(SISchedulerBlockCreatorVariant BlockVariant);

	  bool isSUInBlock(SUnit *SU, unsigned ID);

	private:
	  // Every high latency gets its own Reserved color.
	  void colorHighLatenciesAlone();

	  // High latencies are gathered into groups, each with a Reserved color.
	  void colorHighLatenciesGroups();

	  // Computes the top-down and bottom-up colorings, where the color depends
	  // on the dependencies on Reserved colors.
	  void colorComputeReservedDependencies();

	  // Colors every SU that has no color yet, according to its dependencies on
	  // Reserved groups.
	  void colorAccordingToReservedDependencies();

	  // Divides Blocks that have no bottom-up or top-down dependencies on
	  // Reserved groups. The new colors are computed according to the
	  // dependencies on the other blocks formed with
	  // colorAccordingToReservedDependencies.
	  void colorEndsAccordingToDependencies();

	  // Cuts groups into groups whose SUs are in consecutive order
	  // (except for Reserved groups).
	  void colorForceConsecutiveOrderInGroup();

	  // Merges constant loads whose users are all in another group into that
	  // group.
	  // (TODO: else if all their users depend on the same group, put them there)
	  void colorMergeConstantLoadsNextGroup();

	  // Merges SUs whose users are all in another group into that group.
	  void colorMergeIfPossibleNextGroup();

	  // Same merge as above, but only for Reserved groups.
	  void colorMergeIfPossibleNextGroupOnlyForReserved();

	  // Same merge as above, but only if the group is no more than a few SUs.
	  void colorMergeIfPossibleSmallGroupsToNextGroup();

	  // Divides Blocks of important size.
	  // Idea of implementation: attribute new colors depending on top-down and
	  // bottom-up links to other blocks.
	  void cutHugeBlocks();

	  // Puts all instructions with no users in this scheduling region into one
	  // group (we'd want these groups to be at the end).
	  void regroupNoUserInstructions();

	  // Gives a Reserved color to export instructions.
	  void colorExports();

	  void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);

	  void topologicalSort();

	  void scheduleInsideBlocks();

	  void fillStats();
	};

	/// Block-scheduling variants accepted by SIScheduleBlockScheduler.
	enum SISchedulerBlockSchedulerVariant {
	  BlockLatencyRegUsage = 0,
	  BlockRegUsageLatency = 1,
	  BlockRegUsage = 2
	};

	class SIScheduleBlockScheduler {
	SIScheduleDAGMI *DAG;
	SISchedulerBlockSchedulerVariant Variant;
	std::vector<SIScheduleBlock*> Blocks;

	std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
	std::set<unsigned> LiveRegs;
	// Num of schedulable unscheduled blocks reading the register.
	std::map<unsigned, unsigned> LiveRegsConsumers;

	std::vector<unsigned> LastPosHighLatencyParentScheduled;
	int LastPosWaitedHighLatency;

	std::vector<SIScheduleBlock*> BlocksScheduled;
	unsigned NumBlockScheduled;
	std::vector<SIScheduleBlock*> ReadyBlocks;

	unsigned VregCurrentUsage;
	unsigned SregCurrentUsage;

	// Currently is only approximation.
	unsigned maxVregUsage;
	unsigned maxSregUsage;

	std::vector<unsigned> BlockNumPredsLeft;
	std::vector<unsigned> BlockNumSuccsLeft;

	public:
	SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
	SISchedulerBlockSchedulerVariant Variant,
	SIScheduleBlocks BlocksStruct);
	~SIScheduleBlockScheduler() = default;

	std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; }

	unsigned getVGPRUsage() { return maxVregUsage; }
	unsigned getSGPRUsage() { return maxSregUsage; }

	private:
	struct SIBlockSchedCandidate : SISchedulerCandidate {
	// The best Block candidate.
	SIScheduleBlock *Block = nullptr;

	bool IsHighLatency;
	int VGPRUsageDiff;
	unsigned NumSuccessors;
	unsigned NumHighLatencySuccessors;
	unsigned LastPosHighLatParentScheduled;
	unsigned Height;

	SIBlockSchedCandidate() = default;

	bool isValid() const { return Block; }

	// Copy the status of another candidate without changing policy.
	void setBest(SIBlockSchedCandidate &Best) {
	assert(Best.Reason != NoCand && "uninitialized Sched candidate");
	Block = Best.Block;
	Reason = Best.Reason;
	IsHighLatency = Best.IsHighLatency;
	VGPRUsageDiff = Best.VGPRUsageDiff;
	NumSuccessors = Best.NumSuccessors;
	NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
	LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
	Height = Best.Height;
	}
	};

	bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
	SIBlockSchedCandidate &TryCand);
	bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
	SIBlockSchedCandidate &TryCand);
	SIScheduleBlock *pickBlock();

	void addLiveRegs(std::set<unsigned> &Regs);
	void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
	void releaseBlockSuccs(SIScheduleBlock *Parent);
	void blockScheduled(SIScheduleBlock *Block);

	// Check register pressure change
	// by scheduling a block with these LiveIn and LiveOut.
	std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
	std::set<unsigned> &OutRegs);

	void schedule();
	};

	/// Result of SIScheduler::scheduleVariant().
	// NOTE(review): SUs presumably holds SUnit node numbers in scheduled order --
	// confirm against the implementation.
	struct SIScheduleBlockResult {
	  std::vector<unsigned> SUs;
	  // Zero-initialised so a default-constructed result never carries
	  // indeterminate usage values.
	  unsigned MaxSGPRUsage = 0;
	  unsigned MaxVGPRUsage = 0;
	};

	/// Pairs a SIScheduleDAGMI with the SIScheduleBlockCreator built from it and
	/// exposes scheduleVariant(), which takes a block-creation variant and a
	/// block-scheduling variant and returns a SIScheduleBlockResult.
	class SIScheduler {
	  SIScheduleDAGMI *DAG;
	  SIScheduleBlockCreator BlockCreator;

	public:
	  // The block creator is constructed from the same DAG pointer.
	  SIScheduler(SIScheduleDAGMI *DAG)
	      : DAG(DAG), BlockCreator(DAG) {}

	  ~SIScheduler() = default;

	  struct SIScheduleBlockResult scheduleVariant(
	      SISchedulerBlockCreatorVariant BlockVariant,
	      SISchedulerBlockSchedulerVariant ScheduleVariant);
	};

	/// ScheduleDAGMILive specialisation used by the SI scheduler. Besides the
	/// schedule() override it exposes accessors to inherited scheduling state
	/// and public per-SU tables used when scheduling inside blocks.
	class SIScheduleDAGMI final : public ScheduleDAGMILive {
	  const SIInstrInfo *SITII;
	  const SIRegisterInfo *SITRI;

	  std::vector<SUnit> SUnitsLinksBackup;

	  // For moveLowLatencies. After all Scheduling variants are tested.
	  std::vector<unsigned> ScheduledSUnits;
	  std::vector<unsigned> ScheduledSUnitsInv;

	public:
	  SIScheduleDAGMI(MachineSchedContext *C);

	  ~SIScheduleDAGMI() override;

	  // Entry point for the schedule.
	  void schedule() override;

	  // To init Block's RPTracker: initialises \p RPTracker with this DAG's
	  // MF, RegClassInfo, LIS, BB and RegionBegin.
	  // NOTE(review): the two trailing 'false' arguments are presumably the
	  // TrackLaneMasks / TrackUntiedDefs flags of RegPressureTracker::init --
	  // confirm against RegisterPressure.h.
	  void initRPTracker(RegPressureTracker &RPTracker) {
	    RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin, false, false);
	  }

	  // Accessors to inherited scheduling state.
	  MachineBasicBlock *getBB() { return BB; }
	  MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; }
	  MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; }
	  LiveIntervals *getLIS() { return LIS; }
	  MachineRegisterInfo *getMRI() { return &MRI; }
	  const TargetRegisterInfo *getTRI() { return TRI; }
	  ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
	  SUnit &getEntrySU() { return EntrySU; }
	  SUnit &getExitSU() { return ExitSU; }

	  void restoreSULinksLeft();

	  // The template parameter is named IteratorT because identifiers that begin
	  // with an underscore followed by an uppercase letter are reserved.
	  template <typename IteratorT>
	  void fillVgprSgprCost(IteratorT First, IteratorT End,
	                        unsigned &VgprUsage, unsigned &SgprUsage);

	  /// Returns the RegUnit of every entry in RPTracker's LiveInRegs as a set.
	  std::set<unsigned> getInRegs() {
	    std::set<unsigned> InRegs;
	    for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
	      InRegs.insert(RegMaskPair.RegUnit);
	    }
	    return InRegs;
	  }

	  /// Returns the RegUnit of every entry in RPTracker's LiveOutRegs as a set.
	  std::set<unsigned> getOutRegs() {
	    std::set<unsigned> OutRegs;
	    for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
	      OutRegs.insert(RegMaskPair.RegUnit);
	    }
	    return OutRegs;
	  }

	private:
	  void topologicalSort();
	  // After scheduling is done, improve low latency placements.
	  void moveLowLatencies();

	public:
	  // Some stats for scheduling inside blocks.
	  std::vector<unsigned> IsLowLatencySU;
	  std::vector<unsigned> LowLatencyOffset;
	  std::vector<unsigned> IsHighLatencySU;
	  // Topological sort
	  // Maps topological index to the node number.
	  std::vector<int> TopDownIndex2SU;
	  std::vector<int> BottomUpIndex2SU;
	};

	} // end namespace llvm

	#endif // LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H