//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// AMDGPU (GCN) machine scheduler strategies and the multi-stage scheduling
/// infrastructure built on top of them.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#include "GCNRegPressure.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
class SIMachineFunctionInfo;
class SIRegisterInfo;
class GCNSubtarget;
class GCNSchedStage;
enum class GCNSchedStageID : unsigned {
OccInitialSchedule = 0,
RewriteMFMAForm = 1,
UnclusteredHighRPReschedule = 2,
ClusteredLowOccupancyReschedule = 3,
PreRARematerialize = 4,
ILPInitialSchedule = 5,
MemoryClauseInitialSchedule = 6
};
#ifndef NDEBUG
raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
#endif
/// This is a minimal scheduler strategy. The main difference between this
/// and the GenericScheduler is that GCNSchedStrategy uses different
/// heuristics to determine excess/critical pressure sets.
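///
/// Illustrative sketch of how a driver walks the strategy's stages (the real
/// loop lives in GCNScheduleDAGMILive::runSchedStages; names here are only for
/// illustration):
/// \code
///   GCNSchedStrategy &S = ...;
///   while (S.advanceStage()) {
///     GCNSchedStageID ID = S.getCurrentStage();
///     // Create the matching GCNSchedStage and run it over all regions.
///   }
/// \endcode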
class GCNSchedStrategy : public GenericScheduler {
protected:
SUnit *pickNodeBidirectional(bool &IsTopNode, bool &PickedPending);
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand, bool &IsPending,
bool IsBottomUp);
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI, unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp);
/// Evaluates instructions in the pending queue using a subset of scheduling
/// heuristics.
///
/// Instructions that cannot be issued due to hardware constraints are placed
/// in the pending queue rather than the available queue, making them normally
/// invisible to scheduling heuristics. However, in certain scenarios (such as
/// avoiding register spilling), it may be beneficial to consider scheduling
/// these not-yet-ready instructions.
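  ///
  /// Illustrative sketch (names assumed): a boundary's pending queue can be
  /// scanned and tryPendingCandidate used to decide whether a not-yet-ready
  /// SU should displace the current best candidate:
  /// \code
  ///   for (SUnit *SU : Zone.Pending) {
  ///     SchedCandidate TryCand(Cand.Policy);
  ///     TryCand.SU = SU;
  ///     if (tryPendingCandidate(Cand, TryCand, &Zone))
  ///       Cand.setBest(TryCand);
  ///   }
  /// \endcode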
bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const;
void printCandidateDecision(const SchedCandidate &Current,
const SchedCandidate &Preferred);
std::vector<unsigned> Pressure;
std::vector<unsigned> MaxPressure;
unsigned SGPRExcessLimit;
unsigned VGPRExcessLimit;
unsigned TargetOccupancy;
MachineFunction *MF;
// Scheduling stages for this strategy.
SmallVector<GCNSchedStageID, 4> SchedStages;
  // Iterator pointing to the current stage in SchedStages.
SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
// GCN RP Tracker for top-down scheduling
mutable GCNDownwardRPTracker DownwardTracker;
  // GCN RP Tracker for bottom-up scheduling
mutable GCNUpwardRPTracker UpwardTracker;
public:
  // schedule() has seen register pressure over the critical limits and had to
  // track register pressure for actual scheduling heuristics.
bool HasHighPressure;
// Schedule known to have excess register pressure. Be more conservative in
// increasing ILP and preserving VGPRs.
bool KnownExcessRP = false;
  // An error margin is necessary because of the generic RP tracker's
  // imprecision. It can be adjusted upwards when tuning heuristics to reduce
  // register pressure more aggressively.
unsigned ErrorMargin = 3;
// Bias for SGPR limits under a high register pressure.
const unsigned HighRPSGPRBias = 7;
// Bias for VGPR limits under a high register pressure.
const unsigned HighRPVGPRBias = 7;
unsigned SGPRCriticalLimit;
unsigned VGPRCriticalLimit;
unsigned SGPRLimitBias = 0;
unsigned VGPRLimitBias = 0;
GCNSchedStrategy(const MachineSchedContext *C);
SUnit *pickNode(bool &IsTopNode) override;
void schedNode(SUnit *SU, bool IsTopNode) override;
void initialize(ScheduleDAGMI *DAG) override;
unsigned getTargetOccupancy() { return TargetOccupancy; }
void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
GCNSchedStageID getCurrentStage();
// Advances stage. Returns true if there are remaining stages.
bool advanceStage();
bool hasNextStage() const;
GCNSchedStageID getNextStage() const;
GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }
GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; }
};
/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
/// the maximum number of waves per SIMD).
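///
/// Its constructor queues the stage pipeline this strategy runs, e.g.
/// (illustrative sketch; see the constructor's definition for the
/// authoritative stage list):
/// \code
///   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
///   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
/// \endcode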
class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
public:
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C,
bool IsLegacyScheduler = false);
};
/// The goal of this scheduling strategy is to maximize ILP for a single wave
/// (i.e. latency hiding).
class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
protected:
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;
public:
GCNMaxILPSchedStrategy(const MachineSchedContext *C);
};
/// The goal of this scheduling strategy is to maximize memory clauses for a
/// single wave.
class GCNMaxMemoryClauseSchedStrategy final : public GCNSchedStrategy {
protected:
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;
public:
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C);
};
class ScheduleMetrics {
unsigned ScheduleLength;
unsigned BubbleCycles;
public:
ScheduleMetrics() = default;
ScheduleMetrics(unsigned L, unsigned BC)
: ScheduleLength(L), BubbleCycles(BC) {}
unsigned getLength() const { return ScheduleLength; }
unsigned getBubbles() const { return BubbleCycles; }
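  /// Returns bubble cycles normalized by schedule length and scaled by
  /// ScaleFactor. Worked example (assuming ScaleFactor == 100, as the 1% note
  /// below implies): a 200-cycle schedule with 15 bubble cycles gives
  /// (15 * 100) / 200 = 7.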
unsigned getMetric() const {
unsigned Metric = (BubbleCycles * ScaleFactor) / ScheduleLength;
    // The metric is zero if bubbles account for less than 1% of the schedule
    // length, which is too small to report, so return 1 instead.
return Metric ? Metric : 1;
}
static const unsigned ScaleFactor;
};
inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
  OS << "\n Schedule Metric (scaled by " << ScheduleMetrics::ScaleFactor
     << ") is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/"
     << Sm.getLength() << " ]\n";
  return OS;
}
class GCNScheduleDAGMILive;
class RegionPressureMap {
GCNScheduleDAGMILive *DAG;
  // The live-in/live-out register sets, keyed by the first or last MI in the
  // region before scheduling.
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> RegionLiveRegMap;
  // The mapping of RegionIdx to key instruction.
DenseMap<unsigned, MachineInstr *> IdxToInstruction;
// Whether we are calculating LiveOuts or LiveIns
bool IsLiveOut;
public:
RegionPressureMap() = default;
RegionPressureMap(GCNScheduleDAGMILive *GCNDAG, bool LiveOut)
: DAG(GCNDAG), IsLiveOut(LiveOut) {}
// Build the Instr->LiveReg and RegionIdx->Instr maps
void buildLiveRegMap();
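  // Typical sequence (illustrative sketch): the DAG's RegionLiveOuts member
  // builds the map once, after which stages query cached sets by region index:
  //   RegionLiveOuts.buildLiveRegMap();
  //   GCNRPTracker::LiveRegSet &LO =
  //       RegionLiveOuts.getLiveRegsForRegionIdx(RegionIdx);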
// Retrieve the LiveReg for a given RegionIdx
GCNRPTracker::LiveRegSet &getLiveRegsForRegionIdx(unsigned RegionIdx) {
assert(IdxToInstruction.contains(RegionIdx));
MachineInstr *Key = IdxToInstruction[RegionIdx];
return RegionLiveRegMap[Key];
}
};
/// A region's boundaries, i.e. a pair of instruction bundle iterators. The lower
/// boundary is inclusive, the upper boundary is exclusive.
using RegionBoundaries =
std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>;
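// Example (illustrative, names assumed): since the upper boundary is
// exclusive, a region is traversed as a half-open range:
//   for (MachineBasicBlock::iterator I = R.first, E = R.second; I != E; ++I)
//     ++NumRegionMIs;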
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class OccInitialScheduleStage;
friend class RewriteMFMAFormStage;
friend class UnclusteredHighRPStage;
friend class ClusteredLowOccStage;
friend class PreRARematStage;
friend class ILPInitialScheduleStage;
friend class RegionPressureMap;
const GCNSubtarget &ST;
SIMachineFunctionInfo &MFI;
  // Occupancy target at the beginning of the function scheduling cycle.
unsigned StartingOccupancy;
  // Minimal real occupancy recorded for the function.
unsigned MinOccupancy;
  // Vector of regions recorded for later rescheduling.
SmallVector<RegionBoundaries, 32> Regions;
// Record regions with high register pressure.
BitVector RegionsWithHighRP;
// Record regions with excess register pressure over the physical register
// limit. Register pressure in these regions usually will result in spilling.
BitVector RegionsWithExcessRP;
// Regions that have IGLP instructions (SCHED_GROUP_BARRIER or IGLP_OPT).
BitVector RegionsWithIGLPInstrs;
// Region live-in cache.
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
// Region pressure cache.
SmallVector<GCNRegPressure, 32> Pressure;
// Temporary basic block live-in cache.
DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;
  // The map of the initial first region instruction to region live-in
  // registers.
  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> BBLiveInMap;
  // Calculate the map of the initial first region instruction to region
  // live-in registers.
  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> getRegionLiveInMap() const;
  // Calculate the map of the initial last region instruction to region
  // live-out registers.
  DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
  getRegionLiveOutMap() const;
  // The live-out registers per region. These are internally stored as a map of
  // the initial last region instruction to region live-out registers, but can
  // be retrieved by region index through calls to getLiveRegsForRegionIdx.
RegionPressureMap RegionLiveOuts;
// Return current region pressure.
GCNRegPressure getRealRegPressure(unsigned RegionIdx) const;
// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
/// Makes the scheduler try to achieve an occupancy of \p TargetOccupancy.
void setTargetOccupancy(unsigned TargetOccupancy);
void runSchedStages();
std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
void deleteMI(unsigned RegionIdx, MachineInstr *MI);
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S);
void schedule() override;
void finalizeSchedule() override;
};
// GCNSchedStrategy applies multiple scheduling stages to a function.
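//
// A stage runs under a driver loop roughly shaped like the sketch below
// (illustrative; the real control flow lives in
// GCNScheduleDAGMILive::runSchedStages):
//   if (Stage->initGCNSchedStage()) {
//     for (each scheduling region)
//       if (Stage->initGCNRegion()) {
//         // ... schedule the region ...
//         Stage->finalizeGCNRegion();
//       }
//     Stage->finalizeGCNSchedStage();
//   }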
class GCNSchedStage {
protected:
GCNScheduleDAGMILive &DAG;
GCNSchedStrategy &S;
MachineFunction &MF;
SIMachineFunctionInfo &MFI;
const GCNSubtarget &ST;
const GCNSchedStageID StageID;
// The current block being scheduled.
MachineBasicBlock *CurrentMBB = nullptr;
// Current region index.
unsigned RegionIdx = 0;
// Record the original order of instructions before scheduling.
std::vector<MachineInstr *> Unsched;
// RP before scheduling the current region.
GCNRegPressure PressureBefore;
// RP after scheduling the current region.
GCNRegPressure PressureAfter;
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG);
public:
// Initialize state for a scheduling stage. Returns false if the current stage
// should be skipped.
virtual bool initGCNSchedStage();
// Finalize state after finishing a scheduling pass on the function.
virtual void finalizeGCNSchedStage();
// Setup for scheduling a region. Returns false if the current region should
// be skipped.
virtual bool initGCNRegion();
// Finalize state after scheduling a region.
virtual void finalizeGCNRegion();
// Track whether a new region is also a new MBB.
void setupNewBlock();
// Check result of scheduling.
void checkScheduling();
  // Computes the given schedule's virtual execution time in clock cycles.
ScheduleMetrics getScheduleMetrics(const std::vector<SUnit> &InputSchedule);
ScheduleMetrics getScheduleMetrics(const GCNScheduleDAGMILive &DAG);
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
DenseMap<unsigned, unsigned> &ReadyCycles,
const TargetSchedModel &SM);
// Returns true if scheduling should be reverted.
virtual bool shouldRevertScheduling(unsigned WavesAfter);
// Returns true if current region has known excess pressure.
bool isRegionWithExcessRP() const {
return DAG.RegionsWithExcessRP[RegionIdx];
}
// The region number this stage is currently working on
unsigned getRegionIdx() { return RegionIdx; }
// Returns true if the new schedule may result in more spilling.
bool mayCauseSpilling(unsigned WavesAfter);
/// Sets the schedule of region \p RegionIdx in block \p MBB to \p MIOrder.
/// The MIs in \p MIOrder must be exactly the same as the ones currently
/// existing inside the region, only in a different order that honors def-use
/// chains.
void modifyRegionSchedule(unsigned RegionIdx, MachineBasicBlock *MBB,
ArrayRef<MachineInstr *> MIOrder);
void advanceRegion() { RegionIdx++; }
virtual ~GCNSchedStage() = default;
};
class OccInitialScheduleStage : public GCNSchedStage {
public:
bool shouldRevertScheduling(unsigned WavesAfter) override;
OccInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};
class RewriteMFMAFormStage : public GCNSchedStage {
private:
  // Record regions with excess ArchVGPR register pressure over the physical
// register limit. Register pressure in these regions usually will result in
// spilling.
BitVector RegionsWithExcessArchVGPR;
const SIInstrInfo *TII;
const SIRegisterInfo *SRI;
/// Do a speculative rewrite and collect copy locations. The speculative
/// rewrite allows us to calculate the RP of the code after the rewrite, and
/// the copy locations allow us to calculate the total cost of copies required
  /// for the rewrite. Stores the rewritten instructions in \p RewriteCands,
  /// the copy locations for uses (of the MFMA result) in \p CopyForUse, and
  /// the copy locations for defs (of the MFMA operands) in \p CopyForDef.
bool
initHeuristics(std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
SmallPtrSetImpl<MachineInstr *> &CopyForDef);
/// Calculate the rewrite cost and undo the state change (e.g. rewriting) done
/// in initHeuristics. Uses \p CopyForUse and \p CopyForDef to calculate copy
/// costs, and \p RewriteCands to undo rewriting.
int64_t getRewriteCost(
const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
const SmallPtrSetImpl<MachineInstr *> &CopyForDef);
/// Do the final rewrite on \p RewriteCands and insert any needed copies.
bool
rewrite(const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands);
/// \returns true if this MI is a rewrite candidate.
bool isRewriteCandidate(MachineInstr *MI) const;
  /// Finds all the reaching defs of \p UseMO and stores the SlotIndexes into
  /// \p DefIdxs.
void findReachingDefs(MachineOperand &UseMO, LiveIntervals *LIS,
SmallVectorImpl<SlotIndex> &DefIdxs);
  /// Finds all the reaching uses of \p DefMI and stores the use operands in
  /// \p ReachingUses.
void findReachingUses(MachineInstr *DefMI, LiveIntervals *LIS,
SmallVectorImpl<MachineOperand *> &ReachingUses);
public:
bool initGCNSchedStage() override;
RewriteMFMAFormStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};
class UnclusteredHighRPStage : public GCNSchedStage {
private:
// Save the initial occupancy before starting this stage.
unsigned InitialOccupancy;
// Save the temporary target occupancy before starting this stage.
unsigned TempTargetOccupancy;
// Track whether any region was scheduled by this stage.
bool IsAnyRegionScheduled;
public:
bool initGCNSchedStage() override;
void finalizeGCNSchedStage() override;
bool initGCNRegion() override;
bool shouldRevertScheduling(unsigned WavesAfter) override;
UnclusteredHighRPStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};
// Retry function scheduling if the resulting occupancy turned out to be lower
// than the one used for other scheduling passes. This gives more freedom to
// schedule low register pressure blocks.
class ClusteredLowOccStage : public GCNSchedStage {
public:
bool initGCNSchedStage() override;
bool initGCNRegion() override;
bool shouldRevertScheduling(unsigned WavesAfter) override;
ClusteredLowOccStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};
/// Attempts to reduce function spilling or, if there is no spilling, to
/// increase function occupancy by one with respect to register usage by sinking
/// rematerializable instructions to their use. When the stage estimates that
/// reducing spilling or increasing occupancy is possible, it tries to
/// rematerialize as few registers as possible to reduce potential negative
/// effects on function latency.
///
/// The stage only supports rematerializing registers that meet all of the
/// following constraints.
/// 1. The register is virtual and has a single defining instruction.
/// 2. The single defining instruction is either deemed rematerializable by the
/// target-independent logic, or if not, has no non-constant and
/// non-ignorable physical register use.
/// 3. The register has no virtual register use whose live range would be
/// extended by the rematerialization.
/// 4. The register has a single non-debug user in a different region from its
/// defining region.
/// 5. The register is not used by or using another register that is going to be
/// rematerialized.
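///
/// Illustrative shape of the stage (sketch; see initGCNSchedStage and
/// finalizeGCNSchedStage for the real control flow):
/// \code
///   if (setObjective()) {           // defines RPTargets and TargetRegions
///     collectRematRegs(MIRegion);   // gather rematerialization candidates
///     // Score candidates and rematerialize the most beneficial ones until
///     // all TargetRegions meet their RP target, then reschedule affected
///     // regions. If the occupancy objective is missed, roll everything
///     // back; otherwise commit the rematerializations.
///   }
/// \endcode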
class PreRARematStage : public GCNSchedStage {
private:
/// A rematerializable register.
struct RematReg {
/// Single MI defining the rematerializable register.
MachineInstr *DefMI;
/// Single user of the rematerializable register.
MachineInstr *UseMI;
/// Regions in which the register is live-in/live-out/live anywhere.
BitVector LiveIn, LiveOut, Live;
/// The rematerializable register's lane bitmask.
LaneBitmask Mask;
/// Defining and using regions.
unsigned DefRegion, UseRegion;
RematReg(MachineInstr *DefMI, MachineInstr *UseMI,
GCNScheduleDAGMILive &DAG,
const DenseMap<MachineInstr *, unsigned> &MIRegion);
/// Returns the rematerializable register. Do not call after deleting the
/// original defining instruction.
Register getReg() const { return DefMI->getOperand(0).getReg(); }
/// Determines whether this rematerialization may be beneficial in at least
/// one target region.
bool maybeBeneficial(const BitVector &TargetRegions,
ArrayRef<GCNRPTarget> RPTargets) const;
/// Determines if the register is both unused and live-through in region \p
/// I. This guarantees that rematerializing it will reduce RP in the region.
bool isUnusedLiveThrough(unsigned I) const {
assert(I < Live.size() && "region index out of range");
return LiveIn[I] && LiveOut[I] && I != UseRegion;
}
/// Updates internal structures following a MI rematerialization. Part of
/// the stage instead of the DAG because it makes assumptions that are
/// specific to the rematerialization process.
void insertMI(unsigned RegionIdx, MachineInstr *RematMI,
GCNScheduleDAGMILive &DAG) const;
};
/// A scored rematerialization candidate. Higher scores indicate more
  /// beneficial rematerializations. A null score indicates the
  /// rematerialization is not helpful to reduce RP in target regions.
struct ScoredRemat {
/// The rematerializable register under consideration.
RematReg *Remat;
/// Execution frequency information required by scoring heuristics.
/// Frequencies are scaled down if they are high to avoid overflow/underflow
/// when combining them.
struct FreqInfo {
/// Per-region execution frequencies. 0 when unknown.
SmallVector<uint64_t> Regions;
/// Minimum and maximum observed frequencies.
uint64_t MinFreq, MaxFreq;
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG);
private:
static const uint64_t ScaleFactor = 1024;
};
/// This only initializes state-independent characteristics of \p Remat, not
/// the actual score.
ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
const GCNScheduleDAGMILive &DAG);
    /// Updates the rematerialization's score w.r.t. the current \p RPTargets.
    /// \p Freq provides the per-region execution frequencies.
void update(const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets,
const FreqInfo &Freq, bool ReduceSpill);
/// Returns whether the current score is null, indicating the
/// rematerialization is useless.
bool hasNullScore() const { return !RegionImpact; }
    /// Compares score components of non-null scores pair-wise. A null score
    /// always compares strictly less than a non-null score.
bool operator<(const ScoredRemat &O) const {
if (hasNullScore())
return !O.hasNullScore();
if (O.hasNullScore())
return false;
if (MaxFreq != O.MaxFreq)
return MaxFreq < O.MaxFreq;
if (FreqDiff != O.FreqDiff)
return FreqDiff < O.FreqDiff;
if (RegionImpact != O.RegionImpact)
return RegionImpact < O.RegionImpact;
// Break ties using pointer to rematerializable register. Rematerializable
// registers are collected in instruction order so, within the same
// region, this will prefer registers defined earlier that have longer
// live ranges in their defining region (since the registers we consider
// are always live-out in their defining region).
return Remat > O.Remat;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Printable print() const;
#endif
private:
/// Number of 32-bit registers this rematerialization covers.
unsigned NumRegs;
// The three members below are the scoring components, top to bottom from
// most important to least important when comparing candidates.
/// Frequency of impacted target region with highest known frequency. This
/// only matters when the stage is trying to reduce spilling, so it is
/// always 0 when it is not.
uint64_t MaxFreq;
/// Frequency difference between defining and using regions. Negative values
/// indicate we are rematerializing to higher frequency regions; positive
/// values indicate the contrary.
int64_t FreqDiff;
/// Expected number of target regions impacted by the rematerialization,
/// scaled by the size of the register being rematerialized.
unsigned RegionImpact;
unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;
int64_t getFreqDiff(const FreqInfo &Freq) const;
};
/// Parent MBB to each region, in region order.
SmallVector<MachineBasicBlock *> RegionBB;
/// Register pressure targets for all regions.
SmallVector<GCNRPTarget> RPTargets;
/// Regions which are above the stage's RP target.
BitVector TargetRegions;
  /// The target occupancy the stage is trying to achieve. Empty when the
/// objective is spilling reduction.
std::optional<unsigned> TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
unsigned AchievedOcc;
/// After successful stage initialization, indicates which regions should be
/// rescheduled.
BitVector RescheduleRegions;
/// List of rematerializable registers.
SmallVector<RematReg> RematRegs;
  /// Holds enough information to roll back a rematerialization decision after
  /// re-scheduling.
struct RollbackInfo {
/// The rematerializable register under consideration.
const RematReg *Remat;
/// The rematerialized MI replacing the original defining MI.
MachineInstr *RematMI;
/// Maps register machine operand indices to their original register.
SmallDenseMap<unsigned, Register, 4> RegMap;
RollbackInfo(const RematReg *Remat) : Remat(Remat) {}
};
  /// List of rematerializations to roll back if rematerialization does not end
  /// up being beneficial.
SmallVector<RollbackInfo> Rollbacks;
  /// Pre-re-scheduling (but post-rematerialization) state of a region that we
  /// must keep to be able to revert re-scheduling effects.
struct RegionSchedRevert {
    /// Region number.
unsigned RegionIdx;
/// Original instruction order (both debug and non-debug MIs).
std::vector<MachineInstr *> OrigMIOrder;
/// Maximum pressure recorded in the region.
GCNRegPressure MaxPressure;
RegionSchedRevert(unsigned RegionIdx, ArrayRef<MachineInstr *> OrigMIOrder,
const GCNRegPressure &MaxPressure)
: RegionIdx(RegionIdx), OrigMIOrder(OrigMIOrder),
MaxPressure(MaxPressure) {}
};
/// After re-scheduling, contains pre-re-scheduling data for all re-scheduled
/// regions.
SmallVector<RegionSchedRevert> RegionReverts;
/// Returns the occupancy the stage is trying to achieve.
unsigned getStageTargetOccupancy() const;
/// Determines the stage's objective (increasing occupancy or reducing
/// spilling, set in \ref TargetOcc). Defines \ref RPTargets in all regions to
  /// achieve that objective and marks those that don't achieve it in \ref
/// TargetRegions. Returns whether there is any target region.
bool setObjective();
/// Unsets target regions in \p Regions whose RP target has been reached.
void unsetSatisifedRPTargets(const BitVector &Regions);
/// Fully recomputes RP from the DAG in \p Regions. Among those regions, sets
/// again all \ref TargetRegions that were optimistically marked as satisfied
/// but are actually not, and returns whether there were any such regions.
bool updateAndVerifyRPTargets(const BitVector &Regions);
/// Collects all rematerializable registers and appends them to \ref
/// RematRegs. \p MIRegion maps MIs to their region. Returns whether any
/// rematerializable register was found.
bool collectRematRegs(const DenseMap<MachineInstr *, unsigned> &MIRegion);
/// Rematerializes \p Remat. This removes the rematerialized register from
/// live-in/out lists in the DAG and updates RP targets in all affected
/// regions, which are also marked in \ref RescheduleRegions. Regions in which
/// RP savings are not guaranteed are set in \p RecomputeRP. When \p Rollback
  /// is non-null, fills it with the information required to roll back the
/// rematerialization post-rescheduling.
void rematerialize(const RematReg &Remat, BitVector &RecomputeRP,
RollbackInfo *Rollback);
  /// Rolls back the rematerialization decision represented by \p Rollback.
  /// This updates live-in/out lists in the DAG but does not update cached
  /// register pressures.
void rollback(const RollbackInfo &Rollback) const;
/// Deletes all rematerialized MIs from the MIR when they were kept around for
/// potential rollback.
void commitRematerializations() const;
  /// Whether the MI is rematerializable.
bool isReMaterializable(const MachineInstr &MI);
  /// If remat alone did not increase occupancy to the target one, rolls back
  /// all rematerializations and resets live-ins/RP in all regions impacted by
  /// the stage to their pre-stage values.
void finalizeGCNSchedStage() override;
public:
bool initGCNSchedStage() override;
bool initGCNRegion() override;
void finalizeGCNRegion() override;
bool shouldRevertScheduling(unsigned WavesAfter) override;
PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG), TargetRegions(DAG.Regions.size()),
RescheduleRegions(DAG.Regions.size()) {
const unsigned NumRegions = DAG.Regions.size();
RPTargets.reserve(NumRegions);
RegionBB.reserve(NumRegions);
}
};
class ILPInitialScheduleStage : public GCNSchedStage {
public:
bool shouldRevertScheduling(unsigned WavesAfter) override;
ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};
class MemoryClauseInitialScheduleStage : public GCNSchedStage {
public:
bool shouldRevertScheduling(unsigned WavesAfter) override;
MemoryClauseInitialScheduleStage(GCNSchedStageID StageID,
GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG) {}
};
class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
private:
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
bool HasIGLPInstrs = false;
public:
void schedule() override;
void finalizeSchedule() override;
GCNPostScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S,
bool RemoveKillFlags);
};
} // End namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H