//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This contains a MachineSchedStrategy implementation for maximizing wave
/// occupancy on GCN hardware.
///
/// This pass will apply multiple scheduling stages to the same function.
/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
/// entry point for the scheduling of those regions is
/// GCNScheduleDAGMILive::runSchedStages.
/// Generally, the reason for having multiple scheduling stages is to account
/// for the kernel-wide effect of register usage on occupancy. Usually, only a
/// few scheduling regions will have register pressure high enough to limit
/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
/// other regions.
///
//===----------------------------------------------------------------------===//
#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "machine-scheduler"
using namespace llvm;
static cl::opt<bool> DisableUnclusterHighRP(
"amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
cl::desc("Disable unclustered high register pressure "
"reduction scheduling stage."),
cl::init(false));
static cl::opt<bool> DisableClusteredLowOccupancy(
"amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
cl::desc("Disable clustered low occupancy "
"rescheduling for ILP scheduling stage."),
cl::init(false));
static cl::opt<unsigned> ScheduleMetricBias(
"amdgpu-schedule-metric-bias", cl::Hidden,
cl::desc(
"Sets the bias which adds weight to occupancy vs latency. Set it to "
"100 to chase the occupancy only."),
cl::init(10));
static cl::opt<bool>
RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
cl::desc("Relax occupancy targets for kernels which are memory "
"bound (amdgpu-membound-threshold), or "
"Wave Limited (amdgpu-limit-wave-threshold)."),
cl::init(false));
static cl::opt<bool> GCNTrackers(
"amdgpu-use-amdgpu-trackers", cl::Hidden,
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
}
void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
MF = &DAG->MF;
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
SGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
VGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
// Set the initial TargetOccupancy to the maximum occupancy that we can
// achieve for this function. This effectively sets a lower bound on the
// 'Critical' register limits in the scheduler.
// Allow for lower occupancy targets if kernel is wave limited or memory
// bound, and using the relaxed occupancy feature.
TargetOccupancy =
RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
SGPRCriticalLimit =
std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
if (!KnownExcessRP) {
VGPRCriticalLimit =
std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
} else {
// This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
// it returns a reasonably small number for targets with lots of VGPRs, such
// as GFX10 and GFX11.
LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
"VGPRCriticalLimit calculation method.\n");
unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
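// For example (illustrative numbers only; actual values depend on the
// subtarget): with Addressable = 256, Granule = 4, and TargetOccupancy = 3,
// the budget is alignDown(256 / 3, 4) = 84 VGPRs.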
unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
VGPRBudget = std::max(VGPRBudget, Granule);
VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
}
// Subtract error margin and bias from register limits and avoid overflow.
SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit);
VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit);
SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit);
VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit);
LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
<< ", VGPRExcessLimit = " << VGPRExcessLimit
<< ", SGPRCriticalLimit = " << SGPRCriticalLimit
<< ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}
/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
/// current register pressure.
///
/// This works for the common case, but it has a few exceptions that have been
/// observed through trial and error:
/// - Explicit physical register operands
/// - Subregister definitions
///
/// In both of those cases, PressureDiff doesn't represent the actual pressure,
/// and querying LiveIntervals through the RegPressureTracker is needed to get
/// an accurate value.
///
/// We should eventually only use PressureDiff for maximum performance, but this
/// already allows 80% of SUs to take the fast path without changing scheduling
/// at all. Further changes would either change scheduling, or require a lot
/// more logic to recover an accurate pressure estimate from the PressureDiffs.
static bool canUsePressureDiffs(const SUnit &SU) {
if (!SU.isInstr())
return false;
// Cannot use pressure diffs for subregister defs or with physregs; the
// diffs are imprecise in both cases.
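// Illustrative cases: an instruction with an explicit physical register
// operand such as $vcc, or one defining only a subregister like %0.sub0.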
for (const auto &Op : SU.getInstr()->operands()) {
if (!Op.isReg() || Op.isImplicit())
continue;
if (Op.getReg().isPhysical() ||
(Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
return false;
}
return true;
}
static void getRegisterPressures(
bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need a non-const reference to it.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
if (!GCNTrackers) {
AtTop
? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
: TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
return;
}
// GCNTrackers
Pressure.resize(4, 0);
MachineInstr *MI = SU->getInstr();
GCNRegPressure NewPressure;
if (AtTop) {
GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
} else {
GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
TempUpwardTracker.recede(*MI);
NewPressure = TempUpwardTracker.getPressure();
}
Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
NewPressure.getArchVGPRNum();
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp) {
Cand.SU = SU;
Cand.AtTop = AtTop;
if (!DAG->isTrackingPressure())
return;
Pressure.clear();
MaxPressure.clear();
// We try to use the cached PressureDiffs in the ScheduleDAG whenever
// possible over querying the RegPressureTracker.
//
// RegPressureTracker makes a lot of LIS queries, which are very expensive;
// it is considered a slow path in this context.
//
// PressureDiffs are precomputed and cached, and getPressureDiff is just a
// trivial lookup into an array. It is pretty much free.
//
// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
// PressureDiffs.
if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
DownwardTracker, UpwardTracker, DAG, SRI);
} else {
// Reserve 4 slots.
Pressure.resize(4, 0);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
for (const auto &Diff : DAG->getPressureDiff(SU)) {
if (!Diff.isValid())
continue;
// PressureDiffs is always bottom-up so if we're working top-down we need
// to invert its sign.
Pressure[Diff.getPSet()] +=
(IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
}
#ifdef EXPENSIVE_CHECKS
std::vector<unsigned> CheckPressure, CheckMaxPressure;
getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
DownwardTracker, UpwardTracker, DAG, SRI);
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
errs() << "Register Pressure is inaccurate when calculated through "
"PressureDiff\n"
<< "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
<< ", expected "
<< CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
<< "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
<< ", expected "
<< CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
report_fatal_error("inaccurate register pressure calculation");
}
#endif
}
unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
// instruction that increases the set with the least amount of registers,
// which in our case would be SGPRs. This is rarely what we want, so
// when we report excess/critical register pressure, we do it either
// only for VGPRs or only for SGPRs.
// FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
const unsigned MaxVGPRPressureInc = 16;
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
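// For example (illustrative): with VGPRExcessLimit = 256, VGPR tracking
// kicks in once the current VGPR pressure reaches 240.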
// FIXME: We have to enter REG-EXCESS before we reach the actual threshold
// to increase the likelihood we don't go over the limits. We should improve
// the analysis to look through dependencies to find the path with the least
// register pressure.
// We only need to update the RPDelta for instructions that increase register
// pressure. Instructions that decrease or keep reg pressure the same will be
// marked as RegExcess in tryCandidate() when they are compared with
// instructions that increase the register pressure.
if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
HasHighPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
}
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
HasHighPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
}
// Register pressure is considered 'CRITICAL' if it is approaching a value
// that would reduce the wave occupancy for the execution unit. When
// register pressure is 'CRITICAL', increasing SGPR or VGPR pressure has
// the same cost, so we don't need to prefer one over the other.
int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
if (SGPRDelta >= 0 || VGPRDelta >= 0) {
HasHighPressure = true;
if (SGPRDelta > VGPRDelta) {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
} else {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
}
}
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand,
bool IsBottomUp) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
if (DAG->isTrackingPressure()) {
if (!GCNTrackers) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
} else {
GCNRPTracker *T = IsBottomUp
? static_cast<GCNRPTracker *>(&UpwardTracker)
: static_cast<GCNRPTracker *>(&DownwardTracker);
SGPRPressure = T->getPressure().getSGPRNum();
VGPRPressure = T->getPressure().getArchVGPRNum();
}
}
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
VGPRPressure, IsBottomUp);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryCandidate(Cand, TryCand, ZoneArg);
if (TryCand.Reason != NoCand) {
// Initialize resource delta if needed in case future heuristics query it.
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
Cand.setBest(TryCand);
LLVM_DEBUG(traceCandidate(Cand));
}
}
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional()
SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
IsTopNode = false;
return SU;
}
if (SUnit *SU = Top.pickOnlyChoice()) {
IsTopNode = true;
return SU;
}
// Set the bottom-up policy based on the state of the current bottom zone and
// the instructions outside the zone, including the top zone.
CandPolicy BotPolicy;
setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
// Set the top-down policy based on the state of the current top zone and
// the instructions outside the zone, including the bottom zone.
CandPolicy TopPolicy;
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
// See if BotCand is still valid (because we previously scheduled from Top).
LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
if (!BotCand.isValid() || BotCand.SU->isScheduled ||
BotCand.Policy != BotPolicy) {
BotCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(BotCand));
#ifndef NDEBUG
if (VerifyScheduling) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
/*IsBottomUp=*/true);
assert(TCand.SU == BotCand.SU &&
"Last pick result should correspond to re-picking right now");
}
#endif
}
// Check if the top Q has a better candidate.
LLVM_DEBUG(dbgs() << "Picking from Top:\n");
if (!TopCand.isValid() || TopCand.SU->isScheduled ||
TopCand.Policy != TopPolicy) {
TopCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(TopCand));
#ifndef NDEBUG
if (VerifyScheduling) {
SchedCandidate TCand;
TCand.reset(CandPolicy());
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
/*IsBottomUp=*/false);
assert(TCand.SU == TopCand.SU &&
"Last pick result should correspond to re-picking right now");
}
#endif
}
// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
SchedCandidate Cand = BotCand;
TopCand.Reason = NoCand;
tryCandidate(Cand, TopCand, nullptr);
if (TopCand.Reason != NoCand) {
Cand.setBest(TopCand);
}
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
IsTopNode = Cand.AtTop;
return Cand.SU;
}
// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
if (DAG->top() == DAG->bottom()) {
assert(Top.Available.empty() && Top.Pending.empty() &&
Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
return nullptr;
}
SUnit *SU;
do {
if (RegionPolicy.OnlyTopDown) {
SU = Top.pickOnlyChoice();
if (!SU) {
CandPolicy NoPolicy;
TopCand.reset(NoPolicy);
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
/*IsBottomUp=*/false);
assert(TopCand.Reason != NoCand && "failed to find a candidate");
SU = TopCand.SU;
}
IsTopNode = true;
} else if (RegionPolicy.OnlyBottomUp) {
SU = Bot.pickOnlyChoice();
if (!SU) {
CandPolicy NoPolicy;
BotCand.reset(NoPolicy);
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
/*IsBottomUp=*/true);
assert(BotCand.Reason != NoCand && "failed to find a candidate");
SU = BotCand.SU;
}
IsTopNode = false;
} else {
SU = pickNodeBidirectional(IsTopNode);
}
} while (SU->isScheduled);
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
Bot.removeReady(SU);
LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
<< *SU->getInstr());
return SU;
}
void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (GCNTrackers) {
MachineInstr *MI = SU->getInstr();
IsTopNode ? (void)DownwardTracker.advance(MI, false)
: UpwardTracker.recede(*MI);
}
return GenericScheduler::schedNode(SU, IsTopNode);
}
GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
assert(CurrentStage && CurrentStage != SchedStages.end());
return *CurrentStage;
}
bool GCNSchedStrategy::advanceStage() {
assert(CurrentStage != SchedStages.end());
if (!CurrentStage)
CurrentStage = SchedStages.begin();
else
CurrentStage++;
return CurrentStage != SchedStages.end();
}
bool GCNSchedStrategy::hasNextStage() const {
assert(CurrentStage);
return std::next(CurrentStage) != SchedStages.end();
}
GCNSchedStageID GCNSchedStrategy::getNextStage() const {
assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
return *std::next(CurrentStage);
}
GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
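// Force-disable the AMDGPU-specific RP trackers when running under the
// legacy scheduler.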
GCNTrackers = GCNTrackers & !IsLegacyScheduler;
}
GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
}
bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return true;
}
// Avoid spilling by exceeding the register limit.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Bias PhysReg defs and copies towards their uses and definitions,
// respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return TryCand.Reason != NoCand;
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
// Unconditionally try to reduce latency.
if (tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
return TryCand.Reason != NoCand;
}
// Keep clustered nodes together to encourage downstream peephole
// optimizations which may reduce resource requirements.
//
// This is a best effort to set things up for a post-RA pass. Optimizations
// like generating loads of multiple registers should ideally be done within
// the scheduler pass by combining the loads during DAG postprocessing.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
return TryCand.Reason != NoCand;
// Avoid increasing the max critical pressure in the scheduled region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
TryCand.Reason = NodeOrder;
return true;
}
}
return false;
}
GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
const MachineSchedContext *C)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule);
}
/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions as much as possible. This is achieved by:
/// 1. Prioritizing clustered operations before the stall latency heuristic.
/// 2. Prioritizing long-latency loads before the stall latency heuristic.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending, or nullptr
/// if Cand is from a different zone than TryCand.
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
SchedBoundary *Zone) const {
// Initialize the candidate if needed.
if (!Cand.isValid()) {
TryCand.Reason = NodeOrder;
return true;
}
// Bias PhysReg defs and copies towards their uses and definitions,
// respectively.
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;
if (DAG->isTrackingPressure()) {
// Avoid exceeding the target's limit.
if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Avoid increasing the max critical pressure in the scheduled region.
if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;
}
// MaxMemoryClause-specific: We prioritize clustered instructions as we would
// get more benefit from clausing these memory instructions.
const SUnit *CandNextClusterSU =
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
const SUnit *TryCandNextClusterSU =
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
return TryCand.Reason != NoCand;
// We only compare a subset of features when comparing nodes between
// Top and Bottom boundary. Some properties are simply incomparable, in many
// other instances we should only override the other boundary if something
// is a clear good pick on one boundary. Skip heuristics that are more
// "tie-breaking" in nature.
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
// For loops that are acyclic path limited, aggressively schedule for
// latency. Within a single cycle, whenever CurrMOps > 0, allow normal
// heuristics to take precedence.
if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// MaxMemoryClause-specific: Prioritize long-latency memory loads in
// top-bottom order to hide more latency. The mayLoad check is used to
// exclude store-like instructions, which we do not want to schedule too
// early.
bool TryMayLoad =
TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
if (TryMayLoad || CandMayLoad) {
bool TryLongLatency =
TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
bool CandLongLatency =
10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
Cand, Stall))
return TryCand.Reason != NoCand;
}
// Prioritize instructions that read unbuffered resources by stall cycles.
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
return TryCand.Reason != NoCand;
}
if (SameBoundary) {
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
return TryCand.Reason != NoCand;
}
// Avoid increasing the max pressure of the entire region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
Cand, RegMax, TRI, DAG->MF))
return TryCand.Reason != NoCand;
if (SameBoundary) {
// Avoid critical resource consumption and balance the schedule.
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
return TryCand.Reason != NoCand;
if (tryGreater(TryCand.ResDelta.DemandedResources,
Cand.ResDelta.DemandedResources, TryCand, Cand,
ResourceDemand))
return TryCand.Reason != NoCand;
// Avoid serializing long latency dependence chains.
// For acyclic path limited loops, latency was already checked above.
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
// Fall through to original instruction order.
if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
TryCand.Reason = NodeOrder;
return true;
}
}
return false;
}
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
RegionLiveOuts(this, /*IsLiveOut=*/true) {
// We want regions with a single MI to be scheduled so that we can reason
// about them correctly during scheduling stages that move MIs between regions
// (e.g., rematerialization).
ScheduleSingleMIRegions = true;
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
if (RelaxedOcc) {
MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
if (MinOccupancy != StartingOccupancy)
LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
<< ".\n");
}
}
std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
switch (SchedStageID) {
case GCNSchedStageID::OccInitialSchedule:
return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
case GCNSchedStageID::UnclusteredHighRPReschedule:
return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
case GCNSchedStageID::ClusteredLowOccupancyReschedule:
return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
case GCNSchedStageID::PreRARematerialize:
return std::make_unique<PreRARematStage>(SchedStageID, *this);
case GCNSchedStageID::ILPInitialSchedule:
return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
case GCNSchedStageID::MemoryClauseInitialSchedule:
return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
*this);
}
llvm_unreachable("Unknown SchedStageID.");
}
void GCNScheduleDAGMILive::schedule() {
// Collect all scheduling regions. The actual scheduling is performed in
// GCNScheduleDAGMILive::finalizeSchedule.
Regions.push_back(std::pair(RegionBegin, RegionEnd));
}
GCNRegPressure
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
GCNDownwardRPTracker RPTracker(*LIS);
RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]);
return RPTracker.moveMaxPressure();
}
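/// Returns the MI used as the region's live-out key: \p RegionEnd (stepped
/// back to the last instruction if it is the block end iterator), skipping
/// backward over any debug instructions.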
static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin,
MachineBasicBlock::iterator RegionEnd) {
auto REnd = RegionEnd == RegionBegin->getParent()->end()
? std::prev(RegionEnd)
: RegionEnd;
return &*skipDebugInstructionsBackward(REnd, RegionBegin);
}
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
const MachineBasicBlock *MBB) {
GCNDownwardRPTracker RPTracker(*LIS);
// If the block has a single successor, then the live-ins of that successor
// are the live-outs of the current block. We can reuse the calculated live
// set if the successor will be scheduled after the current block.
// However, due to a bug in LiveInterval analysis, two predecessors of the
// same successor block may compute different lane bitmasks for a live-out
// register. Work around that by sticking to a one-to-one relationship,
// i.e. one predecessor with one successor block.
const MachineBasicBlock *OnlySucc = nullptr;
if (MBB->succ_size() == 1) {
auto *Candidate = *MBB->succ_begin();
if (!Candidate->empty() && Candidate->pred_size() == 1) {
SlotIndexes *Ind = LIS->getSlotIndexes();
if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
OnlySucc = Candidate;
}
}
// Scheduler sends regions from the end of the block upwards.
size_t CurRegion = RegionIdx;
for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
if (Regions[CurRegion].first->getParent() != MBB)
break;
--CurRegion;
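// Since regions are recorded bottom-up within a block, CurRegion now
// indexes the top-most region of MBB.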
auto I = MBB->begin();
auto LiveInIt = MBBLiveIns.find(MBB);
auto &Rgn = Regions[CurRegion];
auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
if (LiveInIt != MBBLiveIns.end()) {
auto LiveIn = std::move(LiveInIt->second);
RPTracker.reset(*MBB->begin(), &LiveIn);
MBBLiveIns.erase(LiveInIt);
} else {
I = Rgn.first;
auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
RPTracker.reset(*I, &LRS);
}
for (;;) {
I = RPTracker.getNext();
if (Regions[CurRegion].first == I || NonDbgMI == I) {
LiveIns[CurRegion] = RPTracker.getLiveRegs();
RPTracker.clearMaxPressure();
}
if (Regions[CurRegion].second == I) {
Pressure[CurRegion] = RPTracker.moveMaxPressure();
if (CurRegion-- == RegionIdx)
break;
auto &Rgn = Regions[CurRegion];
NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
}
RPTracker.advanceToNext();
RPTracker.advanceBeforeNext();
}
if (OnlySucc) {
if (I != MBB->end()) {
RPTracker.advanceToNext();
RPTracker.advance(MBB->end());
}
RPTracker.advanceBeforeNext();
MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
}
}
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getRegionLiveInMap() const {
assert(!Regions.empty());
std::vector<MachineInstr *> RegionFirstMIs;
RegionFirstMIs.reserve(Regions.size());
auto I = Regions.rbegin(), E = Regions.rend();
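// Regions within a block are recorded bottom-up, so reverse iteration
// visits the top-most region of each block first. Record the first MI of
// only that region per block; it serves as the block's live-in lookup key.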
do {
const MachineBasicBlock *MBB = I->first->getParent();
auto *MI = &*skipDebugInstructionsForward(I->first, I->second);
RegionFirstMIs.push_back(MI);
do {
++I;
} while (I != E && I->first->getParent() == MBB);
} while (I != E);
return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet>
GCNScheduleDAGMILive::getRegionLiveOutMap() const {
assert(!Regions.empty());
std::vector<MachineInstr *> RegionLastMIs;
RegionLastMIs.reserve(Regions.size());
for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
}
void RegionPressureMap::buildLiveRegMap() {
IdxToInstruction.clear();
RegionLiveRegMap =
IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
for (unsigned I = 0; I < DAG->Regions.size(); I++) {
MachineInstr *RegionKey =
IsLiveOut
? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
: &*DAG->Regions[I].first;
IdxToInstruction[I] = RegionKey;
}
}
void GCNScheduleDAGMILive::finalizeSchedule() {
// Start actual scheduling here. This function is called by the base
// MachineScheduler after all regions have been recorded by
// GCNScheduleDAGMILive::schedule().
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
RescheduleRegions.resize(Regions.size());
RegionsWithHighRP.resize(Regions.size());
RegionsWithExcessRP.resize(Regions.size());
RegionsWithMinOcc.resize(Regions.size());
RegionsWithIGLPInstrs.resize(Regions.size());
RescheduleRegions.set();
RegionsWithHighRP.reset();
RegionsWithExcessRP.reset();
RegionsWithMinOcc.reset();
RegionsWithIGLPInstrs.reset();
runSchedStages();
}
void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
if (!Regions.empty()) {
BBLiveInMap = getRegionLiveInMap();
if (GCNTrackers)
RegionLiveOuts.buildLiveRegMap();
}
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
if (!Stage->initGCNSchedStage())
continue;
for (auto Region : Regions) {
RegionBegin = Region.first;
RegionEnd = Region.second;
// Setup for scheduling the region and check whether it should be skipped.
if (!Stage->initGCNRegion()) {
Stage->advanceRegion();
exitRegion();
continue;
}
if (GCNTrackers) {
GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
GCNRPTracker::LiveRegSet *RegionLiveIns =
&LiveIns[Stage->getRegionIdx()];
reinterpret_cast<GCNRPTracker *>(DownwardTracker)
->reset(MRI, *RegionLiveIns);
reinterpret_cast<GCNRPTracker *>(UpwardTracker)
->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
Stage->getRegionIdx()));
}
ScheduleDAGMILive::schedule();
Stage->finalizeGCNRegion();
}
Stage->finalizeGCNSchedStage();
}
}
#ifndef NDEBUG
raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
switch (StageID) {
case GCNSchedStageID::OccInitialSchedule:
OS << "Max Occupancy Initial Schedule";
break;
case GCNSchedStageID::UnclusteredHighRPReschedule:
OS << "Unclustered High Register Pressure Reschedule";
break;
case GCNSchedStageID::ClusteredLowOccupancyReschedule:
OS << "Clustered Low Occupancy Reschedule";
break;
case GCNSchedStageID::PreRARematerialize:
OS << "Pre-RA Rematerialize";
break;
case GCNSchedStageID::ILPInitialSchedule:
OS << "Max ILP Initial Schedule";
break;
case GCNSchedStageID::MemoryClauseInitialSchedule:
OS << "Max memory clause Initial Schedule";
break;
}
return OS;
}
#endif
GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
bool GCNSchedStage::initGCNSchedStage() {
if (!DAG.LIS)
return false;
LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
return true;
}
bool UnclusteredHighRPStage::initGCNSchedStage() {
if (DisableUnclusterHighRP)
return false;
if (!GCNSchedStage::initGCNSchedStage())
return false;
if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
return false;
SavedMutations.swap(DAG.Mutations);
DAG.addMutation(
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
InitialOccupancy = DAG.MinOccupancy;
// Aggressively try to reduce register pressure in the unclustered high RP
// stage. Temporarily increase the occupancy target in the region.
S.SGPRLimitBias = S.HighRPSGPRBias;
S.VGPRLimitBias = S.HighRPVGPRBias;
if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
LLVM_DEBUG(
dbgs()
<< "Retrying function scheduling without clustering. "
"Aggressivly try to reduce register pressure to achieve occupancy "
<< DAG.MinOccupancy << ".\n");
return true;
}
bool ClusteredLowOccStage::initGCNSchedStage() {
if (DisableClusteredLowOccupancy)
return false;
if (!GCNSchedStage::initGCNSchedStage())
return false;
// Don't bother trying to improve ILP in lower RP regions if occupancy has not
// been dropped. All regions will have already been scheduled with the ideal
// occupancy targets.
if (DAG.StartingOccupancy <= DAG.MinOccupancy)
return false;
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with lowest recorded occupancy "
<< DAG.MinOccupancy << ".\n");
return true;
}
/// Allows convenient filtering of this stage's debug output.
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns
// for regions in between the defs and the region we sank the def to. This
// will need to be fixed if there is another pass after this one.
assert(!S.hasNextStage());
if (!GCNSchedStage::initGCNSchedStage() || DAG.RegionsWithMinOcc.none() ||
DAG.Regions.size() == 1)
return false;
// Before performing any IR modification, record the parent region of each
// MI and the parent MBB of each region.
const unsigned NumRegions = DAG.Regions.size();
RegionBB.reserve(NumRegions);
for (unsigned I = 0; I < NumRegions; ++I) {
RegionBoundaries Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI)
MIRegion.insert({&*MI, I});
RegionBB.push_back(Region.first->getParent());
}
if (!canIncreaseOccupancyOrReduceSpill())
return false;
// Rematerialize identified instructions and update scheduler's state.
rematerialize();
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
REMAT_DEBUG(
dbgs() << "Retrying function scheduling with new min. occupancy of "
<< AchievedOcc << " from rematerializing (original was "
<< DAG.MinOccupancy << ", target was " << TargetOcc << ")\n");
if (AchievedOcc > DAG.MinOccupancy) {
DAG.MinOccupancy = AchievedOcc;
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, DAG.MinOccupancy);
}
return true;
}
void GCNSchedStage::finalizeGCNSchedStage() {
DAG.finishBlock();
LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}
void UnclusteredHighRPStage::finalizeGCNSchedStage() {
SavedMutations.swap(DAG.Mutations);
S.SGPRLimitBias = S.VGPRLimitBias = 0;
if (DAG.MinOccupancy > InitialOccupancy) {
for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
DAG.RegionsWithMinOcc[IDX] =
DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
<< DAG.MinOccupancy << '\n');
}
GCNSchedStage::finalizeGCNSchedStage();
}
bool GCNSchedStage::initGCNRegion() {
// Check whether this new region is also a new block.
if (DAG.RegionBegin->getParent() != CurrentMBB)
setupNewBlock();
unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
// Skip empty scheduling regions (0 or 1 schedulable instructions).
if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
return false;
LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
<< " " << CurrentMBB->getName()
<< "\n From: " << *DAG.begin() << " To: ";
if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
else dbgs() << "End";
dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
// Save original instruction order before scheduling for possible revert.
Unsched.clear();
Unsched.reserve(DAG.NumRegionInstrs);
if (StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule) {
const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
for (auto &I : DAG) {
Unsched.push_back(&I);
if (SII->isIGLPMutationOnly(I.getOpcode()))
DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
}
} else {
for (auto &I : DAG)
Unsched.push_back(&I);
}
PressureBefore = DAG.Pressure[RegionIdx];
LLVM_DEBUG(
dbgs() << "Pressure before scheduling:\nRegion live-ins:"
<< print(DAG.LiveIns[RegionIdx], DAG.MRI)
<< "Region live-in pressure: "
<< print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
<< "Region register pressure: " << print(PressureBefore));
S.HasHighPressure = false;
S.KnownExcessRP = isRegionWithExcessRP();
if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
SavedMutations.clear();
SavedMutations.swap(DAG.Mutations);
bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule;
DAG.addMutation(createIGroupLPDAGMutation(
IsInitialStage ? AMDGPU::SchedulingPhase::Initial
: AMDGPU::SchedulingPhase::PreRAReentry));
}
return true;
}
bool UnclusteredHighRPStage::initGCNRegion() {
// Only reschedule regions with the minimum occupancy or regions that may have
// spilling (excess register pressure).
if ((!DAG.RegionsWithMinOcc[RegionIdx] ||
DAG.MinOccupancy <= InitialOccupancy) &&
!DAG.RegionsWithExcessRP[RegionIdx])
return false;
return GCNSchedStage::initGCNRegion();
}
bool ClusteredLowOccStage::initGCNRegion() {
// We may need to reschedule this region if it wasn't rescheduled in the last
// stage, or if we found it was testing critical register pressure limits in
// the unclustered reschedule stage. The latter is because we may not have
// been able to raise the min occupancy in the previous stage, so the region
// may be overly constrained even if it was already rescheduled.
if (!DAG.RegionsWithHighRP[RegionIdx])
return false;
return GCNSchedStage::initGCNRegion();
}
bool PreRARematStage::initGCNRegion() {
if (!DAG.RescheduleRegions[RegionIdx])
return false;
return GCNSchedStage::initGCNRegion();
}
void GCNSchedStage::setupNewBlock() {
if (CurrentMBB)
DAG.finishBlock();
CurrentMBB = DAG.RegionBegin->getParent();
DAG.startBlock(CurrentMBB);
// Get real RP for the region if it hasn't been calculated before. After the
// initial schedule stage, real RP will be collected after scheduling.
if (StageID == GCNSchedStageID::OccInitialSchedule ||
StageID == GCNSchedStageID::ILPInitialSchedule ||
StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}
void GCNSchedStage::finalizeGCNRegion() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
DAG.RescheduleRegions[RegionIdx] = false;
if (S.HasHighPressure)
DAG.RegionsWithHighRP[RegionIdx] = true;
// Revert scheduling if we have dropped occupancy or there is some other
// reason that the original schedule is better.
checkScheduling();
if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
SavedMutations.swap(DAG.Mutations);
DAG.exitRegion();
RegionIdx++;
}
void GCNSchedStage::checkScheduling() {
// Check the results of scheduling.
PressureAfter = DAG.getRealRegPressure(RegionIdx);
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
// Early out if we have achieved the occupancy target.
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
unsigned TargetOccupancy = std::min(
S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
unsigned WavesAfter =
std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
unsigned WavesBefore =
std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n");
// We may not be able to keep the current target occupancy because of the just
// scheduled region. We might still be able to revert scheduling if the
// occupancy before was higher, or if the current schedule has register
// pressure higher than the excess limits which could lead to more spilling.
unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
// Allow memory bound functions to drop to 4 waves if not limited by an
// attribute.
if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
WavesAfter >= MFI.getMinAllowedOccupancy()) {
LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
<< MFI.getMinAllowedOccupancy() << " waves\n");
NewOccupancy = WavesAfter;
}
if (NewOccupancy < DAG.MinOccupancy) {
DAG.MinOccupancy = NewOccupancy;
MFI.limitOccupancy(DAG.MinOccupancy);
DAG.RegionsWithMinOcc.reset();
LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
<< DAG.MinOccupancy << ".\n");
}
// The maximum number of ArchVGPRs for a non-unified register file, or the
// maximum number of VGPRs + AGPRs in the unified register file case.
unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
// The maximum number of ArchVGPRs for both unified and non-unified register
// files.
unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
PressureAfter.getAGPRNum() > MaxArchVGPRs ||
PressureAfter.getSGPRNum() > MaxSGPRs) {
DAG.RescheduleRegions[RegionIdx] = true;
DAG.RegionsWithHighRP[RegionIdx] = true;
DAG.RegionsWithExcessRP[RegionIdx] = true;
}
// Revert if this region's schedule would cause a drop in occupancy or
// spilling.
if (shouldRevertScheduling(WavesAfter)) {
revertScheduling();
} else {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
}
}
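/// Computes the cycle at which \p SU becomes ready: the maximum over its
/// register-dependence predecessors of (predecessor ready cycle +
/// predecessor latency), but no earlier than \p CurrCycle.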
unsigned
GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
DenseMap<unsigned, unsigned> &ReadyCycles,
const TargetSchedModel &SM) {
unsigned ReadyCycle = CurrCycle;
for (auto &D : SU.Preds) {
if (D.isAssignedRegDep()) {
MachineInstr *DefMI = D.getSUnit()->getInstr();
unsigned Latency = SM.computeInstrLatency(DefMI);
unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
}
}
ReadyCycles[SU.NodeNum] = ReadyCycle;
return ReadyCycle;
}
#ifndef NDEBUG
struct EarlierIssuingCycle {
bool operator()(std::pair<MachineInstr *, unsigned> A,
std::pair<MachineInstr *, unsigned> B) const {
return A.second < B.second;
}
};
static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
EarlierIssuingCycle> &ReadyCycles) {
if (ReadyCycles.empty())
return;
unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
<< " ##################\n# Cycle #\t\t\tInstruction "
" "
" \n";
unsigned IPrev = 1;
for (auto &I : ReadyCycles) {
if (I.second > IPrev + 1)
dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
<< " CYCLES DETECTED ******************************\n\n";
dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
IPrev = I.second;
}
}
#endif
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
ReadyCyclesSorted;
#endif
const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
unsigned SumBubbles = 0;
DenseMap<unsigned, unsigned> ReadyCycles;
unsigned CurrCycle = 0;
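// SumBubbles accumulates the idle cycles between consecutive issue slots;
// the reported metric is bubbles per ScaleFactor (100) cycles of schedule
// length (see the debug print below).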
for (auto &SU : InputSchedule) {
unsigned ReadyCycle =
computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
#endif
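// Model a single-issue, in-order pipeline: the next SU can issue no
// earlier than the cycle after this one becomes ready.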
CurrCycle = ++ReadyCycle;
}
#ifndef NDEBUG
LLVM_DEBUG(
printScheduleModel(ReadyCyclesSorted);
dbgs() << "\n\t"
<< "Metric: "
<< (SumBubbles
? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
: 1)
<< "\n\n");
#endif
return ScheduleMetrics(CurrCycle, SumBubbles);
}
ScheduleMetrics
GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
ReadyCyclesSorted;
#endif
const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
unsigned SumBubbles = 0;
DenseMap<unsigned, unsigned> ReadyCycles;
unsigned CurrCycle = 0;
for (auto &MI : DAG) {
SUnit *SU = DAG.getSUnit(&MI);
if (!SU)
continue;
unsigned ReadyCycle =
computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
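// Model a single-issue, in-order pipeline: the next SU can issue no
// earlier than the cycle after this one becomes ready.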
CurrCycle = ++ReadyCycle;
}
#ifndef NDEBUG
LLVM_DEBUG(
printScheduleModel(ReadyCyclesSorted);
dbgs() << "\n\t"
<< "Metric: "
<< (SumBubbles
? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
: 1)
<< "\n\n");
#endif
return ScheduleMetrics(CurrCycle, SumBubbles);
}
bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
if (WavesAfter < DAG.MinOccupancy)
return true;
// For dynamic VGPR mode, we don't want to waste any VGPR blocks.
if (ST.isDynamicVGPREnabled()) {
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, PressureBefore.getVGPRNum(false));
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, PressureAfter.getVGPRNum(false));
if (BlocksAfter > BlocksBefore)
return true;
}
return false;
}
bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
if (PressureAfter == PressureBefore)
return false;
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;
if (mayCauseSpilling(WavesAfter))
return true;
return false;
}
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
// If RP is not reduced in the unclustered reschedule stage, revert to the
// old schedule.
if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
mayCauseSpilling(WavesAfter)) ||
GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
return true;
}
// Do not attempt to relax schedule even more if we are already spilling.
if (isRegionWithExcessRP())
return false;
LLVM_DEBUG(
dbgs()
<< "\n\t *** In shouldRevertScheduling ***\n"
<< " *********** BEFORE UnclusteredHighRPStage ***********\n");
ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
LLVM_DEBUG(
dbgs()
<< "\n *********** AFTER UnclusteredHighRPStage ***********\n");
ScheduleMetrics MAfter = getScheduleMetrics(DAG);
unsigned OldMetric = MBefore.getMetric();
unsigned NewMetric = MAfter.getMetric();
unsigned WavesBefore =
std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
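// Profit scales (WavesAfter / WavesBefore) by (OldMetric + Bias) /
// NewMetric, expressed in units of ScaleFactor (100). For example
// (illustrative values): WavesBefore = 4, WavesAfter = 5, OldMetric = 80,
// NewMetric = 100 with the default bias of 10 gives
// Profit = (500/4 * 9000/100) / 100 = 112 >= 100, so the new schedule is
// kept.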
unsigned Profit =
((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
NewMetric) /
ScheduleMetrics::ScaleFactor;
LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
<< MAfter << "Profit: " << Profit << "\n");
return Profit < ScheduleMetrics::ScaleFactor;
}
bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
if (PressureAfter == PressureBefore)
return false;
if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
return true;
if (mayCauseSpilling(WavesAfter))
return true;
return false;
}
bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
mayCauseSpilling(WavesAfter) ||
(IncreaseOccupancy && WavesAfter < TargetOcc);
}
bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
if (mayCauseSpilling(WavesAfter))
return true;
return false;
}
bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
unsigned WavesAfter) {
return mayCauseSpilling(WavesAfter);
}
bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
!PressureAfter.less(MF, PressureBefore)) {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
return true;
}
return false;
}
void GCNSchedStage::revertScheduling() {
DAG.RegionsWithMinOcc[RegionIdx] =
PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RescheduleRegions[RegionIdx] =
S.hasNextStage() &&
S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
DAG.RegionEnd = DAG.RegionBegin;
int SkippedDebugInstr = 0;
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr()) {
++SkippedDebugInstr;
continue;
}
if (MI->getIterator() != DAG.RegionEnd) {
DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI);
if (!MI->isDebugInstr())
DAG.LIS->handleMove(*MI, true);
}
// Reset read-undef flags and update them later.
for (auto &Op : MI->all_defs())
Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
if (!MI->isDebugInstr()) {
if (DAG.ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
} else {
// Adjust for missing dead-def flags.
RegOpers.detectDeadDefs(*MI, *DAG.LIS);
}
}
DAG.RegionEnd = MI->getIterator();
++DAG.RegionEnd;
LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
}
// After reverting the schedule, debug instrs will now be at the end of the
// block and RegionEnd will point to the first debug instr. Increment
// RegionEnd past the debug instrs to the actual end of the scheduling
// region.
while (SkippedDebugInstr-- > 0)
++DAG.RegionEnd;
// If the Unsched.front() instruction is a debug instruction, this will
// shrink the region since we moved all debug instructions to the end of the
// block. Find the first instruction that is not a debug instruction.
DAG.RegionBegin = Unsched.front()->getIterator();
if (DAG.RegionBegin->isDebugInstr()) {
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())
continue;
DAG.RegionBegin = MI->getIterator();
break;
}
}
// Then move the debug instructions back into their correct place and set
// RegionBegin and RegionEnd if needed.
DAG.placeDebugValues();
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
SlotIndex OriginalIdx,
SlotIndex RematIdx) const {
LiveIntervals *LIS = DAG.LIS;
MachineRegisterInfo &MRI = DAG.MRI;
OriginalIdx = OriginalIdx.getRegSlot(true);
RematIdx = std::max(RematIdx, RematIdx.getRegSlot(true));
for (const MachineOperand &MO : InstToRemat->operands()) {
if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
continue;
if (!MO.getReg().isVirtual()) {
// Do not attempt to reason about PhysRegs
// TODO: better analysis of PhysReg liveness
if (!DAG.MRI.isConstantPhysReg(MO.getReg()) &&
!DAG.TII->isIgnorableUse(MO))
return false;
// Constant PhysRegs and IgnorableUses are okay
continue;
}
LiveInterval &LI = LIS->getInterval(MO.getReg());
const VNInfo *OVNI = LI.getVNInfoAt(OriginalIdx);
assert(OVNI);
// Don't allow rematerialization immediately after the original def.
// It would be incorrect if InstToRemat redefines the register.
// See PR14098.
if (SlotIndex::isSameInstr(OriginalIdx, RematIdx))
return false;
if (OVNI != LI.getVNInfoAt(RematIdx))
return false;
// Check that subrange is live at RematIdx.
if (LI.hasSubRanges()) {
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
unsigned SubReg = MO.getSubReg();
LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
: MRI.getMaxLaneMaskForVReg(MO.getReg());
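// Example: a use of sub1 of a wide register only requires the subrange(s)
// covering sub1's lanes to be live at RematIdx; disjoint subranges are
// skipped, and the loop exits early once every lane in LM has been matched
// against a live subrange.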
for (LiveInterval::SubRange &SR : LI.subranges()) {
if ((SR.LaneMask & LM).none())
continue;
if (!SR.liveAt(RematIdx))
return false;
// Early exit if all used lanes are checked. No need to continue.
LM &= ~SR.LaneMask;
if (LM.none())
break;
}
}
}
return true;
}
namespace {
/// Models excess register pressure in a region and tracks our progress as we
/// identify rematerialization opportunities.
struct ExcessRP {
/// Number of excess ArchVGPRs.
unsigned ArchVGPRs = 0;
/// Number of excess AGPRs.
unsigned AGPRs = 0;
/// For unified register files, number of excess VGPRs.
unsigned VGPRs = 0;
/// For unified register files with AGPR usage, number of excess ArchVGPRs to
/// save before we are able to save a whole allocation granule.
unsigned ArchVGPRsToAlignment = 0;
/// Whether the region uses AGPRs.
bool HasAGPRs = false;
/// Whether the subtarget has a unified RF.
bool UnifiedRF;
/// Constructs the excess RP model; determines the excess pressure w.r.t. a
/// maximum number of allowed VGPRs.
ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
/// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
/// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
/// AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether
/// saving these ArchVGPRs helped reduce excess pressure.
bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
/// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
/// these AGPRs helped reduce excess pressure.
bool saveAGPRs(unsigned NumRegs);
/// Returns whether there is any excess register pressure.
operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
<< Excess.VGPRs << " VGPRs (next ArchVGPR alignment in "
<< Excess.ArchVGPRsToAlignment << " registers)\n";
return OS;
}
#endif
private:
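/// Saves up to \p NumRegs registers against \p LeftToSave, decrementing both
/// by the number of registers actually saved. Returns whether any register
/// was saved.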
static inline bool saveRegs(unsigned &LeftToSave, unsigned &NumRegs) {
unsigned NumSaved = std::min(LeftToSave, NumRegs);
NumRegs -= NumSaved;
LeftToSave -= NumSaved;
return NumSaved;
}
};
} // namespace
ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
unsigned MaxVGPRs)
: UnifiedRF(ST.hasGFX90AInsts()) {
unsigned NumArchVGPRs = RP.getArchVGPRNum();
unsigned NumAGPRs = RP.getAGPRNum();
HasAGPRs = NumAGPRs != 0;
if (!UnifiedRF) {
// Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
// independently.
if (NumArchVGPRs > MaxVGPRs)
ArchVGPRs = NumArchVGPRs - MaxVGPRs;
if (NumAGPRs > MaxVGPRs)
AGPRs = NumAGPRs - MaxVGPRs;
return;
}
// Regardless of whether overall VGPR pressure is under the limit, we still
// have to check whether ArchVGPR pressure or AGPR pressure alone exceeds the
// number of addressable registers in each category.
const unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
if (NumArchVGPRs > MaxArchVGPRs) {
ArchVGPRs = NumArchVGPRs - MaxArchVGPRs;
NumArchVGPRs = MaxArchVGPRs;
}
if (NumAGPRs > MaxArchVGPRs) {
AGPRs = NumAGPRs - MaxArchVGPRs;
NumAGPRs = MaxArchVGPRs;
}
// Check overall VGPR usage against the limit; any excess above addressable
// register limits has already been accounted for.
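// Worked example (illustrative numbers; this assumes the unified count
// rounds ArchVGPR usage up to the allocation granule, here taken to be 4):
// with 50 ArchVGPRs and 30 AGPRs against MaxVGPRs = 64, unified usage is
// alignTo(50, 4) + 30 = 82, so VGPRs = 82 - 64 = 18 and
// ArchVGPRsToAlignment = 50 - alignDown(50, 4) = 2, i.e., saving 2 more
// ArchVGPRs would free a whole allocation granule.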
const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs);
if (NumVGPRs > MaxVGPRs) {
VGPRs = NumVGPRs - MaxVGPRs;
ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, Granule);
if (!ArchVGPRsToAlignment)
ArchVGPRsToAlignment = Granule;
}
}
bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
bool Progress = saveRegs(ArchVGPRs, NumRegs);
if (!NumRegs)
return Progress;
if (!UnifiedRF) {
if (UseArchVGPRForAGPRSpill)
Progress |= saveRegs(AGPRs, NumRegs);
} else if (HasAGPRs && (VGPRs || (UseArchVGPRForAGPRSpill && AGPRs))) {
// There is progress as long as there are VGPRs left to save, even if the
// save induced by this particular call does not cross an ArchVGPR alignment
// barrier.
Progress = true;
// ArchVGPRs can only be allocated as a multiple of a granule in unified RF.
unsigned NumSavedRegs = 0;
// Count the number of whole ArchVGPR allocation granules we can save.
const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
if (unsigned NumGranules = NumRegs / Granule; NumGranules) {
NumSavedRegs = NumGranules * Granule;
NumRegs -= NumSavedRegs;
}
// We may be able to save one more whole ArchVGPR allocation granule.
if (NumRegs >= ArchVGPRsToAlignment) {
NumSavedRegs += Granule;
ArchVGPRsToAlignment = Granule - (NumRegs - ArchVGPRsToAlignment);
} else {
ArchVGPRsToAlignment -= NumRegs;
}
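// Illustration (granule of 4 assumed): with ArchVGPRsToAlignment = 2 and
// NumRegs = 5, one whole granule (4 registers) is saved outright and the
// leftover register lowers the next alignment boundary to 1.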
// Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
// spilling and have some free ArchVGPR slots.
saveRegs(VGPRs, NumSavedRegs);
if (UseArchVGPRForAGPRSpill)
saveRegs(AGPRs, NumSavedRegs);
} else {
// No AGPR usage in the region, i.e., no allocation granule to worry about.
Progress |= saveRegs(VGPRs, NumRegs);
}
return Progress;
}
bool ExcessRP::saveAGPRs(unsigned NumRegs) {
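// Reduce excess dedicated-AGPR pressure first; only when there is none do
// the saved AGPRs count against excess unified VGPR pressure.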
return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
}
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
REMAT_DEBUG({
dbgs() << "Collecting rematerializable instructions in ";
MF.getFunction().printAsOperand(dbgs(), false);
dbgs() << '\n';
});
// Maps optimizable regions (i.e., regions at minimum and VGPR-limited
// occupancy, or regions with VGPR spilling) to a model of their excess RP.
DenseMap<unsigned, ExcessRP> OptRegions;
const Function &F = MF.getFunction();
std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
const unsigned MaxSGPRsNoSpill = ST.getMaxNumSGPRs(F);
const unsigned MaxVGPRsNoSpill = ST.getMaxNumVGPRs(F);
const unsigned MaxSGPRsIncOcc =
ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
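// The *NoSpill limits bound register usage before spilling becomes
// necessary; the *IncOcc limits bound usage at one wave per EU above the
// current minimum occupancy.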
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
auto ClearOptRegionsIf = [&](bool Cond) -> bool {
if (Cond) {
// We won't try to increase occupancy.
IncreaseOccupancy = false;
OptRegions.clear();
}
return Cond;
};
// Collect optimizable regions. If there is spilling in any region, we will
// just try to reduce ArchVGPR spilling. Otherwise, we will try to increase
// occupancy by one in the whole function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
GCNRegPressure &RP = DAG.Pressure[I];
// Check whether SGPR pressure prevents us from eliminating spilling.
unsigned NumSGPRs = RP.getSGPRNum();
if (NumSGPRs > MaxSGPRsNoSpill)
ClearOptRegionsIf(IncreaseOccupancy);
ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
if (Excess) {
ClearOptRegionsIf(IncreaseOccupancy);
} else if (IncreaseOccupancy) {
// Check whether SGPR pressure prevents us from increasing occupancy.
if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
if (DAG.MinOccupancy >= WavesPerEU.first)
return false;
continue;
}
if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
// We can only rematerialize ArchVGPRs at this point.
unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
if (DAG.MinOccupancy >= WavesPerEU.first)
return false;
continue;
}
}
}
if (Excess)
OptRegions.insert({I, Excess});
}
if (OptRegions.empty())
return false;
#ifndef NDEBUG
if (IncreaseOccupancy)
REMAT_DEBUG(dbgs() << "Occupancy minimal in regions:\n");
else
REMAT_DEBUG(dbgs() << "Spilling in regions:\n");
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
REMAT_DEBUG(dbgs() << " " << I << ": " << OptIt->getSecond() << '\n');
}
#endif
// When increasing occupancy, the target is one wave per EU above the current
// minimum; when reducing spilling, it is the minimum number of waves/EU
// determined by the subtarget.
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first;
// Accounts for a reduction in RP in an optimizable region. Returns whether we
// estimate that we have identified enough rematerialization opportunities to
// achieve our goal, and sets Progress to true when this particular reduction
// in pressure was helpful toward that goal.
auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
bool &Progress) -> bool {
ExcessRP &Excess = OptIt->getSecond();
// We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
// only when we are just trying to eliminate spilling to memory. At this
// point we err on the conservative side and do not increase
// register-to-register spilling for the sake of increasing occupancy.
Progress |=
Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask),
/*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
if (!Excess)
OptRegions.erase(OptIt->getFirst());
return OptRegions.empty();
};
// We need up-to-date live-out info to query live-out register masks in
// regions containing rematerializable instructions.
DAG.RegionLiveOuts.buildLiveRegMap();
// Cache set of registers that are going to be rematerialized.
DenseSet<unsigned> RematRegs;
// Identify rematerializable instructions in the function.
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
auto Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI) {
// The instruction must be trivially rematerializable.
MachineInstr &DefMI = *MI;
if (!isTriviallyReMaterializable(DefMI))
continue;
// We only support rematerializing virtual VGPRs with one definition.
Register Reg = DefMI.getOperand(0).getReg();
if (!Reg.isVirtual() || !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
!DAG.MRI.hasOneDef(Reg))
continue;
// We only care to rematerialize the instruction if it has a single
// non-debug user in a different region. The using MI may not belong to a
// region if it is a lone region terminator.
MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
if (!UseMI)
continue;
auto UseRegion = MIRegion.find(UseMI);
if (UseRegion != MIRegion.end() && UseRegion->second == I)
continue;
// Do not rematerialize an instruction if it uses or is used by an
// instruction that we have designated for rematerialization.
// FIXME: Allow for rematerialization chains: this requires 1. updating
// remat points to account for uses that are rematerialized, and 2. either
// rematerializing the candidates in careful ordering, or deferring the
// MBB RP walk until the entire chain has been rematerialized.
if (Rematerializations.contains(UseMI) ||
llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
return MO.isReg() && RematRegs.contains(MO.getReg());
}))
continue;
// Do not rematerialize an instruction if it uses registers that aren't
// available at its use. This ensures that we are not extending any live
// range while rematerializing.
SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
continue;
REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
RematInstruction &Remat =
Rematerializations.try_emplace(&DefMI, UseMI).first->second;
bool RematUseful = false;
if (auto It = OptRegions.find(I); It != OptRegions.end()) {
// Optimistically consider that moving the instruction out of its
// defining region will reduce RP in the latter; this assumes that
// maximum RP in the region is reached somewhere between the defining
// instruction and the end of the region.
REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
if (ReduceRPInRegion(It, Mask, RematUseful))
return true;
}
for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
// We are only collecting regions in which the register is a live-in
// (and may be live-through).
auto It = DAG.LiveIns[LIRegion].find(Reg);
if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
continue;
Remat.LiveInRegions.insert(LIRegion);
// Account for the reduction in RP due to the rematerialization in an
// optimizable region in which the defined register is a live-in. This
// is exact for live-through regions but optimistic in the using region,
// where RP is actually reduced only if maximum RP is reached somewhere
// between the beginning of the region and the rematerializable
// instruction's use.
if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful))
return true;
}
}
// If the instruction is not a live-in or live-out in any optimizable
// region, then there is no point in rematerializing it.
if (!RematUseful) {
Rematerializations.pop_back();
REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
} else {
RematRegs.insert(Reg);
}
}
}
if (IncreaseOccupancy) {
// We were trying to increase occupancy but failed; abort the stage.
REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
Rematerializations.clear();
return false;
}
REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
return !Rematerializations.empty();
}
void PreRARematStage::rematerialize() {
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
// Collect regions whose RP changes in an unpredictable way; we will have to
// fully recompute their RP after all rematerializations.
DenseSet<unsigned> RecomputeRP;
// Rematerialize all instructions.
for (auto &[DefMI, Remat] : Rematerializations) {
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
unsigned SubReg = DefMI->getOperand(0).getSubReg();
unsigned DefRegion = MIRegion.at(DefMI);
// Rematerialize DefMI to its use block.
TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
*DAG.TRI);
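// TargetInstrInfo::reMaterialize inserts the clone immediately before
// InsertPos, so the rematerialized MI is the previous instruction.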
Remat.RematMI = &*std::prev(InsertPos);
Remat.RematMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
// Update region boundaries in the regions we sank from (remove the defining
// MI) and into (insert the MI rematerialized in the use block). Only then
// can we erase the original MI.
DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
auto UseRegion = MIRegion.find(Remat.UseMI);
if (UseRegion != MIRegion.end()) {
DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
Remat.RematMI);
}
// Remove the original MI from liveness maps before erasing it, so we never
// touch a deleted instruction.
DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
DefMI->eraseFromParent();
// Collect all regions impacted by the rematerialization and update their
// live-in/RP information.
for (unsigned I : Remat.LiveInRegions) {
ImpactedRegions.insert({I, DAG.Pressure[I]});
GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
#ifdef EXPENSIVE_CHECKS
// All uses are known to be available / live at the remat point. Thus, the
// uses should already be live-in to the region.
for (MachineOperand &MO : DefMI->operands()) {
if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
continue;
Register UseReg = MO.getReg();
if (!UseReg.isVirtual())
continue;
LiveInterval &LI = DAG.LIS->getInterval(UseReg);
LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
if (LI.hasSubRanges() && MO.getSubReg())
LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
// If this register has lanes not covered by the LiveIns, be sure they
// do not map to any subrange. ref:
// machine-scheduler-sink-trivial-remats.mir::omitted_subrange
if (UncoveredLanes.any()) {
assert(LI.hasSubRanges());
for (LiveInterval::SubRange &SR : LI.subranges())
assert((SR.LaneMask & UncoveredLanes).none());
}
}
#endif
// The register is no longer a live-in in any of these regions: the
// rematerialized def now lives in the region containing the single use. In
// live-through regions, maximum register pressure decreases predictably, so
// we can update it directly. In the using region, maximum RP may or may not
// decrease, so we mark it for recomputation after all rematerializations
// have taken place.
LaneBitmask PrevMask = RegionLiveIns[Reg];
RegionLiveIns.erase(Reg);
RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
else
RecomputeRP.insert(I);
}
// RP in the region from which the instruction was rematerialized may or may
// not decrease.
ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
RecomputeRP.insert(DefRegion);
// Recompute live interval to reflect the register's rematerialization.
Register RematReg = Remat.RematMI->getOperand(0).getReg();
DAG.LIS->removeInterval(RematReg);
DAG.LIS->createAndComputeVirtRegInterval(RematReg);
}
// All regions impacted by at least one rematerialization must be rescheduled.
// Maximum pressure must also be recomputed for all regions where it changed
// unpredictably and checked against the target occupancy.
AchievedOcc = TargetOcc;
for (auto &[I, OriginalRP] : ImpactedRegions) {
bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
DAG.RescheduleRegions[I] = !IsEmptyRegion;
if (!RecomputeRP.contains(I))
continue;
GCNRegPressure RP;
if (IsEmptyRegion) {
RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
} else {
GCNDownwardRPTracker RPT(*DAG.LIS);
auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
DAG.Regions[I].second);
if (NonDbgMI == DAG.Regions[I].second) {
// Region is non-empty but contains only debug instructions.
RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
} else {
RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
RPT.advance(DAG.Regions[I].second);
RP = RPT.moveMaxPressure();
}
}
DAG.Pressure[I] = RP;
AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(ST));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}
// Copied from MachineLICM
bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
if (!DAG.TII->isTriviallyReMaterializable(MI))
return false;
for (const MachineOperand &MO : MI.all_uses()) {
// We can't remat physreg uses, unless the register is constant or the use
// is ignorable (e.g., the implicit exec use on VALU instructions).
if (MO.getReg().isPhysical()) {
if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
continue;
return false;
}
}
return true;
}
void PreRARematStage::finalizeGCNSchedStage() {
// We consider that reducing spilling is always beneficial, so we never roll
// back rematerializations in such cases. It's also possible that rescheduling
// lowers occupancy below the one achieved just through remats, in which case
// we do not want to roll back either (the rescheduling was already reverted
// in PreRARematStage::shouldRevertScheduling in such cases).
unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
if (!IncreaseOccupancy || MaxOcc >= TargetOcc)
return;
REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
// Roll back the rematerializations.
for (const auto &[DefMI, Remat] : Rematerializations) {
MachineInstr &RematMI = *Remat.RematMI;
unsigned DefRegion = MIRegion.at(DefMI);
MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
MachineBasicBlock *MBB = RegionBB[DefRegion];
Register Reg = RematMI.getOperand(0).getReg();
unsigned SubReg = RematMI.getOperand(0).getSubReg();
// Re-rematerialize MI at the end of its original region. Note that it may
// not be rematerialized in exactly its original position within the region,
// but this should not matter much.
TII->reMaterialize(*MBB, InsertPos, Reg, SubReg, RematMI, *DAG.TRI);
MachineInstr *NewMI = &*std::prev(InsertPos);
NewMI->getOperand(0).setSubReg(SubReg);
DAG.LIS->InsertMachineInstrInMaps(*NewMI);
auto UseRegion = MIRegion.find(Remat.UseMI);
if (UseRegion != MIRegion.end()) {
DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
nullptr);
}
DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
// Remove the rematerialized MI from liveness maps, then erase it.
DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
RematMI.eraseFromParent();
// Recompute the live interval for the re-rematerialized register.
DAG.LIS->removeInterval(Reg);
DAG.LIS->createAndComputeVirtRegInterval(Reg);
// Re-add the register as a live-in in all regions it used to be one in.
for (unsigned LIRegion : Remat.LiveInRegions)
DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
}
// Reset RP in all impacted regions.
for (auto &[I, OriginalRP] : ImpactedRegions)
DAG.Pressure[I] = OriginalRP;
GCNSchedStage::finalizeGCNSchedStage();
}
void GCNScheduleDAGMILive::updateRegionBoundaries(
RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
MachineInstr *NewMI) {
assert((!NewMI || NewMI != RegionBounds.second) &&
"cannot remove at region end");
if (RegionBounds.first == RegionBounds.second) {
assert(NewMI && "cannot remove from an empty region");
RegionBounds.first = NewMI;
return;
}
// We only care about modifications at the beginning of a non-empty region
// since the upper region boundary is exclusive.
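// Example: for a region [A, B), removing A (NewMI == nullptr) advances the
// lower bound to A's successor, while inserting a new instruction N
// immediately before A (MI == A, NewMI == N) moves the lower bound to N.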
if (MI != RegionBounds.first)
return;
if (!NewMI)
RegionBounds.first = std::next(MI); // Removal
else
RegionBounds.first = NewMI; // Insertion
}
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
return SII->isIGLPMutationOnly(MI->getOpcode());
});
}
GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
bool RemoveKillFlags)
: ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
void GCNPostScheduleDAGMILive::schedule() {
HasIGLPInstrs = hasIGLPInstrs(this);
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
}
ScheduleDAGMI::schedule();
}
void GCNPostScheduleDAGMILive::finalizeSchedule() {
if (HasIGLPInstrs)
SavedMutations.swap(Mutations);
ScheduleDAGMI::finalizeSchedule();
}