blob: d83f8fee2b2feebbbadb9062a755c47b11a5b3e4 [file] [log] [blame] [edit]
//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Coexecution-focused scheduling strategy for AMDGPU.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUCoExecSchedStrategy.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
using namespace llvm::AMDGPU;
#define DEBUG_TYPE "machine-scheduler"
namespace {
// Used to disable post-RA scheduling with function level granularity.
// schedule() is intentionally a no-op, so the instruction order produced by
// earlier passes is left untouched while the pass pipeline stays intact.
class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
public:
  explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
      : ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}
  // Do nothing.
  void schedule() override {}
};
} // namespace
// Return the zone's single available SU, or null when the pick is not forced.
// Note pickOnlyChoice() has side effects: it releases pending instructions
// and re-checks hazards, so it is invoked before inspecting Pending.
static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
  SUnit *Choice = Zone.pickOnlyChoice();
  return Zone.Pending.empty() ? Choice : nullptr;
}
// Map an instruction onto the coarse hardware-unit "flavor" used by the
// coexec heuristics. Checks are ordered from most specific to least specific
// (e.g. TRANS must precede the generic VALU test, and DS/FLAT precede SALU).
//
// NOTE(review): this classifier never returns MultiCycleVALU, yet
// CandidateHeuristics::initialize() marks that flavor as a coexec-window
// producer -- confirm whether multi-cycle VALU detection is still pending.
InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
                                               const SIInstrInfo &SII) {
  if (MI.isDebugInstr())
    return InstructionFlavor::Other;
  unsigned Opc = MI.getOpcode();
  // Check for specific opcodes first.
  if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
      Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
      Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
    return InstructionFlavor::Fence;
  if (SII.isLDSDMA(MI))
    return InstructionFlavor::DMA;
  if (SII.isMFMAorWMMA(MI))
    return InstructionFlavor::WMMA;
  if (SII.isTRANS(MI))
    return InstructionFlavor::TRANS;
  if (SII.isVALU(MI))
    return InstructionFlavor::SingleCycleVALU;
  if (SII.isDS(MI))
    return InstructionFlavor::DS;
  if (SII.isFLAT(MI) || SII.isFLATGlobal(MI) || SII.isFLATScratch(MI))
    return InstructionFlavor::VMEM;
  if (SII.isSALU(MI))
    return InstructionFlavor::SALU;
  return InstructionFlavor::Other;
}
// Return the next SU this unit wants enabled: preferably a priority SU that
// is not yet top-ready; with LookDeep, fall back to the shallowest
// unscheduled, not-yet-ready SU tracked by this unit. Null when nothing
// qualifies.
SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) const {
  for (SUnit *Prio : PrioritySUs)
    if (!Prio->isTopReady())
      return Prio;
  if (!LookDeep)
    return nullptr;
  SUnit *Best = nullptr;
  unsigned BestDepth = std::numeric_limits<unsigned>::max();
  for (SUnit *SU : AllSUs) {
    if (SU->isScheduled || SU->isTopReady())
      continue;
    if (SU->getDepth() < BestDepth) {
      BestDepth = SU->getDepth();
      Best = SU;
    }
  }
  return Best;
}
// Register SU with this hardware unit, accumulating its blocking cycles and
// maintaining PrioritySUs as the set of tracked SUs with minimum depth.
void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
  // Each SU may be inserted at most once. A single insert call with a
  // (void)-silenced result replaces the previous duplicated #ifndef NDEBUG
  // branches.
  bool Inserted = AllSUs.insert(SU);
  (void)Inserted;
  assert(Inserted && "SU inserted into hardware unit twice");
  TotalCycles += BlockingCycles;
  if (PrioritySUs.empty()) {
    PrioritySUs.insert(SU);
    return;
  }
  unsigned SUDepth = SU->getDepth();
  unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
  // Deeper than the current priority set: not prioritized.
  if (SUDepth > CurrDepth)
    return;
  // Same depth: joins the current priority set.
  if (SUDepth == CurrDepth) {
    PrioritySUs.insert(SU);
    return;
  }
  // SU is lower depth and should be prioritized.
  PrioritySUs.clear();
  PrioritySUs.insert(SU);
}
// Retire SU from this unit's bookkeeping and, if the priority set was
// emptied, rebuild it as the set of remaining SUs with minimum depth.
void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
  // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
  // we just clear the HWUI. However, we still have instructions which map to
  // this HWUI. Don't bother managing the state for these HWUI.
  if (TotalCycles == 0)
    return;
  AllSUs.remove(SU);
  PrioritySUs.remove(SU);
  // BlockingCycles comes from the same deterministic query used at insert
  // time, so the running total can never underflow.
  assert(TotalCycles >= BlockingCycles && "cycle accounting underflow");
  TotalCycles -= BlockingCycles;
  if (AllSUs.empty())
    return;
  if (PrioritySUs.empty()) {
    // Rebuild the priority set. The loop variable is named Other so it does
    // not shadow the SU parameter (which it previously did).
    for (SUnit *Other : AllSUs) {
      if (PrioritySUs.empty()) {
        PrioritySUs.insert(Other);
        continue;
      }
      unsigned OtherDepth = Other->getDepth();
      unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
      if (OtherDepth > CurrDepth)
        continue;
      if (OtherDepth == CurrDepth) {
        PrioritySUs.insert(Other);
        continue;
      }
      // Other is lower depth and should be prioritized.
      PrioritySUs.clear();
      PrioritySUs.insert(Other);
    }
  }
}
// Locate the hardware-unit entry for Flavor. A linear scan is required
// because HWUInfo may have been re-sorted by priority and can no longer be
// indexed directly by flavor. Returns null if no entry matches.
HardwareUnitInfo *
CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
  auto It = llvm::find_if(HWUInfo, [Flavor](const HardwareUnitInfo &HWUI) {
    return HWUI.getType() == Flavor;
  });
  return It == HWUInfo.end() ? nullptr : &*It;
}
// Blocking cost of SU's instruction: the longest ReleaseAtCycle across all
// processor resources written by its scheduling class.
unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
  assert(SchedModel && SchedModel->hasInstrSchedModel());
  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
  unsigned MaxRelease = 0;
  for (auto PI = SchedModel->getWriteProcResBegin(SC),
            PE = SchedModel->getWriteProcResEnd(SC);
       PI != PE; ++PI)
    MaxRelease = std::max(MaxRelease, (unsigned)PI->ReleaseAtCycle);
  return MaxRelease;
}
// Retire a just-scheduled SU from the pressure bookkeeping of the hardware
// unit its instruction flavor maps to.
void CandidateHeuristics::updateForScheduling(SUnit *SU) {
  const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
  HardwareUnitInfo *Unit = getHWUIFromFlavor(Flavor);
  assert(Unit && "every flavor must map to a hardware unit");
  Unit->markScheduled(SU, getHWUICyclesForInst(SU));
}
// Per-region setup: cache DAG/model/target-info pointers, build one
// HardwareUnitInfo per flavor, and seed per-unit pressure for the region.
void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
                                     const TargetSchedModel *TargetSchedModel,
                                     const TargetRegisterInfo *TRI) {
  DAG = SchedDAG;
  SchedModel = TargetSchedModel;
  assert(SchedModel && SchedModel->hasInstrSchedModel());
  SRI = static_cast<const SIRegisterInfo *>(TRI);
  SII = static_cast<const SIInstrInfo *>(DAG->TII);
  // HWUInfo is indexed by flavor here; sortHWUIResources() may reorder it
  // later, after which getHWUIFromFlavor() must be used instead.
  HWUInfo.resize((int)InstructionFlavor::NUM_FLAVORS);
  for (unsigned I = 0; I < HWUInfo.size(); I++) {
    HWUInfo[I].reset();
    HWUInfo[I].setType(I);
  }
  // Flavors that open coexecution windows other units can overlap with.
  HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
  HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
  HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
  collectHWUIPressure();
}
// Seed per-hardware-unit pressure from every SU in the region. This relies
// on HWUInfo still being flavor-indexed, so it must run before any call to
// sortHWUIResources().
void CandidateHeuristics::collectHWUIPressure() {
  if (!SchedModel || !SchedModel->hasInstrSchedModel())
    return;
  for (auto &SU : DAG->SUnits) {
    const InstructionFlavor Flavor = classifyFlavor(*SU.getInstr(), *SII);
    HWUInfo[(int)(Flavor)].insert(&SU, getHWUICyclesForInst(&SU));
  }
  LLVM_DEBUG(dumpRegionSummary());
}
// Debug dump of per-hardware-unit cycle/instruction pressure for the region.
void CandidateHeuristics::dumpRegionSummary() {
  // Guard against an empty region: DAG->begin() would otherwise be
  // dereferenced while equal to end().
  if (DAG->SUnits.empty())
    return;
  MachineBasicBlock *BB = DAG->begin()->getParent();
  dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
         << " (" << DAG->SUnits.size() << " SUs) ===\n";
  dbgs() << "\nHWUI Resource Pressure:\n";
  for (auto &HWUI : HWUInfo) {
    // Skip units with no work mapped to them.
    if (HWUI.getTotalCycles() == 0)
      continue;
    StringRef Name = getFlavorName(HWUI.getType());
    dbgs() << "  " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
           << HWUI.size() << " instrs\n";
  }
  dbgs() << "\n";
}
// Reorder HWUInfo so the most critical resources come first. After this
// call, HWUInfo can no longer be indexed by flavor.
void CandidateHeuristics::sortHWUIResources() {
  // Highest priority should be first.
  llvm::sort(HWUInfo, [](HardwareUnitInfo &A, HardwareUnitInfo &B) {
    // Prefer CoexecWindow producers
    if (A.producesCoexecWindow() != B.producesCoexecWindow())
      return A.producesCoexecWindow();
    // Prefer more demanded resources
    if (A.getTotalCycles() != B.getTotalCycles())
      return A.getTotalCycles() > B.getTotalCycles();
    // In cycle ties the comparator orders the resource with FEWER
    // instructions first (higher per-instruction cost for the same total).
    // NOTE(review): the previous comment said "more instructions", which
    // contradicts `A.size() < B.size()` -- confirm which is intended.
    if (A.size() != B.size())
      return A.size() < B.size();
    // Default to Flavor order
    return (unsigned)A.getType() < (unsigned)B.getType();
  });
}
// Prefer the candidate whose scheduling enables (is a predecessor of) the
// next target SU of the highest-priority critical resource. Returns true
// when a preference was recorded in Cand/TryCand's Reason fields.
bool CandidateHeuristics::tryCriticalResourceDependency(
    GenericSchedulerBase::SchedCandidate &TryCand,
    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
  // True when the resource still has a target SU to enable, i.e. it is
  // currently critical.
  auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) {
    const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
    auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
    auto TryCandFlavor = classifyFlavor(*TryCand.SU->getInstr(), *SII);
    bool LookDeep = (CandFlavor == InstructionFlavor::DS ||
                     TryCandFlavor == InstructionFlavor::DS) &&
                    HWUI.getType() == InstructionFlavor::WMMA;
    // If we do not have a TargetSU for this resource, then it is not critical.
    return HWUI.getNextTargetSU(LookDeep) != nullptr;
  };
  // Try to establish a preference based on which candidate enables the
  // resource's target SU. Returns true iff a Reason was recorded.
  auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
    const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
    auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
    // We want to ensure our DS order matches WMMA order.
    bool LookDeep = CandFlavor == InstructionFlavor::DS &&
                    HWUI.getType() == InstructionFlavor::WMMA;
    SUnit *TargetSU = HWUI.getNextTargetSU(LookDeep);
    // HasPrioritySU may have found a target under a wider LookDeep condition
    // (it also considers TryCand's flavor), so the target can legitimately
    // be null here. Bail out rather than passing nullptr to IsReachable().
    if (!TargetSU)
      return false;
    bool CandEnables =
        TargetSU != Cand.SU && DAG->IsReachable(TargetSU, Cand.SU);
    bool TryCandEnables =
        TargetSU != TryCand.SU && DAG->IsReachable(TargetSU, TryCand.SU);
    if (!CandEnables && !TryCandEnables)
      return false;
    if (CandEnables && !TryCandEnables) {
      if (Cand.Reason > GenericSchedulerBase::RegCritical)
        Cand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }
    if (!CandEnables && TryCandEnables) {
      TryCand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }
    // Both enable, prefer the critical path (greater height).
    unsigned CandHeight = Cand.SU->getHeight();
    unsigned TryCandHeight = TryCand.SU->getHeight();
    if (CandHeight > TryCandHeight) {
      if (Cand.Reason > GenericSchedulerBase::RegCritical)
        Cand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }
    if (CandHeight < TryCandHeight) {
      TryCand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }
    // Same critical path, just prefer original candidate.
    if (Cand.Reason > GenericSchedulerBase::RegCritical)
      Cand.Reason = GenericSchedulerBase::RegCritical;
    return true;
  };
  // HWUInfo is pre-sorted by priority; take the first critical resource for
  // which a preference can be established.
  for (unsigned I = 0; I < HWUInfo.size(); I++) {
    // Skip resources that currently have no target SU to enable.
    if (!HasPrioritySU(I))
      continue;
    if (TryEnablesResource(I))
      return true;
    // Neither candidate enables this resource; try the next one.
  }
  return false;
}
// Prefer the candidate that uses the highest-priority critical resource.
// When both candidates use the same critical resource, fall back first to
// dependency-based enablement (for DS) and then to the unit's own priority
// set. Returns true when a preference was recorded.
bool CandidateHeuristics::tryCriticalResource(
    GenericSchedulerBase::SchedCandidate &TryCand,
    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
  // HWUInfo is assumed pre-sorted by priority (see sortHWUIResources()).
  for (unsigned I = 0; I < HWUInfo.size(); I++) {
    const HardwareUnitInfo &HWUI = HWUInfo[I];
    bool CandUsesCrit = HWUI.contains(Cand.SU);
    bool TryCandUsesCrit = HWUI.contains(TryCand.SU);
    if (!CandUsesCrit && !TryCandUsesCrit)
      continue;
    // Exactly one candidate uses this resource: prefer it.
    if (CandUsesCrit != TryCandUsesCrit) {
      if (CandUsesCrit) {
        if (Cand.Reason > GenericSchedulerBase::RegCritical)
          Cand.Reason = GenericSchedulerBase::RegCritical;
        return true;
      }
      TryCand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }
    // Otherwise, both use the critical resource
    // For longer latency InstructionFlavors, we should prioritize first by
    // their enablement of critical resources
    if (HWUI.getType() == InstructionFlavor::DS) {
      if (tryCriticalResourceDependency(TryCand, Cand, Zone))
        return true;
    }
    // Prioritize based on HWUI priorities.
    SUnit *Match = HWUI.getHigherPriority(Cand.SU, TryCand.SU);
    if (Match) {
      if (Match == Cand.SU) {
        if (Cand.Reason > GenericSchedulerBase::RegCritical)
          Cand.Reason = GenericSchedulerBase::RegCritical;
        return true;
      }
      TryCand.Reason = GenericSchedulerBase::RegCritical;
      return true;
    }
  }
  return false;
}
// Configure the coexec strategy: run the ILP initial schedule followed by
// pre-RA rematerialization, with the more accurate GCN pressure trackers.
AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
  // Use more accurate GCN pressure trackers.
  UseGCNTrackers = true;
}
// Per-region policy setup: defer to the base strategy, then force top-down
// scheduling, which is the only direction the coexec heuristics support.
void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
                                           MachineBasicBlock::iterator End,
                                           unsigned NumRegionInstrs) {
  GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
  // Reject an explicit user request for bottom-up/bidirectional scheduling.
  assert((PreRADirection == MISched::Unspecified ||
          PreRADirection == MISched::TopDown) &&
         "coexec scheduler only supports top-down scheduling");
  RegionPolicy.OnlyTopDown = true;
  RegionPolicy.OnlyBottomUp = false;
}
// Per-DAG setup: force top-down before the base class initializes its
// boundaries, then build the region's hardware-unit heuristics state.
void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
  // Coexecution scheduling strategy is only done top-down to support new
  // resource balancing heuristics.
  RegionPolicy.OnlyTopDown = true;
  RegionPolicy.OnlyBottomUp = false;
  GCNSchedStrategy::initialize(DAG);
  Heurs.initialize(DAG, SchedModel, TRI);
}
// Update the coexec heuristics' bookkeeping for SU before the base strategy
// performs its own scheduling bookkeeping.
void AMDGPUCoExecSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  Heurs.updateForScheduling(SU);
  GCNSchedStrategy::schedNode(SU, IsTopNode);
}
// Top-down-only node selection: take the boundary's forced choice if there
// is one, otherwise pick the best candidate from the Available and Pending
// queues. If a pending SU was chosen, advance the cycle past its ready cycle
// and any hazards before committing it.
SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
  assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
         "coexec scheduler only supports top-down scheduling");
  if (DAG->top() == DAG->bottom()) {
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  bool PickedPending = false;
  SUnit *SU = nullptr;
#ifndef NDEBUG
  SchedCandidate *PickedCand = nullptr;
#endif
  // Loop in case the pick was already scheduled (stale queue entry).
  do {
    PickedPending = false;
    SU = pickOnlyChoice(Top);
    if (!SU) {
      CandPolicy NoPolicy;
      TopCand.reset(NoPolicy);
      pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
                        PickedPending, /*IsBottomUp=*/false);
      assert(TopCand.Reason != NoCand && "failed to find a candidate");
      SU = TopCand.SU;
#ifndef NDEBUG
      PickedCand = &TopCand;
#endif
    }
    IsTopNode = true;
  } while (SU->isScheduled);
  LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand));
  if (PickedPending) {
    // The chosen SU is not yet ready: bump to its ready cycle, then step
    // one cycle at a time until its hazards clear.
    unsigned ReadyCycle = SU->TopReadyCycle;
    unsigned CurrentCycle = Top.getCurrCycle();
    if (ReadyCycle > CurrentCycle)
      Top.bumpCycle(ReadyCycle);
    // checkHazard() does not expose the exact cycle where the hazard clears.
    while (Top.checkHazard(SU))
      Top.bumpCycle(Top.getCurrCycle() + 1);
    Top.releasePending();
  }
  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);
  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
  return SU;
}
// Evaluate every SU in the zone's Available queue and then its Pending
// queue, keeping the best candidate in Cand. PickedPending is set when the
// winner came from Pending (the caller must then advance cycles).
void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
    SchedBoundary &Zone, const CandPolicy &ZonePolicy,
    const RegPressureTracker &RPTracker, SchedCandidate &Cand,
    bool &PickedPending, bool IsBottomUp) {
  assert(Zone.isTop() && "coexec scheduler only supports top boundary");
  assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  PickedPending = false;
  if (DAG->isTrackingPressure()) {
    // Read SGPR/VGPR pressure from the generic tracker or from the more
    // accurate GCN downward tracker, depending on configuration.
    if (!useGCNTrackers()) {
      SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
      VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    } else {
      SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
      VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
    }
  }
  auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
    for (SUnit *SU : Q) {
      SchedCandidate TryCand(ZonePolicy);
      initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                    VGPRPressure, IsBottomUp);
      SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
      tryCandidateCoexec(Cand, TryCand, ZoneArg);
      if (TryCand.Reason != NoCand) {
        // TryCand beat the incumbent; record whether it came from Pending.
        if (TryCand.ResDelta == SchedResourceDelta())
          TryCand.initResourceDelta(Zone.DAG, SchedModel);
        LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
        PickedPending = FromPending;
        Cand.setBest(TryCand);
      } else {
        LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
      }
    }
  };
  LLVM_DEBUG(dbgs() << "Available Q:\n");
  EvaluateQueue(Zone.Available, /*FromPending=*/false);
  LLVM_DEBUG(dbgs() << "Pending Q:\n");
  EvaluateQueue(Zone.Pending, /*FromPending=*/true);
}
#ifndef NDEBUG
// Debug-only summary of a pick: cycle, instruction, flavor, and the reason
// the candidate won (coexec-specific reason takes precedence over the
// generic one). Clears LastAMDGPUReason afterwards.
void AMDGPUCoExecSchedStrategy::dumpPickSummary(SUnit *SU, bool IsTopNode,
                                                SchedCandidate &Cand) {
  const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
  unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle();
  dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n";
  const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
  dbgs() << "Picked: SU(" << SU->NodeNum << ") ";
  SU->getInstr()->print(dbgs(), /*IsStandalone=*/true, /*SkipOpers=*/false,
                        /*SkipDebugLoc=*/true);
  dbgs() << "  [" << getFlavorName(Flavor) << "]\n";
  dbgs() << "  Reason: ";
  if (LastAMDGPUReason != AMDGPUSchedReason::None)
    dbgs() << getReasonName(LastAMDGPUReason);
  else if (Cand.Reason != NoCand)
    dbgs() << GenericSchedulerBase::getReasonStr(Cand.Reason);
  else
    dbgs() << "Unknown";
  dbgs() << "\n\n";
  // Reset so the next pick does not report a stale coexec reason.
  LastAMDGPUReason = AMDGPUSchedReason::None;
}
#endif
// Coexec candidate comparison. Heuristics are applied in strict priority
// order; the first that distinguishes the candidates decides. Returns true
// when TryCand should replace Cand (TryCand.Reason set), false otherwise.
bool AMDGPUCoExecSchedStrategy::tryCandidateCoexec(SchedCandidate &Cand,
                                                   SchedCandidate &TryCand,
                                                   SchedBoundary *Zone) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = FirstValid;
    return true;
  }
  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;
  // Avoid exceeding the target's limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;
  // We only compare a subset of features when comparing nodes between
  // Top and Bottom boundary. Some properties are simply incomparable, in many
  // other instances we should only override the other boundary if something
  // is a clear good pick on one boundary. Skip heuristics that are more
  // "tie-breaking" in nature.
  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Compare candidates by the stall they would introduce if
    // scheduled in the current cycle.
    if (tryEffectiveStall(Cand, TryCand, *Zone))
      return TryCand.Reason != NoCand;
    // Coexec-specific heuristics: re-rank resources, then balance against
    // critical-resource usage and critical-resource enablement.
    Heurs.sortHWUIResources();
    if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
      LastAMDGPUReason = AMDGPUSchedReason::CritResourceBalance;
      return TryCand.Reason != NoCand;
    }
    if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
      LastAMDGPUReason = AMDGPUSchedReason::CritResourceDep;
      return TryCand.Reason != NoCand;
    }
  }
  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
  unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
  bool CandIsClusterSucc =
      isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
  bool TryCandIsClusterSucc =
      isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
  if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
                 Cluster))
    return TryCand.Reason != NoCand;
  if (SameBoundary) {
    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }
  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;
  if (SameBoundary) {
    // Avoid serializing long latency dependence chains.
    // For acyclic path limited loops, latency was already checked above.
    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;
    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}
// Fold ready-cycle, structural, and latency stalls into a single effective
// cost per candidate, then prefer the candidate with the smaller cost.
// Returns true iff the comparison recorded a preference.
bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
                                                  SchedCandidate &TryCand,
                                                  SchedBoundary &Zone) const {
  // Treat structural and latency stalls as a single scheduling cost for the
  // current cycle.
  struct StallCosts {
    unsigned Ready = 0;
    unsigned Structural = 0;
    unsigned Latency = 0;
    unsigned Effective = 0;
  };
  const unsigned CurrCycle = Zone.getCurrCycle();
  auto ComputeCosts = [&](SUnit *SU) {
    StallCosts C;
    const unsigned ReadyCycle =
        Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
    C.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
    C.Structural = getStructuralStallCycles(Zone, SU);
    C.Latency = Zone.getLatencyStallCycles(SU);
    // The stalls overlap in time, so the effective cost is their maximum.
    C.Effective = std::max({C.Ready, C.Structural, C.Latency});
    return C;
  };
  const StallCosts TryCosts = ComputeCosts(TryCand.SU);
  const StallCosts CandCosts = ComputeCosts(Cand.SU);
  LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
    dbgs() << "Effective stalls: try=" << TryCosts.Effective
           << " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
           << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
           << " (ready=" << CandCosts.Ready
           << ", struct=" << CandCosts.Structural
           << ", lat=" << CandCosts.Latency << ")\n";
  });
  return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
}
// Factory for the coexec pre-RA scheduler: a GCN live-interval scheduling
// DAG driven by the coexec strategy.
ScheduleDAGInstrs *
llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
  LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
                    << C->MF->getName() << '\n');
  auto Strategy = std::make_unique<AMDGPUCoExecSchedStrategy>(C);
  return new GCNScheduleDAGMILive(C, std::move(Strategy));
}
// Factory for the no-op post-RA scheduler, used to disable post-RA
// scheduling for this function while keeping the pass pipeline intact.
ScheduleDAGInstrs *
llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
  LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
                    << C->MF->getName() << '\n');
  return new GCNNoopPostScheduleDAG(C);
}