| //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // \file This file defines a set of schedule DAG mutations that can be used to |
| // override default scheduler behavior to enforce specific scheduling patterns. |
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimal
// instruction scheduling is important enough to warrant manual intervention.
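//
// As an illustrative sketch (not tied to any specific kernel), a pipeline of
// two DS reads followed by one MFMA, repeated, can be requested from source
// via the sched.group.barrier intrinsic, whose mask operand uses the
// SchedGroupMask bits defined below:
//
//   ; SchedGroup of up to 2 DS_READ ops (mask 1 << 8), SyncID 0
//   call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
//   ; SchedGroup of 1 MFMA op (mask 1 << 3), SyncID 0
//   call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)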
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "AMDGPUIGroupLP.h" |
| #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| #include "SIInstrInfo.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "llvm/ADT/BitmaskEnum.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/CodeGen/MachineScheduler.h" |
| #include "llvm/CodeGen/TargetOpcodes.h" |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "igrouplp" |
| |
| namespace { |
| |
| static cl::opt<bool> EnableExactSolver( |
| "amdgpu-igrouplp-exact-solver", cl::Hidden, |
| cl::desc("Whether to use the exponential time solver to fit " |
| "the instructions to the pipeline as closely as " |
| "possible."), |
| cl::init(false)); |
| |
| static cl::opt<unsigned> CutoffForExact( |
| "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden, |
| cl::desc("The maximum number of scheduling group conflicts " |
| "which we attempt to solve with the exponential time " |
| "exact solver. Problem sizes greater than this will" |
| "be solved by the less accurate greedy algorithm. Selecting " |
| "solver by size is superseded by manually selecting " |
| "the solver (e.g. by amdgpu-igrouplp-exact-solver")); |
| |
| static cl::opt<uint64_t> MaxBranchesExplored( |
| "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden, |
| cl::desc("The amount of branches that we are willing to explore with" |
| "the exact algorithm before giving up.")); |
| |
| static cl::opt<bool> UseCostHeur( |
| "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden, |
| cl::desc("Whether to use the cost heuristic to make choices as we " |
| "traverse the search space using the exact solver. Defaulted " |
| "to on, and if turned off, we will use the node order -- " |
| "attempting to put the later nodes in the later sched groups. " |
| "Experimentally, results are mixed, so this should be set on a " |
| "case-by-case basis.")); |
| |
// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
| enum class SchedGroupMask { |
| NONE = 0u, |
| ALU = 1u << 0, |
| VALU = 1u << 1, |
| SALU = 1u << 2, |
| MFMA = 1u << 3, |
| VMEM = 1u << 4, |
| VMEM_READ = 1u << 5, |
| VMEM_WRITE = 1u << 6, |
| DS = 1u << 7, |
| DS_READ = 1u << 8, |
| DS_WRITE = 1u << 9, |
| TRANS = 1u << 10, |
| ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS | |
| DS_READ | DS_WRITE | TRANS, |
| LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) |
| }; |
| |
| class SchedGroup; |
| |
// The InstructionRule class is used to enact a filter which determines whether
// or not an SU maps to a given SchedGroup. It contains complementary data
// structures (e.g. Cache) to help those filters.
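// For a concrete instance, see IsPipeAdd below, which overrides apply() to
// accept only V_ADD_F32 SUs.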
| class InstructionRule { |
| protected: |
| const SIInstrInfo *TII; |
| unsigned SGID; |
| // A cache made available to the Filter to store SUnits for subsequent |
| // invocations of the Filter |
| std::optional<SmallVector<SUnit *, 4>> Cache; |
| |
| public: |
  virtual bool
  apply(const SUnit *, const ArrayRef<SUnit *>,
        SmallVectorImpl<SchedGroup> &) {
    return true;
  }
| |
| InstructionRule(const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : TII(TII), SGID(SGID) { |
| if (NeedsCache) { |
| Cache = SmallVector<SUnit *, 4>(); |
| } |
| } |
| |
| virtual ~InstructionRule() = default; |
| }; |
| |
| using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>; |
| |
// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
| class SchedGroup { |
| private: |
| // Mask that defines which instruction types can be classified into this |
| // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER |
| // and SCHED_GROUP_BARRIER. |
| SchedGroupMask SGMask; |
| |
| // Maximum number of SUnits that can be added to this group. |
| std::optional<unsigned> MaxSize; |
| |
| // SchedGroups will only synchronize with other SchedGroups that have the same |
| // SyncID. |
| int SyncID = 0; |
| |
| // SGID is used to map instructions to candidate SchedGroups |
| unsigned SGID; |
| |
| // The different rules each instruction in this SchedGroup must conform to |
| SmallVector<std::shared_ptr<InstructionRule>, 4> Rules; |
| |
| // Count of the number of created SchedGroups, used to initialize SGID. |
| static unsigned NumSchedGroups; |
| |
  // Try to add an edge from SU A to SU B.
| bool tryAddEdge(SUnit *A, SUnit *B); |
| |
| // Use SGMask to determine whether we can classify MI as a member of this |
| // SchedGroup object. |
| bool canAddMI(const MachineInstr &MI) const; |
| |
| public: |
| // Collection of SUnits that are classified as members of this group. |
| SmallVector<SUnit *, 32> Collection; |
| |
| ScheduleDAGInstrs *DAG; |
| const SIInstrInfo *TII; |
| |
| // Returns true if SU can be added to this SchedGroup. |
| bool canAddSU(SUnit &SU) const; |
| |
  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
  // If MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
| void link(SUnit &SU, bool MakePred = false); |
| |
  // Add DAG dependencies and track which edges are added, as well as the
  // count of missed edges.
| int link(SUnit &SU, bool MakePred, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); |
| |
| // Add DAG dependencies from all SUnits in this SchedGroup and this SU. |
| // Use the predicate to determine whether SU should be a predecessor (P = |
| // true) or a successor (P = false) of this SchedGroup. |
| void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P); |
| |
| // Add DAG dependencies such that SUnits in this group shall be ordered |
| // before SUnits in OtherGroup. |
| void link(SchedGroup &OtherGroup); |
| |
| // Returns true if no more instructions may be added to this group. |
| bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; } |
| |
| // Append a constraint that SUs must meet in order to fit into this |
| // SchedGroup. Since many rules involve the relationship between a SchedGroup |
| // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve |
  // time (rather than SchedGroup init time).
| void addRule(std::shared_ptr<InstructionRule> NewRule) { |
| Rules.push_back(NewRule); |
| } |
| |
| // Returns true if the SU matches all rules |
| bool allowedByRules(const SUnit *SU, |
| SmallVectorImpl<SchedGroup> &SyncPipe) const { |
| for (auto &Rule : Rules) { |
| if (!Rule->apply(SU, Collection, SyncPipe)) |
| return false; |
| } |
| return true; |
| } |
| |
| // Add SU to the SchedGroup. |
| void add(SUnit &SU) { |
| LLVM_DEBUG(dbgs() << "For SchedGroup with mask " |
| << format_hex((int)SGMask, 10, true) << " adding " |
| << *SU.getInstr()); |
| Collection.push_back(&SU); |
| } |
| |
| // Remove last element in the SchedGroup |
| void pop() { Collection.pop_back(); } |
| |
| // Identify and add all relevant SUs from the DAG to this SchedGroup. |
| void initSchedGroup(); |
| |
| // Add instructions to the SchedGroup bottom up starting from RIter. |
| // PipelineInstrs is a set of instructions that should not be added to the |
| // SchedGroup even when the other conditions for adding it are satisfied. |
| // RIter will be added to the SchedGroup as well, and dependencies will be |
| // added so that RIter will always be scheduled at the end of the group. |
| void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, |
| SUnitsToCandidateSGsMap &SyncedInstrs); |
| |
| void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs); |
| |
| int getSyncID() { return SyncID; } |
| |
| int getSGID() { return SGID; } |
| |
| SchedGroupMask getMask() { return SGMask; } |
| |
| SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, |
| ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) { |
| SGID = NumSchedGroups++; |
| } |
| |
| SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID, |
| ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) { |
| SGID = NumSchedGroups++; |
| } |
| }; |
| |
| using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>; |
| using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>; |
| |
| // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline |
| // in non-trivial cases. For example, if the requested pipeline is |
| // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction |
// in the DAG, then we will have an instruction that cannot be trivially
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
// to find a good solution to the pipeline -- a greedy algorithm and an exact
// algorithm. The exact algorithm has an exponential time complexity and should
// only be used for small problems, or for medium-sized problems where an
// exact solution is highly desired.
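// As a worked illustration of the example above: a lone VMEM_READ SU has two
// candidate SchedGroups (the first and the last). Assigning it to the first
// orders it before the VALU and MFMA groups; assigning it to the last orders
// it after them. The solver picks whichever assignment misses the fewest
// pipeline edges.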
| class PipelineSolver { |
| [[maybe_unused]] ScheduleDAGMI *DAG; |
| |
| // Instructions that can be assigned to multiple SchedGroups |
| DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; |
| SmallVector<SUsToCandSGsVec, 4> PipelineInstrs; |
| DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; |
| // The current working pipeline |
| SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline; |
| // The pipeline that has the best solution found so far |
| SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline; |
| |
| // Whether or not we actually have any SyncedInstrs to try to solve. |
| bool NeedsSolver = false; |
| |
  // Compute an estimate of the size of the search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
| unsigned computeProblemSize(); |
| |
| // The cost penalty of not assigning a SU to a SchedGroup |
| int MissPenalty = 0; |
| |
| // Costs in terms of the number of edges we are unable to add |
| int BestCost = -1; |
| int CurrCost = 0; |
| |
| // Index pointing to the conflicting instruction that is currently being |
| // fitted |
| int CurrConflInstNo = 0; |
| // Index to the pipeline that is currently being fitted |
| int CurrSyncGroupIdx = 0; |
  // Index of the first non-trivial pipeline
| int BeginSyncGroupIdx = 0; |
| |
| // How many branches we have explored |
| uint64_t BranchesExplored = 0; |
| |
| // The direction in which we process the candidate SchedGroups per SU |
| bool IsBottomUp = true; |
| |
| // Update indices to fit next conflicting instruction |
| void advancePosition(); |
| // Recede indices to attempt to find better fit for previous conflicting |
| // instruction |
| void retreatPosition(); |
| |
| // The exponential time algorithm which finds the provably best fit |
| bool solveExact(); |
| // The polynomial time algorithm which attempts to find a good fit |
| bool solveGreedy(); |
| // Find the best SchedGroup for the current SU using the heuristic given all |
| // current information. One step in the greedy algorithm. Templated against |
| // the SchedGroup iterator (either reverse or forward). |
| template <typename T> |
| void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, |
| T E); |
| // Whether or not the current solution is optimal |
| bool checkOptimal(); |
  // Populate the ready list, prioritizing fewest missed edges first
| // Templated against the SchedGroup iterator (either reverse or forward). |
| template <typename T> |
| void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, |
| T E); |
| // Add edges corresponding to the SchedGroups as assigned by solver |
| void makePipeline(); |
| // Link the SchedGroups in the best found pipeline. |
  // Templated against the SchedGroup iterator (either reverse or forward).
| template <typename T> void linkSchedGroups(T I, T E); |
  // Add the edges from the SU to the other SchedGroups in the pipeline, and
  // return the number of edges missed.
| int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); |
| /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It |
| /// returns the cost (in terms of missed pipeline edges), and tracks the edges |
| /// added in \p AddedEdges |
| template <typename T> |
| int linkSUnit(SUnit *SU, int SGID, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); |
| /// Remove the edges passed via \p AddedEdges |
| void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); |
| // Convert the passed in maps to arrays for bidirectional iterators |
| void convertSyncMapsToArrays(); |
| |
| void reset(); |
| |
| public: |
  // Invoke the solver to map instructions to instruction groups. The
  // command-line options and a problem-size heuristic determine whether to
  // use the exact or the greedy algorithm.
| void solve(); |
| |
| PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| ScheduleDAGMI *DAG, bool IsBottomUp = true) |
| : DAG(DAG), SyncedInstrs(SyncedInstrs), |
| SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { |
| |
| for (auto &PipelineInstrs : SyncedInstrs) { |
| if (PipelineInstrs.second.size() > 0) { |
| NeedsSolver = true; |
| break; |
| } |
| } |
| |
| if (!NeedsSolver) |
| return; |
| |
| convertSyncMapsToArrays(); |
| |
| CurrPipeline = BestPipeline; |
| |
| while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() && |
| PipelineInstrs[BeginSyncGroupIdx].size() == 0) |
| ++BeginSyncGroupIdx; |
| |
| if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size()) |
| return; |
| } |
| }; |
| |
| void PipelineSolver::reset() { |
| |
| for (auto &SyncPipeline : CurrPipeline) { |
| for (auto &SG : SyncPipeline) { |
| SmallVector<SUnit *, 32> TempCollection = SG.Collection; |
| SG.Collection.clear(); |
| auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { |
| return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; |
| }); |
| if (SchedBarr != TempCollection.end()) |
| SG.Collection.push_back(*SchedBarr); |
| } |
| } |
| |
| CurrSyncGroupIdx = BeginSyncGroupIdx; |
| CurrConflInstNo = 0; |
| CurrCost = 0; |
| } |
| |
| void PipelineSolver::convertSyncMapsToArrays() { |
| for (auto &SyncPipe : SyncedSchedGroups) { |
| BestPipeline.insert(BestPipeline.begin(), SyncPipe.second); |
| } |
| |
| int PipelineIDx = SyncedInstrs.size() - 1; |
| PipelineInstrs.resize(SyncedInstrs.size()); |
| for (auto &SyncInstrMap : SyncedInstrs) { |
| for (auto &SUsToCandSGs : SyncInstrMap.second) { |
| if (PipelineInstrs[PipelineIDx].size() == 0) { |
| PipelineInstrs[PipelineIDx].push_back( |
| std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); |
| continue; |
| } |
| auto *SortPosition = PipelineInstrs[PipelineIDx].begin(); |
| // Insert them in sorted order -- this allows for good parsing order in |
| // the greedy algorithm |
| while (SortPosition != PipelineInstrs[PipelineIDx].end() && |
| SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum) |
| ++SortPosition; |
| PipelineInstrs[PipelineIDx].insert( |
| SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); |
| } |
| --PipelineIDx; |
| } |
| } |
| |
| template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { |
| for (; I != E; ++I) { |
| auto &GroupA = *I; |
| for (auto J = std::next(I); J != E; ++J) { |
| auto &GroupB = *J; |
| GroupA.link(GroupB); |
| } |
| } |
| } |
| |
| void PipelineSolver::makePipeline() { |
  // Preserve the order of barriers for subsequent SchedGroupBarrier mutations
| for (auto &SyncPipeline : BestPipeline) { |
| LLVM_DEBUG(dbgs() << "Printing SchedGroups\n"); |
| for (auto &SG : SyncPipeline) { |
| LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() |
| << " has: \n"); |
| SUnit *SGBarr = nullptr; |
| for (auto &SU : SG.Collection) { |
| if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) |
| SGBarr = SU; |
| LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); |
| } |
      // Command-line-requested IGroupLP pipelines don't have an SGBarr
| if (!SGBarr) |
| continue; |
| SG.link(*SGBarr, false); |
| } |
| } |
| |
| for (auto &SyncPipeline : BestPipeline) { |
| IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) |
| : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); |
| } |
| } |
| |
| template <typename T> |
| int PipelineSolver::linkSUnit( |
| SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, |
| T I, T E) { |
| bool MakePred = false; |
| int AddedCost = 0; |
| for (; I < E; ++I) { |
| if (I->getSGID() == SGID) { |
| MakePred = true; |
| continue; |
| } |
| auto Group = *I; |
| AddedCost += Group.link(*SU, MakePred, AddedEdges); |
| assert(AddedCost >= 0); |
| } |
| return AddedCost; |
| } |
| |
| int PipelineSolver::addEdges( |
| SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { |
| |
| // For IsBottomUp, the first SchedGroup in SyncPipeline contains the |
| // instructions that are the ultimate successors in the resultant mutation. |
| // Therefore, in such a configuration, the SchedGroups occurring before the |
| // candidate SGID are successors of the candidate SchedGroup, thus the current |
| // SU should be linked as a predecessor to SUs in those SchedGroups. The |
  // opposite is true if !IsBottomUp. IsBottomUp is used when there are
  // multiple SCHED_GROUP_BARRIERs, or when a user specifies IGLP_OPT
  // SchedGroups that are constructed bottom-up (in reverse).
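  // For example, in a bottom-up pipeline {SG0, SG1, SG2} (where SG0 holds the
  // ultimate successors), assigning SU to SG1 links SU as a successor of the
  // SUs in SG2 and as a predecessor of the SUs in SG0.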
| return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), |
| SyncPipeline.rend()) |
| : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), |
| SyncPipeline.end()); |
| } |
| |
| void PipelineSolver::removeEdges( |
| const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { |
| // Only remove the edges that we have added when testing |
| // the fit. |
| for (auto &PredSuccPair : EdgesToRemove) { |
| SUnit *Pred = PredSuccPair.first; |
| SUnit *Succ = PredSuccPair.second; |
| |
| auto *Match = llvm::find_if( |
| Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); |
| if (Match != Succ->Preds.end()) { |
| assert(Match->isArtificial()); |
| Succ->removePred(*Match); |
| } |
| } |
| } |
| |
| void PipelineSolver::advancePosition() { |
| ++CurrConflInstNo; |
| |
| if (static_cast<size_t>(CurrConflInstNo) >= |
| PipelineInstrs[CurrSyncGroupIdx].size()) { |
| CurrConflInstNo = 0; |
| ++CurrSyncGroupIdx; |
| // Advance to next non-trivial pipeline |
| while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && |
| PipelineInstrs[CurrSyncGroupIdx].size() == 0) |
| ++CurrSyncGroupIdx; |
| } |
| } |
| |
| void PipelineSolver::retreatPosition() { |
| assert(CurrConflInstNo >= 0); |
| assert(CurrSyncGroupIdx >= 0); |
| |
| if (CurrConflInstNo > 0) { |
| --CurrConflInstNo; |
| return; |
| } |
| |
| if (CurrConflInstNo == 0) { |
| // If we return to the starting position, we have explored |
| // the entire tree |
| if (CurrSyncGroupIdx == BeginSyncGroupIdx) |
| return; |
| |
| --CurrSyncGroupIdx; |
| // Go to previous non-trivial pipeline |
| while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) |
| --CurrSyncGroupIdx; |
| |
| CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; |
| } |
| } |
| |
| bool PipelineSolver::checkOptimal() { |
| if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { |
| if (BestCost == -1 || CurrCost < BestCost) { |
| BestPipeline = CurrPipeline; |
| BestCost = CurrCost; |
| LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n"); |
| } |
| assert(BestCost >= 0); |
| } |
| |
| bool DoneExploring = false; |
| if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) |
| DoneExploring = true; |
| |
| return (DoneExploring || BestCost == 0); |
| } |
| |
| template <typename T> |
| void PipelineSolver::populateReadyList( |
| SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { |
| SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
| auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; |
| assert(CurrSU.second.size() >= 1); |
| |
| for (; I != E; ++I) { |
| std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
| int CandSGID = *I; |
| SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { |
| return SG.getSGID() == CandSGID; |
| }); |
| assert(Match); |
| |
| if (UseCostHeur) { |
| if (Match->isFull()) { |
| ReadyList.push_back(std::pair(*I, MissPenalty)); |
| continue; |
| } |
| |
| int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); |
| ReadyList.push_back(std::pair(*I, TempCost)); |
| removeEdges(AddedEdges); |
| } else |
| ReadyList.push_back(std::pair(*I, -1)); |
| } |
| |
| if (UseCostHeur) { |
| std::sort(ReadyList.begin(), ReadyList.end(), |
| [](std::pair<int, int> A, std::pair<int, int> B) { |
| return A.second < B.second; |
| }); |
| } |
| |
| assert(ReadyList.size() == CurrSU.second.size()); |
| } |
| |
| bool PipelineSolver::solveExact() { |
| if (checkOptimal()) |
| return true; |
| |
| if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) |
| return false; |
| |
| assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); |
| assert(static_cast<size_t>(CurrConflInstNo) < |
| PipelineInstrs[CurrSyncGroupIdx].size()); |
| SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
| LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum |
| << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); |
| |
| // SchedGroup -> Cost pairs |
| SmallVector<std::pair<int, int>, 4> ReadyList; |
| // Prioritize the candidate sched groups in terms of lowest cost first |
| IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(), |
| CurrSU.second.rend()) |
| : populateReadyList(ReadyList, CurrSU.second.begin(), |
| CurrSU.second.end()); |
| |
| auto *I = ReadyList.begin(); |
| auto *E = ReadyList.end(); |
| for (; I != E; ++I) { |
| // If we are trying SGs in least cost order, and the current SG is cost |
| // infeasible, then all subsequent SGs will also be cost infeasible, so we |
| // can prune. |
| if (BestCost != -1 && (CurrCost + I->second > BestCost)) |
| return false; |
| |
| int CandSGID = I->first; |
| int AddedCost = 0; |
| std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
| auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; |
    SchedGroup *Match = nullptr;
    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)
        Match = &SG;
    }
    // Every candidate SGID must correspond to a SchedGroup in the pipeline.
    assert(Match);
| |
| if (Match->isFull()) |
| continue; |
| |
| if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) |
| continue; |
| |
| LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " |
| << (int)Match->getMask() << "and ID " << CandSGID |
| << "\n"); |
| Match->add(*CurrSU.first); |
| AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); |
| LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n"); |
| CurrCost += AddedCost; |
| advancePosition(); |
| ++BranchesExplored; |
| bool FinishedExploring = false; |
| // If the Cost after adding edges is greater than a known solution, |
| // backtrack |
| if (CurrCost < BestCost || BestCost == -1) { |
| if (solveExact()) { |
| FinishedExploring = BestCost != 0; |
| if (!FinishedExploring) |
| return true; |
| } |
| } |
| |
| retreatPosition(); |
| CurrCost -= AddedCost; |
| removeEdges(AddedEdges); |
| Match->pop(); |
| CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; |
| if (FinishedExploring) |
| return true; |
| } |
| |
| // Try the pipeline where the current instruction is omitted |
| // Potentially if we omit a problematic instruction from the pipeline, |
| // all the other instructions can nicely fit. |
| CurrCost += MissPenalty; |
| advancePosition(); |
| |
| LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n"); |
| |
| bool FinishedExploring = false; |
| if (CurrCost < BestCost || BestCost == -1) { |
| if (solveExact()) { |
      FinishedExploring = BestCost != 0;
| if (!FinishedExploring) |
| return true; |
| } |
| } |
| |
| retreatPosition(); |
| CurrCost -= MissPenalty; |
| return FinishedExploring; |
| } |
| |
| template <typename T> |
| void PipelineSolver::greedyFind( |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { |
| SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
| int BestNodeCost = -1; |
| int TempCost; |
| SchedGroup *BestGroup = nullptr; |
| int BestGroupID = -1; |
| auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; |
| LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum |
| << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); |
| |
| // Since we have added the potential SchedGroups from bottom up, but |
| // traversed the DAG from top down, parse over the groups from last to |
| // first. If we fail to do this for the greedy algorithm, the solution will |
| // likely not be good in more complex cases. |
| for (; I != E; ++I) { |
| std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
| int CandSGID = *I; |
| SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { |
| return SG.getSGID() == CandSGID; |
| }); |
| assert(Match); |
| |
| LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " |
| << (int)Match->getMask() << "\n"); |
| |
| if (Match->isFull()) { |
| LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); |
| continue; |
| } |
| if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { |
| LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n"); |
| continue; |
| } |
| TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); |
| LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); |
| if (TempCost < BestNodeCost || BestNodeCost == -1) { |
| BestGroup = Match; |
| BestNodeCost = TempCost; |
| BestGroupID = CandSGID; |
| } |
| removeEdges(AddedEdges); |
| if (BestNodeCost == 0) |
| break; |
| } |
| |
| if (BestGroupID != -1) { |
| BestGroup->add(*CurrSU.first); |
| addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); |
| LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" |
| << (int)BestGroup->getMask() << "\n"); |
| BestCost += TempCost; |
| } else |
| BestCost += MissPenalty; |
| |
| CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; |
| } |
| |
| bool PipelineSolver::solveGreedy() { |
| BestCost = 0; |
| std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
| |
| while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { |
| SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
| IsBottomUp |
| ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) |
| : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); |
| advancePosition(); |
| } |
| BestPipeline = CurrPipeline; |
| removeEdges(AddedEdges); |
| return false; |
| } |
| |
| unsigned PipelineSolver::computeProblemSize() { |
| unsigned ProblemSize = 0; |
| for (auto &PipeConflicts : PipelineInstrs) { |
| ProblemSize += PipeConflicts.size(); |
| } |
| |
| return ProblemSize; |
| } |
| |
| void PipelineSolver::solve() { |
| if (!NeedsSolver) |
| return; |
| |
| unsigned ProblemSize = computeProblemSize(); |
| assert(ProblemSize > 0); |
| |
| bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; |
| MissPenalty = (ProblemSize / 2) + 1; |
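  // E.g. a ProblemSize of 7 yields MissPenalty = 4, so leaving a single SU
  // out of the pipeline costs more than missing several individual edges.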
| |
| LLVM_DEBUG(DAG->dump()); |
| if (EnableExactSolver || BelowCutoff) { |
| LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n"); |
| solveGreedy(); |
| reset(); |
| LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n"); |
| if (BestCost > 0) { |
| LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n"); |
| solveExact(); |
| LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n"); |
| } |
| } else { // Use the Greedy Algorithm by default |
| LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n"); |
| solveGreedy(); |
| } |
| |
| makePipeline(); |
| LLVM_DEBUG(dbgs() << "After applying mutation\n"); |
| LLVM_DEBUG(DAG->dump()); |
| } |
| |
| enum IGLPStrategyID : int { |
| MFMASmallGemmOptID = 0, |
| MFMASmallGemmSingleWaveOptID = 1, |
| MFMAExpInterleaveID = 2, |
| MFMAExpSimpleInterleaveID = 3 |
| }; |
| |
// Implements an IGLP scheduling strategy.
| class IGLPStrategy { |
| protected: |
| ScheduleDAGInstrs *DAG; |
| |
| const SIInstrInfo *TII; |
| |
| public: |
| /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. |
| virtual bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) = 0; |
| |
| // Returns true if this strategy should be applied to a ScheduleDAG. |
| virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) = 0; |
| |
| bool IsBottomUp = true; |
| |
| IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : DAG(DAG), TII(TII) {} |
| |
| virtual ~IGLPStrategy() = default; |
| }; |
| |
| class MFMASmallGemmOpt final : public IGLPStrategy { |
public:
| bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) override { |
| return true; |
| } |
| |
| MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : IGLPStrategy(DAG, TII) { |
| IsBottomUp = true; |
| } |
| }; |
| |
| bool MFMASmallGemmOpt::applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) { |
| // Count the number of MFMA instructions. |
| unsigned MFMACount = 0; |
| for (const MachineInstr &I : *DAG) |
| if (TII->isMFMAorWMMA(I)) |
| ++MFMACount; |
| |
| const unsigned PipelineSyncID = 0; |
| SchedGroup *SG = nullptr; |
| for (unsigned I = 0; I < MFMACount * 3; ++I) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| return true; |
| } |
| |
| class MFMAExpInterleaveOpt final : public IGLPStrategy { |
| private: |
| // The count of TRANS SUs involved in the interleaved pipeline |
| static unsigned TransPipeCount; |
| // The count of MFMA SUs involved in the interleaved pipeline |
| static unsigned MFMAPipeCount; |
| // The count of Add SUs involved in the interleaved pipeline |
| static unsigned AddPipeCount; |
| // The number of transitive MFMA successors for each TRANS SU |
| static unsigned MFMAEnablement; |
| // The number of transitive TRANS predecessors for each MFMA SU |
| static unsigned ExpRequirement; |
| // The count of independent "chains" of MFMA instructions in the pipeline |
| static unsigned MFMAChains; |
| // The length of each independent "chain" of MFMA instructions |
| static unsigned MFMAChainLength; |
| // Whether or not the pipeline has V_CVT instructions |
| static bool HasCvt; |
| // Whether or not there are instructions between the TRANS instruction and |
| // V_CVT |
| static bool HasChainBetweenCvt; |
  // The first occurring DS_READ which feeds an MFMA chain
| static std::optional<unsigned> FirstPipeDSR; |
| // The MFMAPipe SUs with no MFMA predecessors |
| SmallVector<SUnit *, 4> MFMAChainSeeds; |
  // Compute the heuristics for the pipeline, returning whether or not the DAG
  // is well-formed for the mutation
| bool analyzeDAG(const SIInstrInfo *TII); |
| |
| /// Whether or not the instruction is a transitive predecessor of an MFMA |
| /// instruction |
| class IsPipeExp final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| |
| auto *DAG = SyncPipe[0].DAG; |
| |
| if (Cache->empty()) { |
| auto I = DAG->SUnits.rbegin(); |
| auto E = DAG->SUnits.rend(); |
| for (; I != E; I++) { |
| if (TII->isMFMAorWMMA(*I->getInstr())) |
| Cache->push_back(&*I); |
| } |
| if (Cache->empty()) |
| return false; |
| } |
| |
| auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) { |
| return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU)); |
| }); |
| |
| return Reaches; |
| } |
| IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
  /// Whether or not the instruction is a transitive predecessor of the
  /// \p Number th MFMA of the MFMAs occurring after a TRANS instruction
| class EnablesNthMFMA final : public InstructionRule { |
| private: |
| unsigned Number = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| bool FoundTrans = false; |
| unsigned Counter = 1; |
| auto *DAG = SyncPipe[0].DAG; |
| |
| if (Cache->empty()) { |
| SmallVector<SUnit *, 8> Worklist; |
| |
| auto I = DAG->SUnits.begin(); |
| auto E = DAG->SUnits.end(); |
| for (; I != E; I++) { |
| if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) { |
| if (Counter == Number) { |
| Cache->push_back(&*I); |
| break; |
| } |
| ++Counter; |
| } |
| if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode())) |
| FoundTrans = true; |
| } |
| if (Cache->empty()) |
| return false; |
| } |
| |
| return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); |
| } |
| |
| EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} |
| }; |
| |
| /// Whether or not the instruction enables the exact MFMA that is the \p |
| /// Number th MFMA in the chain starting with \p ChainSeed |
| class EnablesNthMFMAInChain final : public InstructionRule { |
| private: |
| unsigned Number = 1; |
| SUnit *ChainSeed; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| auto *DAG = SyncPipe[0].DAG; |
| |
| if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) |
| return false; |
| |
| if (Cache->empty()) { |
| auto *TempSU = ChainSeed; |
| auto Depth = Number; |
| while (Depth > 0) { |
| --Depth; |
| bool Found = false; |
| for (auto &Succ : TempSU->Succs) { |
| if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { |
| TempSU = Succ.getSUnit(); |
| Found = true; |
| break; |
| } |
| } |
| if (!Found) |
| return false; |
| } |
| |
| Cache->push_back(TempSU); |
| } |
| // If we failed to find the instruction to be placed into the cache, we |
| // would have already exited. |
| assert(!Cache->empty()); |
| |
| return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); |
| } |
| |
| EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed, |
| const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Number(Number), |
| ChainSeed(ChainSeed) {} |
| }; |
| |
  /// Whether or not the instruction has fewer than \p Size immediate
  /// successors. If \p HasIntermediary is true, this also tests whether all
  /// successors of the SUnit have fewer than \p Size successors.
| class LessThanNSuccs final : public InstructionRule { |
| private: |
| unsigned Size = 1; |
| bool HasIntermediary = false; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| if (!SyncPipe.size()) |
| return false; |
| |
| auto SuccSize = std::count_if( |
| SU->Succs.begin(), SU->Succs.end(), |
| [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); |
| if (SuccSize >= Size) |
| return false; |
| |
| if (HasIntermediary) { |
| for (auto Succ : SU->Succs) { |
| auto SuccSize = std::count_if( |
| Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), |
| [](const SDep &SuccSucc) { |
| return SuccSucc.getKind() == SDep::Data; |
| }); |
| if (SuccSize >= Size) |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID, |
| bool HasIntermediary = false, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Size(Size), |
| HasIntermediary(HasIntermediary) {} |
| }; |
| |
  /// Whether or not the instruction has at least \p Size immediate
  /// successors. If \p HasIntermediary is true, this also tests whether any
  /// successor of the SUnit has at least \p Size successors.
| class GreaterThanOrEqualToNSuccs final : public InstructionRule { |
| private: |
| unsigned Size = 1; |
| bool HasIntermediary = false; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| if (!SyncPipe.size()) |
| return false; |
| |
| auto SuccSize = std::count_if( |
| SU->Succs.begin(), SU->Succs.end(), |
| [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); |
| if (SuccSize >= Size) |
| return true; |
| |
| if (HasIntermediary) { |
| for (auto Succ : SU->Succs) { |
| auto SuccSize = std::count_if( |
| Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), |
| [](const SDep &SuccSucc) { |
| return SuccSucc.getKind() == SDep::Data; |
| }); |
| if (SuccSize >= Size) |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII, |
| unsigned SGID, bool HasIntermediary = false, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Size(Size), |
| HasIntermediary(HasIntermediary) {} |
| }; |
| |
| // Whether or not the instruction is a relevant V_CVT instruction. |
| class IsCvt final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| auto Opc = SU->getInstr()->getOpcode(); |
| return Opc == AMDGPU::V_CVT_F16_F32_e32 || |
| Opc == AMDGPU::V_CVT_I32_F32_e32; |
| } |
| IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether or not the instruction is FMA_F32. |
| class IsFMA final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 || |
| SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32; |
| } |
| IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether or not the instruction is a V_ADD_F32 instruction. |
| class IsPipeAdd final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32; |
| } |
| IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| /// Whether or not the instruction is an immediate RAW successor |
| /// of the SchedGroup \p Distance steps before. |
| class IsSuccOfPrevNthGroup final : public InstructionRule { |
| private: |
| unsigned Distance = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| SchedGroup *OtherGroup = nullptr; |
| if (!SyncPipe.size()) |
| return false; |
| |
| for (auto &PipeSG : SyncPipe) { |
| if ((unsigned)PipeSG.getSGID() == SGID - Distance) |
| OtherGroup = &PipeSG; |
| } |
| |
| if (!OtherGroup) |
| return false; |
| if (!OtherGroup->Collection.size()) |
| return true; |
| |
| for (auto &OtherEle : OtherGroup->Collection) { |
| for (auto &Succ : OtherEle->Succs) { |
| if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data) |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, |
| unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} |
| }; |
| |
  /// Whether or not the instruction is a transitive successor of any
  /// instruction in the SchedGroup \p Distance steps before.
| class IsReachableFromPrevNthGroup final : public InstructionRule { |
| private: |
| unsigned Distance = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| SchedGroup *OtherGroup = nullptr; |
| if (!SyncPipe.size()) |
| return false; |
| |
| for (auto &PipeSG : SyncPipe) { |
| if ((unsigned)PipeSG.getSGID() == SGID - Distance) |
| OtherGroup = &PipeSG; |
| } |
| |
| if (!OtherGroup) |
| return false; |
| if (!OtherGroup->Collection.size()) |
| return true; |
| |
| auto *DAG = SyncPipe[0].DAG; |
| |
| for (auto &OtherEle : OtherGroup->Collection) |
| if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle)) |
| return true; |
| |
| return false; |
| } |
| IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, |
| unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} |
| }; |
| |
  /// Whether or not the instruction occurs at or after the SU with NodeNum
  /// \p Number
| class OccursAtOrAfterNode final : public InstructionRule { |
| private: |
| unsigned Number = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| |
| return SU->NodeNum >= Number; |
| } |
| OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} |
| }; |
| |
| /// Whether or not the SU is exactly the \p Number th MFMA in the chain |
| /// starting with \p ChainSeed |
| class IsExactMFMA final : public InstructionRule { |
| private: |
| unsigned Number = 1; |
| SUnit *ChainSeed; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) |
| return false; |
| |
| if (Cache->empty()) { |
| auto *TempSU = ChainSeed; |
| auto Depth = Number; |
| while (Depth > 0) { |
| --Depth; |
| bool Found = false; |
| for (auto &Succ : TempSU->Succs) { |
| if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { |
| TempSU = Succ.getSUnit(); |
| Found = true; |
| break; |
| } |
| } |
| if (!Found) { |
| return false; |
| } |
| } |
| Cache->push_back(TempSU); |
| } |
| // If we failed to find the instruction to be placed into the cache, we |
| // would have already exited. |
| assert(!Cache->empty()); |
| |
| return (*Cache)[0] == SU; |
| } |
| |
| IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII, |
| unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Number(Number), |
| ChainSeed(ChainSeed) {} |
| }; |
| |
  // Whether the instruction occurs after the first TRANS instruction. This
  // implies the instruction cannot be a predecessor of the first TRANS
  // instruction
| class OccursAfterExp final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| |
| SmallVector<SUnit *, 12> Worklist; |
| auto *DAG = SyncPipe[0].DAG; |
| if (Cache->empty()) { |
| for (auto &SU : DAG->SUnits) |
| if (TII->isTRANS(SU.getInstr()->getOpcode())) { |
| Cache->push_back(&SU); |
| break; |
| } |
| if (Cache->empty()) |
| return false; |
| } |
| |
| return SU->NodeNum > (*Cache)[0]->NodeNum; |
| } |
| |
| OccursAfterExp(const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| public: |
| bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : IGLPStrategy(DAG, TII) { |
| IsBottomUp = false; |
| } |
| }; |
| |
| unsigned MFMAExpInterleaveOpt::TransPipeCount = 0; |
| unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0; |
| unsigned MFMAExpInterleaveOpt::AddPipeCount = 0; |
| unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0; |
| unsigned MFMAExpInterleaveOpt::ExpRequirement = 0; |
| unsigned MFMAExpInterleaveOpt::MFMAChains = 0; |
| unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0; |
| bool MFMAExpInterleaveOpt::HasCvt = false; |
| bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false; |
| std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt; |
| |
| bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { |
| SmallVector<SUnit *, 10> ExpPipeCands; |
| SmallVector<SUnit *, 10> MFMAPipeCands; |
| SmallVector<SUnit *, 10> MFMAPipeSUs; |
| SmallVector<SUnit *, 10> PackSUs; |
| SmallVector<SUnit *, 10> CvtSUs; |
| |
| auto isBitPack = [](unsigned Opc) { |
| return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64; |
| }; |
| |
| auto isCvt = [](unsigned Opc) { |
| return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32; |
| }; |
| |
| auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; }; |
| |
| AddPipeCount = 0; |
| for (SUnit &SU : DAG->SUnits) { |
| auto Opc = SU.getInstr()->getOpcode(); |
| if (TII->isTRANS(Opc)) { |
      // Avoid counting a potential bonus V_EXP which all the MFMAs depend on
| if (SU.Succs.size() >= 7) |
| continue; |
| for (auto &Succ : SU.Succs) { |
| if (Succ.getSUnit()->Succs.size() >= 7) |
| continue; |
| } |
| ExpPipeCands.push_back(&SU); |
| } |
| |
| if (TII->isMFMAorWMMA(*SU.getInstr())) |
| MFMAPipeCands.push_back(&SU); |
| |
| if (isBitPack(Opc)) |
| PackSUs.push_back(&SU); |
| |
| if (isCvt(Opc)) |
| CvtSUs.push_back(&SU); |
| |
| if (isAdd(Opc)) |
| ++AddPipeCount; |
| } |
| |
| if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size())) |
| return false; |
| |
| TransPipeCount = 0; |
| |
| std::optional<SUnit *> TempMFMA; |
| std::optional<SUnit *> TempExp; |
| // Count the number of EXPs that reach an MFMA |
| for (auto &PredSU : ExpPipeCands) { |
| for (auto &SuccSU : MFMAPipeCands) { |
| if (DAG->IsReachable(SuccSU, PredSU)) { |
| if (!TempExp) { |
| TempExp = PredSU; |
| TempMFMA = SuccSU; |
| } |
| MFMAPipeSUs.push_back(SuccSU); |
| ++TransPipeCount; |
| break; |
| } |
| } |
| } |
| |
| if (!(TempExp && TempMFMA)) |
| return false; |
| |
| HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) { |
| return isCvt(Succ.getSUnit()->getInstr()->getOpcode()); |
| }); |
| |
| // Count the number of MFMAs that are reached by an EXP |
| for (auto &SuccSU : MFMAPipeCands) { |
| if (MFMAPipeSUs.size() && |
| any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) { |
| return PotentialMatch->NodeNum == SuccSU->NodeNum; |
| })) |
| continue; |
| |
| for (auto &PredSU : ExpPipeCands) { |
| if (DAG->IsReachable(SuccSU, PredSU)) { |
| MFMAPipeSUs.push_back(SuccSU); |
| break; |
| } |
| } |
| } |
| |
| MFMAPipeCount = MFMAPipeSUs.size(); |
| |
| assert(TempExp && TempMFMA); |
| assert(MFMAPipeCount > 0); |
| |
| std::optional<SUnit *> TempCvt; |
| for (auto &SuccSU : CvtSUs) { |
| if (DAG->IsReachable(SuccSU, *TempExp)) { |
| TempCvt = SuccSU; |
| break; |
| } |
| } |
| |
| HasCvt = false; |
| if (TempCvt.has_value()) { |
| for (auto &SuccSU : MFMAPipeSUs) { |
| if (DAG->IsReachable(SuccSU, *TempCvt)) { |
| HasCvt = true; |
| break; |
| } |
| } |
| } |
| |
| MFMAChains = 0; |
| for (auto &MFMAPipeSU : MFMAPipeSUs) { |
| if (is_contained(MFMAChainSeeds, MFMAPipeSU)) |
| continue; |
| if (none_of(MFMAPipeSU->Preds, [&TII](SDep &Succ) { |
| return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); |
| })) { |
| MFMAChainSeeds.push_back(MFMAPipeSU); |
| ++MFMAChains; |
| } |
| } |
| |
| if (!MFMAChains) |
| return false; |
| |
| for (auto Pred : MFMAChainSeeds[0]->Preds) { |
| if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) && |
| Pred.getSUnit()->getInstr()->mayLoad()) |
| FirstPipeDSR = Pred.getSUnit()->NodeNum; |
| } |
| |
| MFMAChainLength = MFMAPipeCount / MFMAChains; |
| |
| // The number of bit pack operations that depend on a single V_EXP |
| unsigned PackSuccCount = std::count_if( |
| PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) { |
| return DAG->IsReachable(VPack, *TempExp); |
| }); |
| |
| // The number of bit pack operations an MFMA depends on |
| unsigned PackPredCount = |
| std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), |
| [&isBitPack](SDep &Pred) { |
| auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); |
| return isBitPack(Opc); |
| }); |
| |
| auto *PackPred = |
| std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), |
| [&isBitPack](SDep &Pred) { |
| auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); |
| return isBitPack(Opc); |
| }); |
| |
| if (PackPred == (*TempMFMA)->Preds.end()) |
| return false; |
| |
| MFMAEnablement = 0; |
| ExpRequirement = 0; |
| // How many MFMAs depend on a single bit pack operation |
| MFMAEnablement = |
| std::count_if(PackPred->getSUnit()->Succs.begin(), |
| PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) { |
| return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); |
| }); |
| |
| // The number of MFMAs that depend on a single V_EXP |
| MFMAEnablement *= PackSuccCount; |
| |
| // The number of V_EXPs required to resolve all dependencies for an MFMA |
| ExpRequirement = |
| std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(), |
| [this, &PackPred](SUnit *ExpBase) { |
| return DAG->IsReachable(PackPred->getSUnit(), ExpBase); |
| }); |
| |
| ExpRequirement *= PackPredCount; |
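  // As a worked example: if each V_EXP feeds two bit packs (PackSuccCount ==
  // 2) and each bit pack feeds two MFMAs, then MFMAEnablement = 2 * 2 = 4.
  // Symmetrically, if an MFMA consumes two bit packs (PackPredCount == 2),
  // each reachable from two V_EXPs, then ExpRequirement = 2 * 2 = 4.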
| return true; |
| } |
| |
| bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) { |
| const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); |
| const SIInstrInfo *TII = ST.getInstrInfo(); |
| |
| if (Phase != AMDGPU::SchedulingPhase::PostRA) |
| MFMAChainSeeds.clear(); |
| if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII)) |
| return false; |
| |
| return true; |
| } |
| |
| bool MFMAExpInterleaveOpt::applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) { |
| |
| bool IsSmallKernelType = |
| MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; |
| bool IsLargeKernelType = |
| MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64; |
| |
| if (!(IsSmallKernelType || IsLargeKernelType)) |
| return false; |
| |
| const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); |
| const SIInstrInfo *TII = ST.getInstrInfo(); |
| |
| unsigned PipelineSyncID = 0; |
| SchedGroup *SG = nullptr; |
| |
| unsigned MFMAChain = 0; |
| unsigned PositionInChain = 0; |
| unsigned CurrMFMAForTransPosition = 0; |
| |
| auto incrementTransPosition = [&MFMAChain, &PositionInChain, |
| &CurrMFMAForTransPosition]() { |
| CurrMFMAForTransPosition += MFMAEnablement; |
| PositionInChain = (CurrMFMAForTransPosition / MFMAChains); |
| MFMAChain = CurrMFMAForTransPosition % MFMAChains; |
| }; |
| |
| auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() { |
| auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; |
| return (TempMFMAForTrans / MFMAChains); |
| }; |
| |
| auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() { |
| auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; |
| return TempMFMAForTrans % MFMAChains; |
| }; |
| |
| unsigned CurrMFMAPosition = 0; |
| unsigned MFMAChainForMFMA = 0; |
| unsigned PositionInChainForMFMA = 0; |
| |
| auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA, |
| &PositionInChainForMFMA]() { |
| ++CurrMFMAPosition; |
| MFMAChainForMFMA = CurrMFMAPosition % MFMAChains; |
| PositionInChainForMFMA = CurrMFMAPosition / MFMAChains; |
| }; |
| |
| bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA; |
| assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains); |
| |
| bool UsesFMA = IsSmallKernelType || !IsPostRA; |
| bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR; |
| bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA); |
| bool UsesVALU = IsSmallKernelType; |
| |
| // PHASE 1: "Prefetch" |
| if (UsesFMA) { |
| // First Round FMA |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) { |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), |
| true)); |
| } else |
| SG->addRule( |
| std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| // Second Round FMA |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) { |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| getNextTransPositionInChain(), |
| MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); |
| } else |
| SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, |
| SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| if (UsesDSRead) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII, |
| SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // First Round EXP |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true)); |
| else |
| SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), |
| HasChainBetweenCvt)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| incrementTransPosition(); |
| |
| // First Round CVT, Third Round FMA, Second Round EXP; interleaved |
| for (unsigned I = 0; I < ExpRequirement; I++) { |
| // First Round CVT |
| if (UsesCvt) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID())); |
| if (HasChainBetweenCvt) |
| SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( |
| 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); |
| else |
| SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>( |
| 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Third Round FMA |
| if (UsesFMA) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) { |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| getNextTransPositionInChain(), |
| MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); |
| } else |
| SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1, |
| TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Second Round EXP |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), |
| true)); |
| else |
| SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, |
| SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), |
| HasChainBetweenCvt)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // The "extra" EXP which enables all MFMA |
| // TODO: UsesExtraExp |
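| // Unlike the per-round EXP groups above, which use LessThanNSuccs(8), this |
| // group is matched by fanout: GreaterThanOrEqualToNSuccs(8) singles out the |
| // EXP whose result fans out to the MFMAs. |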
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>( |
| 8, TII, SG->getSGID(), HasChainBetweenCvt)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| // PHASE 2: Main Interleave Loop |
| |
| // The number of MFMAs per iteration |
| unsigned MFMARatio = |
| MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1; |
| // The number of Exps per iteration |
| unsigned ExpRatio = |
| MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement; |
| // The remaining EXPs |
| unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement) |
| ? TransPipeCount - (2 * ExpRequirement) |
| : 0; |
| unsigned ExpLoopCount = RemainingExp / ExpRatio; |
| // In-loop MFMAs |
| unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2) |
| ? MFMAPipeCount - (MFMAEnablement * 2) |
| : 0; |
| unsigned MFMALoopCount = MFMAInLoop / MFMARatio; |
| unsigned VALUOps = |
| AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount; |
| unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount); |
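| // For example, with MFMAEnablement == 4 and ExpRequirement == 2, each |
| // iteration schedules MFMARatio == 2 MFMAs per ExpRatio == 1 EXP; with the |
| // values swapped, it schedules 1 MFMA for every 2 EXPs. |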
| |
| for (unsigned I = 0; I < LoopSize; I++) { |
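| // Advance to the next TRANS target once a full ExpRequirement's worth of |
| // EXPs has been issued for the current one. |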
| if (!(I * ExpRatio % ExpRequirement)) |
| incrementTransPosition(); |
| |
| // Round N MFMA |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<IsExactMFMA>( |
| PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII, |
| SG->getSGID(), true)); |
| else |
| SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| incrementMFMAPosition(); |
| |
| if (UsesVALU) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| if (UsesDSRead && !(I % 4)) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII, |
| SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // CVT, EXP, FMA Interleaving |
| for (unsigned J = 0; J < ExpRatio; J++) { |
| auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1); |
| auto MaxMFMAOffset = |
| (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio; |
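| // The offsets below estimate, in SchedGroups, how far back the group this |
| // CVT should follow sits, accounting for the MFMA, VALU, DS_READ, and FMA |
| // groups interleaved since then; each term is clamped to its per-round |
| // maximum. |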
| |
| // Round N + 1 CVT |
| if (UsesCvt) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID())); |
| auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1; |
| auto DSROffset = I / 4 + 1; |
| auto MaxDSROffset = MaxMFMAOffset / 4; |
| // TODO: UsesExtraExp |
| auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1; |
| auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) + |
| std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff + |
| ExpOffset; |
| if (HasChainBetweenCvt) |
| SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( |
| CurrentOffset, TII, SG->getSGID())); |
| else |
| SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII, |
| SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Round N + 3 FMA |
| if (UsesFMA) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| getNextTransPositionInChain(), |
| MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), |
| true)); |
| else |
| SG->addRule(std::make_shared<EnablesNthMFMA>( |
| (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1, |
| TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Round N + 2 Exp |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), |
| true)); |
| else |
| SG->addRule(std::make_shared<EnablesNthMFMA>( |
| (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1, |
| TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), |
| HasChainBetweenCvt)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| } |
| |
| // PHASE 3: Remaining MFMAs |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| return true; |
| } |
| |
| class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy { |
| public: |
| bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) override { |
| return true; |
| } |
| |
| MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : IGLPStrategy(DAG, TII) { |
| IsBottomUp = true; |
| } |
| }; |
| |
| bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) { |
| // Count the number of MFMA instructions. |
| unsigned MFMACount = 0; |
| for (const MachineInstr &I : *DAG) |
| if (TII->isMFMAorWMMA(I)) |
| ++MFMACount; |
| |
| const unsigned PipelineSyncID = 0; |
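| // Request a strict alternation of single-SU TRANS and MFMA groups, with |
| // enough pairs for three TRANS groups per MFMA in the region. |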
| for (unsigned I = 0; I < MFMACount * 3; ++I) { |
| SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| return true; |
| } |
| |
| class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { |
| private: |
| // Whether the DS_READ is a predecessor of the first four MFMAs in the region |
| class EnablesInitialMFMA final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| if (!SyncPipe.size()) |
| return false; |
| int MFMAsFound = 0; |
| if (!Cache->size()) { |
| for (auto &Elt : SyncPipe[0].DAG->SUnits) { |
| if (TII->isMFMAorWMMA(*Elt.getInstr())) { |
| ++MFMAsFound; |
| if (MFMAsFound > 4) |
| break; |
| Cache->push_back(&Elt); |
| } |
| } |
| } |
| |
| assert(Cache->size()); |
| auto *DAG = SyncPipe[0].DAG; |
| for (auto &Elt : *Cache) { |
| if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU))) |
| return true; |
| } |
| return false; |
| } |
| |
| EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE |
| class IsPermForDSW final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| auto *MI = SU->getInstr(); |
| if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) |
| return false; |
| |
| bool FitsInGroup = false; |
| // Does the VALU have a DS_WRITE successor? |
| if (!Collection.size()) { |
| for (auto &Succ : SU->Succs) { |
| SUnit *SuccUnit = Succ.getSUnit(); |
| if (TII->isDS(*SuccUnit->getInstr()) && |
| SuccUnit->getInstr()->mayStore()) { |
| Cache->push_back(SuccUnit); |
| FitsInGroup = true; |
| } |
| } |
| return FitsInGroup; |
| } |
| |
| assert(Cache->size()); |
| |
| // Does the VALU have a DS_WRITE successor that is the same as the other |
| // VALUs already in the group? The V_PERMs will all share one DS_WRITE |
| // successor. |
| return llvm::any_of(*Cache, [&SU](SUnit *Elt) { |
| return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) { |
| return ThisSucc.getSUnit() == Elt; |
| }); |
| }); |
| } |
| |
| IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether the SU is a successor of any element in the previous SchedGroup |
| class IsSuccOfPrevGroup final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| SchedGroup *OtherGroup = nullptr; |
| for (auto &PipeSG : SyncPipe) { |
| if ((unsigned)PipeSG.getSGID() == SGID - 1) { |
| OtherGroup = &PipeSG; |
| } |
| } |
| |
| if (!OtherGroup) |
| return false; |
| if (!OtherGroup->Collection.size()) |
| return true; |
| |
| // Does the previous VALU have this DS_WRITE as a successor? |
| return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) { |
| return any_of(Elt->Succs, |
| [&SU](SDep &Succ) { return Succ.getSUnit() == SU; }); |
| }); |
| } |
| IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether adding this load keeps the combined load width of the group |
| // within 128 bits |
| class VMEMSize final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| auto *MI = SU->getInstr(); |
| if (MI->getOpcode() == TargetOpcode::BUNDLE) |
| return false; |
| if (!Collection.size()) |
| return true; |
| |
| int NumBits = 0; |
| |
| auto TRI = TII->getRegisterInfo(); |
| auto &MRI = MI->getParent()->getParent()->getRegInfo(); |
| for (auto &Elt : Collection) { |
| auto Op = Elt->getInstr()->getOperand(0); |
| auto Size = |
| TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); |
| NumBits += Size; |
| } |
| |
| if (NumBits < 128) { |
| assert(TII->isVMEM(*MI) && MI->mayLoad()); |
| if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( |
| MRI, MI->getOperand(0))) <= |
| 128) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup |
| /// that is \p Distance steps away |
| class SharesPredWithPrevNthGroup final : public InstructionRule { |
| private: |
| unsigned Distance = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| SchedGroup *OtherGroup = nullptr; |
| if (!SyncPipe.size()) |
| return false; |
| |
| if (!Cache->size()) { |
| |
| for (auto &PipeSG : SyncPipe) { |
| if ((unsigned)PipeSG.getSGID() == SGID - Distance) { |
| OtherGroup = &PipeSG; |
| } |
| } |
| |
| if (!OtherGroup) |
| return false; |
| if (!OtherGroup->Collection.size()) |
| return true; |
| |
| for (auto &OtherEle : OtherGroup->Collection) { |
| for (auto &Pred : OtherEle->Preds) { |
| if (Pred.getSUnit()->getInstr()->getOpcode() == |
| AMDGPU::V_PERM_B32_e64) |
| Cache->push_back(Pred.getSUnit()); |
| } |
| } |
| |
| // If the other group has no PERM preds, then this group won't share any |
| if (!Cache->size()) |
| return false; |
| } |
| |
| auto *DAG = SyncPipe[0].DAG; |
| // Does the previous DS_WRITE share a V_PERM predecessor with this |
| // VMEM_READ? |
| return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) { |
| return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); |
| }); |
| } |
| SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, |
| unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} |
| }; |
| |
| public: |
| bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) override { |
| return true; |
| } |
| |
| MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : IGLPStrategy(DAG, TII) { |
| IsBottomUp = false; |
| } |
| }; |
| |
| static unsigned DSWCount = 0; |
| static unsigned DSWWithPermCount = 0; |
| static unsigned DSWWithSharedVMEMCount = 0; |
| |
| bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) { |
| unsigned MFMACount = 0; |
| unsigned DSRCount = 0; |
| |
| bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial; |
| |
| assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && |
| DSWWithSharedVMEMCount == 0)) && |
| "DSWCounters should be zero in pre-RA scheduling!"); |
| SmallVector<SUnit *, 6> DSWithPerms; |
| for (auto &SU : DAG->SUnits) { |
| auto *I = SU.getInstr(); |
| if (TII->isMFMAorWMMA(*I)) |
| ++MFMACount; |
| else if (TII->isDS(*I)) { |
| if (I->mayLoad()) |
| ++DSRCount; |
| else if (I->mayStore() && IsInitial) { |
| ++DSWCount; |
| for (auto Pred : SU.Preds) { |
| if (Pred.getSUnit()->getInstr()->getOpcode() == |
| AMDGPU::V_PERM_B32_e64) { |
| DSWithPerms.push_back(&SU); |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| if (IsInitial) { |
| DSWWithPermCount = DSWithPerms.size(); |
| auto *I = DSWithPerms.begin(); |
| auto *E = DSWithPerms.end(); |
| |
| // Get the count of DS_WRITES with V_PERM predecessors which |
| // have loop-carried dependencies (WAR) on the same VMEM_READs. |
| // We consider partial overlap as a miss -- in other words, |
| // for a given DS_W, we only consider another DS_W as matching |
| // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred |
| // for every V_PERM pred of this DS_W. |
| DenseMap<MachineInstr *, SUnit *> VMEMLookup; |
| SmallVector<SUnit *, 6> Counted; |
| for (; I != E; I++) { |
| SUnit *Cand = nullptr; |
| bool MissedAny = false; |
| for (auto &Pred : (*I)->Preds) { |
| if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) |
| continue; |
| |
| if (Cand && llvm::is_contained(Counted, Cand)) |
| break; |
| |
| for (auto &Succ : Pred.getSUnit()->Succs) { |
| auto *MI = Succ.getSUnit()->getInstr(); |
| if (!TII->isVMEM(*MI) || !MI->mayLoad()) |
| continue; |
| |
| if (MissedAny || !VMEMLookup.size()) { |
| MissedAny = true; |
| VMEMLookup[MI] = *I; |
| continue; |
| } |
| |
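| // If another DS_WRITE already claimed this VMEM_READ, that DS_WRITE |
| // becomes our candidate match; otherwise record the claim and treat this |
| // read as a miss. |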
| auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I); |
| if (Inserted) { |
| MissedAny = true; |
| continue; |
| } |
| |
| Cand = It->second; |
| if (llvm::is_contained(Counted, Cand)) { |
| MissedAny = true; |
| break; |
| } |
| } |
| } |
| if (!MissedAny && Cand) { |
| DSWWithSharedVMEMCount += 2; |
| Counted.push_back(Cand); |
| Counted.push_back(*I); |
| } |
| } |
| } |
| |
| assert(DSWWithSharedVMEMCount <= DSWWithPermCount); |
| SchedGroup *SG; |
| unsigned PipelineSyncID = 0; |
| // For kernels with V_PERM, there are enough VALUs to mix in between MFMAs |
| if (DSWWithPermCount) { |
| for (unsigned I = 0; I < MFMACount; I++) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| } |
| |
| PipelineSyncID = 1; |
| // Phase 1: Break up DS_READ and MFMA clusters. |
| // First, DS_READs to make the initial MFMAs ready; then interleave MFMAs |
| // with DS_READ prefetches. |
| |
| // Make the initial MFMAs ready |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| // Interleave MFMA with DS_READ prefetch |
| for (unsigned I = 0; I < DSRCount - 4; ++I) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Phase 2a: Loop-carried dependency with V_PERM |
| // Schedule V_PERM & DS_WRITE as closely as possible to the VMEM_READ they |
| // depend on. Interleave MFMA to keep the XDL unit busy throughout. |
| for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( |
| 1, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( |
| 3, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Phase 2b: Loop-carried dependency without V_PERM |
| // Schedule DS_WRITEs as closely as possible to the VMEM_READs they depend |
| // on. Interleave MFMA to keep the XDL unit busy throughout. |
| for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Phase 2c: Loop-carried dependency with V_PERM, where VMEM_READs are |
| // ultimately used by two DS_WRITEs. |
| // Schedule V_PERM & DS_WRITE as closely as possible to the VMEM_READ they |
| // depend on. Interleave MFMA to keep the XDL unit busy throughout. |
| |
| for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( |
| 2, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( |
| 4, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| return true; |
| } |
| |
| static std::unique_ptr<IGLPStrategy> |
| createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, |
| const SIInstrInfo *TII) { |
| switch (ID) { |
| case MFMASmallGemmOptID: |
| return std::make_unique<MFMASmallGemmOpt>(DAG, TII); |
| case MFMASmallGemmSingleWaveOptID: |
| return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII); |
| case MFMAExpInterleaveID: |
| return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII); |
| case MFMAExpSimpleInterleaveID: |
| return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII); |
| } |
| |
| llvm_unreachable("Unknown IGLPStrategyID"); |
| } |
| |
| class IGroupLPDAGMutation : public ScheduleDAGMutation { |
| private: |
| const SIInstrInfo *TII; |
| |
| ScheduleDAGMI *DAG; |
| |
| // Organize lists of SchedGroups by their SyncID. SchedGroups / |
| // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added |
| // between them. |
| DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; |
| |
| // Used to track instructions that can be mapped to multiple sched groups |
| DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; |
| |
| // Add DAG edges that enforce SCHED_BARRIER ordering. |
| void addSchedBarrierEdges(SUnit &SU); |
| |
| // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should |
| // not be reordered across the SCHED_BARRIER. This is used for the base |
| // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that |
| // SCHED_BARRIER will always block all instructions that can be classified |
| // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size |
| // and may only synchronize with some SchedGroups. Returns the inverse of |
| // Mask. SCHED_BARRIER's mask describes which instruction types should be |
| // allowed to be scheduled across it. Invert the mask to get the |
| // SchedGroupMask of instructions that should be barred. |
| SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; |
| |
| // Create SchedGroups for a SCHED_GROUP_BARRIER. |
| void initSchedGroupBarrierPipelineStage( |
| std::vector<SUnit>::reverse_iterator RIter); |
| |
| bool initIGLPOpt(SUnit &SU); |
| |
| public: |
| void apply(ScheduleDAGInstrs *DAGInstrs) override; |
| |
| // The order in which the PipelineSolver should process the candidate |
| // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last |
| // created SchedGroup first, and will consider that as the ultimate |
| // predecessor group when linking. TOP_DOWN instead links and processes the |
| // first created SchedGroup first. |
| bool IsBottomUp = true; |
| |
| // The scheduling phase this application of IGLP corresponds with. |
| AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; |
| |
| IGroupLPDAGMutation() = default; |
| IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {} |
| }; |
| |
| unsigned SchedGroup::NumSchedGroups = 0; |
| |
| bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { |
| if (A != B && DAG->canAddEdge(B, A)) { |
| DAG->addEdge(B, SDep(A, SDep::Artificial)); |
| return true; |
| } |
| return false; |
| } |
| |
| bool SchedGroup::canAddMI(const MachineInstr &MI) const { |
| bool Result = false; |
| if (MI.isMetaInstruction()) |
| Result = false; |
| |
| else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && |
| (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || |
| TII->isTRANS(MI))) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && |
| TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && |
| TII->isSALU(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && |
| TII->isMFMAorWMMA(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && |
| (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && |
| MI.mayLoad() && |
| (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && |
| MI.mayStore() && |
| (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && |
| TII->isDS(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && |
| MI.mayLoad() && TII->isDS(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && |
| MI.mayStore() && TII->isDS(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) && |
| TII->isTRANS(MI)) |
| Result = true; |
| |
| LLVM_DEBUG( |
| dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) |
| << (Result ? " could classify " : " unable to classify ") << MI); |
| |
| return Result; |
| } |
| |
| int SchedGroup::link(SUnit &SU, bool MakePred, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { |
| int MissedEdges = 0; |
| for (auto *A : Collection) { |
| SUnit *B = &SU; |
| if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) |
| continue; |
| if (MakePred) |
| std::swap(A, B); |
| |
| if (DAG->IsReachable(B, A)) |
| continue; |
| |
| // tryAddEdge returns false if there is a dependency that makes adding |
| // the A->B edge impossible; otherwise it returns true. |
| bool Added = tryAddEdge(A, B); |
| if (Added) |
| AddedEdges.emplace_back(A, B); |
| else |
| ++MissedEdges; |
| } |
| |
| return MissedEdges; |
| } |
| |
| void SchedGroup::link(SUnit &SU, bool MakePred) { |
| for (auto *A : Collection) { |
| SUnit *B = &SU; |
| if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) |
| continue; |
| if (MakePred) |
| std::swap(A, B); |
| |
| tryAddEdge(A, B); |
| } |
| } |
| |
| void SchedGroup::link(SUnit &SU, |
| function_ref<bool(const SUnit *A, const SUnit *B)> P) { |
| for (auto *A : Collection) { |
| SUnit *B = &SU; |
| if (P(A, B)) |
| std::swap(A, B); |
| |
| tryAddEdge(A, B); |
| } |
| } |
| |
| void SchedGroup::link(SchedGroup &OtherGroup) { |
| for (auto *B : OtherGroup.Collection) |
| link(*B); |
| } |
| |
| bool SchedGroup::canAddSU(SUnit &SU) const { |
| MachineInstr &MI = *SU.getInstr(); |
| if (MI.getOpcode() != TargetOpcode::BUNDLE) |
| return canAddMI(MI); |
| |
| // Special case for bundled MIs. |
| const MachineBasicBlock *MBB = MI.getParent(); |
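| // Step past the BUNDLE header: B ends up at the first MI inside the |
| // bundle, and E is then walked forward to one past the last bundled MI. |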
| MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; |
| while (E != MBB->end() && E->isBundledWithPred()) |
| ++E; |
| |
| // Return true if all of the bundled MIs can be added to this group. |
| return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); |
| } |
| |
| void SchedGroup::initSchedGroup() { |
| for (auto &SU : DAG->SUnits) { |
| if (isFull()) |
| break; |
| |
| if (canAddSU(SU)) |
| add(SU); |
| } |
| } |
| |
| void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, |
| SUnitsToCandidateSGsMap &SyncedInstrs) { |
| SUnit &InitSU = *RIter; |
| for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { |
| auto &SU = *RIter; |
| if (isFull()) |
| break; |
| |
| if (canAddSU(SU)) |
| SyncedInstrs[&SU].push_back(SGID); |
| } |
| |
| add(InitSU); |
| assert(MaxSize); |
| (*MaxSize)++; |
| } |
| |
| void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { |
| auto I = DAG->SUnits.rbegin(); |
| auto E = DAG->SUnits.rend(); |
| for (; I != E; ++I) { |
| auto &SU = *I; |
| if (isFull()) |
| break; |
| if (canAddSU(SU)) |
| SyncedInstrs[&SU].push_back(SGID); |
| } |
| } |
| |
| void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { |
| const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); |
| if (!TSchedModel || DAGInstrs->SUnits.empty()) |
| return; |
| |
| LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); |
| const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); |
| TII = ST.getInstrInfo(); |
| DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); |
| SyncedSchedGroups.clear(); |
| SyncedInstrs.clear(); |
| bool FoundSB = false; |
| bool FoundIGLP = false; |
| bool ShouldApplyIGLP = false; |
| for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { |
| unsigned Opc = R->getInstr()->getOpcode(); |
| // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. |
| if (Opc == AMDGPU::SCHED_BARRIER) { |
| addSchedBarrierEdges(*R); |
| FoundSB = true; |
| } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { |
| initSchedGroupBarrierPipelineStage(R); |
| FoundSB = true; |
| } else if (Opc == AMDGPU::IGLP_OPT) { |
| if (!FoundSB && !FoundIGLP) { |
| FoundIGLP = true; |
| ShouldApplyIGLP = initIGLPOpt(*R); |
| } |
| } |
| } |
| |
| if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) { |
| PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); |
| // PipelineSolver performs the mutation by adding the edges it |
| // determined as the best |
| PS.solve(); |
| return; |
| } |
| } |
| |
| void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { |
| MachineInstr &MI = *SchedBarrier.getInstr(); |
| assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); |
| // Remove all existing edges from the SCHED_BARRIER that were added due to the |
| // instruction having side effects. |
| LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: " |
| << MI.getOperand(0).getImm() << "\n"); |
| auto InvertedMask = |
| invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); |
| SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); |
| SG.initSchedGroup(); |
| |
| // Preserve original instruction ordering relative to the SCHED_BARRIER. |
| SG.link( |
| SchedBarrier, |
| (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( |
| const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); |
| } |
| |
| SchedGroupMask |
| IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { |
| // Invert mask and erase bits for types of instructions that are implied to be |
| // allowed past the SCHED_BARRIER. |
| SchedGroupMask InvertedMask = ~Mask; |
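| // For example, a SCHED_BARRIER mask allowing only ALU (0x1) inverts to a |
| // mask barring the VMEM and DS groups, and the implication rules below |
| // additionally clear VALU, SALU, MFMA, and TRANS so that they remain free |
| // to cross the barrier. |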
| |
| // ALU implies VALU, SALU, MFMA, TRANS. |
| if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & |
| ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS; |
| // VALU, SALU, MFMA, TRANS implies ALU. |
| else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::ALU; |
| |
| // VMEM implies VMEM_READ, VMEM_WRITE. |
| if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; |
| // VMEM_READ, VMEM_WRITE implies VMEM. |
| else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::VMEM; |
| |
| // DS implies DS_READ, DS_WRITE. |
| if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; |
| // DS_READ, DS_WRITE implies DS. |
| else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::DS; |
| |
| LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask |
| << "\n"); |
| |
| return InvertedMask; |
| } |
| |
| void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( |
| std::vector<SUnit>::reverse_iterator RIter) { |
| // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due |
| // to the instruction having side effects. |
| MachineInstr &SGB = *RIter->getInstr(); |
| assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); |
| int32_t SGMask = SGB.getOperand(0).getImm(); |
| int32_t Size = SGB.getOperand(1).getImm(); |
| int32_t SyncID = SGB.getOperand(2).getImm(); |
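| // For example, a SCHED_GROUP_BARRIER with operands (8, 1, 0) requests a |
| // group of one MFMA (mask 0x8) in sync pipeline 0. |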
| |
| auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, |
| Size, SyncID, DAG, TII); |
| |
| SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); |
| } |
| |
| bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { |
| IGLPStrategyID StrategyID = |
| (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); |
| auto S = createIGLPStrategy(StrategyID, DAG, TII); |
| if (!S->shouldApplyStrategy(DAG, Phase)) |
| return false; |
| |
| IsBottomUp = S->IsBottomUp; |
| return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase); |
| } |
| |
| } // namespace |
| |
| namespace llvm { |
| |
| /// \p Phase specifies whether or not this is a reentry into the |
| /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the |
| /// same scheduling region (e.g. pre and post-RA scheduling / multiple |
| /// scheduling "phases"), we can reenter this mutation framework more than once |
| /// for a given region. |
| std::unique_ptr<ScheduleDAGMutation> |
| createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { |
| return std::make_unique<IGroupLPDAGMutation>(Phase); |
| } |
| |
| } // end namespace llvm |