| //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // \file This file defines a set of schedule DAG mutations that can be used to |
| // override default scheduler behavior to enforce specific scheduling patterns. |
// They should be used in cases where runtime performance considerations, such
// as inter-wavefront interactions, mean that compile-time heuristics cannot
// predict the optimal instruction ordering, or in kernels where optimal
// instruction scheduling is important enough to warrant manual intervention.
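//
// As an illustrative sketch (not tied to any specific kernel), a pipeline of
// two DS reads followed by one MFMA, repeated, can be requested from source
// via the sched.group.barrier intrinsic, whose mask operand uses the
// SchedGroupMask bits defined below:
//
//   ; SchedGroup of up to 2 DS_READ ops (mask 1 << 8), SyncID 0
//   call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0)
//   ; SchedGroup of 1 MFMA op (mask 1 << 3), SyncID 0
//   call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)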
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "AMDGPUIGroupLP.h" |
| #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| #include "SIInstrInfo.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "llvm/ADT/BitmaskEnum.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/CodeGen/MachineScheduler.h" |
| #include "llvm/CodeGen/TargetOpcodes.h" |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "igrouplp" |
| |
| namespace { |
| |
| static cl::opt<bool> EnableExactSolver( |
| "amdgpu-igrouplp-exact-solver", cl::Hidden, |
| cl::desc("Whether to use the exponential time solver to fit " |
| "the instructions to the pipeline as closely as " |
| "possible."), |
| cl::init(false)); |
| |
| static cl::opt<unsigned> CutoffForExact( |
| "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden, |
| cl::desc("The maximum number of scheduling group conflicts " |
| "which we attempt to solve with the exponential time " |
| "exact solver. Problem sizes greater than this will" |
| "be solved by the less accurate greedy algorithm. Selecting " |
| "solver by size is superseded by manually selecting " |
| "the solver (e.g. by amdgpu-igrouplp-exact-solver")); |
| |
| static cl::opt<uint64_t> MaxBranchesExplored( |
| "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden, |
| cl::desc("The amount of branches that we are willing to explore with" |
| "the exact algorithm before giving up.")); |
| |
| static cl::opt<bool> UseCostHeur( |
| "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden, |
| cl::desc("Whether to use the cost heuristic to make choices as we " |
| "traverse the search space using the exact solver. Defaulted " |
| "to on, and if turned off, we will use the node order -- " |
| "attempting to put the later nodes in the later sched groups. " |
| "Experimentally, results are mixed, so this should be set on a " |
| "case-by-case basis.")); |
| |
// Components of the mask that determines which instruction types may be
// classified into a SchedGroup.
| enum class SchedGroupMask { |
| NONE = 0u, |
| ALU = 1u << 0, |
| VALU = 1u << 1, |
| SALU = 1u << 2, |
| MFMA = 1u << 3, |
| VMEM = 1u << 4, |
| VMEM_READ = 1u << 5, |
| VMEM_WRITE = 1u << 6, |
| DS = 1u << 7, |
| DS_READ = 1u << 8, |
| DS_WRITE = 1u << 9, |
| TRANS = 1u << 10, |
| ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS | |
| DS_READ | DS_WRITE | TRANS, |
| LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) |
| }; |
| |
| class SchedGroup; |
| |
// The InstructionRule class is used to enact a filter which determines whether
// or not an SU maps to a given SchedGroup. It contains complementary data
// structures (e.g. Cache) to help those filters.
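// For a concrete instance, see IsPipeAdd below, which overrides apply() to
// accept only V_ADD_F32 SUs.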
| class InstructionRule { |
| protected: |
| const SIInstrInfo *TII; |
| unsigned SGID; |
| // A cache made available to the Filter to store SUnits for subsequent |
| // invocations of the Filter |
| std::optional<SmallVector<SUnit *, 4>> Cache; |
| |
| public: |
  virtual bool
  apply(const SUnit *, const ArrayRef<SUnit *>,
        SmallVectorImpl<SchedGroup> &) {
    return true;
  }
| |
| InstructionRule(const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : TII(TII), SGID(SGID) { |
| if (NeedsCache) { |
| Cache = SmallVector<SUnit *, 4>(); |
| } |
| } |
| |
| virtual ~InstructionRule() = default; |
| }; |
| |
| using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>; |
| |
// Classify instructions into groups to enable fine-tuned control over the
// scheduler. These groups may be more specific than current SchedModel
// instruction classes.
| class SchedGroup { |
| private: |
| // Mask that defines which instruction types can be classified into this |
| // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER |
| // and SCHED_GROUP_BARRIER. |
| SchedGroupMask SGMask; |
| |
| // Maximum number of SUnits that can be added to this group. |
| std::optional<unsigned> MaxSize; |
| |
| // SchedGroups will only synchronize with other SchedGroups that have the same |
| // SyncID. |
| int SyncID = 0; |
| |
| // SGID is used to map instructions to candidate SchedGroups |
| unsigned SGID; |
| |
| // The different rules each instruction in this SchedGroup must conform to |
| SmallVector<std::shared_ptr<InstructionRule>, 4> Rules; |
| |
| // Count of the number of created SchedGroups, used to initialize SGID. |
| static unsigned NumSchedGroups; |
| |
  // Try to add an edge from SU A to SU B.
| bool tryAddEdge(SUnit *A, SUnit *B); |
| |
| // Use SGMask to determine whether we can classify MI as a member of this |
| // SchedGroup object. |
| bool canAddMI(const MachineInstr &MI) const; |
| |
| public: |
| // Collection of SUnits that are classified as members of this group. |
| SmallVector<SUnit *, 32> Collection; |
| |
| ScheduleDAGInstrs *DAG; |
| const SIInstrInfo *TII; |
| |
| // Returns true if SU can be added to this SchedGroup. |
| bool canAddSU(SUnit &SU) const; |
| |
  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
  // If MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
| void link(SUnit &SU, bool MakePred = false); |
| |
  // Add DAG dependencies and track which edges are added, as well as the
  // count of missed edges.
| int link(SUnit &SU, bool MakePred, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); |
| |
| // Add DAG dependencies from all SUnits in this SchedGroup and this SU. |
| // Use the predicate to determine whether SU should be a predecessor (P = |
| // true) or a successor (P = false) of this SchedGroup. |
| void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P); |
| |
| // Add DAG dependencies such that SUnits in this group shall be ordered |
| // before SUnits in OtherGroup. |
| void link(SchedGroup &OtherGroup); |
| |
| // Returns true if no more instructions may be added to this group. |
| bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; } |
| |
| // Append a constraint that SUs must meet in order to fit into this |
| // SchedGroup. Since many rules involve the relationship between a SchedGroup |
| // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve |
  // time (rather than SchedGroup init time).
| void addRule(std::shared_ptr<InstructionRule> NewRule) { |
| Rules.push_back(NewRule); |
| } |
| |
| // Returns true if the SU matches all rules |
| bool allowedByRules(const SUnit *SU, |
| SmallVectorImpl<SchedGroup> &SyncPipe) const { |
| for (auto &Rule : Rules) { |
| if (!Rule->apply(SU, Collection, SyncPipe)) |
| return false; |
| } |
| return true; |
| } |
| |
| // Add SU to the SchedGroup. |
| void add(SUnit &SU) { |
| LLVM_DEBUG(dbgs() << "For SchedGroup with mask " |
| << format_hex((int)SGMask, 10, true) << " adding " |
| << *SU.getInstr()); |
| Collection.push_back(&SU); |
| } |
| |
| // Remove last element in the SchedGroup |
| void pop() { Collection.pop_back(); } |
| |
| // Identify and add all relevant SUs from the DAG to this SchedGroup. |
| void initSchedGroup(); |
| |
| // Add instructions to the SchedGroup bottom up starting from RIter. |
| // PipelineInstrs is a set of instructions that should not be added to the |
| // SchedGroup even when the other conditions for adding it are satisfied. |
| // RIter will be added to the SchedGroup as well, and dependencies will be |
| // added so that RIter will always be scheduled at the end of the group. |
| void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, |
| SUnitsToCandidateSGsMap &SyncedInstrs); |
| |
| void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs); |
| |
| int getSyncID() { return SyncID; } |
| |
| int getSGID() { return SGID; } |
| |
| SchedGroupMask getMask() { return SGMask; } |
| |
| SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, |
| ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) { |
| SGID = NumSchedGroups++; |
| } |
| |
| SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID, |
| ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) { |
| SGID = NumSchedGroups++; |
| } |
| }; |
| |
| using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>; |
| using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>; |
| |
| // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline |
| // in non-trivial cases. For example, if the requested pipeline is |
| // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction |
// in the DAG, then we will have an instruction that cannot be trivially
// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
// to find a good solution to the pipeline -- a greedy algorithm and an exact
// algorithm. The exact algorithm has an exponential time complexity and should
// only be used for small problems, or for medium-sized problems where an
// exact solution is highly desired.
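// As a worked illustration of the example above: a lone VMEM_READ SU has two
// candidate SchedGroups (the first and the last). Assigning it to the first
// orders it before the VALU and MFMA groups; assigning it to the last orders
// it after them. The solver picks whichever assignment misses the fewest
// pipeline edges.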
| class PipelineSolver { |
| [[maybe_unused]] ScheduleDAGMI *DAG; |
| |
| // Instructions that can be assigned to multiple SchedGroups |
| DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; |
| SmallVector<SUsToCandSGsVec, 4> PipelineInstrs; |
| DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; |
| // The current working pipeline |
| SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline; |
| // The pipeline that has the best solution found so far |
| SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline; |
| |
| // Whether or not we actually have any SyncedInstrs to try to solve. |
| bool NeedsSolver = false; |
| |
  // Compute an estimate of the size of the search tree -- the true size is
  // the product of each conflictedInst.Matches.size() across all SyncPipelines
| unsigned computeProblemSize(); |
| |
| // The cost penalty of not assigning a SU to a SchedGroup |
| int MissPenalty = 0; |
| |
| // Costs in terms of the number of edges we are unable to add |
| int BestCost = -1; |
| int CurrCost = 0; |
| |
| // Index pointing to the conflicting instruction that is currently being |
| // fitted |
| int CurrConflInstNo = 0; |
| // Index to the pipeline that is currently being fitted |
| int CurrSyncGroupIdx = 0; |
  // Index of the first non-trivial pipeline
| int BeginSyncGroupIdx = 0; |
| |
| // How many branches we have explored |
| uint64_t BranchesExplored = 0; |
| |
| // The direction in which we process the candidate SchedGroups per SU |
| bool IsBottomUp = true; |
| |
| // Update indices to fit next conflicting instruction |
| void advancePosition(); |
| // Recede indices to attempt to find better fit for previous conflicting |
| // instruction |
| void retreatPosition(); |
| |
| // The exponential time algorithm which finds the provably best fit |
| bool solveExact(); |
| // The polynomial time algorithm which attempts to find a good fit |
| bool solveGreedy(); |
| // Find the best SchedGroup for the current SU using the heuristic given all |
| // current information. One step in the greedy algorithm. Templated against |
| // the SchedGroup iterator (either reverse or forward). |
| template <typename T> |
| void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, |
| T E); |
| // Whether or not the current solution is optimal |
| bool checkOptimal(); |
  // Populate the ready list, prioritizing fewest missed edges first
| // Templated against the SchedGroup iterator (either reverse or forward). |
| template <typename T> |
| void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, |
| T E); |
| // Add edges corresponding to the SchedGroups as assigned by solver |
| void makePipeline(); |
| // Link the SchedGroups in the best found pipeline. |
  // Templated against the SchedGroup iterator (either reverse or forward).
| template <typename T> void linkSchedGroups(T I, T E); |
  // Add the edges from the SU to the other SchedGroups in the pipeline, and
  // return the number of edges missed.
| int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); |
| /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It |
| /// returns the cost (in terms of missed pipeline edges), and tracks the edges |
| /// added in \p AddedEdges |
| template <typename T> |
| int linkSUnit(SUnit *SU, int SGID, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E); |
| /// Remove the edges passed via \p AddedEdges |
| void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); |
| // Convert the passed in maps to arrays for bidirectional iterators |
| void convertSyncMapsToArrays(); |
| |
| void reset(); |
| |
| public: |
  // Invoke the solver to map instructions to instruction groups. The
  // command-line options and a problem-size heuristic determine whether to
  // use the exact or the greedy algorithm.
| void solve(); |
| |
| PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| ScheduleDAGMI *DAG, bool IsBottomUp = true) |
| : DAG(DAG), SyncedInstrs(SyncedInstrs), |
| SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) { |
| |
| for (auto &PipelineInstrs : SyncedInstrs) { |
| if (PipelineInstrs.second.size() > 0) { |
| NeedsSolver = true; |
| break; |
| } |
| } |
| |
| if (!NeedsSolver) |
| return; |
| |
| convertSyncMapsToArrays(); |
| |
| CurrPipeline = BestPipeline; |
| |
| while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() && |
| PipelineInstrs[BeginSyncGroupIdx].size() == 0) |
| ++BeginSyncGroupIdx; |
| |
| if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size()) |
| return; |
| } |
| }; |
| |
| void PipelineSolver::reset() { |
| |
| for (auto &SyncPipeline : CurrPipeline) { |
| for (auto &SG : SyncPipeline) { |
| SmallVector<SUnit *, 32> TempCollection = SG.Collection; |
| SG.Collection.clear(); |
| auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { |
| return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; |
| }); |
| if (SchedBarr != TempCollection.end()) |
| SG.Collection.push_back(*SchedBarr); |
| } |
| } |
| |
| CurrSyncGroupIdx = BeginSyncGroupIdx; |
| CurrConflInstNo = 0; |
| CurrCost = 0; |
| } |
| |
| void PipelineSolver::convertSyncMapsToArrays() { |
| for (auto &SyncPipe : SyncedSchedGroups) { |
| BestPipeline.insert(BestPipeline.begin(), SyncPipe.second); |
| } |
| |
| int PipelineIDx = SyncedInstrs.size() - 1; |
| PipelineInstrs.resize(SyncedInstrs.size()); |
| for (auto &SyncInstrMap : SyncedInstrs) { |
| for (auto &SUsToCandSGs : SyncInstrMap.second) { |
| if (PipelineInstrs[PipelineIDx].size() == 0) { |
| PipelineInstrs[PipelineIDx].push_back( |
| std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); |
| continue; |
| } |
| auto *SortPosition = PipelineInstrs[PipelineIDx].begin(); |
| // Insert them in sorted order -- this allows for good parsing order in |
| // the greedy algorithm |
| while (SortPosition != PipelineInstrs[PipelineIDx].end() && |
| SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum) |
| ++SortPosition; |
| PipelineInstrs[PipelineIDx].insert( |
| SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); |
| } |
| --PipelineIDx; |
| } |
| } |
| |
| template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) { |
| for (; I != E; ++I) { |
| auto &GroupA = *I; |
| for (auto J = std::next(I); J != E; ++J) { |
| auto &GroupB = *J; |
| GroupA.link(GroupB); |
| } |
| } |
| } |
| |
| void PipelineSolver::makePipeline() { |
  // Preserve the order of barriers for subsequent SchedGroupBarrier mutations
| for (auto &SyncPipeline : BestPipeline) { |
| LLVM_DEBUG(dbgs() << "Printing SchedGroups\n"); |
| for (auto &SG : SyncPipeline) { |
| LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID() |
| << " has: \n"); |
| SUnit *SGBarr = nullptr; |
| for (auto &SU : SG.Collection) { |
| if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) |
| SGBarr = SU; |
| LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n"); |
| } |
      // Command-line-requested IGroupLP pipelines don't have an SGBarr
| if (!SGBarr) |
| continue; |
| SG.link(*SGBarr, false); |
| } |
| } |
| |
| for (auto &SyncPipeline : BestPipeline) { |
| IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend()) |
| : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end()); |
| } |
| } |
| |
| template <typename T> |
| int PipelineSolver::linkSUnit( |
| SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, |
| T I, T E) { |
| bool MakePred = false; |
| int AddedCost = 0; |
| for (; I < E; ++I) { |
| if (I->getSGID() == SGID) { |
| MakePred = true; |
| continue; |
| } |
| auto Group = *I; |
| AddedCost += Group.link(*SU, MakePred, AddedEdges); |
| assert(AddedCost >= 0); |
| } |
| return AddedCost; |
| } |
| |
| int PipelineSolver::addEdges( |
| SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { |
| |
| // For IsBottomUp, the first SchedGroup in SyncPipeline contains the |
| // instructions that are the ultimate successors in the resultant mutation. |
| // Therefore, in such a configuration, the SchedGroups occurring before the |
| // candidate SGID are successors of the candidate SchedGroup, thus the current |
| // SU should be linked as a predecessor to SUs in those SchedGroups. The |
  // opposite is true if !IsBottomUp. IsBottomUp is used when there are
  // multiple SCHED_GROUP_BARRIERs, or when a user specifies IGLP_OPT
  // SchedGroups that are constructed bottom-up (in reverse).
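  // For example, in a bottom-up pipeline {SG0, SG1, SG2} (where SG0 holds the
  // ultimate successors), assigning SU to SG1 links SU as a successor of the
  // SUs in SG2 and as a predecessor of the SUs in SG0.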
| return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(), |
| SyncPipeline.rend()) |
| : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(), |
| SyncPipeline.end()); |
| } |
| |
| void PipelineSolver::removeEdges( |
| const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { |
| // Only remove the edges that we have added when testing |
| // the fit. |
| for (auto &PredSuccPair : EdgesToRemove) { |
| SUnit *Pred = PredSuccPair.first; |
| SUnit *Succ = PredSuccPair.second; |
| |
| auto *Match = llvm::find_if( |
| Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); |
| if (Match != Succ->Preds.end()) { |
| assert(Match->isArtificial()); |
| Succ->removePred(*Match); |
| } |
| } |
| } |
| |
| void PipelineSolver::advancePosition() { |
| ++CurrConflInstNo; |
| |
| if (static_cast<size_t>(CurrConflInstNo) >= |
| PipelineInstrs[CurrSyncGroupIdx].size()) { |
| CurrConflInstNo = 0; |
| ++CurrSyncGroupIdx; |
| // Advance to next non-trivial pipeline |
| while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && |
| PipelineInstrs[CurrSyncGroupIdx].size() == 0) |
| ++CurrSyncGroupIdx; |
| } |
| } |
| |
| void PipelineSolver::retreatPosition() { |
| assert(CurrConflInstNo >= 0); |
| assert(CurrSyncGroupIdx >= 0); |
| |
| if (CurrConflInstNo > 0) { |
| --CurrConflInstNo; |
| return; |
| } |
| |
| if (CurrConflInstNo == 0) { |
| // If we return to the starting position, we have explored |
| // the entire tree |
| if (CurrSyncGroupIdx == BeginSyncGroupIdx) |
| return; |
| |
| --CurrSyncGroupIdx; |
| // Go to previous non-trivial pipeline |
| while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) |
| --CurrSyncGroupIdx; |
| |
| CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; |
| } |
| } |
| |
| bool PipelineSolver::checkOptimal() { |
| if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { |
| if (BestCost == -1 || CurrCost < BestCost) { |
| BestPipeline = CurrPipeline; |
| BestCost = CurrCost; |
| LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n"); |
| } |
| assert(BestCost >= 0); |
| } |
| |
| bool DoneExploring = false; |
| if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) |
| DoneExploring = true; |
| |
| return (DoneExploring || BestCost == 0); |
| } |
| |
| template <typename T> |
| void PipelineSolver::populateReadyList( |
| SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) { |
| SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
| auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; |
| assert(CurrSU.second.size() >= 1); |
| |
| for (; I != E; ++I) { |
| std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
| int CandSGID = *I; |
| SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { |
| return SG.getSGID() == CandSGID; |
| }); |
| assert(Match); |
| |
| if (UseCostHeur) { |
| if (Match->isFull()) { |
| ReadyList.push_back(std::pair(*I, MissPenalty)); |
| continue; |
| } |
| |
| int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); |
| ReadyList.push_back(std::pair(*I, TempCost)); |
| removeEdges(AddedEdges); |
| } else |
| ReadyList.push_back(std::pair(*I, -1)); |
| } |
| |
| if (UseCostHeur) { |
| std::sort(ReadyList.begin(), ReadyList.end(), |
| [](std::pair<int, int> A, std::pair<int, int> B) { |
| return A.second < B.second; |
| }); |
| } |
| |
| assert(ReadyList.size() == CurrSU.second.size()); |
| } |
| |
| bool PipelineSolver::solveExact() { |
| if (checkOptimal()) |
| return true; |
| |
| if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) |
| return false; |
| |
| assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); |
| assert(static_cast<size_t>(CurrConflInstNo) < |
| PipelineInstrs[CurrSyncGroupIdx].size()); |
| SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
| LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum |
| << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); |
| |
| // SchedGroup -> Cost pairs |
| SmallVector<std::pair<int, int>, 4> ReadyList; |
| // Prioritize the candidate sched groups in terms of lowest cost first |
| IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(), |
| CurrSU.second.rend()) |
| : populateReadyList(ReadyList, CurrSU.second.begin(), |
| CurrSU.second.end()); |
| |
| auto *I = ReadyList.begin(); |
| auto *E = ReadyList.end(); |
| for (; I != E; ++I) { |
| // If we are trying SGs in least cost order, and the current SG is cost |
| // infeasible, then all subsequent SGs will also be cost infeasible, so we |
| // can prune. |
| if (BestCost != -1 && (CurrCost + I->second > BestCost)) |
| return false; |
| |
| int CandSGID = I->first; |
| int AddedCost = 0; |
| std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
| auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; |
    SchedGroup *Match = nullptr;
    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)
        Match = &SG;
    }
    // Every candidate SGID must correspond to a SchedGroup in the pipeline.
    assert(Match);
| |
| if (Match->isFull()) |
| continue; |
| |
| if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) |
| continue; |
| |
| LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask " |
| << (int)Match->getMask() << "and ID " << CandSGID |
| << "\n"); |
| Match->add(*CurrSU.first); |
| AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); |
| LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n"); |
| CurrCost += AddedCost; |
| advancePosition(); |
| ++BranchesExplored; |
| bool FinishedExploring = false; |
| // If the Cost after adding edges is greater than a known solution, |
| // backtrack |
| if (CurrCost < BestCost || BestCost == -1) { |
| if (solveExact()) { |
| FinishedExploring = BestCost != 0; |
| if (!FinishedExploring) |
| return true; |
| } |
| } |
| |
| retreatPosition(); |
| CurrCost -= AddedCost; |
| removeEdges(AddedEdges); |
| Match->pop(); |
| CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; |
| if (FinishedExploring) |
| return true; |
| } |
| |
| // Try the pipeline where the current instruction is omitted |
| // Potentially if we omit a problematic instruction from the pipeline, |
| // all the other instructions can nicely fit. |
| CurrCost += MissPenalty; |
| advancePosition(); |
| |
| LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n"); |
| |
| bool FinishedExploring = false; |
| if (CurrCost < BestCost || BestCost == -1) { |
| if (solveExact()) { |
      FinishedExploring = BestCost != 0;
| if (!FinishedExploring) |
| return true; |
| } |
| } |
| |
| retreatPosition(); |
| CurrCost -= MissPenalty; |
| return FinishedExploring; |
| } |
| |
| template <typename T> |
| void PipelineSolver::greedyFind( |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) { |
| SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
| int BestNodeCost = -1; |
| int TempCost; |
| SchedGroup *BestGroup = nullptr; |
| int BestGroupID = -1; |
| auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; |
| LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum |
| << ") in Pipeline # " << CurrSyncGroupIdx << "\n"); |
| |
| // Since we have added the potential SchedGroups from bottom up, but |
| // traversed the DAG from top down, parse over the groups from last to |
| // first. If we fail to do this for the greedy algorithm, the solution will |
| // likely not be good in more complex cases. |
| for (; I != E; ++I) { |
| std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
| int CandSGID = *I; |
| SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) { |
| return SG.getSGID() == CandSGID; |
| }); |
| assert(Match); |
| |
| LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask " |
| << (int)Match->getMask() << "\n"); |
| |
| if (Match->isFull()) { |
| LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n"); |
| continue; |
| } |
| if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) { |
| LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n"); |
| continue; |
| } |
| TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); |
| LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n"); |
| if (TempCost < BestNodeCost || BestNodeCost == -1) { |
| BestGroup = Match; |
| BestNodeCost = TempCost; |
| BestGroupID = CandSGID; |
| } |
| removeEdges(AddedEdges); |
| if (BestNodeCost == 0) |
| break; |
| } |
| |
| if (BestGroupID != -1) { |
| BestGroup->add(*CurrSU.first); |
| addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); |
| LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" |
| << (int)BestGroup->getMask() << "\n"); |
| BestCost += TempCost; |
| } else |
| BestCost += MissPenalty; |
| |
| CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; |
| } |
| |
| bool PipelineSolver::solveGreedy() { |
| BestCost = 0; |
| std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; |
| |
| while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { |
| SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; |
| IsBottomUp |
| ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend()) |
| : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end()); |
| advancePosition(); |
| } |
| BestPipeline = CurrPipeline; |
| removeEdges(AddedEdges); |
| return false; |
| } |
| |
| unsigned PipelineSolver::computeProblemSize() { |
| unsigned ProblemSize = 0; |
| for (auto &PipeConflicts : PipelineInstrs) { |
| ProblemSize += PipeConflicts.size(); |
| } |
| |
| return ProblemSize; |
| } |
| |
| void PipelineSolver::solve() { |
| if (!NeedsSolver) |
| return; |
| |
| unsigned ProblemSize = computeProblemSize(); |
| assert(ProblemSize > 0); |
| |
| bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; |
| MissPenalty = (ProblemSize / 2) + 1; |
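  // E.g. a ProblemSize of 7 yields MissPenalty = 4, so leaving a single SU
  // out of the pipeline costs more than missing several individual edges.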
| |
| LLVM_DEBUG(DAG->dump()); |
| if (EnableExactSolver || BelowCutoff) { |
| LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n"); |
| solveGreedy(); |
| reset(); |
| LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n"); |
| if (BestCost > 0) { |
| LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n"); |
| solveExact(); |
| LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n"); |
| } |
| } else { // Use the Greedy Algorithm by default |
| LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n"); |
| solveGreedy(); |
| } |
| |
| makePipeline(); |
| LLVM_DEBUG(dbgs() << "After applying mutation\n"); |
| LLVM_DEBUG(DAG->dump()); |
| } |
| |
| enum IGLPStrategyID : int { |
| MFMASmallGemmOptID = 0, |
| MFMASmallGemmSingleWaveOptID = 1, |
| MFMAExpInterleaveID = 2, |
| MFMAExpSimpleInterleaveID = 3 |
| }; |
| |
// Implements an IGLP scheduling strategy.
| class IGLPStrategy { |
| protected: |
| ScheduleDAGInstrs *DAG; |
| |
| const SIInstrInfo *TII; |
| |
| public: |
| /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy. |
| virtual bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) = 0; |
| |
| // Returns true if this strategy should be applied to a ScheduleDAG. |
| virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) = 0; |
| |
| bool IsBottomUp = true; |
| |
| IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : DAG(DAG), TII(TII) {} |
| |
| virtual ~IGLPStrategy() = default; |
| }; |
| |
| class MFMASmallGemmOpt final : public IGLPStrategy { |
public:
| bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) override { |
| return true; |
| } |
| |
| MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : IGLPStrategy(DAG, TII) { |
| IsBottomUp = true; |
| } |
| }; |
| |
| bool MFMASmallGemmOpt::applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) { |
| // Count the number of MFMA instructions. |
| unsigned MFMACount = 0; |
| for (const MachineInstr &I : *DAG) |
| if (TII->isMFMAorWMMA(I)) |
| ++MFMACount; |
| |
| const unsigned PipelineSyncID = 0; |
| SchedGroup *SG = nullptr; |
| for (unsigned I = 0; I < MFMACount * 3; ++I) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| return true; |
| } |
| |
| class MFMAExpInterleaveOpt final : public IGLPStrategy { |
| private: |
| // The count of TRANS SUs involved in the interleaved pipeline |
| static unsigned TransPipeCount; |
| // The count of MFMA SUs involved in the interleaved pipeline |
| static unsigned MFMAPipeCount; |
| // The count of Add SUs involved in the interleaved pipeline |
| static unsigned AddPipeCount; |
| // The number of transitive MFMA successors for each TRANS SU |
| static unsigned MFMAEnablement; |
| // The number of transitive TRANS predecessors for each MFMA SU |
| static unsigned ExpRequirement; |
| // The count of independent "chains" of MFMA instructions in the pipeline |
| static unsigned MFMAChains; |
| // The length of each independent "chain" of MFMA instructions |
| static unsigned MFMAChainLength; |
| // Whether or not the pipeline has V_CVT instructions |
| static bool HasCvt; |
| // Whether or not there are instructions between the TRANS instruction and |
| // V_CVT |
| static bool HasChainBetweenCvt; |
  // The first occurring DS_READ which feeds an MFMA chain
| static std::optional<unsigned> FirstPipeDSR; |
| // The MFMAPipe SUs with no MFMA predecessors |
| SmallVector<SUnit *, 4> MFMAChainSeeds; |
  // Compute the heuristics for the pipeline, returning whether or not the DAG
  // is well-formed for the mutation
| bool analyzeDAG(const SIInstrInfo *TII); |
| |
| /// Whether or not the instruction is a transitive predecessor of an MFMA |
| /// instruction |
| class IsPipeExp final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| |
| auto *DAG = SyncPipe[0].DAG; |
| |
| if (Cache->empty()) { |
| auto I = DAG->SUnits.rbegin(); |
| auto E = DAG->SUnits.rend(); |
| for (; I != E; I++) { |
| if (TII->isMFMAorWMMA(*I->getInstr())) |
| Cache->push_back(&*I); |
| } |
| if (Cache->empty()) |
| return false; |
| } |
| |
| auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) { |
| return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU)); |
| }); |
| |
| return Reaches; |
| } |
| IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
  /// Whether or not the instruction is a transitive predecessor of the
  /// \p Number th MFMA of the MFMAs occurring after a TRANS instruction
| class EnablesNthMFMA final : public InstructionRule { |
| private: |
| unsigned Number = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| bool FoundTrans = false; |
| unsigned Counter = 1; |
| auto *DAG = SyncPipe[0].DAG; |
| |
| if (Cache->empty()) { |
| SmallVector<SUnit *, 8> Worklist; |
| |
| auto I = DAG->SUnits.begin(); |
| auto E = DAG->SUnits.end(); |
| for (; I != E; I++) { |
| if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) { |
| if (Counter == Number) { |
| Cache->push_back(&*I); |
| break; |
| } |
| ++Counter; |
| } |
| if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode())) |
| FoundTrans = true; |
| } |
| if (Cache->empty()) |
| return false; |
| } |
| |
| return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); |
| } |
| |
| EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} |
| }; |
| |
| /// Whether or not the instruction enables the exact MFMA that is the \p |
| /// Number th MFMA in the chain starting with \p ChainSeed |
| class EnablesNthMFMAInChain final : public InstructionRule { |
| private: |
| unsigned Number = 1; |
| SUnit *ChainSeed; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| auto *DAG = SyncPipe[0].DAG; |
| |
| if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) |
| return false; |
| |
| if (Cache->empty()) { |
| auto *TempSU = ChainSeed; |
| auto Depth = Number; |
| while (Depth > 0) { |
| --Depth; |
| bool Found = false; |
| for (auto &Succ : TempSU->Succs) { |
| if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { |
| TempSU = Succ.getSUnit(); |
| Found = true; |
| break; |
| } |
| } |
| if (!Found) |
| return false; |
| } |
| |
| Cache->push_back(TempSU); |
| } |
| // If we failed to find the instruction to be placed into the cache, we |
| // would have already exited. |
| assert(!Cache->empty()); |
| |
| return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU)); |
| } |
| |
| EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed, |
| const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Number(Number), |
| ChainSeed(ChainSeed) {} |
| }; |
| |
  /// Whether or not the instruction has fewer than \p Size immediate
  /// successors. If \p HasIntermediary is true, this also tests whether all
  /// successors of the SUnit have fewer than \p Size successors.
| class LessThanNSuccs final : public InstructionRule { |
| private: |
| unsigned Size = 1; |
| bool HasIntermediary = false; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| if (!SyncPipe.size()) |
| return false; |
| |
| auto SuccSize = std::count_if( |
| SU->Succs.begin(), SU->Succs.end(), |
| [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); |
| if (SuccSize >= Size) |
| return false; |
| |
| if (HasIntermediary) { |
| for (auto Succ : SU->Succs) { |
| auto SuccSize = std::count_if( |
| Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), |
| [](const SDep &SuccSucc) { |
| return SuccSucc.getKind() == SDep::Data; |
| }); |
| if (SuccSize >= Size) |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID, |
| bool HasIntermediary = false, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Size(Size), |
| HasIntermediary(HasIntermediary) {} |
| }; |
| |
  /// Whether or not the instruction has at least \p Size immediate
  /// successors. If \p HasIntermediary is true, this also tests whether any
  /// successor of the SUnit has at least \p Size successors.
| class GreaterThanOrEqualToNSuccs final : public InstructionRule { |
| private: |
| unsigned Size = 1; |
| bool HasIntermediary = false; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| if (!SyncPipe.size()) |
| return false; |
| |
| auto SuccSize = std::count_if( |
| SU->Succs.begin(), SU->Succs.end(), |
| [](const SDep &Succ) { return Succ.getKind() == SDep::Data; }); |
| if (SuccSize >= Size) |
| return true; |
| |
| if (HasIntermediary) { |
| for (auto Succ : SU->Succs) { |
| auto SuccSize = std::count_if( |
| Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(), |
| [](const SDep &SuccSucc) { |
| return SuccSucc.getKind() == SDep::Data; |
| }); |
| if (SuccSize >= Size) |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII, |
| unsigned SGID, bool HasIntermediary = false, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Size(Size), |
| HasIntermediary(HasIntermediary) {} |
| }; |
| |
| // Whether or not the instruction is a relevant V_CVT instruction. |
| class IsCvt final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| auto Opc = SU->getInstr()->getOpcode(); |
| return Opc == AMDGPU::V_CVT_F16_F32_e32 || |
| Opc == AMDGPU::V_CVT_I32_F32_e32; |
| } |
| IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether or not the instruction is FMA_F32. |
| class IsFMA final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 || |
| SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32; |
| } |
| IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether or not the instruction is a V_ADD_F32 instruction. |
| class IsPipeAdd final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32; |
| } |
| IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| /// Whether or not the instruction is an immediate RAW successor |
| /// of the SchedGroup \p Distance steps before. |
| class IsSuccOfPrevNthGroup final : public InstructionRule { |
| private: |
| unsigned Distance = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| SchedGroup *OtherGroup = nullptr; |
| if (!SyncPipe.size()) |
| return false; |
| |
| for (auto &PipeSG : SyncPipe) { |
| if ((unsigned)PipeSG.getSGID() == SGID - Distance) |
| OtherGroup = &PipeSG; |
| } |
| |
| if (!OtherGroup) |
| return false; |
| if (!OtherGroup->Collection.size()) |
| return true; |
| |
| for (auto &OtherEle : OtherGroup->Collection) { |
| for (auto &Succ : OtherEle->Succs) { |
| if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data) |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, |
| unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} |
| }; |
| |
  /// Whether or not the instruction is a transitive successor of any
  /// instruction in the SchedGroup \p Distance steps before.
| class IsReachableFromPrevNthGroup final : public InstructionRule { |
| private: |
| unsigned Distance = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| SchedGroup *OtherGroup = nullptr; |
| if (!SyncPipe.size()) |
| return false; |
| |
| for (auto &PipeSG : SyncPipe) { |
| if ((unsigned)PipeSG.getSGID() == SGID - Distance) |
| OtherGroup = &PipeSG; |
| } |
| |
| if (!OtherGroup) |
| return false; |
| if (!OtherGroup->Collection.size()) |
| return true; |
| |
| auto *DAG = SyncPipe[0].DAG; |
| |
| for (auto &OtherEle : OtherGroup->Collection) |
| if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle)) |
| return true; |
| |
| return false; |
| } |
| IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, |
| unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} |
| }; |
| |
  /// Whether or not the instruction occurs at or after the SU with NodeNum
  /// \p Number
| class OccursAtOrAfterNode final : public InstructionRule { |
| private: |
| unsigned Number = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| |
| return SU->NodeNum >= Number; |
| } |
| OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Number(Number) {} |
| }; |
| |
| /// Whether or not the SU is exactly the \p Number th MFMA in the chain |
| /// starting with \p ChainSeed |
| class IsExactMFMA final : public InstructionRule { |
| private: |
| unsigned Number = 1; |
| SUnit *ChainSeed; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr())) |
| return false; |
| |
| if (Cache->empty()) { |
| auto *TempSU = ChainSeed; |
| auto Depth = Number; |
| while (Depth > 0) { |
| --Depth; |
| bool Found = false; |
| for (auto &Succ : TempSU->Succs) { |
| if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) { |
| TempSU = Succ.getSUnit(); |
| Found = true; |
| break; |
| } |
| } |
| if (!Found) { |
| return false; |
| } |
| } |
| Cache->push_back(TempSU); |
| } |
| // If we failed to find the instruction to be placed into the cache, we |
| // would have already exited. |
| assert(!Cache->empty()); |
| |
| return (*Cache)[0] == SU; |
| } |
| |
| IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII, |
| unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Number(Number), |
| ChainSeed(ChainSeed) {} |
| }; |
| |
  // Whether the instruction occurs after the first TRANS instruction. This
  // implies the instruction cannot be a predecessor of the first TRANS
  // instruction
| class OccursAfterExp final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| |
| SmallVector<SUnit *, 12> Worklist; |
| auto *DAG = SyncPipe[0].DAG; |
| if (Cache->empty()) { |
| for (auto &SU : DAG->SUnits) |
| if (TII->isTRANS(SU.getInstr()->getOpcode())) { |
| Cache->push_back(&SU); |
| break; |
| } |
| if (Cache->empty()) |
| return false; |
| } |
| |
| return SU->NodeNum > (*Cache)[0]->NodeNum; |
| } |
| |
| OccursAfterExp(const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| public: |
| bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : IGLPStrategy(DAG, TII) { |
| IsBottomUp = false; |
| } |
| }; |
| |
| unsigned MFMAExpInterleaveOpt::TransPipeCount = 0; |
| unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0; |
| unsigned MFMAExpInterleaveOpt::AddPipeCount = 0; |
| unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0; |
| unsigned MFMAExpInterleaveOpt::ExpRequirement = 0; |
| unsigned MFMAExpInterleaveOpt::MFMAChains = 0; |
| unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0; |
| bool MFMAExpInterleaveOpt::HasCvt = false; |
| bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false; |
| std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt; |
| |
| bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) { |
| SmallVector<SUnit *, 10> ExpPipeCands; |
| SmallVector<SUnit *, 10> MFMAPipeCands; |
| SmallVector<SUnit *, 10> MFMAPipeSUs; |
| SmallVector<SUnit *, 10> PackSUs; |
| SmallVector<SUnit *, 10> CvtSUs; |
| |
| auto isBitPack = [](unsigned Opc) { |
| return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64; |
| }; |
| |
| auto isCvt = [](unsigned Opc) { |
| return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32; |
| }; |
| |
| auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; }; |
| |
| AddPipeCount = 0; |
| for (SUnit &SU : DAG->SUnits) { |
| auto Opc = SU.getInstr()->getOpcode(); |
| if (TII->isTRANS(Opc)) { |
      // Avoid counting a potential bonus V_EXP which all the MFMAs depend on
| if (SU.Succs.size() >= 7) |
| continue; |
| for (auto &Succ : SU.Succs) { |
| if (Succ.getSUnit()->Succs.size() >= 7) |
| continue; |
| } |
| ExpPipeCands.push_back(&SU); |
| } |
| |
| if (TII->isMFMAorWMMA(*SU.getInstr())) |
| MFMAPipeCands.push_back(&SU); |
| |
| if (isBitPack(Opc)) |
| PackSUs.push_back(&SU); |
| |
| if (isCvt(Opc)) |
| CvtSUs.push_back(&SU); |
| |
| if (isAdd(Opc)) |
| ++AddPipeCount; |
| } |
| |
| if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size())) |
| return false; |
| |
| TransPipeCount = 0; |
| |
| std::optional<SUnit *> TempMFMA; |
| std::optional<SUnit *> TempExp; |
| // Count the number of EXPs that reach an MFMA |
| for (auto &PredSU : ExpPipeCands) { |
| for (auto &SuccSU : MFMAPipeCands) { |
| if (DAG->IsReachable(SuccSU, PredSU)) { |
| if (!TempExp) { |
| TempExp = PredSU; |
| TempMFMA = SuccSU; |
| } |
| MFMAPipeSUs.push_back(SuccSU); |
| ++TransPipeCount; |
| break; |
| } |
| } |
| } |
| |
| if (!(TempExp && TempMFMA)) |
| return false; |
| |
| HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) { |
| return isCvt(Succ.getSUnit()->getInstr()->getOpcode()); |
| }); |
| |
| // Count the number of MFMAs that are reached by an EXP |
| for (auto &SuccSU : MFMAPipeCands) { |
| if (MFMAPipeSUs.size() && |
| any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) { |
| return PotentialMatch->NodeNum == SuccSU->NodeNum; |
| })) |
| continue; |
| |
| for (auto &PredSU : ExpPipeCands) { |
| if (DAG->IsReachable(SuccSU, PredSU)) { |
| MFMAPipeSUs.push_back(SuccSU); |
| break; |
| } |
| } |
| } |
| |
| MFMAPipeCount = MFMAPipeSUs.size(); |
| |
| assert(TempExp && TempMFMA); |
| assert(MFMAPipeCount > 0); |
| |
| std::optional<SUnit *> TempCvt; |
| for (auto &SuccSU : CvtSUs) { |
| if (DAG->IsReachable(SuccSU, *TempExp)) { |
| TempCvt = SuccSU; |
| break; |
| } |
| } |
| |
| HasCvt = false; |
| if (TempCvt.has_value()) { |
| for (auto &SuccSU : MFMAPipeSUs) { |
| if (DAG->IsReachable(SuccSU, *TempCvt)) { |
| HasCvt = true; |
| break; |
| } |
| } |
| } |
| |
| MFMAChains = 0; |
| for (auto &MFMAPipeSU : MFMAPipeSUs) { |
| if (is_contained(MFMAChainSeeds, MFMAPipeSU)) |
| continue; |
| if (none_of(MFMAPipeSU->Preds, [&TII](SDep &Succ) { |
| return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); |
| })) { |
| MFMAChainSeeds.push_back(MFMAPipeSU); |
| ++MFMAChains; |
| } |
| } |
| |
| if (!MFMAChains) |
| return false; |
| |
| for (auto Pred : MFMAChainSeeds[0]->Preds) { |
| if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) && |
| Pred.getSUnit()->getInstr()->mayLoad()) |
| FirstPipeDSR = Pred.getSUnit()->NodeNum; |
| } |
| |
| MFMAChainLength = MFMAPipeCount / MFMAChains; |
| |
| // The number of bit pack operations that depend on a single V_EXP |
| unsigned PackSuccCount = std::count_if( |
| PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) { |
| return DAG->IsReachable(VPack, *TempExp); |
| }); |
| |
| // The number of bit pack operations an MFMA depends on |
| unsigned PackPredCount = |
| std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), |
| [&isBitPack](SDep &Pred) { |
| auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); |
| return isBitPack(Opc); |
| }); |
| |
| auto *PackPred = |
| std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(), |
| [&isBitPack](SDep &Pred) { |
| auto Opc = Pred.getSUnit()->getInstr()->getOpcode(); |
| return isBitPack(Opc); |
| }); |
| |
| if (PackPred == (*TempMFMA)->Preds.end()) |
| return false; |
| |
| MFMAEnablement = 0; |
| ExpRequirement = 0; |
| // How many MFMAs depend on a single bit pack operation |
| MFMAEnablement = |
| std::count_if(PackPred->getSUnit()->Succs.begin(), |
| PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) { |
| return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr()); |
| }); |
| |
| // The number of MFMAs that depend on a single V_EXP |
| MFMAEnablement *= PackSuccCount; |
| |
| // The number of V_EXPs required to resolve all dependencies for an MFMA |
| ExpRequirement = |
| std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(), |
| [this, &PackPred](SUnit *ExpBase) { |
| return DAG->IsReachable(PackPred->getSUnit(), ExpBase); |
| }); |
| |
| ExpRequirement *= PackPredCount; |
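  // As a worked example: if each V_EXP feeds two bit packs (PackSuccCount ==
  // 2) and each bit pack feeds two MFMAs, then MFMAEnablement = 2 * 2 = 4.
  // Symmetrically, if an MFMA consumes two bit packs (PackPredCount == 2),
  // each reachable from two V_EXPs, then ExpRequirement = 2 * 2 = 4.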
| return true; |
| } |
| |
| bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) { |
| const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); |
| const SIInstrInfo *TII = ST.getInstrInfo(); |
| |
| if (Phase != AMDGPU::SchedulingPhase::PostRA) |
| MFMAChainSeeds.clear(); |
| if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII)) |
| return false; |
| |
| return true; |
| } |
| |
| bool MFMAExpInterleaveOpt::applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) { |
| |
| bool IsSmallKernelType = |
| MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32; |
| bool IsLargeKernelType = |
| MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64; |
| |
| if (!(IsSmallKernelType || IsLargeKernelType)) |
| return false; |
| |
| const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>(); |
| const SIInstrInfo *TII = ST.getInstrInfo(); |
| |
| unsigned PipelineSyncID = 0; |
| SchedGroup *SG = nullptr; |
| |
| unsigned MFMAChain = 0; |
| unsigned PositionInChain = 0; |
| unsigned CurrMFMAForTransPosition = 0; |
| |
| auto incrementTransPosition = [&MFMAChain, &PositionInChain, |
| &CurrMFMAForTransPosition]() { |
| CurrMFMAForTransPosition += MFMAEnablement; |
| PositionInChain = (CurrMFMAForTransPosition / MFMAChains); |
| MFMAChain = CurrMFMAForTransPosition % MFMAChains; |
| }; |
| |
| auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() { |
| auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; |
| return (TempMFMAForTrans / MFMAChains); |
| }; |
| |
| auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() { |
| auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement; |
| return TempMFMAForTrans % MFMAChains; |
| }; |
| |
| unsigned CurrMFMAPosition = 0; |
| unsigned MFMAChainForMFMA = 0; |
| unsigned PositionInChainForMFMA = 0; |
| |
| auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA, |
| &PositionInChainForMFMA]() { |
| ++CurrMFMAPosition; |
| MFMAChainForMFMA = CurrMFMAPosition % MFMAChains; |
| PositionInChainForMFMA = CurrMFMAPosition / MFMAChains; |
| }; |
| |
| bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA; |
| assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains); |
| |
| bool UsesFMA = IsSmallKernelType || !IsPostRA; |
| bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR; |
| bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA); |
| bool UsesVALU = IsSmallKernelType; |
| |
| // PHASE 1: "Prefetch" |
| if (UsesFMA) { |
| // First Round FMA |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) { |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), |
| true)); |
| } else |
| SG->addRule( |
| std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| // Second Round FMA |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) { |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| getNextTransPositionInChain(), |
| MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); |
| } else |
| SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, |
| SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| if (UsesDSRead) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII, |
| SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // First Round EXP |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true)); |
| else |
| SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), |
| HasChainBetweenCvt)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| incrementTransPosition(); |
| |
| // First Round CVT, Third Round FMA, Second Round EXP; interleaved |
| for (unsigned I = 0; I < ExpRequirement; I++) { |
| // First Round CVT |
| if (UsesCvt) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID())); |
| if (HasChainBetweenCvt) |
| SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( |
| 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); |
| else |
| SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>( |
| 1 + (2 + UsesFMA) * I, TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Third Round FMA |
| if (UsesFMA) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) { |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| getNextTransPositionInChain(), |
| MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true)); |
| } else |
| SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1, |
| TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Second Round EXP |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), |
| true)); |
| else |
| SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII, |
| SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), |
| HasChainBetweenCvt)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // The "extra" EXP which enables all MFMA |
| // TODO: UsesExtraExp |
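| // Unlike the per-round EXP groups above, which use LessThanNSuccs(8), this |
| // group is matched by fanout: GreaterThanOrEqualToNSuccs(8) singles out the |
| // EXP whose result fans out to the MFMAs. |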
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>( |
| 8, TII, SG->getSGID(), HasChainBetweenCvt)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| // PHASE 2: Main Interleave Loop |
| |
| // The number of MFMAs per iteration |
| unsigned MFMARatio = |
| MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1; |
| // The number of Exps per iteration |
| unsigned ExpRatio = |
| MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement; |
| // The remaining EXPs |
| unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement) |
| ? TransPipeCount - (2 * ExpRequirement) |
| : 0; |
| unsigned ExpLoopCount = RemainingExp / ExpRatio; |
| // In-loop MFMAs |
| unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2) |
| ? MFMAPipeCount - (MFMAEnablement * 2) |
| : 0; |
| unsigned MFMALoopCount = MFMAInLoop / MFMARatio; |
| unsigned VALUOps = |
| AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount; |
| unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount); |
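| // For example, with MFMAEnablement == 4 and ExpRequirement == 2, each |
| // iteration schedules MFMARatio == 2 MFMAs per ExpRatio == 1 EXP; with the |
| // values swapped, it schedules 1 MFMA for every 2 EXPs. |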
| |
| for (unsigned I = 0; I < LoopSize; I++) { |
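| // Advance to the next TRANS target once a full ExpRequirement's worth of |
| // EXPs has been issued for the current one. |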
| if (!(I * ExpRatio % ExpRequirement)) |
| incrementTransPosition(); |
| |
| // Round N MFMA |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<IsExactMFMA>( |
| PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII, |
| SG->getSGID(), true)); |
| else |
| SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| incrementMFMAPosition(); |
| |
| if (UsesVALU) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| if (UsesDSRead && !(I % 4)) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII, |
| SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // CVT, EXP, FMA Interleaving |
| for (unsigned J = 0; J < ExpRatio; J++) { |
| auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1); |
| auto MaxMFMAOffset = |
| (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio; |
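| // The offsets below estimate, in SchedGroups, how far back the group this |
| // CVT should follow sits, accounting for the MFMA, VALU, DS_READ, and FMA |
| // groups interleaved since then; each term is clamped to its per-round |
| // maximum. |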
| |
| // Round N + 1 CVT |
| if (UsesCvt) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID())); |
| auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1; |
| auto DSROffset = I / 4 + 1; |
| auto MaxDSROffset = MaxMFMAOffset / 4; |
| // TODO: UsesExtraExp |
| auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1; |
| auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) + |
| std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff + |
| ExpOffset; |
| if (HasChainBetweenCvt) |
| SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>( |
| CurrentOffset, TII, SG->getSGID())); |
| else |
| SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII, |
| SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Round N + 3 FMA |
| if (UsesFMA) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| getNextTransPositionInChain(), |
| MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), |
| true)); |
| else |
| SG->addRule(std::make_shared<EnablesNthMFMA>( |
| (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1, |
| TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Round N + 2 Exp |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); |
| if (!IsPostRA && MFMAChains) |
| SG->addRule(std::make_shared<EnablesNthMFMAInChain>( |
| PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), |
| true)); |
| else |
| SG->addRule(std::make_shared<EnablesNthMFMA>( |
| (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1, |
| TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(), |
| HasChainBetweenCvt)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| } |
| |
| // PHASE 3: Remaining MFMAs |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| return true; |
| } |
| |
| class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy { |
| public: |
| bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) override { |
| return true; |
| } |
| |
| MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : IGLPStrategy(DAG, TII) { |
| IsBottomUp = true; |
| } |
| }; |
| |
| bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) { |
| // Count the number of MFMA instructions. |
| unsigned MFMACount = 0; |
| for (const MachineInstr &I : *DAG) |
| if (TII->isMFMAorWMMA(I)) |
| ++MFMACount; |
| |
| const unsigned PipelineSyncID = 0; |
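| // Request a strict alternation of single-SU TRANS and MFMA groups, with |
| // enough pairs for three TRANS groups per MFMA in the region. |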
| for (unsigned I = 0; I < MFMACount * 3; ++I) { |
| SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| return true; |
| } |
| |
| class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy { |
| private: |
| // Whether the DS_READ is a predecessor of the first four MFMAs in the region |
| class EnablesInitialMFMA final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| if (!SyncPipe.size()) |
| return false; |
| int MFMAsFound = 0; |
| if (!Cache->size()) { |
| for (auto &Elt : SyncPipe[0].DAG->SUnits) { |
| if (TII->isMFMAorWMMA(*Elt.getInstr())) { |
| ++MFMAsFound; |
| if (MFMAsFound > 4) |
| break; |
| Cache->push_back(&Elt); |
| } |
| } |
| } |
| |
| assert(Cache->size()); |
| auto *DAG = SyncPipe[0].DAG; |
| for (auto &Elt : *Cache) { |
| if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU))) |
| return true; |
| } |
| return false; |
| } |
| |
| EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE |
| class IsPermForDSW final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| auto *MI = SU->getInstr(); |
| if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64) |
| return false; |
| |
| bool FitsInGroup = false; |
| // Does the VALU have a DS_WRITE successor? |
| if (!Collection.size()) { |
| for (auto &Succ : SU->Succs) { |
| SUnit *SuccUnit = Succ.getSUnit(); |
| if (TII->isDS(*SuccUnit->getInstr()) && |
| SuccUnit->getInstr()->mayStore()) { |
| Cache->push_back(SuccUnit); |
| FitsInGroup = true; |
| } |
| } |
| return FitsInGroup; |
| } |
| |
| assert(Cache->size()); |
| |
| // Does the VALU have a DS_WRITE successor that is the same as the other |
| // VALUs already in the group? The V_PERMs will all share one DS_WRITE |
| // successor. |
| return llvm::any_of(*Cache, [&SU](SUnit *Elt) { |
| return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) { |
| return ThisSucc.getSUnit() == Elt; |
| }); |
| }); |
| } |
| |
| IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether the SU is a successor of any element in the previous SchedGroup |
| class IsSuccOfPrevGroup final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| SchedGroup *OtherGroup = nullptr; |
| for (auto &PipeSG : SyncPipe) { |
| if ((unsigned)PipeSG.getSGID() == SGID - 1) { |
| OtherGroup = &PipeSG; |
| } |
| } |
| |
| if (!OtherGroup) |
| return false; |
| if (!OtherGroup->Collection.size()) |
| return true; |
| |
| // Does the previous VALU have this DS_WRITE as a successor? |
| return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) { |
| return any_of(Elt->Succs, |
| [&SU](SDep &Succ) { return Succ.getSUnit() == SU; }); |
| }); |
| } |
| IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID, |
| bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| // Whether adding this load keeps the combined load width of the group |
| // within 128 bits |
| class VMEMSize final : public InstructionRule { |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| auto *MI = SU->getInstr(); |
| if (MI->getOpcode() == TargetOpcode::BUNDLE) |
| return false; |
| if (!Collection.size()) |
| return true; |
| |
| int NumBits = 0; |
| |
| auto TRI = TII->getRegisterInfo(); |
| auto &MRI = MI->getParent()->getParent()->getRegInfo(); |
| for (auto &Elt : Collection) { |
| auto Op = Elt->getInstr()->getOperand(0); |
| auto Size = |
| TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op)); |
| NumBits += Size; |
| } |
| |
| if (NumBits < 128) { |
| assert(TII->isVMEM(*MI) && MI->mayLoad()); |
| if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg( |
| MRI, MI->getOperand(0))) <= |
| 128) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache) {} |
| }; |
| |
| /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup |
| /// that is \p Distance steps away |
| class SharesPredWithPrevNthGroup final : public InstructionRule { |
| private: |
| unsigned Distance = 1; |
| |
| public: |
| bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection, |
| SmallVectorImpl<SchedGroup> &SyncPipe) override { |
| SchedGroup *OtherGroup = nullptr; |
| if (!SyncPipe.size()) |
| return false; |
| |
| if (!Cache->size()) { |
| |
| for (auto &PipeSG : SyncPipe) { |
| if ((unsigned)PipeSG.getSGID() == SGID - Distance) { |
| OtherGroup = &PipeSG; |
| } |
| } |
| |
| if (!OtherGroup) |
| return false; |
| if (!OtherGroup->Collection.size()) |
| return true; |
| |
| for (auto &OtherEle : OtherGroup->Collection) { |
| for (auto &Pred : OtherEle->Preds) { |
| if (Pred.getSUnit()->getInstr()->getOpcode() == |
| AMDGPU::V_PERM_B32_e64) |
| Cache->push_back(Pred.getSUnit()); |
| } |
| } |
| |
| // If the other group has no PERM preds, then this group won't share any |
| if (!Cache->size()) |
| return false; |
| } |
| |
| auto *DAG = SyncPipe[0].DAG; |
| // Does the previous DS_WRITE share a V_PERM predecessor with this |
| // VMEM_READ? |
| return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) { |
| return DAG->IsReachable(const_cast<SUnit *>(SU), Elt); |
| }); |
| } |
| SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII, |
| unsigned SGID, bool NeedsCache = false) |
| : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {} |
| }; |
| |
| public: |
| bool applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) override; |
| |
| bool shouldApplyStrategy(ScheduleDAGInstrs *DAG, |
| AMDGPU::SchedulingPhase Phase) override { |
| return true; |
| } |
| |
| MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) |
| : IGLPStrategy(DAG, TII) { |
| IsBottomUp = false; |
| } |
| }; |
| |
| static unsigned DSWCount = 0; |
| static unsigned DSWWithPermCount = 0; |
| static unsigned DSWWithSharedVMEMCount = 0; |
| |
| bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( |
| DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, |
| DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, |
| AMDGPU::SchedulingPhase Phase) { |
| unsigned MFMACount = 0; |
| unsigned DSRCount = 0; |
| |
| bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial; |
| |
| assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 && |
| DSWWithSharedVMEMCount == 0)) && |
| "DSWCounters should be zero in pre-RA scheduling!"); |
| SmallVector<SUnit *, 6> DSWithPerms; |
| for (auto &SU : DAG->SUnits) { |
| auto *I = SU.getInstr(); |
| if (TII->isMFMAorWMMA(*I)) |
| ++MFMACount; |
| else if (TII->isDS(*I)) { |
| if (I->mayLoad()) |
| ++DSRCount; |
| else if (I->mayStore() && IsInitial) { |
| ++DSWCount; |
| for (auto Pred : SU.Preds) { |
| if (Pred.getSUnit()->getInstr()->getOpcode() == |
| AMDGPU::V_PERM_B32_e64) { |
| DSWithPerms.push_back(&SU); |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| if (IsInitial) { |
| DSWWithPermCount = DSWithPerms.size(); |
| auto *I = DSWithPerms.begin(); |
| auto *E = DSWithPerms.end(); |
| |
| // Get the count of DS_WRITES with V_PERM predecessors which |
| // have loop-carried dependencies (WAR) on the same VMEM_READs. |
| // We consider partial overlap as a miss -- in other words, |
| // for a given DS_W, we only consider another DS_W as matching |
| // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred |
| // for every V_PERM pred of this DS_W. |
| DenseMap<MachineInstr *, SUnit *> VMEMLookup; |
| SmallVector<SUnit *, 6> Counted; |
| for (; I != E; I++) { |
| SUnit *Cand = nullptr; |
| bool MissedAny = false; |
| for (auto &Pred : (*I)->Preds) { |
| if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64) |
| continue; |
| |
| if (Cand && llvm::is_contained(Counted, Cand)) |
| break; |
| |
| for (auto &Succ : Pred.getSUnit()->Succs) { |
| auto *MI = Succ.getSUnit()->getInstr(); |
| if (!TII->isVMEM(*MI) || !MI->mayLoad()) |
| continue; |
| |
| if (MissedAny || !VMEMLookup.size()) { |
| MissedAny = true; |
| VMEMLookup[MI] = *I; |
| continue; |
| } |
| |
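| // If another DS_WRITE already claimed this VMEM_READ, that DS_WRITE |
| // becomes our candidate match; otherwise record the claim and treat this |
| // read as a miss. |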
| auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I); |
| if (Inserted) { |
| MissedAny = true; |
| continue; |
| } |
| |
| Cand = It->second; |
| if (llvm::is_contained(Counted, Cand)) { |
| MissedAny = true; |
| break; |
| } |
| } |
| } |
| if (!MissedAny && Cand) { |
| DSWWithSharedVMEMCount += 2; |
| Counted.push_back(Cand); |
| Counted.push_back(*I); |
| } |
| } |
| } |
| |
| assert(DSWWithSharedVMEMCount <= DSWWithPermCount); |
| SchedGroup *SG; |
| unsigned PipelineSyncID = 0; |
| // For kernels with V_PERM, there are enough VALUs to mix in between MFMAs |
| if (DSWWithPermCount) { |
| for (unsigned I = 0; I < MFMACount; I++) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| } |
| |
| PipelineSyncID = 1; |
| // Phase 1: Break up DS_READ and MFMA clusters. |
| // First, DS_READs to make the initial MFMAs ready; then interleave MFMAs |
| // with DS_READ prefetches. |
| |
| // Make the initial MFMAs ready |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| // Interleave MFMA with DS_READ prefetch |
| for (unsigned I = 0; I < DSRCount - 4; ++I) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Phase 2a: Loop-carried dependency with V_PERM |
| // Schedule V_PERM & DS_WRITE as closely as possible to the VMEM_READ they |
| // depend on. Interleave MFMA to keep the XDL unit busy throughout. |
| for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( |
| 1, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( |
| 3, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Phase 2b: Loop-carried dependency without V_PERM |
| // Schedule DS_WRITEs as closely as possible to the VMEM_READs they depend |
| // on. Interleave MFMA to keep the XDL unit busy throughout. |
| for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| // Phase 2c: Loop-carried dependency with V_PERM, where VMEM_READs are |
| // ultimately used by two DS_WRITEs. |
| // Schedule V_PERM & DS_WRITE as closely as possible to the VMEM_READ they |
| // depend on. Interleave MFMA to keep the XDL unit busy throughout. |
| |
| for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) { |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true)); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( |
| 2, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII); |
| SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>( |
| 4, TII, SG->getSGID(), true)); |
| SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID())); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| |
| SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( |
| SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); |
| SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); |
| } |
| |
| return true; |
| } |
| |
| static std::unique_ptr<IGLPStrategy> |
| createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, |
| const SIInstrInfo *TII) { |
| switch (ID) { |
| case MFMASmallGemmOptID: |
| return std::make_unique<MFMASmallGemmOpt>(DAG, TII); |
| case MFMASmallGemmSingleWaveOptID: |
| return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII); |
| case MFMAExpInterleaveID: |
| return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII); |
| case MFMAExpSimpleInterleaveID: |
| return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII); |
| } |
| |
| llvm_unreachable("Unknown IGLPStrategyID"); |
| } |
| |
| class IGroupLPDAGMutation : public ScheduleDAGMutation { |
| private: |
| const SIInstrInfo *TII; |
| |
| ScheduleDAGMI *DAG; |
| |
| // Organize lists of SchedGroups by their SyncID. SchedGroups / |
| // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added |
| // between them. |
| DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; |
| |
| // Used to track instructions that can be mapped to multiple sched groups |
| DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; |
| |
| // Add DAG edges that enforce SCHED_BARRIER ordering. |
| void addSchedBarrierEdges(SUnit &SU); |
| |
| // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should |
| // not be reordered across the SCHED_BARRIER. This is used for the base |
| // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that |
| // SCHED_BARRIER will always block all instructions that can be classified |
| // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size |
| // and may only synchronize with some SchedGroups. Returns the inverse of |
| // Mask. SCHED_BARRIER's mask describes which instruction types should be |
| // allowed to be scheduled across it. Invert the mask to get the |
| // SchedGroupMask of instructions that should be barred. |
| SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; |
| |
| // Create SchedGroups for a SCHED_GROUP_BARRIER. |
| void initSchedGroupBarrierPipelineStage( |
| std::vector<SUnit>::reverse_iterator RIter); |
| |
| bool initIGLPOpt(SUnit &SU); |
| |
| public: |
| void apply(ScheduleDAGInstrs *DAGInstrs) override; |
| |
| // The order in which the PipelineSolver should process the candidate |
| // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last |
| // created SchedGroup first, and will consider that as the ultimate |
| // predecessor group when linking. TOP_DOWN instead links and processes the |
| // first created SchedGroup first. |
| bool IsBottomUp = true; |
| |
| // The scheduling phase this application of IGLP corresponds with. |
| AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; |
| |
| IGroupLPDAGMutation() = default; |
| IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {} |
| }; |
| |
| unsigned SchedGroup::NumSchedGroups = 0; |
| |
| bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { |
| if (A != B && DAG->canAddEdge(B, A)) { |
| DAG->addEdge(B, SDep(A, SDep::Artificial)); |
| return true; |
| } |
| return false; |
| } |
| |
| bool SchedGroup::canAddMI(const MachineInstr &MI) const { |
| bool Result = false; |
| if (MI.isMetaInstruction()) |
| Result = false; |
| |
| else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && |
| (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) || |
| TII->isTRANS(MI))) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && |
| TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && |
| TII->isSALU(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && |
| TII->isMFMAorWMMA(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && |
| (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && |
| MI.mayLoad() && |
| (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && |
| MI.mayStore() && |
| (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && |
| TII->isDS(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && |
| MI.mayLoad() && TII->isDS(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && |
| MI.mayStore() && TII->isDS(MI)) |
| Result = true; |
| |
| else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) && |
| TII->isTRANS(MI)) |
| Result = true; |
| |
| LLVM_DEBUG( |
| dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) |
| << (Result ? " could classify " : " unable to classify ") << MI); |
| |
| return Result; |
| } |
| |
| int SchedGroup::link(SUnit &SU, bool MakePred, |
| std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { |
| int MissedEdges = 0; |
| for (auto *A : Collection) { |
| SUnit *B = &SU; |
| if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) |
| continue; |
| if (MakePred) |
| std::swap(A, B); |
| |
| if (DAG->IsReachable(B, A)) |
| continue; |
| |
| // tryAddEdge returns false if there is a dependency that makes adding |
| // the A->B edge impossible; otherwise it returns true. |
| bool Added = tryAddEdge(A, B); |
| if (Added) |
| AddedEdges.emplace_back(A, B); |
| else |
| ++MissedEdges; |
| } |
| |
| return MissedEdges; |
| } |
| |
| void SchedGroup::link(SUnit &SU, bool MakePred) { |
| for (auto *A : Collection) { |
| SUnit *B = &SU; |
| if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) |
| continue; |
| if (MakePred) |
| std::swap(A, B); |
| |
| tryAddEdge(A, B); |
| } |
| } |
| |
| void SchedGroup::link(SUnit &SU, |
| function_ref<bool(const SUnit *A, const SUnit *B)> P) { |
| for (auto *A : Collection) { |
| SUnit *B = &SU; |
| if (P(A, B)) |
| std::swap(A, B); |
| |
| tryAddEdge(A, B); |
| } |
| } |
| |
| void SchedGroup::link(SchedGroup &OtherGroup) { |
| for (auto *B : OtherGroup.Collection) |
| link(*B); |
| } |
| |
| bool SchedGroup::canAddSU(SUnit &SU) const { |
| MachineInstr &MI = *SU.getInstr(); |
| if (MI.getOpcode() != TargetOpcode::BUNDLE) |
| return canAddMI(MI); |
| |
| // Special case for bundled MIs. |
| const MachineBasicBlock *MBB = MI.getParent(); |
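| // Step past the BUNDLE header: B ends up at the first MI inside the |
| // bundle, and E is then walked forward to one past the last bundled MI. |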
| MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; |
| while (E != MBB->end() && E->isBundledWithPred()) |
| ++E; |
| |
| // Return true if all of the bundled MIs can be added to this group. |
| return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); |
| } |
| |
| void SchedGroup::initSchedGroup() { |
| for (auto &SU : DAG->SUnits) { |
| if (isFull()) |
| break; |
| |
| if (canAddSU(SU)) |
| add(SU); |
| } |
| } |
| |
| void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, |
| SUnitsToCandidateSGsMap &SyncedInstrs) { |
| SUnit &InitSU = *RIter; |
| for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { |
| auto &SU = *RIter; |
| if (isFull()) |
| break; |
| |
| if (canAddSU(SU)) |
| SyncedInstrs[&SU].push_back(SGID); |
| } |
| |
| add(InitSU); |
| assert(MaxSize); |
| (*MaxSize)++; |
| } |
| |
| void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { |
| auto I = DAG->SUnits.rbegin(); |
| auto E = DAG->SUnits.rend(); |
| for (; I != E; ++I) { |
| auto &SU = *I; |
| if (isFull()) |
| break; |
| if (canAddSU(SU)) |
| SyncedInstrs[&SU].push_back(SGID); |
| } |
| } |
| |
| void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { |
| const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); |
| if (!TSchedModel || DAGInstrs->SUnits.empty()) |
| return; |
| |
| LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n"); |
| const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); |
| TII = ST.getInstrInfo(); |
| DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); |
| SyncedSchedGroups.clear(); |
| SyncedInstrs.clear(); |
| bool FoundSB = false; |
| bool FoundIGLP = false; |
| bool ShouldApplyIGLP = false; |
| for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { |
| unsigned Opc = R->getInstr()->getOpcode(); |
| // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. |
| if (Opc == AMDGPU::SCHED_BARRIER) { |
| addSchedBarrierEdges(*R); |
| FoundSB = true; |
| } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { |
| initSchedGroupBarrierPipelineStage(R); |
| FoundSB = true; |
| } else if (Opc == AMDGPU::IGLP_OPT) { |
| if (!FoundSB && !FoundIGLP) { |
| FoundIGLP = true; |
| ShouldApplyIGLP = initIGLPOpt(*R); |
| } |
| } |
| } |
| |
| if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) { |
| PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp); |
| // PipelineSolver performs the mutation by adding the edges it |
| // determined as the best |
| PS.solve(); |
| return; |
| } |
| } |
| |
| void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { |
| MachineInstr &MI = *SchedBarrier.getInstr(); |
| assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); |
| // Remove all existing edges from the SCHED_BARRIER that were added due to the |
| // instruction having side effects. |
| LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: " |
| << MI.getOperand(0).getImm() << "\n"); |
| auto InvertedMask = |
| invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); |
| SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); |
| SG.initSchedGroup(); |
| |
| // Preserve original instruction ordering relative to the SCHED_BARRIER. |
| SG.link( |
| SchedBarrier, |
| (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( |
| const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); |
| } |
| |
| SchedGroupMask |
| IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { |
| // Invert mask and erase bits for types of instructions that are implied to be |
| // allowed past the SCHED_BARRIER. |
| SchedGroupMask InvertedMask = ~Mask; |
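| // For example, a SCHED_BARRIER mask allowing only ALU (0x1) inverts to a |
| // mask barring the VMEM and DS groups, and the implication rules below |
| // additionally clear VALU, SALU, MFMA, and TRANS so that they remain free |
| // to cross the barrier. |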
| |
| // ALU implies VALU, SALU, MFMA, TRANS. |
| if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & |
| ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS; |
| // VALU, SALU, MFMA, TRANS implies ALU. |
| else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::ALU; |
| |
| // VMEM implies VMEM_READ, VMEM_WRITE. |
| if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; |
| // VMEM_READ, VMEM_WRITE implies VMEM. |
| else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::VMEM; |
| |
| // DS implies DS_READ, DS_WRITE. |
| if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; |
| // DS_READ, DS_WRITE implies DS. |
| else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || |
| (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) |
| InvertedMask &= ~SchedGroupMask::DS; |
| |
| LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask |
| << "\n"); |
| |
| return InvertedMask; |
| } |
| |
| void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( |
| std::vector<SUnit>::reverse_iterator RIter) { |
| // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due |
| // to the instruction having side effects. |
| MachineInstr &SGB = *RIter->getInstr(); |
| assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); |
| int32_t SGMask = SGB.getOperand(0).getImm(); |
| int32_t Size = SGB.getOperand(1).getImm(); |
| int32_t SyncID = SGB.getOperand(2).getImm(); |
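| // For example, a SCHED_GROUP_BARRIER with operands (8, 1, 0) requests a |
| // group of one MFMA (mask 0x8) in sync pipeline 0. |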
| |
| auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, |
| Size, SyncID, DAG, TII); |
| |
| SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); |
| } |
| |
| bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { |
| IGLPStrategyID StrategyID = |
| (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); |
| auto S = createIGLPStrategy(StrategyID, DAG, TII); |
| if (!S->shouldApplyStrategy(DAG, Phase)) |
| return false; |
| |
| IsBottomUp = S->IsBottomUp; |
| return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase); |
| } |
| |
| } // namespace |
| |
| namespace llvm { |
| |
| /// \p Phase specifies whether or not this is a reentry into the |
| /// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the |
| /// same scheduling region (e.g. pre and post-RA scheduling / multiple |
| /// scheduling "phases"), we can reenter this mutation framework more than once |
| /// for a given region. |
| std::unique_ptr<ScheduleDAGMutation> |
| createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { |
| return std::make_unique<IGroupLPDAGMutation>(Phase); |
| } |
| |
| } // end namespace llvm |