//===- AMDGPUSplitModule.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file Implements a module splitting algorithm designed to support the
/// FullLTO --lto-partitions option for parallel codegen.
///
/// The role of this module splitting pass is the same as
/// lib/Transforms/Utils/SplitModule.cpp: load-balance the module's functions
/// across a set of N partitions to allow for parallel codegen.
///
/// The similarities mostly end here, as this pass achieves load-balancing in a
/// more elaborate fashion which is targeted towards AMDGPU modules. It can take
/// advantage of the structure of AMDGPU modules (which are mostly
/// self-contained) to allow for more efficient splitting without affecting
/// codegen negatively, or causing inaccurate resource usage analysis.
///
/// High-level pass overview:
///   - SplitGraph & associated classes
///     - Graph representation of the module and of the dependencies that
///     matter for splitting.
///   - RecursiveSearchSplitting
///     - Core splitting algorithm.
///   - SplitProposal
///     - Represents a suggested solution for splitting the input module. These
///     solutions can be scored to determine the best one when multiple
///     solutions are available.
///   - Driver/pass "run" function glues everything together.
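///
/// Typical driver usage, as an illustrative sketch (see AMDGPUSplitModule.h
/// for the actual interface): the pass is constructed with a partition count
/// and a callback that receives each partition as its own module.
/// \code
///   auto Callback = [&](std::unique_ptr<Module> MPart) {
///     // Hand each partition off to a codegen worker.
///   };
///   ModulePassManager MPM;
///   MPM.addPass(AMDGPUSplitModulePass(/*N=*/4, Callback));
///   MPM.run(M, MAM);
/// \endcode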

#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <cassert>
#include <cmath>
#include <functional>
#include <limits>
#include <memory>
#include <utility>
#include <vector>

#ifndef NDEBUG
#include "llvm/Support/LockFileManager.h"
#endif

#define DEBUG_TYPE "amdgpu-split-module"

namespace llvm {
namespace {

static cl::opt<unsigned> MaxDepth(
    "amdgpu-module-splitting-max-depth",
    cl::desc(
        "maximum search depth. 0 forces a greedy approach. "
        "warning: the algorithm is up to O(2^N), where N is the max depth."),
    cl::init(8));

static cl::opt<float> LargeFnFactor(
    "amdgpu-module-splitting-large-threshold", cl::init(2.0f), cl::Hidden,
    cl::desc(
        "when max depth is reached and we can no longer branch out, this "
        "value determines if a function is worth merging into an already "
        "existing partition to reduce code duplication. This is a factor "
        "of the ideal partition size, e.g. 2.0 means we consider the "
        "function for merging if its cost (including its callees) is 2x the "
        "size of an ideal partition."));

static cl::opt<float> LargeFnOverlapForMerge(
    "amdgpu-module-splitting-merge-threshold", cl::init(0.7f), cl::Hidden,
    cl::desc("when a function is considered for merging into a partition that "
             "already contains some of its callees, do the merge if at least "
             "n% of the code it can reach is already present inside the "
             "partition; e.g. 0.7 means only merge >70%"));

static cl::opt<bool> NoExternalizeGlobals(
    "amdgpu-module-splitting-no-externalize-globals", cl::Hidden,
    cl::desc("disables externalization of global variables with local "
             "linkage; may cause globals to be duplicated which increases "
             "binary size"));

static cl::opt<bool> NoExternalizeOnAddrTaken(
    "amdgpu-module-splitting-no-externalize-address-taken", cl::Hidden,
    cl::desc(
        "disables externalization of functions whose addresses are taken"));

static cl::opt<std::string>
    ModuleDotCfgOutput("amdgpu-module-splitting-print-module-dotcfg",
                       cl::Hidden,
                       cl::desc("output file to write out the dotgraph "
                                "representation of the input module"));

static cl::opt<std::string> PartitionSummariesOutput(
    "amdgpu-module-splitting-print-partition-summaries", cl::Hidden,
    cl::desc("output file to write out a summary of "
             "the partitions created for each module"));

#ifndef NDEBUG
static cl::opt<bool>
    UseLockFile("amdgpu-module-splitting-serial-execution", cl::Hidden,
                cl::desc("use a lock file so only one process in the system "
                         "can run this pass at once. useful to avoid mangled "
                         "debug output in multithreaded environments."));

static cl::opt<bool>
    DebugProposalSearch("amdgpu-module-splitting-debug-proposal-search",
                        cl::Hidden,
                        cl::desc("print all proposals received and whether "
                                 "they were rejected or accepted"));
#endif

struct SplitModuleTimer : NamedRegionTimer {
  SplitModuleTimer(StringRef Name, StringRef Desc)
      : NamedRegionTimer(Name, Desc, DEBUG_TYPE, "AMDGPU Module Splitting",
                         TimePassesIsEnabled) {}
};

//===----------------------------------------------------------------------===//
// Utils
//===----------------------------------------------------------------------===//

using CostType = InstructionCost::CostType;
using FunctionsCostMap = DenseMap<const Function *, CostType>;
using GetTTIFn = function_ref<const TargetTransformInfo &(Function &)>;
static constexpr unsigned InvalidPID = -1;

/// \param Num numerator
/// \param Dem denominator
/// \returns a printable object to print (Num/Dem) as a percentage using
/// "%0.2f".
static auto formatRatioOf(CostType Num, CostType Dem) {
  CostType DemOr1 = Dem ? Dem : 1;
  return format("%0.2f", (static_cast<double>(Num) / DemOr1) * 100);
}

/// Checks whether a given function is non-copyable.
///
/// Non-copyable functions cannot be cloned into multiple partitions, and only
/// one copy of the function can be present across all partitions.
///
/// Kernel functions and external functions fall into this category. If we were
/// to clone them, we would end up with multiple symbol definitions and a very
/// unhappy linker.
static bool isNonCopyable(const Function &F) {
  return F.hasExternalLinkage() || !F.isDefinitionExact() ||
         AMDGPU::isEntryFunctionCC(F.getCallingConv());
}

/// If \p GV has local linkage, make it external + hidden.
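/// e.g. "@gv = internal global i32 0" becomes "@gv = hidden global i32 0"
/// (external linkage with hidden visibility).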
static void externalize(GlobalValue &GV) {
  if (GV.hasLocalLinkage()) {
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::HiddenVisibility);
  }

  // Unnamed entities must be named consistently between modules. setName will
  // give a distinct name to each such entity.
  if (!GV.hasName())
    GV.setName("__llvmsplit_unnamed");
}

/// Cost analysis function. Calculates the cost of each function in \p M.
///
/// \param GetTTI Abstract getter for TargetTransformInfo.
/// \param M Module to analyze.
/// \param [out] CostMap Resulting Function -> Cost map.
/// \return The module's total cost.
static CostType calculateFunctionCosts(GetTTIFn GetTTI, Module &M,
                                       FunctionsCostMap &CostMap) {
  SplitModuleTimer SMT("calculateFunctionCosts", "cost analysis");

  LLVM_DEBUG(dbgs() << "[cost analysis] calculating function costs\n");
  CostType ModuleCost = 0;
  [[maybe_unused]] CostType KernelCost = 0;

  for (auto &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    CostType FnCost = 0;
    const auto &TTI = GetTTI(Fn);
    for (const auto &BB : Fn) {
      for (const auto &I : BB) {
        auto Cost =
            TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
        assert(Cost != InstructionCost::getMax());
        // Assume expensive if we can't tell the cost of an instruction.
        CostType CostVal = Cost.isValid()
                               ? Cost.getValue()
                               : (CostType)TargetTransformInfo::TCC_Expensive;
        assert((FnCost + CostVal) >= FnCost && "Overflow!");
        FnCost += CostVal;
      }
    }

    assert(FnCost != 0);

    CostMap[&Fn] = FnCost;
    assert((ModuleCost + FnCost) >= ModuleCost && "Overflow!");
    ModuleCost += FnCost;

    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv()))
      KernelCost += FnCost;
  }

  if (CostMap.empty())
    return 0;

  assert(ModuleCost);
  LLVM_DEBUG({
    const CostType FnCost = ModuleCost - KernelCost;
    dbgs() << " - total module cost is " << ModuleCost << ". kernels cost "
           << KernelCost << " ("
           << format("%0.2f", (float(KernelCost) / ModuleCost) * 100)
           << "% of the module), functions cost " << FnCost << " ("
           << format("%0.2f", (float(FnCost) / ModuleCost) * 100)
           << "% of the module)\n";
  });

  return ModuleCost;
}

/// \return true if \p F can be indirectly called
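/// Kernels and declarations are never considered indirectly callable; an
/// externally visible function always is; a local function only is if its
/// address is observably taken.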
static bool canBeIndirectlyCalled(const Function &F) {
  if (F.isDeclaration() || AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    return false;
  return !F.hasLocalLinkage() ||
         F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false,
                           /*IgnoreAssumeLikeCalls=*/true,
                           /*IgnoreLLVMUsed=*/true,
                           /*IgnoreARCAttachedCall=*/false,
                           /*IgnoreCastedDirectCall=*/true);
}

//===----------------------------------------------------------------------===//
// Graph-based Module Representation
//===----------------------------------------------------------------------===//

/// AMDGPUSplitModule's view of the source Module, as a graph of all components
/// that can be split into different modules.
///
/// The most trivial instance of this graph is just the CallGraph of the
/// module, but it is not guaranteed that the graph is strictly equal to the
/// CG. It currently always is, but it's designed in a way that would
/// eventually allow us to create abstract nodes, or nodes for different
/// entities such as global variables or any other meaningful constraint we
/// must consider.
///
/// The graph is only mutable by this class, and is generally not modified
/// after \ref SplitGraph::buildGraph runs. No consumers of the graph can
/// mutate it.
class SplitGraph {
public:
  class Node;

  enum class EdgeKind : uint8_t {
    /// The nodes are related through a direct call. This is a "strong" edge as
    /// it means the Src will directly reference the Dst.
    DirectCall,
    /// The nodes are related through an indirect call.
    /// This is a "weaker" edge and is only considered when traversing the
    /// graph starting from a kernel. We need this edge for resource usage
    /// analysis.
    ///
    /// The reason why we have this edge in the first place is due to how
    /// AMDGPUResourceUsageAnalysis works. In the presence of an indirect call,
    /// the resource usage of the kernel containing the indirect call is the
    /// max resource usage of all functions that can be indirectly called.
    IndirectCall,
  };
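
  // For instance (illustrative IR), a kernel containing an indirect call:
  //
  //   define amdgpu_kernel void @kern(ptr %fp) {
  //     call void %fp()
  //     ret void
  //   }
  //
  // gets an IndirectCall edge to every potentially indirectly-callable
  // function, because AMDGPUResourceUsageAnalysis must assume any of them may
  // run from @kern.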

  /// An edge between two nodes. Edges are directional, and tagged with a
  /// "kind".
  struct Edge {
    Edge(Node *Src, Node *Dst, EdgeKind Kind)
        : Src(Src), Dst(Dst), Kind(Kind) {}

    Node *Src; ///< Source
    Node *Dst; ///< Destination
    EdgeKind Kind;
  };

  using EdgesVec = SmallVector<const Edge *, 0>;
  using edges_iterator = EdgesVec::const_iterator;
  using nodes_iterator = const Node *const *;

  SplitGraph(const Module &M, const FunctionsCostMap &CostMap,
             CostType ModuleCost)
      : M(M), CostMap(CostMap), ModuleCost(ModuleCost) {}

  void buildGraph(CallGraph &CG);

#ifndef NDEBUG
  bool verifyGraph() const;
#endif

  bool empty() const { return Nodes.empty(); }
  const iterator_range<nodes_iterator> nodes() const {
    return {Nodes.begin(), Nodes.end()};
  }
  const Node &getNode(unsigned ID) const { return *Nodes[ID]; }

  unsigned getNumNodes() const { return Nodes.size(); }
  BitVector createNodesBitVector() const { return BitVector(Nodes.size()); }

  const Module &getModule() const { return M; }

  CostType getModuleCost() const { return ModuleCost; }
  CostType getCost(const Function &F) const { return CostMap.at(&F); }

  /// \returns the aggregated cost of all nodes in \p BV (bits set to 1 = node
  /// IDs).
  CostType calculateCost(const BitVector &BV) const;

private:
  /// Retrieves the node for \p GV in \p Cache, or creates a new node for it
  /// and updates \p Cache.
  Node &getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                const GlobalValue &GV);

  // Create a new edge between two nodes and add it to both nodes.
  const Edge &createEdge(Node &Src, Node &Dst, EdgeKind EK);

  const Module &M;
  const FunctionsCostMap &CostMap;
  CostType ModuleCost;

  // Final list of nodes with stable ordering.
  SmallVector<Node *> Nodes;

  SpecificBumpPtrAllocator<Node> NodesPool;

  // Edges are trivially destructible objects, so as a small optimization we
  // use a BumpPtrAllocator which avoids destructor calls but also makes
  // allocation faster.
  static_assert(
      std::is_trivially_destructible_v<Edge>,
      "Edge must be trivially destructible to use the BumpPtrAllocator");
  BumpPtrAllocator EdgesPool;
};

/// Nodes in the SplitGraph contain both incoming and outgoing edges.
/// Incoming edges have this node as their Dst, and outgoing ones have this
/// node as their Src.
///
/// Edge objects are shared by both nodes in Src/Dst. They provide immediate
/// feedback on how two nodes are related, and in which direction they are
/// related, which is valuable information to make splitting decisions.
///
/// Nodes are fundamentally abstract, and any consumers of the graph should
/// treat them as such. While a node will be a function most of the time, we
/// could also create nodes for any other reason. In the future, we could have
/// single nodes for multiple functions, or nodes for GVs, etc.
class SplitGraph::Node {
  friend class SplitGraph;

public:
  Node(unsigned ID, const GlobalValue &GV, CostType IndividualCost,
       bool IsNonCopyable)
      : ID(ID), GV(GV), IndividualCost(IndividualCost),
        IsNonCopyable(IsNonCopyable), IsEntryFnCC(false), IsGraphEntry(false) {
    if (auto *Fn = dyn_cast<Function>(&GV))
      IsEntryFnCC = AMDGPU::isEntryFunctionCC(Fn->getCallingConv());
  }

  /// A 0-indexed ID for the node. The maximum ID (exclusive) is the number of
  /// nodes in the graph. This ID can be used as an index in a BitVector.
  unsigned getID() const { return ID; }

  const Function &getFunction() const { return cast<Function>(GV); }

  /// \returns the cost to import this component into a given module, not
  /// accounting for any dependencies that may need to be imported as well.
  CostType getIndividualCost() const { return IndividualCost; }

  bool isNonCopyable() const { return IsNonCopyable; }
  bool isEntryFunctionCC() const { return IsEntryFnCC; }

  /// \returns whether this is an entry point in the graph. Entry points are
  /// defined as follows: if you take all entry points in the graph, and
  /// iterate their dependencies, you are guaranteed to visit all nodes in the
  /// graph at least once.
  bool isGraphEntryPoint() const { return IsGraphEntry; }

  StringRef getName() const { return GV.getName(); }

  bool hasAnyIncomingEdges() const { return IncomingEdges.size(); }
  bool hasAnyIncomingEdgesOfKind(EdgeKind EK) const {
    return any_of(IncomingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  bool hasAnyOutgoingEdges() const { return OutgoingEdges.size(); }
  bool hasAnyOutgoingEdgesOfKind(EdgeKind EK) const {
    return any_of(OutgoingEdges, [&](const auto *E) { return E->Kind == EK; });
  }

  iterator_range<edges_iterator> incoming_edges() const {
    return IncomingEdges;
  }

  iterator_range<edges_iterator> outgoing_edges() const {
    return OutgoingEdges;
  }

  bool shouldFollowIndirectCalls() const { return isEntryFunctionCC(); }

  /// Visit all children of this node in a recursive fashion. Also visits Self.
  /// If \ref shouldFollowIndirectCalls returns false, then this only follows
  /// DirectCall edges.
  ///
  /// \param Visitor Visitor Function.
  void visitAllDependencies(std::function<void(const Node &)> Visitor) const;

  /// Adds the dependencies of this node to \p BV by setting the bit
  /// corresponding to each node.
  ///
  /// Implemented using \ref visitAllDependencies, hence it follows the same
  /// rules regarding dependencies traversal.
  ///
  /// \param[out] BV The bitvector where the bits should be set.
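  ///
  /// Typical usage, to collect the IDs of everything this node pulls in:
  /// \code
  ///   BitVector Deps = SG.createNodesBitVector();
  ///   N.getDependencies(Deps);
  /// \endcode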
  void getDependencies(BitVector &BV) const {
    visitAllDependencies([&](const Node &N) { BV.set(N.getID()); });
  }

private:
  void markAsGraphEntry() { IsGraphEntry = true; }

  unsigned ID;
  const GlobalValue &GV;
  CostType IndividualCost;
  bool IsNonCopyable : 1;
  bool IsEntryFnCC : 1;
  bool IsGraphEntry : 1;

  // TODO: Use a single sorted vector (with all incoming/outgoing edges grouped
  // together)
  EdgesVec IncomingEdges;
  EdgesVec OutgoingEdges;
};

void SplitGraph::Node::visitAllDependencies(
    std::function<void(const Node &)> Visitor) const {
  const bool FollowIndirect = shouldFollowIndirectCalls();
  // FIXME: If this can access SplitGraph in the future, use a BitVector
  // instead.
  DenseSet<const Node *> Seen;
  SmallVector<const Node *, 8> WorkList({this});
  while (!WorkList.empty()) {
    const Node *CurN = WorkList.pop_back_val();
    if (auto [It, Inserted] = Seen.insert(CurN); !Inserted)
      continue;

    Visitor(*CurN);

    for (const Edge *E : CurN->outgoing_edges()) {
      if (!FollowIndirect && E->Kind == EdgeKind::IndirectCall)
        continue;
      WorkList.push_back(E->Dst);
    }
  }
}

/// Checks if \p I has MD_callees and if it does, parses it and puts the
/// functions in \p Callees.
///
/// \returns true if there was metadata and it was parsed correctly. false if
/// there was no MD or if it contained unknown entries and parsing failed.
/// If this returns false, \p Callees will contain incomplete information
/// and must not be used.
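///
/// Example of a call site this can parse (illustrative IR):
/// \code
///   call void %fp(), !callees !0
///   ...
///   !0 = !{ptr @a, ptr @b}
/// \endcode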
static bool handleCalleesMD(const Instruction &I,
                            SetVector<Function *> &Callees) {
  auto *MD = I.getMetadata(LLVMContext::MD_callees);
  if (!MD)
    return false;

  for (const auto &Op : MD->operands()) {
    Function *Callee = mdconst::extract_or_null<Function>(Op);
    if (!Callee)
      return false;
    Callees.insert(Callee);
  }

  return true;
}

void SplitGraph::buildGraph(CallGraph &CG) {
  SplitModuleTimer SMT("buildGraph", "graph construction");
  LLVM_DEBUG(
      dbgs()
      << "[build graph] constructing graph representation of the input\n");

  // FIXME(?): Is the callgraph really worth using if we have to iterate the
  // function again whenever it fails to give us enough information?

  // We build the graph by just iterating all functions in the module and
  // working on their direct callees. At the end, all nodes should be linked
  // together as expected.
  DenseMap<const GlobalValue *, Node *> Cache;
  SmallVector<const Function *> FnsWithIndirectCalls, IndirectlyCallableFns;
  for (const Function &Fn : M) {
    if (Fn.isDeclaration())
      continue;

    // Look at direct callees and create the necessary edges in the graph.
    SetVector<const Function *> DirectCallees;
    bool CallsExternal = false;
    for (auto &CGEntry : *CG[&Fn]) {
      auto *CGNode = CGEntry.second;
      if (auto *Callee = CGNode->getFunction()) {
        if (!Callee->isDeclaration())
          DirectCallees.insert(Callee);
      } else if (CGNode == CG.getCallsExternalNode())
        CallsExternal = true;
    }

    // Keep track of this function if it contains an indirect call and/or if
    // it can be indirectly called.
    if (CallsExternal) {
      LLVM_DEBUG(dbgs() << "  [!] callgraph is incomplete for ";
                 Fn.printAsOperand(dbgs());
                 dbgs() << " - analyzing function\n");

      SetVector<Function *> KnownCallees;
      bool HasUnknownIndirectCall = false;
      for (const auto &Inst : instructions(Fn)) {
        // Look at all calls without a direct callee.
        const auto *CB = dyn_cast<CallBase>(&Inst);
        if (!CB || CB->getCalledFunction())
          continue;

        // Inline assembly can be ignored, unless InlineAsmIsIndirectCall is
        // true.
        if (CB->isInlineAsm()) {
          LLVM_DEBUG(dbgs() << "    found inline assembly\n");
          continue;
        }

        if (handleCalleesMD(Inst, KnownCallees))
          continue;
        // If we failed to parse any !callees MD, or some was missing,
        // the entire KnownCallees list is now unreliable.
        KnownCallees.clear();

        // Everything else is handled conservatively. If we fall into the
        // conservative case don't bother analyzing further.
        HasUnknownIndirectCall = true;
        break;
      }

      if (HasUnknownIndirectCall) {
        LLVM_DEBUG(dbgs() << "    indirect call found\n");
        FnsWithIndirectCalls.push_back(&Fn);
      } else if (!KnownCallees.empty())
        DirectCallees.insert_range(KnownCallees);
    }

    Node &N = getNode(Cache, Fn);
    for (const auto *Callee : DirectCallees)
      createEdge(N, getNode(Cache, *Callee), EdgeKind::DirectCall);

    if (canBeIndirectlyCalled(Fn))
      IndirectlyCallableFns.push_back(&Fn);
  }

  // Post-process functions with indirect calls.
  for (const Function *Fn : FnsWithIndirectCalls) {
    for (const Function *Candidate : IndirectlyCallableFns) {
      Node &Src = getNode(Cache, *Fn);
      Node &Dst = getNode(Cache, *Candidate);
      createEdge(Src, Dst, EdgeKind::IndirectCall);
    }
  }

  // Now, find all entry points.
  SmallVector<Node *, 16> CandidateEntryPoints;
  BitVector NodesReachableByKernels = createNodesBitVector();
  for (Node *N : Nodes) {
    // Functions with an Entry CC are always graph entry points too.
    if (N->isEntryFunctionCC()) {
      N->markAsGraphEntry();
      N->getDependencies(NodesReachableByKernels);
    } else if (!N->hasAnyIncomingEdgesOfKind(EdgeKind::DirectCall))
      CandidateEntryPoints.push_back(N);
  }

  for (Node *N : CandidateEntryPoints) {
    // This can be another entry point if it's not reachable by a kernel.
    // TODO: We could sort all of the possible new entries in a stable order
    // (e.g. by cost), then consume them one by one until
    // NodesReachableByKernels is all 1s. It'd allow us to avoid
    // considering some nodes as non-entries in some specific cases.
    if (!NodesReachableByKernels.test(N->getID()))
      N->markAsGraphEntry();
  }

#ifndef NDEBUG
  assert(verifyGraph());
#endif
}

#ifndef NDEBUG
bool SplitGraph::verifyGraph() const {
  unsigned ExpectedID = 0;
  // Exceptionally using a set here in case IDs are messed up.
  DenseSet<const Node *> SeenNodes;
  DenseSet<const Function *> SeenFunctionNodes;
  for (const Node *N : Nodes) {
    if (N->getID() != (ExpectedID++)) {
      errs() << "Node IDs are incorrect!\n";
      return false;
    }

    if (!SeenNodes.insert(N).second) {
      errs() << "Node seen more than once!\n";
      return false;
    }

    if (&getNode(N->getID()) != N) {
      errs() << "getNode doesn't return the right node\n";
      return false;
    }

    for (const Edge *E : N->IncomingEdges) {
      if (!E->Src || !E->Dst || (E->Dst != N) ||
          (find(E->Src->OutgoingEdges, E) == E->Src->OutgoingEdges.end())) {
        errs() << "ill-formed incoming edges\n";
        return false;
      }
    }

    for (const Edge *E : N->OutgoingEdges) {
      if (!E->Src || !E->Dst || (E->Src != N) ||
          (find(E->Dst->IncomingEdges, E) == E->Dst->IncomingEdges.end())) {
        errs() << "ill-formed outgoing edges\n";
        return false;
      }
    }

    const Function &Fn = N->getFunction();
    if (AMDGPU::isEntryFunctionCC(Fn.getCallingConv())) {
      if (N->hasAnyIncomingEdges()) {
        errs() << "Kernels cannot have incoming edges\n";
        return false;
      }
    }

    if (Fn.isDeclaration()) {
      errs() << "declarations shouldn't have nodes!\n";
      return false;
    }

    auto [It, Inserted] = SeenFunctionNodes.insert(&Fn);
    if (!Inserted) {
      errs() << "one function has multiple nodes!\n";
      return false;
    }
  }

  if (ExpectedID != Nodes.size()) {
    errs() << "Node IDs out of sync!\n";
    return false;
  }

  if (createNodesBitVector().size() != getNumNodes()) {
    errs() << "nodes bit vector doesn't have the right size!\n";
    return false;
  }

  // Check we respect the promise of Node::isGraphEntryPoint: every node must
  // be reachable from the set of graph entry points.
  BitVector BV = createNodesBitVector();
  for (const Node *N : nodes()) {
    if (N->isGraphEntryPoint())
      N->getDependencies(BV);
  }

  // Ensure each function in the module has an associated node.
  for (const auto &Fn : M) {
    if (!Fn.isDeclaration()) {
      if (!SeenFunctionNodes.contains(&Fn)) {
        errs() << "Fn has no associated node in the graph!\n";
        return false;
      }
    }
  }

  if (!BV.all()) {
    errs() << "not all nodes are reachable through the graph's entry points!\n";
    return false;
  }

  return true;
}
#endif

CostType SplitGraph::calculateCost(const BitVector &BV) const {
  CostType Cost = 0;
  for (unsigned NodeID : BV.set_bits())
    Cost += getNode(NodeID).getIndividualCost();
  return Cost;
}

SplitGraph::Node &
SplitGraph::getNode(DenseMap<const GlobalValue *, Node *> &Cache,
                    const GlobalValue &GV) {
  auto &N = Cache[&GV];
  if (N)
    return *N;

  CostType Cost = 0;
  bool NonCopyable = false;
  if (const Function *Fn = dyn_cast<Function>(&GV)) {
    NonCopyable = isNonCopyable(*Fn);
    Cost = CostMap.at(Fn);
  }
  N = new (NodesPool.Allocate()) Node(Nodes.size(), GV, Cost, NonCopyable);
  Nodes.push_back(N);
  assert(&getNode(N->getID()) == N);
  return *N;
}

const SplitGraph::Edge &SplitGraph::createEdge(Node &Src, Node &Dst,
                                               EdgeKind EK) {
  const Edge *E = new (EdgesPool.Allocate<Edge>(1)) Edge(&Src, &Dst, EK);
  Src.OutgoingEdges.push_back(E);
  Dst.IncomingEdges.push_back(E);
  return *E;
}

//===----------------------------------------------------------------------===//
// Split Proposals
//===----------------------------------------------------------------------===//

/// Represents a module splitting proposal.
///
/// Proposals are made of N BitVectors, one for each partition, where each bit
/// set indicates that the node is present and should be copied inside that
/// partition.
///
/// Proposals have several metrics attached so they can be compared/sorted,
/// which allows the driver to try multiple strategies, collect the resulting
/// proposals, and choose the best one.
class SplitProposal {
public:
  SplitProposal(const SplitGraph &SG, unsigned MaxPartitions) : SG(&SG) {
    Partitions.resize(MaxPartitions, {0, SG.createNodesBitVector()});
  }

  void setName(StringRef NewName) { Name = NewName; }
  StringRef getName() const { return Name; }

  const BitVector &operator[](unsigned PID) const {
    return Partitions[PID].second;
  }

  void add(unsigned PID, const BitVector &BV) {
    Partitions[PID].second |= BV;
    updateScore(PID);
  }

  void print(raw_ostream &OS) const;
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }

  // Find the cheapest partition (lowest cost). In case of ties, always
  // returns the highest partition number.
  unsigned findCheapestPartition() const;

  /// Calculate the CodeSize and Bottleneck scores.
  void calculateScores();

#ifndef NDEBUG
  void verifyCompleteness() const;
#endif

  /// Only available after \ref calculateScores is called.
  ///
  /// The ratio between this proposal's total cost and the original module's
  /// cost, which indicates how much code duplication the proposal creates.
  /// e.g. 1.2 means the partitions add up to roughly 120% of the input
  /// module's size, so about 20% of the code is duplicated across partitions.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// A perfect score would be 1.0 (no duplication at all); the higher above
  /// 1.0, the worse.
  double getCodeSizeScore() const { return CodeSizeScore; }

  /// Only available after \ref calculateScores is called.
  ///
  /// A number between [0, 1] which indicates how big of a bottleneck is
  /// expected from the largest partition.
  ///
  /// A score of 1.0 means the biggest partition is as big as the source
  /// module, so build time will be equal to or greater than the build time of
  /// the initial input.
  ///
  /// Value is always rounded up to 2 decimal places.
  ///
  /// This is one of the metrics used to estimate this proposal's build time.
  double getBottleneckScore() const { return BottleneckScore; }
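
  // Worked example: splitting a module of cost 100 into partitions of cost
  // {60, 40, 20} gives TotalCost = 120, hence a CodeSizeScore of 1.2 (about
  // 20% duplicated code) and a BottleneckScore of 0.6 (the largest partition
  // is 60% of the source module).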

private:
  void updateScore(unsigned PID) {
    assert(SG);
    for (auto &[PCost, Nodes] : Partitions) {
      TotalCost -= PCost;
      PCost = SG->calculateCost(Nodes);
      TotalCost += PCost;
    }
  }

  /// \see getCodeSizeScore
  double CodeSizeScore = 0.0;
  /// \see getBottleneckScore
  double BottleneckScore = 0.0;
  /// Aggregated cost of all partitions
  CostType TotalCost = 0;

  const SplitGraph *SG = nullptr;
  std::string Name;

  std::vector<std::pair<CostType, BitVector>> Partitions;
};

void SplitProposal::print(raw_ostream &OS) const {
  assert(SG);

  OS << "[proposal] " << Name << ", total cost:" << TotalCost
     << ", code size score:" << format("%0.3f", CodeSizeScore)
     << ", bottleneck score:" << format("%0.3f", BottleneckScore) << '\n';
  for (const auto &[PID, Part] : enumerate(Partitions)) {
    const auto &[Cost, NodeIDs] = Part;
    OS << "  - P" << PID << " nodes:" << NodeIDs.count() << " cost: " << Cost
       << '|' << formatRatioOf(Cost, SG->getModuleCost()) << "%\n";
  }
}

unsigned SplitProposal::findCheapestPartition() const {
  assert(!Partitions.empty());
  CostType CurCost = std::numeric_limits<CostType>::max();
  unsigned CurPID = InvalidPID;
  for (const auto &[Idx, Part] : enumerate(Partitions)) {
    if (Part.first <= CurCost) {
      CurPID = Idx;
      CurCost = Part.first;
    }
  }
  assert(CurPID != InvalidPID);
  return CurPID;
}

void SplitProposal::calculateScores() {
  if (Partitions.empty())
    return;

  assert(SG);
  CostType LargestPCost = 0;
  for (auto &[PCost, Nodes] : Partitions) {
    if (PCost > LargestPCost)
      LargestPCost = PCost;
  }

  CostType ModuleCost = SG->getModuleCost();
  CodeSizeScore = double(TotalCost) / ModuleCost;
  assert(CodeSizeScore >= 0.0);

  BottleneckScore = double(LargestPCost) / ModuleCost;

  CodeSizeScore = std::ceil(CodeSizeScore * 100.0) / 100.0;
  BottleneckScore = std::ceil(BottleneckScore * 100.0) / 100.0;
}

#ifndef NDEBUG
void SplitProposal::verifyCompleteness() const {
  if (Partitions.empty())
    return;

  BitVector Result = Partitions[0].second;
  for (const auto &P : drop_begin(Partitions))
    Result |= P.second;
  assert(Result.all() && "some nodes are missing from this proposal!");
}
#endif

//===-- RecursiveSearchStrategy -------------------------------------------===//

/// Partitioning algorithm.
///
/// This is a recursive search algorithm that can explore multiple
/// possibilities.
///
/// When a cluster of nodes can go into more than one partition, and we haven't
/// reached maximum search depth, we recurse and explore both options and their
/// consequences. Both branches will yield a proposal, and the driver will
/// grade both and choose the best one.
///
/// If max depth is reached, we will use some heuristics to make a choice. Most
/// of the time we will just use the least-pressured (cheapest) partition, but
/// if a cluster is particularly big and there is a good amount of overlap with
/// an existing partition, we will choose that partition instead.
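///
/// For example, with two ambiguous clusters and MaxDepth >= 2, the search
/// branches twice (cheapest vs. most-similar for each cluster) and submits up
/// to four proposals for the driver to grade.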
class RecursiveSearchSplitting {
public:
  using SubmitProposalFn = function_ref<void(SplitProposal)>;

  RecursiveSearchSplitting(const SplitGraph &SG, unsigned NumParts,
                           SubmitProposalFn SubmitProposal);

  void run();

private:
  struct WorkListEntry {
    WorkListEntry(const BitVector &BV) : Cluster(BV) {}

    unsigned NumNonEntryNodes = 0;
    CostType TotalCost = 0;
    CostType CostExcludingGraphEntryPoints = 0;
    BitVector Cluster;
  };

  /// Collects all graph entry points' clusters and sorts them so the most
  /// expensive clusters are viewed first. This will merge clusters together
  /// if they share a non-copyable dependency.
  void setupWorkList();

  /// Recursive function that assigns the worklist item at \p Idx into a
  /// partition of \p SP.
  ///
  /// \p Depth is the current search depth. When this value is equal to
  /// \ref MaxDepth, we can no longer recurse.
  ///
  /// This function only recurses if there is more than one possible
  /// assignment, otherwise it is iterative to avoid creating a call stack
  /// that is as big as \ref WorkList.
  void pickPartition(unsigned Depth, unsigned Idx, SplitProposal SP);

  /// \return A pair: first element is the PID of the partition that has the
  /// most similarities with \p Entry, or \ref InvalidPID if no partition was
  /// found with at least one element in common. The second element is the
  /// aggregated cost of all dependencies in common between \p Entry and that
  /// partition.
  std::pair<unsigned, CostType>
  findMostSimilarPartition(const WorkListEntry &Entry, const SplitProposal &SP);

  const SplitGraph &SG;
  unsigned NumParts;
  SubmitProposalFn SubmitProposal;

  // A Cluster is considered large when its cost, excluding entry points,
  // exceeds this value.
  CostType LargeClusterThreshold = 0;
  unsigned NumProposalsSubmitted = 0;
  SmallVector<WorkListEntry> WorkList;
};

RecursiveSearchSplitting::RecursiveSearchSplitting(
    const SplitGraph &SG, unsigned NumParts, SubmitProposalFn SubmitProposal)
    : SG(SG), NumParts(NumParts), SubmitProposal(SubmitProposal) {
  // Arbitrary max value as a safeguard. Anything above 10 will already be
  // slow, this is just a max value to prevent extreme resource exhaustion or
  // unbounded run time.
  if (MaxDepth > 16)
    report_fatal_error("[amdgpu-split-module] search depth of " +
                       Twine(MaxDepth) + " is too high!");
  LargeClusterThreshold =
      (LargeFnFactor != 0.0)
          ? CostType(((SG.getModuleCost() / NumParts) * LargeFnFactor))
          : std::numeric_limits<CostType>::max();
  LLVM_DEBUG(dbgs() << "[recursive search] large cluster threshold set at "
                    << LargeClusterThreshold << "\n");
}

void RecursiveSearchSplitting::run() {
  {
    SplitModuleTimer SMT("recursive_search_prepare", "preparing worklist");
    setupWorkList();
  }

  {
    SplitModuleTimer SMT("recursive_search_pick", "partitioning");
    SplitProposal SP(SG, NumParts);
    pickPartition(/*Depth=*/0, /*Idx=*/0, SP);
  }
}

void RecursiveSearchSplitting::setupWorkList() {
  // e.g. if A and B are two worklist items, and they both call a non-copyable
  // dependency C, this does:
  //    A=C
  //    B=C
  // => NodeEC will create a single group (A, B, C) and we create a new
  // WorkList entry for that group.

  EquivalenceClasses<unsigned> NodeEC;
  for (const SplitGraph::Node *N : SG.nodes()) {
    if (!N->isGraphEntryPoint())
      continue;

    NodeEC.insert(N->getID());
    N->visitAllDependencies([&](const SplitGraph::Node &Dep) {
      if (&Dep != N && Dep.isNonCopyable())
        NodeEC.unionSets(N->getID(), Dep.getID());
    });
  }

  for (const auto &Node : NodeEC) {
    if (!Node->isLeader())
      continue;

    BitVector Cluster = SG.createNodesBitVector();
    for (unsigned M : NodeEC.members(*Node)) {
      const SplitGraph::Node &N = SG.getNode(M);
      if (N.isGraphEntryPoint())
        N.getDependencies(Cluster);
    }
    WorkList.emplace_back(std::move(Cluster));
  }

  // Calculate costs and other useful information.
  for (WorkListEntry &Entry : WorkList) {
    for (unsigned NodeID : Entry.Cluster.set_bits()) {
      const SplitGraph::Node &N = SG.getNode(NodeID);
      const CostType Cost = N.getIndividualCost();

      Entry.TotalCost += Cost;
      if (!N.isGraphEntryPoint()) {
        Entry.CostExcludingGraphEntryPoints += Cost;
        ++Entry.NumNonEntryNodes;
      }
    }
  }

  stable_sort(WorkList, [](const WorkListEntry &A, const WorkListEntry &B) {
    if (A.TotalCost != B.TotalCost)
      return A.TotalCost > B.TotalCost;

    if (A.CostExcludingGraphEntryPoints != B.CostExcludingGraphEntryPoints)
      return A.CostExcludingGraphEntryPoints > B.CostExcludingGraphEntryPoints;

    if (A.NumNonEntryNodes != B.NumNonEntryNodes)
      return A.NumNonEntryNodes > B.NumNonEntryNodes;

    return A.Cluster.count() > B.Cluster.count();
  });

  LLVM_DEBUG({
    dbgs() << "[recursive search] worklist:\n";
    for (const auto &[Idx, Entry] : enumerate(WorkList)) {
      dbgs() << "  - [" << Idx << "]: ";
      for (unsigned NodeID : Entry.Cluster.set_bits())
        dbgs() << NodeID << " ";
      dbgs() << "(total_cost:" << Entry.TotalCost
             << ", cost_excl_entries:" << Entry.CostExcludingGraphEntryPoints
             << ")\n";
    }
  });
}

void RecursiveSearchSplitting::pickPartition(unsigned Depth, unsigned Idx,
                                             SplitProposal SP) {
  while (Idx < WorkList.size()) {
    // Step 1: Determine candidate PIDs.
    //
    const WorkListEntry &Entry = WorkList[Idx];
    const BitVector &Cluster = Entry.Cluster;

    // Default option is to do load-balancing, AKA assign to the least
    // pressured partition.
    const unsigned CheapestPID = SP.findCheapestPartition();
    assert(CheapestPID != InvalidPID);

    // Explore assigning to the kernel that contains the most dependencies in
    // common.
    const auto [MostSimilarPID, SimilarDepsCost] =
        findMostSimilarPartition(Entry, SP);

    // We can choose to explore only one path if we only have one valid path,
    // or if we reached maximum search depth and can no longer branch out.
    unsigned SinglePIDToTry = InvalidPID;
    if (MostSimilarPID == InvalidPID) // No similar PID found.
      SinglePIDToTry = CheapestPID;
    else if (MostSimilarPID == CheapestPID) // Both landed on the same PID.
      SinglePIDToTry = CheapestPID;
    else if (Depth >= MaxDepth) {
      // We have to choose one path. Use a heuristic to guess which one will
      // be more appropriate.
      if (Entry.CostExcludingGraphEntryPoints > LargeClusterThreshold) {
        // Check if the amount of code in common makes it worth it.
        assert(SimilarDepsCost && Entry.CostExcludingGraphEntryPoints);
        const double Ratio = static_cast<double>(SimilarDepsCost) /
                             Entry.CostExcludingGraphEntryPoints;
        assert(Ratio >= 0.0 && Ratio <= 1.0);
        if (Ratio > LargeFnOverlapForMerge) {
          // For debug, just print "L", so we'll see "L3=P3" for instance,
          // which will mean we reached max depth and chose P3 based on this
          // heuristic.
          LLVM_DEBUG(dbgs() << 'L');
          SinglePIDToTry = MostSimilarPID;
        }
      } else
        SinglePIDToTry = CheapestPID;
    }
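
    // e.g. with LargeFnOverlapForMerge at its default of 0.7: a cluster whose
    // cost excluding entry points is 300 (above a threshold of, say, 200) and
    // which shares 240 worth of dependencies with MostSimilarPID has a ratio
    // of 0.8, so it is merged into that partition.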

    // Step 2: Explore candidates.

    // When we only explore one possible path, and thus branch depth doesn't
    // increase, do not recurse, iterate instead.
    if (SinglePIDToTry != InvalidPID) {
      LLVM_DEBUG(dbgs() << Idx << "=P" << SinglePIDToTry << ' ');
      // Only one path to explore, don't clone SP, don't increase depth.
      SP.add(SinglePIDToTry, Cluster);
      ++Idx;
      continue;
    }

    assert(MostSimilarPID != InvalidPID);

    // We explore multiple paths: recurse at increased depth, then stop this
    // function.

    LLVM_DEBUG(dbgs() << '\n');

    // lb = load balancing = put in cheapest partition
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [lb] " << Idx << "=P" << CheapestPID << "? ");
      BranchSP.add(CheapestPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    // ms = most similar = put in partition with the most in common
    {
      SplitProposal BranchSP = SP;
      LLVM_DEBUG(dbgs().indent(Depth)
                 << " [ms] " << Idx << "=P" << MostSimilarPID << "? ");
      BranchSP.add(MostSimilarPID, Cluster);
      pickPartition(Depth + 1, Idx + 1, BranchSP);
    }

    return;
  }

  // Step 3: If we assigned all WorkList items, submit the proposal.

  assert(Idx == WorkList.size());
  assert(NumProposalsSubmitted <= (2u << MaxDepth) &&
         "Search got out of bounds?");
  SP.setName("recursive_search (depth=" + std::to_string(Depth) + ") #" +
             std::to_string(NumProposalsSubmitted++));
  LLVM_DEBUG(dbgs() << '\n');
  SubmitProposal(SP);
}

std::pair<unsigned, CostType>
RecursiveSearchSplitting::findMostSimilarPartition(const WorkListEntry &Entry,
                                                   const SplitProposal &SP) {
  if (!Entry.NumNonEntryNodes)
    return {InvalidPID, 0};

  // We take the partition that is the most similar using Cost as a metric.
  // So we take the set of nodes in common, compute their aggregated cost, and
  // pick the partition with the highest cost in common.
  unsigned ChosenPID = InvalidPID;
  CostType ChosenCost = 0;
  for (unsigned PID = 0; PID < NumParts; ++PID) {
    BitVector BV = SP[PID];
    BV &= Entry.Cluster; // FIXME: & doesn't work between BVs?!

    if (BV.none())
      continue;

    const CostType Cost = SG.calculateCost(BV);

    if (ChosenPID == InvalidPID || ChosenCost < Cost ||
        (ChosenCost == Cost && PID > ChosenPID)) {
      ChosenPID = PID;
      ChosenCost = Cost;
    }
  }

  return {ChosenPID, ChosenCost};
}

//===----------------------------------------------------------------------===//
// DOTGraph Printing Support
//===----------------------------------------------------------------------===//

const SplitGraph::Node *mapEdgeToDst(const SplitGraph::Edge *E) {
  return E->Dst;
}

using SplitGraphEdgeDstIterator =
    mapped_iterator<SplitGraph::edges_iterator, decltype(&mapEdgeToDst)>;

} // namespace

template <> struct GraphTraits<SplitGraph> {
  using NodeRef = const SplitGraph::Node *;
  using nodes_iterator = SplitGraph::nodes_iterator;
  using ChildIteratorType = SplitGraphEdgeDstIterator;

  using EdgeRef = const SplitGraph::Edge *;
  using ChildEdgeIteratorType = SplitGraph::edges_iterator;

  static NodeRef getEntryNode(NodeRef N) { return N; }

  static ChildIteratorType child_begin(NodeRef Ref) {
    return {Ref->outgoing_edges().begin(), mapEdgeToDst};
  }
  static ChildIteratorType child_end(NodeRef Ref) {
    return {Ref->outgoing_edges().end(), mapEdgeToDst};
  }

  static nodes_iterator nodes_begin(const SplitGraph &G) {
    return G.nodes().begin();
  }
  static nodes_iterator nodes_end(const SplitGraph &G) {
    return G.nodes().end();
  }
};

template <> struct DOTGraphTraits<SplitGraph> : public DefaultDOTGraphTraits {
  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  static std::string getGraphName(const SplitGraph &SG) {
    return SG.getModule().getName().str();
  }

  std::string getNodeLabel(const SplitGraph::Node *N, const SplitGraph &SG) {
    return N->getName().str();
  }

  static std::string getNodeDescription(const SplitGraph::Node *N,
                                        const SplitGraph &SG) {
    std::string Result;
    if (N->isEntryFunctionCC())
      Result += "entry-fn-cc ";
    if (N->isNonCopyable())
      Result += "non-copyable ";
    Result += "cost:" + std::to_string(N->getIndividualCost());
    return Result;
  }

  static std::string getNodeAttributes(const SplitGraph::Node *N,
                                       const SplitGraph &SG) {
    return N->hasAnyIncomingEdges() ? "" : "color=\"red\"";
  }

  static std::string getEdgeAttributes(const SplitGraph::Node *N,
                                       SplitGraphEdgeDstIterator EI,
                                       const SplitGraph &SG) {
    switch ((*EI.getCurrent())->Kind) {
    case SplitGraph::EdgeKind::DirectCall:
      return "";
    case SplitGraph::EdgeKind::IndirectCall:
      return "style=\"dashed\"";
    }
    llvm_unreachable("Unknown SplitGraph::EdgeKind enum");
  }
};

//===----------------------------------------------------------------------===//
// Driver
//===----------------------------------------------------------------------===//

namespace {

// If we didn't externalize GVs, then local GVs need to be conservatively
// imported into every module (including their initializers), and then cleaned
// up afterwards.
static bool needsConservativeImport(const GlobalValue *GV) {
  if (const auto *Var = dyn_cast<GlobalVariable>(GV))
    return Var->hasLocalLinkage();
  return isa<GlobalAlias>(GV);
}

/// Prints a summary of the partition \p N, represented by module \p M, to
/// \p OS.
static void printPartitionSummary(raw_ostream &OS, unsigned N, const Module &M,
                                  unsigned PartCost, unsigned ModuleCost) {
  OS << "*** Partition P" << N << " ***\n";

  for (const auto &Fn : M) {
    if (!Fn.isDeclaration())
      OS << " - [function] " << Fn.getName() << "\n";
  }

  for (const auto &GV : M.globals()) {
    if (GV.hasInitializer())
      OS << " - [global] " << GV.getName() << "\n";
  }

  OS << "Partition contains " << formatRatioOf(PartCost, ModuleCost)
     << "% of the source\n";
}

static void evaluateProposal(SplitProposal &Best, SplitProposal New) {
  SplitModuleTimer SMT("proposal_evaluation", "proposal ranking algorithm");

  LLVM_DEBUG({
    New.verifyCompleteness();
    if (DebugProposalSearch)
      New.print(dbgs());
  });

  const double CurBScore = Best.getBottleneckScore();
  const double CurCSScore = Best.getCodeSizeScore();
  const double NewBScore = New.getBottleneckScore();
  const double NewCSScore = New.getCodeSizeScore();

  // TODO: Improve this
  //    We can probably lower the precision of the comparison at first
  //    e.g. if we have
  //      - (Current): BScore: 0.489 CSCore 1.105
  //      - (New): BScore: 0.475 CSCore 1.305
  //    Currently we'd choose the new one because the bottleneck score is
  //    lower, but the new one duplicates more code. It may be worth it to
  //    discard the new proposal as the impact on build time is negligible.

  // Compare them.
  bool IsBest = false;
  if (NewBScore < CurBScore)
    IsBest = true;
  else if (NewBScore == CurBScore)
    IsBest = (NewCSScore < CurCSScore); // Use code size as tie breaker.

  if (IsBest)
    Best = std::move(New);

  LLVM_DEBUG(if (DebugProposalSearch) {
    if (IsBest)
      dbgs() << "[search] new best proposal!\n";
    else
      dbgs() << "[search] discarding - not profitable\n";
  });
}

/// Trivial helper to create an identical copy of \p M.
static std::unique_ptr<Module> cloneAll(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap, [&](const GlobalValue *GV) { return true; });
}

/// Writes \p SG as a DOTGraph to \ref ModuleDotCfgOutput if requested.
static void writeDOTGraph(const SplitGraph &SG) {
  if (ModuleDotCfgOutput.empty())
    return;

  std::error_code EC;
  raw_fd_ostream OS(ModuleDotCfgOutput, EC);
  if (EC) {
    errs() << "[" DEBUG_TYPE "]: cannot open '" << ModuleDotCfgOutput
           << "' - DOTGraph will not be printed\n";
    return;
  }
  WriteGraph(OS, SG, /*ShortNames=*/false,
             /*Title=*/SG.getModule().getName());
}

static void splitAMDGPUModule(
    GetTTIFn GetTTI, Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  CallGraph CG(M);

  // Externalize functions whose addresses are taken.
  //
  // This is needed because partitioning is purely based on calls, but
  // sometimes a kernel/function may just look at the address of another local
  // function and not do anything (no calls). After partitioning, that local
  // function may end up in a different module (so it's just a declaration in
  // the module where its address is taken), which emits an "undefined hidden
  // symbol" linker error.
  //
  // Additionally, it guides partitioning to not duplicate this function if
  // it's called directly at some point.
  //
  // TODO: Could we be smarter about this? This makes all functions whose
  // addresses are taken non-copyable. We should probably model this type of
  // constraint in the graph and use it to guide splitting, instead of
  // externalizing like this. Maybe non-copyable should really mean "keep one
  // visible copy, then internalize all other copies" for some functions?
  if (!NoExternalizeOnAddrTaken) {
    for (auto &Fn : M) {
      // TODO: Should aliases count? Probably not but they're so rare I'm not
      // sure it's worth fixing.
      if (Fn.hasLocalLinkage() && Fn.hasAddressTaken()) {
        LLVM_DEBUG(dbgs() << "[externalize] "; Fn.printAsOperand(dbgs());
                   dbgs() << " because its address is taken\n");
        externalize(Fn);
      }
    }
  }

  // Externalize local GVs, which avoids duplicating their initializers, which
  // in turn helps keep code size in check.
  if (!NoExternalizeGlobals) {
    for (auto &GV : M.globals()) {
      if (GV.hasLocalLinkage())
        LLVM_DEBUG(dbgs() << "[externalize] GV " << GV.getName() << '\n');
      externalize(GV);
    }
  }

  // Start by calculating the cost of every function in the module, as well
  // as the module's overall cost.
  FunctionsCostMap FnCosts;
  const CostType ModuleCost = calculateFunctionCosts(GetTTI, M, FnCosts);

  // Build the SplitGraph, which represents the module's functions and models
  // their dependencies accurately.
  SplitGraph SG(M, FnCosts, ModuleCost);
  SG.buildGraph(CG);

  if (SG.empty()) {
    LLVM_DEBUG(
        dbgs()
        << "[!] no nodes in graph, input is empty - no splitting possible\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG({
    dbgs() << "[graph] nodes:\n";
    for (const SplitGraph::Node *N : SG.nodes()) {
      dbgs() << "  - [" << N->getID() << "]: " << N->getName() << " "
             << (N->isGraphEntryPoint() ? "(entry)" : "") << " "
             << (N->isNonCopyable() ? "(noncopyable)" : "") << "\n";
    }
  });

  writeDOTGraph(SG);

  LLVM_DEBUG(dbgs() << "[search] testing splitting strategies\n");

  std::optional<SplitProposal> Proposal;
  const auto EvaluateProposal = [&](SplitProposal SP) {
    SP.calculateScores();
    if (!Proposal)
      Proposal = std::move(SP);
    else
      evaluateProposal(*Proposal, std::move(SP));
  };

  // TODO: It would be very easy to create new strategies by just adding a
  // base class to RecursiveSearchSplitting and abstracting it away.
  RecursiveSearchSplitting(SG, NumParts, EvaluateProposal).run();
  LLVM_DEBUG(if (Proposal) dbgs() << "[search done] selected proposal: "
                                  << Proposal->getName() << "\n";);

  if (!Proposal) {
    LLVM_DEBUG(dbgs() << "[!] no proposal made, no splitting possible!\n");
    ModuleCallback(cloneAll(M));
    return;
  }

  LLVM_DEBUG(Proposal->print(dbgs()););

  std::optional<raw_fd_ostream> SummariesOS;
  if (!PartitionSummariesOutput.empty()) {
    std::error_code EC;
    SummariesOS.emplace(PartitionSummariesOutput, EC);
    if (EC)
      errs() << "[" DEBUG_TYPE "]: cannot open '" << PartitionSummariesOutput
             << "' - Partition summaries will not be printed\n";
  }

  // One module will import all GlobalValues that are not Functions
  // and are not subject to conservative import.
  bool ImportAllGVs = true;

  for (unsigned PID = 0; PID < NumParts; ++PID) {
    SplitModuleTimer SMT2("modules_creation",
                          "creating modules for each partition");
    LLVM_DEBUG(dbgs() << "[split] creating new modules\n");

    DenseSet<const Function *> FnsInPart;
    for (unsigned NodeID : (*Proposal)[PID].set_bits())
      FnsInPart.insert(&SG.getNode(NodeID).getFunction());

    // Don't create empty modules.
    if (FnsInPart.empty()) {
      LLVM_DEBUG(dbgs() << "[split] P" << PID
                        << " is empty, not creating module\n");
      continue;
    }

    ValueToValueMapTy VMap;
    CostType PartCost = 0;
    std::unique_ptr<Module> MPart(
        CloneModule(M, VMap, [&](const GlobalValue *GV) {
          // Functions go in their assigned partition.
          if (const auto *Fn = dyn_cast<Function>(GV)) {
            if (FnsInPart.contains(Fn)) {
              PartCost += SG.getCost(*Fn);
              return true;
            }
            return false;
          }

          // Everything else goes in the first non-empty module we create.
          return ImportAllGVs || needsConservativeImport(GV);
        }));

    ImportAllGVs = false;

    // FIXME: Aliases aren't seen often, and their handling isn't perfect so
    // bugs are possible.

    // Clean up conservatively imported GVs without any users.
    for (auto &GV : make_early_inc_range(MPart->global_values())) {
      if (needsConservativeImport(&GV) && GV.use_empty())
        GV.eraseFromParent();
    }

    if (SummariesOS)
      printPartitionSummary(*SummariesOS, PID, *MPart, PartCost, ModuleCost);

    LLVM_DEBUG(
        printPartitionSummary(dbgs(), PID, *MPart, PartCost, ModuleCost));

    ModuleCallback(std::move(MPart));
  }
}
} // namespace

PreservedAnalyses AMDGPUSplitModulePass::run(Module &M,
                                             ModuleAnalysisManager &MAM) {
  SplitModuleTimer SMT(
      "total", "total pass runtime (incl. potentially waiting for lockfile)");

  FunctionAnalysisManager &FAM =
      MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  const auto TTIGetter = [&FAM](Function &F) -> const TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  bool Done = false;
#ifndef NDEBUG
  if (UseLockFile) {
    SmallString<128> LockFilePath;
    sys::path::system_temp_directory(/*ErasedOnReboot=*/true, LockFilePath);
    sys::path::append(LockFilePath, "amdgpu-split-module-debug");
    LLVM_DEBUG(dbgs() << DEBUG_TYPE " using lockfile '" << LockFilePath
                      << "'\n");

    while (true) {
      llvm::LockFileManager Lock(LockFilePath.str());
      bool Owned;
      if (Error Err = Lock.tryLock().moveInto(Owned)) {
        consumeError(std::move(Err));
        LLVM_DEBUG(
            dbgs() << "[amdgpu-split-module] unable to acquire lockfile, debug "
                      "output may be mangled by other processes\n");
      } else if (!Owned) {
        switch (Lock.waitForUnlockFor(std::chrono::seconds(90))) {
        case WaitForUnlockResult::Success:
          break;
        case WaitForUnlockResult::OwnerDied:
          continue; // Try again to get the lock.
        case WaitForUnlockResult::Timeout:
          LLVM_DEBUG(
              dbgs()
              << "[amdgpu-split-module] unable to acquire lockfile, debug "
                 "output may be mangled by other processes\n");
          Lock.unsafeMaybeUnlock();
          break; // Give up.
        }
      }

      splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);
      Done = true;
      break;
    }
  }
#endif

  if (!Done)
    splitAMDGPUModule(TTIGetter, M, N, ModuleCallback);

  // We can change linkage/visibilities in the input, consider that nothing is
  // preserved just to be safe. This pass runs last anyway.
  return PreservedAnalyses::none();
}
} // namespace llvm