| //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts ----------==// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements support for context disambiguation of allocation |
| // calls for profile guided heap optimization. Specifically, it uses Memprof |
| // profiles which indicate context specific allocation behavior (currently |
| // distinguishing cold vs hot memory allocations). Cloning is performed to |
| // expose the cold allocation call contexts, and the allocation calls are |
| // subsequently annotated with an attribute for later transformation. |
| // |
| // The transformations can be performed either directly on IR (regular LTO), or |
| // on a ThinLTO index (and later applied to the IR during the ThinLTO backend). |
| // Both types of LTO operate on the same base graph representation, which |
| // uses CRTP to support either IR or Index formats. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" |
| #include "llvm/ADT/DenseMap.h" |
| #include "llvm/ADT/DenseSet.h" |
| #include "llvm/ADT/MapVector.h" |
| #include "llvm/ADT/SetOperations.h" |
| #include "llvm/ADT/SmallPtrSet.h" |
| #include "llvm/ADT/SmallSet.h" |
| #include "llvm/ADT/SmallVector.h" |
| #include "llvm/ADT/Statistic.h" |
| #include "llvm/Analysis/MemoryProfileInfo.h" |
| #include "llvm/Analysis/ModuleSummaryAnalysis.h" |
| #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| #include "llvm/Bitcode/BitcodeReader.h" |
| #include "llvm/IR/Instructions.h" |
| #include "llvm/IR/Module.h" |
| #include "llvm/IR/ModuleSummaryIndex.h" |
| #include "llvm/Pass.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Support/GraphWriter.h" |
| #include "llvm/Support/InterleavedRange.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Transforms/IPO.h" |
| #include "llvm/Transforms/Utils/CallPromotionUtils.h" |
| #include "llvm/Transforms/Utils/Cloning.h" |
| #include "llvm/Transforms/Utils/Instrumentation.h" |
| #include <deque> |
| #include <sstream> |
| #include <unordered_map> |
| #include <vector> |
| using namespace llvm; |
| using namespace llvm::memprof; |
| |
| #define DEBUG_TYPE "memprof-context-disambiguation" |
| |
| STATISTIC(FunctionClonesAnalysis, |
| "Number of function clones created during whole program analysis"); |
| STATISTIC(FunctionClonesThinBackend, |
| "Number of function clones created during ThinLTO backend"); |
| STATISTIC(FunctionsClonedThinBackend, |
| "Number of functions that had clones created during ThinLTO backend"); |
| STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly " |
| "cloned) during whole program analysis"); |
| STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) " |
| "during whole program analysis"); |
| STATISTIC(AllocTypeNotColdThinBackend, |
| "Number of not cold static allocations (possibly cloned) during " |
| "ThinLTO backend"); |
| STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations " |
| "(possibly cloned) during ThinLTO backend"); |
| STATISTIC(OrigAllocsThinBackend, |
| "Number of original (not cloned) allocations with memprof profiles " |
| "during ThinLTO backend"); |
| STATISTIC( |
| AllocVersionsThinBackend, |
| "Number of allocation versions (including clones) during ThinLTO backend"); |
| STATISTIC(MaxAllocVersionsThinBackend, |
| "Maximum number of allocation versions created for an original " |
| "allocation during ThinLTO backend"); |
| STATISTIC(UnclonableAllocsThinBackend, |
| "Number of unclonable ambigous allocations during ThinLTO backend"); |
| STATISTIC(RemovedEdgesWithMismatchedCallees, |
| "Number of edges removed due to mismatched callees (profiled vs IR)"); |
| STATISTIC(FoundProfiledCalleeCount, |
| "Number of profiled callees found via tail calls"); |
| STATISTIC(FoundProfiledCalleeDepth, |
| "Aggregate depth of profiled callees found via tail calls"); |
| STATISTIC(FoundProfiledCalleeMaxDepth, |
| "Maximum depth of profiled callees found via tail calls"); |
| STATISTIC(FoundProfiledCalleeNonUniquelyCount, |
| "Number of profiled callees found via multiple tail call chains"); |
| STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning"); |
| STATISTIC(NewMergedNodes, "Number of new nodes created during merging"); |
| STATISTIC(NonNewMergedNodes, "Number of non-new nodes used during merging"); |
| STATISTIC(MissingAllocForContextId, |
| "Number of missing alloc nodes for context ids"); |
| |
| static cl::opt<std::string> DotFilePathPrefix( |
| "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, |
| cl::value_desc("filename"), |
| cl::desc("Specify the path prefix of the MemProf dot files.")); |
| |
| static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false), |
| cl::Hidden, |
| cl::desc("Export graph to dot files.")); |
| |
| // How much of the graph to export to dot. |
| enum DotScope { |
| All, // The full CCG graph. |
| Alloc, // Only contexts for the specified allocation. |
| Context, // Only the specified context. |
| }; |
| |
| static cl::opt<DotScope> DotGraphScope( |
| "memprof-dot-scope", cl::desc("Scope of graph to export to dot"), |
| cl::Hidden, cl::init(DotScope::All), |
| cl::values( |
| clEnumValN(DotScope::All, "all", "Export full callsite graph"), |
| clEnumValN(DotScope::Alloc, "alloc", |
| "Export only nodes with contexts feeding given " |
| "-memprof-dot-alloc-id"), |
| clEnumValN(DotScope::Context, "context", |
| "Export only nodes with given -memprof-dot-context-id"))); |
| |
| static cl::opt<unsigned> |
| AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden, |
| cl::desc("Id of alloc to export if -memprof-dot-scope=alloc " |
| "or to highlight if -memprof-dot-scope=all")); |
| |
| static cl::opt<unsigned> ContextIdForDot( |
| "memprof-dot-context-id", cl::init(0), cl::Hidden, |
| cl::desc("Id of context to export if -memprof-dot-scope=context or to " |
| "highlight otherwise")); |
| |
| static cl::opt<bool> |
| DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, |
| cl::desc("Dump CallingContextGraph to stdout after each stage.")); |
| |
| static cl::opt<bool> |
| VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, |
| cl::desc("Perform verification checks on CallingContextGraph.")); |
| |
| static cl::opt<bool> |
| VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, |
| cl::desc("Perform frequent verification checks on nodes.")); |
| |
| static cl::opt<std::string> MemProfImportSummary( |
| "memprof-import-summary", |
| cl::desc("Import summary to use for testing the ThinLTO backend via opt"), |
| cl::Hidden); |
| |
| static cl::opt<unsigned> |
| TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), |
| cl::Hidden, |
| cl::desc("Max depth to recursively search for missing " |
| "frames through tail calls.")); |
| |
| // Optionally enable cloning of callsites involved in recursive cycles. |
| static cl::opt<bool> AllowRecursiveCallsites( |
| "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden, |
| cl::desc("Allow cloning of callsites involved in recursive cycles")); |
| |
| static cl::opt<bool> CloneRecursiveContexts( |
| "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden, |
| cl::desc("Allow cloning of contexts through recursive cycles")); |
| |
| // Generally this is needed for correct assignment of allocation clones to |
| // function clones; however, allow it to be disabled for debugging while the |
| // functionality is new and being tested more widely. |
| static cl::opt<bool> |
| MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden, |
| cl::desc("Merge clones before assigning functions")); |
| |
| // When disabled, try to detect and prevent cloning of recursive contexts. |
| // This is only necessary until we support cloning through recursive cycles. |
| // Leave on by default for now, as disabling it incurs a small compile-time |
| // overhead and doesn't affect correctness; leaving it on just inflates the |
| // cold hinted bytes reporting a bit when -memprof-report-hinted-sizes is |
| // enabled. |
| static cl::opt<bool> AllowRecursiveContexts( |
| "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden, |
| cl::desc("Allow cloning of contexts having recursive cycles")); |
| |
| namespace llvm { |
| cl::opt<bool> EnableMemProfContextDisambiguation( |
| "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, |
| cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation")); |
| |
| // Indicate we are linking with an allocator that supports hot/cold operator |
| // new interfaces. |
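| // These are e.g. operator new overloads taking a __hot_cold_t hint |
| // parameter. |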
| cl::opt<bool> SupportsHotColdNew( |
| "supports-hot-cold-new", cl::init(false), cl::Hidden, |
| cl::desc("Linking with hot/cold operator new interfaces")); |
| |
| static cl::opt<bool> MemProfRequireDefinitionForPromotion( |
| "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, |
| cl::desc( |
| "Require target function definition when promoting indirect calls")); |
| } // namespace llvm |
| |
| extern cl::opt<bool> MemProfReportHintedSizes; |
| extern cl::opt<unsigned> MinClonedColdBytePercent; |
| |
| namespace { |
| /// CRTP base for graphs built from either IR or ThinLTO summary index. |
| /// |
| /// The graph represents the call contexts in all memprof metadata on allocation |
| /// calls, with nodes for the allocations themselves, as well as for the calls |
| /// in each context. The graph is initially built from the allocation memprof |
| /// metadata (or summary) MIBs. It is then updated to match calls with callsite |
| /// metadata onto the nodes, reflecting any inlining performed on those |
| /// calls. |
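| /// |
| /// For illustration, a simplified sketch of the input IR metadata (stack ids |
| /// hypothetical, listed leaf first): |
| ///   call ptr @malloc(i64 8), !memprof !0, !callsite !5 |
| ///   !0 = !{!1, !3}            ; list of MIBs |
| ///   !1 = !{!2, !"cold"}       ; MIB: call stack and allocation type |
| ///   !2 = !{i64 123, i64 456} |
| ///   !3 = !{!4, !"notcold"} |
| ///   !4 = !{i64 123, i64 789} |
| ///   !5 = !{i64 123}           ; stack id(s) of the allocation call itself |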
| /// |
| /// Each MIB (representing an allocation's call context with allocation |
| /// behavior) is assigned a unique context id during the graph build. The edges |
| /// and nodes in the graph are decorated with the context ids they carry. This |
| /// is used to correctly update the graph when cloning is performed so that we |
| /// can uniquify the context for a single (possibly cloned) allocation. |
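| /// |
| /// In the sketch above, the cold MIB might be assigned context id 1 and the |
| /// notcold MIB context id 2; the caller edge from the allocation node to the |
| /// node for stack id 456 would then carry {1}, and the edge to the node for |
| /// stack id 789 would carry {2}. |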
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| class CallsiteContextGraph { |
| public: |
| CallsiteContextGraph() = default; |
| CallsiteContextGraph(const CallsiteContextGraph &) = default; |
| CallsiteContextGraph(CallsiteContextGraph &&) = default; |
| |
| /// Main entry point to perform analysis and transformations on graph. |
| bool process(); |
| |
| /// Perform cloning on the graph necessary to uniquely identify the allocation |
| /// behavior of an allocation based on its context. |
| void identifyClones(); |
| |
| /// Assign callsite clones to functions, cloning functions as needed to |
| /// accommodate the combinations of their callsite clones reached by callers. |
| /// For regular LTO this clones functions and callsites in the IR, but for |
| /// ThinLTO the cloning decisions are noted in the summaries and later applied |
| /// in applyImport. |
| bool assignFunctions(); |
| |
| void dump() const; |
| void print(raw_ostream &OS) const; |
| void printTotalSizes(raw_ostream &OS) const; |
| |
| friend raw_ostream &operator<<(raw_ostream &OS, |
| const CallsiteContextGraph &CCG) { |
| CCG.print(OS); |
| return OS; |
| } |
| |
| friend struct GraphTraits< |
| const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; |
| friend struct DOTGraphTraits< |
| const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; |
| |
| void exportToDot(std::string Label) const; |
| |
| /// Represents a function clone via FuncTy pointer and clone number pair. |
| struct FuncInfo final |
| : public std::pair<FuncTy *, unsigned /*Clone number*/> { |
| using Base = std::pair<FuncTy *, unsigned>; |
| FuncInfo(const Base &B) : Base(B) {} |
| FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} |
| explicit operator bool() const { return this->first != nullptr; } |
| FuncTy *func() const { return this->first; } |
| unsigned cloneNo() const { return this->second; } |
| }; |
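| // For example, FuncInfo(F, 0) denotes the original function F, while a |
| // nonzero clone number identifies one of its clones. |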
| |
| /// Represents a callsite clone via CallTy and clone number pair. |
| struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> { |
| using Base = std::pair<CallTy, unsigned>; |
| CallInfo(const Base &B) : Base(B) {} |
| CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) |
| : Base(Call, CloneNo) {} |
| explicit operator bool() const { return (bool)this->first; } |
| CallTy call() const { return this->first; } |
| unsigned cloneNo() const { return this->second; } |
| void setCloneNo(unsigned N) { this->second = N; } |
| void print(raw_ostream &OS) const { |
| if (!operator bool()) { |
| assert(!cloneNo()); |
| OS << "null Call"; |
| return; |
| } |
| call()->print(OS); |
| OS << "\t(clone " << cloneNo() << ")"; |
| } |
| void dump() const { |
| print(dbgs()); |
| dbgs() << "\n"; |
| } |
| friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { |
| Call.print(OS); |
| return OS; |
| } |
| }; |
| |
| struct ContextEdge; |
| |
| /// Node in the Callsite Context Graph |
| struct ContextNode { |
| // Keep this for now since in the IR case, where we have an Instruction*, it |
| // is not as immediately discoverable. Used for printing richer information |
| // when dumping the graph. |
| bool IsAllocation; |
| |
| // Keeps track of when the Call was reset to null because there was |
| // recursion. |
| bool Recursive = false; |
| |
| // This will be formed by ORing together the AllocationType enum values |
| // for contexts including this node. |
| uint8_t AllocTypes = 0; |
| |
| // The corresponding allocation or interior call. This is the primary call |
| // for which we have created this node. |
| CallInfo Call; |
| |
| // List of other calls that can be treated the same as the primary call |
| // through cloning, i.e. they are located in the same function and have the |
| // same (possibly pruned) stack ids. They will be updated the same way as the |
| // primary call when assigning to function clones. |
| SmallVector<CallInfo, 0> MatchingCalls; |
| |
| // For alloc nodes this is a unique id assigned when constructed, and for |
| // callsite stack nodes it is the original stack id when the node is |
| // constructed from the memprof MIB metadata on the alloc nodes. Note that |
| // this is only used when matching callsite metadata onto the stack nodes |
| // created when processing the allocation memprof MIBs, and for labeling |
| // nodes in the dot graph. Therefore we don't bother to assign a value for |
| // clones. |
| uint64_t OrigStackOrAllocId = 0; |
| |
| // Edges to all callees in the profiled call stacks. |
| // TODO: Should this be a map (from Callee node) for more efficient lookup? |
| std::vector<std::shared_ptr<ContextEdge>> CalleeEdges; |
| |
| // Edges to all callers in the profiled call stacks. |
| // TODO: Should this be a map (from Caller node) for more efficient lookup? |
| std::vector<std::shared_ptr<ContextEdge>> CallerEdges; |
| |
| // Returns true if we need to look at the caller edges for determining the |
| // node context ids and allocation type. |
| bool useCallerEdgesForContextInfo() const { |
| // Typically if the callee edges are empty either the caller edges are |
| // also empty, or this is an allocation (leaf node). However, if we are |
| // allowing recursive callsites and contexts this will be violated for |
| // incompletely cloned recursive cycles. |
| assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation || |
| (AllowRecursiveCallsites && AllowRecursiveContexts)); |
| // When cloning for a recursive context, we might be in the midst of |
| // cloning for a recurrence and have moved context ids off of a |
| // caller edge onto the clone but not yet off of the incoming caller |
| // (back) edge. If we don't look at those we miss the fact that this node |
| // still has context ids of interest. |
| return IsAllocation || CloneRecursiveContexts; |
| } |
| |
| // Compute the context ids for this node from the union of its edge context |
| // ids. |
| DenseSet<uint32_t> getContextIds() const { |
| unsigned Count = 0; |
| // Compute the number of ids for reserve below. In general we only need to |
| // look at one set of edges, typically the callee edges, since other than |
| // allocations and in some cases during recursion cloning, all the context |
| // ids on the callers should also flow out via callee edges. |
| for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges) |
| Count += Edge->getContextIds().size(); |
| DenseSet<uint32_t> ContextIds; |
| ContextIds.reserve(Count); |
| auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>( |
| CalleeEdges, useCallerEdgesForContextInfo() |
| ? CallerEdges |
| : std::vector<std::shared_ptr<ContextEdge>>()); |
| for (const auto &Edge : Edges) |
| ContextIds.insert_range(Edge->getContextIds()); |
| return ContextIds; |
| } |
| |
| // Compute the allocation type for this node from the OR of its edge |
| // allocation types. |
| uint8_t computeAllocType() const { |
| uint8_t BothTypes = |
| (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
| uint8_t AllocType = (uint8_t)AllocationType::None; |
| auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>( |
| CalleeEdges, useCallerEdgesForContextInfo() |
| ? CallerEdges |
| : std::vector<std::shared_ptr<ContextEdge>>()); |
| for (const auto &Edge : Edges) { |
| AllocType |= Edge->AllocTypes; |
| // Bail early if alloc type reached both, no further refinement. |
| if (AllocType == BothTypes) |
| return AllocType; |
| } |
| return AllocType; |
| } |
| |
| // The context id set for this node is empty if its edge context ids are |
| // also all empty. |
| bool emptyContextIds() const { |
| auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>( |
| CalleeEdges, useCallerEdgesForContextInfo() |
| ? CallerEdges |
| : std::vector<std::shared_ptr<ContextEdge>>()); |
| for (const auto &Edge : Edges) { |
| if (!Edge->getContextIds().empty()) |
| return false; |
| } |
| return true; |
| } |
| |
| // List of clones of this ContextNode, initially empty. |
| std::vector<ContextNode *> Clones; |
| |
| // If a clone, points to the original uncloned node. |
| ContextNode *CloneOf = nullptr; |
| |
| ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} |
| |
| ContextNode(bool IsAllocation, CallInfo C) |
| : IsAllocation(IsAllocation), Call(C) {} |
| |
| void addClone(ContextNode *Clone) { |
| if (CloneOf) { |
| CloneOf->Clones.push_back(Clone); |
| Clone->CloneOf = CloneOf; |
| } else { |
| Clones.push_back(Clone); |
| assert(!Clone->CloneOf); |
| Clone->CloneOf = this; |
| } |
| } |
| |
| ContextNode *getOrigNode() { |
| if (!CloneOf) |
| return this; |
| return CloneOf; |
| } |
| |
| void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, |
| unsigned int ContextId); |
| |
| ContextEdge *findEdgeFromCallee(const ContextNode *Callee); |
| ContextEdge *findEdgeFromCaller(const ContextNode *Caller); |
| void eraseCalleeEdge(const ContextEdge *Edge); |
| void eraseCallerEdge(const ContextEdge *Edge); |
| |
| void setCall(CallInfo C) { Call = C; } |
| |
| bool hasCall() const { return (bool)Call.call(); } |
| |
| void printCall(raw_ostream &OS) const { Call.print(OS); } |
| |
| // True if this node was effectively removed from the graph, in which case |
| // it should have an allocation type of None and empty context ids. |
| bool isRemoved() const { |
| // Typically if the callee edges are empty either the caller edges are |
| // also empty, or this is an allocation (leaf node). However, if we are |
| // allowing recursive callsites and contexts this will be violated for |
| // incompletely cloned recursive cycles. |
| assert((AllowRecursiveCallsites && AllowRecursiveContexts) || |
| (AllocTypes == (uint8_t)AllocationType::None) == |
| emptyContextIds()); |
| return AllocTypes == (uint8_t)AllocationType::None; |
| } |
| |
| void dump() const; |
| void print(raw_ostream &OS) const; |
| |
| friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { |
| Node.print(OS); |
| return OS; |
| } |
| }; |
| |
| /// Edge in the Callsite Context Graph from a ContextNode N to a caller or |
| /// callee. |
| struct ContextEdge { |
| ContextNode *Callee; |
| ContextNode *Caller; |
| |
| // This will be formed by ORing together the AllocationType enum values |
| // for contexts including this edge. |
| uint8_t AllocTypes = 0; |
| |
| // Set just before initiating cloning when cloning of recursive contexts is |
| // enabled. Used to defer cloning of backedges until we have done cloning of |
| // the callee node for non-backedge caller edges. This exposes cloning |
| // opportunities through the backedge of the cycle. |
| // TODO: Note that this is not updated during cloning, and it is unclear |
| // whether that would be needed. |
| bool IsBackedge = false; |
| |
| // The set of IDs for contexts including this edge. |
| DenseSet<uint32_t> ContextIds; |
| |
| ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, |
| DenseSet<uint32_t> ContextIds) |
| : Callee(Callee), Caller(Caller), AllocTypes(AllocType), |
| ContextIds(std::move(ContextIds)) {} |
| |
| DenseSet<uint32_t> &getContextIds() { return ContextIds; } |
| |
| // Helper to clear the fields of this edge when we are removing it from the |
| // graph. |
| inline void clear() { |
| ContextIds.clear(); |
| AllocTypes = (uint8_t)AllocationType::None; |
| Caller = nullptr; |
| Callee = nullptr; |
| } |
| |
| // Check if edge was removed from the graph. This is useful while iterating |
| // over a copy of edge lists when performing operations that mutate the |
| // graph in ways that might remove one of the edges. |
| inline bool isRemoved() const { |
| if (Callee || Caller) |
| return false; |
| // Any edges that have been removed from the graph but are still in a |
| // shared_ptr somewhere should have all fields null'ed out by clear() |
| // above. |
| assert(AllocTypes == (uint8_t)AllocationType::None); |
| assert(ContextIds.empty()); |
| return true; |
| } |
| |
| void dump() const; |
| void print(raw_ostream &OS) const; |
| |
| friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { |
| Edge.print(OS); |
| return OS; |
| } |
| }; |
| |
| /// Helpers to remove edges that have allocation type None (due to not |
| /// carrying any context ids) after transformations. |
| void removeNoneTypeCalleeEdges(ContextNode *Node); |
| void removeNoneTypeCallerEdges(ContextNode *Node); |
| void |
| recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node, |
| DenseSet<const ContextNode *> &Visited); |
| |
| protected: |
| /// Get the list of stack ids in the given callsite context that have |
| /// corresponding context nodes. |
| template <class NodeT, class IteratorT> |
| std::vector<uint64_t> |
| getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext); |
| |
| /// Adds nodes for the given allocation and any stack ids on its memprof MIB |
| /// metadata (or summary). |
| ContextNode *addAllocNode(CallInfo Call, const FuncTy *F); |
| |
| /// Adds nodes for the given MIB stack ids. |
| template <class NodeT, class IteratorT> |
| void addStackNodesForMIB(ContextNode *AllocNode, |
| CallStack<NodeT, IteratorT> &StackContext, |
| CallStack<NodeT, IteratorT> &CallsiteContext, |
| AllocationType AllocType, |
| ArrayRef<ContextTotalSize> ContextSizeInfo); |
| |
| /// Matches all callsite metadata (or summary) to the nodes created for |
| /// allocation memprof MIB metadata, synthesizing new nodes to reflect any |
| /// inlining performed on those callsite instructions. |
| void updateStackNodes(); |
| |
| /// Update graph to conservatively handle any callsite stack nodes that |
| /// target multiple different callee functions. |
| void handleCallsitesWithMultipleTargets(); |
| |
| /// Mark backedges via the standard DFS based backedge algorithm. |
| void markBackedges(); |
| |
| /// Merge clones generated during cloning for different allocations but that |
| /// are called by the same caller node, to ensure proper function assignment. |
| void mergeClones(); |
| |
| // Try to partition calls on the given node (already placed into the AllCalls |
| // array) by callee function, creating new copies of Node as needed to hold |
| // calls with different callees, and moving the callee edges appropriately. |
| // Returns true if partitioning was successful. |
| bool partitionCallsByCallee( |
| ContextNode *Node, ArrayRef<CallInfo> AllCalls, |
| std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode); |
| |
| /// Save lists of calls with MemProf metadata in each function, for faster |
| /// iteration. |
| MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata; |
| |
| /// Map from callsite node to the enclosing caller function. |
| std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc; |
| |
| // When exporting to dot, and an allocation id is specified, contains the |
| // context ids on that allocation. |
| DenseSet<uint32_t> DotAllocContextIds; |
| |
| private: |
| using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator; |
| |
| // Structure to keep track of information for each call as we are matching |
| // non-allocation callsites onto context nodes created from the allocation |
| // call metadata / summary contexts. |
| struct CallContextInfo { |
| // The callsite we're trying to match. |
| CallTy Call; |
| // The callsites stack ids that have a context node in the graph. |
| std::vector<uint64_t> StackIds; |
| // The function containing this callsite. |
| const FuncTy *Func; |
| // Initially empty, if needed this will be updated to contain the context |
| // ids for use in a new context node created for this callsite. |
| DenseSet<uint32_t> ContextIds; |
| }; |
| |
| /// Helper to remove edge from graph, updating edge iterator if it is provided |
| /// (in which case CalleeIter indicates which edge list is being iterated). |
| /// This will also perform the necessary clearing of the ContextEdge members |
| /// to enable later checking if the edge has been removed (since we may have |
| /// other copies of the shared_ptr in existence, and in fact rely on this to |
| /// enable removal while iterating over a copy of a node's edge list). |
| void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr, |
| bool CalleeIter = true); |
| |
| /// Assigns the given Node to calls at or inlined into the location with |
| /// the Node's stack id, after post order traversing and processing its |
| /// caller nodes. Uses the call information recorded in the given |
| /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences |
| /// as needed. Called by updateStackNodes which sets up the given |
| /// StackIdToMatchingCalls map. |
| void assignStackNodesPostOrder( |
| ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls, |
| DenseMap<CallInfo, CallInfo> &CallToMatchingCall); |
| |
| /// Duplicates the given set of context ids, updating the provided |
| /// map from each original id with the newly generated context ids, |
| /// and returning the new duplicated id set. |
| DenseSet<uint32_t> duplicateContextIds( |
| const DenseSet<uint32_t> &StackSequenceContextIds, |
| DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); |
| |
| /// Propagates all duplicated context ids across the graph. |
| void propagateDuplicateContextIds( |
| const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); |
| |
| /// Connect the NewNode to OrigNode's callees if TowardsCallee is true, |
| /// else to its callers. Also updates OrigNode's edges to remove any context |
| /// ids moved to the newly created edge. |
| void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, |
| bool TowardsCallee, |
| DenseSet<uint32_t> RemainingContextIds); |
| |
| /// Get the stack id corresponding to the given Id or Index (for IR this will |
| /// return itself, for a summary index this will return the id recorded in the |
| /// index for that stack id index value). |
| uint64_t getStackId(uint64_t IdOrIndex) const { |
| return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex); |
| } |
| |
| /// Returns true if the given call targets the callee of the given edge, or if |
| /// we were able to identify the call chain through intermediate tail calls. |
| /// In the latter case new context nodes are added to the graph for the |
| /// identified tail calls, and their synthesized nodes are added to |
| /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for |
| /// the updated edges and to prepare it for an increment in the caller. |
| bool |
| calleesMatch(CallTy Call, EdgeIter &EI, |
| MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap); |
| |
| // Return the callee function of the given call, or nullptr if it can't be |
| // determined. |
| const FuncTy *getCalleeFunc(CallTy Call) { |
| return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call); |
| } |
| |
| /// Returns true if the given call targets the given function, or if we were |
| /// able to identify the call chain through intermediate tail calls (in which |
| /// case FoundCalleeChain will be populated). |
| bool calleeMatchesFunc( |
| CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc, |
| std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) { |
| return static_cast<DerivedCCG *>(this)->calleeMatchesFunc( |
| Call, Func, CallerFunc, FoundCalleeChain); |
| } |
| |
| /// Returns true if both call instructions have the same callee. |
| bool sameCallee(CallTy Call1, CallTy Call2) { |
| return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2); |
| } |
| |
| /// Get the list of stack ids in the given callsite's context that have |
| /// corresponding context nodes. |
| std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) { |
| return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall( |
| Call); |
| } |
| |
| /// Get the last stack id in the context for the callsite. |
| uint64_t getLastStackId(CallTy Call) { |
| return static_cast<DerivedCCG *>(this)->getLastStackId(Call); |
| } |
| |
| /// Update the allocation call to record type of allocated memory. |
| void updateAllocationCall(CallInfo &Call, AllocationType AllocType) { |
| AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++; |
| static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType); |
| } |
| |
| /// Get the AllocationType assigned to the given allocation instruction clone. |
| AllocationType getAllocationCallType(const CallInfo &Call) const { |
| return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call); |
| } |
| |
| /// Update non-allocation call to invoke (possibly cloned) function |
| /// CalleeFunc. |
| void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { |
| static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc); |
| } |
| |
| /// Clone the given function for the given callsite, recording the mapping of |
| /// all of the function's tracked calls to their new versions in the CallMap. |
| /// Assigns new clones to clone number CloneNo. |
| FuncInfo cloneFunctionForCallsite( |
| FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
| std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
| return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite( |
| Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); |
| } |
| |
| /// Gets a label to use in the dot graph for the given call clone in the given |
| /// function. |
| std::string getLabel(const FuncTy *Func, const CallTy Call, |
| unsigned CloneNo) const { |
| return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo); |
| } |
| |
| // Create and return a new ContextNode. |
| ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr, |
| CallInfo C = CallInfo()) { |
| NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C)); |
| auto *NewNode = NodeOwner.back().get(); |
| if (F) |
| NodeToCallingFunc[NewNode] = F; |
| return NewNode; |
| } |
| |
| /// Helpers to find the node corresponding to the given call or stack id. |
| ContextNode *getNodeForInst(const CallInfo &C); |
| ContextNode *getNodeForAlloc(const CallInfo &C); |
| ContextNode *getNodeForStackId(uint64_t StackId); |
| |
| /// Computes the alloc type corresponding to the given context ids, by |
| /// unioning their recorded alloc types. |
| uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const; |
| |
| /// Returns the allocation type of the intersection of the contexts of two |
| /// nodes (based on their provided context id sets), optimized for the case |
| /// when Node1Ids is smaller than Node2Ids. |
| uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids, |
| const DenseSet<uint32_t> &Node2Ids) const; |
| |
| /// Returns the allocation type of the intersection of the contexts of two |
| /// nodes (based on their provided context id sets). |
| uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids, |
| const DenseSet<uint32_t> &Node2Ids) const; |
| |
| /// Create a clone of Edge's callee and move Edge to that new callee node, |
| /// performing the necessary context id and allocation type updates. |
| /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are |
| /// moved to an edge to the new callee. |
| ContextNode * |
| moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
| DenseSet<uint32_t> ContextIdsToMove = {}); |
| |
| /// Change the callee of Edge to existing callee clone NewCallee, performing |
| /// the necessary context id and allocation type updates. |
| /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are |
| /// moved to an edge to the new callee. |
| void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
| ContextNode *NewCallee, |
| bool NewClone = false, |
| DenseSet<uint32_t> ContextIdsToMove = {}); |
| |
| /// Change the caller of the given callee edge to be NewCaller, performing |
| /// the necessary context id and allocation type updates. This is similar to |
| /// the above moveEdgeToExistingCalleeClone, but a simplified version of it |
| /// as we always move the given edge and all of its context ids. |
| void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge, |
| ContextNode *NewCaller); |
| |
| /// Recursive helper for marking backedges via DFS. |
| void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| DenseSet<const ContextNode *> &CurrentStack); |
| |
| /// Recursive helper for merging clones. |
| void |
| mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode); |
| /// Main worker for merging callee clones for a given node. |
| void mergeNodeCalleeClones( |
| ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode); |
| /// Helper to find other callers of the given set of callee edges that can |
| /// share the same callee merge node. |
| void findOtherCallersToShareMerge( |
| ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges, |
| DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode, |
| DenseSet<ContextNode *> &OtherCallersToShareMerge); |
| |
| /// Recursively perform cloning on the graph for the given Node and its |
| /// callers, in order to uniquely identify the allocation behavior of an |
| /// allocation given its context. The context ids of the allocation being |
| /// processed are given in AllocContextIds. |
| void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| const DenseSet<uint32_t> &AllocContextIds); |
| |
| /// Map from each context ID to the AllocationType assigned to that context. |
| DenseMap<uint32_t, AllocationType> ContextIdToAllocationType; |
| |
| /// Map from each contextID to the profiled full contexts and their total |
| /// sizes (there may be more than one due to context trimming), |
| /// optionally populated when requested (via MemProfReportHintedSizes or |
| /// MinClonedColdBytePercent). |
| DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos; |
| |
| /// Identifies the context node created for a stack id when adding the MIB |
| /// contexts to the graph. This is used to locate the context nodes when |
| /// trying to assign the corresponding callsites with those stack ids to these |
| /// nodes. |
| DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap; |
| |
| /// Maps to track the calls to their corresponding nodes in the graph. |
| MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap; |
| MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap; |
| |
| /// Owner of all ContextNode unique_ptrs. |
| std::vector<std::unique_ptr<ContextNode>> NodeOwner; |
| |
| /// Perform sanity checks on graph when requested. |
| void check() const; |
| |
| /// Keeps track of the last unique context id assigned. |
| unsigned int LastContextId = 0; |
| }; |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| using ContextNode = |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode; |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| using ContextEdge = |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge; |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| using FuncInfo = |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo; |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| using CallInfo = |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo; |
| |
| /// CRTP derived class for graphs built from IR (regular LTO). |
| class ModuleCallsiteContextGraph |
| : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
| Instruction *> { |
| public: |
| ModuleCallsiteContextGraph( |
| Module &M, |
| llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter); |
| |
| private: |
| friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
| Instruction *>; |
| |
| uint64_t getStackId(uint64_t IdOrIndex) const; |
| const Function *getCalleeFunc(Instruction *Call); |
| bool calleeMatchesFunc( |
| Instruction *Call, const Function *Func, const Function *CallerFunc, |
| std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain); |
| bool sameCallee(Instruction *Call1, Instruction *Call2); |
| bool findProfiledCalleeThroughTailCalls( |
| const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
| std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
| bool &FoundMultipleCalleeChains); |
| uint64_t getLastStackId(Instruction *Call); |
| std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call); |
| void updateAllocationCall(CallInfo &Call, AllocationType AllocType); |
| AllocationType getAllocationCallType(const CallInfo &Call) const; |
| void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); |
| CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
| Instruction *>::FuncInfo |
| cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, |
| std::map<CallInfo, CallInfo> &CallMap, |
| std::vector<CallInfo> &CallsWithMetadataInFunc, |
| unsigned CloneNo); |
| std::string getLabel(const Function *Func, const Instruction *Call, |
| unsigned CloneNo) const; |
| |
| const Module &Mod; |
| llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; |
| }; |
| |
| /// Represents a call in the summary index graph, which can either be an |
| /// allocation or an interior callsite node in an allocation's context. |
| /// Holds a pointer to the corresponding data structure in the index. |
| struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> { |
| IndexCall() : PointerUnion() {} |
| IndexCall(std::nullptr_t) : IndexCall() {} |
| IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} |
| IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} |
| IndexCall(PointerUnion PT) : PointerUnion(PT) {} |
| |
| IndexCall *operator->() { return this; } |
| |
| void print(raw_ostream &OS) const { |
| PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this; |
| if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) { |
| OS << *AI; |
| } else { |
| auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base); |
| assert(CI); |
| OS << *CI; |
| } |
| } |
| }; |
| } // namespace |
| |
| namespace llvm { |
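| // Allow IndexCall to be used directly with the cast machinery (isa/dyn_cast |
| // etc.) by simplifying it to its underlying PointerUnion. |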
| template <> struct simplify_type<IndexCall> { |
| using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>; |
| static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; } |
| }; |
| template <> struct simplify_type<const IndexCall> { |
| using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>; |
| static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; } |
| }; |
| } // namespace llvm |
| |
| namespace { |
| /// CRTP derived class for graphs built from summary index (ThinLTO). |
| class IndexCallsiteContextGraph |
| : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
| IndexCall> { |
| public: |
| IndexCallsiteContextGraph( |
| ModuleSummaryIndex &Index, |
| llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
| isPrevailing); |
| |
| ~IndexCallsiteContextGraph() { |
| // Now that we are done with the graph it is safe to add the new |
| // CallsiteInfo structs to the function summary vectors. The graph nodes |
| // point into locations within these vectors, so we don't want to add them |
| // any earlier. |
| for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) { |
| auto *FS = I.first; |
| for (auto &Callsite : I.second) |
| FS->addCallsite(*Callsite.second); |
| } |
| } |
| |
| private: |
| friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
| IndexCall>; |
| |
| uint64_t getStackId(uint64_t IdOrIndex) const; |
| const FunctionSummary *getCalleeFunc(IndexCall &Call); |
| bool calleeMatchesFunc( |
| IndexCall &Call, const FunctionSummary *Func, |
| const FunctionSummary *CallerFunc, |
| std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain); |
| bool sameCallee(IndexCall &Call1, IndexCall &Call2); |
| bool findProfiledCalleeThroughTailCalls( |
| ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
| std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
| bool &FoundMultipleCalleeChains); |
| uint64_t getLastStackId(IndexCall &Call); |
| std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call); |
| void updateAllocationCall(CallInfo &Call, AllocationType AllocType); |
| AllocationType getAllocationCallType(const CallInfo &Call) const; |
| void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); |
| CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
| IndexCall>::FuncInfo |
| cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, |
| std::map<CallInfo, CallInfo> &CallMap, |
| std::vector<CallInfo> &CallsWithMetadataInFunc, |
| unsigned CloneNo); |
| std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, |
| unsigned CloneNo) const; |
| |
| // Saves the mapping from function summaries containing memprof records back |
| // to their VI, for use in checking and debugging. |
| std::map<const FunctionSummary *, ValueInfo> FSToVIMap; |
| |
| const ModuleSummaryIndex &Index; |
| llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
| isPrevailing; |
| |
| // Saves/owns the callsite info structures synthesized for missing tail call |
| // frames that we discover while building the graph. |
| // It maps from the summary of the function making the tail call, to a map |
| // of callee ValueInfo to corresponding synthesized callsite info. |
| std::unordered_map<FunctionSummary *, |
| std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>> |
| FunctionCalleesToSynthesizedCallsiteInfos; |
| }; |
| } // namespace |
| |
| namespace llvm { |
| template <> |
| struct DenseMapInfo<typename CallsiteContextGraph< |
| ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> |
| : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; |
| template <> |
| struct DenseMapInfo<typename CallsiteContextGraph< |
| IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> |
| : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; |
| template <> |
| struct DenseMapInfo<IndexCall> |
| : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {}; |
| } // end namespace llvm |
| |
| namespace { |
| |
| // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc |
| // type we should actually use on the corresponding allocation. |
| // If we can't clone a node that has NotCold+Cold alloc type, we will fall |
| // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold |
| // from NotCold. |
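| // For example, allocTypeToUse(Cold) returns Cold, while |
| // allocTypeToUse(NotCold|Cold) falls back to NotCold. |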
| AllocationType allocTypeToUse(uint8_t AllocTypes) { |
| assert(AllocTypes != (uint8_t)AllocationType::None); |
| if (AllocTypes == |
| ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) |
| return AllocationType::NotCold; |
| else |
| return (AllocationType)AllocTypes; |
| } |
| |
| // Helper to check if the alloc types for all edges recorded in the |
| // InAllocTypes vector match the alloc types for all edges in the Edges |
| // vector. |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| bool allocTypesMatch( |
| const std::vector<uint8_t> &InAllocTypes, |
| const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>> |
| &Edges) { |
| // This should be called only when the InAllocTypes vector was computed for |
| // this set of Edges. Make sure the sizes are the same. |
| assert(InAllocTypes.size() == Edges.size()); |
| return std::equal( |
| InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(), |
| [](const uint8_t &l, |
| const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) { |
| // Can share if one of the edges is None type - don't |
| // care about the type along that edge as it doesn't |
| // exist for those context ids. |
| if (l == (uint8_t)AllocationType::None || |
| r->AllocTypes == (uint8_t)AllocationType::None) |
| return true; |
| return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes); |
| }); |
| } |
| |
| // Helper to check if the alloc types for all edges recorded in the |
| // InAllocTypes vector match the alloc types for callee edges in the given |
| // clone. Because the InAllocTypes were computed from the original node's callee |
| // edges, and other cloning could have happened after this clone was created, we |
| // need to find the matching clone callee edge, which may or may not exist. |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| bool allocTypesMatchClone( |
| const std::vector<uint8_t> &InAllocTypes, |
| const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) { |
| const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf; |
| assert(Node); |
| // InAllocTypes should have been computed for the original node's callee |
| // edges. |
| assert(InAllocTypes.size() == Node->CalleeEdges.size()); |
| // First create a map of the clone callee edge callees to the edge alloc type. |
| DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t> |
| EdgeCalleeMap; |
| for (const auto &E : Clone->CalleeEdges) { |
| assert(!EdgeCalleeMap.contains(E->Callee)); |
| EdgeCalleeMap[E->Callee] = E->AllocTypes; |
| } |
| // Next, walk the original node's callees, and look for the corresponding |
| // clone edge to that callee. |
| for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) { |
| auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee); |
| // Not found is ok; we will simply add an edge if we use this clone. |
| if (Iter == EdgeCalleeMap.end()) |
| continue; |
| // Can share if one of the edges is None type - don't |
| // care about the type along that edge as it doesn't |
| // exist for those context ids. |
| if (InAllocTypes[I] == (uint8_t)AllocationType::None || |
| Iter->second == (uint8_t)AllocationType::None) |
| continue; |
| if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I])) |
| return false; |
| } |
| return true; |
| } |
| |
| } // end anonymous namespace |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst( |
| const CallInfo &C) { |
| ContextNode *Node = getNodeForAlloc(C); |
| if (Node) |
| return Node; |
| |
| return NonAllocationCallToContextNodeMap.lookup(C); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc( |
| const CallInfo &C) { |
| return AllocationCallToContextNodeMap.lookup(C); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId( |
| uint64_t StackId) { |
| auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); |
| if (StackEntryNode != StackEntryIdToContextNodeMap.end()) |
| return StackEntryNode->second; |
| return nullptr; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
| addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, |
| unsigned int ContextId) { |
| for (auto &Edge : CallerEdges) { |
| if (Edge->Caller == Caller) { |
| Edge->AllocTypes |= (uint8_t)AllocType; |
| Edge->getContextIds().insert(ContextId); |
| return; |
| } |
| } |
| std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>( |
| this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId})); |
| CallerEdges.push_back(Edge); |
| Caller->CalleeEdges.push_back(Edge); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph( |
| ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) { |
| assert(!EI || (*EI)->get() == Edge); |
| assert(!Edge->isRemoved()); |
| // Save the Caller and Callee pointers so we can erase Edge from their edge |
| // lists after clearing Edge below. We do the clearing first in case it is |
| // destructed after removing from the edge lists (if those were the last |
| // shared_ptr references to Edge). |
| auto *Callee = Edge->Callee; |
| auto *Caller = Edge->Caller; |
| |
| // Make sure the edge fields are cleared out so we can properly detect |
| // removed edges if Edge is not destructed because there is still a shared_ptr |
| // reference. |
| Edge->clear(); |
| |
| #ifndef NDEBUG |
| auto CalleeCallerCount = Callee->CallerEdges.size(); |
| auto CallerCalleeCount = Caller->CalleeEdges.size(); |
| #endif |
| if (!EI) { |
| Callee->eraseCallerEdge(Edge); |
| Caller->eraseCalleeEdge(Edge); |
| } else if (CalleeIter) { |
| Callee->eraseCallerEdge(Edge); |
| *EI = Caller->CalleeEdges.erase(*EI); |
| } else { |
| Caller->eraseCalleeEdge(Edge); |
| *EI = Callee->CallerEdges.erase(*EI); |
| } |
| assert(Callee->CallerEdges.size() < CalleeCallerCount); |
| assert(Caller->CalleeEdges.size() < CallerCalleeCount); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph< |
| DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) { |
| for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { |
| auto Edge = *EI; |
| if (Edge->AllocTypes == (uint8_t)AllocationType::None) { |
| assert(Edge->ContextIds.empty()); |
| removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true); |
| } else |
| ++EI; |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph< |
| DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) { |
| for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) { |
| auto Edge = *EI; |
| if (Edge->AllocTypes == (uint8_t)AllocationType::None) { |
| assert(Edge->ContextIds.empty()); |
| Edge->Caller->eraseCalleeEdge(Edge.get()); |
| EI = Node->CallerEdges.erase(EI); |
| } else |
| ++EI; |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
| findEdgeFromCallee(const ContextNode *Callee) { |
| for (const auto &Edge : CalleeEdges) |
| if (Edge->Callee == Callee) |
| return Edge.get(); |
| return nullptr; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
| findEdgeFromCaller(const ContextNode *Caller) { |
| for (const auto &Edge : CallerEdges) |
| if (Edge->Caller == Caller) |
| return Edge.get(); |
| return nullptr; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
| eraseCalleeEdge(const ContextEdge *Edge) { |
| auto EI = llvm::find_if( |
| CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) { |
| return CalleeEdge.get() == Edge; |
| }); |
| assert(EI != CalleeEdges.end()); |
| CalleeEdges.erase(EI); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: |
| eraseCallerEdge(const ContextEdge *Edge) { |
| auto EI = llvm::find_if( |
| CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) { |
| return CallerEdge.get() == Edge; |
| }); |
| assert(EI != CallerEdges.end()); |
| CallerEdges.erase(EI); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType( |
| DenseSet<uint32_t> &ContextIds) const { |
| uint8_t BothTypes = |
| (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
| uint8_t AllocType = (uint8_t)AllocationType::None; |
| for (auto Id : ContextIds) { |
| AllocType |= (uint8_t)ContextIdToAllocationType.at(Id); |
| // Bail early if alloc type reached both, no further refinement. |
| if (AllocType == BothTypes) |
| return AllocType; |
| } |
| return AllocType; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| uint8_t |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl( |
| const DenseSet<uint32_t> &Node1Ids, |
| const DenseSet<uint32_t> &Node2Ids) const { |
| uint8_t BothTypes = |
| (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
| uint8_t AllocType = (uint8_t)AllocationType::None; |
| for (auto Id : Node1Ids) { |
| if (!Node2Ids.count(Id)) |
| continue; |
| AllocType |= (uint8_t)ContextIdToAllocationType.at(Id); |
| // Bail early if alloc type reached both, no further refinement. |
| if (AllocType == BothTypes) |
| return AllocType; |
| } |
| return AllocType; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes( |
| const DenseSet<uint32_t> &Node1Ids, |
| const DenseSet<uint32_t> &Node2Ids) const { |
| if (Node1Ids.size() < Node2Ids.size()) |
| return intersectAllocTypesImpl(Node1Ids, Node2Ids); |
| else |
| return intersectAllocTypesImpl(Node2Ids, Node1Ids); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode( |
| CallInfo Call, const FuncTy *F) { |
| assert(!getNodeForAlloc(Call)); |
| ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call); |
| AllocationCallToContextNodeMap[Call] = AllocNode; |
| // Use LastContextId as a unique id for MIB allocation nodes. |
| AllocNode->OrigStackOrAllocId = LastContextId; |
| // Alloc type should be updated as we add in the MIBs. We should assert |
| // afterwards that it is not still None. |
| AllocNode->AllocTypes = (uint8_t)AllocationType::None; |
| |
| return AllocNode; |
| } |
| |
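| // Helper to print the allocation type bitmask, e.g. a value of NotCold|Cold
| // produces "NotColdCold".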
| static std::string getAllocTypeString(uint8_t AllocTypes) { |
| if (!AllocTypes) |
| return "None"; |
| std::string Str; |
| if (AllocTypes & (uint8_t)AllocationType::NotCold) |
| Str += "NotCold"; |
| if (AllocTypes & (uint8_t)AllocationType::Cold) |
| Str += "Cold"; |
| return Str; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| template <class NodeT, class IteratorT> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( |
| ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, |
| CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, |
| ArrayRef<ContextTotalSize> ContextSizeInfo) { |
|   // Treat the hot alloc type as NotCold until disambiguation for "hot" is
|   // done.
| if (AllocType == AllocationType::Hot) |
| AllocType = AllocationType::NotCold; |
| |
| ContextIdToAllocationType[++LastContextId] = AllocType; |
| |
| if (!ContextSizeInfo.empty()) { |
| auto &Entry = ContextIdToContextSizeInfos[LastContextId]; |
| Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end()); |
| } |
| |
| // Update alloc type and context ids for this MIB. |
| AllocNode->AllocTypes |= (uint8_t)AllocType; |
| |
| // Now add or update nodes for each stack id in alloc's context. |
| // Later when processing the stack ids on non-alloc callsites we will adjust |
| // for any inlining in the context. |
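|   // For example, an MIB whose (non-shared) stack ids are <S1, S2, S3>, with
|   // S1 the frame closest to the allocation, results in the caller edge chain
|   // AllocNode <- S1 <- S2 <- S3, with each edge carrying the new context id.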
| ContextNode *PrevNode = AllocNode; |
| // Look for recursion (direct recursion should have been collapsed by |
| // module summary analysis, here we should just be detecting mutual |
| // recursion). Mark these nodes so we don't try to clone. |
| SmallSet<uint64_t, 8> StackIdSet; |
|   // Skip any stack ids already represented on the allocation call itself
|   // (due to inlining).
| for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); |
| ContextIter != StackContext.end(); ++ContextIter) { |
| auto StackId = getStackId(*ContextIter); |
| ContextNode *StackNode = getNodeForStackId(StackId); |
| if (!StackNode) { |
| StackNode = createNewNode(/*IsAllocation=*/false); |
| StackEntryIdToContextNodeMap[StackId] = StackNode; |
| StackNode->OrigStackOrAllocId = StackId; |
| } |
| // Marking a node recursive will prevent its cloning completely, even for |
| // non-recursive contexts flowing through it. |
| if (!AllowRecursiveCallsites) { |
| auto Ins = StackIdSet.insert(StackId); |
| if (!Ins.second) |
| StackNode->Recursive = true; |
| } |
| StackNode->AllocTypes |= (uint8_t)AllocType; |
| PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); |
| PrevNode = StackNode; |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| DenseSet<uint32_t> |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds( |
| const DenseSet<uint32_t> &StackSequenceContextIds, |
| DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { |
| DenseSet<uint32_t> NewContextIds; |
| for (auto OldId : StackSequenceContextIds) { |
| NewContextIds.insert(++LastContextId); |
| OldToNewContextIds[OldId].insert(LastContextId); |
| assert(ContextIdToAllocationType.count(OldId)); |
| // The new context has the same allocation type as original. |
| ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId]; |
| if (DotAllocContextIds.contains(OldId)) |
| DotAllocContextIds.insert(LastContextId); |
| } |
| return NewContextIds; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
| propagateDuplicateContextIds( |
| const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { |
| // Build a set of duplicated context ids corresponding to the input id set. |
| auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) { |
| DenseSet<uint32_t> NewIds; |
| for (auto Id : ContextIds) |
| if (auto NewId = OldToNewContextIds.find(Id); |
| NewId != OldToNewContextIds.end()) |
| NewIds.insert_range(NewId->second); |
| return NewIds; |
| }; |
| |
| // Recursively update context ids sets along caller edges. |
| auto UpdateCallers = [&](ContextNode *Node, |
| DenseSet<const ContextEdge *> &Visited, |
| auto &&UpdateCallers) -> void { |
| for (const auto &Edge : Node->CallerEdges) { |
| auto Inserted = Visited.insert(Edge.get()); |
| if (!Inserted.second) |
| continue; |
| ContextNode *NextNode = Edge->Caller; |
| DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds()); |
| // Only need to recursively iterate to NextNode via this caller edge if |
| // it resulted in any added ids to NextNode. |
| if (!NewIdsToAdd.empty()) { |
| Edge->getContextIds().insert_range(NewIdsToAdd); |
| UpdateCallers(NextNode, Visited, UpdateCallers); |
| } |
| } |
| }; |
| |
| DenseSet<const ContextEdge *> Visited; |
| for (auto &Entry : AllocationCallToContextNodeMap) { |
| auto *Node = Entry.second; |
| UpdateCallers(Node, Visited, UpdateCallers); |
| } |
| } |
| |
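| // Connect NewNode to the callee (if TowardsCallee) or caller edges of
| // OrigNode, moving the given subset of OrigNode's context ids onto the new
| // edges. Original edges left with no context ids are removed.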
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( |
| ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee, |
| // This must be passed by value to make a copy since it will be adjusted |
| // as ids are moved. |
| DenseSet<uint32_t> RemainingContextIds) { |
| auto &OrigEdges = |
| TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; |
| DenseSet<uint32_t> RecursiveContextIds; |
| DenseSet<uint32_t> AllCallerContextIds; |
| if (AllowRecursiveCallsites) { |
|     // Identify which context ids are recursive, which is needed to properly
|     // update the RemainingContextIds set. The relevant recursive context ids
|     // are those that appear on multiple edges.
| for (auto &CE : OrigEdges) { |
| AllCallerContextIds.reserve(CE->getContextIds().size()); |
| for (auto Id : CE->getContextIds()) |
| if (!AllCallerContextIds.insert(Id).second) |
| RecursiveContextIds.insert(Id); |
| } |
| } |
| // Increment iterator in loop so that we can remove edges as needed. |
| for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { |
| auto Edge = *EI; |
| DenseSet<uint32_t> NewEdgeContextIds; |
| DenseSet<uint32_t> NotFoundContextIds; |
|     // Remove any matching context ids from Edge, returning the set that were
|     // found and removed in NewEdgeContextIds; these become the new edge's
|     // context ids. The remaining (not found) ids are returned in
|     // NotFoundContextIds.
| set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, |
| NotFoundContextIds); |
| // Update the remaining context ids set for the later edges. This is a |
| // compile time optimization. |
| if (RecursiveContextIds.empty()) { |
| // No recursive ids, so all of the previously remaining context ids that |
| // were not seen on this edge are the new remaining set. |
| RemainingContextIds.swap(NotFoundContextIds); |
| } else { |
| // Keep the recursive ids in the remaining set as we expect to see those |
| // on another edge. We can remove the non-recursive remaining ids that |
| // were seen on this edge, however. We already have the set of remaining |
| // ids that were on this edge (in NewEdgeContextIds). Figure out which are |
| // non-recursive and only remove those. Note that despite the higher |
| // overhead of updating the remaining context ids set when recursion |
| // handling is enabled, it was found to be at worst performance neutral |
| // and in one case a clear win. |
| DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds = |
| set_difference(NewEdgeContextIds, RecursiveContextIds); |
| set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds); |
| } |
| // If no matching context ids for this edge, skip it. |
| if (NewEdgeContextIds.empty()) { |
| ++EI; |
| continue; |
| } |
| if (TowardsCallee) { |
| uint8_t NewAllocType = computeAllocType(NewEdgeContextIds); |
| auto NewEdge = std::make_shared<ContextEdge>( |
| Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds)); |
| NewNode->CalleeEdges.push_back(NewEdge); |
| NewEdge->Callee->CallerEdges.push_back(NewEdge); |
| } else { |
| uint8_t NewAllocType = computeAllocType(NewEdgeContextIds); |
| auto NewEdge = std::make_shared<ContextEdge>( |
| NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds)); |
| NewNode->CallerEdges.push_back(NewEdge); |
| NewEdge->Caller->CalleeEdges.push_back(NewEdge); |
| } |
| // Remove old edge if context ids empty. |
| if (Edge->getContextIds().empty()) { |
| removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee); |
| continue; |
| } |
| ++EI; |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| static void checkEdge( |
| const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { |
| // Confirm that alloc type is not None and that we have at least one context |
| // id. |
| assert(Edge->AllocTypes != (uint8_t)AllocationType::None); |
| assert(!Edge->ContextIds.empty()); |
| } |
| |
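| // Check that a node's context ids are consistent with the unions of its
| // caller and callee edge context ids (see details below), optionally checking
| // each edge's individual invariants as well.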
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, |
| bool CheckEdges = true) { |
| if (Node->isRemoved()) |
| return; |
| #ifndef NDEBUG |
| // Compute node's context ids once for use in asserts. |
| auto NodeContextIds = Node->getContextIds(); |
| #endif |
| // Node's context ids should be the union of both its callee and caller edge |
| // context ids. |
| if (Node->CallerEdges.size()) { |
| DenseSet<uint32_t> CallerEdgeContextIds( |
| Node->CallerEdges.front()->ContextIds); |
| for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) { |
| if (CheckEdges) |
| checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
| set_union(CallerEdgeContextIds, Edge->ContextIds); |
| } |
|     // The node can have more context ids than its callers if some contexts
|     // terminate at the node and some are longer. If we are allowing recursive
|     // callsites and contexts, this will be violated for incompletely cloned
|     // recursive cycles, so skip the checking in that case.
| assert((AllowRecursiveCallsites && AllowRecursiveContexts) || |
| NodeContextIds == CallerEdgeContextIds || |
| set_is_subset(CallerEdgeContextIds, NodeContextIds)); |
| } |
| if (Node->CalleeEdges.size()) { |
| DenseSet<uint32_t> CalleeEdgeContextIds( |
| Node->CalleeEdges.front()->ContextIds); |
| for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) { |
| if (CheckEdges) |
| checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
| set_union(CalleeEdgeContextIds, Edge->getContextIds()); |
| } |
| // If we are allowing recursive callsites and contexts this will be violated |
| // for incompletely cloned recursive cycles, so skip the checking in that |
| // case. |
| assert((AllowRecursiveCallsites && AllowRecursiveContexts) || |
| NodeContextIds == CalleeEdgeContextIds); |
| } |
| // FIXME: Since this checking is only invoked under an option, we should |
| // change the error checking from using assert to something that will trigger |
| // an error on a release build. |
| #ifndef NDEBUG |
| // Make sure we don't end up with duplicate edges between the same caller and |
| // callee. |
| DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet; |
| for (const auto &E : Node->CalleeEdges) |
| NodeSet.insert(E->Callee); |
| assert(NodeSet.size() == Node->CalleeEdges.size()); |
| #endif |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
| assignStackNodesPostOrder( |
| ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| DenseMap<uint64_t, std::vector<CallContextInfo>> |
| &StackIdToMatchingCalls, |
| DenseMap<CallInfo, CallInfo> &CallToMatchingCall) { |
| auto Inserted = Visited.insert(Node); |
| if (!Inserted.second) |
| return; |
| // Post order traversal. Iterate over a copy since we may add nodes and |
| // therefore new callers during the recursive call, invalidating any |
| // iterator over the original edge vector. We don't need to process these |
| // new nodes as they were already processed on creation. |
| auto CallerEdges = Node->CallerEdges; |
| for (auto &Edge : CallerEdges) { |
| // Skip any that have been removed during the recursion. |
| if (Edge->isRemoved()) { |
| assert(!is_contained(Node->CallerEdges, Edge)); |
| continue; |
| } |
| assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls, |
| CallToMatchingCall); |
| } |
| |
| // If this node's stack id is in the map, update the graph to contain new |
| // nodes representing any inlining at interior callsites. Note we move the |
| // associated context ids over to the new nodes. |
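|   // E.g. a call whose callsite metadata has the stack id sequence <S1, S2>
|   // due to inlining gets a single new node that replaces the separate S1 and
|   // S2 nodes for the context ids flowing through that inlined sequence.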
| |
| // Ignore this node if it is for an allocation or we didn't record any |
| // stack id lists ending at it. |
| if (Node->IsAllocation || |
| !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) |
| return; |
| |
| auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; |
| // Handle the simple case first. A single call with a single stack id. |
| // In this case there is no need to create any new context nodes, simply |
| // assign the context node for stack id to this Call. |
| if (Calls.size() == 1) { |
| auto &[Call, Ids, Func, SavedContextIds] = Calls[0]; |
| if (Ids.size() == 1) { |
| assert(SavedContextIds.empty()); |
| // It should be this Node |
| assert(Node == getNodeForStackId(Ids[0])); |
| if (Node->Recursive) |
| return; |
| Node->setCall(Call); |
| NonAllocationCallToContextNodeMap[Call] = Node; |
| NodeToCallingFunc[Node] = Func; |
| return; |
| } |
| } |
| |
| #ifndef NDEBUG |
| // Find the node for the last stack id, which should be the same |
| // across all calls recorded for this id, and is this node's id. |
| uint64_t LastId = Node->OrigStackOrAllocId; |
| ContextNode *LastNode = getNodeForStackId(LastId); |
| // We should only have kept stack ids that had nodes. |
| assert(LastNode); |
| assert(LastNode == Node); |
| #else |
| ContextNode *LastNode = Node; |
| #endif |
| |
| // Compute the last node's context ids once, as it is shared by all calls in |
| // this entry. |
| DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); |
| |
| bool PrevIterCreatedNode = false; |
| bool CreatedNode = false; |
| for (unsigned I = 0; I < Calls.size(); |
| I++, PrevIterCreatedNode = CreatedNode) { |
| CreatedNode = false; |
| auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
|     // Skip any for which we didn't assign any ids; these don't get a node in
| // the graph. |
| if (SavedContextIds.empty()) { |
| // If this call has a matching call (located in the same function and |
| // having the same stack ids), simply add it to the context node created |
| // for its matching call earlier. These can be treated the same through |
| // cloning and get updated at the same time. |
| if (!CallToMatchingCall.contains(Call)) |
| continue; |
| auto MatchingCall = CallToMatchingCall[Call]; |
| if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) { |
| // This should only happen if we had a prior iteration, and it didn't |
| // create a node because of the below recomputation of context ids |
| // finding none remaining and continuing early. |
| assert(I > 0 && !PrevIterCreatedNode); |
| continue; |
| } |
| NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back( |
| Call); |
| continue; |
| } |
| |
| assert(LastId == Ids.back()); |
| |
| // Recompute the context ids for this stack id sequence (the |
| // intersection of the context ids of the corresponding nodes). |
| // Start with the ids we saved in the map for this call, which could be |
|     // duplicated context ids. We have to recompute as we might have overlap
|     // between the saved context ids for different last nodes, and may have
|     // already removed some of them during the post order traversal.
| set_intersect(SavedContextIds, LastNodeContextIds); |
| ContextNode *PrevNode = LastNode; |
| bool Skip = false; |
| // Iterate backwards through the stack Ids, starting after the last Id |
| // in the list, which was handled once outside for all Calls. |
| for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { |
| auto Id = *IdIter; |
| ContextNode *CurNode = getNodeForStackId(Id); |
| // We should only have kept stack ids that had nodes and weren't |
| // recursive. |
| assert(CurNode); |
| assert(!CurNode->Recursive); |
| |
| auto *Edge = CurNode->findEdgeFromCaller(PrevNode); |
| if (!Edge) { |
| Skip = true; |
| break; |
| } |
| PrevNode = CurNode; |
| |
| // Update the context ids, which is the intersection of the ids along |
| // all edges in the sequence. |
| set_intersect(SavedContextIds, Edge->getContextIds()); |
| |
| // If we now have no context ids for clone, skip this call. |
| if (SavedContextIds.empty()) { |
| Skip = true; |
| break; |
| } |
| } |
| if (Skip) |
| continue; |
| |
| // Create new context node. |
| ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call); |
| NonAllocationCallToContextNodeMap[Call] = NewNode; |
| CreatedNode = true; |
| NewNode->AllocTypes = computeAllocType(SavedContextIds); |
| |
| ContextNode *FirstNode = getNodeForStackId(Ids[0]); |
| assert(FirstNode); |
| |
| // Connect to callees of innermost stack frame in inlined call chain. |
|     // This updates context ids for FirstNode's callees to reflect those
| // moved to NewNode. |
| connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds); |
| |
| // Connect to callers of outermost stack frame in inlined call chain. |
|     // This updates context ids for LastNode's callers to reflect those
| // moved to NewNode. |
| connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds); |
| |
| // Now we need to remove context ids from edges/nodes between First and |
| // Last Node. |
| PrevNode = nullptr; |
| for (auto Id : Ids) { |
| ContextNode *CurNode = getNodeForStackId(Id); |
| // We should only have kept stack ids that had nodes. |
| assert(CurNode); |
| |
| // Remove the context ids moved to NewNode from CurNode, and the |
| // edge from the prior node. |
| if (PrevNode) { |
| auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); |
| // If the sequence contained recursion, we might have already removed |
| // some edges during the connectNewNode calls above. |
| if (!PrevEdge) { |
| PrevNode = CurNode; |
| continue; |
| } |
| set_subtract(PrevEdge->getContextIds(), SavedContextIds); |
| if (PrevEdge->getContextIds().empty()) |
| removeEdgeFromGraph(PrevEdge); |
| } |
| // Since we update the edges from leaf to tail, only look at the callee |
| // edges. This isn't an alloc node, so if there are no callee edges, the |
| // alloc type is None. |
| CurNode->AllocTypes = CurNode->CalleeEdges.empty() |
| ? (uint8_t)AllocationType::None |
| : CurNode->computeAllocType(); |
| PrevNode = CurNode; |
| } |
| if (VerifyNodes) { |
| checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true); |
| for (auto Id : Ids) { |
| ContextNode *CurNode = getNodeForStackId(Id); |
| // We should only have kept stack ids that had nodes. |
| assert(CurNode); |
| checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true); |
| } |
| } |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { |
| // Map of stack id to all calls with that as the last (outermost caller) |
| // callsite id that has a context node (some might not due to pruning |
| // performed during matching of the allocation profile contexts). |
| // The CallContextInfo contains the Call and a list of its stack ids with |
| // ContextNodes, the function containing Call, and the set of context ids |
| // the analysis will eventually identify for use in any new node created |
| // for that callsite. |
| DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls; |
| for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
| for (auto &Call : CallsWithMetadata) { |
| // Ignore allocations, already handled. |
| if (AllocationCallToContextNodeMap.count(Call)) |
| continue; |
| auto StackIdsWithContextNodes = |
| getStackIdsWithContextNodesForCall(Call.call()); |
| // If there were no nodes created for MIBs on allocs (maybe this was in |
| // the unambiguous part of the MIB stack that was pruned), ignore. |
| if (StackIdsWithContextNodes.empty()) |
| continue; |
| // Otherwise, record this Call along with the list of ids for the last |
| // (outermost caller) stack id with a node. |
| StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( |
| {Call.call(), StackIdsWithContextNodes, Func, {}}); |
| } |
| } |
| |
| // First make a pass through all stack ids that correspond to a call, |
| // as identified in the above loop. Compute the context ids corresponding to |
| // each of these calls when they correspond to multiple stack ids due to |
|   // inlining. Perform any duplication of context ids required when
| // there is more than one call with the same stack ids. Their (possibly newly |
| // duplicated) context ids are saved in the StackIdToMatchingCalls map. |
| DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds; |
| // Save a map from each call to any that are found to match it. I.e. located |
| // in the same function and have the same (possibly pruned) stack ids. We use |
| // this to avoid creating extra graph nodes as they can be treated the same. |
| DenseMap<CallInfo, CallInfo> CallToMatchingCall; |
| for (auto &It : StackIdToMatchingCalls) { |
| auto &Calls = It.getSecond(); |
| // Skip single calls with a single stack id. These don't need a new node. |
| if (Calls.size() == 1) { |
| auto &Ids = Calls[0].StackIds; |
| if (Ids.size() == 1) |
| continue; |
| } |
| // In order to do the best and maximal matching of inlined calls to context |
| // node sequences we will sort the vectors of stack ids in descending order |
| // of length, and within each length, lexicographically by stack id. The |
| // latter is so that we can specially handle calls that have identical stack |
| // id sequences (either due to cloning or artificially because of the MIB |
| // context pruning). Those with the same Ids are then sorted by function to |
| // facilitate efficiently mapping them to the same context node. |
| // Because the functions are pointers, to ensure a stable sort first assign |
| // each function pointer to its first index in the Calls array, and then use |
| // that to sort by. |
| DenseMap<const FuncTy *, unsigned> FuncToIndex; |
| for (const auto &[Idx, CallCtxInfo] : enumerate(Calls)) |
| FuncToIndex.insert({CallCtxInfo.Func, Idx}); |
| llvm::stable_sort( |
| Calls, |
| [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) { |
| return A.StackIds.size() > B.StackIds.size() || |
| (A.StackIds.size() == B.StackIds.size() && |
| (A.StackIds < B.StackIds || |
| (A.StackIds == B.StackIds && |
| FuncToIndex[A.Func] < FuncToIndex[B.Func]))); |
| }); |
| |
| // Find the node for the last stack id, which should be the same |
| // across all calls recorded for this id, and is the id for this |
| // entry in the StackIdToMatchingCalls map. |
| uint64_t LastId = It.getFirst(); |
| ContextNode *LastNode = getNodeForStackId(LastId); |
| // We should only have kept stack ids that had nodes. |
| assert(LastNode); |
| |
| if (LastNode->Recursive) |
| continue; |
| |
| // Initialize the context ids with the last node's. We will subsequently |
| // refine the context ids by computing the intersection along all edges. |
| DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); |
| assert(!LastNodeContextIds.empty()); |
| |
| #ifndef NDEBUG |
| // Save the set of functions seen for a particular set of the same stack |
| // ids. This is used to ensure that they have been correctly sorted to be |
| // adjacent in the Calls list, since we rely on that to efficiently place |
| // all such matching calls onto the same context node. |
| DenseSet<const FuncTy *> MatchingIdsFuncSet; |
| #endif |
| |
| for (unsigned I = 0; I < Calls.size(); I++) { |
| auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; |
| assert(SavedContextIds.empty()); |
| assert(LastId == Ids.back()); |
| |
| #ifndef NDEBUG |
| // If this call has a different set of ids than the last one, clear the |
| // set used to ensure they are sorted properly. |
| if (I > 0 && Ids != Calls[I - 1].StackIds) |
| MatchingIdsFuncSet.clear(); |
| #endif |
| |
| // First compute the context ids for this stack id sequence (the |
| // intersection of the context ids of the corresponding nodes). |
| // Start with the remaining saved ids for the last node. |
| assert(!LastNodeContextIds.empty()); |
| DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; |
| |
| ContextNode *PrevNode = LastNode; |
| ContextNode *CurNode = LastNode; |
| bool Skip = false; |
| |
| // Iterate backwards through the stack Ids, starting after the last Id |
| // in the list, which was handled once outside for all Calls. |
| for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { |
| auto Id = *IdIter; |
| CurNode = getNodeForStackId(Id); |
| // We should only have kept stack ids that had nodes. |
| assert(CurNode); |
| |
| if (CurNode->Recursive) { |
| Skip = true; |
| break; |
| } |
| |
| auto *Edge = CurNode->findEdgeFromCaller(PrevNode); |
| // If there is no edge then the nodes belong to different MIB contexts, |
| // and we should skip this inlined context sequence. For example, this |
| // particular inlined context may include stack ids A->B, and we may |
| // indeed have nodes for both A and B, but it is possible that they were |
| // never profiled in sequence in a single MIB for any allocation (i.e. |
| // we might have profiled an allocation that involves the callsite A, |
| // but through a different one of its callee callsites, and we might |
| // have profiled an allocation that involves callsite B, but reached |
| // from a different caller callsite). |
| if (!Edge) { |
| Skip = true; |
| break; |
| } |
| PrevNode = CurNode; |
| |
| // Update the context ids, which is the intersection of the ids along |
| // all edges in the sequence. |
| set_intersect(StackSequenceContextIds, Edge->getContextIds()); |
| |
| // If we now have no context ids for clone, skip this call. |
| if (StackSequenceContextIds.empty()) { |
| Skip = true; |
| break; |
| } |
| } |
| if (Skip) |
| continue; |
| |
| // If some of this call's stack ids did not have corresponding nodes (due |
| // to pruning), don't include any context ids for contexts that extend |
| // beyond these nodes. Otherwise we would be matching part of unrelated / |
| // not fully matching stack contexts. To do this, subtract any context ids |
| // found in caller nodes of the last node found above. |
| if (Ids.back() != getLastStackId(Call)) { |
| for (const auto &PE : LastNode->CallerEdges) { |
| set_subtract(StackSequenceContextIds, PE->getContextIds()); |
| if (StackSequenceContextIds.empty()) |
| break; |
| } |
| // If we now have no context ids for clone, skip this call. |
| if (StackSequenceContextIds.empty()) |
| continue; |
| } |
| |
| #ifndef NDEBUG |
| // If the prior call had the same stack ids this set would not be empty. |
| // Check if we already have a call that "matches" because it is located |
| // in the same function. If the Calls list was sorted properly we should |
| // not encounter this situation as all such entries should be adjacent |
| // and processed in bulk further below. |
| assert(!MatchingIdsFuncSet.contains(Func)); |
| |
| MatchingIdsFuncSet.insert(Func); |
| #endif |
| |
| // Check if the next set of stack ids is the same (since the Calls vector |
| // of tuples is sorted by the stack ids we can just look at the next one). |
| // If so, save them in the CallToMatchingCall map so that they get |
| // assigned to the same context node, and skip them. |
| bool DuplicateContextIds = false; |
| for (unsigned J = I + 1; J < Calls.size(); J++) { |
| auto &CallCtxInfo = Calls[J]; |
| auto &NextIds = CallCtxInfo.StackIds; |
| if (NextIds != Ids) |
| break; |
| auto *NextFunc = CallCtxInfo.Func; |
| if (NextFunc != Func) { |
| // We have another Call with the same ids but that cannot share this |
| // node, must duplicate ids for it. |
| DuplicateContextIds = true; |
| break; |
| } |
| auto &NextCall = CallCtxInfo.Call; |
| CallToMatchingCall[NextCall] = Call; |
| // Update I so that it gets incremented correctly to skip this call. |
| I = J; |
| } |
| |
| // If we don't have duplicate context ids, then we can assign all the |
| // context ids computed for the original node sequence to this call. |
| // If there are duplicate calls with the same stack ids then we synthesize |
| // new context ids that are duplicates of the originals. These are |
| // assigned to SavedContextIds, which is a reference into the map entry |
| // for this call, allowing us to access these ids later on. |
| OldToNewContextIds.reserve(OldToNewContextIds.size() + |
| StackSequenceContextIds.size()); |
| SavedContextIds = |
| DuplicateContextIds |
| ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) |
| : StackSequenceContextIds; |
| assert(!SavedContextIds.empty()); |
| |
| if (!DuplicateContextIds) { |
| // Update saved last node's context ids to remove those that are |
| // assigned to other calls, so that it is ready for the next call at |
| // this stack id. |
| set_subtract(LastNodeContextIds, StackSequenceContextIds); |
| if (LastNodeContextIds.empty()) |
| break; |
| } |
| } |
| } |
| |
| // Propagate the duplicate context ids over the graph. |
| propagateDuplicateContextIds(OldToNewContextIds); |
| |
| if (VerifyCCG) |
| check(); |
| |
| // Now perform a post-order traversal over the graph, starting with the |
| // allocation nodes, essentially processing nodes from callers to callees. |
| // For any that contains an id in the map, update the graph to contain new |
| // nodes representing any inlining at interior callsites. Note we move the |
| // associated context ids over to the new nodes. |
| DenseSet<const ContextNode *> Visited; |
| for (auto &Entry : AllocationCallToContextNodeMap) |
| assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls, |
| CallToMatchingCall); |
| if (VerifyCCG) |
| check(); |
| } |
| |
| uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { |
| CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
| Call->getMetadata(LLVMContext::MD_callsite)); |
| return CallsiteContext.back(); |
| } |
| |
| uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { |
| assert(isa<CallsiteInfo *>(Call)); |
| CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
| CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call)); |
| // Need to convert index into stack id. |
| return Index.getStackIdAtIndex(CallsiteContext.back()); |
| } |
| |
| static const std::string MemProfCloneSuffix = ".memprof."; |
| |
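| // E.g. getMemProfFuncName("foo", 2) yields "foo.memprof.2".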
| static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { |
| // We use CloneNo == 0 to refer to the original version, which doesn't get |
| // renamed with a suffix. |
| if (!CloneNo) |
| return Base.str(); |
| return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); |
| } |
| |
| static bool isMemProfClone(const Function &F) { |
| return F.getName().contains(MemProfCloneSuffix); |
| } |
| |
| std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, |
| const Instruction *Call, |
| unsigned CloneNo) const { |
| return (Twine(Call->getFunction()->getName()) + " -> " + |
| cast<CallBase>(Call)->getCalledFunction()->getName()) |
| .str(); |
| } |
| |
| std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, |
| const IndexCall &Call, |
| unsigned CloneNo) const { |
| auto VI = FSToVIMap.find(Func); |
| assert(VI != FSToVIMap.end()); |
| if (isa<AllocInfo *>(Call)) |
| return (VI->second.name() + " -> alloc").str(); |
| else { |
| auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call); |
| return (VI->second.name() + " -> " + |
| getMemProfFuncName(Callsite->Callee.name(), |
| Callsite->Clones[CloneNo])) |
| .str(); |
| } |
| } |
| |
| std::vector<uint64_t> |
| ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( |
| Instruction *Call) { |
| CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
| Call->getMetadata(LLVMContext::MD_callsite)); |
| return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( |
| CallsiteContext); |
| } |
| |
| std::vector<uint64_t> |
| IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { |
| assert(isa<CallsiteInfo *>(Call)); |
| CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> |
| CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call)); |
| return getStackIdsWithContextNodes<CallsiteInfo, |
| SmallVector<unsigned>::const_iterator>( |
| CallsiteContext); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| template <class NodeT, class IteratorT> |
| std::vector<uint64_t> |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( |
| CallStack<NodeT, IteratorT> &CallsiteContext) { |
| std::vector<uint64_t> StackIds; |
| for (auto IdOrIndex : CallsiteContext) { |
| auto StackId = getStackId(IdOrIndex); |
| ContextNode *Node = getNodeForStackId(StackId); |
| if (!Node) |
| break; |
| StackIds.push_back(StackId); |
| } |
| return StackIds; |
| } |
| |
| ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( |
| Module &M, |
| llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) |
| : Mod(M), OREGetter(OREGetter) { |
| for (auto &F : M) { |
| std::vector<CallInfo> CallsWithMetadata; |
| for (auto &BB : F) { |
| for (auto &I : BB) { |
| if (!isa<CallBase>(I)) |
| continue; |
| if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { |
| CallsWithMetadata.push_back(&I); |
| auto *AllocNode = addAllocNode(&I, &F); |
| auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); |
| assert(CallsiteMD); |
| CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD); |
| // Add all of the MIBs and their stack nodes. |
| for (auto &MDOp : MemProfMD->operands()) { |
| auto *MIBMD = cast<const MDNode>(MDOp); |
| std::vector<ContextTotalSize> ContextSizeInfo; |
| // Collect the context size information if it exists. |
| if (MIBMD->getNumOperands() > 2) { |
| for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) { |
| MDNode *ContextSizePair = |
| dyn_cast<MDNode>(MIBMD->getOperand(I)); |
| assert(ContextSizePair->getNumOperands() == 2); |
| uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>( |
| ContextSizePair->getOperand(0)) |
| ->getZExtValue(); |
| uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>( |
| ContextSizePair->getOperand(1)) |
| ->getZExtValue(); |
| ContextSizeInfo.push_back({FullStackId, TotalSize}); |
| } |
| } |
| MDNode *StackNode = getMIBStackNode(MIBMD); |
| assert(StackNode); |
| CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); |
| addStackNodesForMIB<MDNode, MDNode::op_iterator>( |
| AllocNode, StackContext, CallsiteContext, |
| getMIBAllocType(MIBMD), ContextSizeInfo); |
| } |
| // If exporting the graph to dot and an allocation id of interest was |
| // specified, record all the context ids for this allocation node. |
| if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot) |
| DotAllocContextIds = AllocNode->getContextIds(); |
| assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); |
| // Memprof and callsite metadata on memory allocations no longer |
| // needed. |
| I.setMetadata(LLVMContext::MD_memprof, nullptr); |
| I.setMetadata(LLVMContext::MD_callsite, nullptr); |
| } |
| // For callsite metadata, add to list for this function for later use. |
| else if (I.getMetadata(LLVMContext::MD_callsite)) { |
| CallsWithMetadata.push_back(&I); |
| } |
| } |
| } |
| if (!CallsWithMetadata.empty()) |
| FuncToCallsWithMetadata[&F] = CallsWithMetadata; |
| } |
| |
| if (DumpCCG) { |
| dbgs() << "CCG before updating call stack chains:\n"; |
| dbgs() << *this; |
| } |
| |
| if (ExportToDot) |
| exportToDot("prestackupdate"); |
| |
| updateStackNodes(); |
| |
| handleCallsitesWithMultipleTargets(); |
| |
| markBackedges(); |
| |
| // Strip off remaining callsite metadata, no longer needed. |
| for (auto &FuncEntry : FuncToCallsWithMetadata) |
| for (auto &Call : FuncEntry.second) |
| Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); |
| } |
| |
| IndexCallsiteContextGraph::IndexCallsiteContextGraph( |
| ModuleSummaryIndex &Index, |
| llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
| isPrevailing) |
| : Index(Index), isPrevailing(isPrevailing) { |
| for (auto &I : Index) { |
| auto VI = Index.getValueInfo(I); |
| for (auto &S : VI.getSummaryList()) { |
| // We should only add the prevailing nodes. Otherwise we may try to clone |
| // in a weak copy that won't be linked (and may be different than the |
| // prevailing version). |
| // We only keep the memprof summary on the prevailing copy now when |
| // building the combined index, as a space optimization, however don't |
| // rely on this optimization. The linker doesn't resolve local linkage |
| // values so don't check whether those are prevailing. |
| if (!GlobalValue::isLocalLinkage(S->linkage()) && |
| !isPrevailing(VI.getGUID(), S.get())) |
| continue; |
| auto *FS = dyn_cast<FunctionSummary>(S.get()); |
| if (!FS) |
| continue; |
| std::vector<CallInfo> CallsWithMetadata; |
| if (!FS->allocs().empty()) { |
| for (auto &AN : FS->mutableAllocs()) { |
| // This can happen because of recursion elimination handling that |
| // currently exists in ModuleSummaryAnalysis. Skip these for now. |
| // We still added them to the summary because we need to be able to |
| // correlate properly in applyImport in the backends. |
| if (AN.MIBs.empty()) |
| continue; |
| IndexCall AllocCall(&AN); |
| CallsWithMetadata.push_back(AllocCall); |
| auto *AllocNode = addAllocNode(AllocCall, FS); |
| // Pass an empty CallStack to the CallsiteContext (second) |
| // parameter, since for ThinLTO we already collapsed out the inlined |
| // stack ids on the allocation call during ModuleSummaryAnalysis. |
| CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
| EmptyContext; |
| unsigned I = 0; |
| assert( |
| (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) || |
| AN.ContextSizeInfos.size() == AN.MIBs.size()); |
| // Now add all of the MIBs and their stack nodes. |
| for (auto &MIB : AN.MIBs) { |
| CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> |
| StackContext(&MIB); |
| std::vector<ContextTotalSize> ContextSizeInfo; |
| if (!AN.ContextSizeInfos.empty()) { |
| for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I]) |
| ContextSizeInfo.push_back({FullStackId, TotalSize}); |
| } |
| addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( |
| AllocNode, StackContext, EmptyContext, MIB.AllocType, |
| ContextSizeInfo); |
| I++; |
| } |
| // If exporting the graph to dot and an allocation id of interest was |
| // specified, record all the context ids for this allocation node. |
| if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot) |
| DotAllocContextIds = AllocNode->getContextIds(); |
| assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); |
| // Initialize version 0 on the summary alloc node to the current alloc |
|           // type, unless it has both types, in which case make it default, so
|           // that, in the case where we aren't able to clone, the original
|           // version always ends up with the default allocation behavior.
| AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); |
| } |
| } |
| // For callsite metadata, add to list for this function for later use. |
| if (!FS->callsites().empty()) |
| for (auto &SN : FS->mutableCallsites()) { |
| IndexCall StackNodeCall(&SN); |
| CallsWithMetadata.push_back(StackNodeCall); |
| } |
| |
| if (!CallsWithMetadata.empty()) |
| FuncToCallsWithMetadata[FS] = CallsWithMetadata; |
| |
| if (!FS->allocs().empty() || !FS->callsites().empty()) |
| FSToVIMap[FS] = VI; |
| } |
| } |
| |
| if (DumpCCG) { |
| dbgs() << "CCG before updating call stack chains:\n"; |
| dbgs() << *this; |
| } |
| |
| if (ExportToDot) |
| exportToDot("prestackupdate"); |
| |
| updateStackNodes(); |
| |
| handleCallsitesWithMultipleTargets(); |
| |
| markBackedges(); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, |
| CallTy>::handleCallsitesWithMultipleTargets() { |
|   // Look for and work around callsites that call multiple functions.
|   // This can happen for indirect calls, which need better handling, and in
|   // rarer cases (e.g. macro expansion).
| // TODO: To fix this for indirect calls we will want to perform speculative |
| // devirtualization using either the normal PGO info with ICP, or using the |
| // information in the profiled MemProf contexts. We can do this prior to |
| // this transformation for regular LTO, and for ThinLTO we can simulate that |
| // effect in the summary and perform the actual speculative devirtualization |
| // while cloning in the ThinLTO backend. |
| |
| // Keep track of the new nodes synthesized for discovered tail calls missing |
| // from the profiled contexts. |
| MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap; |
| |
| std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode; |
| for (auto &Entry : NonAllocationCallToContextNodeMap) { |
| auto *Node = Entry.second; |
| assert(Node->Clones.empty()); |
| // Check all node callees and see if in the same function. |
| // We need to check all of the calls recorded in this Node, because in some |
| // cases we may have had multiple calls with the same debug info calling |
| // different callees. This can happen, for example, when an object is |
|     // constructed in the parameter list - the destructor call of the object has
| // the same debug info (line/col) as the call the object was passed to. |
| // Here we will prune any that don't match all callee nodes. |
| std::vector<CallInfo> AllCalls; |
| AllCalls.reserve(Node->MatchingCalls.size() + 1); |
| AllCalls.push_back(Node->Call); |
| llvm::append_range(AllCalls, Node->MatchingCalls); |
| |
| // First see if we can partition the calls by callee function, creating new |
| // nodes to host each set of calls calling the same callees. This is |
|     // necessary to support indirect calls with ThinLTO, for which we
| // synthesized CallsiteInfo records for each target. They will all have the |
| // same callsite stack ids and would be sharing a context node at this |
| // point. We need to perform separate cloning for each, which will be |
| // applied along with speculative devirtualization in the ThinLTO backends |
| // as needed. Note this does not currently support looking through tail |
|     // calls; it is unclear if we need that for indirect call targets.
|     // First partition calls by callee function. The map is indexed by the
|     // function; its value is a struct with the list of matching calls and the
|     // assigned node.
| if (partitionCallsByCallee(Node, AllCalls, NewCallToNode)) |
| continue; |
| |
| auto It = AllCalls.begin(); |
| // Iterate through the calls until we find the first that matches. |
| for (; It != AllCalls.end(); ++It) { |
| auto ThisCall = *It; |
| bool Match = true; |
| for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); |
| ++EI) { |
| auto Edge = *EI; |
| if (!Edge->Callee->hasCall()) |
| continue; |
| assert(NodeToCallingFunc.count(Edge->Callee)); |
| // Check if the called function matches that of the callee node. |
| if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) { |
| Match = false; |
| break; |
| } |
| } |
| // Found a call that matches the callee nodes, we can quit now. |
| if (Match) { |
| // If the first match is not the primary call on the Node, update it |
| // now. We will update the list of matching calls further below. |
| if (Node->Call != ThisCall) { |
| Node->setCall(ThisCall); |
| // We need to update the NonAllocationCallToContextNodeMap, but don't |
| // want to do this during iteration over that map, so save the calls |
| // that need updated entries. |
| NewCallToNode.push_back({ThisCall, Node}); |
| } |
| break; |
| } |
| } |
| // We will update this list below (or leave it cleared if there was no |
| // match found above). |
| Node->MatchingCalls.clear(); |
| // If we hit the end of the AllCalls vector, no call matching the callee |
| // nodes was found, clear the call information in the node. |
| if (It == AllCalls.end()) { |
| RemovedEdgesWithMismatchedCallees++; |
| // Work around by setting Node to have a null call, so it gets |
| // skipped during cloning. Otherwise assignFunctions will assert |
| // because its data structures are not designed to handle this case. |
| Node->setCall(CallInfo()); |
| continue; |
| } |
| // Now add back any matching calls that call the same function as the |
| // matching primary call on Node. |
| for (++It; It != AllCalls.end(); ++It) { |
| auto ThisCall = *It; |
| if (!sameCallee(Node->Call.call(), ThisCall.call())) |
| continue; |
| Node->MatchingCalls.push_back(ThisCall); |
| } |
| } |
| |
| // Remove all mismatched nodes identified in the above loop from the node map |
| // (checking whether they have a null call which is set above). For a |
| // MapVector like NonAllocationCallToContextNodeMap it is much more efficient |
| // to do the removal via remove_if than by individually erasing entries above. |
| // Also remove any entries if we updated the node's primary call above. |
| NonAllocationCallToContextNodeMap.remove_if([](const auto &it) { |
| return !it.second->hasCall() || it.second->Call != it.first; |
| }); |
| |
| // Add entries for any new primary calls recorded above. |
| for (auto &[Call, Node] : NewCallToNode) |
| NonAllocationCallToContextNodeMap[Call] = Node; |
| |
| // Add the new nodes after the above loop so that the iteration is not |
| // invalidated. |
| for (auto &[Call, Node] : TailCallToContextNodeMap) |
| NonAllocationCallToContextNodeMap[Call] = Node; |
| } |
| |
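| // Partition the calls on Node by callee function, assigning each group of
| // calls with the same callee its own context node (the first group reuses
| // Node itself). Callee edges not matching any call are moved to a node with
| // a null call so they are ignored during cloning. Returns false if no call
| // matched any callee node's function, so the caller can instead attempt the
| // tail call handling.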
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee( |
| ContextNode *Node, ArrayRef<CallInfo> AllCalls, |
| std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) { |
| // Struct to keep track of all the calls having the same callee function, |
| // and the node we eventually assign to them. Eventually we will record the |
| // context node assigned to this group of calls. |
| struct CallsWithSameCallee { |
| std::vector<CallInfo> Calls; |
| ContextNode *Node = nullptr; |
| }; |
| |
| // First partition calls by callee function. Build map from each function |
| // to the list of matching calls. |
| DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo; |
| for (auto ThisCall : AllCalls) { |
| auto *F = getCalleeFunc(ThisCall.call()); |
| if (F) |
| CalleeFuncToCallInfo[F].Calls.push_back(ThisCall); |
| } |
| |
| // Next, walk through all callee edges. For each callee node, get its |
| // containing function and see if it was recorded in the above map (meaning we |
| // have at least one matching call). Build another map from each callee node |
| // with a matching call to the structure instance created above containing all |
| // the calls. |
| DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo; |
| for (const auto &Edge : Node->CalleeEdges) { |
| if (!Edge->Callee->hasCall()) |
| continue; |
| const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; |
| if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc)) |
| CalleeNodeToCallInfo[Edge->Callee] = |
| &CalleeFuncToCallInfo[ProfiledCalleeFunc]; |
| } |
| |
|   // If there are no entries in the second map, then there were no matching
|   // calls/callees; nothing to do here. Return so we can go to the handling
|   // that looks through tail calls.
| if (CalleeNodeToCallInfo.empty()) |
| return false; |
| |
| // Walk through all callee edges again. Any and all callee edges that didn't |
| // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a |
| // new caller node (UnmatchedCalleesNode) which gets a null call so that it is |
| // ignored during cloning. If it is in the map, then we use the node recorded |
| // in that entry (creating it if needed), and move the callee edge to it. |
| // The first callee will use the original node instead of creating a new one. |
| // Note that any of the original calls on this node (in AllCalls) that didn't |
| // have a callee function automatically get dropped from the node as part of |
| // this process. |
| ContextNode *UnmatchedCalleesNode = nullptr; |
| // Track whether we already assigned original node to a callee. |
| bool UsedOrigNode = false; |
| assert(NodeToCallingFunc[Node]); |
| // Iterate over a copy of Node's callee edges, since we may need to remove |
| // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and |
| // makes it less error-prone. |
| auto CalleeEdges = Node->CalleeEdges; |
| for (auto &Edge : CalleeEdges) { |
| if (!Edge->Callee->hasCall()) |
| continue; |
| |
| // Will be updated below to point to whatever (caller) node this callee edge |
| // should be moved to. |
| ContextNode *CallerNodeToUse = nullptr; |
| |
| // Handle the case where there were no matching calls first. Move this |
| // callee edge to the UnmatchedCalleesNode, creating it if needed. |
| if (!CalleeNodeToCallInfo.contains(Edge->Callee)) { |
| if (!UnmatchedCalleesNode) |
| UnmatchedCalleesNode = |
| createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]); |
| CallerNodeToUse = UnmatchedCalleesNode; |
| } else { |
| // Look up the information recorded for this callee node, and use the |
| // recorded caller node (creating it if needed). |
| auto *Info = CalleeNodeToCallInfo[Edge->Callee]; |
| if (!Info->Node) { |
| // If we haven't assigned any callees to the original node use it. |
| if (!UsedOrigNode) { |
| Info->Node = Node; |
| // Clear the set of matching calls which will be updated below. |
| Node->MatchingCalls.clear(); |
| UsedOrigNode = true; |
| } else |
| Info->Node = |
| createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]); |
| assert(!Info->Calls.empty()); |
| // The first call becomes the primary call for this caller node, and the |
| // rest go in the matching calls list. |
| Info->Node->setCall(Info->Calls.front()); |
| llvm::append_range(Info->Node->MatchingCalls, |
| llvm::drop_begin(Info->Calls)); |
| // Save the primary call to node correspondence so that we can update |
| // the NonAllocationCallToContextNodeMap, which is being iterated in the |
| // caller of this function. |
| NewCallToNode.push_back({Info->Node->Call, Info->Node}); |
| } |
| CallerNodeToUse = Info->Node; |
| } |
| |
|     // Don't need to move the edge if we are using the original node.
| if (CallerNodeToUse == Node) |
| continue; |
| |
| moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse); |
| } |
| // Now that we are done moving edges, clean up any caller edges that ended |
| // up with no type or context ids. During moveCalleeEdgeToNewCaller all |
| // caller edges from Node are replicated onto the new callers, and it |
| // simplifies the handling to leave them until we have moved all |
| // edges/context ids. |
| for (auto &I : CalleeNodeToCallInfo) |
| removeNoneTypeCallerEdges(I.second->Node); |
| if (UnmatchedCalleesNode) |
| removeNoneTypeCallerEdges(UnmatchedCalleesNode); |
| removeNoneTypeCallerEdges(Node); |
| |
| return true; |
| } |
| |
| uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
| // In the Module (IR) case this is already the Id. |
| return IdOrIndex; |
| } |
| |
| uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { |
| // In the Index case this is an index into the stack id list in the summary |
| // index, convert it to an Id. |
| return Index.getStackIdAtIndex(IdOrIndex); |
| } |
| |
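| // Check whether the IR/summary callee of Call matches the profiled callee of
| // the edge at EI, possibly via a unique chain of tail calls, in which case
| // new nodes are synthesized for the intervening tail call frames and spliced
| // in between the edge's caller and callee.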
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( |
| CallTy Call, EdgeIter &EI, |
| MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { |
| auto Edge = *EI; |
| const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; |
| const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; |
| // Will be populated in order of callee to caller if we find a chain of tail |
| // calls between the profiled caller and callee. |
| std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; |
| if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc, |
| FoundCalleeChain)) |
| return false; |
| |
| // The usual case where the profiled callee matches that of the IR/summary. |
| if (FoundCalleeChain.empty()) |
| return true; |
| |
| auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { |
| auto *CurEdge = Callee->findEdgeFromCaller(Caller); |
| // If there is already an edge between these nodes, simply update it and |
| // return. |
| if (CurEdge) { |
| CurEdge->ContextIds.insert_range(Edge->ContextIds); |
| CurEdge->AllocTypes |= Edge->AllocTypes; |
| return; |
| } |
| // Otherwise, create a new edge and insert it into the caller and callee |
| // lists. |
| auto NewEdge = std::make_shared<ContextEdge>( |
| Callee, Caller, Edge->AllocTypes, Edge->ContextIds); |
| Callee->CallerEdges.push_back(NewEdge); |
| if (Caller == Edge->Caller) { |
| // If we are inserting the new edge into the current edge's caller, insert |
| // the new edge before the current iterator position, and then increment |
| // back to the current edge. |
| EI = Caller->CalleeEdges.insert(EI, NewEdge); |
| ++EI; |
| assert(*EI == Edge && |
| "Iterator position not restored after insert and increment"); |
| } else |
| Caller->CalleeEdges.push_back(NewEdge); |
| }; |
| |
| // Create new nodes for each found callee and connect in between the profiled |
| // caller and callee. |
| auto *CurCalleeNode = Edge->Callee; |
| for (auto &[NewCall, Func] : FoundCalleeChain) { |
| ContextNode *NewNode = nullptr; |
| // First check if we have already synthesized a node for this tail call. |
| if (TailCallToContextNodeMap.count(NewCall)) { |
| NewNode = TailCallToContextNodeMap[NewCall]; |
| NewNode->AllocTypes |= Edge->AllocTypes; |
| } else { |
| FuncToCallsWithMetadata[Func].push_back({NewCall}); |
| // Create Node and record node info. |
| NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall); |
| TailCallToContextNodeMap[NewCall] = NewNode; |
| NewNode->AllocTypes = Edge->AllocTypes; |
| } |
| |
| // Hook up node to its callee node |
| AddEdge(NewNode, CurCalleeNode); |
| |
| CurCalleeNode = NewNode; |
| } |
| |
| // Hook up edge's original caller to new callee node. |
| AddEdge(Edge->Caller, CurCalleeNode); |
| |
| #ifndef NDEBUG |
| // Save this because Edge's fields get cleared below when removed. |
| auto *Caller = Edge->Caller; |
| #endif |
| |
| // Remove old edge |
| removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true); |
| |
| // To simplify the increment of EI in the caller, subtract one from EI. |
| // In the final AddEdge call we would have either added a new callee edge, |
| // to Edge->Caller, or found an existing one. Either way we are guaranteed |
| // that there is at least one callee edge. |
| assert(!Caller->CalleeEdges.empty()); |
| --EI; |
| |
| return true; |
| } |
| |
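| // Recursively look for a unique chain of tail calls from CurCallee to
| // ProfiledCallee, searching up to TailCallSearchDepth deep. For example, if
| // the profile recorded caller A invoking ProfiledCallee C, but in the IR A
| // calls B which tail-calls C, the chain through B is recorded in
| // FoundCalleeChain. Fails (setting FoundMultipleCalleeChains) if more than
| // one such chain exists, since cloning would then be ambiguous.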
| bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
| const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, |
| std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, |
| bool &FoundMultipleCalleeChains) { |
| // Stop recursive search if we have already explored the maximum specified |
| // depth. |
| if (Depth > TailCallSearchDepth) |
| return false; |
| |
| auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { |
| FoundCalleeChain.push_back({Callsite, F}); |
| }; |
| |
| auto *CalleeFunc = dyn_cast<Function>(CurCallee); |
| if (!CalleeFunc) { |
| auto *Alias = dyn_cast<GlobalAlias>(CurCallee); |
| assert(Alias); |
| CalleeFunc = dyn_cast<Function>(Alias->getAliasee()); |
| assert(CalleeFunc); |
| } |
| |
| // Look for tail calls in this function, and check if they either call the |
| // profiled callee directly, or indirectly (via a recursive search). |
| // Only succeed if there is a single unique tail call chain found between the |
| // profiled caller and callee, otherwise we could perform incorrect cloning. |
| bool FoundSingleCalleeChain = false; |
| for (auto &BB : *CalleeFunc) { |
| for (auto &I : BB) { |
| auto *CB = dyn_cast<CallBase>(&I); |
| if (!CB || !CB->isTailCall()) |
| continue; |
| auto *CalledValue = CB->getCalledOperand(); |
| auto *CalledFunction = CB->getCalledFunction(); |
| if (CalledValue && !CalledFunction) { |
| CalledValue = CalledValue->stripPointerCasts(); |
| // Stripping pointer casts can reveal a called function. |
| CalledFunction = dyn_cast<Function>(CalledValue); |
| } |
| // Check if this is an alias to a function. If so, get the |
| // called aliasee for the checks below. |
| if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { |
| assert(!CalledFunction && |
| "Expected null called function in callsite for alias"); |
| CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); |
| } |
| if (!CalledFunction) |
| continue; |
| if (CalledFunction == ProfiledCallee) { |
| if (FoundSingleCalleeChain) { |
| FoundMultipleCalleeChains = true; |
| return false; |
| } |
| FoundSingleCalleeChain = true; |
| FoundProfiledCalleeCount++; |
| FoundProfiledCalleeDepth += Depth; |
| if (Depth > FoundProfiledCalleeMaxDepth) |
| FoundProfiledCalleeMaxDepth = Depth; |
| SaveCallsiteInfo(&I, CalleeFunc); |
| } else if (findProfiledCalleeThroughTailCalls( |
| ProfiledCallee, CalledFunction, Depth + 1, |
| FoundCalleeChain, FoundMultipleCalleeChains)) { |
| // findProfiledCalleeThroughTailCalls should not have returned |
| // true if FoundMultipleCalleeChains. |
| assert(!FoundMultipleCalleeChains); |
| if (FoundSingleCalleeChain) { |
| FoundMultipleCalleeChains = true; |
| return false; |
| } |
| FoundSingleCalleeChain = true; |
| SaveCallsiteInfo(&I, CalleeFunc); |
| } else if (FoundMultipleCalleeChains) |
| return false; |
| } |
| } |
| |
| return FoundSingleCalleeChain; |
| } |
| |
| const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) { |
| auto *CB = cast<CallBase>(Call);
| if (!CB->getCalledOperand() || CB->isIndirectCall()) |
| return nullptr; |
| auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); |
| auto *Alias = dyn_cast<GlobalAlias>(CalleeVal); |
| if (Alias) |
| return dyn_cast<Function>(Alias->getAliasee()); |
| return dyn_cast<Function>(CalleeVal); |
| } |
| |
| bool ModuleCallsiteContextGraph::calleeMatchesFunc( |
| Instruction *Call, const Function *Func, const Function *CallerFunc, |
| std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { |
| auto *CB = cast<CallBase>(Call);
| if (!CB->getCalledOperand() || CB->isIndirectCall()) |
| return false; |
| auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); |
| auto *CalleeFunc = dyn_cast<Function>(CalleeVal); |
| if (CalleeFunc == Func) |
| return true; |
| auto *Alias = dyn_cast<GlobalAlias>(CalleeVal); |
| if (Alias && Alias->getAliasee() == Func) |
| return true; |
| |
| // Recursively search for the profiled callee through tail calls starting with |
| // the actual Callee. The discovered tail call chain is saved in |
| // FoundCalleeChain, and we will fix up the graph to include these callsites
| // after returning. |
| // FIXME: We will currently redo the same recursive walk if we find the same |
| // mismatched callee from another callsite. We can improve this with more |
| // bookkeeping of the created chain of new nodes for each mismatch. |
| unsigned Depth = 1; |
| bool FoundMultipleCalleeChains = false; |
| if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth, |
| FoundCalleeChain, |
| FoundMultipleCalleeChains)) { |
| LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " |
| << Func->getName() << " from " << CallerFunc->getName() |
| << " that actually called " << CalleeVal->getName() |
| << (FoundMultipleCalleeChains |
| ? " (found multiple possible chains)" |
| : "") |
| << "\n"); |
| if (FoundMultipleCalleeChains) |
| FoundProfiledCalleeNonUniquelyCount++; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1, |
| Instruction *Call2) { |
| auto *CB1 = cast<CallBase>(Call1); |
| if (!CB1->getCalledOperand() || CB1->isIndirectCall()) |
| return false; |
| auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts(); |
| auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1); |
| auto *CB2 = cast<CallBase>(Call2); |
| if (!CB2->getCalledOperand() || CB2->isIndirectCall()) |
| return false; |
| auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts(); |
| auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2); |
| return CalleeFunc1 == CalleeFunc2; |
| } |
| |
| bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( |
| ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, |
| std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, |
| bool &FoundMultipleCalleeChains) { |
| // Stop recursive search if we have already explored the maximum specified |
| // depth. |
| if (Depth > TailCallSearchDepth) |
| return false; |
| |
| auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { |
| // Make a CallsiteInfo for each discovered callee, if one hasn't already |
| // been synthesized. |
| if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) || |
| !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee)) |
| // StackIds is empty (we don't have debug info available in the index for |
| // these callsites).
| FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] = |
| std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>()); |
| CallsiteInfo *NewCallsiteInfo = |
| FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get(); |
| FoundCalleeChain.push_back({NewCallsiteInfo, FS}); |
| }; |
| |
| // Look for tail calls in this function, and check if they either call the |
| // profiled callee directly, or indirectly (via a recursive search). |
| // Only succeed if there is a single unique tail call chain found between the |
| // profiled caller and callee, otherwise we could perform incorrect cloning. |
| bool FoundSingleCalleeChain = false; |
| for (auto &S : CurCallee.getSummaryList()) { |
| if (!GlobalValue::isLocalLinkage(S->linkage()) && |
| !isPrevailing(CurCallee.getGUID(), S.get())) |
| continue; |
| auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()); |
| if (!FS) |
| continue; |
| auto FSVI = CurCallee; |
| auto *AS = dyn_cast<AliasSummary>(S.get()); |
| if (AS) |
| FSVI = AS->getAliaseeVI(); |
| for (auto &CallEdge : FS->calls()) { |
| if (!CallEdge.second.hasTailCall()) |
| continue; |
| if (CallEdge.first == ProfiledCallee) { |
| if (FoundSingleCalleeChain) { |
| FoundMultipleCalleeChains = true; |
| return false; |
| } |
| FoundSingleCalleeChain = true; |
| FoundProfiledCalleeCount++; |
| FoundProfiledCalleeDepth += Depth; |
| if (Depth > FoundProfiledCalleeMaxDepth) |
| FoundProfiledCalleeMaxDepth = Depth; |
| CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
| // Add FS to FSToVIMap in case it isn't already there. |
| assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
| FSToVIMap[FS] = FSVI; |
| } else if (findProfiledCalleeThroughTailCalls( |
| ProfiledCallee, CallEdge.first, Depth + 1, |
| FoundCalleeChain, FoundMultipleCalleeChains)) { |
| // findProfiledCalleeThroughTailCalls should not have returned |
| // true if FoundMultipleCalleeChains. |
| assert(!FoundMultipleCalleeChains); |
| if (FoundSingleCalleeChain) { |
| FoundMultipleCalleeChains = true; |
| return false; |
| } |
| FoundSingleCalleeChain = true; |
| CreateAndSaveCallsiteInfo(CallEdge.first, FS); |
| // Add FS to FSToVIMap in case it isn't already there. |
| assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); |
| FSToVIMap[FS] = FSVI; |
| } else if (FoundMultipleCalleeChains) |
| return false; |
| } |
| } |
| |
| return FoundSingleCalleeChain; |
| } |
| |
| const FunctionSummary * |
| IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) { |
| ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee; |
| if (Callee.getSummaryList().empty()) |
| return nullptr; |
| return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject()); |
| } |
| |
| bool IndexCallsiteContextGraph::calleeMatchesFunc( |
| IndexCall &Call, const FunctionSummary *Func, |
| const FunctionSummary *CallerFunc, |
| std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { |
| ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee; |
| // If there is no summary list then this is a call to an externally defined |
| // symbol. |
| AliasSummary *Alias = |
| Callee.getSummaryList().empty() |
| ? nullptr |
| : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get()); |
| assert(FSToVIMap.count(Func)); |
| auto FuncVI = FSToVIMap[Func]; |
| if (Callee == FuncVI || |
| // If callee is an alias, check the aliasee, since only function |
| // summary base objects will contain the stack node summaries and thus |
| // get a context node. |
| (Alias && Alias->getAliaseeVI() == FuncVI)) |
| return true; |
| |
| // Recursively search for the profiled callee through tail calls starting with |
| // the actual Callee. The discovered tail call chain is saved in |
| // FoundCalleeChain, and we will fix up the graph to include these callsites
| // after returning. |
| // FIXME: We will currently redo the same recursive walk if we find the same |
| // mismatched callee from another callsite. We can improve this with more |
| // bookkeeping of the created chain of new nodes for each mismatch. |
| unsigned Depth = 1; |
| bool FoundMultipleCalleeChains = false; |
| if (!findProfiledCalleeThroughTailCalls( |
| FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) { |
| LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI |
| << " from " << FSToVIMap[CallerFunc] |
| << " that actually called " << Callee |
| << (FoundMultipleCalleeChains |
| ? " (found multiple possible chains)" |
| : "") |
| << "\n"); |
| if (FoundMultipleCalleeChains) |
| FoundProfiledCalleeNonUniquelyCount++; |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) { |
| ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee; |
| ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee; |
| return Callee1 == Callee2; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() |
| const { |
| print(dbgs()); |
| dbgs() << "\n"; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( |
| raw_ostream &OS) const { |
| OS << "Node " << this << "\n"; |
| OS << "\t"; |
| printCall(OS); |
| if (Recursive) |
| OS << " (recursive)"; |
| OS << "\n"; |
| if (!MatchingCalls.empty()) { |
| OS << "\tMatchingCalls:\n"; |
| for (auto &MatchingCall : MatchingCalls) { |
| OS << "\t"; |
| MatchingCall.print(OS); |
| OS << "\n"; |
| } |
| } |
| OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; |
| OS << "\tContextIds:"; |
| // Make a copy of the computed context ids that we can sort for stability. |
| auto ContextIds = getContextIds(); |
| std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
| std::sort(SortedIds.begin(), SortedIds.end()); |
| for (auto Id : SortedIds) |
| OS << " " << Id; |
| OS << "\n"; |
| OS << "\tCalleeEdges:\n"; |
| for (auto &Edge : CalleeEdges) |
| OS << "\t\t" << *Edge << "\n"; |
| OS << "\tCallerEdges:\n"; |
| for (auto &Edge : CallerEdges) |
| OS << "\t\t" << *Edge << "\n"; |
| if (!Clones.empty()) { |
| OS << "\tClones: " << llvm::interleaved(Clones) << "\n"; |
| } else if (CloneOf) { |
| OS << "\tClone of " << CloneOf << "\n"; |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() |
| const { |
| print(dbgs()); |
| dbgs() << "\n"; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( |
| raw_ostream &OS) const { |
| OS << "Edge from Callee " << Callee << " to Caller: " << Caller |
| << (IsBackedge ? " (BE)" : "") |
| << " AllocTypes: " << getAllocTypeString(AllocTypes); |
| OS << " ContextIds:"; |
| std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
| std::sort(SortedIds.begin(), SortedIds.end()); |
| for (auto Id : SortedIds) |
| OS << " " << Id; |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { |
| print(dbgs()); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( |
| raw_ostream &OS) const { |
| OS << "Callsite Context Graph:\n"; |
| using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
| for (const auto Node : nodes<GraphType>(this)) { |
| if (Node->isRemoved()) |
| continue; |
| Node->print(OS); |
| OS << "\n"; |
| } |
| } |
| |
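| // For reference, each line emitted by printTotalSizes below has the
| // following form (values hypothetical):
| //   MemProf hinting: cold full allocation context 123456789 with total
| //   size 4096 is cold after cloning (context id 2)
| // with an additional " marked <type> due to cold byte percent" suffix when
| // the attributed allocation type was overridden based on the cold byte
| // percentage.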
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes( |
| raw_ostream &OS) const { |
| using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
| for (const auto Node : nodes<GraphType>(this)) { |
| if (Node->isRemoved()) |
| continue; |
| if (!Node->IsAllocation) |
| continue; |
| DenseSet<uint32_t> ContextIds = Node->getContextIds(); |
| auto AllocTypeFromCall = getAllocationCallType(Node->Call); |
| std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
| std::sort(SortedIds.begin(), SortedIds.end()); |
| for (auto Id : SortedIds) { |
| auto TypeI = ContextIdToAllocationType.find(Id); |
| assert(TypeI != ContextIdToAllocationType.end()); |
| auto CSI = ContextIdToContextSizeInfos.find(Id); |
| if (CSI != ContextIdToContextSizeInfos.end()) { |
| for (auto &Info : CSI->second) { |
| OS << "MemProf hinting: " |
| << getAllocTypeString((uint8_t)TypeI->second) |
| << " full allocation context " << Info.FullStackId |
| << " with total size " << Info.TotalSize << " is " |
| << getAllocTypeString(Node->AllocTypes) << " after cloning"; |
| if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall) |
| OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall) |
| << " due to cold byte percent"; |
| // Print the internal context id to aid debugging and visualization. |
| OS << " (context id " << Id << ")"; |
| OS << "\n"; |
| } |
| } |
| } |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { |
| using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
| for (const auto Node : nodes<GraphType>(this)) { |
| checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
| for (auto &Edge : Node->CallerEdges) |
| checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { |
| using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
| using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; |
| |
| using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; |
| static NodeRef getNode(const NodePtrTy &P) { return P.get(); } |
| |
| using nodes_iterator = |
| mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, |
| decltype(&getNode)>; |
| |
| static nodes_iterator nodes_begin(GraphType G) { |
| return nodes_iterator(G->NodeOwner.begin(), &getNode); |
| } |
| |
| static nodes_iterator nodes_end(GraphType G) { |
| return nodes_iterator(G->NodeOwner.end(), &getNode); |
| } |
| |
| static NodeRef getEntryNode(GraphType G) { |
| return G->NodeOwner.begin()->get(); |
| } |
| |
| using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; |
| static const ContextNode<DerivedCCG, FuncTy, CallTy> * |
| GetCallee(const EdgePtrTy &P) { |
| return P->Callee; |
| } |
| |
| using ChildIteratorType = |
| mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< |
| DerivedCCG, FuncTy, CallTy>>>::const_iterator, |
| decltype(&GetCallee)>; |
| |
| static ChildIteratorType child_begin(NodeRef N) { |
| return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); |
| } |
| |
| static ChildIteratorType child_end(NodeRef N) { |
| return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); |
| } |
| }; |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> |
| : public DefaultDOTGraphTraits { |
| DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) { |
| // Enable highlighting if the user requested the full graph to be exported
| // but provided an allocation id, or if the user gave a context id and
| // requested a scope larger than that single context.
| DoHighlight = |
| (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) || |
| (ContextIdForDot.getNumOccurrences() && |
| DotGraphScope != DotScope::Context); |
| } |
| |
| using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; |
| using GTraits = GraphTraits<GraphType>; |
| using NodeRef = typename GTraits::NodeRef; |
| using ChildIteratorType = typename GTraits::ChildIteratorType; |
| |
| static std::string getNodeLabel(NodeRef Node, GraphType G) { |
| std::string LabelString = |
| (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") + |
| Twine(Node->OrigStackOrAllocId)) |
| .str(); |
| LabelString += "\n"; |
| if (Node->hasCall()) { |
| auto Func = G->NodeToCallingFunc.find(Node); |
| assert(Func != G->NodeToCallingFunc.end()); |
| LabelString += |
| G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); |
| } else { |
| LabelString += "null call"; |
| if (Node->Recursive) |
| LabelString += " (recursive)"; |
| else |
| LabelString += " (external)"; |
| } |
| return LabelString; |
| } |
| |
| static std::string getNodeAttributes(NodeRef Node, GraphType G) { |
| auto ContextIds = Node->getContextIds(); |
| // If highlighting is enabled, see if this node contains any of the context
| // ids of interest. If so, it will use a different color and a larger
| // fontsize (which makes the node larger as well).
| bool Highlight = false; |
| if (DoHighlight) { |
| assert(ContextIdForDot.getNumOccurrences() || |
| AllocIdForDot.getNumOccurrences()); |
| if (ContextIdForDot.getNumOccurrences()) |
| Highlight = ContextIds.contains(ContextIdForDot); |
| else |
| Highlight = set_intersects(ContextIds, G->DotAllocContextIds); |
| } |
| std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + |
| getContextIds(ContextIds) + "\"") |
| .str(); |
| // Default fontsize is 14 |
| if (Highlight) |
| AttributeString += ",fontsize=\"30\""; |
| AttributeString += |
| (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"") |
| .str(); |
| if (Node->CloneOf) { |
| AttributeString += ",color=\"blue\""; |
| AttributeString += ",style=\"filled,bold,dashed\""; |
| } else |
| AttributeString += ",style=\"filled\""; |
| return AttributeString; |
| } |
| |
| static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, |
| GraphType G) { |
| auto &Edge = *(ChildIter.getCurrent()); |
| // If highlighting is enabled, see if this edge contains any of the context
| // ids of interest. If so, it will use a different color and a heavier
| // arrow size and weight (the larger weight makes the highlighted path
| // straighter).
| bool Highlight = false; |
| if (DoHighlight) { |
| assert(ContextIdForDot.getNumOccurrences() || |
| AllocIdForDot.getNumOccurrences()); |
| if (ContextIdForDot.getNumOccurrences()) |
| Highlight = Edge->ContextIds.contains(ContextIdForDot); |
| else |
| Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds); |
| } |
| auto Color = getColor(Edge->AllocTypes, Highlight); |
| std::string AttributeString = |
| (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" + |
| // fillcolor is the arrow head and color is the line |
| Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color + |
| "\"") |
| .str(); |
| if (Edge->IsBackedge) |
| AttributeString += ",style=\"dotted\""; |
| // Default penwidth and weight are both 1. |
| if (Highlight) |
| AttributeString += ",penwidth=\"2.0\",weight=\"2\""; |
| return AttributeString; |
| } |
| |
| // Since the NodeOwner list includes nodes that are no longer connected to
| // the graph, skip them here. |
| static bool isNodeHidden(NodeRef Node, GraphType G) { |
| if (Node->isRemoved()) |
| return true; |
| // If a scope smaller than the full graph was requested, see if this node |
| // contains any of the context ids of interest. |
| if (DotGraphScope == DotScope::Alloc) |
| return !set_intersects(Node->getContextIds(), G->DotAllocContextIds); |
| if (DotGraphScope == DotScope::Context) |
| return !Node->getContextIds().contains(ContextIdForDot); |
| return false; |
| } |
| |
| private: |
| static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { |
| std::string IdString = "ContextIds:"; |
| if (ContextIds.size() < 100) { |
| std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); |
| std::sort(SortedIds.begin(), SortedIds.end()); |
| for (auto Id : SortedIds) |
| IdString += (" " + Twine(Id)).str(); |
| } else { |
| IdString += (" (" + Twine(ContextIds.size()) + " ids)").str(); |
| } |
| return IdString; |
| } |
| |
| static std::string getColor(uint8_t AllocTypes, bool Highlight) { |
| // If DoHighlight is not enabled, we want to use the highlight colors for
| // NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
| // compatible with the color scheme used before highlighting was supported,
| // and for NotCold+Cold the non-highlight color is a bit more readable.
| if (AllocTypes == (uint8_t)AllocationType::NotCold) |
| // Color "brown1" actually looks like a lighter red. |
| return !DoHighlight || Highlight ? "brown1" : "lightpink"; |
| if (AllocTypes == (uint8_t)AllocationType::Cold) |
| return !DoHighlight || Highlight ? "cyan" : "lightskyblue"; |
| if (AllocTypes == |
| ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) |
| return Highlight ? "magenta" : "mediumorchid1"; |
| return "gray"; |
| } |
| |
| static std::string getNodeId(NodeRef Node) { |
| std::stringstream SStream; |
| SStream << std::hex << "N0x" << (unsigned long long)Node; |
| std::string Result = SStream.str(); |
| return Result; |
| } |
| |
| // True if we should highlight a specific context or allocation's contexts in |
| // the emitted graph. |
| static bool DoHighlight; |
| }; |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| bool DOTGraphTraits< |
| const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight = |
| false; |
| |
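| // For example, with DotFilePathPrefix set to "/tmp/" and Label "cloned"
| // (values hypothetical), the graph below is written to /tmp/ccg.cloned.dot.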
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( |
| std::string Label) const { |
| WriteGraph(this, "", false, Label, |
| DotFilePathPrefix + "ccg." + Label + ".dot"); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * |
| CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( |
| const std::shared_ptr<ContextEdge> &Edge, |
| DenseSet<uint32_t> ContextIdsToMove) { |
| ContextNode *Node = Edge->Callee; |
| assert(NodeToCallingFunc.count(Node)); |
| ContextNode *Clone = |
| createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call); |
| Node->addClone(Clone); |
| Clone->MatchingCalls = Node->MatchingCalls; |
| moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true, |
| ContextIdsToMove); |
| return Clone; |
| } |
| |
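| // Illustrative example of the edge moves performed below, with hypothetical
| // context ids: suppose Edge carries ids {1,2,3} from caller A to callee B,
| // and ids {1} are moved to a clone B' of B:
| //
| //   Before:              After:
| //   A --{1,2,3}--> B     A --{2,3}--> B
| //                        A --{1}----> B'
| //
| // B's callee edges are then split the same way, so that id 1 continues to
| // flow through B' to its callees while ids 2 and 3 still flow through B.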
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
| moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, |
| ContextNode *NewCallee, bool NewClone, |
| DenseSet<uint32_t> ContextIdsToMove) { |
| // NewCallee and Edge's current callee must be clones of the same original |
| // node (Edge's current callee may be the original node too). |
| assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); |
| |
| bool EdgeIsRecursive = Edge->Callee == Edge->Caller; |
| |
| ContextNode *OldCallee = Edge->Callee; |
| |
| // We might already have an edge to the new callee from earlier cloning for a |
| // different allocation. If one exists we will reuse it. |
| auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller); |
| |
| // Callers will pass an empty ContextIdsToMove set when they want to move
| // all of the edge's context ids. Copy in Edge's ids for simplicity.
| if (ContextIdsToMove.empty()) |
| ContextIdsToMove = Edge->getContextIds(); |
| |
| // If we are moving all of Edge's ids, then just move the whole Edge. |
| // Otherwise only move the specified subset, to a new edge if needed. |
| if (Edge->getContextIds().size() == ContextIdsToMove.size()) { |
| // First, update the alloc types on NewCallee from Edge.
| // Do this before we potentially clear Edge's fields below! |
| NewCallee->AllocTypes |= Edge->AllocTypes; |
| // Moving the whole Edge. |
| if (ExistingEdgeToNewCallee) { |
| // Since we already have an edge to NewCallee, simply move the ids |
| // onto it, and remove the existing Edge. |
| ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove); |
| ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes; |
| assert(Edge->ContextIds == ContextIdsToMove); |
| removeEdgeFromGraph(Edge.get()); |
| } else { |
| // Otherwise just reconnect Edge to NewCallee. |
| Edge->Callee = NewCallee; |
| NewCallee->CallerEdges.push_back(Edge); |
| // Remove it from callee where it was previously connected. |
| OldCallee->eraseCallerEdge(Edge.get()); |
| // Don't need to update Edge's context ids since we are simply |
| // reconnecting it. |
| } |
| } else { |
| // Only moving a subset of Edge's ids. |
| // Compute the alloc type of the subset of ids being moved. |
| auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove); |
| if (ExistingEdgeToNewCallee) { |
| // Since we already have an edge to NewCallee, simply move the ids |
| // onto it. |
| ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove); |
| ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType; |
| } else { |
| // Otherwise, create a new edge to NewCallee for the ids being moved. |
| auto NewEdge = std::make_shared<ContextEdge>( |
| NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove); |
| Edge->Caller->CalleeEdges.push_back(NewEdge); |
| NewCallee->CallerEdges.push_back(NewEdge); |
| } |
| // In either case, we need to update the alloc types on NewCallee, remove
| // the moved ids from the original Edge, and recompute its alloc type.
| NewCallee->AllocTypes |= CallerEdgeAllocType; |
| set_subtract(Edge->ContextIds, ContextIdsToMove); |
| Edge->AllocTypes = computeAllocType(Edge->ContextIds); |
| } |
| // Now walk the old callee node's callee edges and move Edge's context ids |
| // over to the corresponding edge into the clone (which is created here if |
| // this is a newly created clone). |
| for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { |
| ContextNode *CalleeToUse = OldCalleeEdge->Callee; |
| // If this is a direct recursion edge, use NewCallee (the clone) as the |
| // callee as well, so that any edge updated/created here is also direct |
| // recursive. |
| if (CalleeToUse == OldCallee) { |
| // If this is a recursive edge, see if we already moved a recursive edge |
| // (which would have to have been this one) - if we were only moving a |
| // subset of context ids it would still be on OldCallee. |
| if (EdgeIsRecursive) { |
| assert(OldCalleeEdge == Edge); |
| continue; |
| } |
| CalleeToUse = NewCallee; |
| } |
| // The context ids moving to the new callee are the subset of this edge's |
| // context ids and the context ids on the caller edge being moved. |
| DenseSet<uint32_t> EdgeContextIdsToMove = |
| set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove); |
| set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); |
| OldCalleeEdge->AllocTypes = |
| computeAllocType(OldCalleeEdge->getContextIds()); |
| if (!NewClone) { |
| // Update context ids / alloc type on corresponding edge to NewCallee. |
| // There is a chance this may not exist if we are reusing an existing |
| // clone, specifically during function assignment, where we would have |
| // removed none type edges after creating the clone. If we can't find |
| // a corresponding edge there, fall through to the cloning below. |
| if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) { |
| NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove); |
| NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove); |
| continue; |
| } |
| } |
| auto NewEdge = std::make_shared<ContextEdge>( |
| CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove), |
| EdgeContextIdsToMove); |
| NewCallee->CalleeEdges.push_back(NewEdge); |
| NewEdge->Callee->CallerEdges.push_back(NewEdge); |
| } |
| // Recompute the node alloc type now that its callee edges have been |
| // updated (since we will compute from those edges). |
| OldCallee->AllocTypes = OldCallee->computeAllocType(); |
| // OldCallee alloc type should be None iff its context id set is now empty. |
| assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == |
| OldCallee->emptyContextIds()); |
| if (VerifyCCG) { |
| checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false); |
| checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false); |
| for (const auto &OldCalleeEdge : OldCallee->CalleeEdges) |
| checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee, |
| /*CheckEdges=*/false); |
| for (const auto &NewCalleeEdge : NewCallee->CalleeEdges) |
| checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee, |
| /*CheckEdges=*/false); |
| } |
| } |
| |
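| // Illustrative example, with hypothetical nodes: given an edge from callee
| // C to caller A, and a new caller clone A', the move below reconnects the
| // edge so that it runs from C to A', and splits A's caller edges so that
| // the moved context ids flow into A' instead of A. A direct recursive edge
| // A -> A is remapped to A' -> A' so that it stays recursive on the clone.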
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
| moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge, |
| ContextNode *NewCaller) { |
| auto *OldCallee = Edge->Callee; |
| auto *NewCallee = OldCallee; |
| // If this edge was direct recursive, make any new/updated edge also direct |
| // recursive to NewCaller. |
| bool Recursive = Edge->Caller == Edge->Callee; |
| if (Recursive) |
| NewCallee = NewCaller; |
| |
| ContextNode *OldCaller = Edge->Caller; |
| OldCaller->eraseCalleeEdge(Edge.get()); |
| |
| // We might already have an edge to the new caller. If one exists we will |
| // reuse it. |
| auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee); |
| |
| if (ExistingEdgeToNewCaller) { |
| // Since we already have an edge to NewCaller, simply move the ids |
| // onto it, and remove the existing Edge. |
| ExistingEdgeToNewCaller->getContextIds().insert_range( |
| Edge->getContextIds()); |
| ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes; |
| Edge->ContextIds.clear(); |
| Edge->AllocTypes = (uint8_t)AllocationType::None; |
| OldCallee->eraseCallerEdge(Edge.get()); |
| } else { |
| // Otherwise just reconnect Edge to NewCaller. |
| Edge->Caller = NewCaller; |
| NewCaller->CalleeEdges.push_back(Edge); |
| if (Recursive) { |
| assert(NewCallee == NewCaller); |
| // In the case of (direct) recursive edges, we update the callee as well |
| // so that it becomes recursive on the new caller. |
| Edge->Callee = NewCallee; |
| NewCallee->CallerEdges.push_back(Edge); |
| OldCallee->eraseCallerEdge(Edge.get()); |
| } |
| // Don't need to update Edge's context ids since we are simply |
| // reconnecting it. |
| } |
| // In either case, we need to update the alloc types on NewCaller.
| NewCaller->AllocTypes |= Edge->AllocTypes; |
| |
| // Now walk the old caller node's caller edges and move Edge's context ids |
| // over to the corresponding edge into the node (which is created here if |
| // this is a newly created node). We can tell whether this is a newly created |
| // node by seeing if it has any caller edges yet. |
| #ifndef NDEBUG |
| bool IsNewNode = NewCaller->CallerEdges.empty(); |
| #endif |
| // If we just moved a direct recursive edge, presumably its context ids should |
| // also flow out of OldCaller via some other non-recursive callee edge. We |
| // don't want to remove the recursive context ids from other caller edges yet, |
| // otherwise the context ids get into an inconsistent state on OldCaller. |
| // We will update these context ids on the non-recursive caller edge when and |
| // if they are updated on the non-recursive callee. |
| if (!Recursive) { |
| for (auto &OldCallerEdge : OldCaller->CallerEdges) { |
| auto OldCallerCaller = OldCallerEdge->Caller; |
| // The context ids moving to the new caller are the subset of this edge's |
| // context ids and the context ids on the callee edge being moved. |
| DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection( |
| OldCallerEdge->getContextIds(), Edge->getContextIds()); |
| if (OldCaller == OldCallerCaller) { |
| OldCallerCaller = NewCaller; |
| // Don't actually move this one. The caller will move it directly via a |
| // call to this function with this as the Edge if it is appropriate to |
| // move to a different node that has a matching callee (itself).
| continue; |
| } |
| set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove); |
| OldCallerEdge->AllocTypes = |
| computeAllocType(OldCallerEdge->getContextIds()); |
| // In this function we expect that any pre-existing node already has edges |
| // from the same callers as the old node. That should be true in the |
| // current use case, where we will remove None-type edges after copying |
| // over all caller edges from the callee. |
| auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller); |
| // Since we would have skipped caller edges when moving a direct recursive |
| // edge, this may not hold true when recursive handling is enabled.
| assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites); |
| if (ExistingCallerEdge) { |
| ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove); |
| ExistingCallerEdge->AllocTypes |= |
| computeAllocType(EdgeContextIdsToMove); |
| continue; |
| } |
| auto NewEdge = std::make_shared<ContextEdge>( |
| NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove), |
| EdgeContextIdsToMove); |
| NewCaller->CallerEdges.push_back(NewEdge); |
| NewEdge->Caller->CalleeEdges.push_back(NewEdge); |
| } |
| } |
| // Recompute the node alloc type now that its caller edges have been |
| // updated (since we will compute from those edges). |
| OldCaller->AllocTypes = OldCaller->computeAllocType(); |
| // OldCaller alloc type should be None iff its context id set is now empty. |
| assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) == |
| OldCaller->emptyContextIds()); |
| if (VerifyCCG) { |
| checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false); |
| checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false); |
| for (const auto &OldCallerEdge : OldCaller->CallerEdges) |
| checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller, |
| /*CheckEdges=*/false); |
| for (const auto &NewCallerEdge : NewCaller->CallerEdges) |
| checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller, |
| /*CheckEdges=*/false); |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
| recursivelyRemoveNoneTypeCalleeEdges( |
| ContextNode *Node, DenseSet<const ContextNode *> &Visited) { |
| auto Inserted = Visited.insert(Node); |
| if (!Inserted.second) |
| return; |
| |
| removeNoneTypeCalleeEdges(Node); |
| |
| for (auto *Clone : Node->Clones) |
| recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited); |
| |
| // The recursive call may remove some of this Node's caller edges. |
| // Iterate over a copy and skip any that were removed. |
| auto CallerEdges = Node->CallerEdges; |
| for (auto &Edge : CallerEdges) { |
| // Skip any that have been removed by an earlier recursive call. |
| if (Edge->isRemoved()) { |
| assert(!is_contained(Node->CallerEdges, Edge)); |
| continue; |
| } |
| recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited); |
| } |
| } |
| |
| // This is the standard DFS-based backedge discovery algorithm.
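| // For example, with hypothetical nodes and caller -> callee edges
| //   A -> B, B -> C, C -> B
| // a DFS from root A visits B and then C while both are on the current
| // stack; the edge C -> B targets a node still on the stack and is
| // therefore marked as a backedge.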
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() { |
| // If we are cloning recursive contexts, find and mark backedges from all root |
| // callers, using the typical DFS-based backedge analysis.
| if (!CloneRecursiveContexts) |
| return; |
| DenseSet<const ContextNode *> Visited; |
| DenseSet<const ContextNode *> CurrentStack; |
| for (auto &Entry : NonAllocationCallToContextNodeMap) { |
| auto *Node = Entry.second; |
| if (Node->isRemoved()) |
| continue; |
| // It is a root if it doesn't have callers. |
| if (!Node->CallerEdges.empty()) |
| continue; |
| markBackedges(Node, Visited, CurrentStack); |
| assert(CurrentStack.empty()); |
| } |
| } |
| |
| // Recursive helper for the above markBackedges method.
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges( |
| ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| DenseSet<const ContextNode *> &CurrentStack) { |
| auto I = Visited.insert(Node); |
| // We should only call this for unvisited nodes. |
| assert(I.second); |
| (void)I; |
| for (auto &CalleeEdge : Node->CalleeEdges) { |
| auto *Callee = CalleeEdge->Callee; |
| if (Visited.count(Callee)) { |
| // Since this was already visited we need to check if it is currently on |
| // the recursive stack in which case it is a backedge. |
| if (CurrentStack.count(Callee)) |
| CalleeEdge->IsBackedge = true; |
| continue; |
| } |
| CurrentStack.insert(Callee); |
| markBackedges(Callee, Visited, CurrentStack); |
| CurrentStack.erase(Callee); |
| } |
| } |
| |
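| // Illustrative example of the cloning performed below, with hypothetical
| // nodes: an allocation node N has two caller edges, one carrying only cold
| // context ids and one carrying only notcold ids. Cloning creates N' and
| // moves the cold caller edge (along with the corresponding subset of N's
| // callee edge context ids) to it, so that N' can later be annotated cold
| // while the original N keeps the notcold behavior.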
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() { |
| DenseSet<const ContextNode *> Visited; |
| for (auto &Entry : AllocationCallToContextNodeMap) { |
| Visited.clear(); |
| identifyClones(Entry.second, Visited, Entry.second->getContextIds()); |
| } |
| Visited.clear(); |
| for (auto &Entry : AllocationCallToContextNodeMap) |
| recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited); |
| if (VerifyCCG) |
| check(); |
| } |
| |
| // Helper function to check whether an AllocType is cold, notcold, or both.
| bool checkColdOrNotCold(uint8_t AllocType) { |
| return (AllocType == (uint8_t)AllocationType::Cold) || |
| (AllocType == (uint8_t)AllocationType::NotCold) || |
| (AllocType == |
| ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold)); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( |
| ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| const DenseSet<uint32_t> &AllocContextIds) { |
| if (VerifyNodes) |
| checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
| assert(!Node->CloneOf); |
| |
| // If Node has a null call, then either it wasn't found in the module (regular
| // LTO) or summary index (ThinLTO), or there were other conditions blocking |
| // cloning (e.g. recursion, calls multiple targets, etc). |
| // Do this here so that we don't try to recursively clone callers below, which |
| // isn't useful at least for this node. |
| if (!Node->hasCall()) |
| return; |
| |
| // No need to look at any callers if allocation type already unambiguous. |
| if (hasSingleAllocType(Node->AllocTypes)) |
| return; |
| |
| #ifndef NDEBUG |
| auto Insert = |
| #endif |
| Visited.insert(Node); |
| // We should not have visited this node yet. |
| assert(Insert.second); |
| // The recursive call to identifyClones may delete the current edge from the |
| // CallerEdges vector. Make a copy and iterate on that, simpler than passing |
| // in an iterator and having recursive call erase from it. Other edges may |
| // also get removed during the recursion, which will have null Callee and |
| // Caller pointers (and are deleted later), so we skip those below. |
| { |
| auto CallerEdges = Node->CallerEdges; |
| for (auto &Edge : CallerEdges) { |
| // Skip any that have been removed by an earlier recursive call. |
| if (Edge->isRemoved()) { |
| assert(!is_contained(Node->CallerEdges, Edge)); |
| continue; |
| } |
| // Defer backedges. See comments further below where these edges are |
| // handled during the cloning of this Node. |
| if (Edge->IsBackedge) { |
| // We should only mark these if cloning recursive contexts, where we |
| // need to do this deferral. |
| assert(CloneRecursiveContexts); |
| continue; |
| } |
| // Ignore any caller we previously visited via another edge. |
| if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) { |
| identifyClones(Edge->Caller, Visited, AllocContextIds); |
| } |
| } |
| } |
| |
| // Check if we reached an unambiguous call or have only a single caller.
| if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
| return; |
| |
| // We need to clone. |
| |
| // Try to keep the original version as alloc type NotCold. This will make |
| // cases with indirect calls or any other situation with an unknown call to |
| // the original function get the default behavior. We do this by sorting the |
| // CallerEdges of the Node we will clone by alloc type. |
| // |
| // Give NotCold edge the lowest sort priority so those edges are at the end of |
| // the caller edges vector, and stay on the original version (since the below |
| // code clones greedily until it finds all remaining edges have the same type |
| // and leaves the remaining ones on the original Node). |
| // |
| // We shouldn't actually have any None type edges, so the sorting priority for |
| // that is arbitrary, and we assert in that case below. |
| const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4, |
| /*Cold*/ 1, |
| /*NotColdCold*/ 2}; |
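| // For example, caller edges with alloc types [NotCold, NotColdCold, Cold]
| // sort as [Cold, NotColdCold, NotCold] under these priorities, so the
| // cloning loop below peels off the cold-only callers first and the NotCold
| // callers remain on the original node.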
| llvm::stable_sort(Node->CallerEdges, |
| [&](const std::shared_ptr<ContextEdge> &A, |
| const std::shared_ptr<ContextEdge> &B) { |
| // Nodes with non-empty context ids should be sorted |
| // before those with empty context ids. |
| if (A->ContextIds.empty()) |
| // Either B ContextIds are non-empty (in which case we |
| // should return false because B < A), or B ContextIds |
| // are empty, in which case they are equal, and we |
| // should maintain the original relative ordering. |
| return false; |
| if (B->ContextIds.empty()) |
| return true; |
| |
| if (A->AllocTypes == B->AllocTypes) |
| // Use the first context id for each edge as a |
| // tie-breaker. |
| return *A->ContextIds.begin() < *B->ContextIds.begin(); |
| return AllocTypeCloningPriority[A->AllocTypes] < |
| AllocTypeCloningPriority[B->AllocTypes]; |
| }); |
| |
| assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
| |
| DenseSet<uint32_t> RecursiveContextIds; |
| assert(AllowRecursiveContexts || !CloneRecursiveContexts); |
| // If we are allowing recursive callsites, but have also disabled recursive |
| // contexts, look for context ids that show up in multiple caller edges. |
| if (AllowRecursiveCallsites && !AllowRecursiveContexts) { |
| DenseSet<uint32_t> AllCallerContextIds; |
| for (auto &CE : Node->CallerEdges) { |
| // Reserve space for the largest set of caller context ids, since we know
| // the final set will be at least that large.
| AllCallerContextIds.reserve(CE->getContextIds().size()); |
| for (auto Id : CE->getContextIds()) |
| if (!AllCallerContextIds.insert(Id).second) |
| RecursiveContextIds.insert(Id); |
| } |
| } |
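| // For example (context ids hypothetical), if context id 5 appears on two
| // different caller edges of Node, the context recurses through Node; id 5
| // is then added to RecursiveContextIds and excluded from the per-caller
| // context id sets used in the cloning decisions below.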
| |
| // Iterate until we find no more opportunities for disambiguating the alloc |
| // types via cloning. In most cases this loop will terminate once the Node |
| // has a single allocation type, in which case no more cloning is needed. |
| // Iterate over a copy of Node's caller edges, since we may need to remove |
| // edges in the moveEdgeTo* methods, and this simplifies the handling and |
| // makes it less error-prone. |
| auto CallerEdges = Node->CallerEdges; |
| for (auto &CallerEdge : CallerEdges) { |
| // Skip any that have been removed by an earlier recursive call. |
| if (CallerEdge->isRemoved()) { |
| assert(!is_contained(Node->CallerEdges, CallerEdge)); |
| continue; |
| } |
| assert(CallerEdge->Callee == Node); |
| |
| // See if cloning the prior caller edge left this node with a single alloc |
| // type or a single caller. In that case no more cloning of Node is needed. |
| if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) |
| break; |
| |
| // If the caller was not successfully matched to a call in the IR/summary, |
| // there is no point in trying to clone for it as we can't update that call. |
| if (!CallerEdge->Caller->hasCall()) |
| continue; |
| |
| // Only need to process the ids along this edge pertaining to the given |
| // allocation. |
| auto CallerEdgeContextsForAlloc = |
| set_intersection(CallerEdge->getContextIds(), AllocContextIds); |
| if (!RecursiveContextIds.empty()) |
| CallerEdgeContextsForAlloc = |
| set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds); |
| if (CallerEdgeContextsForAlloc.empty()) |
| continue; |
| |
| auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc); |
| |
| // Compute the node callee edge alloc types corresponding to the context ids |
| // for this caller edge. |
| std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; |
| CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size()); |
| for (auto &CalleeEdge : Node->CalleeEdges) |
| CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes( |
| CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc)); |
| |
| // Don't clone if doing so will not disambiguate any alloc types amongst |
| // caller edges (including the callee edges that would be cloned). |
| // Otherwise we will simply move all edges to the clone. |
| // |
| // First check if by cloning we will disambiguate the caller allocation |
| // type from the node's allocation type. Query allocTypeToUse so that we don't
| // bother cloning to distinguish NotCold+Cold from NotCold. Note that |
| // neither of these should be None type. |
| // |
| // Then check if by cloning the node at least one of the callee edges will be
| // disambiguated by splitting out different context ids. |
| // |
| // However, always do the cloning if this is a backedge, in which case we |
| // have not yet cloned along this caller edge. |
| assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); |
| assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
| if (!CallerEdge->IsBackedge && |
| allocTypeToUse(CallerAllocTypeForAlloc) == |
| allocTypeToUse(Node->AllocTypes) && |
| allocTypesMatch<DerivedCCG, FuncTy, CallTy>( |
| CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { |
| continue; |
| } |
| |
| if (CallerEdge->IsBackedge) { |
| // We should only mark these if cloning recursive contexts, where we |
| // need to do this deferral. |
| assert(CloneRecursiveContexts); |
| DeferredBackedges++; |
| } |
| |
| // If this is a backedge, we now do recursive cloning starting from its |
| // caller since we may have moved unambiguous caller contexts to a clone |
| // of this Node in a previous iteration of the current loop, giving more |
| // opportunity for cloning through the backedge. Because we sorted the |
| // caller edges earlier so that cold caller edges are first, we would have |
| // visited and cloned this node for any unambiguously cold non-recursive
| // callers before any ambiguous backedge callers. Note that we don't do this |
| // if the caller is already cloned or visited during cloning (e.g. via a |
| // different context path from the allocation). |
| // TODO: Can we do better in the case where the caller was already visited? |
| if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf && |
| !Visited.count(CallerEdge->Caller)) { |
| const auto OrigIdCount = CallerEdge->getContextIds().size(); |
| // Now do the recursive cloning of this backedge's caller, which was |
| // deferred earlier. |
| identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc); |
| removeNoneTypeCalleeEdges(CallerEdge->Caller); |
| // See if the recursive call to identifyClones moved the context ids to a |
| // new edge from this node to a clone of caller, and switch to looking at |
| // that new edge so that we clone Node for the new caller clone. |
| bool UpdatedEdge = false; |
| if (OrigIdCount > CallerEdge->getContextIds().size()) { |
| for (auto E : Node->CallerEdges) { |
| // Only interested in clones of the current edge's caller.
| if (E->Caller->CloneOf != CallerEdge->Caller) |
| continue; |
| // See if this edge contains any of the context ids originally on the |
| // current caller edge. |
| auto CallerEdgeContextsForAllocNew = |
| set_intersection(CallerEdgeContextsForAlloc, E->getContextIds()); |
| if (CallerEdgeContextsForAllocNew.empty()) |
| continue; |
| // Make sure we don't pick a previously existing caller edge of this |
| // Node, which would be processed on a different iteration of the |
| // outer loop over the saved CallerEdges. |
| if (llvm::is_contained(CallerEdges, E)) |
| continue; |
| // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge |
| // are updated further below for all cases where we just invoked |
| // identifyClones recursively. |
| CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew); |
| CallerEdge = E; |
| UpdatedEdge = true; |
| break; |
| } |
| } |
| // If cloning removed this edge (and we didn't update it to a new edge |
| // above), we're done with this edge. It's possible we moved all of the |
| // context ids to an existing clone, in which case there's no need to do |
| // further processing for them. |
| if (CallerEdge->isRemoved()) |
| continue; |
| |
| // Now we need to update the information used for the cloning decisions |
| // further below, as we may have modified edges and their context ids. |
| |
| // Note if we changed the CallerEdge above we would have already updated |
| // the context ids. |
| if (!UpdatedEdge) { |
| CallerEdgeContextsForAlloc = set_intersection( |
| CallerEdgeContextsForAlloc, CallerEdge->getContextIds()); |
| if (CallerEdgeContextsForAlloc.empty()) |
| continue; |
| } |
| // Update the other information that depends on the edges and on the now |
| // updated CallerEdgeContextsForAlloc. |
| CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc); |
| CalleeEdgeAllocTypesForCallerEdge.clear(); |
| for (auto &CalleeEdge : Node->CalleeEdges) { |
| CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes( |
| CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc)); |
| } |
| } |
| |
| // First see if we can use an existing clone. Check each clone and its |
| // callee edges for matching alloc types. |
| ContextNode *Clone = nullptr; |
| for (auto *CurClone : Node->Clones) { |
| if (allocTypeToUse(CurClone->AllocTypes) != |
| allocTypeToUse(CallerAllocTypeForAlloc)) |
| continue; |
| |
| bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) && |
| hasSingleAllocType(CallerAllocTypeForAlloc); |
| // The above check means that if both have a single alloc type, the types
| // must be equal.
| assert(!BothSingleAlloc || |
| CurClone->AllocTypes == CallerAllocTypeForAlloc); |
| |
| // If either both have a single alloc type (which are the same), or if the |
| // clone's callee edges have the same alloc types as those for the current |
| // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge), |
| // then we can reuse this clone. |
| if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>( |
| CalleeEdgeAllocTypesForCallerEdge, CurClone)) { |
| Clone = CurClone; |
| break; |
| } |
| } |
| |
| // Note that we are iterating over a copy of the caller edges above, so it
| // is safe for the moveEdgeTo* calls below to remove or add edges.
| if (Clone) |
| moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false, |
| CallerEdgeContextsForAlloc); |
| else |
| Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc); |
| |
| // Sanity check that no alloc types on clone or its edges are None. |
| assert(Clone->AllocTypes != (uint8_t)AllocationType::None); |
| } |
| |
| // We should still have some context ids on the original Node. |
| assert(!Node->emptyContextIds()); |
| |
| // Sanity check that no alloc types on node or edges are None. |
| assert(Node->AllocTypes != (uint8_t)AllocationType::None); |
| |
| if (VerifyNodes) |
| checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); |
| } |
| |
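| // Illustrative result of the annotation below, with hypothetical IR: the
| // allocation call gets a "memprof" function attribute recording its type:
| //   %call = call ptr @malloc(i64 %size) #1
| //   ...
| //   attributes #1 = { "memprof"="cold" }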
| void ModuleCallsiteContextGraph::updateAllocationCall( |
| CallInfo &Call, AllocationType AllocType) { |
| std::string AllocTypeString = getAllocTypeAttributeString(AllocType); |
| auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), |
| "memprof", AllocTypeString); |
| cast<CallBase>(Call.call())->addFnAttr(A); |
| OREGetter(Call.call()->getFunction()) |
| .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) |
| << ore::NV("AllocationCall", Call.call()) << " in clone " |
| << ore::NV("Caller", Call.call()->getFunction()) |
| << " marked with memprof allocation attribute " |
| << ore::NV("Attribute", AllocTypeString)); |
| } |
| |
| void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, |
| AllocationType AllocType) { |
| auto *AI = cast<AllocInfo *>(Call.call()); |
| assert(AI); |
| assert(AI->Versions.size() > Call.cloneNo()); |
| AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; |
| } |
| |
| AllocationType |
| ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { |
| const auto *CB = cast<CallBase>(Call.call()); |
| if (!CB->getAttributes().hasFnAttr("memprof")) |
| return AllocationType::None; |
| return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" |
| ? AllocationType::Cold |
| : AllocationType::NotCold; |
| } |
| |
| AllocationType |
| IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { |
| const auto *AI = cast<AllocInfo *>(Call.call()); |
| assert(AI->Versions.size() > Call.cloneNo()); |
| return (AllocationType)AI->Versions[Call.cloneNo()]; |
| } |
| |
| void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
| FuncInfo CalleeFunc) { |
| if (CalleeFunc.cloneNo() > 0) |
| cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); |
| OREGetter(CallerCall.call()->getFunction()) |
| .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) |
| << ore::NV("Call", CallerCall.call()) << " in clone " |
| << ore::NV("Caller", CallerCall.call()->getFunction()) |
| << " assigned to call function clone " |
| << ore::NV("Callee", CalleeFunc.func())); |
| } |
| |
| void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, |
| FuncInfo CalleeFunc) { |
| auto *CI = cast<CallsiteInfo *>(CallerCall.call()); |
| assert(CI && |
| "Caller cannot be an allocation which should not have profiled calls"); |
| assert(CI->Clones.size() > CallerCall.cloneNo()); |
| CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); |
| } |
| |
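| // Illustrative example, assuming the usual ".memprof." clone suffix: when
| // cloning a hypothetical function @foo for CloneNo 1, the code below
| // produces a clone named @foo.memprof.1, with CallMap mapping each original
| // call with metadata to its counterpart in the clone.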
| CallsiteContextGraph<ModuleCallsiteContextGraph, Function, |
| Instruction *>::FuncInfo |
| ModuleCallsiteContextGraph::cloneFunctionForCallsite( |
| FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
| std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
| // Use existing LLVM facilities for cloning and obtaining the Call in the clone.
| ValueToValueMapTy VMap; |
| auto *NewFunc = CloneFunction(Func.func(), VMap); |
| std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); |
| assert(!Func.func()->getParent()->getFunction(Name)); |
| NewFunc->setName(Name); |
| for (auto &Inst : CallsWithMetadataInFunc) { |
| // This map always has the initial version in it. |
| assert(Inst.cloneNo() == 0); |
| CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo}; |
| } |
| OREGetter(Func.func()) |
| .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) |
| << "created clone " << ore::NV("NewFunction", NewFunc)); |
| return {NewFunc, CloneNo}; |
| } |
| |
| CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, |
| IndexCall>::FuncInfo |
| IndexCallsiteContextGraph::cloneFunctionForCallsite( |
| FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, |
| std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { |
| // Check how many clones we have of Call (and therefore function). |
| // The next clone number is the current size of versions array. |
| // Confirm this matches the CloneNo provided by the caller, which is based on |
| // the number of function clones we have. |
| assert(CloneNo == (isa<AllocInfo *>(Call.call()) |
| ? cast<AllocInfo *>(Call.call())->Versions.size() |
| : cast<CallsiteInfo *>(Call.call())->Clones.size())); |
| // Walk all the instructions in this function. Create a new version for |
| // each (by adding an entry to the Versions/Clones summary array), and copy |
| // over the version being called for the function clone being cloned here. |
| // Additionally, add an entry to the CallMap for the new function clone, |
| // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) |
| // to the new call clone. |
| for (auto &Inst : CallsWithMetadataInFunc) { |
| // This map always has the initial version in it. |
| assert(Inst.cloneNo() == 0); |
| if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) { |
| assert(AI->Versions.size() == CloneNo); |
| // We assign the allocation type later (in updateAllocationCall), just add |
| // an entry for it here. |
| AI->Versions.push_back(0); |
| } else { |
| auto *CI = cast<CallsiteInfo *>(Inst.call()); |
| assert(CI && CI->Clones.size() == CloneNo); |
| // We assign the clone number later (in updateCall), just add an entry for |
| // it here. |
| CI->Clones.push_back(0); |
| } |
| CallMap[Inst] = {Inst.call(), CloneNo}; |
| } |
| return {Func.func(), CloneNo}; |
| } |
| |
| // We perform cloning for each allocation node separately. However, this |
| // sometimes results in a situation where the same node calls multiple |
| // clones of the same callee, created for different allocations. This |
| // causes issues when assigning functions to these clones, as each node can |
| // in reality only call a single callee clone. |
| // |
| // To address this, before assigning functions, merge callee clone nodes as |
| // needed using a post order traversal from the allocations. We attempt to |
| // use existing clones as the merge node when legal, and to share them |
| // among callers with the same properties (callers calling the same set of |
| // callee clone nodes for the same allocations). |
| // |
| // Without this fix, in some cases incorrect function assignment will lead |
| // to calling the wrong allocation clone. |
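//
// For example (illustrative): suppose callsite node C has callee edges to
// both B.1 (a clone of B created for allocation A1) and B.2 (a clone of B
// created for allocation A2). The single call recorded at C can only be
// redirected to one function clone of B, so B.1 and B.2 must be merged (or
// their edges moved to a common merge clone) before function assignment.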
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() { |
| if (!MergeClones) |
| return; |
| |
| // Generate a map from context id to the associated allocation node for use |
| // when merging clones. |
| DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode; |
| for (auto &Entry : AllocationCallToContextNodeMap) { |
| auto *Node = Entry.second; |
| for (auto Id : Node->getContextIds()) |
| ContextIdToAllocationNode[Id] = Node->getOrigNode(); |
| for (auto *Clone : Node->Clones) { |
| for (auto Id : Clone->getContextIds()) |
| ContextIdToAllocationNode[Id] = Clone->getOrigNode(); |
| } |
| } |
| |
| // Post order traversal starting from allocations to ensure each callsite |
| // calls a single clone of its callee. Callee nodes that are clones of each |
| // other are merged (via new merge nodes if needed) to achieve this. |
| DenseSet<const ContextNode *> Visited; |
| for (auto &Entry : AllocationCallToContextNodeMap) { |
| auto *Node = Entry.second; |
| |
| mergeClones(Node, Visited, ContextIdToAllocationNode); |
| |
| // Make a copy so the recursive post order traversal that may create new |
| // clones doesn't mess up iteration. Note that the recursive traversal |
| // itself does not call mergeClones on any of these nodes, which are all |
| // (clones of) allocations. |
| auto Clones = Node->Clones; |
| for (auto *Clone : Clones) |
| mergeClones(Clone, Visited, ContextIdToAllocationNode); |
| } |
| |
| if (DumpCCG) { |
| dbgs() << "CCG after merging:\n"; |
| dbgs() << *this; |
| } |
| if (ExportToDot) |
| exportToDot("aftermerge"); |
| |
  if (VerifyCCG)
    check();
| } |
| |
| // Recursive helper for above mergeClones method. |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones( |
| ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) { |
| auto Inserted = Visited.insert(Node); |
| if (!Inserted.second) |
| return; |
| |
| // Make a copy since the recursive call may move a caller edge to a new |
| // callee, messing up the iterator. |
| auto CallerEdges = Node->CallerEdges; |
| for (auto CallerEdge : CallerEdges) { |
| // Skip any caller edge moved onto a different callee during recursion. |
| if (CallerEdge->Callee != Node) |
| continue; |
| mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode); |
| } |
| |
| // Merge for this node after we handle its callers. |
| mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode); |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones( |
| ContextNode *Node, DenseSet<const ContextNode *> &Visited, |
| DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) { |
| // Ignore Node if we moved all of its contexts to clones. |
| if (Node->emptyContextIds()) |
| return; |
| |
| // First identify groups of clones among Node's callee edges, by building |
| // a map from each callee base node to the associated callee edges from Node. |
| MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>> |
| OrigNodeToCloneEdges; |
| for (const auto &E : Node->CalleeEdges) { |
| auto *Callee = E->Callee; |
| if (!Callee->CloneOf && Callee->Clones.empty()) |
| continue; |
| ContextNode *Base = Callee->getOrigNode(); |
| OrigNodeToCloneEdges[Base].push_back(E); |
| } |
| |
| // Helper for callee edge sorting below. Return true if A's callee has fewer |
| // caller edges than B, or if A is a clone and B is not, or if A's first |
| // context id is smaller than B's. |
| auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A, |
| const std::shared_ptr<ContextEdge> &B) { |
| if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size()) |
| return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size(); |
| if (A->Callee->CloneOf && !B->Callee->CloneOf) |
| return true; |
| else if (!A->Callee->CloneOf && B->Callee->CloneOf) |
| return false; |
    // Use the first context id for each edge as a tie-breaker.
    return *A->ContextIds.begin() < *B->ContextIds.begin();
| }; |
| |
| // Process each set of callee clones called by Node, performing the needed |
| // merging. |
| for (auto Entry : OrigNodeToCloneEdges) { |
| // CalleeEdges is the set of edges from Node reaching callees that are |
| // mutual clones of each other. |
| auto &CalleeEdges = Entry.second; |
| auto NumCalleeClones = CalleeEdges.size(); |
| // A single edge means there is no merging needed. |
| if (NumCalleeClones == 1) |
| continue; |
    // Sort the CalleeEdges calling this group of clones in ascending order of
    // their caller edge counts, putting clones before the original non-clone
    // node in cases of a tie (matching the comparator above). This simplifies
    // finding an existing node to use as the merge node.
| llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan); |
| |
    // Find other callers of the given set of callee edges that can
    // share the same callee merge node. See the comments at this method
    // definition for details.
| DenseSet<ContextNode *> OtherCallersToShareMerge; |
| findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode, |
| OtherCallersToShareMerge); |
| |
    // Now do the actual merging. Identify an existing MergeNode or create a
    // new one during the first iteration. Move each callee over, along with
    // edges from other callers that we determined above can share the same
    // merge node.
| ContextNode *MergeNode = nullptr; |
| DenseMap<ContextNode *, unsigned> CallerToMoveCount; |
| for (auto CalleeEdge : CalleeEdges) { |
| auto *OrigCallee = CalleeEdge->Callee; |
| // If we don't have a MergeNode yet (only happens on the first iteration, |
| // as a new one will be created when we go to move the first callee edge |
| // over as needed), see if we can use this callee. |
| if (!MergeNode) { |
| // If there are no other callers, simply use this callee. |
| if (CalleeEdge->Callee->CallerEdges.size() == 1) { |
| MergeNode = OrigCallee; |
| NonNewMergedNodes++; |
| continue; |
| } |
| // Otherwise, if we have identified other caller nodes that can share |
| // the merge node with Node, see if all of OrigCallee's callers are |
| // going to share the same merge node. In that case we can use callee |
| // (since all of its callers would move to the new merge node). |
| if (!OtherCallersToShareMerge.empty()) { |
| bool MoveAllCallerEdges = true; |
| for (auto CalleeCallerE : OrigCallee->CallerEdges) { |
| if (CalleeCallerE == CalleeEdge) |
| continue; |
| if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) { |
| MoveAllCallerEdges = false; |
| break; |
| } |
| } |
| // If we are going to move all callers over, we can use this callee as |
| // the MergeNode. |
| if (MoveAllCallerEdges) { |
| MergeNode = OrigCallee; |
| NonNewMergedNodes++; |
| continue; |
| } |
| } |
| } |
| // Move this callee edge, creating a new merge node if necessary. |
| if (MergeNode) { |
| assert(MergeNode != OrigCallee); |
| moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode, |
| /*NewClone*/ false); |
| } else { |
| MergeNode = moveEdgeToNewCalleeClone(CalleeEdge); |
| NewMergedNodes++; |
| } |
| // Now move all identified edges from other callers over to the merge node |
| // as well. |
| if (!OtherCallersToShareMerge.empty()) { |
| // Make and iterate over a copy of OrigCallee's caller edges because |
| // some of these will be moved off of the OrigCallee and that would mess |
| // up the iteration from OrigCallee. |
| auto OrigCalleeCallerEdges = OrigCallee->CallerEdges; |
| for (auto &CalleeCallerE : OrigCalleeCallerEdges) { |
| if (CalleeCallerE == CalleeEdge) |
| continue; |
| if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) |
| continue; |
| CallerToMoveCount[CalleeCallerE->Caller]++; |
| moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode, |
| /*NewClone*/ false); |
| } |
| } |
| removeNoneTypeCalleeEdges(OrigCallee); |
| removeNoneTypeCalleeEdges(MergeNode); |
| } |
| } |
| } |
| |
| // Look for other nodes that have edges to the same set of callee |
| // clones as the current Node. Those can share the eventual merge node |
| // (reducing cloning and binary size overhead) iff: |
| // - they have edges to the same set of callee clones |
| // - each callee edge reaches a subset of the same allocations as Node's |
| // corresponding edge to the same callee clone. |
| // The second requirement is to ensure that we don't undo any of the |
| // necessary cloning to distinguish contexts with different allocation |
| // behavior. |
| // FIXME: This is somewhat conservative, as we really just need to ensure |
| // that they don't reach the same allocations as contexts on edges from Node |
| // going to any of the *other* callee clones being merged. However, that |
| // requires more tracking and checking to get right. |
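//
// For example (illustrative): if caller nodes N1 and N2 both have edges to
// the same callee clones B.1 and B.2, and each of N2's edges reaches a subset
// of the allocations reached by N1's corresponding edge, then N2 can share
// N1's merge node instead of forcing a separate merge clone per caller.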
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: |
| findOtherCallersToShareMerge( |
| ContextNode *Node, |
| std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges, |
| DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode, |
| DenseSet<ContextNode *> &OtherCallersToShareMerge) { |
| auto NumCalleeClones = CalleeEdges.size(); |
| // This map counts how many edges to the same callee clone exist for other |
| // caller nodes of each callee clone. |
| DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount; |
| // Counts the number of other caller nodes that have edges to all callee |
| // clones that don't violate the allocation context checking. |
| unsigned PossibleOtherCallerNodes = 0; |
| |
| // We only need to look at other Caller nodes if the first callee edge has |
| // multiple callers (recall they are sorted in ascending order above). |
| if (CalleeEdges[0]->Callee->CallerEdges.size() < 2) |
| return; |
| |
| // For each callee edge: |
| // - Collect the count of other caller nodes calling the same callees. |
| // - Collect the alloc nodes reached by contexts on each callee edge. |
| DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes; |
| for (auto CalleeEdge : CalleeEdges) { |
| assert(CalleeEdge->Callee->CallerEdges.size() > 1); |
| // For each other caller of the same callee, increment the count of |
| // edges reaching the same callee clone. |
| for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) { |
| if (CalleeCallerEdges->Caller == Node) { |
| assert(CalleeCallerEdges == CalleeEdge); |
| continue; |
| } |
| OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++; |
| // If this caller edge now reaches all of the same callee clones, |
| // increment the count of candidate other caller nodes. |
| if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] == |
| NumCalleeClones) |
| PossibleOtherCallerNodes++; |
| } |
| // Collect the alloc nodes reached by contexts on each callee edge, for |
| // later analysis. |
| for (auto Id : CalleeEdge->getContextIds()) { |
| auto *Alloc = ContextIdToAllocationNode.lookup(Id); |
| if (!Alloc) { |
| // FIXME: unclear why this happens occasionally, presumably |
| // imperfect graph updates possibly with recursion. |
| MissingAllocForContextId++; |
| continue; |
| } |
| CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc); |
| } |
| } |
| |
| // Now walk the callee edges again, and make sure that for each candidate |
| // caller node all of its edges to the callees reach the same allocs (or |
| // a subset) as those along the corresponding callee edge from Node. |
| for (auto CalleeEdge : CalleeEdges) { |
| assert(CalleeEdge->Callee->CallerEdges.size() > 1); |
| // Stop if we do not have any (more) candidate other caller nodes. |
| if (!PossibleOtherCallerNodes) |
| break; |
| auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()]; |
| // Check each other caller of this callee clone. |
| for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) { |
| // Not interested in the callee edge from Node itself. |
| if (CalleeCallerE == CalleeEdge) |
| continue; |
| // Skip any callers that didn't have callee edges to all the same |
| // callee clones. |
| if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] != |
| NumCalleeClones) |
| continue; |
      // Make sure that each context along the edge from the candidate caller
      // node reaches an allocation also reached by this callee edge from Node.
| for (auto Id : CalleeCallerE->getContextIds()) { |
| auto *Alloc = ContextIdToAllocationNode.lookup(Id); |
| if (!Alloc) |
| continue; |
| // If not, simply reset the map entry to 0 so caller is ignored, and |
| // reduce the count of candidate other caller nodes. |
| if (!CurCalleeAllocNodes.contains(Alloc)) { |
| OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0; |
| PossibleOtherCallerNodes--; |
| break; |
| } |
| } |
| } |
| } |
| |
| if (!PossibleOtherCallerNodes) |
| return; |
| |
| // Build the set of other caller nodes that can use the same callee merge |
| // node. |
| for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) { |
| if (Count != NumCalleeClones) |
| continue; |
| OtherCallersToShareMerge.insert(OtherCaller); |
| } |
| } |
| |
| // This method assigns cloned callsites to functions, cloning the functions as |
| // needed. The assignment is greedy and proceeds roughly as follows: |
| // |
| // For each function Func: |
| // For each call with graph Node having clones: |
| // Initialize ClonesWorklist to Node and its clones |
| // Initialize NodeCloneCount to 0 |
| // While ClonesWorklist is not empty: |
| // Clone = pop front ClonesWorklist |
| // NodeCloneCount++ |
| // If Func has been cloned less than NodeCloneCount times: |
| // If NodeCloneCount is 1: |
| // Assign Clone to original Func |
| // Continue |
| // Create a new function clone |
| // If other callers not assigned to call a function clone yet: |
| // Assign them to call new function clone |
| // Continue |
| // Assign any other caller calling the cloned version to new clone |
| // |
| // For each caller of Clone: |
| // If caller is assigned to call a specific function clone: |
| // If we cannot assign Clone to that function clone: |
| // Create new callsite Clone NewClone |
| // Add NewClone to ClonesWorklist |
| // Continue |
| // Assign Clone to existing caller's called function clone |
| // Else: |
| // If Clone not already assigned to a function clone: |
| // Assign to first function clone without assignment |
| // Assign caller to selected function clone |
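//
// For example (illustrative): if function foo contains callsite node N with a
// single clone N', then N is assigned to the original foo. When N' is
// processed there are fewer function clones (one) than callsite copies seen
// (two), so a clone of foo is created, N' is assigned to it, and the callers
// of N' are recorded as calling that new function clone.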
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { |
| bool Changed = false; |
| |
| mergeClones(); |
| |
| // Keep track of the assignment of nodes (callsites) to function clones they |
| // call. |
| DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; |
| |
| // Update caller node to call function version CalleeFunc, by recording the |
| // assignment in CallsiteToCalleeFuncCloneMap. |
| auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, |
| const FuncInfo &CalleeFunc) { |
| assert(Caller->hasCall()); |
| CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; |
| }; |
| |
  // Walk all functions for which we saw calls with memprof metadata, and
  // handle cloning for each such call.
| for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { |
| FuncInfo OrigFunc(Func); |
| // Map from each clone of OrigFunc to a map of remappings of each call of |
| // interest (from original uncloned call to the corresponding cloned call in |
| // that function clone). |
| std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; |
| for (auto &Call : CallsWithMetadata) { |
| ContextNode *Node = getNodeForInst(Call); |
| // Skip call if we do not have a node for it (all uses of its stack ids |
| // were either on inlined chains or pruned from the MIBs), or if we did |
| // not create any clones for it. |
| if (!Node || Node->Clones.empty()) |
| continue; |
| assert(Node->hasCall() && |
| "Not having a call should have prevented cloning"); |
| |
| // Track the assignment of function clones to clones of the current |
| // callsite Node being handled. |
| std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; |
| |
| // Assign callsite version CallsiteClone to function version FuncClone, |
| // and also assign (possibly cloned) Call to CallsiteClone. |
| auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, |
| CallInfo &Call, |
| ContextNode *CallsiteClone, |
| bool IsAlloc) { |
| // Record the clone of callsite node assigned to this function clone. |
| FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; |
| |
| assert(FuncClonesToCallMap.count(FuncClone)); |
| std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; |
| CallInfo CallClone(Call); |
| if (auto It = CallMap.find(Call); It != CallMap.end()) |
| CallClone = It->second; |
| CallsiteClone->setCall(CallClone); |
| // Need to do the same for all matching calls. |
| for (auto &MatchingCall : Node->MatchingCalls) { |
| CallInfo CallClone(MatchingCall); |
| if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) |
| CallClone = It->second; |
| // Updates the call in the list. |
| MatchingCall = CallClone; |
| } |
| }; |
| |
| // Keep track of the clones of callsite Node that need to be assigned to |
| // function clones. This list may be expanded in the loop body below if we |
| // find additional cloning is required. |
| std::deque<ContextNode *> ClonesWorklist; |
| // Ignore original Node if we moved all of its contexts to clones. |
| if (!Node->emptyContextIds()) |
| ClonesWorklist.push_back(Node); |
| llvm::append_range(ClonesWorklist, Node->Clones); |
| |
| // Now walk through all of the clones of this callsite Node that we need, |
| // and determine the assignment to a corresponding clone of the current |
| // function (creating new function clones as needed). |
| unsigned NodeCloneCount = 0; |
| while (!ClonesWorklist.empty()) { |
| ContextNode *Clone = ClonesWorklist.front(); |
| ClonesWorklist.pop_front(); |
| NodeCloneCount++; |
| if (VerifyNodes) |
| checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
| |
| // Need to create a new function clone if we have more callsite clones |
| // than existing function clones, which would have been assigned to an |
| // earlier clone in the list (we assign callsite clones to function |
| // clones greedily). |
| if (FuncClonesToCallMap.size() < NodeCloneCount) { |
| // If this is the first callsite copy, assign to original function. |
| if (NodeCloneCount == 1) { |
| // Since FuncClonesToCallMap is empty in this case, no clones have |
| // been created for this function yet, and no callers should have |
| // been assigned a function clone for this callee node yet. |
| assert(llvm::none_of( |
| Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
| return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
| })); |
| // Initialize with empty call map, assign Clone to original function |
| // and its callers, and skip to the next clone. |
| FuncClonesToCallMap[OrigFunc] = {}; |
| AssignCallsiteCloneToFuncClone( |
| OrigFunc, Call, Clone, |
| AllocationCallToContextNodeMap.count(Call)); |
| for (auto &CE : Clone->CallerEdges) { |
| // Ignore any caller that does not have a recorded callsite Call. |
| if (!CE->Caller->hasCall()) |
| continue; |
| RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); |
| } |
| continue; |
| } |
| |
| // First locate which copy of OrigFunc to clone again. If a caller |
| // of this callsite clone was already assigned to call a particular |
| // function clone, we need to redirect all of those callers to the |
| // new function clone, and update their other callees within this |
| // function. |
| FuncInfo PreviousAssignedFuncClone; |
| auto EI = llvm::find_if( |
| Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { |
| return CallsiteToCalleeFuncCloneMap.count(E->Caller); |
| }); |
| bool CallerAssignedToCloneOfFunc = false; |
| if (EI != Clone->CallerEdges.end()) { |
| const std::shared_ptr<ContextEdge> &Edge = *EI; |
| PreviousAssignedFuncClone = |
| CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
| CallerAssignedToCloneOfFunc = true; |
| } |
| |
| // Clone function and save it along with the CallInfo map created |
| // during cloning in the FuncClonesToCallMap. |
| std::map<CallInfo, CallInfo> NewCallMap; |
| unsigned CloneNo = FuncClonesToCallMap.size(); |
| assert(CloneNo > 0 && "Clone 0 is the original function, which " |
| "should already exist in the map"); |
| FuncInfo NewFuncClone = cloneFunctionForCallsite( |
| OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); |
| FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); |
| FunctionClonesAnalysis++; |
| Changed = true; |
| |
| // If no caller callsites were already assigned to a clone of this |
| // function, we can simply assign this clone to the new func clone |
| // and update all callers to it, then skip to the next clone. |
| if (!CallerAssignedToCloneOfFunc) { |
| AssignCallsiteCloneToFuncClone( |
| NewFuncClone, Call, Clone, |
| AllocationCallToContextNodeMap.count(Call)); |
| for (auto &CE : Clone->CallerEdges) { |
| // Ignore any caller that does not have a recorded callsite Call. |
| if (!CE->Caller->hasCall()) |
| continue; |
| RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
| } |
| continue; |
| } |
| |
| // We may need to do additional node cloning in this case. |
| // Reset the CallsiteToCalleeFuncCloneMap entry for any callers |
| // that were previously assigned to call PreviousAssignedFuncClone, |
| // to record that they now call NewFuncClone. |
| // The none type edge removal may remove some of this Clone's caller |
| // edges, if it is reached via another of its caller's callees. |
| // Iterate over a copy and skip any that were removed. |
| auto CallerEdges = Clone->CallerEdges; |
| for (auto CE : CallerEdges) { |
| // Skip any that have been removed on an earlier iteration. |
| if (CE->isRemoved()) { |
| assert(!is_contained(Clone->CallerEdges, CE)); |
| continue; |
| } |
| assert(CE); |
| // Ignore any caller that does not have a recorded callsite Call. |
| if (!CE->Caller->hasCall()) |
| continue; |
| |
| if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || |
| // We subsequently fall through to later handling that |
| // will perform any additional cloning required for |
| // callers that were calling other function clones. |
| CallsiteToCalleeFuncCloneMap[CE->Caller] != |
| PreviousAssignedFuncClone) |
| continue; |
| |
| RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); |
| |
| // If we are cloning a function that was already assigned to some |
| // callers, then essentially we are creating new callsite clones |
| // of the other callsites in that function that are reached by those |
| // callers. Clone the other callees of the current callsite's caller |
| // that were already assigned to PreviousAssignedFuncClone |
| // accordingly. This is important since we subsequently update the |
| // calls from the nodes in the graph and their assignments to callee |
| // functions recorded in CallsiteToCalleeFuncCloneMap. |
| // The none type edge removal may remove some of this caller's |
| // callee edges, if it is reached via another of its callees. |
| // Iterate over a copy and skip any that were removed. |
| auto CalleeEdges = CE->Caller->CalleeEdges; |
| for (auto CalleeEdge : CalleeEdges) { |
| // Skip any that have been removed on an earlier iteration when |
| // cleaning up newly None type callee edges. |
| if (CalleeEdge->isRemoved()) { |
| assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge)); |
| continue; |
| } |
| assert(CalleeEdge); |
| ContextNode *Callee = CalleeEdge->Callee; |
              // Skip the current callsite; we are looking for other
              // callsites that Caller calls, as well as any that do not have
              // a recorded callsite Call.
| if (Callee == Clone || !Callee->hasCall()) |
| continue; |
| // Skip direct recursive calls. We don't need/want to clone the |
| // caller node again, and this loop will not behave as expected if |
| // we tried. |
| if (Callee == CalleeEdge->Caller) |
| continue; |
| ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); |
| removeNoneTypeCalleeEdges(NewClone); |
| // Moving the edge may have resulted in some none type |
| // callee edges on the original Callee. |
| removeNoneTypeCalleeEdges(Callee); |
| assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
| // If the Callee node was already assigned to call a specific |
| // function version, make sure its new clone is assigned to call |
| // that same function clone. |
| if (CallsiteToCalleeFuncCloneMap.count(Callee)) |
| RecordCalleeFuncOfCallsite( |
| NewClone, CallsiteToCalleeFuncCloneMap[Callee]); |
| // Update NewClone with the new Call clone of this callsite's Call |
| // created for the new function clone created earlier. |
| // Recall that we have already ensured when building the graph |
| // that each caller can only call callsites within the same |
| // function, so we are guaranteed that Callee Call is in the |
| // current OrigFunc. |
| // CallMap is set up as indexed by original Call at clone 0. |
| CallInfo OrigCall(Callee->getOrigNode()->Call); |
| OrigCall.setCloneNo(0); |
| std::map<CallInfo, CallInfo> &CallMap = |
| FuncClonesToCallMap[NewFuncClone]; |
| assert(CallMap.count(OrigCall)); |
| CallInfo NewCall(CallMap[OrigCall]); |
| assert(NewCall); |
| NewClone->setCall(NewCall); |
| // Need to do the same for all matching calls. |
| for (auto &MatchingCall : NewClone->MatchingCalls) { |
| CallInfo OrigMatchingCall(MatchingCall); |
| OrigMatchingCall.setCloneNo(0); |
| assert(CallMap.count(OrigMatchingCall)); |
| CallInfo NewCall(CallMap[OrigMatchingCall]); |
| assert(NewCall); |
| // Updates the call in the list. |
| MatchingCall = NewCall; |
| } |
| } |
| } |
| // Fall through to handling below to perform the recording of the |
| // function for this callsite clone. This enables handling of cases |
| // where the callers were assigned to different clones of a function. |
| } |
| |
        // See if we can use an existing function clone. Walk through
        // all caller edges to see if any have already been assigned to
        // a clone of this callsite's function. If we can use it, do so. If
        // not, because that function clone is already assigned to a different
        // clone of this callsite, then we need to clone again.
| // Basically, this checking is needed to handle the case where different |
| // caller functions/callsites may need versions of this function |
| // containing different mixes of callsite clones across the different |
| // callsites within the function. If that happens, we need to create |
| // additional function clones to handle the various combinations. |
| // |
| // Keep track of any new clones of this callsite created by the |
| // following loop, as well as any existing clone that we decided to |
| // assign this clone to. |
| std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; |
| FuncInfo FuncCloneAssignedToCurCallsiteClone; |
| // Iterate over a copy of Clone's caller edges, since we may need to |
| // remove edges in the moveEdgeTo* methods, and this simplifies the |
| // handling and makes it less error-prone. |
| auto CloneCallerEdges = Clone->CallerEdges; |
| for (auto &Edge : CloneCallerEdges) { |
| // Skip removed edges (due to direct recursive edges updated when |
| // updating callee edges when moving an edge and subsequently |
| // removed by call to removeNoneTypeCalleeEdges on the Clone). |
| if (Edge->isRemoved()) |
| continue; |
| // Ignore any caller that does not have a recorded callsite Call. |
| if (!Edge->Caller->hasCall()) |
| continue; |
| // If this caller already assigned to call a version of OrigFunc, need |
| // to ensure we can assign this callsite clone to that function clone. |
| if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { |
| FuncInfo FuncCloneCalledByCaller = |
| CallsiteToCalleeFuncCloneMap[Edge->Caller]; |
| // First we need to confirm that this function clone is available |
| // for use by this callsite node clone. |
| // |
| // While FuncCloneToCurNodeCloneMap is built only for this Node and |
| // its callsite clones, one of those callsite clones X could have |
| // been assigned to the same function clone called by Edge's caller |
| // - if Edge's caller calls another callsite within Node's original |
| // function, and that callsite has another caller reaching clone X. |
| // We need to clone Node again in this case. |
| if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && |
| FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != |
| Clone) || |
| // Detect when we have multiple callers of this callsite that |
| // have already been assigned to specific, and different, clones |
| // of OrigFunc (due to other unrelated callsites in Func they |
| // reach via call contexts). Is this Clone of callsite Node |
| // assigned to a different clone of OrigFunc? If so, clone Node |
| // again. |
| (FuncCloneAssignedToCurCallsiteClone && |
| FuncCloneAssignedToCurCallsiteClone != |
| FuncCloneCalledByCaller)) { |
| // We need to use a different newly created callsite clone, in |
| // order to assign it to another new function clone on a |
| // subsequent iteration over the Clones array (adjusted below). |
| // Note we specifically do not reset the |
| // CallsiteToCalleeFuncCloneMap entry for this caller, so that |
| // when this new clone is processed later we know which version of |
| // the function to copy (so that other callsite clones we have |
| // assigned to that function clone are properly cloned over). See |
| // comments in the function cloning handling earlier. |
| |
| // Check if we already have cloned this callsite again while |
| // walking through caller edges, for a caller calling the same |
| // function clone. If so, we can move this edge to that new clone |
| // rather than creating yet another new clone. |
| if (FuncCloneToNewCallsiteCloneMap.count( |
| FuncCloneCalledByCaller)) { |
| ContextNode *NewClone = |
| FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; |
| moveEdgeToExistingCalleeClone(Edge, NewClone); |
| // Cleanup any none type edges cloned over. |
| removeNoneTypeCalleeEdges(NewClone); |
| } else { |
| // Create a new callsite clone. |
| ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge); |
| removeNoneTypeCalleeEdges(NewClone); |
| FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = |
| NewClone; |
| // Add to list of clones and process later. |
| ClonesWorklist.push_back(NewClone); |
| assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); |
| } |
| // Moving the caller edge may have resulted in some none type |
| // callee edges. |
| removeNoneTypeCalleeEdges(Clone); |
| // We will handle the newly created callsite clone in a subsequent |
| // iteration over this Node's Clones. |
| continue; |
| } |
| |
| // Otherwise, we can use the function clone already assigned to this |
| // caller. |
| if (!FuncCloneAssignedToCurCallsiteClone) { |
| FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; |
| // Assign Clone to FuncCloneCalledByCaller |
| AssignCallsiteCloneToFuncClone( |
| FuncCloneCalledByCaller, Call, Clone, |
| AllocationCallToContextNodeMap.count(Call)); |
| } else |
| // Don't need to do anything - callsite is already calling this |
| // function clone. |
| assert(FuncCloneAssignedToCurCallsiteClone == |
| FuncCloneCalledByCaller); |
| |
| } else { |
| // We have not already assigned this caller to a version of |
| // OrigFunc. Do the assignment now. |
| |
| // First check if we have already assigned this callsite clone to a |
| // clone of OrigFunc for another caller during this iteration over |
| // its caller edges. |
| if (!FuncCloneAssignedToCurCallsiteClone) { |
| // Find first function in FuncClonesToCallMap without an assigned |
| // clone of this callsite Node. We should always have one |
| // available at this point due to the earlier cloning when the |
| // FuncClonesToCallMap size was smaller than the clone number. |
| for (auto &CF : FuncClonesToCallMap) { |
| if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { |
| FuncCloneAssignedToCurCallsiteClone = CF.first; |
| break; |
| } |
| } |
| assert(FuncCloneAssignedToCurCallsiteClone); |
| // Assign Clone to FuncCloneAssignedToCurCallsiteClone |
| AssignCallsiteCloneToFuncClone( |
| FuncCloneAssignedToCurCallsiteClone, Call, Clone, |
| AllocationCallToContextNodeMap.count(Call)); |
| } else |
| assert(FuncCloneToCurNodeCloneMap |
| [FuncCloneAssignedToCurCallsiteClone] == Clone); |
| // Update callers to record function version called. |
| RecordCalleeFuncOfCallsite(Edge->Caller, |
| FuncCloneAssignedToCurCallsiteClone); |
| } |
| } |
| } |
| if (VerifyCCG) { |
| checkNode<DerivedCCG, FuncTy, CallTy>(Node); |
| for (const auto &PE : Node->CalleeEdges) |
| checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
| for (const auto &CE : Node->CallerEdges) |
| checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
| for (auto *Clone : Node->Clones) { |
| checkNode<DerivedCCG, FuncTy, CallTy>(Clone); |
| for (const auto &PE : Clone->CalleeEdges) |
| checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); |
| for (const auto &CE : Clone->CallerEdges) |
| checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); |
| } |
| } |
| } |
| } |
| |
| uint8_t BothTypes = |
| (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; |
| |
| auto UpdateCalls = [&](ContextNode *Node, |
| DenseSet<const ContextNode *> &Visited, |
| auto &&UpdateCalls) { |
| auto Inserted = Visited.insert(Node); |
| if (!Inserted.second) |
| return; |
| |
| for (auto *Clone : Node->Clones) |
| UpdateCalls(Clone, Visited, UpdateCalls); |
| |
| for (auto &Edge : Node->CallerEdges) |
| UpdateCalls(Edge->Caller, Visited, UpdateCalls); |
| |
| // Skip if either no call to update, or if we ended up with no context ids |
| // (we moved all edges onto other clones). |
| if (!Node->hasCall() || Node->emptyContextIds()) |
| return; |
| |
| if (Node->IsAllocation) { |
| auto AT = allocTypeToUse(Node->AllocTypes); |
| // If the allocation type is ambiguous, and more aggressive hinting |
| // has been enabled via the MinClonedColdBytePercent flag, see if this |
| // allocation should be hinted cold anyway because its fraction cold bytes |
| // allocated is at least the given threshold. |
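      // E.g. (illustrative) with MinClonedColdBytePercent == 90, an ambiguous
      // allocation whose contexts cover 1000 total bytes, 950 of them cold
      // (95%), is hinted cold since 950 * 100 >= 1000 * 90 below.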
| if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 && |
| !ContextIdToContextSizeInfos.empty()) { |
| uint64_t TotalCold = 0; |
| uint64_t Total = 0; |
| for (auto Id : Node->getContextIds()) { |
| auto TypeI = ContextIdToAllocationType.find(Id); |
| assert(TypeI != ContextIdToAllocationType.end()); |
| auto CSI = ContextIdToContextSizeInfos.find(Id); |
| if (CSI != ContextIdToContextSizeInfos.end()) { |
| for (auto &Info : CSI->second) { |
| Total += Info.TotalSize; |
| if (TypeI->second == AllocationType::Cold) |
| TotalCold += Info.TotalSize; |
| } |
| } |
| } |
| if (TotalCold * 100 >= Total * MinClonedColdBytePercent) |
| AT = AllocationType::Cold; |
| } |
| updateAllocationCall(Node->Call, AT); |
| assert(Node->MatchingCalls.empty()); |
| return; |
| } |
| |
| if (!CallsiteToCalleeFuncCloneMap.count(Node)) |
| return; |
| |
| auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; |
| updateCall(Node->Call, CalleeFunc); |
| // Update all the matching calls as well. |
| for (auto &Call : Node->MatchingCalls) |
| updateCall(Call, CalleeFunc); |
| }; |
| |
| // Performs DFS traversal starting from allocation nodes to update calls to |
| // reflect cloning decisions recorded earlier. For regular LTO this will |
| // update the actual calls in the IR to call the appropriate function clone |
| // (and add attributes to allocation calls), whereas for ThinLTO the decisions |
| // are recorded in the summary entries. |
| DenseSet<const ContextNode *> Visited; |
| for (auto &Entry : AllocationCallToContextNodeMap) |
| UpdateCalls(Entry.second, Visited, UpdateCalls); |
| |
| return Changed; |
| } |
| |
| static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( |
| Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, |
| std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
| &FuncToAliasMap) { |
| // The first "clone" is the original copy, we should only call this if we |
| // needed to create new clones. |
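  // E.g. for NumClones == 3, this creates clones named F.memprof.1 and
  // F.memprof.2 via getMemProfFuncName below (clone 0 remains the original F).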
| assert(NumClones > 1); |
| SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
| VMaps.reserve(NumClones - 1); |
| FunctionsClonedThinBackend++; |
| for (unsigned I = 1; I < NumClones; I++) { |
| VMaps.emplace_back(std::make_unique<ValueToValueMapTy>()); |
| auto *NewF = CloneFunction(&F, *VMaps.back()); |
| FunctionClonesThinBackend++; |
| // Strip memprof and callsite metadata from clone as they are no longer |
| // needed. |
| for (auto &BB : *NewF) { |
| for (auto &Inst : BB) { |
| Inst.setMetadata(LLVMContext::MD_memprof, nullptr); |
| Inst.setMetadata(LLVMContext::MD_callsite, nullptr); |
| } |
| } |
| std::string Name = getMemProfFuncName(F.getName(), I); |
| auto *PrevF = M.getFunction(Name); |
| if (PrevF) { |
      // We might have created this when adjusting a callsite in another
      // function. It should be a declaration.
| assert(PrevF->isDeclaration()); |
| NewF->takeName(PrevF); |
| PrevF->replaceAllUsesWith(NewF); |
| PrevF->eraseFromParent(); |
| } else |
| NewF->setName(Name); |
| ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) |
| << "created clone " << ore::NV("NewFunction", NewF)); |
| |
| // Now handle aliases to this function, and clone those as well. |
| if (!FuncToAliasMap.count(&F)) |
| continue; |
| for (auto *A : FuncToAliasMap[&F]) { |
| std::string Name = getMemProfFuncName(A->getName(), I); |
| auto *PrevA = M.getNamedAlias(Name); |
| auto *NewA = GlobalAlias::create(A->getValueType(), |
| A->getType()->getPointerAddressSpace(), |
| A->getLinkage(), Name, NewF); |
| NewA->copyAttributesFrom(A); |
| if (PrevA) { |
        // We might have created this when adjusting a callsite in another
        // function. It should be a declaration.
| assert(PrevA->isDeclaration()); |
| NewA->takeName(PrevA); |
| PrevA->replaceAllUsesWith(NewA); |
| PrevA->eraseFromParent(); |
| } |
| } |
| } |
| return VMaps; |
| } |
| |
| // Locate the summary for F. This is complicated by the fact that it might |
| // have been internalized or promoted. |
| static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, |
| const ModuleSummaryIndex *ImportSummary, |
| const Function *CallingFunc = nullptr) { |
| // FIXME: Ideally we would retain the original GUID in some fashion on the |
| // function (e.g. as metadata), but for now do our best to locate the |
| // summary without that information. |
| ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID()); |
| if (!TheFnVI) |
    // See if the function was internalized, by checking the index directly
    // with the original name (this avoids the name adjustment done by
    // getGUID() for internal symbols).
| TheFnVI = ImportSummary->getValueInfo( |
| GlobalValue::getGUIDAssumingExternalLinkage(F.getName())); |
| if (TheFnVI) |
| return TheFnVI; |
| // Now query with the original name before any promotion was performed. |
| StringRef OrigName = |
| ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName()); |
| // When this pass is enabled, we always add thinlto_src_file provenance |
| // metadata to imported function definitions, which allows us to recreate the |
| // original internal symbol's GUID. |
| auto SrcFileMD = F.getMetadata("thinlto_src_file"); |
| // If this is a call to an imported/promoted local for which we didn't import |
| // the definition, the metadata will not exist on the declaration. However, |
| // since we are doing this early, before any inlining in the LTO backend, we |
| // can simply look at the metadata on the calling function which must have |
| // been from the same module if F was an internal symbol originally. |
| if (!SrcFileMD && F.isDeclaration()) { |
    // We would only call this for a declaration at a direct callsite, in
    // which case the caller would have provided the calling function pointer.
| assert(CallingFunc); |
| SrcFileMD = CallingFunc->getMetadata("thinlto_src_file"); |
| // If this is a promoted local (OrigName != F.getName()), since this is a |
| // declaration, it must be imported from a different module and therefore we |
| // should always find the metadata on its calling function. Any call to a |
| // promoted local that came from this module should still be a definition. |
| assert(SrcFileMD || OrigName == F.getName()); |
| } |
| StringRef SrcFile = M.getSourceFileName(); |
| if (SrcFileMD) |
| SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString(); |
| std::string OrigId = GlobalValue::getGlobalIdentifier( |
| OrigName, GlobalValue::InternalLinkage, SrcFile); |
| TheFnVI = ImportSummary->getValueInfo( |
| GlobalValue::getGUIDAssumingExternalLinkage(OrigId)); |
| // Internal func in original module may have gotten a numbered suffix if we |
| // imported an external function with the same name. This happens |
| // automatically during IR linking for naming conflicts. It would have to |
| // still be internal in that case (otherwise it would have been renamed on |
| // promotion in which case we wouldn't have a naming conflict). |
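  // E.g. (illustrative) an internal function originally named "foo" that was
  // renamed to "foo.1" during IR linking is re-queried below as "foo" with
  // this module's source file name.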
| if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() && |
| F.getName().contains('.')) { |
| OrigName = F.getName().rsplit('.').first; |
| OrigId = GlobalValue::getGlobalIdentifier( |
| OrigName, GlobalValue::InternalLinkage, SrcFile); |
| TheFnVI = ImportSummary->getValueInfo( |
| GlobalValue::getGUIDAssumingExternalLinkage(OrigId)); |
| } |
| // The only way we may not have a VI is if this is a declaration created for |
| // an imported reference. For distributed ThinLTO we may not have a VI for |
| // such declarations in the distributed summary. |
| assert(TheFnVI || F.isDeclaration()); |
| return TheFnVI; |
| } |
| |
| bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo( |
| Module &M) { |
| ICallAnalysis = std::make_unique<ICallPromotionAnalysis>(); |
| Symtab = std::make_unique<InstrProfSymtab>(); |
  // Don't add canonical names, to avoid mapping multiple functions to the
  // same symtab entry when they have the same root name with "." suffixes
  // stripped.
| // If we pick the wrong one then this could lead to incorrect ICP and calling |
| // a memprof clone that we don't actually create (resulting in linker unsats). |
| // What this means is that the GUID of the function (or its PGOFuncName |
| // metadata) *must* match that in the VP metadata to allow promotion. |
| // In practice this should not be a limitation, since local functions should |
| // have PGOFuncName metadata and global function names shouldn't need any |
| // special handling (they should not get the ".llvm.*" suffix that the |
| // canonicalization handling is attempting to strip). |
| if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) { |
| std::string SymtabFailure = toString(std::move(E)); |
| M.getContext().emitError("Failed to create symtab: " + SymtabFailure); |
| return false; |
| } |
| return true; |
| } |
| |
| #ifndef NDEBUG |
| // Sanity check that the MIB stack ids match between the summary and |
| // instruction metadata. |
| static void checkAllocContextIds( |
| const AllocInfo &AllocNode, const MDNode *MemProfMD, |
| const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext, |
| const ModuleSummaryIndex *ImportSummary) { |
| auto MIBIter = AllocNode.MIBs.begin(); |
| for (auto &MDOp : MemProfMD->operands()) { |
| assert(MIBIter != AllocNode.MIBs.end()); |
| auto StackIdIndexIter = MIBIter->StackIdIndices.begin(); |
| auto *MIBMD = cast<const MDNode>(MDOp); |
| MDNode *StackMDNode = getMIBStackNode(MIBMD); |
| assert(StackMDNode); |
| CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); |
| auto ContextIterBegin = |
| StackContext.beginAfterSharedPrefix(CallsiteContext); |
| // Skip the checking on the first iteration. |
| uint64_t LastStackContextId = |
| (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1 |
| : 0; |
| for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end(); |
| ++ContextIter) { |
| // If this is a direct recursion, simply skip the duplicate |
| // entries, to be consistent with how the summary ids were |
| // generated during ModuleSummaryAnalysis. |
| if (LastStackContextId == *ContextIter) |
| continue; |
| LastStackContextId = *ContextIter; |
| assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); |
| assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == |
| *ContextIter); |
| StackIdIndexIter++; |
| } |
| MIBIter++; |
| } |
| } |
| #endif |
| |
| bool MemProfContextDisambiguation::applyImport(Module &M) { |
| assert(ImportSummary); |
| bool Changed = false; |
| |
| // We also need to clone any aliases that reference cloned functions, because |
| // the modified callsites may invoke via the alias. Keep track of the aliases |
| // for each function. |
| std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> |
| FuncToAliasMap; |
| for (auto &A : M.aliases()) { |
| auto *Aliasee = A.getAliaseeObject(); |
| if (auto *F = dyn_cast<Function>(Aliasee)) |
| FuncToAliasMap[F].insert(&A); |
| } |
| |
| if (!initializeIndirectCallPromotionInfo(M)) |
| return false; |
| |
| for (auto &F : M) { |
| if (F.isDeclaration() || isMemProfClone(F)) |
| continue; |
| |
| OptimizationRemarkEmitter ORE(&F); |
| |
| SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
| bool ClonesCreated = false; |
| unsigned NumClonesCreated = 0; |
| auto CloneFuncIfNeeded = [&](unsigned NumClones) { |
      // We should at least have version 0, which is the original copy.
| assert(NumClones > 0); |
      // If only one copy is needed, use the original.
| if (NumClones == 1) |
| return; |
| // If we already performed cloning of this function, confirm that the |
| // requested number of clones matches (the thin link should ensure the |
| // number of clones for each constituent callsite is consistent within |
| // each function), before returning. |
| if (ClonesCreated) { |
| assert(NumClonesCreated == NumClones); |
| return; |
| } |
| VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); |
| // The first "clone" is the original copy, which doesn't have a VMap. |
| assert(VMaps.size() == NumClones - 1); |
| Changed = true; |
| ClonesCreated = true; |
| NumClonesCreated = NumClones; |
| }; |
| |
| auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, |
| Function *CalledFunction) { |
| // Perform cloning if not yet done. |
| CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); |
| |
| assert(!isMemProfClone(*CalledFunction)); |
| |
| // Update the calls per the summary info. |
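      // E.g. (illustrative) StackNode.Clones == {0, 2} means copy 0 keeps
      // calling the original callee, while function clone 1 is redirected to
      // call clone 2 of the callee.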
| // Save orig name since it gets updated in the first iteration |
| // below. |
| auto CalleeOrigName = CalledFunction->getName(); |
| for (unsigned J = 0; J < StackNode.Clones.size(); J++) { |
| // Do nothing if this version calls the original version of its |
| // callee. |
| if (!StackNode.Clones[J]) |
| continue; |
| auto NewF = M.getOrInsertFunction( |
| getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]), |
| CalledFunction->getFunctionType()); |
| CallBase *CBClone; |
| // Copy 0 is the original function. |
| if (!J) |
| CBClone = CB; |
| else |
| CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); |
| CBClone->setCalledFunction(NewF); |
| ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) |
| << ore::NV("Call", CBClone) << " in clone " |
| << ore::NV("Caller", CBClone->getFunction()) |
| << " assigned to call function clone " |
| << ore::NV("Callee", NewF.getCallee())); |
| } |
| }; |
| |
| // Locate the summary for F. |
| ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); |
| // If not found, this could be an imported local (see comment in |
| // findValueInfoForFunc). Skip for now as it will be cloned in its original |
| // module (where it would have been promoted to global scope so should |
| // satisfy any reference in this module). |
| if (!TheFnVI) |
| continue; |
| |
| auto *GVSummary = |
| ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier()); |
| if (!GVSummary) { |
      // Must have been imported, use the summary which matches the definition
      // (might be multiple if this was a linkonce_odr).
| auto SrcModuleMD = F.getMetadata("thinlto_src_module"); |
| assert(SrcModuleMD && |
| "enable-import-metadata is needed to emit thinlto_src_module"); |
| StringRef SrcModule = |
| dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString(); |
| for (auto &GVS : TheFnVI.getSummaryList()) { |
| if (GVS->modulePath() == SrcModule) { |
| GVSummary = GVS.get(); |
| break; |
| } |
| } |
| assert(GVSummary && GVSummary->modulePath() == SrcModule); |
| } |
| |
| // If this was an imported alias skip it as we won't have the function |
| // summary, and it should be cloned in the original module. |
| if (isa<AliasSummary>(GVSummary)) |
| continue; |
| |
| auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject()); |
| |
| if (FS->allocs().empty() && FS->callsites().empty()) |
| continue; |
| |
| auto SI = FS->callsites().begin(); |
| auto AI = FS->allocs().begin(); |
| |
| // To handle callsite infos synthesized for tail calls which have missing |
| // frames in the profiled context, map callee VI to the synthesized callsite |
| // info. |
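    // E.g. (illustrative) if callee G was only reachable in the profiled
    // context via a tail call with a missing frame, a CallsiteInfo with empty
    // stack ids was synthesized for G and is located below via G's ValueInfo.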
| DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; |
| // Iterate the callsites for this function in reverse, since we place all |
| // those synthesized for tail calls at the end. |
| for (auto CallsiteIt = FS->callsites().rbegin(); |
| CallsiteIt != FS->callsites().rend(); CallsiteIt++) { |
| auto &Callsite = *CallsiteIt; |
| // Stop as soon as we see a non-synthesized callsite info (see comment |
| // above loop). All the entries added for discovered tail calls have empty |
| // stack ids. |
| if (!Callsite.StackIdIndices.empty()) |
| break; |
| MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite}); |
| } |
| |
| // Keeps track of needed ICP for the function. |
| SmallVector<ICallAnalysisData> ICallAnalysisInfo; |
| |
| // Assume for now that the instructions are in the exact same order |
| // as when the summary was created, but confirm this is correct by |
| // matching the stack ids. |
| for (auto &BB : F) { |
| for (auto &I : BB) { |
| auto *CB = dyn_cast<CallBase>(&I); |
| // Same handling as when creating module summary. |
| if (!mayHaveMemprofSummary(CB)) |
| continue; |
| |
| auto *CalledValue = CB->getCalledOperand(); |
| auto *CalledFunction = CB->getCalledFunction(); |
| if (CalledValue && !CalledFunction) { |
| CalledValue = CalledValue->stripPointerCasts(); |
| // Stripping pointer casts can reveal a called function. |
| CalledFunction = dyn_cast<Function>(CalledValue); |
| } |
| // Check if this is an alias to a function. If so, get the |
| // called aliasee for the checks below. |
| if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { |
| assert(!CalledFunction && |
| "Expected null called function in callsite for alias"); |
| CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); |
| } |
| |
| CallStack<MDNode, MDNode::op_iterator> CallsiteContext( |
| I.getMetadata(LLVMContext::MD_callsite)); |
| auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); |
| |
| // Include allocs that were already assigned a memprof function |
| // attribute in the statistics. |
| if (CB->getAttributes().hasFnAttr("memprof")) { |
| assert(!MemProfMD); |
| CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" |
| ? AllocTypeColdThinBackend++ |
| : AllocTypeNotColdThinBackend++; |
| OrigAllocsThinBackend++; |
| AllocVersionsThinBackend++; |
| if (!MaxAllocVersionsThinBackend) |
| MaxAllocVersionsThinBackend = 1; |
| continue; |
| } |
| |
| if (MemProfMD) { |
| // Consult the next alloc node. |
| assert(AI != FS->allocs().end()); |
| auto &AllocNode = *(AI++); |
| |
| #ifndef NDEBUG |
| checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext, |
| ImportSummary); |
| #endif |
| |
| // Perform cloning if not yet done. |
| CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); |
| |
| OrigAllocsThinBackend++; |
| AllocVersionsThinBackend += AllocNode.Versions.size(); |
| if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) |
| MaxAllocVersionsThinBackend = AllocNode.Versions.size(); |
| |
            // If there is only one version, that means we didn't end up
            // considering this function for cloning, and in that case the
            // alloc will still be none type or should have gotten the default
            // NotCold. Skip such allocs, but only after calling the clone
            // helper, since it does some sanity checks that confirm we
            // haven't yet decided that we need cloning. We might have a
            // single version that is cold due to the MinClonedColdBytePercent
            // heuristic; make sure we don't skip in that case.
| if (AllocNode.Versions.size() == 1 && |
| (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) { |
| assert((AllocationType)AllocNode.Versions[0] == |
| AllocationType::NotCold || |
| (AllocationType)AllocNode.Versions[0] == |
| AllocationType::None); |
| UnclonableAllocsThinBackend++; |
| continue; |
| } |
| |
| // All versions should have a singular allocation type. |
| assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { |
| return Type == ((uint8_t)AllocationType::NotCold | |
| (uint8_t)AllocationType::Cold); |
| })); |
| |
| // Update the allocation types per the summary info. |
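            // E.g. (illustrative) Versions == {NotCold, Cold} marks copy 0
            // with "memprof"="notcold" and clone 1 with "memprof"="cold".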
| for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { |
| // Ignore any that didn't get an assigned allocation type. |
| if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) |
| continue; |
| AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; |
| AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ |
| : AllocTypeNotColdThinBackend++; |
| std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); |
| auto A = llvm::Attribute::get(F.getContext(), "memprof", |
| AllocTypeString); |
| CallBase *CBClone; |
| // Copy 0 is the original function. |
| if (!J) |
| CBClone = CB; |
| else |
| // Since VMaps are only created for new clones, we index with |
| // clone J-1 (J==0 is the original clone and does not have a VMaps |
| // entry). |
| CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); |
| CBClone->addFnAttr(A); |
| ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) |
| << ore::NV("AllocationCall", CBClone) << " in clone " |
| << ore::NV("Caller", CBClone->getFunction()) |
| << " marked with memprof allocation attribute " |
| << ore::NV("Attribute", AllocTypeString)); |
| } |
| } else if (!CallsiteContext.empty()) { |
| if (!CalledFunction) { |
| #ifndef NDEBUG |
| // We should have skipped inline assembly calls. |
| auto *CI = dyn_cast<CallInst>(CB); |
| assert(!CI || !CI->isInlineAsm()); |
| #endif |
| // We should have skipped direct calls via a Constant. |
| assert(CalledValue && !isa<Constant>(CalledValue)); |
| |
            // This is an indirect call; see if we have profile information
            // and whether any clones were recorded for the profiled targets
            // (for which we synthesized CallsiteInfo summary records when
            // building the index).
| auto NumClones = |
| recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo); |
| |
            // Perform cloning if not yet done. This is done here in case we
            // don't need to do ICP, but still need to clone this function
            // because it is the target of other cloned calls.
| if (NumClones) |
| CloneFuncIfNeeded(NumClones); |
| } |
| |
| else { |
| // Consult the next callsite node. |
| assert(SI != FS->callsites().end()); |
| auto &StackNode = *(SI++); |
| |
| #ifndef NDEBUG |
| // Sanity check that the stack ids match between the summary and |
| // instruction metadata. |
| auto StackIdIndexIter = StackNode.StackIdIndices.begin(); |
| for (auto StackId : CallsiteContext) { |
| assert(StackIdIndexIter != StackNode.StackIdIndices.end()); |
| assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == |
| StackId); |
| StackIdIndexIter++; |
| } |
| #endif |
| |
| CloneCallsite(StackNode, CB, CalledFunction); |
| } |
| } else if (CB->isTailCall() && CalledFunction) { |
| // Locate the synthesized callsite info for the callee VI, if any was |
| // created, and use that for cloning. |
| ValueInfo CalleeVI = |
| findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F); |
          if (CalleeVI) {
            auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
            if (Callsite != MapTailCallCalleeVIToCallsite.end())
              CloneCallsite(Callsite->second, CB, CalledFunction);
          }
| } |
| } |
| } |
| |
| // Now do any promotion required for cloning. |
| performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE); |
| } |
| |
| // We skip some of the functions and instructions above, so remove all the |
| // metadata in a single sweep here. |
| for (auto &F : M) { |
| // We can skip memprof clones because createFunctionClones already strips |
| // the metadata from the newly created clones. |
| if (F.isDeclaration() || isMemProfClone(F)) |
| continue; |
| for (auto &BB : F) { |
| for (auto &I : BB) { |
| if (!isa<CallBase>(I)) |
| continue; |
| I.setMetadata(LLVMContext::MD_memprof, nullptr); |
| I.setMetadata(LLVMContext::MD_callsite, nullptr); |
| } |
| } |
| } |
| |
| return Changed; |
| } |
| |
| unsigned MemProfContextDisambiguation::recordICPInfo( |
| CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites, |
| ArrayRef<CallsiteInfo>::iterator &SI, |
| SmallVector<ICallAnalysisData> &ICallAnalysisInfo) { |
| // First see if we have profile information for this indirect call. |
| uint32_t NumCandidates; |
| uint64_t TotalCount; |
| auto CandidateProfileData = |
| ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount, |
| NumCandidates); |
| if (CandidateProfileData.empty()) |
| return 0; |
| |
| // Iterate through all of the candidate profiled targets along with the |
| // CallsiteInfo summary records synthesized for them when building the index, |
| // and see if any are cloned and/or refer to clones. |
| bool ICPNeeded = false; |
| unsigned NumClones = 0; |
| size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI); |
| for (const auto &Candidate : CandidateProfileData) { |
| #ifndef NDEBUG |
| auto CalleeValueInfo = |
| #endif |
| ImportSummary->getValueInfo(Candidate.Value); |
    // We might not have a ValueInfo if this is a distributed ThinLTO backend
    // that decided not to import that function.
    assert(SI != AllCallsites.end());
    assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
| auto &StackNode = *(SI++); |
| // See if any of the clones of the indirect callsite for this |
| // profiled target should call a cloned version of the profiled |
| // target. We only need to do the ICP here if so. |
| ICPNeeded |= llvm::any_of(StackNode.Clones, |
| [](unsigned CloneNo) { return CloneNo != 0; }); |
| // Every callsite in the same function should have been cloned the same |
| // number of times. |
| assert(!NumClones || NumClones == StackNode.Clones.size()); |
| NumClones = StackNode.Clones.size(); |
| } |
| if (!ICPNeeded) |
| return NumClones; |
| // Save information for ICP, which is performed later to avoid messing up the |
| // current function traversal. |
| ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates, |
| TotalCount, CallsiteInfoStartIndex}); |
| return NumClones; |
| } |
| |
| void MemProfContextDisambiguation::performICP( |
| Module &M, ArrayRef<CallsiteInfo> AllCallsites, |
| ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, |
| ArrayRef<ICallAnalysisData> ICallAnalysisInfo, |
| OptimizationRemarkEmitter &ORE) { |
  // Now do any promotion required for cloning. Specifically, for each
  // recorded ICP candidate (which was only recorded because at least one
  // clone of the callsite should call a cloned target), we perform ICP
  // (speculative devirtualization) for each clone of the callsite, and update
  // its callee to the appropriate clone. Note that the ICP compares against
  // the original version of the target, which is what is in the vtable.
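  // Conceptually (hypothetical IR sketch), promotion rewrites each clone's
  // indirect call
  //   call void %fptr(...)
  // into a guarded direct call that we then redirect to the chosen clone:
  //   %cmp = icmp eq ptr %fptr, @target
  //   br i1 %cmp, label %direct, label %indirect
  // direct:
  //   call void @target.memprof.1(...)  ; via setCalledFunction() below
  // indirect:
  //   call void %fptr(...)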
| for (auto &Info : ICallAnalysisInfo) { |
| auto *CB = Info.CB; |
| auto CallsiteIndex = Info.CallsiteInfoStartIndex; |
| auto TotalCount = Info.TotalCount; |
| unsigned NumPromoted = 0; |
| unsigned NumClones = 0; |
| |
| for (auto &Candidate : Info.CandidateProfileData) { |
| auto &StackNode = AllCallsites[CallsiteIndex++]; |
| |
| // All calls in the same function must have the same number of clones. |
| assert(!NumClones || NumClones == StackNode.Clones.size()); |
| NumClones = StackNode.Clones.size(); |
| |
| // See if the target is in the module. If it wasn't imported, it is |
| // possible that this profile could have been collected on a different |
| // target (or version of the code), and we need to be conservative |
| // (similar to what is done in the ICP pass). |
| Function *TargetFunction = Symtab->getFunction(Candidate.Value); |
| if (TargetFunction == nullptr || |
| // Any ThinLTO global dead symbol removal should have already |
| // occurred, so it should be safe to promote when the target is a |
| // declaration. |
| // TODO: Remove internal option once more fully tested. |
| (MemProfRequireDefinitionForPromotion && |
| TargetFunction->isDeclaration())) { |
| ORE.emit([&]() { |
| return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB) |
| << "Memprof cannot promote indirect call: target with md5sum " |
| << ore::NV("target md5sum", Candidate.Value) << " not found"; |
| }); |
| // FIXME: See if we can use the new declaration importing support to |
| // at least get the declarations imported for this case. Hot indirect |
| // targets should have been imported normally, however. |
| continue; |
| } |
| |
      // Check whether it is legal to promote this call.
| const char *Reason = nullptr; |
| if (!isLegalToPromote(*CB, TargetFunction, &Reason)) { |
| ORE.emit([&]() { |
| return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB) |
| << "Memprof cannot promote indirect call to " |
| << ore::NV("TargetFunction", TargetFunction) |
| << " with count of " << ore::NV("TotalCount", TotalCount) |
| << ": " << Reason; |
| }); |
| continue; |
| } |
| |
| assert(!isMemProfClone(*TargetFunction)); |
| |
| // Handle each call clone, applying ICP so that each clone directly |
| // calls the specified callee clone, guarded by the appropriate ICP |
| // check. |
| CallBase *CBClone = CB; |
| for (unsigned J = 0; J < NumClones; J++) { |
| // Copy 0 is the original function. |
| if (J > 0) |
| CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); |
| // We do the promotion using the original name, so that the comparison |
| // is against the name in the vtable. Then just below, change the new |
| // direct call to call the cloned function. |
| auto &DirectCall = |
| pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count, |
| TotalCount, isSamplePGO, &ORE); |
| auto *TargetToUse = TargetFunction; |
        // By default call the original; only when this clone of the callsite
        // should call a cloned version of its callee do we redirect it below.
| if (StackNode.Clones[J]) { |
| TargetToUse = |
| cast<Function>(M.getOrInsertFunction( |
| getMemProfFuncName(TargetFunction->getName(), |
| StackNode.Clones[J]), |
| TargetFunction->getFunctionType()) |
| .getCallee()); |
| } |
| DirectCall.setCalledFunction(TargetToUse); |
| ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) |
| << ore::NV("Call", CBClone) << " in clone " |
| << ore::NV("Caller", CBClone->getFunction()) |
| << " promoted and assigned to call function clone " |
| << ore::NV("Callee", TargetToUse)); |
| } |
| |
      // Update TotalCount (all clones should get the same count above).
| TotalCount -= Candidate.Count; |
| NumPromoted++; |
| } |
    // Adjust the MD_prof metadata for all clones, now that we have the new
    // TotalCount and the number promoted.
| CallBase *CBClone = CB; |
| for (unsigned J = 0; J < NumClones; J++) { |
| // Copy 0 is the original function. |
| if (J > 0) |
| CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); |
| // First delete the old one. |
| CBClone->setMetadata(LLVMContext::MD_prof, nullptr); |
      // If all candidates were promoted, we no longer need the MD_prof
      // metadata. Otherwise, update it with the remaining un-promoted
      // records.
| if (TotalCount != 0) |
| annotateValueSite( |
| M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted), |
| TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates); |
| } |
| } |
| } |
| |
| template <typename DerivedCCG, typename FuncTy, typename CallTy> |
| bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() { |
| if (DumpCCG) { |
| dbgs() << "CCG before cloning:\n"; |
| dbgs() << *this; |
| } |
| if (ExportToDot) |
| exportToDot("postbuild"); |
| |
  if (VerifyCCG)
    check();
| |
| identifyClones(); |
| |
  if (VerifyCCG)
    check();
| |
| if (DumpCCG) { |
| dbgs() << "CCG after cloning:\n"; |
| dbgs() << *this; |
| } |
| if (ExportToDot) |
| exportToDot("cloned"); |
| |
| bool Changed = assignFunctions(); |
| |
| if (DumpCCG) { |
| dbgs() << "CCG after assigning function clones:\n"; |
| dbgs() << *this; |
| } |
| if (ExportToDot) |
| exportToDot("clonefuncassign"); |
| |
| if (MemProfReportHintedSizes) |
| printTotalSizes(errs()); |
| |
| return Changed; |
| } |
| |
| bool MemProfContextDisambiguation::processModule( |
| Module &M, |
| llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { |
| |
| // If we have an import summary, then the cloning decisions were made during |
| // the thin link on the index. Apply them and return. |
| if (ImportSummary) |
| return applyImport(M); |
| |
| // TODO: If/when other types of memprof cloning are enabled beyond just for |
| // hot and cold, we will need to change this to individually control the |
| // AllocationType passed to addStackNodesForMIB during CCG construction. |
  // Note that we specifically check this after applying imports above, so
  // that the option doesn't need to be passed to the distributed ThinLTO
  // backend clang processes, which won't necessarily have visibility into the
  // linker dependencies. Instead the information is communicated from the LTO
  // link to the backends via the combined summary index.
| if (!SupportsHotColdNew) |
| return false; |
| |
| ModuleCallsiteContextGraph CCG(M, OREGetter); |
| return CCG.process(); |
| } |
| |
| MemProfContextDisambiguation::MemProfContextDisambiguation( |
| const ModuleSummaryIndex *Summary, bool isSamplePGO) |
| : ImportSummary(Summary), isSamplePGO(isSamplePGO) { |
| // Check the dot graph printing options once here, to make sure we have valid |
| // and expected combinations. |
| if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences()) |
| llvm::report_fatal_error( |
| "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id"); |
| if (DotGraphScope == DotScope::Context && |
| !ContextIdForDot.getNumOccurrences()) |
| llvm::report_fatal_error( |
| "-memprof-dot-scope=context requires -memprof-dot-context-id"); |
| if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() && |
| ContextIdForDot.getNumOccurrences()) |
| llvm::report_fatal_error( |
| "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and " |
| "-memprof-dot-context-id"); |
| if (ImportSummary) { |
| // The MemProfImportSummary should only be used for testing ThinLTO |
| // distributed backend handling via opt, in which case we don't have a |
| // summary from the pass pipeline. |
| assert(MemProfImportSummary.empty()); |
| return; |
| } |
| if (MemProfImportSummary.empty()) |
| return; |
| |
| auto ReadSummaryFile = |
| errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary)); |
| if (!ReadSummaryFile) { |
| logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(), |
| "Error loading file '" + MemProfImportSummary + |
| "': "); |
| return; |
| } |
| auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile); |
| if (!ImportSummaryForTestingOrErr) { |
| logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(), |
| "Error parsing file '" + MemProfImportSummary + |
| "': "); |
| return; |
| } |
| ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr); |
| ImportSummary = ImportSummaryForTesting.get(); |
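  // With the testing summary loaded, a hypothetical invocation (file names
  // illustrative, flag spelling assumed) would be:
  //   opt -passes=memprof-context-disambiguation \
  //       -memprof-import-summary=main.thinlto.bc main.bc -o out.bc
  // where the parsed index stands in for the summary normally supplied by the
  // ThinLTO pass pipeline.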
| } |
| |
| PreservedAnalyses MemProfContextDisambiguation::run(Module &M, |
| ModuleAnalysisManager &AM) { |
| auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); |
| auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { |
| return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); |
| }; |
| if (!processModule(M, OREGetter)) |
| return PreservedAnalyses::all(); |
| return PreservedAnalyses::none(); |
| } |
| |
| void MemProfContextDisambiguation::run( |
| ModuleSummaryIndex &Index, |
| llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> |
| isPrevailing) { |
| // TODO: If/when other types of memprof cloning are enabled beyond just for |
| // hot and cold, we will need to change this to individually control the |
| // AllocationType passed to addStackNodesForMIB during CCG construction. |
| // The index was set from the option, so these should be in sync. |
| assert(Index.withSupportsHotColdNew() == SupportsHotColdNew); |
| if (!SupportsHotColdNew) |
| return; |
| |
| IndexCallsiteContextGraph CCG(Index, isPrevailing); |
| CCG.process(); |
| } |