//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InterleavedRange.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Instrumentation.h"
#include <deque>
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;
#define DEBUG_TYPE "memprof-context-disambiguation"
STATISTIC(FunctionClonesAnalysis,
"Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
"Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
"Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
"cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
"during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
"Number of not cold static allocations (possibly cloned) during "
"ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
"(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
"Number of original (not cloned) allocations with memprof profiles "
"during ThinLTO backend");
STATISTIC(
AllocVersionsThinBackend,
"Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
"Maximum number of allocation versions created for an original "
"allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
"Number of unclonable ambigous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
"Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
"Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
"Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
"Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
"Number of profiled callees found via multiple tail call chains");
STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
STATISTIC(NonNewMergedNodes, "Number of non-new nodes used during merging");
STATISTIC(MissingAllocForContextId,
"Number of missing alloc nodes for context ids");
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
cl::value_desc("filename"),
cl::desc("Specify the path prefix of the MemProf dot files."));
static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
cl::Hidden,
cl::desc("Export graph to dot files."));
// How much of the graph to export to dot.
enum DotScope {
All, // The full CCG graph.
Alloc, // Only contexts for the specified allocation.
Context, // Only the specified context.
};
static cl::opt<DotScope> DotGraphScope(
"memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
cl::Hidden, cl::init(DotScope::All),
cl::values(
clEnumValN(DotScope::All, "all", "Export full callsite graph"),
clEnumValN(DotScope::Alloc, "alloc",
"Export only nodes with contexts feeding given "
"-memprof-dot-alloc-id"),
clEnumValN(DotScope::Context, "context",
"Export only nodes with given -memprof-dot-context-id")));
static cl::opt<unsigned>
AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
"or to highlight if -memprof-dot-scope=all"));
static cl::opt<unsigned> ContextIdForDot(
"memprof-dot-context-id", cl::init(0), cl::Hidden,
cl::desc("Id of context to export if -memprof-dot-scope=context or to "
"highlight otherwise"));
static cl::opt<bool>
DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
cl::desc("Dump CallingContextGraph to stdout after each stage."));
static cl::opt<bool>
VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
cl::desc("Perform verification checks on CallingContextGraph."));
static cl::opt<bool>
VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
cl::desc("Perform frequent verification checks on nodes."));
static cl::opt<std::string> MemProfImportSummary(
"memprof-import-summary",
cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
cl::Hidden);
static cl::opt<unsigned>
TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
cl::Hidden,
cl::desc("Max depth to recursively search for missing "
"frames through tail calls."));
// Optionally enable cloning of callsites involved in recursive cycles.
static cl::opt<bool> AllowRecursiveCallsites(
"memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
cl::desc("Allow cloning of callsites involved in recursive cycles"));
static cl::opt<bool> CloneRecursiveContexts(
"memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
cl::desc("Allow cloning of contexts through recursive cycles"));
// Generally this is needed for correct assignment of allocation clones to
// function clones. However, allow it to be disabled for debugging while the
// functionality is new and being tested more widely.
static cl::opt<bool>
MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
cl::desc("Merge clones before assigning functions"));
// When disabled, try to detect and prevent cloning of recursive contexts.
// This is only necessary until we support cloning through recursive cycles.
// Leave on by default for now: disabling adds a little compile time overhead,
// and leaving it on doesn't affect correctness; it will just inflate the cold
// hinted bytes reported a bit when -memprof-report-hinted-sizes is enabled.
static cl::opt<bool> AllowRecursiveContexts(
"memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
cl::desc("Allow cloning of contexts having recursive cycles"));
namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
"supports-hot-cold-new", cl::init(false), cl::Hidden,
cl::desc("Linking with hot/cold operator new interfaces"));
static cl::opt<bool> MemProfRequireDefinitionForPromotion(
"memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
cl::desc(
"Require target function definition when promoting indirect calls"));
} // namespace llvm
extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;
namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, reflecting any inlining performed on those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
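///
/// For example (an illustrative sketch, not from any particular profile):
/// given two contexts reaching the same allocation, main -> foo -> alloc
/// (cold, context id 1) and main2 -> foo -> alloc (notcold, context id 2),
/// the allocation node has a caller edge to the node for foo carrying ids
/// {1, 2}, and foo has caller edges to the nodes for main and main2 carrying
/// {1} and {2} respectively. Cloning foo along one of those caller edges
/// makes the allocation's behavior unambiguous in each resulting clone.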
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
CallsiteContextGraph() = default;
CallsiteContextGraph(const CallsiteContextGraph &) = default;
CallsiteContextGraph(CallsiteContextGraph &&) = default;
/// Main entry point to perform analysis and transformations on graph.
bool process();
/// Perform cloning on the graph necessary to uniquely identify the allocation
/// behavior of an allocation based on its context.
void identifyClones();
/// Assign callsite clones to functions, cloning functions as needed to
/// accommodate the combinations of their callsite clones reached by callers.
/// For regular LTO this clones functions and callsites in the IR, but for
/// ThinLTO the cloning decisions are noted in the summaries and later applied
/// in applyImport.
bool assignFunctions();
void dump() const;
void print(raw_ostream &OS) const;
void printTotalSizes(raw_ostream &OS) const;
friend raw_ostream &operator<<(raw_ostream &OS,
const CallsiteContextGraph &CCG) {
CCG.print(OS);
return OS;
}
friend struct GraphTraits<
const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
friend struct DOTGraphTraits<
const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
void exportToDot(std::string Label) const;
/// Represents a function clone via FuncTy pointer and clone number pair.
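/// E.g., (F, 0) denotes the original (uncloned) function F, and (F, 1) its
/// first clone.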
struct FuncInfo final
: public std::pair<FuncTy *, unsigned /*Clone number*/> {
using Base = std::pair<FuncTy *, unsigned>;
FuncInfo(const Base &B) : Base(B) {}
FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
explicit operator bool() const { return this->first != nullptr; }
FuncTy *func() const { return this->first; }
unsigned cloneNo() const { return this->second; }
};
/// Represents a callsite clone via CallTy and clone number pair.
struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
using Base = std::pair<CallTy, unsigned>;
CallInfo(const Base &B) : Base(B) {}
CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
: Base(Call, CloneNo) {}
explicit operator bool() const { return (bool)this->first; }
CallTy call() const { return this->first; }
unsigned cloneNo() const { return this->second; }
void setCloneNo(unsigned N) { this->second = N; }
void print(raw_ostream &OS) const {
if (!operator bool()) {
assert(!cloneNo());
OS << "null Call";
return;
}
call()->print(OS);
OS << "\t(clone " << cloneNo() << ")";
}
void dump() const {
print(dbgs());
dbgs() << "\n";
}
friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
Call.print(OS);
return OS;
}
};
struct ContextEdge;
/// Node in the Callsite Context Graph
struct ContextNode {
// Keep this for now since in the IR case, where we have an Instruction*, it
// is not as immediately discoverable. Used for printing richer information
// when dumping the graph.
bool IsAllocation;
// Keeps track of when the Call was reset to null because there was
// recursion.
bool Recursive = false;
// This will be formed by ORing together the AllocationType enum values
// for contexts including this node.
uint8_t AllocTypes = 0;
// The corresponding allocation or interior call. This is the primary call
// for which we have created this node.
CallInfo Call;
// List of other calls that can be treated the same as the primary call
// through cloning. I.e. located in the same function and have the same
// (possibly pruned) stack ids. They will be updated the same way as the
// primary call when assigning to function clones.
SmallVector<CallInfo, 0> MatchingCalls;
// For alloc nodes this is a unique id assigned when constructed, and for
// callsite stack nodes it is the original stack id when the node is
// constructed from the memprof MIB metadata on the alloc nodes. Note that
// this is only used when matching callsite metadata onto the stack nodes
// created when processing the allocation memprof MIBs, and for labeling
// nodes in the dot graph. Therefore we don't bother to assign a value for
// clones.
uint64_t OrigStackOrAllocId = 0;
// Edges to all callees in the profiled call stacks.
// TODO: Should this be a map (from Callee node) for more efficient lookup?
std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
// Edges to all callers in the profiled call stacks.
// TODO: Should this be a map (from Caller node) for more efficient lookup?
std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
// Returns true if we need to also look at the caller edges (in addition to
// the callee edges) when determining the node context ids and allocation
// type.
bool useCallerEdgesForContextInfo() const {
// Typically if the callee edges are empty either the caller edges are
// also empty, or this is an allocation (leaf node). However, if we are
// allowing recursive callsites and contexts this will be violated for
// incompletely cloned recursive cycles.
assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
(AllowRecursiveCallsites && AllowRecursiveContexts));
// When cloning for a recursive context, during cloning we might be in the
// midst of cloning for a recurrence and have moved context ids off of a
// caller edge onto the clone but not yet off of the incoming caller
// (back) edge. If we don't look at those we miss the fact that this node
// still has context ids of interest.
return IsAllocation || CloneRecursiveContexts;
}
// Compute the context ids for this node from the union of its edge context
// ids.
DenseSet<uint32_t> getContextIds() const {
unsigned Count = 0;
// Compute the number of ids for reserve below. In general we only need to
// look at one set of edges, typically the callee edges, since other than
// allocations and in some cases during recursion cloning, all the context
// ids on the callers should also flow out via callee edges.
for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
Count += Edge->getContextIds().size();
DenseSet<uint32_t> ContextIds;
ContextIds.reserve(Count);
auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
CalleeEdges, useCallerEdgesForContextInfo()
? CallerEdges
: std::vector<std::shared_ptr<ContextEdge>>());
for (const auto &Edge : Edges)
ContextIds.insert_range(Edge->getContextIds());
return ContextIds;
}
// Compute the allocation type for this node from the OR of its edge
// allocation types.
uint8_t computeAllocType() const {
uint8_t BothTypes =
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
uint8_t AllocType = (uint8_t)AllocationType::None;
auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
CalleeEdges, useCallerEdgesForContextInfo()
? CallerEdges
: std::vector<std::shared_ptr<ContextEdge>>());
for (const auto &Edge : Edges) {
AllocType |= Edge->AllocTypes;
// Bail early if alloc type reached both, no further refinement.
if (AllocType == BothTypes)
return AllocType;
}
return AllocType;
}
// Returns true if the context ids for this node are empty, i.e. if the
// context ids of all of its edges are also empty.
bool emptyContextIds() const {
auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
CalleeEdges, useCallerEdgesForContextInfo()
? CallerEdges
: std::vector<std::shared_ptr<ContextEdge>>());
for (const auto &Edge : Edges) {
if (!Edge->getContextIds().empty())
return false;
}
return true;
}
// List of clones of this ContextNode, initially empty.
std::vector<ContextNode *> Clones;
// If a clone, points to the original uncloned node.
ContextNode *CloneOf = nullptr;
ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
ContextNode(bool IsAllocation, CallInfo C)
: IsAllocation(IsAllocation), Call(C) {}
void addClone(ContextNode *Clone) {
if (CloneOf) {
CloneOf->Clones.push_back(Clone);
Clone->CloneOf = CloneOf;
} else {
Clones.push_back(Clone);
assert(!Clone->CloneOf);
Clone->CloneOf = this;
}
}
ContextNode *getOrigNode() {
if (!CloneOf)
return this;
return CloneOf;
}
void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
unsigned int ContextId);
ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
void eraseCalleeEdge(const ContextEdge *Edge);
void eraseCallerEdge(const ContextEdge *Edge);
void setCall(CallInfo C) { Call = C; }
bool hasCall() const { return (bool)Call.call(); }
void printCall(raw_ostream &OS) const { Call.print(OS); }
// True if this node was effectively removed from the graph, in which case
// it should have an allocation type of None and empty context ids.
bool isRemoved() const {
// Typically if the callee edges are empty either the caller edges are
// also empty, or this is an allocation (leaf node). However, if we are
// allowing recursive callsites and contexts this will be violated for
// incompletely cloned recursive cycles.
assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
(AllocTypes == (uint8_t)AllocationType::None) ==
emptyContextIds());
return AllocTypes == (uint8_t)AllocationType::None;
}
void dump() const;
void print(raw_ostream &OS) const;
friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
Node.print(OS);
return OS;
}
};
/// Edge in the Callsite Context Graph from a ContextNode N to a caller or
/// callee.
struct ContextEdge {
ContextNode *Callee;
ContextNode *Caller;
// This will be formed by ORing together the AllocationType enum values
// for contexts including this edge.
uint8_t AllocTypes = 0;
// Set just before initiating cloning when cloning of recursive contexts is
// enabled. Used to defer cloning of backedges until we have done cloning of
// the callee node for non-backedge caller edges. This exposes cloning
// opportunities through the backedge of the cycle.
// TODO: Note that this is not updated during cloning, and it is unclear
// whether that would be needed.
bool IsBackedge = false;
// The set of IDs for contexts including this edge.
DenseSet<uint32_t> ContextIds;
ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
DenseSet<uint32_t> ContextIds)
: Callee(Callee), Caller(Caller), AllocTypes(AllocType),
ContextIds(std::move(ContextIds)) {}
DenseSet<uint32_t> &getContextIds() { return ContextIds; }
// Helper to clear the fields of this edge when we are removing it from the
// graph.
inline void clear() {
ContextIds.clear();
AllocTypes = (uint8_t)AllocationType::None;
Caller = nullptr;
Callee = nullptr;
}
// Check if edge was removed from the graph. This is useful while iterating
// over a copy of edge lists when performing operations that mutate the
// graph in ways that might remove one of the edges.
inline bool isRemoved() const {
if (Callee || Caller)
return false;
// Any edges that have been removed from the graph but are still in a
// shared_ptr somewhere should have all fields null'ed out by clear()
// above.
assert(AllocTypes == (uint8_t)AllocationType::None);
assert(ContextIds.empty());
return true;
}
void dump() const;
void print(raw_ostream &OS) const;
friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
Edge.print(OS);
return OS;
}
};
/// Helpers to remove edges that have allocation type None (due to not
/// carrying any context ids) after transformations.
void removeNoneTypeCalleeEdges(ContextNode *Node);
void removeNoneTypeCallerEdges(ContextNode *Node);
void
recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
DenseSet<const ContextNode *> &Visited);
protected:
/// Get a list of nodes corresponding to the stack ids in the given callsite
/// context.
template <class NodeT, class IteratorT>
std::vector<uint64_t>
getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
/// Adds nodes for the given allocation and any stack ids on its memprof MIB
/// metadata (or summary).
ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
/// Adds nodes for the given MIB stack ids.
template <class NodeT, class IteratorT>
void addStackNodesForMIB(ContextNode *AllocNode,
CallStack<NodeT, IteratorT> &StackContext,
CallStack<NodeT, IteratorT> &CallsiteContext,
AllocationType AllocType,
ArrayRef<ContextTotalSize> ContextSizeInfo);
/// Matches all callsite metadata (or summary) to the nodes created for
/// allocation memprof MIB metadata, synthesizing new nodes to reflect any
/// inlining performed on those callsite instructions.
void updateStackNodes();
/// Update graph to conservatively handle any callsite stack nodes that target
/// multiple different callee functions.
void handleCallsitesWithMultipleTargets();
/// Mark backedges via the standard DFS based backedge algorithm.
void markBackedges();
/// Merge clones generated during cloning for different allocations but that
/// are called by the same caller node, to ensure proper function assignment.
void mergeClones();
// Try to partition calls on the given node (already placed into the AllCalls
// array) by callee function, creating new copies of Node as needed to hold
// calls with different callees, and moving the callee edges appropriately.
// Returns true if partitioning was successful.
bool partitionCallsByCallee(
ContextNode *Node, ArrayRef<CallInfo> AllCalls,
std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
/// Save lists of calls with MemProf metadata in each function, for faster
/// iteration.
MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
/// Map from callsite node to the enclosing caller function.
std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
// When exporting to dot, and an allocation id is specified, contains the
// context ids on that allocation.
DenseSet<uint32_t> DotAllocContextIds;
private:
using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
// Structure to keep track of information for each call as we are matching
// non-allocation callsites onto context nodes created from the allocation
// call metadata / summary contexts.
struct CallContextInfo {
// The callsite we're trying to match.
CallTy Call;
// The callsite's stack ids that have a context node in the graph.
std::vector<uint64_t> StackIds;
// The function containing this callsite.
const FuncTy *Func;
// Initially empty, if needed this will be updated to contain the context
// ids for use in a new context node created for this callsite.
DenseSet<uint32_t> ContextIds;
};
/// Helper to remove edge from graph, updating edge iterator if it is provided
/// (in which case CalleeIter indicates which edge list is being iterated).
/// This will also perform the necessary clearing of the ContextEdge members
/// to enable later checking if the edge has been removed (since we may have
/// other copies of the shared_ptr in existence, and in fact rely on this to
/// enable removal while iterating over a copy of a node's edge list).
void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
bool CalleeIter = true);
/// Assigns the given Node to calls at or inlined into the location with
/// the Node's stack id, after post order traversing and processing its
/// caller nodes. Uses the call information recorded in the given
/// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
/// as needed. Called by updateStackNodes which sets up the given
/// StackIdToMatchingCalls map.
void assignStackNodesPostOrder(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
/// Duplicates the given set of context ids, updating the provided
/// map from each original id with the newly generated context ids,
/// and returning the new duplicated id set.
DenseSet<uint32_t> duplicateContextIds(
const DenseSet<uint32_t> &StackSequenceContextIds,
DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
/// Propagates all duplicated context ids across the graph.
void propagateDuplicateContextIds(
const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
/// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
/// else to its callers. Also updates OrigNode's edges to remove any context
/// ids moved to the newly created edge.
void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
bool TowardsCallee,
DenseSet<uint32_t> RemainingContextIds);
/// Get the stack id corresponding to the given Id or Index (for IR this will
/// return itself, for a summary index this will return the id recorded in the
/// index for that stack id index value).
uint64_t getStackId(uint64_t IdOrIndex) const {
return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
}
/// Returns true if the given call targets the callee of the given edge, or if
/// we were able to identify the call chain through intermediate tail calls.
/// In the latter case new context nodes are added to the graph for the
/// identified tail calls, and their synthesized nodes are added to
/// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
/// the updated edges and to prepare it for an increment in the caller.
bool
calleesMatch(CallTy Call, EdgeIter &EI,
MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
// Return the callee function of the given call, or nullptr if it can't be
// determined.
const FuncTy *getCalleeFunc(CallTy Call) {
return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
}
/// Returns true if the given call targets the given function, or if we were
/// able to identify the call chain through intermediate tail calls (in which
/// case FoundCalleeChain will be populated).
bool calleeMatchesFunc(
CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
Call, Func, CallerFunc, FoundCalleeChain);
}
/// Returns true if both call instructions have the same callee.
bool sameCallee(CallTy Call1, CallTy Call2) {
return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
}
/// Get a list of nodes corresponding to the stack ids in the given
/// callsite's context.
std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
Call);
}
/// Get the last stack id in the context for callsite.
uint64_t getLastStackId(CallTy Call) {
return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
}
/// Update the allocation call to record type of allocated memory.
void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
}
/// Get the AllocationType assigned to the given allocation instruction clone.
AllocationType getAllocationCallType(const CallInfo &Call) const {
return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
}
/// Update non-allocation call to invoke (possibly cloned) function
/// CalleeFunc.
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
}
/// Clone the given function for the given callsite, recording the mapping of
/// all of the function's tracked calls to their new versions in the CallMap.
/// Assigns new clones to clone number CloneNo.
FuncInfo cloneFunctionForCallsite(
FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
}
/// Gets a label to use in the dot graph for the given call clone in the given
/// function.
std::string getLabel(const FuncTy *Func, const CallTy Call,
unsigned CloneNo) const {
return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
}
// Create and return a new ContextNode.
ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
CallInfo C = CallInfo()) {
NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
auto *NewNode = NodeOwner.back().get();
if (F)
NodeToCallingFunc[NewNode] = F;
return NewNode;
}
/// Helpers to find the node corresponding to the given call or stackid.
ContextNode *getNodeForInst(const CallInfo &C);
ContextNode *getNodeForAlloc(const CallInfo &C);
ContextNode *getNodeForStackId(uint64_t StackId);
/// Computes the alloc type corresponding to the given context ids, by
/// unioning their recorded alloc types.
uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
/// Returns the allocation type of the intersection of the contexts of two
/// nodes (based on their provided context id sets), optimized for the case
/// when Node1Ids is smaller than Node2Ids.
uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
const DenseSet<uint32_t> &Node2Ids) const;
/// Returns the allocation type of the intersection of the contexts of two
/// nodes (based on their provided context id sets).
uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
const DenseSet<uint32_t> &Node2Ids) const;
/// Create a clone of Edge's callee and move Edge to that new callee node,
/// performing the necessary context id and allocation type updates.
/// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
/// moved to an edge to the new callee.
ContextNode *
moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
DenseSet<uint32_t> ContextIdsToMove = {});
/// Change the callee of Edge to existing callee clone NewCallee, performing
/// the necessary context id and allocation type updates.
/// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
/// moved to an edge to the new callee.
void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
ContextNode *NewCallee,
bool NewClone = false,
DenseSet<uint32_t> ContextIdsToMove = {});
/// Change the caller of the edge at the given callee edge iterator to be
/// NewCaller, performing the necessary context id and allocation type
/// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
/// a simplified version of it as we always move the given edge and all of its
/// context ids.
void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
ContextNode *NewCaller);
/// Recursive helper for marking backedges via DFS.
void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseSet<const ContextNode *> &CurrentStack);
/// Recursive helper for merging clones.
void
mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
/// Main worker for merging callee clones for a given node.
void mergeNodeCalleeClones(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
/// Helper to find other callers of the given set of callee edges that can
/// share the same callee merge node.
void findOtherCallersToShareMerge(
ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
DenseSet<ContextNode *> &OtherCallersToShareMerge);
/// Recursively perform cloning on the graph for the given Node and its
/// callers, in order to uniquely identify the allocation behavior of an
/// allocation given its context. The context ids of the allocation being
/// processed are given in AllocContextIds.
void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
const DenseSet<uint32_t> &AllocContextIds);
/// Map from each context ID to the AllocationType assigned to that context.
DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
/// Map from each contextID to the profiled full contexts and their total
/// sizes (there may be more than one due to context trimming),
/// optionally populated when requested (via MemProfReportHintedSizes or
/// MinClonedColdBytePercent).
DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
/// Identifies the context node created for a stack id when adding the MIB
/// contexts to the graph. This is used to locate the context nodes when
/// trying to assign the corresponding callsites with those stack ids to these
/// nodes.
DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
/// Maps to track the calls to their corresponding nodes in the graph.
MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
/// Owner of all ContextNode unique_ptrs.
std::vector<std::unique_ptr<ContextNode>> NodeOwner;
/// Perform sanity checks on graph when requested.
void check() const;
/// Keeps track of the last unique context id assigned.
unsigned int LastContextId = 0;
};
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
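// Note that with CRTP the base class statically dispatches to the derived
// implementation without virtual calls, e.g. the base class getStackId simply
// invokes
//   static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
// which resolves to ModuleCallsiteContextGraph::getStackId for regular LTO
// and to IndexCallsiteContextGraph::getStackId for ThinLTO.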
/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
: public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *> {
public:
ModuleCallsiteContextGraph(
Module &M,
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
private:
friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>;
uint64_t getStackId(uint64_t IdOrIndex) const;
const Function *getCalleeFunc(Instruction *Call);
bool calleeMatchesFunc(
Instruction *Call, const Function *Func, const Function *CallerFunc,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
bool sameCallee(Instruction *Call1, Instruction *Call2);
bool findProfiledCalleeThroughTailCalls(
const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
bool &FoundMultipleCalleeChains);
uint64_t getLastStackId(Instruction *Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
AllocationType getAllocationCallType(const CallInfo &Call) const;
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
std::map<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const Function *Func, const Instruction *Call,
unsigned CloneNo) const;
const Module &Mod;
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};
/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
IndexCall() : PointerUnion() {}
IndexCall(std::nullptr_t) : IndexCall() {}
IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
IndexCall(PointerUnion PT) : PointerUnion(PT) {}
IndexCall *operator->() { return this; }
void print(raw_ostream &OS) const {
PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
OS << *AI;
} else {
auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
assert(CI);
OS << *CI;
}
}
};
} // namespace
namespace llvm {
template <> struct simplify_type<IndexCall> {
using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
} // namespace llvm
namespace {
/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
: public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall> {
public:
IndexCallsiteContextGraph(
ModuleSummaryIndex &Index,
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing);
~IndexCallsiteContextGraph() {
// Now that we are done with the graph it is safe to add the new
// CallsiteInfo structs to the function summary vectors. The graph nodes
// point into locations within these vectors, so we don't want to add them
// any earlier.
for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
auto *FS = I.first;
for (auto &Callsite : I.second)
FS->addCallsite(*Callsite.second);
}
}
private:
friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>;
uint64_t getStackId(uint64_t IdOrIndex) const;
const FunctionSummary *getCalleeFunc(IndexCall &Call);
bool calleeMatchesFunc(
IndexCall &Call, const FunctionSummary *Func,
const FunctionSummary *CallerFunc,
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
bool sameCallee(IndexCall &Call1, IndexCall &Call2);
bool findProfiledCalleeThroughTailCalls(
ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
bool &FoundMultipleCalleeChains);
uint64_t getLastStackId(IndexCall &Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
AllocationType getAllocationCallType(const CallInfo &Call) const;
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
std::map<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
unsigned CloneNo) const;
// Saves mapping from function summaries containing memprof records back to
// their corresponding ValueInfo, for use in checking and debugging.
std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
const ModuleSummaryIndex &Index;
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing;
// Saves/owns the callsite info structures synthesized for missing tail call
// frames that we discover while building the graph.
// It maps from the summary of the function making the tail call, to a map
// of callee ValueInfo to corresponding synthesized callsite info.
std::unordered_map<FunctionSummary *,
std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace
namespace llvm {
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
: public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
: public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
: public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm
namespace {
// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
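// E.g., allocTypeToUse(NotCold|Cold) returns NotCold, while a pure Cold or
// pure NotCold value is returned unchanged.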
AllocationType allocTypeToUse(uint8_t AllocTypes) {
assert(AllocTypes != (uint8_t)AllocationType::None);
if (AllocTypes ==
((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
return AllocationType::NotCold;
else
return (AllocationType)AllocTypes;
}
// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
const std::vector<uint8_t> &InAllocTypes,
const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
&Edges) {
// This should be called only when the InAllocTypes vector was computed for
// this set of Edges. Make sure the sizes are the same.
assert(InAllocTypes.size() == Edges.size());
return std::equal(
InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
[](const uint8_t &l,
const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
// Can share if one of the edges is None type - don't
// care about the type along that edge as it doesn't
// exist for those context ids.
if (l == (uint8_t)AllocationType::None ||
r->AllocTypes == (uint8_t)AllocationType::None)
return true;
return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
});
}
// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for callee edges in the given
// clone. Because the InAllocTypes were computed from the original node's callee
// edges, and other cloning could have happened after this clone was created, we
// need to find the matching clone callee edge, which may or may not exist.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatchClone(
const std::vector<uint8_t> &InAllocTypes,
const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
assert(Node);
// InAllocTypes should have been computed for the original node's callee
// edges.
assert(InAllocTypes.size() == Node->CalleeEdges.size());
// First create a map from the clone's callee edge callees to each edge's
// alloc type.
DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
EdgeCalleeMap;
for (const auto &E : Clone->CalleeEdges) {
assert(!EdgeCalleeMap.contains(E->Callee));
EdgeCalleeMap[E->Callee] = E->AllocTypes;
}
// Next, walk the original node's callees, and look for the corresponding
// clone edge to that callee.
for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
// Not found is ok, we will simply add an edge if we use this clone.
if (Iter == EdgeCalleeMap.end())
continue;
// Can share if one of the edges is None type - don't
// care about the type along that edge as it doesn't
// exist for those context ids.
if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
Iter->second == (uint8_t)AllocationType::None)
continue;
if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
return false;
}
return true;
}
} // end anonymous namespace
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
const CallInfo &C) {
ContextNode *Node = getNodeForAlloc(C);
if (Node)
return Node;
return NonAllocationCallToContextNodeMap.lookup(C);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
const CallInfo &C) {
return AllocationCallToContextNodeMap.lookup(C);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
uint64_t StackId) {
auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
if (StackEntryNode != StackEntryIdToContextNodeMap.end())
return StackEntryNode->second;
return nullptr;
}
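// Adds an edge from the given Caller to this node (the callee) carrying the
// given context id, or if such an edge already exists, updates its alloc type
// and context id set.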
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
unsigned int ContextId) {
for (auto &Edge : CallerEdges) {
if (Edge->Caller == Caller) {
Edge->AllocTypes |= (uint8_t)AllocType;
Edge->getContextIds().insert(ContextId);
return;
}
}
std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
CallerEdges.push_back(Edge);
Caller->CalleeEdges.push_back(Edge);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
assert(!EI || (*EI)->get() == Edge);
assert(!Edge->isRemoved());
// Save the Caller and Callee pointers so we can erase Edge from their edge
// lists after clearing Edge below. We do the clearing first in case it is
// destructed after removing from the edge lists (if those were the last
// shared_ptr references to Edge).
auto *Callee = Edge->Callee;
auto *Caller = Edge->Caller;
// Make sure the edge fields are cleared out so we can properly detect
// removed edges if Edge is not destructed because there is still a shared_ptr
// reference.
Edge->clear();
#ifndef NDEBUG
auto CalleeCallerCount = Callee->CallerEdges.size();
auto CallerCalleeCount = Caller->CalleeEdges.size();
#endif
if (!EI) {
Callee->eraseCallerEdge(Edge);
Caller->eraseCalleeEdge(Edge);
} else if (CalleeIter) {
Callee->eraseCallerEdge(Edge);
*EI = Caller->CalleeEdges.erase(*EI);
} else {
Caller->eraseCalleeEdge(Edge);
*EI = Callee->CallerEdges.erase(*EI);
}
assert(Callee->CallerEdges.size() < CalleeCallerCount);
assert(Caller->CalleeEdges.size() < CallerCalleeCount);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
auto Edge = *EI;
if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
assert(Edge->ContextIds.empty());
removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
} else
++EI;
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
auto Edge = *EI;
if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
assert(Edge->ContextIds.empty());
Edge->Caller->eraseCalleeEdge(Edge.get());
EI = Node->CallerEdges.erase(EI);
} else
++EI;
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
findEdgeFromCallee(const ContextNode *Callee) {
for (const auto &Edge : CalleeEdges)
if (Edge->Callee == Callee)
return Edge.get();
return nullptr;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
findEdgeFromCaller(const ContextNode *Caller) {
for (const auto &Edge : CallerEdges)
if (Edge->Caller == Caller)
return Edge.get();
return nullptr;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
eraseCalleeEdge(const ContextEdge *Edge) {
auto EI = llvm::find_if(
CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
return CalleeEdge.get() == Edge;
});
assert(EI != CalleeEdges.end());
CalleeEdges.erase(EI);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
eraseCallerEdge(const ContextEdge *Edge) {
auto EI = llvm::find_if(
CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
return CallerEdge.get() == Edge;
});
assert(EI != CallerEdges.end());
CallerEdges.erase(EI);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
DenseSet<uint32_t> &ContextIds) const {
uint8_t BothTypes =
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
uint8_t AllocType = (uint8_t)AllocationType::None;
for (auto Id : ContextIds) {
AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
// Bail early if alloc type reached both, no further refinement.
if (AllocType == BothTypes)
return AllocType;
}
return AllocType;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
const DenseSet<uint32_t> &Node1Ids,
const DenseSet<uint32_t> &Node2Ids) const {
uint8_t BothTypes =
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
uint8_t AllocType = (uint8_t)AllocationType::None;
for (auto Id : Node1Ids) {
if (!Node2Ids.count(Id))
continue;
AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
// Bail early if alloc type reached both, no further refinement.
if (AllocType == BothTypes)
return AllocType;
}
return AllocType;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
const DenseSet<uint32_t> &Node1Ids,
const DenseSet<uint32_t> &Node2Ids) const {
if (Node1Ids.size() < Node2Ids.size())
return intersectAllocTypesImpl(Node1Ids, Node2Ids);
else
return intersectAllocTypesImpl(Node2Ids, Node1Ids);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
CallInfo Call, const FuncTy *F) {
assert(!getNodeForAlloc(Call));
ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
AllocationCallToContextNodeMap[Call] = AllocNode;
// Use LastContextId as a unique id for MIB allocation nodes.
AllocNode->OrigStackOrAllocId = LastContextId;
// Alloc type should be updated as we add in the MIBs. We should assert
// afterwards that it is not still None.
AllocNode->AllocTypes = (uint8_t)AllocationType::None;
return AllocNode;
}
static std::string getAllocTypeString(uint8_t AllocTypes) {
if (!AllocTypes)
return "None";
std::string Str;
if (AllocTypes & (uint8_t)AllocationType::NotCold)
Str += "NotCold";
if (AllocTypes & (uint8_t)AllocationType::Cold)
Str += "Cold";
return Str;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
ArrayRef<ContextTotalSize> ContextSizeInfo) {
// Treat the hot alloc type as NotCold until the disambiguation for "hot" is
// done.
if (AllocType == AllocationType::Hot)
AllocType = AllocationType::NotCold;
ContextIdToAllocationType[++LastContextId] = AllocType;
if (!ContextSizeInfo.empty()) {
auto &Entry = ContextIdToContextSizeInfos[LastContextId];
Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
}
// Update alloc type and context ids for this MIB.
AllocNode->AllocTypes |= (uint8_t)AllocType;
// Now add or update nodes for each stack id in alloc's context.
// Later when processing the stack ids on non-alloc callsites we will adjust
// for any inlining in the context.
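// E.g., for a context whose remaining stack ids are [F1, F2] (ordered from
// the frame closest to the allocation outwards), the loop below adds caller
// edges AllocNode <- F1 <- F2, each tagged with the new context id.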
ContextNode *PrevNode = AllocNode;
// Look for recursion (direct recursion should have been collapsed by
// module summary analysis; here we should just be detecting mutual
// recursion). Mark these nodes so we don't try to clone.
SmallSet<uint64_t, 8> StackIdSet;
// Skip any on the allocation call (inlining).
for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
ContextIter != StackContext.end(); ++ContextIter) {
auto StackId = getStackId(*ContextIter);
ContextNode *StackNode = getNodeForStackId(StackId);
if (!StackNode) {
StackNode = createNewNode(/*IsAllocation=*/false);
StackEntryIdToContextNodeMap[StackId] = StackNode;
StackNode->OrigStackOrAllocId = StackId;
}
// Marking a node recursive will prevent its cloning completely, even for
// non-recursive contexts flowing through it.
if (!AllowRecursiveCallsites) {
auto Ins = StackIdSet.insert(StackId);
if (!Ins.second)
StackNode->Recursive = true;
}
StackNode->AllocTypes |= (uint8_t)AllocType;
PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
PrevNode = StackNode;
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
DenseSet<uint32_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
const DenseSet<uint32_t> &StackSequenceContextIds,
DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
DenseSet<uint32_t> NewContextIds;
for (auto OldId : StackSequenceContextIds) {
NewContextIds.insert(++LastContextId);
OldToNewContextIds[OldId].insert(LastContextId);
assert(ContextIdToAllocationType.count(OldId));
// The new context has the same allocation type as original.
ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
if (DotAllocContextIds.contains(OldId))
DotAllocContextIds.insert(LastContextId);
}
return NewContextIds;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
propagateDuplicateContextIds(
const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
// Build a set of duplicated context ids corresponding to the input id set.
auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
DenseSet<uint32_t> NewIds;
for (auto Id : ContextIds)
if (auto NewId = OldToNewContextIds.find(Id);
NewId != OldToNewContextIds.end())
NewIds.insert_range(NewId->second);
return NewIds;
};
// Recursively update context ids sets along caller edges.
auto UpdateCallers = [&](ContextNode *Node,
DenseSet<const ContextEdge *> &Visited,
auto &&UpdateCallers) -> void {
for (const auto &Edge : Node->CallerEdges) {
auto Inserted = Visited.insert(Edge.get());
if (!Inserted.second)
continue;
ContextNode *NextNode = Edge->Caller;
DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
// Only need to recursively iterate to NextNode via this caller edge if
// it resulted in any added ids to NextNode.
if (!NewIdsToAdd.empty()) {
Edge->getContextIds().insert_range(NewIdsToAdd);
UpdateCallers(NextNode, Visited, UpdateCallers);
}
}
};
DenseSet<const ContextEdge *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap) {
auto *Node = Entry.second;
UpdateCallers(Node, Visited, UpdateCallers);
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
// This must be passed by value to make a copy since it will be adjusted
// as ids are moved.
DenseSet<uint32_t> RemainingContextIds) {
auto &OrigEdges =
TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
DenseSet<uint32_t> RecursiveContextIds;
DenseSet<uint32_t> AllCallerContextIds;
if (AllowRecursiveCallsites) {
// Identify which context ids are recursive which is needed to properly
// update the RemainingContextIds set. The relevant recursive context ids
// are those that are in multiple edges.
for (auto &CE : OrigEdges) {
AllCallerContextIds.reserve(CE->getContextIds().size());
for (auto Id : CE->getContextIds())
if (!AllCallerContextIds.insert(Id).second)
RecursiveContextIds.insert(Id);
}
}
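// For example (hypothetical ids): if context id 5 appears on two of
// OrigNode's edges because its context traverses a recursive cycle through
// this node twice, the second insert into AllCallerContextIds fails and 5
// is recorded in RecursiveContextIds.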
// Increment iterator in loop so that we can remove edges as needed.
for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
auto Edge = *EI;
DenseSet<uint32_t> NewEdgeContextIds;
DenseSet<uint32_t> NotFoundContextIds;
// Remove any matching context ids from Edge, returning the set that was
// found and removed; these become the new edge's context ids. Also update
// the remaining (not found) ids.
set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
NotFoundContextIds);
// Update the remaining context ids set for the later edges. This is a
// compile time optimization.
if (RecursiveContextIds.empty()) {
// No recursive ids, so all of the previously remaining context ids that
// were not seen on this edge are the new remaining set.
RemainingContextIds.swap(NotFoundContextIds);
} else {
// Keep the recursive ids in the remaining set as we expect to see those
// on another edge. We can remove the non-recursive remaining ids that
// were seen on this edge, however. We already have the set of remaining
// ids that were on this edge (in NewEdgeContextIds). Figure out which are
// non-recursive and only remove those. Note that despite the higher
// overhead of updating the remaining context ids set when recursion
// handling is enabled, it was found to be at worst performance neutral
// and in one case a clear win.
DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
set_difference(NewEdgeContextIds, RecursiveContextIds);
set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
}
// If no matching context ids for this edge, skip it.
if (NewEdgeContextIds.empty()) {
++EI;
continue;
}
if (TowardsCallee) {
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
auto NewEdge = std::make_shared<ContextEdge>(
Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
NewNode->CalleeEdges.push_back(NewEdge);
NewEdge->Callee->CallerEdges.push_back(NewEdge);
} else {
uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
auto NewEdge = std::make_shared<ContextEdge>(
NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
NewNode->CallerEdges.push_back(NewEdge);
NewEdge->Caller->CalleeEdges.push_back(NewEdge);
}
// Remove old edge if context ids empty.
if (Edge->getContextIds().empty()) {
removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
continue;
}
++EI;
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkEdge(
const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
// Confirm that alloc type is not None and that we have at least one context
// id.
assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
assert(!Edge->ContextIds.empty());
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
bool CheckEdges = true) {
if (Node->isRemoved())
return;
#ifndef NDEBUG
// Compute node's context ids once for use in asserts.
auto NodeContextIds = Node->getContextIds();
#endif
// Node's context ids should be the union of both its callee and caller edge
// context ids.
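// For example (hypothetical ids): a node whose caller edges carry {1, 2}
// and {3} should itself have context ids {1, 2, 3}, or a superset if some
// contexts terminate at this node.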
if (Node->CallerEdges.size()) {
DenseSet<uint32_t> CallerEdgeContextIds(
Node->CallerEdges.front()->ContextIds);
for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
if (CheckEdges)
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
set_union(CallerEdgeContextIds, Edge->ContextIds);
}
// Node can have more context ids than its caller edges if some contexts
// terminate at the node and some are longer. If we are allowing recursive
// callsites and contexts this will be violated for incompletely cloned
// recursive cycles, so skip the checking in that case.
assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
NodeContextIds == CallerEdgeContextIds ||
set_is_subset(CallerEdgeContextIds, NodeContextIds));
}
if (Node->CalleeEdges.size()) {
DenseSet<uint32_t> CalleeEdgeContextIds(
Node->CalleeEdges.front()->ContextIds);
for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
if (CheckEdges)
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
set_union(CalleeEdgeContextIds, Edge->getContextIds());
}
// If we are allowing recursive callsites and contexts this will be violated
// for incompletely cloned recursive cycles, so skip the checking in that
// case.
assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
NodeContextIds == CalleeEdgeContextIds);
}
// FIXME: Since this checking is only invoked under an option, we should
// change the error checking from using assert to something that will trigger
// an error on a release build.
#ifndef NDEBUG
// Make sure we don't end up with duplicate edges between the same caller and
// callee.
DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
for (const auto &E : Node->CalleeEdges)
NodeSet.insert(E->Callee);
assert(NodeSet.size() == Node->CalleeEdges.size());
#endif
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
assignStackNodesPostOrder(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint64_t, std::vector<CallContextInfo>>
&StackIdToMatchingCalls,
DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
auto Inserted = Visited.insert(Node);
if (!Inserted.second)
return;
// Post order traversal. Iterate over a copy since we may add nodes and
// therefore new callers during the recursive call, invalidating any
// iterator over the original edge vector. We don't need to process these
// new nodes as they were already processed on creation.
auto CallerEdges = Node->CallerEdges;
for (auto &Edge : CallerEdges) {
// Skip any that have been removed during the recursion.
if (Edge->isRemoved()) {
assert(!is_contained(Node->CallerEdges, Edge));
continue;
}
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
CallToMatchingCall);
}
// If this node's stack id is in the map, update the graph to contain new
// nodes representing any inlining at interior callsites. Note we move the
// associated context ids over to the new nodes.
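// For example (hypothetical stack ids): if a call's callsite metadata has
// stack ids [A, B] because its callee was inlined into it, a single new
// node is created to represent the inlined A -> B sequence, and the
// matching context ids are moved from the original A and B nodes to it.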
// Ignore this node if it is for an allocation or we didn't record any
// stack id lists ending at it.
if (Node->IsAllocation ||
!StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
return;
auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
// Handle the simple case first: a single call with a single stack id.
// In this case there is no need to create any new context nodes; simply
// assign the context node for the stack id to this Call.
if (Calls.size() == 1) {
auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
if (Ids.size() == 1) {
assert(SavedContextIds.empty());
// It should be this Node
assert(Node == getNodeForStackId(Ids[0]));
if (Node->Recursive)
return;
Node->setCall(Call);
NonAllocationCallToContextNodeMap[Call] = Node;
NodeToCallingFunc[Node] = Func;
return;
}
}
#ifndef NDEBUG
// Find the node for the last stack id, which should be the same
// across all calls recorded for this id, and is this node's id.
uint64_t LastId = Node->OrigStackOrAllocId;
ContextNode *LastNode = getNodeForStackId(LastId);
// We should only have kept stack ids that had nodes.
assert(LastNode);
assert(LastNode == Node);
#else
ContextNode *LastNode = Node;
#endif
// Compute the last node's context ids once, as it is shared by all calls in
// this entry.
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
bool PrevIterCreatedNode = false;
bool CreatedNode = false;
for (unsigned I = 0; I < Calls.size();
I++, PrevIterCreatedNode = CreatedNode) {
CreatedNode = false;
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
// Skip any for which we didn't assign any ids, these don't get a node in
// the graph.
if (SavedContextIds.empty()) {
// If this call has a matching call (located in the same function and
// having the same stack ids), simply add it to the context node created
// for its matching call earlier. These can be treated the same through
// cloning and get updated at the same time.
if (!CallToMatchingCall.contains(Call))
continue;
auto MatchingCall = CallToMatchingCall[Call];
if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
// This should only happen if we had a prior iteration, and it didn't
// create a node because of the below recomputation of context ids
// finding none remaining and continuing early.
assert(I > 0 && !PrevIterCreatedNode);
continue;
}
NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
Call);
continue;
}
assert(LastId == Ids.back());
// Recompute the context ids for this stack id sequence (the
// intersection of the context ids of the corresponding nodes).
// Start with the ids we saved in the map for this call, which could be
// duplicated context ids. We have to recompute as we might have overlap
// between the saved context ids for different last nodes, and might have
// removed them already during the post order traversal.
set_intersect(SavedContextIds, LastNodeContextIds);
ContextNode *PrevNode = LastNode;
bool Skip = false;
// Iterate backwards through the stack Ids, starting after the last Id
// in the list, which was handled once outside for all Calls.
for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
auto Id = *IdIter;
ContextNode *CurNode = getNodeForStackId(Id);
// We should only have kept stack ids that had nodes and weren't
// recursive.
assert(CurNode);
assert(!CurNode->Recursive);
auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
if (!Edge) {
Skip = true;
break;
}
PrevNode = CurNode;
// Update the context ids, which is the intersection of the ids along
// all edges in the sequence.
set_intersect(SavedContextIds, Edge->getContextIds());
// If we now have no context ids for the clone, skip this call.
if (SavedContextIds.empty()) {
Skip = true;
break;
}
}
if (Skip)
continue;
// Create new context node.
ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
NonAllocationCallToContextNodeMap[Call] = NewNode;
CreatedNode = true;
NewNode->AllocTypes = computeAllocType(SavedContextIds);
ContextNode *FirstNode = getNodeForStackId(Ids[0]);
assert(FirstNode);
// Connect to callees of innermost stack frame in inlined call chain.
// This updates context ids for FirstNode's callees to reflect those
// moved to NewNode.
connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
// Connect to callers of outermost stack frame in inlined call chain.
// This updates context ids for LastNode's callers to reflect those
// moved to NewNode.
connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
// Now we need to remove context ids from edges/nodes between First and
// Last Node.
PrevNode = nullptr;
for (auto Id : Ids) {
ContextNode *CurNode = getNodeForStackId(Id);
// We should only have kept stack ids that had nodes.
assert(CurNode);
// Remove the context ids moved to NewNode from CurNode, and the
// edge from the prior node.
if (PrevNode) {
auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
// If the sequence contained recursion, we might have already removed
// some edges during the connectNewNode calls above.
if (!PrevEdge) {
PrevNode = CurNode;
continue;
}
set_subtract(PrevEdge->getContextIds(), SavedContextIds);
if (PrevEdge->getContextIds().empty())
removeEdgeFromGraph(PrevEdge);
}
// Since we update the edges from leaf to tail, only look at the callee
// edges. This isn't an alloc node, so if there are no callee edges, the
// alloc type is None.
CurNode->AllocTypes = CurNode->CalleeEdges.empty()
? (uint8_t)AllocationType::None
: CurNode->computeAllocType();
PrevNode = CurNode;
}
if (VerifyNodes) {
checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
for (auto Id : Ids) {
ContextNode *CurNode = getNodeForStackId(Id);
// We should only have kept stack ids that had nodes.
assert(CurNode);
checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
}
}
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
// Map of stack id to all calls with that as the last (outermost caller)
// callsite id that has a context node (some might not due to pruning
// performed during matching of the allocation profile contexts).
// The CallContextInfo contains the Call and a list of its stack ids with
// ContextNodes, the function containing Call, and the set of context ids
// the analysis will eventually identify for use in any new node created
// for that callsite.
DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
for (auto &Call : CallsWithMetadata) {
// Ignore allocations, already handled.
if (AllocationCallToContextNodeMap.count(Call))
continue;
auto StackIdsWithContextNodes =
getStackIdsWithContextNodesForCall(Call.call());
// If there were no nodes created for MIBs on allocs (maybe this was in
// the unambiguous part of the MIB stack that was pruned), ignore.
if (StackIdsWithContextNodes.empty())
continue;
// Otherwise, record this Call along with the list of ids for the last
// (outermost caller) stack id with a node.
StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
{Call.call(), StackIdsWithContextNodes, Func, {}});
}
}
// First make a pass through all stack ids that correspond to a call,
// as identified in the above loop. Compute the context ids corresponding to
// each of these calls when they correspond to multiple stack ids due to
// inlining. Perform any duplication of context ids required when there is
// more than one call with the same stack ids. Their (possibly newly
// duplicated) context ids are saved in the StackIdToMatchingCalls map.
DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
// Save a map from each call to any that are found to match it, i.e. those
// located in the same function that have the same (possibly pruned) stack
// ids. We use this to avoid creating extra graph nodes, as matching calls
// can be treated the same.
DenseMap<CallInfo, CallInfo> CallToMatchingCall;
for (auto &It : StackIdToMatchingCalls) {
auto &Calls = It.getSecond();
// Skip single calls with a single stack id. These don't need a new node.
if (Calls.size() == 1) {
auto &Ids = Calls[0].StackIds;
if (Ids.size() == 1)
continue;
}
// In order to do the best and maximal matching of inlined calls to context
// node sequences we will sort the vectors of stack ids in descending order
// of length, and within each length, lexicographically by stack id. The
// latter is so that we can specially handle calls that have identical stack
// id sequences (either due to cloning or artificially because of the MIB
// context pruning). Those with the same Ids are then sorted by function to
// facilitate efficiently mapping them to the same context node.
// Because the functions are pointers, to ensure a stable sort first assign
// each function pointer to its first index in the Calls array, and then
// sort by that index.
DenseMap<const FuncTy *, unsigned> FuncToIndex;
for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
FuncToIndex.insert({CallCtxInfo.Func, Idx});
llvm::stable_sort(
Calls,
[&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
return A.StackIds.size() > B.StackIds.size() ||
(A.StackIds.size() == B.StackIds.size() &&
(A.StackIds < B.StackIds ||
(A.StackIds == B.StackIds &&
FuncToIndex[A.Func] < FuncToIndex[B.Func])));
});
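// For example (hypothetical stack ids): [A, B, C] sorts before [A, B],
// which sorts before [A, C]; two calls with identical ids [A, B] are then
// ordered by the first index of their containing function in Calls.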
// Find the node for the last stack id, which should be the same
// across all calls recorded for this id, and is the id for this
// entry in the StackIdToMatchingCalls map.
uint64_t LastId = It.getFirst();
ContextNode *LastNode = getNodeForStackId(LastId);
// We should only have kept stack ids that had nodes.
assert(LastNode);
if (LastNode->Recursive)
continue;
// Initialize the context ids with the last node's. We will subsequently
// refine the context ids by computing the intersection along all edges.
DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
assert(!LastNodeContextIds.empty());
#ifndef NDEBUG
// Save the set of functions seen for a particular set of the same stack
// ids. This is used to ensure that they have been correctly sorted to be
// adjacent in the Calls list, since we rely on that to efficiently place
// all such matching calls onto the same context node.
DenseSet<const FuncTy *> MatchingIdsFuncSet;
#endif
for (unsigned I = 0; I < Calls.size(); I++) {
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
assert(SavedContextIds.empty());
assert(LastId == Ids.back());
#ifndef NDEBUG
// If this call has a different set of ids than the last one, clear the
// set used to ensure they are sorted properly.
if (I > 0 && Ids != Calls[I - 1].StackIds)
MatchingIdsFuncSet.clear();
#endif
// First compute the context ids for this stack id sequence (the
// intersection of the context ids of the corresponding nodes).
// Start with the remaining saved ids for the last node.
assert(!LastNodeContextIds.empty());
DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
ContextNode *PrevNode = LastNode;
ContextNode *CurNode = LastNode;
bool Skip = false;
// Iterate backwards through the stack Ids, starting after the last Id
// in the list, which was handled once outside for all Calls.
for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
auto Id = *IdIter;
CurNode = getNodeForStackId(Id);
// We should only have kept stack ids that had nodes.
assert(CurNode);
if (CurNode->Recursive) {
Skip = true;
break;
}
auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
// If there is no edge then the nodes belong to different MIB contexts,
// and we should skip this inlined context sequence. For example, this
// particular inlined context may include stack ids A->B, and we may
// indeed have nodes for both A and B, but it is possible that they were
// never profiled in sequence in a single MIB for any allocation (i.e.
// we might have profiled an allocation that involves the callsite A,
// but through a different one of its callee callsites, and we might
// have profiled an allocation that involves callsite B, but reached
// from a different caller callsite).
if (!Edge) {
Skip = true;
break;
}
PrevNode = CurNode;
// Update the context ids, which is the intersection of the ids along
// all edges in the sequence.
set_intersect(StackSequenceContextIds, Edge->getContextIds());
// If we now have no context ids for the clone, skip this call.
if (StackSequenceContextIds.empty()) {
Skip = true;
break;
}
}
if (Skip)
continue;
// If some of this call's stack ids did not have corresponding nodes (due
// to pruning), don't include any context ids for contexts that extend
// beyond these nodes. Otherwise we would be matching part of unrelated /
// not fully matching stack contexts. To do this, subtract any context ids
// found in caller nodes of the last node found above.
if (Ids.back() != getLastStackId(Call)) {
for (const auto &PE : LastNode->CallerEdges) {
set_subtract(StackSequenceContextIds, PE->getContextIds());
if (StackSequenceContextIds.empty())
break;
}
// If we now have no context ids for the clone, skip this call.
if (StackSequenceContextIds.empty())
continue;
}
#ifndef NDEBUG
// If the prior call had the same stack ids this set would not be empty.
// Check if we already have a call that "matches" because it is located
// in the same function. If the Calls list was sorted properly we should
// not encounter this situation as all such entries should be adjacent
// and processed in bulk further below.
assert(!MatchingIdsFuncSet.contains(Func));
MatchingIdsFuncSet.insert(Func);
#endif
// Check if the next set of stack ids is the same (since the Calls vector
// of tuples is sorted by the stack ids we can just look at the next one).
// If so, save them in the CallToMatchingCall map so that they get
// assigned to the same context node, and skip them.
bool DuplicateContextIds = false;
for (unsigned J = I + 1; J < Calls.size(); J++) {
auto &CallCtxInfo = Calls[J];
auto &NextIds = CallCtxInfo.StackIds;
if (NextIds != Ids)
break;
auto *NextFunc = CallCtxInfo.Func;
if (NextFunc != Func) {
// We have another Call with the same ids but that cannot share this
// node, must duplicate ids for it.
DuplicateContextIds = true;
break;
}
auto &NextCall = CallCtxInfo.Call;
CallToMatchingCall[NextCall] = Call;
// Update I so that it gets incremented correctly to skip this call.
I = J;
}
// If we don't have duplicate context ids, then we can assign all the
// context ids computed for the original node sequence to this call.
// If there are duplicate calls with the same stack ids then we synthesize
// new context ids that are duplicates of the originals. These are
// assigned to SavedContextIds, which is a reference into the map entry
// for this call, allowing us to access these ids later on.
OldToNewContextIds.reserve(OldToNewContextIds.size() +
StackSequenceContextIds.size());
SavedContextIds =
DuplicateContextIds
? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
: StackSequenceContextIds;
assert(!SavedContextIds.empty());
if (!DuplicateContextIds) {
// Update saved last node's context ids to remove those that are
// assigned to other calls, so that it is ready for the next call at
// this stack id.
set_subtract(LastNodeContextIds, StackSequenceContextIds);
if (LastNodeContextIds.empty())
break;
}
}
}
// Propagate the duplicate context ids over the graph.
propagateDuplicateContextIds(OldToNewContextIds);
if (VerifyCCG)
check();
// Now perform a post-order traversal over the graph, starting with the
// allocation nodes, essentially processing nodes from callers to callees.
// For any that contains an id in the map, update the graph to contain new
// nodes representing any inlining at interior callsites. Note we move the
// associated context ids over to the new nodes.
DenseSet<const ContextNode *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap)
assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
CallToMatchingCall);
if (VerifyCCG)
check();
}
uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
Call->getMetadata(LLVMContext::MD_callsite));
return CallsiteContext.back();
}
uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
assert(isa<CallsiteInfo *>(Call));
CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
// Need to convert index into stack id.
return Index.getStackIdAtIndex(CallsiteContext.back());
}
static const std::string MemProfCloneSuffix = ".memprof.";
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
// We use CloneNo == 0 to refer to the original version, which doesn't get
// renamed with a suffix.
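// For example, getMemProfFuncName("foo", 3) returns "foo.memprof.3", while
// getMemProfFuncName("foo", 0) returns "foo" unchanged.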
if (!CloneNo)
return Base.str();
return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
}
static bool isMemProfClone(const Function &F) {
return F.getName().contains(MemProfCloneSuffix);
}
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
return (Twine(Call->getFunction()->getName()) + " -> " +
cast<CallBase>(Call)->getCalledFunction()->getName())
.str();
}
std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
const IndexCall &Call,
unsigned CloneNo) const {
auto VI = FSToVIMap.find(Func);
assert(VI != FSToVIMap.end());
if (isa<AllocInfo *>(Call))
return (VI->second.name() + " -> alloc").str();
else {
auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
return (VI->second.name() + " -> " +
getMemProfFuncName(Callsite->Callee.name(),
Callsite->Clones[CloneNo]))
.str();
}
}
std::vector<uint64_t>
ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
Instruction *Call) {
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
Call->getMetadata(LLVMContext::MD_callsite));
return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
CallsiteContext);
}
std::vector<uint64_t>
IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
assert(isa<CallsiteInfo *>(Call));
CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
return getStackIdsWithContextNodes<CallsiteInfo,
SmallVector<unsigned>::const_iterator>(
CallsiteContext);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
std::vector<uint64_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
CallStack<NodeT, IteratorT> &CallsiteContext) {
std::vector<uint64_t> StackIds;
for (auto IdOrIndex : CallsiteContext) {
auto StackId = getStackId(IdOrIndex);
ContextNode *Node = getNodeForStackId(StackId);
if (!Node)
break;
StackIds.push_back(StackId);
}
return StackIds;
}
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
Module &M,
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
: Mod(M), OREGetter(OREGetter) {
for (auto &F : M) {
std::vector<CallInfo> CallsWithMetadata;
for (auto &BB : F) {
for (auto &I : BB) {
if (!isa<CallBase>(I))
continue;
if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
CallsWithMetadata.push_back(&I);
auto *AllocNode = addAllocNode(&I, &F);
auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
assert(CallsiteMD);
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
// Add all of the MIBs and their stack nodes.
for (auto &MDOp : MemProfMD->operands()) {
auto *MIBMD = cast<const MDNode>(MDOp);
std::vector<ContextTotalSize> ContextSizeInfo;
// Collect the context size information if it exists.
if (MIBMD->getNumOperands() > 2) {
for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
MDNode *ContextSizePair =
dyn_cast<MDNode>(MIBMD->getOperand(I));
assert(ContextSizePair->getNumOperands() == 2);
uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
ContextSizePair->getOperand(0))
->getZExtValue();
uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
ContextSizePair->getOperand(1))
->getZExtValue();
ContextSizeInfo.push_back({FullStackId, TotalSize});
}
}
MDNode *StackNode = getMIBStackNode(MIBMD);
assert(StackNode);
CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
addStackNodesForMIB<MDNode, MDNode::op_iterator>(
AllocNode, StackContext, CallsiteContext,
getMIBAllocType(MIBMD), ContextSizeInfo);
}
// If exporting the graph to dot and an allocation id of interest was
// specified, record all the context ids for this allocation node.
if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
DotAllocContextIds = AllocNode->getContextIds();
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
// Memprof and callsite metadata on memory allocations no longer
// needed.
I.setMetadata(LLVMContext::MD_memprof, nullptr);
I.setMetadata(LLVMContext::MD_callsite, nullptr);
}
// For callsite metadata, add to list for this function for later use.
else if (I.getMetadata(LLVMContext::MD_callsite)) {
CallsWithMetadata.push_back(&I);
}
}
}
if (!CallsWithMetadata.empty())
FuncToCallsWithMetadata[&F] = CallsWithMetadata;
}
if (DumpCCG) {
dbgs() << "CCG before updating call stack chains:\n";
dbgs() << *this;
}
if (ExportToDot)
exportToDot("prestackupdate");
updateStackNodes();
handleCallsitesWithMultipleTargets();
markBackedges();
// Strip off remaining callsite metadata, no longer needed.
for (auto &FuncEntry : FuncToCallsWithMetadata)
for (auto &Call : FuncEntry.second)
Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
}
IndexCallsiteContextGraph::IndexCallsiteContextGraph(
ModuleSummaryIndex &Index,
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing)
: Index(Index), isPrevailing(isPrevailing) {
for (auto &I : Index) {
auto VI = Index.getValueInfo(I);
for (auto &S : VI.getSummaryList()) {
// We should only add the prevailing nodes. Otherwise we may try to clone
// in a weak copy that won't be linked (and may be different than the
// prevailing version).
// We only keep the memprof summary on the prevailing copy now when
// building the combined index, as a space optimization; however, don't
// rely on this optimization. The linker doesn't resolve local linkage
// values, so don't check whether those are prevailing.
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
!isPrevailing(VI.getGUID(), S.get()))
continue;
auto *FS = dyn_cast<FunctionSummary>(S.get());
if (!FS)
continue;
std::vector<CallInfo> CallsWithMetadata;
if (!FS->allocs().empty()) {
for (auto &AN : FS->mutableAllocs()) {
// This can happen because of recursion elimination handling that
// currently exists in ModuleSummaryAnalysis. Skip these for now.
// We still added them to the summary because we need to be able to
// correlate properly in applyImport in the backends.
if (AN.MIBs.empty())
continue;
IndexCall AllocCall(&AN);
CallsWithMetadata.push_back(AllocCall);
auto *AllocNode = addAllocNode(AllocCall, FS);
// Pass an empty CallStack to the CallsiteContext (second)
// parameter, since for ThinLTO we already collapsed out the inlined
// stack ids on the allocation call during ModuleSummaryAnalysis.
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
EmptyContext;
unsigned I = 0;
assert(
(!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
AN.ContextSizeInfos.size() == AN.MIBs.size());
// Now add all of the MIBs and their stack nodes.
for (auto &MIB : AN.MIBs) {
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
StackContext(&MIB);
std::vector<ContextTotalSize> ContextSizeInfo;
if (!AN.ContextSizeInfos.empty()) {
for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
ContextSizeInfo.push_back({FullStackId, TotalSize});
}
addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
AllocNode, StackContext, EmptyContext, MIB.AllocType,
ContextSizeInfo);
I++;
}
// If exporting the graph to dot and an allocation id of interest was
// specified, record all the context ids for this allocation node.
if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
DotAllocContextIds = AllocNode->getContextIds();
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
// Initialize version 0 on the summary alloc node to the current alloc
// type, unless it has both types, in which case make it default. That
// way, if we aren't able to clone, the original version always ends up
// with the default allocation behavior.
AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
}
}
// For callsite metadata, add to list for this function for later use.
if (!FS->callsites().empty())
for (auto &SN : FS->mutableCallsites()) {
IndexCall StackNodeCall(&SN);
CallsWithMetadata.push_back(StackNodeCall);
}
if (!CallsWithMetadata.empty())
FuncToCallsWithMetadata[FS] = CallsWithMetadata;
if (!FS->allocs().empty() || !FS->callsites().empty())
FSToVIMap[FS] = VI;
}
}
if (DumpCCG) {
dbgs() << "CCG before updating call stack chains:\n";
dbgs() << *this;
}
if (ExportToDot)
exportToDot("prestackupdate");
updateStackNodes();
handleCallsitesWithMultipleTargets();
markBackedges();
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
CallTy>::handleCallsitesWithMultipleTargets() {
// Look for and work around callsites that call multiple functions.
// This can happen for indirect calls, which need better handling, and in
// rarer cases (e.g. macro expansion).
// TODO: To fix this for indirect calls we will want to perform speculative
// devirtualization using either the normal PGO info with ICP, or using the
// information in the profiled MemProf contexts. We can do this prior to
// this transformation for regular LTO, and for ThinLTO we can simulate that
// effect in the summary and perform the actual speculative devirtualization
// while cloning in the ThinLTO backend.
// Keep track of the new nodes synthesized for discovered tail calls missing
// from the profiled contexts.
MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
for (auto &Entry : NonAllocationCallToContextNodeMap) {
auto *Node = Entry.second;
assert(Node->Clones.empty());
// Check all node callees and see if in the same function.
// We need to check all of the calls recorded in this Node, because in some
// cases we may have had multiple calls with the same debug info calling
// different callees. This can happen, for example, when an object is
// constructed in the parameter list - the destructor call of the object has
// the same debug info (line/col) as the call the object was passed to.
// Here we will prune any that don't match all callee nodes.
std::vector<CallInfo> AllCalls;
AllCalls.reserve(Node->MatchingCalls.size() + 1);
AllCalls.push_back(Node->Call);
llvm::append_range(AllCalls, Node->MatchingCalls);
// First see if we can partition the calls by callee function, creating new
// nodes to host each set of calls calling the same callees. This is
// necessary to support indirect calls with ThinLTO, for which we
// synthesized CallsiteInfo records for each target. They will all have the
// same callsite stack ids and would be sharing a context node at this
// point. We need to perform separate cloning for each, which will be
// applied along with speculative devirtualization in the ThinLTO backends
// as needed. Note this does not currently support looking through tail
// calls, it is unclear if we need that for indirect call targets.
// First partition calls by callee func. Map indexed by func, value is
// struct with list of matching calls, assigned node.
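// For example, an indirect call with two profiled targets (hypothetically F
// and G) will have a synthesized CallsiteInfo record per target, all sharing
// this node; partitioning moves the calls resolving to F and those resolving
// to G onto separate nodes so each target can be cloned independently.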
if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
continue;
auto It = AllCalls.begin();
// Iterate through the calls until we find the first that matches.
for (; It != AllCalls.end(); ++It) {
auto ThisCall = *It;
bool Match = true;
for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
++EI) {
auto Edge = *EI;
if (!Edge->Callee->hasCall())
continue;
assert(NodeToCallingFunc.count(Edge->Callee));
// Check if the called function matches that of the callee node.
if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
Match = false;
break;
}
}
// Found a call that matches the callee nodes; we can quit now.
if (Match) {
// If the first match is not the primary call on the Node, update it
// now. We will update the list of matching calls further below.
if (Node->Call != ThisCall) {
Node->setCall(ThisCall);
// We need to update the NonAllocationCallToContextNodeMap, but don't
// want to do this during iteration over that map, so save the calls
// that need updated entries.
NewCallToNode.push_back({ThisCall, Node});
}
break;
}
}
// We will update this list below (or leave it cleared if there was no
// match found above).
Node->MatchingCalls.clear();
// If we hit the end of the AllCalls vector, no call matching the callee
// nodes was found; clear the call information in the node.
if (It == AllCalls.end()) {
RemovedEdgesWithMismatchedCallees++;
// Work around by setting Node to have a null call, so it gets
// skipped during cloning. Otherwise assignFunctions will assert
// because its data structures are not designed to handle this case.
Node->setCall(CallInfo());
continue;
}
// Now add back any matching calls that call the same function as the
// matching primary call on Node.
for (++It; It != AllCalls.end(); ++It) {
auto ThisCall = *It;
if (!sameCallee(Node->Call.call(), ThisCall.call()))
continue;
Node->MatchingCalls.push_back(ThisCall);
}
}
// Remove all mismatched nodes identified in the above loop from the node map
// (checking whether they have a null call which is set above). For a
// MapVector like NonAllocationCallToContextNodeMap it is much more efficient
// to do the removal via remove_if than by individually erasing entries above.
// Also remove any entries if we updated the node's primary call above.
NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
return !it.second->hasCall() || it.second->Call != it.first;
});
// Add entries for any new primary calls recorded above.
for (auto &[Call, Node] : NewCallToNode)
NonAllocationCallToContextNodeMap[Call] = Node;
// Add the new nodes after the above loop so that the iteration is not
// invalidated.
for (auto &[Call, Node] : TailCallToContextNodeMap)
NonAllocationCallToContextNodeMap[Call] = Node;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
ContextNode *Node, ArrayRef<CallInfo> AllCalls,
std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
// Struct to keep track of all the calls having the same callee function,
// and the node we eventually assign to them. Eventually we will record the
// context node assigned to this group of calls.
struct CallsWithSameCallee {
std::vector<CallInfo> Calls;
ContextNode *Node = nullptr;
};
// First partition calls by callee function. Build map from each function
// to the list of matching calls.
DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
for (auto ThisCall : AllCalls) {
auto *F = getCalleeFunc(ThisCall.call());
if (F)
CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
}
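// Illustrative example (hypothetical calls c1, c2, c3 and functions F, G):
// AllCalls = {c1 -> F, c2 -> G, c3 -> F} yields CalleeFuncToCallInfo =
// {F: [c1, c3], G: [c2]}; calls with no statically resolvable callee (e.g.
// true indirect calls) are left out of the map entirely.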
// Next, walk through all callee edges. For each callee node, get its
// containing function and see if it was recorded in the above map (meaning we
// have at least one matching call). Build another map from each callee node
// with a matching call to the structure instance created above containing all
// the calls.
DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
for (const auto &Edge : Node->CalleeEdges) {
if (!Edge->Callee->hasCall())
continue;
const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
CalleeNodeToCallInfo[Edge->Callee] =
&CalleeFuncToCallInfo[ProfiledCalleeFunc];
}
// If there are no entries in the second map, then there were no matching
// calls/callees; nothing to do here. Return so we can go to the handling
// that looks through tail calls.
if (CalleeNodeToCallInfo.empty())
return false;
// Walk through all callee edges again. Any and all callee edges that didn't
// match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
// new caller node (UnmatchedCalleesNode) which gets a null call so that it is
// ignored during cloning. If it is in the map, then we use the node recorded
// in that entry (creating it if needed), and move the callee edge to it.
// The first callee will use the original node instead of creating a new one.
// Note that any of the original calls on this node (in AllCalls) that didn't
// have a callee function automatically get dropped from the node as part of
// this process.
ContextNode *UnmatchedCalleesNode = nullptr;
// Track whether we already assigned original node to a callee.
bool UsedOrigNode = false;
assert(NodeToCallingFunc[Node]);
// Iterate over a copy of Node's callee edges, since we may need to remove
// edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
// makes it less error-prone.
auto CalleeEdges = Node->CalleeEdges;
for (auto &Edge : CalleeEdges) {
if (!Edge->Callee->hasCall())
continue;
// Will be updated below to point to whatever (caller) node this callee edge
// should be moved to.
ContextNode *CallerNodeToUse = nullptr;
// Handle the case where there were no matching calls first. Move this
// callee edge to the UnmatchedCalleesNode, creating it if needed.
if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
if (!UnmatchedCalleesNode)
UnmatchedCalleesNode =
createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
CallerNodeToUse = UnmatchedCalleesNode;
} else {
// Look up the information recorded for this callee node, and use the
// recorded caller node (creating it if needed).
auto *Info = CalleeNodeToCallInfo[Edge->Callee];
if (!Info->Node) {
// If we haven't assigned any callees to the original node use it.
if (!UsedOrigNode) {
Info->Node = Node;
// Clear the set of matching calls which will be updated below.
Node->MatchingCalls.clear();
UsedOrigNode = true;
} else
Info->Node =
createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
assert(!Info->Calls.empty());
// The first call becomes the primary call for this caller node, and the
// rest go in the matching calls list.
Info->Node->setCall(Info->Calls.front());
llvm::append_range(Info->Node->MatchingCalls,
llvm::drop_begin(Info->Calls));
// Save the primary call to node correspondence so that we can update
// the NonAllocationCallToContextNodeMap, which is being iterated in the
// caller of this function.
NewCallToNode.push_back({Info->Node->Call, Info->Node});
}
CallerNodeToUse = Info->Node;
}
// Don't need to move the edge if we are using the original node.
if (CallerNodeToUse == Node)
continue;
moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
}
// Now that we are done moving edges, clean up any caller edges that ended
// up with no type or context ids. During moveCalleeEdgeToNewCaller all
// caller edges from Node are replicated onto the new callers, and it
// simplifies the handling to leave them until we have moved all
// edges/context ids.
for (auto &I : CalleeNodeToCallInfo)
removeNoneTypeCallerEdges(I.second->Node);
if (UnmatchedCalleesNode)
removeNoneTypeCallerEdges(UnmatchedCalleesNode);
removeNoneTypeCallerEdges(Node);
return true;
}
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
// In the Module (IR) case this is already the Id.
return IdOrIndex;
}
uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
// In the Index case this is an index into the stack id list in the summary
// index, convert it to an Id.
return Index.getStackIdAtIndex(IdOrIndex);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
CallTy Call, EdgeIter &EI,
MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
auto Edge = *EI;
const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
// Will be populated in order of callee to caller if we find a chain of tail
// calls between the profiled caller and callee.
std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
FoundCalleeChain))
return false;
// The usual case where the profiled callee matches that of the IR/summary.
if (FoundCalleeChain.empty())
return true;
auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
auto *CurEdge = Callee->findEdgeFromCaller(Caller);
// If there is already an edge between these nodes, simply update it and
// return.
if (CurEdge) {
CurEdge->ContextIds.insert_range(Edge->ContextIds);
CurEdge->AllocTypes |= Edge->AllocTypes;
return;
}
// Otherwise, create a new edge and insert it into the caller and callee
// lists.
auto NewEdge = std::make_shared<ContextEdge>(
Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
Callee->CallerEdges.push_back(NewEdge);
if (Caller == Edge->Caller) {
// If we are inserting the new edge into the current edge's caller, insert
// the new edge before the current iterator position, and then increment
// back to the current edge.
EI = Caller->CalleeEdges.insert(EI, NewEdge);
++EI;
assert(*EI == Edge &&
"Iterator position not restored after insert and increment");
} else
Caller->CalleeEdges.push_back(NewEdge);
};
// Create new nodes for each found callee and connect in between the profiled
// caller and callee.
auto *CurCalleeNode = Edge->Callee;
for (auto &[NewCall, Func] : FoundCalleeChain) {
ContextNode *NewNode = nullptr;
// First check if we have already synthesized a node for this tail call.
if (TailCallToContextNodeMap.count(NewCall)) {
NewNode = TailCallToContextNodeMap[NewCall];
NewNode->AllocTypes |= Edge->AllocTypes;
} else {
FuncToCallsWithMetadata[Func].push_back({NewCall});
// Create Node and record node info.
NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
TailCallToContextNodeMap[NewCall] = NewNode;
NewNode->AllocTypes = Edge->AllocTypes;
}
// Hook up node to its callee node
AddEdge(NewNode, CurCalleeNode);
CurCalleeNode = NewNode;
}
// Hook up edge's original caller to new callee node.
AddEdge(Edge->Caller, CurCalleeNode);
#ifndef NDEBUG
// Save this because Edge's fields get cleared below when removed.
auto *Caller = Edge->Caller;
#endif
// Remove old edge
removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
// To simplify the increment of EI in the caller, subtract one from EI.
// In the final AddEdge call we would have either added a new callee edge,
// to Edge->Caller, or found an existing one. Either way we are guaranteed
// that there is at least one callee edge.
assert(!Caller->CalleeEdges.empty());
--EI;
return true;
}
bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
bool &FoundMultipleCalleeChains) {
// Stop recursive search if we have already explored the maximum specified
// depth.
if (Depth > TailCallSearchDepth)
return false;
auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
FoundCalleeChain.push_back({Callsite, F});
};
auto *CalleeFunc = dyn_cast<Function>(CurCallee);
if (!CalleeFunc) {
auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
assert(Alias);
CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
assert(CalleeFunc);
}
// Look for tail calls in this function, and check if they either call the
// profiled callee directly, or indirectly (via a recursive search).
// Only succeed if there is a single unique tail call chain found between the
// profiled caller and callee; otherwise we could perform incorrect cloning.
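// For example (hypothetical functions): if the profiled context records
// caller A invoking profiled callee C, but in the IR A calls B and B tail
// calls C (so B's frame is missing from the profile), the unique chain
// through B is saved in FoundCalleeChain so the graph can later be fixed up
// to include B's callsite.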
bool FoundSingleCalleeChain = false;
for (auto &BB : *CalleeFunc) {
for (auto &I : BB) {
auto *CB = dyn_cast<CallBase>(&I);
if (!CB || !CB->isTailCall())
continue;
auto *CalledValue = CB->getCalledOperand();
auto *CalledFunction = CB->getCalledFunction();
if (CalledValue && !CalledFunction) {
CalledValue = CalledValue->stripPointerCasts();
// Stripping pointer casts can reveal a called function.
CalledFunction = dyn_cast<Function>(CalledValue);
}
// Check if this is an alias to a function. If so, get the
// called aliasee for the checks below.
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
assert(!CalledFunction &&
"Expected null called function in callsite for alias");
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
}
if (!CalledFunction)
continue;
if (CalledFunction == ProfiledCallee) {
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
}
FoundSingleCalleeChain = true;
FoundProfiledCalleeCount++;
FoundProfiledCalleeDepth += Depth;
if (Depth > FoundProfiledCalleeMaxDepth)
FoundProfiledCalleeMaxDepth = Depth;
SaveCallsiteInfo(&I, CalleeFunc);
} else if (findProfiledCalleeThroughTailCalls(
ProfiledCallee, CalledFunction, Depth + 1,
FoundCalleeChain, FoundMultipleCalleeChains)) {
// findProfiledCalleeThroughTailCalls should not have returned
// true if FoundMultipleCalleeChains.
assert(!FoundMultipleCalleeChains);
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
}
FoundSingleCalleeChain = true;
SaveCallsiteInfo(&I, CalleeFunc);
} else if (FoundMultipleCalleeChains)
return false;
}
}
return FoundSingleCalleeChain;
}
const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
auto *CB = dyn_cast<CallBase>(Call);
if (!CB->getCalledOperand() || CB->isIndirectCall())
return nullptr;
auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
if (Alias)
return dyn_cast<Function>(Alias->getAliasee());
return dyn_cast<Function>(CalleeVal);
}
bool ModuleCallsiteContextGraph::calleeMatchesFunc(
Instruction *Call, const Function *Func, const Function *CallerFunc,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
auto *CB = dyn_cast<CallBase>(Call);
if (!CB->getCalledOperand() || CB->isIndirectCall())
return false;
auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
if (CalleeFunc == Func)
return true;
auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
if (Alias && Alias->getAliasee() == Func)
return true;
// Recursively search for the profiled callee through tail calls starting with
// the actual Callee. The discovered tail call chain is saved in
// FoundCalleeChain, and we will fix up the graph to include these callsites
// after returning.
// FIXME: We will currently redo the same recursive walk if we find the same
// mismatched callee from another callsite. We can improve this with more
// bookkeeping of the created chain of new nodes for each mismatch.
unsigned Depth = 1;
bool FoundMultipleCalleeChains = false;
if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
FoundCalleeChain,
FoundMultipleCalleeChains)) {
LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
<< Func->getName() << " from " << CallerFunc->getName()
<< " that actually called " << CalleeVal->getName()
<< (FoundMultipleCalleeChains
? " (found multiple possible chains)"
: "")
<< "\n");
if (FoundMultipleCalleeChains)
FoundProfiledCalleeNonUniquelyCount++;
return false;
}
return true;
}
bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
Instruction *Call2) {
auto *CB1 = cast<CallBase>(Call1);
if (!CB1->getCalledOperand() || CB1->isIndirectCall())
return false;
auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
auto *CB2 = cast<CallBase>(Call2);
if (!CB2->getCalledOperand() || CB2->isIndirectCall())
return false;
auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
return CalleeFunc1 == CalleeFunc2;
}
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
bool &FoundMultipleCalleeChains) {
// Stop recursive search if we have already explored the maximum specified
// depth.
if (Depth > TailCallSearchDepth)
return false;
auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
// Make a CallsiteInfo for each discovered callee, if one hasn't already
// been synthesized.
if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
!FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
// StackIds is empty (we don't have debug info available in the index for
// these callsites)
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
CallsiteInfo *NewCallsiteInfo =
FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
FoundCalleeChain.push_back({NewCallsiteInfo, FS});
};
// Look for tail calls in this function, and check if they either call the
// profiled callee directly, or indirectly (via a recursive search).
// Only succeed if there is a single unique tail call chain found between the
// profiled caller and callee; otherwise we could perform incorrect cloning.
bool FoundSingleCalleeChain = false;
for (auto &S : CurCallee.getSummaryList()) {
if (!GlobalValue::isLocalLinkage(S->linkage()) &&
!isPrevailing(CurCallee.getGUID(), S.get()))
continue;
auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
if (!FS)
continue;
auto FSVI = CurCallee;
auto *AS = dyn_cast<AliasSummary>(S.get());
if (AS)
FSVI = AS->getAliaseeVI();
for (auto &CallEdge : FS->calls()) {
if (!CallEdge.second.hasTailCall())
continue;
if (CallEdge.first == ProfiledCallee) {
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
}
FoundSingleCalleeChain = true;
FoundProfiledCalleeCount++;
FoundProfiledCalleeDepth += Depth;
if (Depth > FoundProfiledCalleeMaxDepth)
FoundProfiledCalleeMaxDepth = Depth;
CreateAndSaveCallsiteInfo(CallEdge.first, FS);
// Add FS to FSToVIMap in case it isn't already there.
assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
FSToVIMap[FS] = FSVI;
} else if (findProfiledCalleeThroughTailCalls(
ProfiledCallee, CallEdge.first, Depth + 1,
FoundCalleeChain, FoundMultipleCalleeChains)) {
// findProfiledCalleeThroughTailCalls should not have returned
// true if FoundMultipleCalleeChains.
assert(!FoundMultipleCalleeChains);
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
}
FoundSingleCalleeChain = true;
CreateAndSaveCallsiteInfo(CallEdge.first, FS);
// Add FS to FSToVIMap in case it isn't already there.
assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
FSToVIMap[FS] = FSVI;
} else if (FoundMultipleCalleeChains)
return false;
}
}
return FoundSingleCalleeChain;
}
const FunctionSummary *
IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
if (Callee.getSummaryList().empty())
return nullptr;
return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
}
bool IndexCallsiteContextGraph::calleeMatchesFunc(
IndexCall &Call, const FunctionSummary *Func,
const FunctionSummary *CallerFunc,
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
// If there is no summary list then this is a call to an externally defined
// symbol.
AliasSummary *Alias =
Callee.getSummaryList().empty()
? nullptr
: dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
assert(FSToVIMap.count(Func));
auto FuncVI = FSToVIMap[Func];
if (Callee == FuncVI ||
// If callee is an alias, check the aliasee, since only function
// summary base objects will contain the stack node summaries and thus
// get a context node.
(Alias && Alias->getAliaseeVI() == FuncVI))
return true;
// Recursively search for the profiled callee through tail calls starting with
// the actual Callee. The discovered tail call chain is saved in
// FoundCalleeChain, and we will fix up the graph to include these callsites
// after returning.
// FIXME: We will currently redo the same recursive walk if we find the same
// mismatched callee from another callsite. We can improve this with more
// bookkeeping of the created chain of new nodes for each mismatch.
unsigned Depth = 1;
bool FoundMultipleCalleeChains = false;
if (!findProfiledCalleeThroughTailCalls(
FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
<< " from " << FSToVIMap[CallerFunc]
<< " that actually called " << Callee
<< (FoundMultipleCalleeChains
? " (found multiple possible chains)"
: "")
<< "\n");
if (FoundMultipleCalleeChains)
FoundProfiledCalleeNonUniquelyCount++;
return false;
}
return true;
}
bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
return Callee1 == Callee2;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
const {
print(dbgs());
dbgs() << "\n";
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
raw_ostream &OS) const {
OS << "Node " << this << "\n";
OS << "\t";
printCall(OS);
if (Recursive)
OS << " (recursive)";
OS << "\n";
if (!MatchingCalls.empty()) {
OS << "\tMatchingCalls:\n";
for (auto &MatchingCall : MatchingCalls) {
OS << "\t";
MatchingCall.print(OS);
OS << "\n";
}
}
OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
OS << "\tContextIds:";
// Make a copy of the computed context ids that we can sort for stability.
auto ContextIds = getContextIds();
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds)
OS << " " << Id;
OS << "\n";
OS << "\tCalleeEdges:\n";
for (auto &Edge : CalleeEdges)
OS << "\t\t" << *Edge << "\n";
OS << "\tCallerEdges:\n";
for (auto &Edge : CallerEdges)
OS << "\t\t" << *Edge << "\n";
if (!Clones.empty()) {
OS << "\tClones: " << llvm::interleaved(Clones) << "\n";
} else if (CloneOf) {
OS << "\tClone of " << CloneOf << "\n";
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
const {
print(dbgs());
dbgs() << "\n";
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
raw_ostream &OS) const {
OS << "Edge from Callee " << Callee << " to Caller: " << Caller
<< (IsBackedge ? " (BE)" : "")
<< " AllocTypes: " << getAllocTypeString(AllocTypes);
OS << " ContextIds:";
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds)
OS << " " << Id;
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
print(dbgs());
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
raw_ostream &OS) const {
OS << "Callsite Context Graph:\n";
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
for (const auto Node : nodes<GraphType>(this)) {
if (Node->isRemoved())
continue;
Node->print(OS);
OS << "\n";
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
raw_ostream &OS) const {
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
for (const auto Node : nodes<GraphType>(this)) {
if (Node->isRemoved())
continue;
if (!Node->IsAllocation)
continue;
DenseSet<uint32_t> ContextIds = Node->getContextIds();
auto AllocTypeFromCall = getAllocationCallType(Node->Call);
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds) {
auto TypeI = ContextIdToAllocationType.find(Id);
assert(TypeI != ContextIdToAllocationType.end());
auto CSI = ContextIdToContextSizeInfos.find(Id);
if (CSI != ContextIdToContextSizeInfos.end()) {
for (auto &Info : CSI->second) {
OS << "MemProf hinting: "
<< getAllocTypeString((uint8_t)TypeI->second)
<< " full allocation context " << Info.FullStackId
<< " with total size " << Info.TotalSize << " is "
<< getAllocTypeString(Node->AllocTypes) << " after cloning";
if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
<< " due to cold byte percent";
// Print the internal context id to aid debugging and visualization.
OS << " (context id " << Id << ")";
OS << "\n";
}
}
}
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
for (const auto Node : nodes<GraphType>(this)) {
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
for (auto &Edge : Node->CallerEdges)
checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
}
}
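// GraphTraits specialization enabling the generic graph utilities (e.g. the
// nodes() iteration used above and WriteGraph used for dot export) to
// operate on the context graph.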
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
using nodes_iterator =
mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
decltype(&getNode)>;
static nodes_iterator nodes_begin(GraphType G) {
return nodes_iterator(G->NodeOwner.begin(), &getNode);
}
static nodes_iterator nodes_end(GraphType G) {
return nodes_iterator(G->NodeOwner.end(), &getNode);
}
static NodeRef getEntryNode(GraphType G) {
return G->NodeOwner.begin()->get();
}
using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
static const ContextNode<DerivedCCG, FuncTy, CallTy> *
GetCallee(const EdgePtrTy &P) {
return P->Callee;
}
using ChildIteratorType =
mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
DerivedCCG, FuncTy, CallTy>>>::const_iterator,
decltype(&GetCallee)>;
static ChildIteratorType child_begin(NodeRef N) {
return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
}
static ChildIteratorType child_end(NodeRef N) {
return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
}
};
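// DOTGraphTraits specialization controlling labels, colors, highlighting,
// and node hiding when exporting the context graph to a dot file.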
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
: public DefaultDOTGraphTraits {
DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
// Enable highlighting if the user requested the full graph to be exported
// but provided an allocation id, or gave a context id and requested more
// than just that specific context to be exported.
DoHighlight =
(AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
(ContextIdForDot.getNumOccurrences() &&
DotGraphScope != DotScope::Context);
}
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
using GTraits = GraphTraits<GraphType>;
using NodeRef = typename GTraits::NodeRef;
using ChildIteratorType = typename GTraits::ChildIteratorType;
static std::string getNodeLabel(NodeRef Node, GraphType G) {
std::string LabelString =
(Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
Twine(Node->OrigStackOrAllocId))
.str();
LabelString += "\n";
if (Node->hasCall()) {
auto Func = G->NodeToCallingFunc.find(Node);
assert(Func != G->NodeToCallingFunc.end());
LabelString +=
G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
} else {
LabelString += "null call";
if (Node->Recursive)
LabelString += " (recursive)";
else
LabelString += " (external)";
}
return LabelString;
}
static std::string getNodeAttributes(NodeRef Node, GraphType G) {
auto ContextIds = Node->getContextIds();
// If highlighting is enabled, see if this node contains any of the context
// ids of interest. If so, use a different color and a larger fontsize
// (which makes the node larger as well).
bool Highlight = false;
if (DoHighlight) {
assert(ContextIdForDot.getNumOccurrences() ||
AllocIdForDot.getNumOccurrences());
if (ContextIdForDot.getNumOccurrences())
Highlight = ContextIds.contains(ContextIdForDot);
else
Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
}
std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
getContextIds(ContextIds) + "\"")
.str();
// Default fontsize is 14
if (Highlight)
AttributeString += ",fontsize=\"30\"";
AttributeString +=
(Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
.str();
if (Node->CloneOf) {
AttributeString += ",color=\"blue\"";
AttributeString += ",style=\"filled,bold,dashed\"";
} else
AttributeString += ",style=\"filled\"";
return AttributeString;
}
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
GraphType G) {
auto &Edge = *(ChildIter.getCurrent());
// If highlighting is enabled, see if this edge contains any of the context
// ids of interest. If so, use a different color and a heavier arrow size
// and weight (the larger weight makes the highlighted path straighter).
bool Highlight = false;
if (DoHighlight) {
assert(ContextIdForDot.getNumOccurrences() ||
AllocIdForDot.getNumOccurrences());
if (ContextIdForDot.getNumOccurrences())
Highlight = Edge->ContextIds.contains(ContextIdForDot);
else
Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
}
auto Color = getColor(Edge->AllocTypes, Highlight);
std::string AttributeString =
(Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
// fillcolor is the arrow head and color is the line
Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
"\"")
.str();
if (Edge->IsBackedge)
AttributeString += ",style=\"dotted\"";
// Default penwidth and weight are both 1.
if (Highlight)
AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
return AttributeString;
}
// Since the NodeOwner list includes nodes that are no longer connected to
// the graph, skip them here.
static bool isNodeHidden(NodeRef Node, GraphType G) {
if (Node->isRemoved())
return true;
// If a scope smaller than the full graph was requested, see if this node
// contains any of the context ids of interest.
if (DotGraphScope == DotScope::Alloc)
return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
if (DotGraphScope == DotScope::Context)
return !Node->getContextIds().contains(ContextIdForDot);
return false;
}
private:
static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
std::string IdString = "ContextIds:";
if (ContextIds.size() < 100) {
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds)
IdString += (" " + Twine(Id)).str();
} else {
IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
}
return IdString;
}
static std::string getColor(uint8_t AllocTypes, bool Highlight) {
// If DoHighlight is not enabled, we want to use the highlight colors for
// NotCold and Cold, and the non-highlight color for NotCold+Cold. This is
// both compatible with the color scheme before highlighting was supported,
// and for the NotCold+Cold color the non-highlight color is a bit more
// readable.
if (AllocTypes == (uint8_t)AllocationType::NotCold)
// Color "brown1" actually looks like a lighter red.
return !DoHighlight || Highlight ? "brown1" : "lightpink";
if (AllocTypes == (uint8_t)AllocationType::Cold)
return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
if (AllocTypes ==
((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
return Highlight ? "magenta" : "mediumorchid1";
return "gray";
}
static std::string getNodeId(NodeRef Node) {
std::stringstream SStream;
SStream << std::hex << "N0x" << (unsigned long long)Node;
std::string Result = SStream.str();
return Result;
}
// True if we should highlight a specific context or allocation's contexts in
// the emitted graph.
static bool DoHighlight;
};
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool DOTGraphTraits<
const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
false;
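// Export the graph to a dot file whose name incorporates the given Label and
// the configured DotFilePathPrefix.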
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
std::string Label) const {
WriteGraph(this, "", false, Label,
DotFilePathPrefix + "ccg." + Label + ".dot");
}
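// Create a new clone of Edge's callee, and move the given subset of Edge's
// context ids (or all of them if ContextIdsToMove is empty) onto the edge to
// the new clone.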
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
const std::shared_ptr<ContextEdge> &Edge,
DenseSet<uint32_t> ContextIdsToMove) {
ContextNode *Node = Edge->Callee;
assert(NodeToCallingFunc.count(Node));
ContextNode *Clone =
createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
Node->addClone(Clone);
Clone->MatchingCalls = Node->MatchingCalls;
moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
ContextIdsToMove);
return Clone;
}
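// Move the given subset of Edge's context ids (or all of them if
// ContextIdsToMove is empty) onto the existing clone NewCallee, moving or
// splitting Edge as needed and keeping alloc types and callee edges in sync.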
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
ContextNode *NewCallee, bool NewClone,
DenseSet<uint32_t> ContextIdsToMove) {
// NewCallee and Edge's current callee must be clones of the same original
// node (Edge's current callee may be the original node too).
assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
ContextNode *OldCallee = Edge->Callee;
// We might already have an edge to the new callee from earlier cloning for a
// different allocation. If one exists we will reuse it.
auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
// Callers will pass an empty ContextIdsToMove set when they want to move all
// of Edge's ids (i.e. the whole edge). Copy in Edge's ids for simplicity.
if (ContextIdsToMove.empty())
ContextIdsToMove = Edge->getContextIds();
// If we are moving all of Edge's ids, then just move the whole Edge.
// Otherwise only move the specified subset, to a new edge if needed.
if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
// First, update the alloc types on NewCallee from Edge.
// Do this before we potentially clear Edge's fields below!
NewCallee->AllocTypes |= Edge->AllocTypes;
// Moving the whole Edge.
if (ExistingEdgeToNewCallee) {
// Since we already have an edge to NewCallee, simply move the ids
// onto it, and remove the existing Edge.
ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
assert(Edge->ContextIds == ContextIdsToMove);
removeEdgeFromGraph(Edge.get());
} else {
// Otherwise just reconnect Edge to NewCallee.
Edge->Callee = NewCallee;
NewCallee->CallerEdges.push_back(Edge);
// Remove it from callee where it was previously connected.
OldCallee->eraseCallerEdge(Edge.get());
// Don't need to update Edge's context ids since we are simply
// reconnecting it.
}
} else {
// Only moving a subset of Edge's ids.
// Compute the alloc type of the subset of ids being moved.
auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
if (ExistingEdgeToNewCallee) {
// Since we already have an edge to NewCallee, simply move the ids
// onto it.
ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
} else {
// Otherwise, create a new edge to NewCallee for the ids being moved.
auto NewEdge = std::make_shared<ContextEdge>(
NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
Edge->Caller->CalleeEdges.push_back(NewEdge);
NewCallee->CallerEdges.push_back(NewEdge);
}
// In either case, need to update the alloc types on NewCallee, and remove
// those ids and update the alloc type on the original Edge.
NewCallee->AllocTypes |= CallerEdgeAllocType;
set_subtract(Edge->ContextIds, ContextIdsToMove);
Edge->AllocTypes = computeAllocType(Edge->ContextIds);
}
// Now walk the old callee node's callee edges and move Edge's context ids
// over to the corresponding edge into the clone (which is created here if
// this is a newly created clone).
for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
ContextNode *CalleeToUse = OldCalleeEdge->Callee;
// If this is a direct recursion edge, use NewCallee (the clone) as the
// callee as well, so that any edge updated/created here is also direct
// recursive.
if (CalleeToUse == OldCallee) {
// If this is a recursive edge, see if we already moved a recursive edge
// (which would have to have been this one) - if we were only moving a
// subset of context ids it would still be on OldCallee.
if (EdgeIsRecursive) {
assert(OldCalleeEdge == Edge);
continue;
}
CalleeToUse = NewCallee;
}
// The context ids moving to the new callee are the subset of this edge's
// context ids and the context ids on the caller edge being moved.
DenseSet<uint32_t> EdgeContextIdsToMove =
set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
OldCalleeEdge->AllocTypes =
computeAllocType(OldCalleeEdge->getContextIds());
if (!NewClone) {
// Update context ids / alloc type on corresponding edge to NewCallee.
// There is a chance this may not exist if we are reusing an existing
// clone, specifically during function assignment, where we would have
// removed none type edges after creating the clone. If we can't find
// a corresponding edge there, fall through to the cloning below.
if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
continue;
}
}
auto NewEdge = std::make_shared<ContextEdge>(
CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
EdgeContextIdsToMove);
NewCallee->CalleeEdges.push_back(NewEdge);
NewEdge->Callee->CallerEdges.push_back(NewEdge);
}
// Recompute the node alloc type now that its callee edges have been
// updated (since we will compute from those edges).
OldCallee->AllocTypes = OldCallee->computeAllocType();
// OldCallee alloc type should be None iff its context id set is now empty.
assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
OldCallee->emptyContextIds());
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
/*CheckEdges=*/false);
for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
/*CheckEdges=*/false);
}
}
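// Move the given callee edge to NewCaller, updating the old caller's caller
// edges accordingly (and, for direct recursive edges, making the edge
// recursive on NewCaller).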
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
ContextNode *NewCaller) {
auto *OldCallee = Edge->Callee;
auto *NewCallee = OldCallee;
// If this edge was direct recursive, make any new/updated edge also direct
// recursive to NewCaller.
bool Recursive = Edge->Caller == Edge->Callee;
if (Recursive)
NewCallee = NewCaller;
ContextNode *OldCaller = Edge->Caller;
OldCaller->eraseCalleeEdge(Edge.get());
// We might already have an edge to the new caller. If one exists we will
// reuse it.
auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
if (ExistingEdgeToNewCaller) {
// Since we already have an edge to NewCaller, simply move the ids
// onto it, and remove the existing Edge.
ExistingEdgeToNewCaller->getContextIds().insert_range(
Edge->getContextIds());
ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
Edge->ContextIds.clear();
Edge->AllocTypes = (uint8_t)AllocationType::None;
OldCallee->eraseCallerEdge(Edge.get());
} else {
// Otherwise just reconnect Edge to NewCaller.
Edge->Caller = NewCaller;
NewCaller->CalleeEdges.push_back(Edge);
if (Recursive) {
assert(NewCallee == NewCaller);
// In the case of (direct) recursive edges, we update the callee as well
// so that it becomes recursive on the new caller.
Edge->Callee = NewCallee;
NewCallee->CallerEdges.push_back(Edge);
OldCallee->eraseCallerEdge(Edge.get());
}
// Don't need to update Edge's context ids since we are simply
// reconnecting it.
}
// In either case, need to update the alloc types on NewCaller.
NewCaller->AllocTypes |= Edge->AllocTypes;
// Now walk the old caller node's caller edges and move Edge's context ids
// over to the corresponding edge into the node (which is created here if
// this is a newly created node). We can tell whether this is a newly created
// node by seeing if it has any caller edges yet.
#ifndef NDEBUG
bool IsNewNode = NewCaller->CallerEdges.empty();
#endif
// If we just moved a direct recursive edge, presumably its context ids should
// also flow out of OldCaller via some other non-recursive callee edge. We
// don't want to remove the recursive context ids from other caller edges yet,
// otherwise the context ids get into an inconsistent state on OldCaller.
// We will update these context ids on the non-recursive caller edge when and
// if they are updated on the non-recursive callee.
if (!Recursive) {
for (auto &OldCallerEdge : OldCaller->CallerEdges) {
auto OldCallerCaller = OldCallerEdge->Caller;
// The context ids moving to the new caller are the subset of this edge's
// context ids and the context ids on the callee edge being moved.
DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
OldCallerEdge->getContextIds(), Edge->getContextIds());
if (OldCaller == OldCallerCaller) {
OldCallerCaller = NewCaller;
// Don't actually move this one. The caller will move it directly via a
// call to this function with this as the Edge if it is appropriate to
// move to a different node that has a matching callee (itself).
continue;
}
set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
OldCallerEdge->AllocTypes =
computeAllocType(OldCallerEdge->getContextIds());
// In this function we expect that any pre-existing node already has edges
// from the same callers as the old node. That should be true in the
// current use case, where we will remove None-type edges after copying
// over all caller edges from the callee.
auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
// Since we would have skipped caller edges when moving a direct recursive
// edge, this may not hold true when recursive handling is enabled.
assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
if (ExistingCallerEdge) {
ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
ExistingCallerEdge->AllocTypes |=
computeAllocType(EdgeContextIdsToMove);
continue;
}
auto NewEdge = std::make_shared<ContextEdge>(
NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
EdgeContextIdsToMove);
NewCaller->CallerEdges.push_back(NewEdge);
NewEdge->Caller->CalleeEdges.push_back(NewEdge);
}
}
// Recompute the node alloc type now that its caller edges have been
// updated (since we will compute from those edges).
OldCaller->AllocTypes = OldCaller->computeAllocType();
// OldCaller alloc type should be None iff its context id set is now empty.
assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
OldCaller->emptyContextIds());
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
for (const auto &OldCallerEdge : OldCaller->CallerEdges)
checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
/*CheckEdges=*/false);
for (const auto &NewCallerEdge : NewCaller->CallerEdges)
checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
/*CheckEdges=*/false);
}
}
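// Remove None-type callee edges (those left with no context ids after
// cloning) from Node, then recursively from its clones and callers.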
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
recursivelyRemoveNoneTypeCalleeEdges(
ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
auto Inserted = Visited.insert(Node);
if (!Inserted.second)
return;
removeNoneTypeCalleeEdges(Node);
for (auto *Clone : Node->Clones)
recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
// The recursive call may remove some of this Node's caller edges.
// Iterate over a copy and skip any that were removed.
auto CallerEdges = Node->CallerEdges;
for (auto &Edge : CallerEdges) {
// Skip any that have been removed by an earlier recursive call.
if (Edge->isRemoved()) {
assert(!is_contained(Node->CallerEdges, Edge));
continue;
}
recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
}
}
// This is the standard DFS-based backedge discovery algorithm.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
// If we are cloning recursive contexts, find and mark backedges from all root
// callers, using the typical DFS-based backedge analysis.
if (!CloneRecursiveContexts)
return;
DenseSet<const ContextNode *> Visited;
DenseSet<const ContextNode *> CurrentStack;
for (auto &Entry : NonAllocationCallToContextNodeMap) {
auto *Node = Entry.second;
if (Node->isRemoved())
continue;
// It is a root if it doesn't have callers.
if (!Node->CallerEdges.empty())
continue;
markBackedges(Node, Visited, CurrentStack);
assert(CurrentStack.empty());
}
}
// Recursive helper for above markBackedges method.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseSet<const ContextNode *> &CurrentStack) {
auto I = Visited.insert(Node);
// We should only call this for unvisited nodes.
assert(I.second);
(void)I;
for (auto &CalleeEdge : Node->CalleeEdges) {
auto *Callee = CalleeEdge->Callee;
if (Visited.count(Callee)) {
// Since this was already visited we need to check if it is currently on
// the recursive stack in which case it is a backedge.
if (CurrentStack.count(Callee))
CalleeEdge->IsBackedge = true;
continue;
}
CurrentStack.insert(Callee);
markBackedges(Callee, Visited, CurrentStack);
CurrentStack.erase(Callee);
}
}
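// Identify needed clones, starting from each allocation node and walking up
// through its callers.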
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
DenseSet<const ContextNode *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap) {
Visited.clear();
identifyClones(Entry.second, Visited, Entry.second->getContextIds());
}
Visited.clear();
for (auto &Entry : AllocationCallToContextNodeMap)
recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
if (VerifyCCG)
check();
}
// Helper function to check whether an AllocType is cold, notcold, or both.
bool checkColdOrNotCold(uint8_t AllocType) {
return (AllocType == (uint8_t)AllocationType::Cold) ||
(AllocType == (uint8_t)AllocationType::NotCold) ||
(AllocType ==
((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
}
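// Recursive helper for above identifyClones method. First recursively visits
// Node's callers, then clones Node as needed so that the remaining caller
// edges have matching allocation types.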
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
const DenseSet<uint32_t> &AllocContextIds) {
if (VerifyNodes)
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
assert(!Node->CloneOf);
// If Node has a null call, then either it wasn't found in the module (regular
// LTO) or summary index (ThinLTO), or there were other conditions blocking
// cloning (e.g. recursion, or calls with multiple targets).
// Do this here so that we don't try to recursively clone callers below, which
// isn't useful at least for this node.
if (!Node->hasCall())
return;
// No need to look at any callers if allocation type already unambiguous.
if (hasSingleAllocType(Node->AllocTypes))
return;
#ifndef NDEBUG
auto Insert =
#endif
Visited.insert(Node);
// We should not have visited this node yet.
assert(Insert.second);
// The recursive call to identifyClones may delete the current edge from the
// CallerEdges vector. Make a copy and iterate on that, simpler than passing
// in an iterator and having recursive call erase from it. Other edges may
// also get removed during the recursion, which will have null Callee and
// Caller pointers (and are deleted later), so we skip those below.
{
auto CallerEdges = Node->CallerEdges;
for (auto &Edge : CallerEdges) {
// Skip any that have been removed by an earlier recursive call.
if (Edge->isRemoved()) {
assert(!is_contained(Node->CallerEdges, Edge));
continue;
}
// Defer backedges. See comments further below where these edges are
// handled during the cloning of this Node.
if (Edge->IsBackedge) {
// We should only mark these if cloning recursive contexts, where we
// need to do this deferral.
assert(CloneRecursiveContexts);
continue;
}
// Ignore any caller we previously visited via another edge.
if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
identifyClones(Edge->Caller, Visited, AllocContextIds);
}
}
}
// Check if we reached an unambiguous call or have only a single caller.
if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
return;
// We need to clone.
// Try to keep the original version as alloc type NotCold. This will make
// cases with indirect calls or any other situation with an unknown call to
// the original function get the default behavior. We do this by sorting the
// CallerEdges of the Node we will clone by alloc type.
//
// Give NotCold edge the lowest sort priority so those edges are at the end of
// the caller edges vector, and stay on the original version (since the below
// code clones greedily until it finds all remaining edges have the same type
// and leaves the remaining ones on the original Node).
//
// We shouldn't actually have any None type edges, so the sorting priority for
// that is arbitrary, and we assert in that case below.
const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
/*Cold*/ 1,
/*NotColdCold*/ 2};
llvm::stable_sort(Node->CallerEdges,
[&](const std::shared_ptr<ContextEdge> &A,
const std::shared_ptr<ContextEdge> &B) {
// Edges with non-empty context ids should be sorted
// before those with empty context ids.
if (A->ContextIds.empty())
// Either B's ContextIds are non-empty (in which case we
// should return false because B < A), or B's ContextIds
// are also empty, in which case they are equal, and we
// should maintain the original relative ordering.
return false;
if (B->ContextIds.empty())
return true;
if (A->AllocTypes == B->AllocTypes)
// Use the first context id for each edge as a
// tie-breaker.
return *A->ContextIds.begin() < *B->ContextIds.begin();
return AllocTypeCloningPriority[A->AllocTypes] <
AllocTypeCloningPriority[B->AllocTypes];
});
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
DenseSet<uint32_t> RecursiveContextIds;
assert(AllowRecursiveContexts || !CloneRecursiveContexts);
// If we are allowing recursive callsites, but have also disabled recursive
// contexts, look for context ids that show up in multiple caller edges.
if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
DenseSet<uint32_t> AllCallerContextIds;
for (auto &CE : Node->CallerEdges) {
// Resize to the largest set of caller context ids, since we know the
// final set will be at least that large.
AllCallerContextIds.reserve(CE->getContextIds().size());
for (auto Id : CE->getContextIds())
if (!AllCallerContextIds.insert(Id).second)
RecursiveContextIds.insert(Id);
}
}
// Iterate until we find no more opportunities for disambiguating the alloc
// types via cloning. In most cases this loop will terminate once the Node
// has a single allocation type, in which case no more cloning is needed.
// Iterate over a copy of Node's caller edges, since we may need to remove
// edges in the moveEdgeTo* methods, and this simplifies the handling and
// makes it less error-prone.
auto CallerEdges = Node->CallerEdges;
for (auto &CallerEdge : CallerEdges) {
// Skip any that have been removed by an earlier recursive call.
if (CallerEdge->isRemoved()) {
assert(!is_contained(Node->CallerEdges, CallerEdge));
continue;
}
assert(CallerEdge->Callee == Node);
// See if cloning the prior caller edge left this node with a single alloc
// type or a single caller. In that case no more cloning of Node is needed.
if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
break;
// If the caller was not successfully matched to a call in the IR/summary,
// there is no point in trying to clone for it as we can't update that call.
if (!CallerEdge->Caller->hasCall())
continue;
// Only need to process the ids along this edge pertaining to the given
// allocation.
auto CallerEdgeContextsForAlloc =
set_intersection(CallerEdge->getContextIds(), AllocContextIds);
if (!RecursiveContextIds.empty())
CallerEdgeContextsForAlloc =
set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
if (CallerEdgeContextsForAlloc.empty())
continue;
auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
// Compute the node callee edge alloc types corresponding to the context ids
// for this caller edge.
std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
for (auto &CalleeEdge : Node->CalleeEdges)
CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
// Don't clone if doing so will not disambiguate any alloc types amongst
// caller edges (including the callee edges that would be cloned).
// Otherwise we will simply move all edges to the clone.
//
// First check if by cloning we will disambiguate the caller allocation
// type from node's allocation type. Query allocTypeToUse so that we don't
// bother cloning to distinguish NotCold+Cold from NotCold. Note that
// neither of these should be None type.
//
// Then check if by cloning node at least one of the callee edges will be
// disambiguated by splitting out different context ids.
//
// However, always do the cloning if this is a backedge, in which case we
// have not yet cloned along this caller edge.
assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
if (!CallerEdge->IsBackedge &&
allocTypeToUse(CallerAllocTypeForAlloc) ==
allocTypeToUse(Node->AllocTypes) &&
allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
continue;
}
if (CallerEdge->IsBackedge) {
// We should only mark these if cloning recursive contexts, where we
// need to do this deferral.
assert(CloneRecursiveContexts);
DeferredBackedges++;
}
// If this is a backedge, we now do recursive cloning starting from its
// caller since we may have moved unambiguous caller contexts to a clone
// of this Node in a previous iteration of the current loop, giving more
// opportunity for cloning through the backedge. Because we sorted the
// caller edges earlier so that cold caller edges are first, we would have
// visited and cloned this node for any unambiguously cold non-recursive
// callers before any ambiguous backedge callers. Note that we don't do this
// if the caller is already cloned or visited during cloning (e.g. via a
// different context path from the allocation).
// TODO: Can we do better in the case where the caller was already visited?
if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
!Visited.count(CallerEdge->Caller)) {
const auto OrigIdCount = CallerEdge->getContextIds().size();
// Now do the recursive cloning of this backedge's caller, which was
// deferred earlier.
identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
removeNoneTypeCalleeEdges(CallerEdge->Caller);
// See if the recursive call to identifyClones moved the context ids to a
// new edge from this node to a clone of caller, and switch to looking at
// that new edge so that we clone Node for the new caller clone.
bool UpdatedEdge = false;
if (OrigIdCount > CallerEdge->getContextIds().size()) {
for (auto E : Node->CallerEdges) {
// Only interested in clones of the current edge's caller.
if (E->Caller->CloneOf != CallerEdge->Caller)
continue;
// See if this edge contains any of the context ids originally on the
// current caller edge.
auto CallerEdgeContextsForAllocNew =
set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
if (CallerEdgeContextsForAllocNew.empty())
continue;
// Make sure we don't pick a previously existing caller edge of this
// Node, which would be processed on a different iteration of the
// outer loop over the saved CallerEdges.
if (llvm::is_contained(CallerEdges, E))
continue;
// The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
// are updated further below for all cases where we just invoked
// identifyClones recursively.
CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
CallerEdge = E;
UpdatedEdge = true;
break;
}
}
// If cloning removed this edge (and we didn't update it to a new edge
// above), we're done with this edge. It's possible we moved all of the
// context ids to an existing clone, in which case there's no need to do
// further processing for them.
if (CallerEdge->isRemoved())
continue;
// Now we need to update the information used for the cloning decisions
// further below, as we may have modified edges and their context ids.
// Note if we changed the CallerEdge above we would have already updated
// the context ids.
if (!UpdatedEdge) {
CallerEdgeContextsForAlloc = set_intersection(
CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
if (CallerEdgeContextsForAlloc.empty())
continue;
}
// Update the other information that depends on the edges and on the now
// updated CallerEdgeContextsForAlloc.
CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
CalleeEdgeAllocTypesForCallerEdge.clear();
for (auto &CalleeEdge : Node->CalleeEdges) {
CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
}
}
// First see if we can use an existing clone. Check each clone and its
// callee edges for matching alloc types.
ContextNode *Clone = nullptr;
for (auto *CurClone : Node->Clones) {
if (allocTypeToUse(CurClone->AllocTypes) !=
allocTypeToUse(CallerAllocTypeForAlloc))
continue;
bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
hasSingleAllocType(CallerAllocTypeForAlloc);
// The above check should mean that if both have single alloc types that
// they should be equal.
assert(!BothSingleAlloc ||
CurClone->AllocTypes == CallerAllocTypeForAlloc);
// If either both have a single alloc type (which are the same), or if the
// clone's callee edges have the same alloc types as those for the current
// allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
// then we can reuse this clone.
if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
Clone = CurClone;
break;
}
}
// The edge iterator is adjusted when we move the CallerEdge to the clone.
if (Clone)
moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
CallerEdgeContextsForAlloc);
else
Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
// Sanity check that no alloc types on clone or its edges are None.
assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
}
// We should still have some context ids on the original Node.
assert(!Node->emptyContextIds());
// Sanity check that no alloc types on node or edges are None.
assert(Node->AllocTypes != (uint8_t)AllocationType::None);
if (VerifyNodes)
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
}
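// Annotate the allocation call in the IR with a "memprof" attribute recording
// the given allocation type, and emit an optimization remark.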
void ModuleCallsiteContextGraph::updateAllocationCall(
CallInfo &Call, AllocationType AllocType) {
std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
"memprof", AllocTypeString);
cast<CallBase>(Call.call())->addFnAttr(A);
OREGetter(Call.call()->getFunction())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
<< ore::NV("AllocationCall", Call.call()) << " in clone "
<< ore::NV("Caller", Call.call()->getFunction())
<< " marked with memprof allocation attribute "
<< ore::NV("Attribute", AllocTypeString));
}
void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
AllocationType AllocType) {
auto *AI = cast<AllocInfo *>(Call.call());
assert(AI);
assert(AI->Versions.size() > Call.cloneNo());
AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
}
AllocationType
ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
const auto *CB = cast<CallBase>(Call.call());
if (!CB->getAttributes().hasFnAttr("memprof"))
return AllocationType::None;
return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
? AllocationType::Cold
: AllocationType::NotCold;
}
AllocationType
IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
const auto *AI = cast<AllocInfo *>(Call.call());
assert(AI->Versions.size() > Call.cloneNo());
return (AllocationType)AI->Versions[Call.cloneNo()];
}
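// Update the IR call to target the assigned function clone (for non-zero
// clone numbers), and emit an optimization remark.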
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
if (CalleeFunc.cloneNo() > 0)
cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
OREGetter(CallerCall.call()->getFunction())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
<< ore::NV("Call", CallerCall.call()) << " in clone "
<< ore::NV("Caller", CallerCall.call()->getFunction())
<< " assigned to call function clone "
<< ore::NV("Callee", CalleeFunc.func()));
}
void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
auto *CI = cast<CallsiteInfo *>(CallerCall.call());
assert(CI &&
"Caller cannot be an allocation which should not have profiled calls");
assert(CI->Clones.size() > CallerCall.cloneNo());
CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
}
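// Clone the function containing Call in the IR, recording in CallMap the
// mapping from each original call with metadata to its copy in the clone.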
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
ModuleCallsiteContextGraph::cloneFunctionForCallsite(
FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
// Use existing LLVM facilities for cloning and obtaining Call in clone
ValueToValueMapTy VMap;
auto *NewFunc = CloneFunction(Func.func(), VMap);
std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
assert(!Func.func()->getParent()->getFunction(Name));
NewFunc->setName(Name);
for (auto &Inst : CallsWithMetadataInFunc) {
// This map always has the initial version in it.
assert(Inst.cloneNo() == 0);
CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
}
OREGetter(Func.func())
.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
<< "created clone " << ore::NV("NewFunction", NewFunc));
return {NewFunc, CloneNo};
}
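// For the index, function cloning is recorded by extending each call
// summary's Versions/Clones arrays rather than by cloning IR; the IR itself
// is updated later in the ThinLTO backend.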
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
IndexCallsiteContextGraph::cloneFunctionForCallsite(
FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
// Check how many clones we have of Call (and therefore function).
// The next clone number is the current size of versions array.
// Confirm this matches the CloneNo provided by the caller, which is based on
// the number of function clones we have.
assert(CloneNo == (isa<AllocInfo *>(Call.call())
? cast<AllocInfo *>(Call.call())->Versions.size()
: cast<CallsiteInfo *>(Call.call())->Clones.size()));
// Walk all the instructions in this function. Create a new version for
// each (by adding an entry to the Versions/Clones summary array), and copy
// over the version being called for the function clone being cloned here.
// Additionally, add an entry to the CallMap for the new function clone,
// mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
// to the new call clone.
for (auto &Inst : CallsWithMetadataInFunc) {
// This map always has the initial version in it.
assert(Inst.cloneNo() == 0);
if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
assert(AI->Versions.size() == CloneNo);
// We assign the allocation type later (in updateAllocationCall), just add
// an entry for it here.
AI->Versions.push_back(0);
} else {
auto *CI = cast<CallsiteInfo *>(Inst.call());
assert(CI && CI->Clones.size() == CloneNo);
// We assign the clone number later (in updateCall), just add an entry for
// it here.
CI->Clones.push_back(0);
}
CallMap[Inst] = {Inst.call(), CloneNo};
}
return {Func.func(), CloneNo};
}
// We perform cloning for each allocation node separately. However, this
// sometimes results in a situation where the same node calls multiple
// clones of the same callee, created for different allocations. This
// causes issues when assigning functions to these clones, as each node can
// in reality only call a single callee clone.
//
// To address this, before assigning functions, merge callee clone nodes as
// needed using a post order traversal from the allocations. We attempt to
// use existing clones as the merge node when legal, and to share them
// among callers with the same properties (callers calling the same set of
// callee clone nodes for the same allocations).
//
// Without this fix, in some cases incorrect function assignment will lead
// to calling the wrong allocation clone.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
if (!MergeClones)
return;
// Generate a map from context id to the associated allocation node for use
// when merging clones.
DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
for (auto &Entry : AllocationCallToContextNodeMap) {
auto *Node = Entry.second;
for (auto Id : Node->getContextIds())
ContextIdToAllocationNode[Id] = Node->getOrigNode();
for (auto *Clone : Node->Clones) {
for (auto Id : Clone->getContextIds())
ContextIdToAllocationNode[Id] = Clone->getOrigNode();
}
}
// Post order traversal starting from allocations to ensure each callsite
// calls a single clone of its callee. Callee nodes that are clones of each
// other are merged (via new merge nodes if needed) to achieve this.
DenseSet<const ContextNode *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap) {
auto *Node = Entry.second;
mergeClones(Node, Visited, ContextIdToAllocationNode);
// Make a copy so the recursive post order traversal that may create new
// clones doesn't mess up iteration. Note that the recursive traversal
// itself does not call mergeClones on any of these nodes, which are all
// (clones of) allocations.
auto Clones = Node->Clones;
for (auto *Clone : Clones)
mergeClones(Clone, Visited, ContextIdToAllocationNode);
}
if (DumpCCG) {
dbgs() << "CCG after merging:\n";
dbgs() << *this;
}
if (ExportToDot)
exportToDot("aftermerge");
if (VerifyCCG) {
check();
}
}
// Recursive helper for above mergeClones method.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
auto Inserted = Visited.insert(Node);
if (!Inserted.second)
return;
// Make a copy since the recursive call may move a caller edge to a new
// callee, messing up the iterator.
auto CallerEdges = Node->CallerEdges;
for (auto CallerEdge : CallerEdges) {
// Skip any caller edge moved onto a different callee during recursion.
if (CallerEdge->Callee != Node)
continue;
mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
}
// Merge for this node after we handle its callers.
mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
}
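// Merge each group of clones among Node's callees so that Node calls at most
// a single clone per original callee, sharing merge nodes with other callers
// where legal.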
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
// Ignore Node if we moved all of its contexts to clones.
if (Node->emptyContextIds())
return;
// First identify groups of clones among Node's callee edges, by building
// a map from each callee base node to the associated callee edges from Node.
MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
OrigNodeToCloneEdges;
for (const auto &E : Node->CalleeEdges) {
auto *Callee = E->Callee;
if (!Callee->CloneOf && Callee->Clones.empty())
continue;
ContextNode *Base = Callee->getOrigNode();
OrigNodeToCloneEdges[Base].push_back(E);
}
// Helper for callee edge sorting below. Return true if A's callee has fewer
// caller edges than B's callee, or, when tied, if A's callee is a clone and
// B's is not, or failing that, if A's first context id is smaller than B's.
auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
const std::shared_ptr<ContextEdge> &B) {
if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
if (A->Callee->CloneOf && !B->Callee->CloneOf)
return true;
else if (!A->Callee->CloneOf && B->Callee->CloneOf)
return false;
// Use the first context id for each edge as a
// tie-breaker.
return *A->ContextIds.begin() < *B->ContextIds.begin();
};
// Process each set of callee clones called by Node, performing the needed
// merging.
for (auto Entry : OrigNodeToCloneEdges) {
// CalleeEdges is the set of edges from Node reaching callees that are
// mutual clones of each other.
auto &CalleeEdges = Entry.second;
auto NumCalleeClones = CalleeEdges.size();
// A single edge means there is no merging needed.
if (NumCalleeClones == 1)
continue;
// Sort the CalleeEdges calling this group of clones in ascending order of
// their caller edge counts, putting the original non-clone node first in
// cases of a tie. This simplifies finding an existing node to use as the
// merge node.
llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
// Find other callers of the given set of callee edges that can
// share the same callee merge node. See the comments at this method
// definition for details.
DenseSet<ContextNode *> OtherCallersToShareMerge;
findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
OtherCallersToShareMerge);
// Now do the actual merging. Identify existing or create a new MergeNode
// during the first iteration. Move each callee over, along with edges from
// other callers we've determined above can share the same merge node.
ContextNode *MergeNode = nullptr;
DenseMap<ContextNode *, unsigned> CallerToMoveCount;
for (auto CalleeEdge : CalleeEdges) {
auto *OrigCallee = CalleeEdge->Callee;
// If we don't have a MergeNode yet (only happens on the first iteration,
// as a new one will be created when we go to move the first callee edge
// over as needed), see if we can use this callee.
if (!MergeNode) {
// If there are no other callers, simply use this callee.
if (CalleeEdge->Callee->CallerEdges.size() == 1) {
MergeNode = OrigCallee;
NonNewMergedNodes++;
continue;
}
// Otherwise, if we have identified other caller nodes that can share
// the merge node with Node, see if all of OrigCallee's callers are
// going to share the same merge node. In that case we can use callee
// (since all of its callers would move to the new merge node).
if (!OtherCallersToShareMerge.empty()) {
bool MoveAllCallerEdges = true;
for (auto CalleeCallerE : OrigCallee->CallerEdges) {
if (CalleeCallerE == CalleeEdge)
continue;
if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
MoveAllCallerEdges = false;
break;
}
}
// If we are going to move all callers over, we can use this callee as
// the MergeNode.
if (MoveAllCallerEdges) {
MergeNode = OrigCallee;
NonNewMergedNodes++;
continue;
}
}
}
// Move this callee edge, creating a new merge node if necessary.
if (MergeNode) {
assert(MergeNode != OrigCallee);
moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
/*NewClone*/ false);
} else {
MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
NewMergedNodes++;
}
// Now move all identified edges from other callers over to the merge node
// as well.
if (!OtherCallersToShareMerge.empty()) {
// Make and iterate over a copy of OrigCallee's caller edges because
// some of these will be moved off of the OrigCallee and that would mess
// up the iteration from OrigCallee.
auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
if (CalleeCallerE == CalleeEdge)
continue;
if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
continue;
CallerToMoveCount[CalleeCallerE->Caller]++;
moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
/*NewClone*/ false);
}
}
removeNoneTypeCalleeEdges(OrigCallee);
removeNoneTypeCalleeEdges(MergeNode);
}
}
}
// Look for other nodes that have edges to the same set of callee
// clones as the current Node. Those can share the eventual merge node
// (reducing cloning and binary size overhead) iff:
// - they have edges to the same set of callee clones
// - each callee edge reaches a subset of the same allocations as Node's
// corresponding edge to the same callee clone.
// The second requirement is to ensure that we don't undo any of the
// necessary cloning to distinguish contexts with different allocation
// behavior.
// FIXME: This is somewhat conservative, as we really just need to ensure
// that they don't reach the same allocations as contexts on edges from Node
// going to any of the *other* callee clones being merged. However, that
// requires more tracking and checking to get right.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
findOtherCallersToShareMerge(
ContextNode *Node,
std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
DenseSet<ContextNode *> &OtherCallersToShareMerge) {
auto NumCalleeClones = CalleeEdges.size();
// This map counts how many edges to the same callee clone exist for other
// caller nodes of each callee clone.
DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
// Counts the number of other caller nodes that have edges to all callee
// clones that don't violate the allocation context checking.
unsigned PossibleOtherCallerNodes = 0;
// We only need to look at other Caller nodes if the first callee edge has
// multiple callers (recall they are sorted in ascending order above).
if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
return;
// For each callee edge:
// - Collect the count of other caller nodes calling the same callees.
// - Collect the alloc nodes reached by contexts on each callee edge.
DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
for (auto CalleeEdge : CalleeEdges) {
assert(CalleeEdge->Callee->CallerEdges.size() > 1);
// For each other caller of the same callee, increment the count of
// edges reaching the same callee clone.
for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
if (CalleeCallerEdges->Caller == Node) {
assert(CalleeCallerEdges == CalleeEdge);
continue;
}
OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
// If this caller edge now reaches all of the same callee clones,
// increment the count of candidate other caller nodes.
if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
NumCalleeClones)
PossibleOtherCallerNodes++;
}
// Collect the alloc nodes reached by contexts on each callee edge, for
// later analysis.
for (auto Id : CalleeEdge->getContextIds()) {
auto *Alloc = ContextIdToAllocationNode.lookup(Id);
if (!Alloc) {
// FIXME: unclear why this happens occasionally, presumably
// imperfect graph updates possibly with recursion.
MissingAllocForContextId++;
continue;
}
CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
}
}
// Now walk the callee edges again, and make sure that for each candidate
// caller node all of its edges to the callees reach the same allocs (or
// a subset) as those along the corresponding callee edge from Node.
for (auto CalleeEdge : CalleeEdges) {
assert(CalleeEdge->Callee->CallerEdges.size() > 1);
// Stop if we do not have any (more) candidate other caller nodes.
if (!PossibleOtherCallerNodes)
break;
auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
// Check each other caller of this callee clone.
for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
// Not interested in the callee edge from Node itself.
if (CalleeCallerE == CalleeEdge)
continue;
// Skip any callers that didn't have callee edges to all the same
// callee clones.
if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
NumCalleeClones)
continue;
// Make sure that each context along edge from candidate caller node
// reaches an allocation also reached by this callee edge from Node.
for (auto Id : CalleeCallerE->getContextIds()) {
auto *Alloc = ContextIdToAllocationNode.lookup(Id);
if (!Alloc)
continue;
// If not, simply reset the map entry to 0 so caller is ignored, and
// reduce the count of candidate other caller nodes.
if (!CurCalleeAllocNodes.contains(Alloc)) {
OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
PossibleOtherCallerNodes--;
break;
}
}
}
}
if (!PossibleOtherCallerNodes)
return;
// Build the set of other caller nodes that can use the same callee merge
// node.
for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
if (Count != NumCalleeClones)
continue;
OtherCallersToShareMerge.insert(OtherCaller);
}
}
// This method assigns cloned callsites to functions, cloning the functions as
// needed. The assignment is greedy and proceeds roughly as follows:
//
// For each function Func:
// For each call with graph Node having clones:
// Initialize ClonesWorklist to Node and its clones
// Initialize NodeCloneCount to 0
// While ClonesWorklist is not empty:
// Clone = pop front ClonesWorklist
// NodeCloneCount++
// If Func has been cloned less than NodeCloneCount times:
// If NodeCloneCount is 1:
// Assign Clone to original Func
// Continue
// Create a new function clone
// If other callers not assigned to call a function clone yet:
// Assign them to call new function clone
// Continue
// Assign any other caller calling the cloned version to new clone
//
// For each caller of Clone:
// If caller is assigned to call a specific function clone:
// If we cannot assign Clone to that function clone:
// Create new callsite Clone NewClone
// Add NewClone to ClonesWorklist
// Continue
// Assign Clone to existing caller's called function clone
// Else:
// If Clone not already assigned to a function clone:
// Assign to first function clone without assignment
// Assign caller to selected function clone
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
bool Changed = false;
mergeClones();
// Keep track of the assignment of nodes (callsites) to function clones they
// call.
DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
// Update caller node to call function version CalleeFunc, by recording the
// assignment in CallsiteToCalleeFuncCloneMap.
auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
const FuncInfo &CalleeFunc) {
assert(Caller->hasCall());
CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
};
  // Walk all functions for which we saw calls with memprof metadata, and
  // handle cloning for each function's calls.
for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
FuncInfo OrigFunc(Func);
// Map from each clone of OrigFunc to a map of remappings of each call of
// interest (from original uncloned call to the corresponding cloned call in
// that function clone).
std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
for (auto &Call : CallsWithMetadata) {
ContextNode *Node = getNodeForInst(Call);
// Skip call if we do not have a node for it (all uses of its stack ids
// were either on inlined chains or pruned from the MIBs), or if we did
// not create any clones for it.
if (!Node || Node->Clones.empty())
continue;
assert(Node->hasCall() &&
"Not having a call should have prevented cloning");
// Track the assignment of function clones to clones of the current
// callsite Node being handled.
std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
// Assign callsite version CallsiteClone to function version FuncClone,
// and also assign (possibly cloned) Call to CallsiteClone.
auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
CallInfo &Call,
ContextNode *CallsiteClone,
bool IsAlloc) {
// Record the clone of callsite node assigned to this function clone.
FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
assert(FuncClonesToCallMap.count(FuncClone));
std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
CallInfo CallClone(Call);
if (auto It = CallMap.find(Call); It != CallMap.end())
CallClone = It->second;
CallsiteClone->setCall(CallClone);
// Need to do the same for all matching calls.
for (auto &MatchingCall : Node->MatchingCalls) {
CallInfo CallClone(MatchingCall);
if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
CallClone = It->second;
// Updates the call in the list.
MatchingCall = CallClone;
}
};
// Keep track of the clones of callsite Node that need to be assigned to
// function clones. This list may be expanded in the loop body below if we
// find additional cloning is required.
std::deque<ContextNode *> ClonesWorklist;
// Ignore original Node if we moved all of its contexts to clones.
if (!Node->emptyContextIds())
ClonesWorklist.push_back(Node);
llvm::append_range(ClonesWorklist, Node->Clones);
// Now walk through all of the clones of this callsite Node that we need,
// and determine the assignment to a corresponding clone of the current
// function (creating new function clones as needed).
unsigned NodeCloneCount = 0;
while (!ClonesWorklist.empty()) {
ContextNode *Clone = ClonesWorklist.front();
ClonesWorklist.pop_front();
NodeCloneCount++;
if (VerifyNodes)
checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
// Need to create a new function clone if we have more callsite clones
// than existing function clones, which would have been assigned to an
// earlier clone in the list (we assign callsite clones to function
// clones greedily).
if (FuncClonesToCallMap.size() < NodeCloneCount) {
// If this is the first callsite copy, assign to original function.
if (NodeCloneCount == 1) {
// Since FuncClonesToCallMap is empty in this case, no clones have
// been created for this function yet, and no callers should have
// been assigned a function clone for this callee node yet.
assert(llvm::none_of(
Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
return CallsiteToCalleeFuncCloneMap.count(E->Caller);
}));
// Initialize with empty call map, assign Clone to original function
// and its callers, and skip to the next clone.
FuncClonesToCallMap[OrigFunc] = {};
AssignCallsiteCloneToFuncClone(
OrigFunc, Call, Clone,
AllocationCallToContextNodeMap.count(Call));
for (auto &CE : Clone->CallerEdges) {
// Ignore any caller that does not have a recorded callsite Call.
if (!CE->Caller->hasCall())
continue;
RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
}
continue;
}
// First locate which copy of OrigFunc to clone again. If a caller
// of this callsite clone was already assigned to call a particular
// function clone, we need to redirect all of those callers to the
// new function clone, and update their other callees within this
// function.
FuncInfo PreviousAssignedFuncClone;
auto EI = llvm::find_if(
Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
return CallsiteToCalleeFuncCloneMap.count(E->Caller);
});
bool CallerAssignedToCloneOfFunc = false;
if (EI != Clone->CallerEdges.end()) {
const std::shared_ptr<ContextEdge> &Edge = *EI;
PreviousAssignedFuncClone =
CallsiteToCalleeFuncCloneMap[Edge->Caller];
CallerAssignedToCloneOfFunc = true;
}
// Clone function and save it along with the CallInfo map created
// during cloning in the FuncClonesToCallMap.
std::map<CallInfo, CallInfo> NewCallMap;
unsigned CloneNo = FuncClonesToCallMap.size();
assert(CloneNo > 0 && "Clone 0 is the original function, which "
"should already exist in the map");
FuncInfo NewFuncClone = cloneFunctionForCallsite(
OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
FunctionClonesAnalysis++;
Changed = true;
// If no caller callsites were already assigned to a clone of this
// function, we can simply assign this clone to the new func clone
// and update all callers to it, then skip to the next clone.
if (!CallerAssignedToCloneOfFunc) {
AssignCallsiteCloneToFuncClone(
NewFuncClone, Call, Clone,
AllocationCallToContextNodeMap.count(Call));
for (auto &CE : Clone->CallerEdges) {
// Ignore any caller that does not have a recorded callsite Call.
if (!CE->Caller->hasCall())
continue;
RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
}
continue;
}
// We may need to do additional node cloning in this case.
// Reset the CallsiteToCalleeFuncCloneMap entry for any callers
// that were previously assigned to call PreviousAssignedFuncClone,
// to record that they now call NewFuncClone.
// The none type edge removal may remove some of this Clone's caller
// edges, if it is reached via another of its caller's callees.
// Iterate over a copy and skip any that were removed.
auto CallerEdges = Clone->CallerEdges;
for (auto CE : CallerEdges) {
// Skip any that have been removed on an earlier iteration.
if (CE->isRemoved()) {
assert(!is_contained(Clone->CallerEdges, CE));
continue;
}
assert(CE);
// Ignore any caller that does not have a recorded callsite Call.
if (!CE->Caller->hasCall())
continue;
if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
// We subsequently fall through to later handling that
// will perform any additional cloning required for
// callers that were calling other function clones.
CallsiteToCalleeFuncCloneMap[CE->Caller] !=
PreviousAssignedFuncClone)
continue;
RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
// If we are cloning a function that was already assigned to some
// callers, then essentially we are creating new callsite clones
// of the other callsites in that function that are reached by those
// callers. Clone the other callees of the current callsite's caller
// that were already assigned to PreviousAssignedFuncClone
// accordingly. This is important since we subsequently update the
// calls from the nodes in the graph and their assignments to callee
// functions recorded in CallsiteToCalleeFuncCloneMap.
// The none type edge removal may remove some of this caller's
// callee edges, if it is reached via another of its callees.
// Iterate over a copy and skip any that were removed.
auto CalleeEdges = CE->Caller->CalleeEdges;
for (auto CalleeEdge : CalleeEdges) {
// Skip any that have been removed on an earlier iteration when
// cleaning up newly None type callee edges.
if (CalleeEdge->isRemoved()) {
assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
continue;
}
assert(CalleeEdge);
ContextNode *Callee = CalleeEdge->Callee;
              // Skip the current callsite, since we are looking for other
              // callsites Caller calls; also skip any callee that does not
              // have a recorded callsite Call.
if (Callee == Clone || !Callee->hasCall())
continue;
// Skip direct recursive calls. We don't need/want to clone the
// caller node again, and this loop will not behave as expected if
// we tried.
if (Callee == CalleeEdge->Caller)
continue;
ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
removeNoneTypeCalleeEdges(NewClone);
// Moving the edge may have resulted in some none type
// callee edges on the original Callee.
removeNoneTypeCalleeEdges(Callee);
assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
// If the Callee node was already assigned to call a specific
// function version, make sure its new clone is assigned to call
// that same function clone.
if (CallsiteToCalleeFuncCloneMap.count(Callee))
RecordCalleeFuncOfCallsite(
NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
// Update NewClone with the new Call clone of this callsite's Call
// created for the new function clone created earlier.
// Recall that we have already ensured when building the graph
// that each caller can only call callsites within the same
// function, so we are guaranteed that Callee Call is in the
// current OrigFunc.
// CallMap is set up as indexed by original Call at clone 0.
CallInfo OrigCall(Callee->getOrigNode()->Call);
OrigCall.setCloneNo(0);
std::map<CallInfo, CallInfo> &CallMap =
FuncClonesToCallMap[NewFuncClone];
assert(CallMap.count(OrigCall));
CallInfo NewCall(CallMap[OrigCall]);
assert(NewCall);
NewClone->setCall(NewCall);
// Need to do the same for all matching calls.
for (auto &MatchingCall : NewClone->MatchingCalls) {
CallInfo OrigMatchingCall(MatchingCall);
OrigMatchingCall.setCloneNo(0);
assert(CallMap.count(OrigMatchingCall));
CallInfo NewCall(CallMap[OrigMatchingCall]);
assert(NewCall);
// Updates the call in the list.
MatchingCall = NewCall;
}
}
}
// Fall through to handling below to perform the recording of the
// function for this callsite clone. This enables handling of cases
// where the callers were assigned to different clones of a function.
}
// See if we can use existing function clone. Walk through
// all caller edges to see if any have already been assigned to
// a clone of this callsite's function. If we can use it, do so. If not,
// because that function clone is already assigned to a different clone
// of this callsite, then we need to clone again.
// Basically, this checking is needed to handle the case where different
// caller functions/callsites may need versions of this function
// containing different mixes of callsite clones across the different
// callsites within the function. If that happens, we need to create
// additional function clones to handle the various combinations.
//
// Keep track of any new clones of this callsite created by the
// following loop, as well as any existing clone that we decided to
// assign this clone to.
std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
FuncInfo FuncCloneAssignedToCurCallsiteClone;
// Iterate over a copy of Clone's caller edges, since we may need to
// remove edges in the moveEdgeTo* methods, and this simplifies the
// handling and makes it less error-prone.
auto CloneCallerEdges = Clone->CallerEdges;
for (auto &Edge : CloneCallerEdges) {
// Skip removed edges (due to direct recursive edges updated when
// updating callee edges when moving an edge and subsequently
// removed by call to removeNoneTypeCalleeEdges on the Clone).
if (Edge->isRemoved())
continue;
// Ignore any caller that does not have a recorded callsite Call.
if (!Edge->Caller->hasCall())
continue;
// If this caller already assigned to call a version of OrigFunc, need
// to ensure we can assign this callsite clone to that function clone.
if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
FuncInfo FuncCloneCalledByCaller =
CallsiteToCalleeFuncCloneMap[Edge->Caller];
// First we need to confirm that this function clone is available
// for use by this callsite node clone.
//
// While FuncCloneToCurNodeCloneMap is built only for this Node and
// its callsite clones, one of those callsite clones X could have
// been assigned to the same function clone called by Edge's caller
// - if Edge's caller calls another callsite within Node's original
// function, and that callsite has another caller reaching clone X.
// We need to clone Node again in this case.
if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
Clone) ||
// Detect when we have multiple callers of this callsite that
// have already been assigned to specific, and different, clones
// of OrigFunc (due to other unrelated callsites in Func they
// reach via call contexts). Is this Clone of callsite Node
// assigned to a different clone of OrigFunc? If so, clone Node
// again.
(FuncCloneAssignedToCurCallsiteClone &&
FuncCloneAssignedToCurCallsiteClone !=
FuncCloneCalledByCaller)) {
                // We need to use a different newly created callsite clone, in
                // order to assign it to another new function clone on a
                // subsequent iteration over the clones worklist (extended
                // below).
// Note we specifically do not reset the
// CallsiteToCalleeFuncCloneMap entry for this caller, so that
// when this new clone is processed later we know which version of
// the function to copy (so that other callsite clones we have
// assigned to that function clone are properly cloned over). See
// comments in the function cloning handling earlier.
// Check if we already have cloned this callsite again while
// walking through caller edges, for a caller calling the same
// function clone. If so, we can move this edge to that new clone
// rather than creating yet another new clone.
if (FuncCloneToNewCallsiteCloneMap.count(
FuncCloneCalledByCaller)) {
ContextNode *NewClone =
FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
moveEdgeToExistingCalleeClone(Edge, NewClone);
// Cleanup any none type edges cloned over.
removeNoneTypeCalleeEdges(NewClone);
} else {
// Create a new callsite clone.
ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
removeNoneTypeCalleeEdges(NewClone);
FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
NewClone;
// Add to list of clones and process later.
ClonesWorklist.push_back(NewClone);
assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
}
// Moving the caller edge may have resulted in some none type
// callee edges.
removeNoneTypeCalleeEdges(Clone);
// We will handle the newly created callsite clone in a subsequent
// iteration over this Node's Clones.
continue;
}
// Otherwise, we can use the function clone already assigned to this
// caller.
if (!FuncCloneAssignedToCurCallsiteClone) {
FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
// Assign Clone to FuncCloneCalledByCaller
AssignCallsiteCloneToFuncClone(
FuncCloneCalledByCaller, Call, Clone,
AllocationCallToContextNodeMap.count(Call));
} else
// Don't need to do anything - callsite is already calling this
// function clone.
assert(FuncCloneAssignedToCurCallsiteClone ==
FuncCloneCalledByCaller);
} else {
// We have not already assigned this caller to a version of
// OrigFunc. Do the assignment now.
// First check if we have already assigned this callsite clone to a
// clone of OrigFunc for another caller during this iteration over
// its caller edges.
if (!FuncCloneAssignedToCurCallsiteClone) {
// Find first function in FuncClonesToCallMap without an assigned
// clone of this callsite Node. We should always have one
// available at this point due to the earlier cloning when the
// FuncClonesToCallMap size was smaller than the clone number.
for (auto &CF : FuncClonesToCallMap) {
if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
FuncCloneAssignedToCurCallsiteClone = CF.first;
break;
}
}
assert(FuncCloneAssignedToCurCallsiteClone);
// Assign Clone to FuncCloneAssignedToCurCallsiteClone
AssignCallsiteCloneToFuncClone(
FuncCloneAssignedToCurCallsiteClone, Call, Clone,
AllocationCallToContextNodeMap.count(Call));
} else
assert(FuncCloneToCurNodeCloneMap
[FuncCloneAssignedToCurCallsiteClone] == Clone);
// Update callers to record function version called.
RecordCalleeFuncOfCallsite(Edge->Caller,
FuncCloneAssignedToCurCallsiteClone);
}
}
}
if (VerifyCCG) {
checkNode<DerivedCCG, FuncTy, CallTy>(Node);
for (const auto &PE : Node->CalleeEdges)
checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
for (const auto &CE : Node->CallerEdges)
checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
for (auto *Clone : Node->Clones) {
checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
for (const auto &PE : Clone->CalleeEdges)
checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
for (const auto &CE : Clone->CallerEdges)
checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
}
}
}
}
uint8_t BothTypes =
(uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
auto UpdateCalls = [&](ContextNode *Node,
DenseSet<const ContextNode *> &Visited,
auto &&UpdateCalls) {
auto Inserted = Visited.insert(Node);
if (!Inserted.second)
return;
for (auto *Clone : Node->Clones)
UpdateCalls(Clone, Visited, UpdateCalls);
for (auto &Edge : Node->CallerEdges)
UpdateCalls(Edge->Caller, Visited, UpdateCalls);
// Skip if either no call to update, or if we ended up with no context ids
// (we moved all edges onto other clones).
if (!Node->hasCall() || Node->emptyContextIds())
return;
if (Node->IsAllocation) {
auto AT = allocTypeToUse(Node->AllocTypes);
    // If the allocation type is ambiguous, and more aggressive hinting has
    // been enabled via the MinClonedColdBytePercent flag, see if this
    // allocation should be hinted cold anyway because the fraction of bytes
    // allocated cold is at least the given threshold.
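    // For example (illustrative numbers): with MinClonedColdBytePercent set
    // to 80, an ambiguous allocation whose contexts allocated Total = 1000
    // bytes, of which TotalCold = 850 were cold, is hinted cold, since
    // 850 * 100 >= 1000 * 80.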
if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
!ContextIdToContextSizeInfos.empty()) {
uint64_t TotalCold = 0;
uint64_t Total = 0;
for (auto Id : Node->getContextIds()) {
auto TypeI = ContextIdToAllocationType.find(Id);
assert(TypeI != ContextIdToAllocationType.end());
auto CSI = ContextIdToContextSizeInfos.find(Id);
if (CSI != ContextIdToContextSizeInfos.end()) {
for (auto &Info : CSI->second) {
Total += Info.TotalSize;
if (TypeI->second == AllocationType::Cold)
TotalCold += Info.TotalSize;
}
}
}
if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
AT = AllocationType::Cold;
}
updateAllocationCall(Node->Call, AT);
assert(Node->MatchingCalls.empty());
return;
}
if (!CallsiteToCalleeFuncCloneMap.count(Node))
return;
auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
updateCall(Node->Call, CalleeFunc);
// Update all the matching calls as well.
for (auto &Call : Node->MatchingCalls)
updateCall(Call, CalleeFunc);
};
// Performs DFS traversal starting from allocation nodes to update calls to
// reflect cloning decisions recorded earlier. For regular LTO this will
// update the actual calls in the IR to call the appropriate function clone
// (and add attributes to allocation calls), whereas for ThinLTO the decisions
// are recorded in the summary entries.
DenseSet<const ContextNode *> Visited;
for (auto &Entry : AllocationCallToContextNodeMap)
UpdateCalls(Entry.second, Visited, UpdateCalls);
return Changed;
}
static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
&FuncToAliasMap) {
  // The first "clone" is the original copy; we should only call this if we
  // needed to create new clones.
assert(NumClones > 1);
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
VMaps.reserve(NumClones - 1);
FunctionsClonedThinBackend++;
for (unsigned I = 1; I < NumClones; I++) {
VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
auto *NewF = CloneFunction(&F, *VMaps.back());
FunctionClonesThinBackend++;
// Strip memprof and callsite metadata from clone as they are no longer
// needed.
for (auto &BB : *NewF) {
for (auto &Inst : BB) {
Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
}
}
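    // The clone keeps the original name plus a numbered suffix added by
    // getMemProfFuncName (e.g., clone 1 of "foo" is named "foo.memprof.1").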
std::string Name = getMemProfFuncName(F.getName(), I);
auto *PrevF = M.getFunction(Name);
if (PrevF) {
      // We might have created this when adjusting a callsite in another
      // function. It should be a declaration.
assert(PrevF->isDeclaration());
NewF->takeName(PrevF);
PrevF->replaceAllUsesWith(NewF);
PrevF->eraseFromParent();
} else
NewF->setName(Name);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
<< "created clone " << ore::NV("NewFunction", NewF));
// Now handle aliases to this function, and clone those as well.
if (!FuncToAliasMap.count(&F))
continue;
for (auto *A : FuncToAliasMap[&F]) {
std::string Name = getMemProfFuncName(A->getName(), I);
auto *PrevA = M.getNamedAlias(Name);
auto *NewA = GlobalAlias::create(A->getValueType(),
A->getType()->getPointerAddressSpace(),
A->getLinkage(), Name, NewF);
NewA->copyAttributesFrom(A);
if (PrevA) {
        // We might have created this when adjusting a callsite in another
        // function. It should be a declaration.
assert(PrevA->isDeclaration());
NewA->takeName(PrevA);
PrevA->replaceAllUsesWith(NewA);
PrevA->eraseFromParent();
}
}
}
return VMaps;
}
// Locate the summary for F. This is complicated by the fact that it might
// have been internalized or promoted.
static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
const ModuleSummaryIndex *ImportSummary,
const Function *CallingFunc = nullptr) {
// FIXME: Ideally we would retain the original GUID in some fashion on the
// function (e.g. as metadata), but for now do our best to locate the
// summary without that information.
ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
if (!TheFnVI)
    // See if the function was internalized, by checking the index directly
    // with its original name (this avoids the name adjustment done by
    // getGUID() for internal symbols).
TheFnVI = ImportSummary->getValueInfo(
GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
if (TheFnVI)
return TheFnVI;
// Now query with the original name before any promotion was performed.
StringRef OrigName =
ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
// When this pass is enabled, we always add thinlto_src_file provenance
// metadata to imported function definitions, which allows us to recreate the
// original internal symbol's GUID.
auto SrcFileMD = F.getMetadata("thinlto_src_file");
// If this is a call to an imported/promoted local for which we didn't import
// the definition, the metadata will not exist on the declaration. However,
// since we are doing this early, before any inlining in the LTO backend, we
// can simply look at the metadata on the calling function which must have
// been from the same module if F was an internal symbol originally.
if (!SrcFileMD && F.isDeclaration()) {
// We would only call this for a declaration for a direct callsite, in which
// case the caller would have provided the calling function pointer.
assert(CallingFunc);
SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
    // If this is a promoted local (OrigName != F.getName()), then since this
    // is a declaration, it must have been imported from a different module,
    // and therefore we should always find the metadata on its calling
    // function. Any call to a promoted local that came from this module
    // should still be a definition.
assert(SrcFileMD || OrigName == F.getName());
}
StringRef SrcFile = M.getSourceFileName();
if (SrcFileMD)
SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
std::string OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage, SrcFile);
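  // E.g., an internal function "foo" originally defined in "a.cpp" would get
  // the identifier "a.cpp;foo" (assuming the usual ';' delimiter used by
  // getGlobalIdentifier for local linkage symbols).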
TheFnVI = ImportSummary->getValueInfo(
GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
  // An internal function in the original module may have gotten a numbered
  // suffix if we imported an external function with the same name. This
  // happens automatically during IR linking to resolve naming conflicts. It
  // would have to still be internal in that case (otherwise it would have
  // been renamed on promotion, in which case we wouldn't have a naming
  // conflict).
if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
F.getName().contains('.')) {
OrigName = F.getName().rsplit('.').first;
OrigId = GlobalValue::getGlobalIdentifier(
OrigName, GlobalValue::InternalLinkage, SrcFile);
TheFnVI = ImportSummary->getValueInfo(
GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
}
// The only way we may not have a VI is if this is a declaration created for
// an imported reference. For distributed ThinLTO we may not have a VI for
// such declarations in the distributed summary.
assert(TheFnVI || F.isDeclaration());
return TheFnVI;
}
bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
Module &M) {
ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
Symtab = std::make_unique<InstrProfSymtab>();
// Don't add canonical names, to avoid multiple functions to the symtab
// when they both have the same root name with "." suffixes stripped.
// If we pick the wrong one then this could lead to incorrect ICP and calling
// a memprof clone that we don't actually create (resulting in linker unsats).
// What this means is that the GUID of the function (or its PGOFuncName
// metadata) *must* match that in the VP metadata to allow promotion.
// In practice this should not be a limitation, since local functions should
// have PGOFuncName metadata and global function names shouldn't need any
// special handling (they should not get the ".llvm.*" suffix that the
// canonicalization handling is attempting to strip).
if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
std::string SymtabFailure = toString(std::move(E));
M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
return false;
}
return true;
}
#ifndef NDEBUG
// Sanity check that the MIB stack ids match between the summary and
// instruction metadata.
static void checkAllocContextIds(
const AllocInfo &AllocNode, const MDNode *MemProfMD,
const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
const ModuleSummaryIndex *ImportSummary) {
auto MIBIter = AllocNode.MIBs.begin();
for (auto &MDOp : MemProfMD->operands()) {
assert(MIBIter != AllocNode.MIBs.end());
auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
auto *MIBMD = cast<const MDNode>(MDOp);
MDNode *StackMDNode = getMIBStackNode(MIBMD);
assert(StackMDNode);
CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
auto ContextIterBegin =
StackContext.beginAfterSharedPrefix(CallsiteContext);
    // Initialize LastStackContextId so that it cannot match the first context
    // id, ensuring the duplicate check below is skipped on the first
    // iteration.
uint64_t LastStackContextId =
(ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
: 0;
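    // Illustrative example: a context with direct recursion whose metadata
    // stack ids are [A, B, B, C] is compared as [A, B, C], matching the
    // deduplication performed when the summary ids were generated.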
for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
++ContextIter) {
// If this is a direct recursion, simply skip the duplicate
// entries, to be consistent with how the summary ids were
// generated during ModuleSummaryAnalysis.
if (LastStackContextId == *ContextIter)
continue;
LastStackContextId = *ContextIter;
assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
*ContextIter);
StackIdIndexIter++;
}
MIBIter++;
}
}
#endif
bool MemProfContextDisambiguation::applyImport(Module &M) {
assert(ImportSummary);
bool Changed = false;
// We also need to clone any aliases that reference cloned functions, because
// the modified callsites may invoke via the alias. Keep track of the aliases
// for each function.
std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
FuncToAliasMap;
for (auto &A : M.aliases()) {
auto *Aliasee = A.getAliaseeObject();
if (auto *F = dyn_cast<Function>(Aliasee))
FuncToAliasMap[F].insert(&A);
}
if (!initializeIndirectCallPromotionInfo(M))
return false;
for (auto &F : M) {
if (F.isDeclaration() || isMemProfClone(F))
continue;
OptimizationRemarkEmitter ORE(&F);
SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
bool ClonesCreated = false;
unsigned NumClonesCreated = 0;
auto CloneFuncIfNeeded = [&](unsigned NumClones) {
// We should at least have version 0 which is the original copy.
assert(NumClones > 0);
// If only one copy needed use original.
if (NumClones == 1)
return;
// If we already performed cloning of this function, confirm that the
// requested number of clones matches (the thin link should ensure the
// number of clones for each constituent callsite is consistent within
// each function), before returning.
if (ClonesCreated) {
assert(NumClonesCreated == NumClones);
return;
}
VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
// The first "clone" is the original copy, which doesn't have a VMap.
assert(VMaps.size() == NumClones - 1);
Changed = true;
ClonesCreated = true;
NumClonesCreated = NumClones;
};
auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
Function *CalledFunction) {
// Perform cloning if not yet done.
CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
assert(!isMemProfClone(*CalledFunction));
// Update the calls per the summary info.
// Save orig name since it gets updated in the first iteration
// below.
auto CalleeOrigName = CalledFunction->getName();
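      // Illustrative example: StackNode.Clones == {0, 2} would mean copy 0
      // (the original function) keeps calling the original callee, while
      // function clone 1 must call callee clone 2 (named via
      // getMemProfFuncName).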
for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
// Do nothing if this version calls the original version of its
// callee.
if (!StackNode.Clones[J])
continue;
auto NewF = M.getOrInsertFunction(
getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
CalledFunction->getFunctionType());
CallBase *CBClone;
// Copy 0 is the original function.
if (!J)
CBClone = CB;
else
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
CBClone->setCalledFunction(NewF);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
<< ore::NV("Call", CBClone) << " in clone "
<< ore::NV("Caller", CBClone->getFunction())
<< " assigned to call function clone "
<< ore::NV("Callee", NewF.getCallee()));
}
};
// Locate the summary for F.
ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
// If not found, this could be an imported local (see comment in
// findValueInfoForFunc). Skip for now as it will be cloned in its original
// module (where it would have been promoted to global scope so should
// satisfy any reference in this module).
if (!TheFnVI)
continue;
auto *GVSummary =
ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
if (!GVSummary) {
      // Must have been imported; use the summary which matches the definition
      // (there might be multiple if this was a linkonce_odr).
auto SrcModuleMD = F.getMetadata("thinlto_src_module");
assert(SrcModuleMD &&
"enable-import-metadata is needed to emit thinlto_src_module");
StringRef SrcModule =
dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
for (auto &GVS : TheFnVI.getSummaryList()) {
if (GVS->modulePath() == SrcModule) {
GVSummary = GVS.get();
break;
}
}
assert(GVSummary && GVSummary->modulePath() == SrcModule);
}
// If this was an imported alias skip it as we won't have the function
// summary, and it should be cloned in the original module.
if (isa<AliasSummary>(GVSummary))
continue;
auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
if (FS->allocs().empty() && FS->callsites().empty())
continue;
auto SI = FS->callsites().begin();
auto AI = FS->allocs().begin();
// To handle callsite infos synthesized for tail calls which have missing
// frames in the profiled context, map callee VI to the synthesized callsite
// info.
DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
// Iterate the callsites for this function in reverse, since we place all
// those synthesized for tail calls at the end.
for (auto CallsiteIt = FS->callsites().rbegin();
CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
auto &Callsite = *CallsiteIt;
// Stop as soon as we see a non-synthesized callsite info (see comment
// above loop). All the entries added for discovered tail calls have empty
// stack ids.
if (!Callsite.StackIdIndices.empty())
break;
MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
}
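    // Illustrative example: if B tail-calls C, B's frame is missing from the
    // profiled context for allocations in C. The thin link synthesized a
    // CallsiteInfo in B's summary for callee C with empty StackIdIndices, and
    // the map above lets us locate that record when we encounter the tail
    // call to C below.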
// Keeps track of needed ICP for the function.
SmallVector<ICallAnalysisData> ICallAnalysisInfo;
// Assume for now that the instructions are in the exact same order
// as when the summary was created, but confirm this is correct by
// matching the stack ids.
for (auto &BB : F) {
for (auto &I : BB) {
auto *CB = dyn_cast<CallBase>(&I);
// Same handling as when creating module summary.
if (!mayHaveMemprofSummary(CB))
continue;
auto *CalledValue = CB->getCalledOperand();
auto *CalledFunction = CB->getCalledFunction();
if (CalledValue && !CalledFunction) {
CalledValue = CalledValue->stripPointerCasts();
// Stripping pointer casts can reveal a called function.
CalledFunction = dyn_cast<Function>(CalledValue);
}
// Check if this is an alias to a function. If so, get the
// called aliasee for the checks below.
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
assert(!CalledFunction &&
"Expected null called function in callsite for alias");
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
}
CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
I.getMetadata(LLVMContext::MD_callsite));
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
// Include allocs that were already assigned a memprof function
// attribute in the statistics.
if (CB->getAttributes().hasFnAttr("memprof")) {
assert(!MemProfMD);
CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
? AllocTypeColdThinBackend++
: AllocTypeNotColdThinBackend++;
OrigAllocsThinBackend++;
AllocVersionsThinBackend++;
if (!MaxAllocVersionsThinBackend)
MaxAllocVersionsThinBackend = 1;
continue;
}
if (MemProfMD) {
// Consult the next alloc node.
assert(AI != FS->allocs().end());
auto &AllocNode = *(AI++);
#ifndef NDEBUG
checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
ImportSummary);
#endif
// Perform cloning if not yet done.
CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
OrigAllocsThinBackend++;
AllocVersionsThinBackend += AllocNode.Versions.size();
if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
MaxAllocVersionsThinBackend = AllocNode.Versions.size();
          // If there is only one version, that means we didn't end up
          // considering this function for cloning; in that case the alloc
          // will still be None type or should have gotten the default
          // NotCold. Skip it, but only after calling the clone helper above,
          // since the helper performs sanity checks confirming we haven't
          // already decided that cloning is needed. We might have a single
          // version that is cold due to the MinClonedColdBytePercent
          // heuristic, so make sure we don't skip in that case.
if (AllocNode.Versions.size() == 1 &&
(AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
assert((AllocationType)AllocNode.Versions[0] ==
AllocationType::NotCold ||
(AllocationType)AllocNode.Versions[0] ==
AllocationType::None);
UnclonableAllocsThinBackend++;
continue;
}
// All versions should have a singular allocation type.
assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
return Type == ((uint8_t)AllocationType::NotCold |
(uint8_t)AllocationType::Cold);
}));
// Update the allocation types per the summary info.
for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
// Ignore any that didn't get an assigned allocation type.
if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
continue;
AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
: AllocTypeNotColdThinBackend++;
std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
auto A = llvm::Attribute::get(F.getContext(), "memprof",
AllocTypeString);
CallBase *CBClone;
// Copy 0 is the original function.
if (!J)
CBClone = CB;
else
// Since VMaps are only created for new clones, we index with
// clone J-1 (J==0 is the original clone and does not have a VMaps
// entry).
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
CBClone->addFnAttr(A);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
<< ore::NV("AllocationCall", CBClone) << " in clone "
<< ore::NV("Caller", CBClone->getFunction())
<< " marked with memprof allocation attribute "
<< ore::NV("Attribute", AllocTypeString));
}
} else if (!CallsiteContext.empty()) {
if (!CalledFunction) {
#ifndef NDEBUG
// We should have skipped inline assembly calls.
auto *CI = dyn_cast<CallInst>(CB);
assert(!CI || !CI->isInlineAsm());
#endif
// We should have skipped direct calls via a Constant.
assert(CalledValue && !isa<Constant>(CalledValue));
// This is an indirect call, see if we have profile information and
// whether any clones were recorded for the profiled targets (that
// we synthesized CallsiteInfo summary records for when building the
// index).
auto NumClones =
recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
// Perform cloning if not yet done. This is done here in case
// we don't need to do ICP, but might need to clone this
// function as it is the target of other cloned calls.
if (NumClones)
CloneFuncIfNeeded(NumClones);
          } else {
// Consult the next callsite node.
assert(SI != FS->callsites().end());
auto &StackNode = *(SI++);
#ifndef NDEBUG
// Sanity check that the stack ids match between the summary and
// instruction metadata.
auto StackIdIndexIter = StackNode.StackIdIndices.begin();
for (auto StackId : CallsiteContext) {
assert(StackIdIndexIter != StackNode.StackIdIndices.end());
assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
StackId);
StackIdIndexIter++;
}
#endif
CloneCallsite(StackNode, CB, CalledFunction);
}
} else if (CB->isTailCall() && CalledFunction) {
// Locate the synthesized callsite info for the callee VI, if any was
// created, and use that for cloning.
ValueInfo CalleeVI =
findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
assert(Callsite != MapTailCallCalleeVIToCallsite.end());
CloneCallsite(Callsite->second, CB, CalledFunction);
}
}
}
}
// Now do any promotion required for cloning.
performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
}
// We skip some of the functions and instructions above, so remove all the
// metadata in a single sweep here.
for (auto &F : M) {
// We can skip memprof clones because createFunctionClones already strips
// the metadata from the newly created clones.
if (F.isDeclaration() || isMemProfClone(F))
continue;
for (auto &BB : F) {
for (auto &I : BB) {
if (!isa<CallBase>(I))
continue;
I.setMetadata(LLVMContext::MD_memprof, nullptr);
I.setMetadata(LLVMContext::MD_callsite, nullptr);
}
}
}
return Changed;
}
unsigned MemProfContextDisambiguation::recordICPInfo(
CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
ArrayRef<CallsiteInfo>::iterator &SI,
SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
// First see if we have profile information for this indirect call.
uint32_t NumCandidates;
uint64_t TotalCount;
auto CandidateProfileData =
ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
NumCandidates);
if (CandidateProfileData.empty())
return 0;
// Iterate through all of the candidate profiled targets along with the
// CallsiteInfo summary records synthesized for them when building the index,
// and see if any are cloned and/or refer to clones.
bool ICPNeeded = false;
unsigned NumClones = 0;
size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
for (const auto &Candidate : CandidateProfileData) {
#ifndef NDEBUG
auto CalleeValueInfo =
#endif
ImportSummary->getValueInfo(Candidate.Value);
    // We might not have a ValueInfo if this is a distributed ThinLTO backend
    // and we decided not to import that function.
assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
assert(SI != AllCallsites.end());
auto &StackNode = *(SI++);
// See if any of the clones of the indirect callsite for this
// profiled target should call a cloned version of the profiled
// target. We only need to do the ICP here if so.
ICPNeeded |= llvm::any_of(StackNode.Clones,
[](unsigned CloneNo) { return CloneNo != 0; });
// Every callsite in the same function should have been cloned the same
// number of times.
assert(!NumClones || NumClones == StackNode.Clones.size());
NumClones = StackNode.Clones.size();
}
if (!ICPNeeded)
return NumClones;
// Save information for ICP, which is performed later to avoid messing up the
// current function traversal.
ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
TotalCount, CallsiteInfoStartIndex});
return NumClones;
}
void MemProfContextDisambiguation::performICP(
Module &M, ArrayRef<CallsiteInfo> AllCallsites,
ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
OptimizationRemarkEmitter &ORE) {
// Now do any promotion required for cloning. Specifically, for each
// recorded ICP candidate (which was only recorded because one clone of that
// candidate should call a cloned target), we perform ICP (speculative
// devirtualization) for each clone of the callsite, and update its callee
// to the appropriate clone. Note that the ICP compares against the original
// version of the target, which is what is in the vtable.
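  // Conceptually (a simplified sketch, not literal transformation output),
  // each promoted call clone becomes a guarded direct call:
  //   if (%callee == @Target)          ; compare against the original target
  //     call void @Target.memprof.N()  ; but direct-call the selected clone
  //   else
  //     call void %callee()            ; fall back to the indirect call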
for (auto &Info : ICallAnalysisInfo) {
auto *CB = Info.CB;
auto CallsiteIndex = Info.CallsiteInfoStartIndex;
auto TotalCount = Info.TotalCount;
unsigned NumPromoted = 0;
unsigned NumClones = 0;
for (auto &Candidate : Info.CandidateProfileData) {
auto &StackNode = AllCallsites[CallsiteIndex++];
// All calls in the same function must have the same number of clones.
assert(!NumClones || NumClones == StackNode.Clones.size());
NumClones = StackNode.Clones.size();
// See if the target is in the module. If it wasn't imported, it is
// possible that this profile could have been collected on a different
// target (or version of the code), and we need to be conservative
// (similar to what is done in the ICP pass).
Function *TargetFunction = Symtab->getFunction(Candidate.Value);
if (TargetFunction == nullptr ||
// Any ThinLTO global dead symbol removal should have already
// occurred, so it should be safe to promote when the target is a
// declaration.
// TODO: Remove internal option once more fully tested.
(MemProfRequireDefinitionForPromotion &&
TargetFunction->isDeclaration())) {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
<< "Memprof cannot promote indirect call: target with md5sum "
<< ore::NV("target md5sum", Candidate.Value) << " not found";
});
// FIXME: See if we can use the new declaration importing support to
// at least get the declarations imported for this case. Hot indirect
// targets should have been imported normally, however.
continue;
}
// Check if legal to promote
const char *Reason = nullptr;
if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
<< "Memprof cannot promote indirect call to "
<< ore::NV("TargetFunction", TargetFunction)
<< " with count of " << ore::NV("TotalCount", TotalCount)
<< ": " << Reason;
});
continue;
}
assert(!isMemProfClone(*TargetFunction));
// Handle each call clone, applying ICP so that each clone directly
// calls the specified callee clone, guarded by the appropriate ICP
// check.
CallBase *CBClone = CB;
for (unsigned J = 0; J < NumClones; J++) {
// Copy 0 is the original function.
if (J > 0)
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
// We do the promotion using the original name, so that the comparison
// is against the name in the vtable. Then just below, change the new
// direct call to call the cloned function.
auto &DirectCall =
pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
TotalCount, isSamplePGO, &ORE);
auto *TargetToUse = TargetFunction;
          // Keep calling the original target if this version calls the
          // original version of its callee; otherwise redirect to the
          // requested clone below.
if (StackNode.Clones[J]) {
TargetToUse =
cast<Function>(M.getOrInsertFunction(
getMemProfFuncName(TargetFunction->getName(),
StackNode.Clones[J]),
TargetFunction->getFunctionType())
.getCallee());
}
DirectCall.setCalledFunction(TargetToUse);
ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
<< ore::NV("Call", CBClone) << " in clone "
<< ore::NV("Caller", CBClone->getFunction())
<< " promoted and assigned to call function clone "
<< ore::NV("Callee", TargetToUse));
}
// Update TotalCount (all clones should get same count above)
TotalCount -= Candidate.Count;
NumPromoted++;
}
// Adjust the MD.prof metadata for all clones, now that we have the new
// TotalCount and the number promoted.
CallBase *CBClone = CB;
for (unsigned J = 0; J < NumClones; J++) {
// Copy 0 is the original function.
if (J > 0)
CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
// First delete the old one.
CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
      // If all were promoted, we don't need the MD.prof metadata.
      // Otherwise we need to annotate it with the un-promoted records again.
if (TotalCount != 0)
annotateValueSite(
M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
}
}
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
if (DumpCCG) {
dbgs() << "CCG before cloning:\n";
dbgs() << *this;
}
if (ExportToDot)
exportToDot("postbuild");
  if (VerifyCCG)
    check();
identifyClones();
  if (VerifyCCG)
    check();
if (DumpCCG) {
dbgs() << "CCG after cloning:\n";
dbgs() << *this;
}
if (ExportToDot)
exportToDot("cloned");
bool Changed = assignFunctions();
if (DumpCCG) {
dbgs() << "CCG after assigning function clones:\n";
dbgs() << *this;
}
if (ExportToDot)
exportToDot("clonefuncassign");
if (MemProfReportHintedSizes)
printTotalSizes(errs());
return Changed;
}
bool MemProfContextDisambiguation::processModule(
Module &M,
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
// If we have an import summary, then the cloning decisions were made during
// the thin link on the index. Apply them and return.
if (ImportSummary)
return applyImport(M);
// TODO: If/when other types of memprof cloning are enabled beyond just for
// hot and cold, we will need to change this to individually control the
// AllocationType passed to addStackNodesForMIB during CCG construction.
// Note that we specifically check this after applying imports above, so that
// the option isn't needed to be passed to distributed ThinLTO backend
// clang processes, which won't necessarily have visibility into the linker
// dependences. Instead the information is communicated from the LTO link to
// the backends via the combined summary index.
if (!SupportsHotColdNew)
return false;
ModuleCallsiteContextGraph CCG(M, OREGetter);
return CCG.process();
}
MemProfContextDisambiguation::MemProfContextDisambiguation(
const ModuleSummaryIndex *Summary, bool isSamplePGO)
: ImportSummary(Summary), isSamplePGO(isSamplePGO) {
// Check the dot graph printing options once here, to make sure we have valid
// and expected combinations.
if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
llvm::report_fatal_error(
"-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
if (DotGraphScope == DotScope::Context &&
!ContextIdForDot.getNumOccurrences())
llvm::report_fatal_error(
"-memprof-dot-scope=context requires -memprof-dot-context-id");
if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
ContextIdForDot.getNumOccurrences())
llvm::report_fatal_error(
"-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
"-memprof-dot-context-id");
if (ImportSummary) {
// The MemProfImportSummary should only be used for testing ThinLTO
// distributed backend handling via opt, in which case we don't have a
// summary from the pass pipeline.
assert(MemProfImportSummary.empty());
return;
}
if (MemProfImportSummary.empty())
return;
auto ReadSummaryFile =
errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
if (!ReadSummaryFile) {
logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
"Error loading file '" + MemProfImportSummary +
"': ");
return;
}
auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
if (!ImportSummaryForTestingOrErr) {
logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
"Error parsing file '" + MemProfImportSummary +
"': ");
return;
}
ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
ImportSummary = ImportSummaryForTesting.get();
}
PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
};
if (!processModule(M, OREGetter))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
void MemProfContextDisambiguation::run(
ModuleSummaryIndex &Index,
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing) {
// TODO: If/when other types of memprof cloning are enabled beyond just for
// hot and cold, we will need to change this to individually control the
// AllocationType passed to addStackNodesForMIB during CCG construction.
// The index was set from the option, so these should be in sync.
assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
if (!SupportsHotColdNew)
return;
IndexCallsiteContextGraph CCG(Index, isPrevailing);
CCG.process();
}