blob: b9d84d583f49573c551c2c2d5ad3e2fd92c643d2 [file] [log] [blame]
//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;
#define DEBUG_TYPE "memprof-context-disambiguation"
// Counters reported under DEBUG_TYPE. Each description string states whether
// the counter is updated during whole program analysis or during the ThinLTO
// backend.

// Function cloning counters.
STATISTIC(FunctionClonesAnalysis,
"Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
"Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
"Number of functions that had clones created during ThinLTO backend");
// Allocation type counters.
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
"cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
"during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
"Number of not cold static allocations (possibly cloned) during "
"ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
"(possibly cloned) during ThinLTO backend");
// Allocation version counters for the ThinLTO backend.
STATISTIC(OrigAllocsThinBackend,
"Number of original (not cloned) allocations with memprof profiles "
"during ThinLTO backend");
STATISTIC(
AllocVersionsThinBackend,
"Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
"Maximum number of allocation versions created for an original "
"allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
"Number of unclonable ambigous allocations during ThinLTO backend");
// Counters for callee mismatches and for the search through tail calls.
STATISTIC(RemovedEdgesWithMismatchedCallees,
"Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
"Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
"Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
"Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
"Number of profiled callees found via multiple tail call chains");
// File-local, hidden command line options controlling debugging output,
// verification and search limits for this pass.

// Path prefix used for the dot files; defaults to the empty string.
static cl::opt<std::string> DotFilePathPrefix(
"memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
cl::value_desc("filename"),
cl::desc("Specify the path prefix of the MemProf dot files."));
// Off by default.
static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
cl::Hidden,
cl::desc("Export graph to dot files."));
// Off by default.
static cl::opt<bool>
DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
cl::desc("Dump CallingContextGraph to stdout after each stage."));
// Off by default.
static cl::opt<bool>
VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
cl::desc("Perform verification checks on CallingContextGraph."));
// Off by default.
static cl::opt<bool>
VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
cl::desc("Perform frequent verification checks on nodes."));
// No default value is given for this option.
static cl::opt<std::string> MemProfImportSummary(
"memprof-import-summary",
cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
cl::Hidden);
// Bounds the recursive search through tail calls; defaults to 5.
static cl::opt<unsigned>
TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
cl::Hidden,
cl::desc("Max depth to recursively search for missing "
"frames through tail calls."));
// Options defined in namespace llvm without `static`, so they have external
// linkage (unlike the file-local options above).
namespace llvm {
// Master switch for this transformation; off by default.
cl::opt<bool> EnableMemProfContextDisambiguation(
"enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
"supports-hot-cold-new", cl::init(false), cl::Hidden,
cl::desc("Linking with hot/cold operator new interfaces"));
} // namespace llvm
namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
CallsiteContextGraph() = default;
CallsiteContextGraph(const CallsiteContextGraph &) = default;
CallsiteContextGraph(CallsiteContextGraph &&) = default;
/// Main entry point to perform analysis and transformations on graph.
bool process();
/// Perform cloning on the graph necessary to uniquely identify the allocation
/// behavior of an allocation based on its context.
void identifyClones();
/// Assign callsite clones to functions, cloning functions as needed to
/// accommodate the combinations of their callsite clones reached by callers.
/// For regular LTO this clones functions and callsites in the IR, but for
/// ThinLTO the cloning decisions are noted in the summaries and later applied
/// in applyImport.
bool assignFunctions();
void dump() const;
void print(raw_ostream &OS) const;
friend raw_ostream &operator<<(raw_ostream &OS,
const CallsiteContextGraph &CCG) {
CCG.print(OS);
return OS;
}
friend struct GraphTraits<
const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
friend struct DOTGraphTraits<
const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
void exportToDot(std::string Label) const;
/// Represents a function clone via FuncTy pointer and clone number pair.
struct FuncInfo final
: public std::pair<FuncTy *, unsigned /*Clone number*/> {
using Base = std::pair<FuncTy *, unsigned>;
FuncInfo(const Base &B) : Base(B) {}
FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
explicit operator bool() const { return this->first != nullptr; }
FuncTy *func() const { return this->first; }
unsigned cloneNo() const { return this->second; }
};
/// Identifies one clone of a callsite as a (CallTy, clone number) pair.
struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
  using Base = std::pair<CallTy, unsigned>;

  CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
      : Base(Call, CloneNo) {}
  CallInfo(const Base &Pair) : Base(Pair) {}

  /// Accessors for the two halves of the pair.
  CallTy call() const { return this->first; }
  unsigned cloneNo() const { return this->second; }
  void setCloneNo(unsigned N) { this->second = N; }

  /// True when the call half of the pair is non-null.
  explicit operator bool() const { return static_cast<bool>(this->first); }

  /// Prints the call followed by its clone number, or "null Call" when unset.
  void print(raw_ostream &OS) const {
    if (*this) {
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
      return;
    }
    // A null call is expected to still have clone number 0.
    assert(!cloneNo());
    OS << "null Call";
  }

  /// Prints to the debug stream, terminated by a newline.
  void dump() const {
    print(dbgs());
    dbgs() << "\n";
  }

  friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
    Call.print(OS);
    return OS;
  }
};
struct ContextEdge;
/// Node in the Callsite Context Graph
struct ContextNode {
// Keep this for now since in the IR case where we have an Instruction* it
// is not as immediately discoverable. Used for printing richer information
// when dumping graph.
bool IsAllocation;
// Keeps track of when the Call was reset to null because there was
// recursion.
bool Recursive = false;
// The corresponding allocation or interior call.
CallInfo Call;
// For alloc nodes this is a unique id assigned when constructed, and for
// callsite stack nodes it is the original stack id when the node is
// constructed from the memprof MIB metadata on the alloc nodes. Note that
// this is only used when matching callsite metadata onto the stack nodes
// created when processing the allocation memprof MIBs, and for labeling
// nodes in the dot graph. Therefore we don't bother to assign a value for
// clones.
uint64_t OrigStackOrAllocId = 0;
// This will be formed by ORing together the AllocationType enum values
// for contexts including this node.
uint8_t AllocTypes = 0;
// Edges to all callees in the profiled call stacks.
// TODO: Should this be a map (from Callee node) for more efficient lookup?
std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
// Edges to all callers in the profiled call stacks.
// TODO: Should this be a map (from Caller node) for more efficient lookup?
std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
// The set of IDs for contexts including this node.
DenseSet<uint32_t> ContextIds;
// List of clones of this ContextNode, initially empty.
std::vector<ContextNode *> Clones;
// If a clone, points to the original uncloned node.
ContextNode *CloneOf = nullptr;
ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
ContextNode(bool IsAllocation, CallInfo C)
: IsAllocation(IsAllocation), Call(C) {}
void addClone(ContextNode *Clone) {
if (CloneOf) {
CloneOf->Clones.push_back(Clone);
Clone->CloneOf = CloneOf;
} else {
Clones.push_back(Clone);
assert(!Clone->CloneOf);
Clone->CloneOf = this;
}
}
ContextNode *getOrigNode() {
if (!CloneOf)
return this;
return CloneOf;
}
void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
unsigned int ContextId);
ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
void eraseCalleeEdge(const ContextEdge *Edge);
void eraseCallerEdge(const ContextEdge *Edge);
void setCall(CallInfo C) { Call = C; }
bool hasCall() const { return (bool)Call.call(); }
void printCall(raw_ostream &OS) const { Call.print(OS); }
// True if this node was effectively removed from the graph, in which case
// its context id set, caller edges, and callee edges should all be empty.
bool isRemoved() const {
// Note that we can have non-empty context ids with empty caller and
// callee edges if the graph ends up with a single node.
if (ContextIds.empty())
assert(CalleeEdges.empty() && CallerEdges.empty() &&
"Context ids empty but at least one of callee and caller edges "
"were not!");
return ContextIds.empty();
}
void dump() const;
void print(raw_ostream &OS) const;
friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
Node.print(OS);
return OS;
}
};
/// Edge in the Callsite Context Graph from a ContextNode N to a caller or
/// callee.
struct ContextEdge {
  // The two endpoints of the edge.
  ContextNode *Callee;
  ContextNode *Caller;

  // This will be formed by ORing together the AllocationType enum values
  // for contexts including this edge.
  uint8_t AllocTypes = 0;

  // The set of IDs for contexts including this edge.
  DenseSet<uint32_t> ContextIds;

  /// \param Callee     node on the callee side of the edge.
  /// \param Caller     node on the caller side of the edge.
  /// \param AllocType  initial value of AllocTypes.
  /// \param ContextIds initial context id set. It is taken by value and moved
  ///                   into the member, so the set is not copied a second
  ///                   time.
  ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
              DenseSet<uint32_t> ContextIds)
      : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
        ContextIds(std::move(ContextIds)) {}

  DenseSet<uint32_t> &getContextIds() { return ContextIds; }

  void dump() const;
  void print(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
    Edge.print(OS);
    return OS;
  }
};
/// Helpers to remove callee edges that have allocation type None (due to not
/// carrying any context ids) after transformations.
void removeNoneTypeCalleeEdges(ContextNode *Node);
void
recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
DenseSet<const ContextNode *> &Visited);
protected:
/// Get a list of nodes corresponding to the stack ids in the given callsite
/// context.
template <class NodeT, class IteratorT>
std::vector<uint64_t>
getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
/// Adds nodes for the given allocation and any stack ids on its memprof MIB
/// metadata (or summary).
ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
/// Adds nodes for the given MIB stack ids.
template <class NodeT, class IteratorT>
void addStackNodesForMIB(ContextNode *AllocNode,
CallStack<NodeT, IteratorT> &StackContext,
CallStack<NodeT, IteratorT> &CallsiteContext,
AllocationType AllocType);
/// Matches all callsite metadata (or summary) to the nodes created for
/// allocation memprof MIB metadata, synthesizing new nodes to reflect any
/// inlining performed on those callsite instructions.
void updateStackNodes();
/// Update graph to conservatively handle any callsite stack nodes that target
/// multiple different callee target functions.
void handleCallsitesWithMultipleTargets();
/// Save lists of calls with MemProf metadata in each function, for faster
/// iteration.
MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
/// Map from callsite node to the enclosing caller function.
std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
private:
using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
const FuncTy *, DenseSet<uint32_t>>;
/// Assigns the given Node to calls at or inlined into the location with
/// the Node's stack id, after post order traversing and processing its
/// caller nodes. Uses the call information recorded in the given
/// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
/// as needed. Called by updateStackNodes which sets up the given
/// StackIdToMatchingCalls map.
void assignStackNodesPostOrder(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
/// Duplicates the given set of context ids, updating the provided
/// map from each original id with the newly generated context ids,
/// and returning the new duplicated id set.
DenseSet<uint32_t> duplicateContextIds(
const DenseSet<uint32_t> &StackSequenceContextIds,
DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
/// Propagates all duplicated context ids across the graph.
void propagateDuplicateContextIds(
const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
/// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
/// else to its callers. Also updates OrigNode's edges to remove any context
/// ids moved to the newly created edge.
void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
bool TowardsCallee);
/// Get the stack id corresponding to the given Id or Index (for IR this will
/// return itself, for a summary index this will return the id recorded in the
/// index for that stack id index value).
uint64_t getStackId(uint64_t IdOrIndex) const {
return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
}
/// Returns true if the given call targets the callee of the given edge, or if
/// we were able to identify the call chain through intermediate tail calls.
/// In the latter case new context nodes are added to the graph for the
/// identified tail calls, and their synthesized nodes are added to
/// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
/// next element after the input position (either incremented or updated after
/// removing the old edge).
bool
calleesMatch(CallTy Call, EdgeIter &EI,
MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
/// Asks the derived graph whether the given call targets the given function,
/// or whether a call chain through intermediate tail calls could be
/// identified (in which case FoundCalleeChain will be populated).
bool calleeMatchesFunc(
    CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
    std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
  auto *Derived = static_cast<DerivedCCG *>(this);
  return Derived->calleeMatchesFunc(Call, Func, CallerFunc, FoundCalleeChain);
}
/// Forwards to the derived graph to get the list of stack ids in the given
/// callsite's context that have corresponding context nodes.
std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
  auto *Derived = static_cast<DerivedCCG *>(this);
  return Derived->getStackIdsWithContextNodesForCall(Call);
}
/// Forwards to the derived graph to get the last stack id in the context for
/// the given callsite.
uint64_t getLastStackId(CallTy Call) {
  auto *Derived = static_cast<DerivedCCG *>(this);
  return Derived->getLastStackId(Call);
}
/// Bumps the cold / not-cold allocation statistic for AllocType, then has
/// the derived graph update the allocation call to record the type of
/// allocated memory.
void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
  if (AllocType == AllocationType::Cold)
    AllocTypeCold++;
  else
    AllocTypeNotCold++;
  auto *Derived = static_cast<DerivedCCG *>(this);
  Derived->updateAllocationCall(Call, AllocType);
}
/// Has the derived graph update the non-allocation call CallerCall to invoke
/// the (possibly cloned) function CalleeFunc.
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
  auto *Derived = static_cast<DerivedCCG *>(this);
  Derived->updateCall(CallerCall, CalleeFunc);
}
/// Has the derived graph clone the given function for the given callsite,
/// recording in CallMap the mapping of all of the function's tracked calls to
/// their new versions. New clones are assigned clone number CloneNo.
FuncInfo cloneFunctionForCallsite(
    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
  auto *Derived = static_cast<DerivedCCG *>(this);
  return Derived->cloneFunctionForCallsite(Func, Call, CallMap,
                                           CallsWithMetadataInFunc, CloneNo);
}
/// Gets a label to use in the dot graph for the given call clone in the given
/// function.
std::string getLabel(const FuncTy *Func, const CallTy Call,
unsigned CloneNo) const {
return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
}
/// Helpers to find the node corresponding to the given call or stackid.
ContextNode *getNodeForInst(const CallInfo &C);
ContextNode *getNodeForAlloc(const CallInfo &C);
ContextNode *getNodeForStackId(uint64_t StackId);
/// Removes the node information recorded for the given call.
void unsetNodeForInst(const CallInfo &C);
/// Computes the alloc type corresponding to the given context ids, by
/// unioning their recorded alloc types.
uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
/// Returns the allocation type of the intersection of the contexts of two
/// nodes (based on their provided context id sets), optimized for the case
/// when Node1Ids is smaller than Node2Ids.
uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
const DenseSet<uint32_t> &Node2Ids);
/// Returns the allocation type of the intersection of the contexts of two
/// nodes (based on their provided context id sets).
uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
const DenseSet<uint32_t> &Node2Ids);
/// Create a clone of Edge's callee and move Edge to that new callee node,
/// performing the necessary context id and allocation type updates.
/// If callee's caller edge iterator is supplied, it is updated when removing
/// the edge from that list. If ContextIdsToMove is non-empty, only that
/// subset of Edge's ids are moved to an edge to the new callee.
ContextNode *
moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
EdgeIter *CallerEdgeI = nullptr,
DenseSet<uint32_t> ContextIdsToMove = {});
/// Change the callee of Edge to existing callee clone NewCallee, performing
/// the necessary context id and allocation type updates.
/// If callee's caller edge iterator is supplied, it is updated when removing
/// the edge from that list. If ContextIdsToMove is non-empty, only that
/// subset of Edge's ids are moved to an edge to the new callee.
void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
ContextNode *NewCallee,
EdgeIter *CallerEdgeI = nullptr,
bool NewClone = false,
DenseSet<uint32_t> ContextIdsToMove = {});
/// Recursively perform cloning on the graph for the given Node and its
/// callers, in order to uniquely identify the allocation behavior of an
/// allocation given its context. The context ids of the allocation being
/// processed are given in AllocContextIds.
void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
const DenseSet<uint32_t> &AllocContextIds);
/// Map from each context ID to the AllocationType assigned to that context.
std::map<uint32_t, AllocationType> ContextIdToAllocationType;
/// Identifies the context node created for a stack id when adding the MIB
/// contexts to the graph. This is used to locate the context nodes when
/// trying to assign the corresponding callsites with those stack ids to these
/// nodes.
std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
/// Maps to track the calls to their corresponding nodes in the graph.
MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
/// Owner of all ContextNode unique_ptrs.
std::vector<std::unique_ptr<ContextNode>> NodeOwner;
/// Perform sanity checks on graph when requested.
void check() const;
/// Keeps track of the last unique context id assigned.
unsigned int LastContextId = 0;
};
// Namespace-scope alias templates for the types nested in
// CallsiteContextGraph, so they can be named without spelling out
// `typename CallsiteContextGraph<...>::` each time.

/// Alias for CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
/// Alias for CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
/// Alias for CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
/// Alias for CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
/// CRTP derived class for graphs built from IR (regular LTO).
/// Instantiates the base with Function as the function type and
/// Instruction * as the call type.
class ModuleCallsiteContextGraph
: public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *> {
public:
ModuleCallsiteContextGraph(
Module &M,
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
private:
// The base class invokes the private hooks below through a static_cast to
// this derived type, so it needs friend access.
friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>;
// Hooks that the base class forwards to under the same names.
uint64_t getStackId(uint64_t IdOrIndex) const;
bool calleeMatchesFunc(
Instruction *Call, const Function *Func, const Function *CallerFunc,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
// Not forwarded to by the base class in the code visible here; presumably a
// helper for calleeMatchesFunc (its definition is not visible here).
bool findProfiledCalleeThroughTailCalls(
const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
bool &FoundMultipleCalleeChains);
uint64_t getLastStackId(Instruction *Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
std::map<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const Function *Func, const Instruction *Call,
unsigned CloneNo) const;
// The module, held by const reference.
const Module &Mod;
// Callback returning the OptimizationRemarkEmitter for a given function.
llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};
/// A call in the summary index graph: either an allocation (AllocInfo) or an
/// interior callsite node in an allocation's context (CallsiteInfo).
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

  /// Returns this object itself, so `Call->member` reaches IndexCall's own
  /// members.
  IndexCall *operator->() { return this; }

  /// Returns a copy of the underlying PointerUnion.
  PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }

  /// Streams the held AllocInfo if there is one, else the held CallsiteInfo
  /// (which is then asserted to be non-null).
  void print(raw_ostream &OS) const {
    if (auto *Alloc = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
      OS << *Alloc;
      return;
    }
    auto *Callsite = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
    assert(Callsite);
    OS << *Callsite;
  }
};
/// CRTP derived class for graphs built from summary index (ThinLTO).
/// Instantiates the base with FunctionSummary as the function type and
/// IndexCall as the call type.
class IndexCallsiteContextGraph
: public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall> {
public:
IndexCallsiteContextGraph(
ModuleSummaryIndex &Index,
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing);
~IndexCallsiteContextGraph() {
// Now that we are done with the graph it is safe to add the new
// CallsiteInfo structs to the function summary vectors. The graph nodes
// point into locations within these vectors, so we don't want to add them
// any earlier.
for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
// I.first is the function summary; I.second maps callee ValueInfo to the
// owned synthesized CallsiteInfo, which is passed to addCallsite by
// reference.
auto *FS = I.first;
for (auto &Callsite : I.second)
FS->addCallsite(*Callsite.second);
}
}
private:
// The base class invokes the private hooks below through a static_cast to
// this derived type, so it needs friend access.
friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>;
// Hooks that the base class forwards to under the same names.
uint64_t getStackId(uint64_t IdOrIndex) const;
bool calleeMatchesFunc(
IndexCall &Call, const FunctionSummary *Func,
const FunctionSummary *CallerFunc,
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
// Not forwarded to by the base class in the code visible here; presumably a
// helper for calleeMatchesFunc (its definition is not visible here).
bool findProfiledCalleeThroughTailCalls(
ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
bool &FoundMultipleCalleeChains);
uint64_t getLastStackId(IndexCall &Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
std::map<CallInfo, CallInfo> &CallMap,
std::vector<CallInfo> &CallsWithMetadataInFunc,
unsigned CloneNo);
std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
unsigned CloneNo) const;
// Saves mapping from function summaries containing memprof records back to
// its VI, for use in checking and debugging.
std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
// The summary index, held by const reference.
const ModuleSummaryIndex &Index;
// Callback taking a GUID and a summary and returning a bool.
llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
isPrevailing;
// Saves/owns the callsite info structures synthesized for missing tail call
// frames that we discover while building the graph.
// It maps from the summary of the function making the tail call, to a map
// of callee ValueInfo to corresponding synthesized callsite info.
std::unordered_map<FunctionSummary *,
std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace
// DenseMapInfo specializations for the call key types used by this file. Each
// one simply inherits the DenseMapInfo of the type it derives from.
namespace llvm {
// CallInfo of the IR graph: reuse the info for (Instruction *, unsigned).
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
: public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
// CallInfo of the index graph: reuse the info for (IndexCall, unsigned).
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
: public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
// IndexCall itself: reuse the info for its PointerUnion base.
template <>
struct DenseMapInfo<IndexCall>
: public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm
namespace {
/// State for emitting a separator between fields when printing. Skip starts
/// out true and Sep holds the separator text (", " unless another one is
/// supplied).
struct FieldSeparator {
  bool Skip = true;
  const char *Sep;

  FieldSeparator(const char *Separator = ", ") : Sep(Separator) {}
};
/// Streams a FieldSeparator: nothing is written the first time (while Skip is
/// still set), and the separator text is written on every later use.
raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
  const bool IsFirstUse = FS.Skip;
  FS.Skip = false;
  if (!IsFirstUse)
    OS << FS.Sep;
  return OS;
}
// Converts the uint8_t alloc type bits (which may be NotCold|Cold) into the
// single alloc type to actually use on the corresponding allocation.
// A node with NotCold+Cold alloc type that can't be cloned falls back to
// NotCold, so there is no point cloning just to distinguish NotCold+Cold
// from NotCold; both map to NotCold here.
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != static_cast<uint8_t>(AllocationType::None));
  const uint8_t NotColdAndCold =
      static_cast<uint8_t>(AllocationType::NotCold) |
      static_cast<uint8_t>(AllocationType::Cold);
  return AllocTypes == NotColdAndCold
             ? AllocationType::NotCold
             : static_cast<AllocationType>(AllocTypes);
}
// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
const std::vector<uint8_t> &InAllocTypes,
const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
&Edges) {
return std::equal(
InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
[](const uint8_t &l,
const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
// Can share if one of the edges is None type - don't
// care about the type along that edge as it doesn't
// exist for those context ids.
if (l == (uint8_t)AllocationType::None ||
r->AllocTypes == (uint8_t)AllocationType::None)
return true;
return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
});
}
} // end anonymous namespace
/// Finds the node for the given call: the allocation map is consulted first,
/// then the non-allocation call map. Returns the non-allocation map's lookup
/// result when the call is not an allocation.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
    const CallInfo &C) {
  if (ContextNode *AllocNode = getNodeForAlloc(C))
    return AllocNode;
  return NonAllocationCallToContextNodeMap.lookup(C);
}
/// Returns the result of looking up the given call in
/// AllocationCallToContextNodeMap.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
const CallInfo &C) {
return AllocationCallToContextNodeMap.lookup(C);
}
/// Returns the node recorded in StackEntryIdToContextNodeMap for the given
/// stack id, or nullptr when the id has no entry.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
    uint64_t StackId) {
  auto It = StackEntryIdToContextNodeMap.find(StackId);
  if (It == StackEntryIdToContextNodeMap.end())
    return nullptr;
  return It->second;
}
/// Removes the node information recorded for the given call. The
/// non-allocation map is only touched when nothing was erased from the
/// allocation map.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
    const CallInfo &C) {
  if (!AllocationCallToContextNodeMap.erase(C))
    NonAllocationCallToContextNodeMap.erase(C);
  // Afterwards the call must not be present in either map.
  assert(!AllocationCallToContextNodeMap.count(C) &&
         !NonAllocationCallToContextNodeMap.count(C));
}
/// Records that context ContextId with type AllocType reaches this node from
/// Caller. An existing caller edge from Caller is updated in place; otherwise
/// a new edge is created and added to both this node's caller edges and
/// Caller's callee edges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                          unsigned int ContextId) {
  auto Existing = llvm::find_if(
      CallerEdges, [Caller](const std::shared_ptr<ContextEdge> &E) {
        return E->Caller == Caller;
      });
  if (Existing != CallerEdges.end()) {
    ContextEdge &Found = **Existing;
    Found.AllocTypes |= (uint8_t)AllocType;
    Found.getContextIds().insert(ContextId);
    return;
  }
  // No edge from this caller yet: create one that both endpoints share.
  auto NewEdge = std::make_shared<ContextEdge>(
      this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
  CallerEdges.push_back(NewEdge);
  Caller->CalleeEdges.push_back(NewEdge);
}
/// Remove every callee edge of \p Node whose allocation type is None, also
/// unlinking each such edge from its callee's caller list.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
  auto &CalleeEdges = Node->CalleeEdges;
  auto EI = CalleeEdges.begin();
  while (EI != CalleeEdges.end()) {
    // Take a copy of the shared pointer so the edge stays alive while it is
    // being unlinked from both endpoints.
    auto Edge = *EI;
    if (Edge->AllocTypes != (uint8_t)AllocationType::None) {
      ++EI;
      continue;
    }
    assert(Edge->ContextIds.empty());
    Edge->Callee->eraseCallerEdge(Edge.get());
    // erase() hands back the iterator to the next element.
    EI = CalleeEdges.erase(EI);
  }
}
/// Return the first edge in this node's callee list whose callee is
/// \p Callee, or nullptr if there is no such edge.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCallee(const ContextNode *Callee) {
  auto It = llvm::find_if(CalleeEdges, [Callee](const auto &Candidate) {
    return Candidate->Callee == Callee;
  });
  if (It == CalleeEdges.end())
    return nullptr;
  return It->get();
}
/// Return the first edge in this node's caller list whose caller is
/// \p Caller, or nullptr if there is no such edge.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCaller(const ContextNode *Caller) {
  auto It = llvm::find_if(CallerEdges, [Caller](const auto &Candidate) {
    return Candidate->Caller == Caller;
  });
  if (It == CallerEdges.end())
    return nullptr;
  return It->get();
}
/// Remove \p Edge from this node's callee edge list. The edge is expected to
/// be present (asserted).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCalleeEdge(const ContextEdge *Edge) {
  // Locate the list entry that owns this exact edge object.
  auto EI = CalleeEdges.begin();
  for (; EI != CalleeEdges.end(); ++EI)
    if (EI->get() == Edge)
      break;
  assert(EI != CalleeEdges.end());
  CalleeEdges.erase(EI);
}
/// Remove \p Edge from this node's caller edge list. The edge is expected to
/// be present (asserted).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCallerEdge(const ContextEdge *Edge) {
  // Locate the list entry that owns this exact edge object.
  auto EI = CallerEdges.begin();
  for (; EI != CallerEdges.end(); ++EI)
    if (EI->get() == Edge)
      break;
  assert(EI != CallerEdges.end());
  CallerEdges.erase(EI);
}
/// Compute the union of the allocation types recorded for all of the given
/// \p ContextIds, as a bitmask of AllocationType values.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
    DenseSet<uint32_t> &ContextIds) {
  const uint8_t ColdAndNotCold =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t Combined = (uint8_t)AllocationType::None;
  for (auto Id : ContextIds) {
    Combined |= (uint8_t)ContextIdToAllocationType[Id];
    // Once both types are present nothing further can change the result.
    if (Combined == ColdAndNotCold)
      break;
  }
  return Combined;
}
/// Compute the union of the allocation types of the context ids that appear
/// in both \p Node1Ids and \p Node2Ids. Iterates over \p Node1Ids and probes
/// \p Node2Ids for membership.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  const uint8_t ColdAndNotCold =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t Combined = (uint8_t)AllocationType::None;
  for (auto Id : Node1Ids) {
    // Only ids shared by both sets contribute.
    if (Node2Ids.count(Id)) {
      Combined |= (uint8_t)ContextIdToAllocationType[Id];
      // Once both types are present nothing further can change the result.
      if (Combined == ColdAndNotCold)
        break;
    }
  }
  return Combined;
}
/// Compute the allocation types of the context ids common to \p Node1Ids and
/// \p Node2Ids, passing the smaller set as the one to be iterated.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  const bool FirstIsSmaller = Node1Ids.size() < Node2Ids.size();
  return FirstIsSmaller ? intersectAllocTypesImpl(Node1Ids, Node2Ids)
                        : intersectAllocTypesImpl(Node2Ids, Node1Ids);
}
/// Create the context node for allocation call \p Call located in function
/// \p F, register it in the allocation and calling-function maps, and return
/// it. No node may exist for \p Call yet (asserted).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
    CallInfo Call, const FuncTy *F) {
  assert(!getNodeForAlloc(Call));
  // NodeOwner owns the node; everything else refers to it by raw pointer.
  NodeOwner.push_back(
      std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
  ContextNode *NewAllocNode = NodeOwner.back().get();

  // LastContextId serves as the unique id of this MIB allocation node.
  NewAllocNode->OrigStackOrAllocId = LastContextId;
  // The alloc type starts out as None and is updated as the MIBs are added;
  // it should be asserted afterwards that it is no longer None.
  NewAllocNode->AllocTypes = (uint8_t)AllocationType::None;

  AllocationCallToContextNodeMap[Call] = NewAllocNode;
  NodeToCallingFunc[NewAllocNode] = F;
  return NewAllocNode;
}
/// Add one profiled context (MIB) of the allocation \p AllocNode to the graph.
/// A new context id is allocated for it, and a node is created or updated for
/// every stack id in \p StackContext that comes after the prefix shared with
/// \p CallsiteContext, with caller edges chaining them from the allocation
/// outwards.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
    ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
  // Until disambiguation for "hot" is implemented, hot is handled as NotCold.
  if (AllocType == AllocationType::Hot)
    AllocType = AllocationType::NotCold;

  // Allocate a fresh context id for this MIB and remember its alloc type.
  ++LastContextId;
  ContextIdToAllocationType[LastContextId] = AllocType;

  // Fold this MIB into the allocation node itself.
  AllocNode->AllocTypes |= (uint8_t)AllocType;
  AllocNode->ContextIds.insert(LastContextId);

  // Stack ids already seen in this context. A repeat indicates recursion
  // (direct recursion should have been collapsed by module summary analysis,
  // so this should only be detecting mutual recursion); such nodes are marked
  // so that we don't try to clone them.
  SmallSet<uint64_t, 8> SeenStackIds;

  // Walk the stack ids of the context, adding or updating a node for each.
  // Ids shared with the allocation call's own callsite context (inlining) are
  // skipped here; inlining at non-alloc callsites is adjusted for later when
  // those callsites' stack ids are processed.
  ContextNode *CalleeNode = AllocNode;
  for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
       ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(*ContextIter);
    ContextNode *StackNode = getNodeForStackId(StackId);
    if (!StackNode) {
      // First time this stack id is seen in any context: create its node.
      NodeOwner.push_back(
          std::make_unique<ContextNode>(/*IsAllocation=*/false));
      StackNode = NodeOwner.back().get();
      StackEntryIdToContextNodeMap[StackId] = StackNode;
      StackNode->OrigStackOrAllocId = StackId;
    }
    if (!StackIdSet_insertIsNew(SeenStackIds, StackId))
      StackNode->Recursive = true;
    StackNode->ContextIds.insert(LastContextId);
    StackNode->AllocTypes |= (uint8_t)AllocType;
    CalleeNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
    CalleeNode = StackNode;
  }
}
/// Create one brand new context id for every id in
/// \p StackSequenceContextIds. Each new id gets the allocation type of the id
/// it duplicates and is recorded under that id in \p OldToNewContextIds.
/// Returns the set of newly created ids.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
DenseSet<uint32_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
    const DenseSet<uint32_t> &StackSequenceContextIds,
    DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  DenseSet<uint32_t> CreatedIds;
  for (auto OldId : StackSequenceContextIds) {
    ++LastContextId;
    CreatedIds.insert(LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The duplicate has the same allocation type as the original. Read the
    // original's type into a local before creating the new map entry.
    auto OldAllocType = ContextIdToAllocationType[OldId];
    ContextIdToAllocationType[LastContextId] = OldAllocType;
  }
  return CreatedIds;
}
/// Propagate duplicated context ids through the graph. For every allocation
/// node, and then along caller edges reachable from it, any context id that
/// has duplicates in \p OldToNewContextIds gets those duplicates added to the
/// node's and the edge's id sets.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    propagateDuplicateContextIds(
        const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  // Gather the duplicates recorded for every id in the given set.
  auto CollectDuplicates =
      [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
        DenseSet<uint32_t> Duplicates;
        for (auto Id : ContextIds) {
          auto It = OldToNewContextIds.find(Id);
          if (It == OldToNewContextIds.end())
            continue;
          Duplicates.insert(It->second.begin(), It->second.end());
        }
        return Duplicates;
      };

  // Recursive walk up the caller edges. The lambda receives itself as its
  // last argument so that it can recurse.
  auto WalkCallers = [&](ContextNode *Node,
                         DenseSet<const ContextEdge *> &Visited,
                         auto &&Self) -> void {
    for (const auto &Edge : Node->CallerEdges) {
      // Each edge is processed at most once.
      if (!Visited.insert(Edge.get()).second)
        continue;
      DenseSet<uint32_t> NewIdsToAdd = CollectDuplicates(Edge->getContextIds());
      // There is nothing to propagate further up via this edge unless it
      // contributed new ids to its caller.
      if (NewIdsToAdd.empty())
        continue;
      ContextNode *CallerNode = Edge->Caller;
      Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
      CallerNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
      Self(CallerNode, Visited, Self);
    }
  };

  DenseSet<const ContextEdge *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap) {
    auto *AllocNode = Entry.second;
    // The allocation node has no edge leading into it from below, so its own
    // id set is updated here, before walking its callers. This keeps the
    // traversal logic simple.
    DenseSet<uint32_t> NewIdsToAdd = CollectDuplicates(AllocNode->ContextIds);
    AllocNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
    WalkCallers(AllocNode, Visited, WalkCallers);
  }
}
/// Connect \p NewNode to the neighbours of \p OrigNode on one side, moving
/// NewNode's context ids over from OrigNode's edges on that side.
///
/// \param NewNode node whose ContextIds determine which ids are moved.
/// \param OrigNode node whose edges are scanned and updated.
/// \param TowardsCallee if true OrigNode's callee edges are processed and the
///        new edges make NewNode the caller; otherwise OrigNode's caller
///        edges are processed and the new edges make NewNode the callee.
///
/// Original edges that end up with no context ids are removed from both of
/// their endpoints.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
// Make a copy of the context ids, since this will be adjusted below as they
// are moved.
DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
auto &OrigEdges =
TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
// Increment iterator in loop so that we can remove edges as needed.
for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
// Copy of the shared pointer stored in the edge list.
auto Edge = *EI;
// Remove any matching context ids from Edge, return set that were found and
// removed, these are the new edge's context ids. Also update the remaining
// (not found ids).
DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
NotFoundContextIds);
RemainingContextIds.swap(NotFoundContextIds);
// If no matching context ids for this edge, skip it.
if (NewEdgeContextIds.empty()) {
++EI;
continue;
}
// Create the replacement edge carrying the moved ids and link it into both
// of its endpoints. Its alloc type is recomputed from just those ids.
if (TowardsCallee) {
auto NewEdge = std::make_shared<ContextEdge>(
Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
NewEdgeContextIds);
NewNode->CalleeEdges.push_back(NewEdge);
NewEdge->Callee->CallerEdges.push_back(NewEdge);
} else {
auto NewEdge = std::make_shared<ContextEdge>(
NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
NewEdgeContextIds);
NewNode->CallerEdges.push_back(NewEdge);
NewEdge->Caller->CalleeEdges.push_back(NewEdge);
}
// Remove old edge if context ids empty.
if (Edge->getContextIds().empty()) {
if (TowardsCallee) {
Edge->Callee->eraseCallerEdge(Edge.get());
EI = OrigNode->CalleeEdges.erase(EI);
} else {
Edge->Caller->eraseCalleeEdge(Edge.get());
EI = OrigNode->CallerEdges.erase(EI);
}
// erase() already returned the iterator to the next edge.
continue;
}
++EI;
}
}
/// Post-order traversal (callers first) that attaches calls to stack nodes.
///
/// After recursing into the callers of \p Node, if Node is not an allocation
/// and \p StackIdToMatchingCalls has calls recorded for its stack id, then:
///  - a single call with a single stack id is assigned directly to Node;
///  - otherwise, for each call that still has saved context ids, a new node
///    is created, connected to the callees of the first node and the callers
///    of the last node of the call's stack id sequence, and the moved context
///    ids are removed from the nodes and edges along that sequence.
///
/// \param Node node to process.
/// \param Visited nodes already processed; Node is added to it.
/// \param StackIdToMatchingCalls calls recorded per last stack id; the saved
///        context ids stored in it are refined in place.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
assignStackNodesPostOrder(ContextNode *Node,
DenseSet<const ContextNode *> &Visited,
DenseMap<uint64_t, std::vector<CallContextInfo>>
&StackIdToMatchingCalls) {
auto Inserted = Visited.insert(Node);
if (!Inserted.second)
return;
// Post order traversal. Iterate over a copy since we may add nodes and
// therefore new callers during the recursive call, invalidating any
// iterator over the original edge vector. We don't need to process these
// new nodes as they were already processed on creation.
auto CallerEdges = Node->CallerEdges;
for (auto &Edge : CallerEdges) {
// Skip any that have been removed during the recursion.
// NOTE(review): this tests the shared pointer held in the local copy made
// above; erasing an edge from Node->CallerEdges does not reset the copy, so
// confirm that this check can actually observe removals.
if (!Edge)
continue;
assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
}
// If this node's stack id is in the map, update the graph to contain new
// nodes representing any inlining at interior callsites. Note we move the
// associated context ids over to the new nodes.
// Ignore this node if it is for an allocation or we didn't record any
// stack id lists ending at it.
if (Node->IsAllocation ||
!StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
return;
auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
// Handle the simple case first. A single call with a single stack id.
// In this case there is no need to create any new context nodes, simply
// assign the context node for stack id to this Call.
if (Calls.size() == 1) {
auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
if (Ids.size() == 1) {
assert(SavedContextIds.empty());
// It should be this Node
assert(Node == getNodeForStackId(Ids[0]));
// Recursive nodes are left without a call.
if (Node->Recursive)
return;
Node->setCall(Call);
NonAllocationCallToContextNodeMap[Call] = Node;
NodeToCallingFunc[Node] = Func;
return;
}
}
// Find the node for the last stack id, which should be the same
// across all calls recorded for this id, and is this node's id.
uint64_t LastId = Node->OrigStackOrAllocId;
ContextNode *LastNode = getNodeForStackId(LastId);
// We should only have kept stack ids that had nodes.
assert(LastNode);
for (unsigned I = 0; I < Calls.size(); I++) {
auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
// Skip any for which we didn't assign any ids, these don't get a node in
// the graph.
if (SavedContextIds.empty())
continue;
assert(LastId == Ids.back());
ContextNode *FirstNode = getNodeForStackId(Ids[0]);
assert(FirstNode);
// Recompute the context ids for this stack id sequence (the
// intersection of the context ids of the corresponding nodes).
// Start with the ids we saved in the map for this call, which could be
// duplicated context ids. We have to recompute as we might have
// overlap between the saved context ids for different last nodes, and
// removed them already during the post order traversal.
set_intersect(SavedContextIds, FirstNode->ContextIds);
ContextNode *PrevNode = nullptr;
for (auto Id : Ids) {
ContextNode *CurNode = getNodeForStackId(Id);
// We should only have kept stack ids that had nodes and weren't
// recursive.
assert(CurNode);
assert(!CurNode->Recursive);
if (!PrevNode) {
PrevNode = CurNode;
continue;
}
auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
// Without an edge between consecutive nodes of the sequence there are no
// context ids left for this call.
if (!Edge) {
SavedContextIds.clear();
break;
}
PrevNode = CurNode;
set_intersect(SavedContextIds, Edge->getContextIds());
// If we now have no context ids for clone, skip this call.
if (SavedContextIds.empty())
break;
}
if (SavedContextIds.empty())
continue;
// Create new context node.
NodeOwner.push_back(
std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
ContextNode *NewNode = NodeOwner.back().get();
NodeToCallingFunc[NewNode] = Func;
NonAllocationCallToContextNodeMap[Call] = NewNode;
NewNode->ContextIds = SavedContextIds;
NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
// Connect to callees of innermost stack frame in inlined call chain.
// This updates context ids for FirstNode's callees to reflect those
// moved to NewNode.
connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
// Connect to callers of outermost stack frame in inlined call chain.
// This updates context ids for LastNode's callers to reflect those
// moved to NewNode.
connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
// Now we need to remove context ids from edges/nodes between First and
// Last Node.
PrevNode = nullptr;
for (auto Id : Ids) {
ContextNode *CurNode = getNodeForStackId(Id);
// We should only have kept stack ids that had nodes.
assert(CurNode);
// Remove the context ids moved to NewNode from CurNode, and the
// edge from the prior node.
set_subtract(CurNode->ContextIds, NewNode->ContextIds);
if (PrevNode) {
auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
assert(PrevEdge);
set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
// An edge left without context ids is unlinked from both endpoints.
if (PrevEdge->getContextIds().empty()) {
PrevNode->eraseCallerEdge(PrevEdge);
CurNode->eraseCalleeEdge(PrevEdge);
}
}
PrevNode = CurNode;
}
}
}
/// Update the graph so that calls whose callsite context spans several stack
/// ids (because of inlining) are represented by their own nodes.
///
/// This runs in three phases:
///  1. Collect, per last (outermost caller) stack id that has a node, the
///     non-allocation calls whose stack id sequence ends there.
///  2. For each such group compute the context ids belonging to every call
///     (the intersection of the ids along the edges of its stack id
///     sequence), duplicating context ids when several calls share an
///     identical sequence, then propagate the duplicates over the graph.
///  3. Walk the graph in post order from the allocation nodes and create /
///     assign the nodes for those calls (see assignStackNodesPostOrder).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
  // Map of stack id to all calls with that as the last (outermost caller)
  // callsite id that has a context node (some might not due to pruning
  // performed during matching of the allocation profile contexts).
  // The CallContextInfo contains the Call and a list of its stack ids with
  // ContextNodes, the function containing Call, and the set of context ids
  // the analysis will eventually identify for use in any new node created
  // for that callsite.
  DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
  for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
    for (auto &Call : CallsWithMetadata) {
      // Ignore allocations, already handled.
      if (AllocationCallToContextNodeMap.count(Call))
        continue;
      auto StackIdsWithContextNodes =
          getStackIdsWithContextNodesForCall(Call.call());
      // If there were no nodes created for MIBs on allocs (maybe this was in
      // the unambiguous part of the MIB stack that was pruned), ignore.
      if (StackIdsWithContextNodes.empty())
        continue;
      // Otherwise, record this Call along with the list of ids for the last
      // (outermost caller) stack id with a node.
      StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
          {Call.call(), StackIdsWithContextNodes, Func, {}});
    }
  }

  // First make a pass through all stack ids that correspond to a call,
  // as identified in the above loop. Compute the context ids corresponding to
  // each of these calls when they correspond to multiple stack ids due to
  // inlining. Perform any duplication of context ids required when
  // there is more than one call with the same stack ids. Their (possibly newly
  // duplicated) context ids are saved in the StackIdToMatchingCalls map.
  DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
  for (auto &It : StackIdToMatchingCalls) {
    auto &Calls = It.getSecond();
    // Skip single calls with a single stack id. These don't need a new node.
    if (Calls.size() == 1) {
      auto &Ids = std::get<1>(Calls[0]);
      if (Ids.size() == 1)
        continue;
    }
    // In order to do the best and maximal matching of inlined calls to context
    // node sequences we will sort the vectors of stack ids in descending order
    // of length, and within each length, lexicographically by stack id. The
    // latter is so that we can specially handle calls that have identical stack
    // id sequences (either due to cloning or artificially because of the MIB
    // context pruning).
    std::stable_sort(Calls.begin(), Calls.end(),
                     [](const CallContextInfo &A, const CallContextInfo &B) {
                       auto &IdsA = std::get<1>(A);
                       auto &IdsB = std::get<1>(B);
                       return IdsA.size() > IdsB.size() ||
                              (IdsA.size() == IdsB.size() && IdsA < IdsB);
                     });

    // Find the node for the last stack id, which should be the same
    // across all calls recorded for this id, and is the id for this
    // entry in the StackIdToMatchingCalls map.
    uint64_t LastId = It.getFirst();
    ContextNode *LastNode = getNodeForStackId(LastId);
    // We should only have kept stack ids that had nodes.
    assert(LastNode);

    if (LastNode->Recursive)
      continue;

    // Initialize the context ids with the last node's. We will subsequently
    // refine the context ids by computing the intersection along all edges.
    DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
    assert(!LastNodeContextIds.empty());

    for (unsigned I = 0; I < Calls.size(); I++) {
      auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
      assert(SavedContextIds.empty());
      assert(LastId == Ids.back());

      // First compute the context ids for this stack id sequence (the
      // intersection of the context ids of the corresponding nodes).
      // Start with the remaining saved ids for the last node.
      assert(!LastNodeContextIds.empty());
      DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;

      ContextNode *PrevNode = LastNode;
      ContextNode *CurNode = LastNode;
      bool Skip = false;

      // Iterate backwards through the stack Ids, starting after the last Id
      // in the list, which was handled once outside for all Calls.
      for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
        auto Id = *IdIter;
        CurNode = getNodeForStackId(Id);
        // We should only have kept stack ids that had nodes.
        assert(CurNode);

        if (CurNode->Recursive) {
          Skip = true;
          break;
        }

        auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
        // If there is no edge then the nodes belong to different MIB contexts,
        // and we should skip this inlined context sequence. For example, this
        // particular inlined context may include stack ids A->B, and we may
        // indeed have nodes for both A and B, but it is possible that they were
        // never profiled in sequence in a single MIB for any allocation (i.e.
        // we might have profiled an allocation that involves the callsite A,
        // but through a different one of its callee callsites, and we might
        // have profiled an allocation that involves callsite B, but reached
        // from a different caller callsite).
        if (!Edge) {
          Skip = true;
          break;
        }
        PrevNode = CurNode;

        // Update the context ids, which is the intersection of the ids along
        // all edges in the sequence.
        set_intersect(StackSequenceContextIds, Edge->getContextIds());

        // If we now have no context ids for clone, skip this call.
        if (StackSequenceContextIds.empty()) {
          Skip = true;
          break;
        }
      }
      if (Skip)
        continue;

      // If some of this call's stack ids did not have corresponding nodes (due
      // to pruning), don't include any context ids for contexts that extend
      // beyond these nodes. Otherwise we would be matching part of unrelated /
      // not fully matching stack contexts. To do this, subtract any context ids
      // found in caller nodes of the last node found above.
      if (Ids.back() != getLastStackId(Call)) {
        for (const auto &PE : LastNode->CallerEdges) {
          set_subtract(StackSequenceContextIds, PE->getContextIds());
          if (StackSequenceContextIds.empty())
            break;
        }
        // If we now have no context ids for clone, skip this call.
        if (StackSequenceContextIds.empty())
          continue;
      }

      // Check if the next set of stack ids is the same (since the Calls vector
      // of tuples is sorted by the stack ids we can just look at the next one).
      bool DuplicateContextIds = false;
      if (I + 1 < Calls.size()) {
        // Bind by reference: the ids are only compared, so there is no need
        // to copy the whole vector.
        const auto &NextIds = std::get<1>(Calls[I + 1]);
        DuplicateContextIds = Ids == NextIds;
      }

      // If we don't have duplicate context ids, then we can assign all the
      // context ids computed for the original node sequence to this call.
      // If there are duplicate calls with the same stack ids then we synthesize
      // new context ids that are duplicates of the originals. These are
      // assigned to SavedContextIds, which is a reference into the map entry
      // for this call, allowing us to access these ids later on.
      OldToNewContextIds.reserve(OldToNewContextIds.size() +
                                 StackSequenceContextIds.size());
      SavedContextIds =
          DuplicateContextIds
              ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
              : StackSequenceContextIds;
      assert(!SavedContextIds.empty());

      if (!DuplicateContextIds) {
        // Update saved last node's context ids to remove those that are
        // assigned to other calls, so that it is ready for the next call at
        // this stack id.
        set_subtract(LastNodeContextIds, StackSequenceContextIds);
        if (LastNodeContextIds.empty())
          break;
      }
    }
  }

  // Propagate the duplicate context ids over the graph.
  propagateDuplicateContextIds(OldToNewContextIds);

  if (VerifyCCG)
    check();

  // Now perform a post-order traversal over the graph, starting with the
  // allocation nodes, essentially processing nodes from callers to callees.
  // For any that contains an id in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.
  DenseSet<const ContextNode *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap)
    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
}
/// Return the last entry of the call stack described by the callsite metadata
/// attached to \p Call.
uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
  auto *CallsiteMD = Call->getMetadata(LLVMContext::MD_callsite);
  CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
  return CallsiteContext.back();
}
/// Return the stack id for the last entry of the callsite context of \p Call,
/// which must wrap a CallsiteInfo (asserted).
uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
  assert(isa<CallsiteInfo *>(Call.getBase()));
  auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
      CallsiteContext(Callsite);
  // The summary stores indices into the index's stack id list, so the last
  // entry has to be converted into the actual stack id.
  auto LastIdIndex = CallsiteContext.back();
  return Index.getStackIdAtIndex(LastIdIndex);
}
// Separator placed between a function's base name and its clone number.
static const std::string MemProfCloneSuffix = ".memprof.";

/// Build the name of clone number \p CloneNo of the function named \p Base.
/// Clone 0 denotes the original version, which keeps its name unchanged;
/// every other clone gets the suffix followed by the clone number appended.
static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
  const bool IsOriginal = CloneNo == 0;
  return IsOriginal ? Base.str()
                    : (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
}
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
return (Twine(Call->getFunction()->getName()) + " -> " +
cast<CallBase>(Call)->getCalledFunction()->getName())
.str();
}
/// Build a label for \p Call located in the function summarized by \p Func.
/// Allocations are labelled "<caller> -> alloc"; callsites are labelled
/// "<caller> -> <callee>", where the callee name carries the memprof clone
/// suffix for the clone recorded at index \p CloneNo of the callsite.
std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
                                                const IndexCall &Call,
                                                unsigned CloneNo) const {
  auto VI = FSToVIMap.find(Func);
  assert(VI != FSToVIMap.end());
  const auto &CallerVI = VI->second;

  if (isa<AllocInfo *>(Call.getBase()))
    return (CallerVI.name() + " -> alloc").str();

  // Not an allocation, so the call wraps a callsite summary record.
  auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
  return (CallerVI.name() + " -> " +
          getMemProfFuncName(Callsite->Callee.name(),
                             Callsite->Clones[CloneNo]))
      .str();
}
/// Return the leading stack ids of the callsite metadata on \p Call that have
/// context nodes in the graph.
std::vector<uint64_t>
ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
    Instruction *Call) {
  auto *CallsiteMD = Call->getMetadata(LLVMContext::MD_callsite);
  CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
  return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
      CallsiteContext);
}
/// Return the leading stack ids of the callsite summary wrapped by \p Call
/// that have context nodes in the graph. \p Call must wrap a CallsiteInfo
/// (asserted).
std::vector<uint64_t>
IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
  assert(isa<CallsiteInfo *>(Call.getBase()));
  using StackIdIter = SmallVector<unsigned>::const_iterator;
  auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
  CallStack<CallsiteInfo, StackIdIter> CallsiteContext(Callsite);
  return getStackIdsWithContextNodes<CallsiteInfo, StackIdIter>(
      CallsiteContext);
}
/// Collect the stack ids of \p CallsiteContext, in order, up to but not
/// including the first one that has no context node in the graph.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
std::vector<uint64_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
    CallStack<NodeT, IteratorT> &CallsiteContext) {
  std::vector<uint64_t> Result;
  for (auto IdOrIndex : CallsiteContext) {
    auto StackId = getStackId(IdOrIndex);
    // Stop at the first stack id without a node; later ids are not examined.
    if (!getNodeForStackId(StackId))
      break;
    Result.push_back(StackId);
  }
  return Result;
}
/// Build the callsite context graph for module \p M from its memprof and
/// callsite metadata.
///
/// Every call carrying memprof metadata becomes an allocation node with one
/// context per MIB operand; calls carrying only callsite metadata are recorded
/// per function for the later stack node update. The memprof and callsite
/// metadata are removed from the instructions once they have been consumed.
ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
    : Mod(M), OREGetter(OREGetter) {
  for (auto &F : M) {
    std::vector<CallInfo> CallsWithMetadata;
    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(I))
          continue;

        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
        if (!MemProfMD) {
          // Not an allocation with a profile. If it has callsite metadata,
          // add it to the list for this function for later use.
          if (I.getMetadata(LLVMContext::MD_callsite))
            CallsWithMetadata.push_back(&I);
          continue;
        }

        CallsWithMetadata.push_back(&I);
        auto *AllocNode = addAllocNode(&I, &F);
        auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
        assert(CallsiteMD);
        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
        // Each operand of the memprof metadata is one MIB; add it together
        // with the nodes for its stack.
        for (auto &MDOp : MemProfMD->operands()) {
          auto *MIBMD = cast<const MDNode>(MDOp);
          MDNode *StackNode = getMIBStackNode(MIBMD);
          assert(StackNode);
          CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
          addStackNodesForMIB<MDNode, MDNode::op_iterator>(
              AllocNode, StackContext, CallsiteContext,
              getMIBAllocType(MIBMD));
        }
        assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
        // The allocation no longer needs its memprof and callsite metadata.
        I.setMetadata(LLVMContext::MD_memprof, nullptr);
        I.setMetadata(LLVMContext::MD_callsite, nullptr);
      }
    }
    if (!CallsWithMetadata.empty())
      FuncToCallsWithMetadata[&F] = CallsWithMetadata;
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot("prestackupdate");

  updateStackNodes();

  handleCallsitesWithMultipleTargets();

  // The remaining callsite metadata is no longer needed either; strip it.
  for (auto &FuncEntry : FuncToCallsWithMetadata) {
    for (auto &Call : FuncEntry.second)
      Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
  }
}
/// Build the callsite context graph from the function summaries of \p Index.
///
/// For each function summary that is local or accepted by \p isPrevailing,
/// every allocation record with MIBs becomes an allocation node with one
/// context per MIB, and every callsite record is remembered per function for
/// the later stack node update.
IndexCallsiteContextGraph::IndexCallsiteContextGraph(
    ModuleSummaryIndex &Index,
    llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
        isPrevailing)
    : Index(Index), isPrevailing(isPrevailing) {
  using MIBCallStack =
      CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>;

  for (auto &I : Index) {
    auto VI = Index.getValueInfo(I);
    for (auto &S : VI.getSummaryList()) {
      // We should only add the prevailing nodes. Otherwise we may try to clone
      // in a weak copy that won't be linked (and may be different than the
      // prevailing version).
      // We only keep the memprof summary on the prevailing copy now when
      // building the combined index, as a space optimization, however don't
      // rely on this optimization. The linker doesn't resolve local linkage
      // values so don't check whether those are prevailing.
      const bool IsLocal = GlobalValue::isLocalLinkage(S->linkage());
      if (!IsLocal && !isPrevailing(VI.getGUID(), S.get()))
        continue;

      auto *FS = dyn_cast<FunctionSummary>(S.get());
      if (!FS)
        continue;

      std::vector<CallInfo> CallsWithMetadata;
      if (!FS->allocs().empty()) {
        for (auto &AN : FS->mutableAllocs()) {
          // This can happen because of recursion elimination handling that
          // currently exists in ModuleSummaryAnalysis. Skip these for now.
          // We still added them to the summary because we need to be able to
          // correlate properly in applyImport in the backends.
          if (AN.MIBs.empty())
            continue;

          CallsWithMetadata.push_back({&AN});
          auto *AllocNode = addAllocNode({&AN}, FS);

          // The CallsiteContext (second) parameter is an empty CallStack,
          // since for ThinLTO the inlined stack ids on the allocation call
          // were already collapsed out during ModuleSummaryAnalysis.
          MIBCallStack EmptyContext;
          // Add every MIB together with the nodes for its stack.
          for (auto &MIB : AN.MIBs) {
            MIBCallStack StackContext(&MIB);
            addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                AllocNode, StackContext, EmptyContext, MIB.AllocType);
          }
          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);

          // Initialize version 0 on the summary alloc node to the current
          // alloc type, unless it has both types in which case make it
          // default, so that in the case where we aren't able to clone the
          // original version always ends up with the default allocation
          // behavior.
          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
        }
      }

      // Callsite records are added to the list for this function for later
      // use.
      if (!FS->callsites().empty()) {
        for (auto &SN : FS->mutableCallsites())
          CallsWithMetadata.push_back({&SN});
      }

      if (!CallsWithMetadata.empty())
        FuncToCallsWithMetadata[FS] = CallsWithMetadata;

      // Remember which value a summary belongs to whenever it contributed
      // allocs or callsites.
      if (!FS->allocs().empty() || !FS->callsites().empty())
        FSToVIMap[FS] = VI;
    }
  }

  if (DumpCCG) {
    dbgs() << "CCG before updating call stack chains:\n";
    dbgs() << *this;
  }

  if (ExportToDot)
    exportToDot("prestackupdate");

  updateStackNodes();

  handleCallsitesWithMultipleTargets();
}
/// Work around callsite nodes whose call does not match the function of one
/// of their callee nodes.
///
/// For every node in NonAllocationCallToContextNodeMap, each callee edge whose
/// callee node has a call is checked with calleesMatch. On the first mismatch
/// the node is removed from the map and its call is reset to a null CallInfo.
/// Nodes collected in the tail call map during the checks are added to
/// NonAllocationCallToContextNodeMap at the end.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
CallTy>::handleCallsitesWithMultipleTargets() {
// Look for and workaround callsites that call multiple functions.
// This can happen for indirect calls, which needs better handling, and in
// more rare cases (e.g. macro expansion).
// TODO: To fix this for indirect calls we will want to perform speculative
// devirtualization using either the normal PGO info with ICP, or using the
// information in the profiled MemProf contexts. We can do this prior to
// this transformation for regular LTO, and for ThinLTO we can simulate that
// effect in the summary and perform the actual speculative devirtualization
// while cloning in the ThinLTO backend.
// Keep track of the new nodes synthesized for discovered tail calls missing
// from the profiled contexts.
MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
// The iterator is advanced manually because entries may be erased below.
for (auto Entry = NonAllocationCallToContextNodeMap.begin();
Entry != NonAllocationCallToContextNodeMap.end();) {
auto *Node = Entry->second;
assert(Node->Clones.empty());
// Check all node callees and see if in the same function.
bool Removed = false;
auto Call = Node->Call.call();
for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
auto Edge = *EI;
// Callee nodes without a call have nothing to compare against.
if (!Edge->Callee->hasCall()) {
++EI;
continue;
}
assert(NodeToCallingFunc.count(Edge->Callee));
// Check if the called function matches that of the callee node.
// NOTE(review): EI is passed by reference and is not incremented here on
// a match, so calleesMatch presumably advances it -- confirm.
if (calleesMatch(Call, EI, TailCallToContextNodeMap))
continue;
RemovedEdgesWithMismatchedCallees++;
// Work around by setting Node to have a null call, so it gets
// skipped during cloning. Otherwise assignFunctions will assert
// because its data structures are not designed to handle this case.
Entry = NonAllocationCallToContextNodeMap.erase(Entry);
Node->setCall(CallInfo());
Removed = true;
break;
}
// When the entry was erased, Entry already refers to the next element.
if (!Removed)
Entry++;
}
// Add the new nodes after the above loop so that the iteration is not
// invalidated.
for (auto &[Call, Node] : TailCallToContextNodeMap)
NonAllocationCallToContextNodeMap[Call] = Node;
}
/// Translate the value recorded for a stack frame into a stack id. For the
/// Module (IR) graph the recorded value already is the id.
uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  const uint64_t StackId = IdOrIndex;
  return StackId;
}
/// Translate the value recorded for a stack frame into a stack id. For the
/// Index graph the recorded value is a position in the summary index's stack
/// id list, which is looked up here.
uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
  const uint64_t StackId = Index.getStackIdAtIndex(IdOrIndex);
  return StackId;
}
/// Check whether the callee node of the edge at \p EI belongs to the function
/// actually called by \p Call, either directly or via a unique chain of tail
/// calls (as decided by calleeMatchesFunc).
///
/// Returns false if no match was found. Returns true on a match; if the match
/// went through tail calls, nodes for those calls are created (or reused from
/// \p TailCallToContextNodeMap) and spliced in between the edge's caller and
/// callee, and the original edge is removed.
///
/// On every path \p EI is left at the next callee edge the caller should
/// examine, so the caller must not increment it again.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
CallTy Call, EdgeIter &EI,
MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
// Copy the shared_ptr so the edge stays alive after it is erased from the
// caller's and callee's edge lists below.
auto Edge = *EI;
const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
// Will be populated in order of callee to caller if we find a chain of tail
// calls between the profiled caller and callee.
std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
FoundCalleeChain)) {
++EI;
return false;
}
// The usual case where the profiled callee matches that of the IR/summary.
if (FoundCalleeChain.empty()) {
++EI;
return true;
}
// Helper that connects Caller to Callee carrying the original edge's context
// ids and alloc types. EI is captured by reference because inserting into the
// original caller's callee edge list requires re-establishing the iterator.
auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
auto *CurEdge = Callee->findEdgeFromCaller(Caller);
// If there is already an edge between these nodes, simply update it and
// return.
if (CurEdge) {
CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
Edge->ContextIds.end());
CurEdge->AllocTypes |= Edge->AllocTypes;
return;
}
// Otherwise, create a new edge and insert it into the caller and callee
// lists.
auto NewEdge = std::make_shared<ContextEdge>(
Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
Callee->CallerEdges.push_back(NewEdge);
if (Caller == Edge->Caller) {
// If we are inserting the new edge into the current edge's caller, insert
// the new edge before the current iterator position, and then increment
// back to the current edge.
EI = Caller->CalleeEdges.insert(EI, NewEdge);
++EI;
assert(*EI == Edge &&
"Iterator position not restored after insert and increment");
} else
Caller->CalleeEdges.push_back(NewEdge);
};
// Create new nodes for each found callee and connect in between the profiled
// caller and callee.
auto *CurCalleeNode = Edge->Callee;
for (auto &[NewCall, Func] : FoundCalleeChain) {
ContextNode *NewNode = nullptr;
// First check if we have already synthesized a node for this tail call.
if (TailCallToContextNodeMap.count(NewCall)) {
NewNode = TailCallToContextNodeMap[NewCall];
// Reused node: merge in this edge's context ids and alloc types.
NewNode->ContextIds.insert(Edge->ContextIds.begin(),
Edge->ContextIds.end());
NewNode->AllocTypes |= Edge->AllocTypes;
} else {
FuncToCallsWithMetadata[Func].push_back({NewCall});
// Create Node and record node info.
NodeOwner.push_back(
std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
NewNode = NodeOwner.back().get();
NodeToCallingFunc[NewNode] = Func;
TailCallToContextNodeMap[NewCall] = NewNode;
NewNode->ContextIds = Edge->ContextIds;
NewNode->AllocTypes = Edge->AllocTypes;
}
// Hook up node to its callee node
AddEdge(NewNode, CurCalleeNode);
CurCalleeNode = NewNode;
}
// Hook up edge's original caller to new callee node.
AddEdge(Edge->Caller, CurCalleeNode);
// Remove old edge
Edge->Callee->eraseCallerEdge(Edge.get());
// Erasing yields the iterator to the edge following the removed one, which
// is where the caller resumes.
EI = Edge->Caller->CalleeEdges.erase(EI);
return true;
}
bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
bool &FoundMultipleCalleeChains) {
// Stop recursive search if we have already explored the maximum specified
// depth.
if (Depth > TailCallSearchDepth)
return false;
auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
FoundCalleeChain.push_back({Callsite, F});
};
auto *CalleeFunc = dyn_cast<Function>(CurCallee);
if (!CalleeFunc) {
auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
assert(Alias);
CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
assert(CalleeFunc);
}
// Look for tail calls in this function, and check if they either call the
// profiled callee directly, or indirectly (via a recursive search).
// Only succeed if there is a single unique tail call chain found between the
// profiled caller and callee, otherwise we could perform incorrect cloning.
bool FoundSingleCalleeChain = false;
for (auto &BB : *CalleeFunc) {
for (auto &I : BB) {
auto *CB = dyn_cast<CallBase>(&I);
if (!CB || !CB->isTailCall())
continue;
auto *CalledValue = CB->getCalledOperand();
auto *CalledFunction = CB->getCalledFunction();
if (CalledValue && !CalledFunction) {
CalledValue = CalledValue->stripPointerCasts();
// Stripping pointer casts can reveal a called function.
CalledFunction = dyn_cast<Function>(CalledValue);
}
// Check if this is an alias to a function. If so, get the
// called aliasee for the checks below.
if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
assert(!CalledFunction &&
"Expected null called function in callsite for alias");
CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
}
if (!CalledFunction)
continue;
if (CalledFunction == ProfiledCallee) {
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
}
FoundSingleCalleeChain = true;
FoundProfiledCalleeCount++;
FoundProfiledCalleeDepth += Depth;
if (Depth > FoundProfiledCalleeMaxDepth)
FoundProfiledCalleeMaxDepth = Depth;
SaveCallsiteInfo(&I, CalleeFunc);
} else if (findProfiledCalleeThroughTailCalls(
ProfiledCallee, CalledFunction, Depth + 1,
FoundCalleeChain, FoundMultipleCalleeChains)) {
if (FoundMultipleCalleeChains)
return false;
if (FoundSingleCalleeChain) {
FoundMultipleCalleeChains = true;
return false;
}
FoundSingleCalleeChain = true;
SaveCallsiteInfo(&I, CalleeFunc);
}
}
}
return FoundSingleCalleeChain;
}
/// Determine whether the function called by \p Call is \p Func, either
/// directly, through an alias, or through a unique chain of tail calls.
///
/// \param CallerFunc the function containing the profiled caller; only used
///        for debug output.
/// \param FoundCalleeChain left empty on a direct or alias match; otherwise
///        filled by the tail call search with the discovered chain.
/// \returns true on a match, false otherwise (including for callees that are
///          neither a function nor an alias, e.g. indirect calls).
bool ModuleCallsiteContextGraph::calleeMatchesFunc(
    Instruction *Call, const Function *Func, const Function *CallerFunc,
    std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
  // Callers only pass call instructions; cast<> asserts that instead of
  // silently dereferencing a failed dyn_cast.
  auto *CB = cast<CallBase>(Call);
  if (!CB->getCalledOperand())
    return false;
  auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
  auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
  if (CalleeFunc == Func)
    return true;
  auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
  if (Alias && Alias->getAliasee() == Func)
    return true;

  // The tail call search below walks the body of the called function, so it
  // can only start from a function or an alias. Any other callee (e.g. an
  // indirect call through a loaded pointer) cannot be matched.
  if (!CalleeFunc && !Alias)
    return false;

  // Recursively search for the profiled callee through tail calls starting with
  // the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fixup the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  unsigned Depth = 1;
  bool FoundMultipleCalleeChains = false;
  if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
                                          FoundCalleeChain,
                                          FoundMultipleCalleeChains)) {
    LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
                      << Func->getName() << " from " << CallerFunc->getName()
                      << " that actually called " << CalleeVal->getName()
                      << (FoundMultipleCalleeChains
                              ? " (found multiple possible chains)"
                              : "")
                      << "\n");
    if (FoundMultipleCalleeChains)
      FoundProfiledCalleeNonUniquelyCount++;
    return false;
  }

  return true;
}
/// Search the tail call edges recorded in the summaries of \p CurCallee for a
/// unique chain of tail calls that reaches \p ProfiledCallee.
///
/// \param Depth current search depth; the search gives up once it exceeds
///        TailCallSearchDepth.
/// \param FoundCalleeChain receives the discovered (synthesized callsite,
///        enclosing function summary) pairs, ordered from callee to caller.
///        Its contents are only meaningful when this returns true.
/// \param FoundMultipleCalleeChains set to true if more than one chain was
///        discovered, in which case the search fails.
/// \returns true iff exactly one chain was found.
bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
    ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
    bool &FoundMultipleCalleeChains) {
  // Stop recursive search if we have already explored the maximum specified
  // depth.
  if (Depth > TailCallSearchDepth)
    return false;

  auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
    // Make a CallsiteInfo for each discovered callee, if one hasn't already
    // been synthesized.
    if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
        !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
      // StackIds is empty (we don't have debug info available in the index for
      // these callsites)
      FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
          std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
    CallsiteInfo *NewCallsiteInfo =
        FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
    FoundCalleeChain.push_back({NewCallsiteInfo, FS});
  };

  // Look for tail calls in this function, and check if they either call the
  // profiled callee directly, or indirectly (via a recursive search).
  // Only succeed if there is a single unique tail call chain found between the
  // profiled caller and callee, otherwise we could perform incorrect cloning.
  bool FoundSingleCalleeChain = false;
  for (auto &S : CurCallee.getSummaryList()) {
    // Only local or prevailing copies are searched.
    if (!GlobalValue::isLocalLinkage(S->linkage()) &&
        !isPrevailing(CurCallee.getGUID(), S.get()))
      continue;
    auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
    if (!FS)
      continue;
    // When the summary is an alias, FS is the aliasee's summary, so record the
    // aliasee's ValueInfo for it.
    auto FSVI = CurCallee;
    auto *AS = dyn_cast<AliasSummary>(S.get());
    if (AS)
      FSVI = AS->getAliaseeVI();
    for (auto &CallEdge : FS->calls()) {
      if (!CallEdge.second.hasTailCall())
        continue;
      if (CallEdge.first == ProfiledCallee) {
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        FoundProfiledCalleeCount++;
        FoundProfiledCalleeDepth += Depth;
        if (Depth > FoundProfiledCalleeMaxDepth)
          FoundProfiledCalleeMaxDepth = Depth;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (findProfiledCalleeThroughTailCalls(
                     ProfiledCallee, CallEdge.first, Depth + 1,
                     FoundCalleeChain, FoundMultipleCalleeChains)) {
        // Defensive: a successful recursive search should never have recorded
        // an ambiguity.
        if (FoundMultipleCalleeChains)
          return false;
        if (FoundSingleCalleeChain) {
          FoundMultipleCalleeChains = true;
          return false;
        }
        FoundSingleCalleeChain = true;
        CreateAndSaveCallsiteInfo(CallEdge.first, FS);
        // Add FS to FSToVIMap in case it isn't already there.
        assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
        FSToVIMap[FS] = FSVI;
      } else if (FoundMultipleCalleeChains) {
        // The recursive search failed because it found several chains. Stop
        // now: continuing could find one more chain through another tail call
        // and wrongly report it as unique.
        return false;
      }
    }
  }

  return FoundSingleCalleeChain;
}
/// Determine whether the callee recorded for \p Call is the function
/// summarized by \p Func, either directly, through an alias, or through a
/// unique chain of tail calls. On a tail call match the discovered chain is
/// left in \p FoundCalleeChain. \p CallerFunc is only used for debug output.
bool IndexCallsiteContextGraph::calleeMatchesFunc(
    IndexCall &Call, const FunctionSummary *Func,
    const FunctionSummary *CallerFunc,
    std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
  ValueInfo Callee =
      dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;

  assert(FSToVIMap.count(Func));
  auto FuncVI = FSToVIMap[Func];

  // Direct match.
  if (Callee == FuncVI)
    return true;

  // If callee is an alias, check the aliasee, since only function summary
  // base objects will contain the stack node summaries and thus get a context
  // node. An empty summary list means this is a call to an externally defined
  // symbol, for which there is nothing to look through.
  const auto &CalleeSummaries = Callee.getSummaryList();
  if (!CalleeSummaries.empty())
    if (auto *Alias = dyn_cast<AliasSummary>(CalleeSummaries[0].get()))
      if (Alias->getAliaseeVI() == FuncVI)
        return true;

  // Recursively search for the profiled callee through tail calls starting with
  // the actual Callee. The discovered tail call chain is saved in
  // FoundCalleeChain, and we will fixup the graph to include these callsites
  // after returning.
  // FIXME: We will currently redo the same recursive walk if we find the same
  // mismatched callee from another callsite. We can improve this with more
  // bookkeeping of the created chain of new nodes for each mismatch.
  bool FoundMultipleCalleeChains = false;
  const bool FoundUniqueChain = findProfiledCalleeThroughTailCalls(
      FuncVI, Callee, /*Depth=*/1, FoundCalleeChain, FoundMultipleCalleeChains);
  if (FoundUniqueChain)
    return true;

  LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
                    << " from " << FSToVIMap[CallerFunc]
                    << " that actually called " << Callee
                    << (FoundMultipleCalleeChains
                            ? " (found multiple possible chains)"
                            : "")
                    << "\n");
  if (FoundMultipleCalleeChains)
    FoundProfiledCalleeNonUniquelyCount++;
  return false;
}
/// Render a bitmask of AllocationType values as text: "None" for an empty
/// mask, otherwise the concatenation of "NotCold" and/or "Cold" for the bits
/// that are set.
static std::string getAllocTypeString(uint8_t AllocTypes) {
  if (!AllocTypes)
    return "None";
  const bool HasNotCold = AllocTypes & (uint8_t)AllocationType::NotCold;
  const bool HasCold = AllocTypes & (uint8_t)AllocationType::Cold;
  return std::string(HasNotCold ? "NotCold" : "") + (HasCold ? "Cold" : "");
}
/// Print this node to the debug stream, followed by a newline.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
    const {
  raw_ostream &DebugOS = dbgs();
  print(DebugOS);
  DebugOS << "\n";
}
/// Print a multi-line description of this node to \p OS: its address and
/// call, alloc types, sorted context ids, callee and caller edges, and its
/// clone relationships (if any).
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
    raw_ostream &OS) const {
  OS << "Node " << this << "\n"
     << "\t";
  printCall(OS);
  if (Recursive)
    OS << " (recursive)";
  OS << "\n"
     << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"
     << "\tContextIds:";

  // The id set is unordered; emit the ids in ascending order.
  std::vector<uint32_t> OrderedIds;
  OrderedIds.assign(ContextIds.begin(), ContextIds.end());
  std::sort(OrderedIds.begin(), OrderedIds.end());
  for (uint32_t ContextId : OrderedIds)
    OS << " " << ContextId;
  OS << "\n";

  OS << "\tCalleeEdges:\n";
  for (auto &CalleeEdge : CalleeEdges)
    OS << "\t\t" << *CalleeEdge << "\n";

  OS << "\tCallerEdges:\n";
  for (auto &CallerEdge : CallerEdges)
    OS << "\t\t" << *CallerEdge << "\n";

  // The clone list is printed when there is one; otherwise, if this node is
  // itself a clone, the node it was cloned from is printed.
  if (Clones.empty()) {
    if (CloneOf)
      OS << "\tClone of " << CloneOf << "\n";
    return;
  }
  OS << "\tClones: ";
  FieldSeparator FS;
  for (auto *Clone : Clones)
    OS << FS << Clone;
  OS << "\n";
}
/// Print this edge to the debug stream, followed by a newline.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
    const {
  raw_ostream &DebugOS = dbgs();
  print(DebugOS);
  DebugOS << "\n";
}
/// Print a single-line description of this edge to \p OS: the callee and
/// caller node addresses, the alloc types, and the sorted context ids. No
/// trailing newline is written.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
    raw_ostream &OS) const {
  OS << "Edge from Callee " << Callee << " to Caller: " << Caller
     << " AllocTypes: " << getAllocTypeString(AllocTypes) << " ContextIds:";

  // The id set is unordered; emit the ids in ascending order.
  std::vector<uint32_t> OrderedIds;
  OrderedIds.assign(ContextIds.begin(), ContextIds.end());
  std::sort(OrderedIds.begin(), OrderedIds.end());
  for (uint32_t ContextId : OrderedIds)
    OS << " " << ContextId;
}
/// Print the whole graph to the debug stream.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
  raw_ostream &DebugOS = dbgs();
  print(DebugOS);
}
/// Print a header followed by every node that has not been removed from the
/// graph, each followed by a blank line.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
    raw_ostream &OS) const {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  OS << "Callsite Context Graph:\n";
  for (const auto Node : nodes<GraphType>(this)) {
    if (!Node->isRemoved()) {
      Node->print(OS);
      OS << "\n";
    }
  }
}
/// Assert the basic invariants of a context edge: it has a non-None alloc
/// type and carries at least one context id. The checks are asserts, so they
/// only have an effect in builds where assertions are enabled.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkEdge(
const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
// Confirm that alloc type is not None and that we have at least one context
// id.
assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
assert(!Edge->ContextIds.empty());
}
/// Assert that the context ids on \p Node are consistent with those on its
/// edges. Removed nodes are skipped.
///
/// \param CheckEdges when true, every caller and callee edge of the node is
///        additionally validated with checkEdge. (Previously the first edge
///        of each list was skipped.)
template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
  if (Node->CallerEdges.size()) {
    DenseSet<uint32_t> CallerEdgeContextIds;
    for (const auto &Edge : Node->CallerEdges) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CallerEdgeContextIds, Edge->ContextIds);
    }
    // Node can have more context ids than callers if some contexts terminate at
    // node and some are longer.
    assert(Node->ContextIds == CallerEdgeContextIds ||
           set_is_subset(CallerEdgeContextIds, Node->ContextIds));
  }
  if (Node->CalleeEdges.size()) {
    DenseSet<uint32_t> CalleeEdgeContextIds;
    for (const auto &Edge : Node->CalleeEdges) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CalleeEdgeContextIds, Edge->ContextIds);
    }
    // Unlike the caller side, the callee edges must account for exactly the
    // node's context ids.
    assert(Node->ContextIds == CalleeEdgeContextIds);
  }
}
/// Verify the whole graph: run the node-level consistency check on every
/// node, and the edge-level check on each node's caller edges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  for (const auto Node : nodes<GraphType>(this)) {
    // Edges are validated in the loop below, so checkNode is told not to
    // check them itself.
    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
    for (auto EI = Node->CallerEdges.begin(), EE = Node->CallerEdges.end();
         EI != EE; ++EI)
      checkEdge<DerivedCCG, FuncTy, CallTy>(*EI);
  }
}
/// GraphTraits specialization that lets generic graph utilities walk a
/// CallsiteContextGraph: nodes are enumerated from the graph's NodeOwner list
/// and a node's children are the callees of its callee edges.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
  using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;

  // Element types of the containers being iterated.
  using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
  using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;

  // Projections from the stored smart pointers to node references.
  static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
  static NodeRef GetCallee(const EdgePtrTy &P) { return P->Callee; }

  // Node enumeration.
  using nodes_iterator =
      mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
                      decltype(&getNode)>;

  static nodes_iterator nodes_begin(GraphType G) {
    return nodes_iterator(G->NodeOwner.begin(), &getNode);
  }

  static nodes_iterator nodes_end(GraphType G) {
    return nodes_iterator(G->NodeOwner.end(), &getNode);
  }

  // The entry node is simply the first node owned by the graph.
  static NodeRef getEntryNode(GraphType G) {
    return getNode(*G->NodeOwner.begin());
  }

  // Child enumeration.
  using ChildIteratorType =
      mapped_iterator<typename std::vector<EdgePtrTy>::const_iterator,
                      decltype(&GetCallee)>;

  static ChildIteratorType child_begin(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
  }

  static ChildIteratorType child_end(NodeRef N) {
    return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
  }
};
template <typename DerivedCCG, typename FuncTy, typename CallTy>
struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
: public DefaultDOTGraphTraits {
DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
using GTraits = GraphTraits<GraphType>;
using NodeRef = typename GTraits::NodeRef;
using ChildIteratorType = typename GTraits::ChildIteratorType;
static std::string getNodeLabel(NodeRef Node, GraphType G) {
std::string LabelString =
(Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
Twine(Node->OrigStackOrAllocId))
.str();
LabelString += "\n";
if (Node->hasCall()) {
auto Func = G->NodeToCallingFunc.find(Node);
assert(Func != G->NodeToCallingFunc.end());
LabelString +=
G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
} else {
LabelString += "null call";
if (Node->Recursive)
LabelString += " (recursive)";
else
LabelString += " (external)";
}
return LabelString;
}
static std::string getNodeAttributes(NodeRef Node, GraphType) {
std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
getContextIds(Node->ContextIds) + "\"")
.str();
AttributeString +=
(Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
AttributeString += ",style=\"filled\"";
if (Node->CloneOf) {
AttributeString += ",color=\"blue\"";
AttributeString += ",style=\"filled,bold,dashed\"";
} else
AttributeString += ",style=\"filled\"";
return AttributeString;
}
static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
GraphType) {
auto &Edge = *(ChildIter.getCurrent());
return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
.str();
}
// Since the NodeOwners list includes nodes that are no longer connected to
// the graph, skip them here.
static bool isNodeHidden(NodeRef Node, GraphType) {
return Node->isRemoved();
}
private:
static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
std::string IdString = "ContextIds:";
if (ContextIds.size() < 100) {
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds)
IdString += (" " + Twine(Id)).str();
} else {
IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
}
return IdString;
}
static std::string getColor(uint8_t AllocTypes) {
if (AllocTypes == (uint8_t)AllocationType::NotCold)
// Color "brown1" actually looks like a lighter red.
return "brown1";
if (AllocTypes == (uint8_t)AllocationType::Cold)
return "cyan";
if (AllocTypes ==
((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
// Lighter purple.
return "mediumorchid1";
return "gray";
}
static std::string getNodeId(NodeRef Node) {
std::stringstream SStream;
SStream << std::hex << "N0x" << (unsigned long long)Node;
std::string Result = SStream.str();
return Result;
}
};
/// Write the graph in dot format to "<DotFilePathPrefix>ccg.<Label>.dot",
/// using \p Label as the graph title.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
    std::string Label) const {
  const std::string DotFileName = DotFilePathPrefix + "ccg." + Label + ".dot";
  WriteGraph(this, "", false, Label, DotFileName);
}
/// Create a new clone of \p Edge's callee node and move \p Edge (or the
/// \p ContextIdsToMove subset of its context ids) onto that clone.
/// \p CallerEdgeI is forwarded to moveEdgeToExistingCalleeClone.
/// \returns the newly created clone.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
    const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
    DenseSet<uint32_t> ContextIdsToMove) {
  ContextNode *Node = Edge->Callee;

  // The clone is owned by the graph and registered with the node it was
  // cloned from.
  ContextNode *Clone =
      NodeOwner
          .emplace_back(
              std::make_unique<ContextNode>(Node->IsAllocation, Node->Call))
          .get();
  Node->addClone(Clone);

  // The clone lives in the same calling function as the original. The value
  // is read out before the entry for the clone is created.
  assert(NodeToCallingFunc.count(Node));
  auto *CallingFunc = NodeToCallingFunc[Node];
  NodeToCallingFunc[Clone] = CallingFunc;

  moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
                                ContextIdsToMove);
  return Clone;
}
/// Move \p Edge, or only the \p ContextIdsToMove subset of its context ids,
/// from its current callee to \p NewCallee, which must be a clone of the same
/// original node. The context ids and alloc types of the old callee, the new
/// callee, and their callee edges are updated to match.
///
/// \param CallerEdgeI if non-null, points at the caller's iterator into the
///        old callee's CallerEdges positioned at \p Edge. On return it is
///        positioned at the edge following \p Edge: via erase when the whole
///        edge is moved, via increment when only a subset of ids is moved.
/// \param NewClone true if \p NewCallee was just created and therefore has no
///        callee edges yet.
/// \param ContextIdsToMove ids to move; an empty set means "move everything
///        on \p Edge".
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                  ContextNode *NewCallee, EdgeIter *CallerEdgeI,
                                  bool NewClone,
                                  DenseSet<uint32_t> ContextIdsToMove) {
  // NewCallee and Edge's current callee must be clones of the same original
  // node (Edge's current callee may be the original node too).
  assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());

  ContextNode *OldCallee = Edge->Callee;

  // We might already have an edge to the new callee from earlier cloning for a
  // different allocation. If one exists we will reuse it.
  auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);

  // Callers will pass an empty ContextIdsToMove set when they want to move the
  // edge. Copy in Edge's ids for simplicity.
  if (ContextIdsToMove.empty())
    ContextIdsToMove = Edge->getContextIds();

  // If we are moving all of Edge's ids, then just move the whole Edge.
  // Otherwise only move the specified subset, to a new edge if needed.
  if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
    // Moving the whole Edge.
    // First update the alloc types on NewCallee from Edge. This must happen
    // before Edge's alloc types are potentially reset to None below,
    // otherwise NewCallee would not pick up the moved alloc types.
    NewCallee->AllocTypes |= Edge->AllocTypes;
    if (CallerEdgeI)
      *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
    else
      OldCallee->eraseCallerEdge(Edge.get());
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it, and remove the existing Edge.
      ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
                                                      ContextIdsToMove.end());
      ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
      assert(Edge->ContextIds == ContextIdsToMove);
      Edge->ContextIds.clear();
      Edge->AllocTypes = (uint8_t)AllocationType::None;
      Edge->Caller->eraseCalleeEdge(Edge.get());
    } else {
      // Otherwise just reconnect Edge to NewCallee.
      Edge->Callee = NewCallee;
      NewCallee->CallerEdges.push_back(Edge);
      // Don't need to update Edge's context ids since we are simply
      // reconnecting it.
    }
  } else {
    // Only moving a subset of Edge's ids. Edge stays in OldCallee's caller
    // edge list, so step the caller's iterator past it. Note this must
    // increment the iterator itself, not the pointer to it.
    if (CallerEdgeI)
      ++(*CallerEdgeI);
    // Compute the alloc type of the subset of ids being moved.
    auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
    if (ExistingEdgeToNewCallee) {
      // Since we already have an edge to NewCallee, simply move the ids
      // onto it.
      ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
                                                      ContextIdsToMove.end());
      ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
    } else {
      // Otherwise, create a new edge to NewCallee for the ids being moved.
      auto NewEdge = std::make_shared<ContextEdge>(
          NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
      Edge->Caller->CalleeEdges.push_back(NewEdge);
      NewCallee->CallerEdges.push_back(NewEdge);
    }
    // In either case, need to update the alloc types on NewCallee, and remove
    // those ids and update the alloc type on the original Edge.
    NewCallee->AllocTypes |= CallerEdgeAllocType;
    set_subtract(Edge->ContextIds, ContextIdsToMove);
    Edge->AllocTypes = computeAllocType(Edge->ContextIds);
  }

  // Now perform some updates that are common to all cases: the NewCallee gets
  // the moved ids added, and we need to remove those ids from OldCallee and
  // update its alloc type (NewCallee alloc type updates handled above).
  NewCallee->ContextIds.insert(ContextIdsToMove.begin(),
                               ContextIdsToMove.end());
  set_subtract(OldCallee->ContextIds, ContextIdsToMove);
  OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
  // OldCallee alloc type should be None iff its context id set is now empty.
  assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
         OldCallee->ContextIds.empty());

  // Now walk the old callee node's callee edges and move Edge's context ids
  // over to the corresponding edge into the clone (which is created here if
  // this is a newly created clone).
  for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
    // The context ids moving to the new callee are the subset of this edge's
    // context ids and the context ids on the caller edge being moved.
    DenseSet<uint32_t> EdgeContextIdsToMove =
        set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
    set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
    OldCalleeEdge->AllocTypes =
        computeAllocType(OldCalleeEdge->getContextIds());
    if (!NewClone) {
      // Update context ids / alloc type on corresponding edge to NewCallee.
      // There is a chance this may not exist if we are reusing an existing
      // clone, specifically during function assignment, where we would have
      // removed none type edges after creating the clone. If we can't find
      // a corresponding edge there, fall through to the cloning below.
      if (auto *NewCalleeEdge =
              NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
        NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
                                              EdgeContextIdsToMove.end());
        NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
        continue;
      }
    }
    auto NewEdge = std::make_shared<ContextEdge>(
        OldCalleeEdge->Callee, NewCallee,
        computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
    NewCallee->CalleeEdges.push_back(NewEdge);
    NewEdge->Callee->CallerEdges.push_back(NewEdge);
  }

  if (VerifyCCG) {
    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
    for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
    for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
                                            /*CheckEdges=*/false);
  }
}
/// Apply removeNoneTypeCalleeEdges to \p Node, to each of its clones, and
/// (recursively) to every caller reachable through its caller edges.
/// \p Visited records the nodes already processed so that each node is
/// handled once.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    recursivelyRemoveNoneTypeCalleeEdges(
        ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
  // Nothing to do if this node was already handled.
  if (!Visited.insert(Node).second)
    return;

  removeNoneTypeCalleeEdges(Node);

  for (auto *Clone : Node->Clones)
    recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);

  // The recursive call may remove some of this Node's caller edges.
  // Iterate over a copy and skip any that were removed.
  const auto CallerEdgesSnapshot = Node->CallerEdges;
  for (const auto &Edge : CallerEdgesSnapshot) {
    // An edge that still has a callee or a caller is live: follow it up.
    if (Edge->Callee != nullptr || Edge->Caller != nullptr) {
      recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
      continue;
    }
    // Otherwise it was removed by an earlier recursive call, and must no
    // longer be in the node's live caller edge list.
    assert(!std::count(Node->CallerEdges.begin(), Node->CallerEdges.end(),
                       Edge));
  }
}
/// Entry point for clone identification: run the recursive identifyClones
/// from every allocation node, then strip None-type callee edges from
/// everything reachable from the allocation nodes, and finally verify the
/// graph when VerifyCCG is set.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
  DenseSet<const ContextNode *> Visited;

  // Each allocation gets a fresh visited set for its own traversal.
  for (auto &Entry : AllocationCallToContextNodeMap) {
    ContextNode *AllocNode = Entry.second;
    Visited.clear();
    identifyClones(AllocNode, Visited, AllocNode->ContextIds);
  }

  // The cleanup pass shares a single visited set across all allocations.
  Visited.clear();
  for (auto &Entry : AllocationCallToContextNodeMap) {
    ContextNode *AllocNode = Entry.second;
    recursivelyRemoveNoneTypeCalleeEdges(AllocNode, Visited);
  }

  if (VerifyCCG)
    check();
}
// Helper that returns true when AllocType is exactly Cold, exactly NotCold,
// or the combination of both.
bool checkColdOrNotCold(uint8_t AllocType) {
  const uint8_t ColdMask = (uint8_t)AllocationType::Cold;
  const uint8_t NotColdMask = (uint8_t)AllocationType::NotCold;
  if (AllocType == ColdMask || AllocType == NotColdMask)
    return true;
  return AllocType == (ColdMask | NotColdMask);
}
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
ContextNode *Node, DenseSet<const ContextNode *> &Visited,
const DenseSet<uint32_t> &AllocContextIds) {
if (VerifyNodes)
checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
assert(!Node->CloneOf);
// If Node has a null call, then either it wasn't found in the module (regular
// LTO) or summary index (ThinLTO), or there were other conditions blocking
// cloning (e.g. recursion, calls multiple targets, etc).
// Do this here so that we don't try to recursively clone callers below, which
// isn't useful at least for this node.
if (!Node->hasCall())
return;
#ifndef NDEBUG
auto Insert =
#endif
Visited.insert(Node);
// We should not have visited this node yet.
assert(Insert.second);
// The recursive call to identifyClones may delete the current edge from the
// CallerEdges vector. Make a copy and iterate on that, simpler than passing
// in an iterator and having recursive call erase from it. Other edges may
// also get removed during the recursion, which will have null Callee and
// Caller pointers (and are deleted later), so we skip those below.
{
auto CallerEdges = Node->CallerEdges;
for (auto &Edge : CallerEdges) {
// Skip any that have been removed by an earlier recursive call.
if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
assert(!llvm::count(Node->CallerEdges, Edge));
continue;
}
// Ignore any caller we previously visited via another edge.
if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
identifyClones(Edge->Caller, Visited, AllocContextIds);
}
}
}
// Check if we reached an unambiguous call or have only a single caller.
if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
return;
// We need to clone.
// Try to keep the original version as alloc type NotCold. This will make
// cases with indirect calls or any other situation with an unknown call to
// the original function get the default behavior. We do this by sorting the
// CallerEdges of the Node we will clone by alloc type.
//
// Give NotCold edge the lowest sort priority so those edges are at the end of
// the caller edges vector, and stay on the original version (since the below
// code clones greedily until it finds all remaining edges have the same type
// and leaves the remaining ones on the original Node).
//
// We shouldn't actually have any None type edges, so the sorting priority for
// that is arbitrary, and we assert in that case below.
const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
/*Cold*/ 1,
/*NotColdCold*/ 2};
std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
[&](const std::shared_ptr<ContextEdge> &A,
const std::shared_ptr<ContextEdge> &B) {
// Nodes with non-empty context ids should be sorted before
// those with empty context ids.