bolt/lib/Passes/HFSortPlus.cpp - llvm-project - Git at Google

 //===- bolt/Passes/HFSortPlus.cpp - Order functions by hotness ------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // hfsort+ - layout of hot functions with i-TLB cache optimization.
 //
 // Given an ordering of hot functions (and hence, their assignment to the
 // i-TLB pages), we can divide all functions calls Into two categories:
 // - 'short' ones that have a caller-callee distance less than a page;
 // - 'long' ones where the distance exceeds a page.
 // The short calls are likely to result in a i-TLB cache hit. For the long ones,
 // the hit/miss result depends on the 'hotness' of the page (i.e., how often
 // the page is accessed). Assuming that functions are sent to the i-TLB cache
 // in a random order, the probability that a page is present in the cache is
 // proportional to the number of samples corresponding to the functions on the
 // page. The following algorithm detects short and long calls, and optimizes
 // the expected number of cache misses for the long ones.
 //
 //===----------------------------------------------------------------------===//

 #include "bolt/Passes/HFSort.h"
 #include "llvm/Support/CommandLine.h"
 #include <cmath>
 #include <set>
 #include <vector>

 #define DEBUG_TYPE "hfsort"

 using namespace llvm;
 using namespace bolt;

 namespace opts {

 extern cl::OptionCategory BoltOptCategory;

 cl::opt<unsigned> ITLBPageSize("itlb-page-size",
                                cl::desc("The size of i-tlb cache page"),
                                cl::init(4096), cl::ReallyHidden,
                                cl::cat(BoltOptCategory));

 cl::opt<unsigned> ITLBEntries("itlb-entries",
                               cl::desc("The number of entries in i-tlb cache"),
                               cl::init(16), cl::ReallyHidden,
                               cl::cat(BoltOptCategory));

 static cl::opt<unsigned> ITLBDensity("itlb-density",
                                      cl::desc("The density of i-tlb cache"),
                                      cl::init(4096), cl::ReallyHidden,
                                      cl::cat(BoltOptCategory));

 static cl::opt<double> MergeProbability(
     "merge-probability",
     cl::desc("The minimum probability of a call for merging two clusters"),
     cl::init(0.9), cl::ReallyHidden, cl::cat(BoltOptCategory));

 static cl::opt<double> ArcThreshold(
     "arc-threshold",
     cl::desc("The threshold for ignoring arcs with a small relative weight"),
     cl::init(0.00000001), cl::ReallyHidden, cl::cat(BoltOptCategory));

 } // namespace opts

 namespace llvm {
 namespace bolt {

 using NodeId = CallGraph::NodeId;
 using Arc = CallGraph::Arc;

 namespace {

 class Edge;
 using ArcList = std::vector<const Arc *>;

 // A chain (ordered sequence) of nodes (functions) in the call graph
 class Chain {
 public:
   Chain(const Chain &) = delete;
   Chain(Chain &&) = default;
   Chain &operator=(const Chain &) = delete;
   Chain &operator=(Chain &&) = default;

   explicit Chain(size_t Id_, NodeId Node, size_t Samples_, size_t Size_)
       : Id(Id_), Samples(Samples_), Size(Size_), Nodes(1, Node) {}

   double density() const { return static_cast<double>(Samples) / Size; }

   Edge *getEdge(Chain *Other) const {
     for (std::pair<Chain *, Edge *> It : Edges)
       if (It.first == Other)
         return It.second;
     return nullptr;
   }

   void removeEdge(Chain *Other) {
     auto It = Edges.begin();
     while (It != Edges.end()) {
       if (It->first == Other) {
         Edges.erase(It);
         return;
       }
       It++;
     }
   }

   void addEdge(Chain *Other, Edge *Edge) { Edges.emplace_back(Other, Edge); }

   void merge(Chain *Other) {
     Nodes.insert(Nodes.end(), Other->Nodes.begin(), Other->Nodes.end());
     Samples += Other->Samples;
     Size += Other->Size;
   }

   void mergeEdges(Chain *Other);

   void clear() {
     Nodes.clear();
     Edges.clear();
   }

 public:
   size_t Id;
   uint64_t Samples;
   uint64_t Size;
   // Cached score for the chain
   double Score{0};
   // Cached short-calls for the chain
   double ShortCalls{0};
   // Nodes in the chain
   std::vector<NodeId> Nodes;
   // Adjacent chains and corresponding edges (lists of arcs)
   std::vector<std::pair<Chain *, Edge *>> Edges;
 };

 // An edge in the call graph representing Arcs between two Chains.
 // When functions are merged Into chains, the edges are combined too so that
 // there is always at most one edge between a pair of chains
 class Edge {
 public:
   Edge(const Edge &) = delete;
   Edge(Edge &&) = default;
   Edge &operator=(const Edge &) = delete;
   Edge &operator=(Edge &&) = default;

   explicit Edge(Chain *SrcChain_, Chain *DstChain_, const Arc *A)
       : SrcChain(SrcChain_), DstChain(DstChain_), Arcs(1, A) {}

   void changeEndpoint(Chain *From, Chain *To) {
     if (From == SrcChain)
       SrcChain = To;
     if (From == DstChain)
       DstChain = To;
   }

   void moveArcs(Edge *Other) {
     Arcs.insert(Arcs.end(), Other->Arcs.begin(), Other->Arcs.end());
     Other->Arcs.clear();
   }

   void setMergeGain(Chain *PredChain, double ForwardGain, double BackwardGain) {
     // When forward and backward gains are the same, prioritize merging that
     // preserves the original order of the functions in the binary
     if (std::abs(ForwardGain - BackwardGain) < 1e-8) {
       if (SrcChain->Id < DstChain->Id) {
         IsGainForward = true;
         CachedGain = PredChain == SrcChain ? ForwardGain : BackwardGain;
       } else {
         IsGainForward = false;
         CachedGain = PredChain == SrcChain ? BackwardGain : ForwardGain;
       }
     } else if (ForwardGain > BackwardGain) {
       IsGainForward = PredChain == SrcChain;
       CachedGain = ForwardGain;
     } else {
       IsGainForward = PredChain != SrcChain;
       CachedGain = BackwardGain;
     }
   }

   double gain() const { return CachedGain; }

   Chain *predChain() const { return IsGainForward ? SrcChain : DstChain; }

   Chain *succChain() const { return IsGainForward ? DstChain : SrcChain; }

 private:
   Chain *SrcChain{nullptr};
   Chain *DstChain{nullptr};

 public:
   // Original arcs in the binary with corresponding execution counts
   ArcList Arcs;
   // Cached gain of merging the pair of chains
   double CachedGain{-1.0};
   // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
   // we store a flag indicating which of the options results in a higher gain
   bool IsGainForward;
 };

 void Chain::mergeEdges(Chain *Other) {
   // Update edges adjacent to chain other
   for (auto EdgeIt : Other->Edges) {
     Chain *const DstChain = EdgeIt.first;
     Edge *const DstEdge = EdgeIt.second;
     Chain *const TargetChain = DstChain == Other ? this : DstChain;

     // Find the corresponding edge in the current chain
     Edge *CurEdge = getEdge(TargetChain);
     if (CurEdge == nullptr) {
       DstEdge->changeEndpoint(Other, this);
       this->addEdge(TargetChain, DstEdge);
       if (DstChain != this && DstChain != Other)
         DstChain->addEdge(this, DstEdge);
     } else {
       CurEdge->moveArcs(DstEdge);
     }
     // Cleanup leftover edge
     if (DstChain != Other)
       DstChain->removeEdge(Other);
   }
 }

 class HFSortPlus {
 public:
   explicit HFSortPlus(const CallGraph &Cg) : Cg(Cg) { initialize(); }

   /// Run the algorithm and return ordered set of function clusters.
   std::vector<Cluster> run() {
     // Pass 1
     runPassOne();

     // Pass 2
     runPassTwo();

     outs() << "BOLT-INFO: hfsort+ reduced the number of chains from "
            << Cg.numNodes() << " to " << HotChains.size() << "\n";

     // Sorting chains by density in decreasing order
     auto DensityComparator = [](const Chain *L, const Chain *R) {
       if (L->density() != R->density())
         return L->density() > R->density();
       // Making sure the comparison is deterministic
       return L->Id < R->Id;
     };
     llvm::stable_sort(HotChains, DensityComparator);

     // Return the set of clusters that are left, which are the ones that
     // didn't get merged (so their first func is its original func)
     std::vector<Cluster> Clusters;
     Clusters.reserve(HotChains.size());
     for (Chain *Chain : HotChains)
       Clusters.emplace_back(Cluster(Chain->Nodes, Cg));
     return Clusters;
   }

 private:
   /// Initialize the set of active chains, function id to chain mapping,
   /// total number of samples and function addresses.
   void initialize() {
     OutWeight.resize(Cg.numNodes(), 0);
     InWeight.resize(Cg.numNodes(), 0);
     AllChains.reserve(Cg.numNodes());
     HotChains.reserve(Cg.numNodes());
     NodeChain.resize(Cg.numNodes(), nullptr);
     Addr.resize(Cg.numNodes(), 0);

     // Initialize chains
     for (NodeId F = 0; F < Cg.numNodes(); ++F) {
       AllChains.emplace_back(F, F, Cg.samples(F), Cg.size(F));
       HotChains.push_back(&AllChains.back());
       NodeChain[F] = &AllChains.back();
       TotalSamples += Cg.samples(F);
       for (NodeId Succ : Cg.successors(F)) {
         if (F == Succ)
           continue;
         const Arc &Arc = *Cg.findArc(F, Succ);
         OutWeight[F] += Arc.weight();
         InWeight[Succ] += Arc.weight();
       }
     }

     AllEdges.reserve(Cg.numArcs());
     for (NodeId F = 0; F < Cg.numNodes(); ++F) {
       for (NodeId Succ : Cg.successors(F)) {
         if (F == Succ)
           continue;
         const Arc &Arc = *Cg.findArc(F, Succ);
         if (Arc.weight() == 0.0 ||
             Arc.weight() / TotalSamples < opts::ArcThreshold) {
           continue;
         }

         Edge *CurEdge = NodeChain[F]->getEdge(NodeChain[Succ]);
         if (CurEdge != nullptr) {
           // This edge is already present in the graph
           assert(NodeChain[Succ]->getEdge(NodeChain[F]) != nullptr);
           CurEdge->Arcs.push_back(&Arc);
         } else {
           // This is a new edge
           AllEdges.emplace_back(NodeChain[F], NodeChain[Succ], &Arc);
           NodeChain[F]->addEdge(NodeChain[Succ], &AllEdges.back());
           NodeChain[Succ]->addEdge(NodeChain[F], &AllEdges.back());
         }
       }
     }

     for (Chain *&Chain : HotChains) {
       Chain->ShortCalls = shortCalls(Chain);
       Chain->Score = score(Chain);
     }
   }

   /// The probability that a page with a given density is not in the cache.
   ///
   /// Assume that the hot functions are called in a random order; then the
   /// probability of an i-TLB page being accessed after a function call is
   /// p = pageSamples / TotalSamples. The probability that the page is not
   /// accessed is (1 - p), and the probability that it is not in the cache
   /// (i.e. not accessed during the last kCacheEntries function calls)
   /// is (1 - p)^kCacheEntries
   double missProbability(double ChainDensity) const {
     double PageSamples = ChainDensity * opts::ITLBDensity;

     if (PageSamples >= TotalSamples)
       return 0;

     double P = PageSamples / TotalSamples;
     return pow(1.0 - P, double(opts::ITLBEntries));
   }

   /// The expected number of calls on different i-TLB pages for an arc of the
   /// call graph with a specified weight
   double expectedCalls(uint64_t SrcAddr, uint64_t DstAddr,
                        double Weight) const {
     uint64_t Dist = SrcAddr >= DstAddr ? SrcAddr - DstAddr : DstAddr - SrcAddr;
     if (Dist >= opts::ITLBPageSize)
       return 0;

     double D = double(Dist) / double(opts::ITLBPageSize);
     // Increasing the importance of shorter calls
     return (1.0 - D * D) * Weight;
   }

   /// The expected number of calls within a given chain with both endpoints on
   /// the same cache page
   double shortCalls(Chain *Chain) const {
     Edge *Edge = Chain->getEdge(Chain);
     if (Edge == nullptr)
       return 0;

     double Calls = 0;
     for (const Arc *Arc : Edge->Arcs) {
       uint64_t SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
       uint64_t DstAddr = Addr[Arc->dst()];
       Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
     }
     return Calls;
   }

   /// The number of calls between the two chains with both endpoints on
   /// the same i-TLB page, assuming that a given pair of chains gets merged
   double shortCalls(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const {
     double Calls = 0;
     for (const Arc *Arc : Edge->Arcs) {
       Chain *SrcChain = NodeChain[Arc->src()];
       uint64_t SrcAddr;
       uint64_t DstAddr;
       if (SrcChain == ChainPred) {
         SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
         DstAddr = Addr[Arc->dst()] + ChainPred->Size;
       } else {
         SrcAddr =
             Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()) + ChainPred->Size;
         DstAddr = Addr[Arc->dst()];
       }
       Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
     }

     Calls += ChainPred->ShortCalls;
     Calls += ChainSucc->ShortCalls;

     return Calls;
   }

   double score(Chain *Chain) const {
     double LongCalls = Chain->Samples - Chain->ShortCalls;
     return LongCalls * missProbability(Chain->density());
   }

   /// The gain of merging two chains.
   ///
   /// We assume that the final chains are sorted by their density, and hence
   /// every chain is likely to be adjacent with chains of the same density.
   /// Thus, the 'hotness' of every chain can be estimated by density*pageSize,
   /// which is used to compute the probability of cache misses for long calls
   /// of a given chain.
   /// The result is also scaled by the size of the resulting chain in order to
   /// increase the chance of merging short chains, which is helpful for
   /// the i-cache performance.
   double mergeGain(Chain *ChainPred, Chain *ChainSucc, Edge *Edge) const {
     // Cache misses on the chains before merging
     double CurScore = ChainPred->Score + ChainSucc->Score;

     // Cache misses on the merged chain
     double LongCalls = ChainPred->Samples + ChainSucc->Samples -
                        shortCalls(ChainPred, ChainSucc, Edge);
     const double MergedSamples = ChainPred->Samples + ChainSucc->Samples;
     const double MergedSize = ChainPred->Size + ChainSucc->Size;
     double NewScore = LongCalls * missProbability(MergedSamples / MergedSize);

     double Gain = CurScore - NewScore;
     // Scale the result to increase the importance of merging short chains
     Gain /= std::min(ChainPred->Size, ChainSucc->Size);

     return Gain;
   }

   /// Run the first optimization pass of the algorithm:
   /// Merge chains that call each other with a high probability.
   void runPassOne() {
     // Find candidate pairs of chains for merging
     std::vector<const Arc *> ArcsToMerge;
     for (Chain *ChainPred : HotChains) {
       NodeId F = ChainPred->Nodes.back();
       for (NodeId Succ : Cg.successors(F)) {
         if (F == Succ)
           continue;

         const Arc &Arc = *Cg.findArc(F, Succ);
         if (Arc.weight() == 0.0 ||
             Arc.weight() / TotalSamples < opts::ArcThreshold)
           continue;

         const double CallsFromPred = OutWeight[F];
         const double CallsToSucc = InWeight[Succ];
         const double CallsPredSucc = Arc.weight();

         // Probability that the first chain is calling the second one
         const double ProbOut =
             CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0;
         assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect out-probability");

         // Probability that the second chain is called From the first one
         const double ProbIn = CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0;
         assert(0.0 <= ProbIn && ProbIn <= 1.0 && "incorrect in-probability");

         if (std::min(ProbOut, ProbIn) >= opts::MergeProbability)
           ArcsToMerge.push_back(&Arc);
       }
     }

     // Sort the pairs by the weight in reverse order
     llvm::sort(ArcsToMerge, [](const Arc *L, const Arc *R) {
       return L->weight() > R->weight();
     });

     // Merge the pairs of chains
     for (const Arc *Arc : ArcsToMerge) {
       Chain *ChainPred = NodeChain[Arc->src()];
       Chain *ChainSucc = NodeChain[Arc->dst()];
       if (ChainPred == ChainSucc)
         continue;
       if (ChainPred->Nodes.back() == Arc->src() &&
           ChainSucc->Nodes.front() == Arc->dst())
         mergeChains(ChainPred, ChainSucc);
     }
   }

   /// Run the second optimization pass of the hfsort+ algorithm:
   /// Merge pairs of chains while there is an improvement in the
   /// expected cache miss ratio.
   void runPassTwo() {
     // Creating a priority queue containing all edges ordered by the merge gain
     auto GainComparator = [](Edge *L, Edge *R) {
       if (std::abs(L->gain() - R->gain()) > 1e-8)
         return L->gain() > R->gain();

       // Making sure the comparison is deterministic
       if (L->predChain()->Id != R->predChain()->Id)
         return L->predChain()->Id < R->predChain()->Id;

       return L->succChain()->Id < R->succChain()->Id;
     };
     std::set<Edge *, decltype(GainComparator)> Queue(GainComparator);

     // Inserting the edges Into the queue
     for (Chain *ChainPred : HotChains) {
       for (auto EdgeIt : ChainPred->Edges) {
         Chain *ChainSucc = EdgeIt.first;
         Edge *ChainEdge = EdgeIt.second;
         // Ignore loop edges
         if (ChainPred == ChainSucc)
           continue;
         // Ignore already processed edges
         if (ChainEdge->gain() != -1.0)
           continue;

         // Compute the gain of merging the two chains
         auto ForwardGain = mergeGain(ChainPred, ChainSucc, ChainEdge);
         auto BackwardGain = mergeGain(ChainSucc, ChainPred, ChainEdge);
         ChainEdge->setMergeGain(ChainPred, ForwardGain, BackwardGain);
         if (ChainEdge->gain() > 0.0)
           Queue.insert(ChainEdge);
       }
     }

     // Merge the chains while the gain of merging is positive
     while (!Queue.empty()) {
       // Extract the best (top) edge for merging
       Edge *It = *Queue.begin();
       Queue.erase(Queue.begin());
       Edge *BestEdge = It;
       Chain *BestChainPred = BestEdge->predChain();
       Chain *BestChainSucc = BestEdge->succChain();
       if (BestChainPred == BestChainSucc || BestEdge->gain() <= 0.0)
         continue;

       // Remove outdated edges
       for (std::pair<Chain *, Edge *> EdgeIt : BestChainPred->Edges)
         Queue.erase(EdgeIt.second);
       for (std::pair<Chain *, Edge *> EdgeIt : BestChainSucc->Edges)
         Queue.erase(EdgeIt.second);

       // Merge the best pair of chains
       mergeChains(BestChainPred, BestChainSucc);

       // Insert newly created edges Into the queue
       for (auto EdgeIt : BestChainPred->Edges) {
         Chain *ChainSucc = EdgeIt.first;
         Edge *ChainEdge = EdgeIt.second;
         // Ignore loop edges
         if (BestChainPred == ChainSucc)
           continue;

         // Compute the gain of merging the two chains
         auto ForwardGain = mergeGain(BestChainPred, ChainSucc, ChainEdge);
         auto BackwardGain = mergeGain(ChainSucc, BestChainPred, ChainEdge);
         ChainEdge->setMergeGain(BestChainPred, ForwardGain, BackwardGain);
         if (ChainEdge->gain() > 0.0)
           Queue.insert(ChainEdge);
       }
     }
   }

   /// Merge chain From into chain Into and update the list of active chains.
   void mergeChains(Chain *Into, Chain *From) {
     assert(Into != From && "cannot merge a chain with itself");
     Into->merge(From);

     // Update the chains and addresses for functions merged from From
     size_t CurAddr = 0;
     for (NodeId F : Into->Nodes) {
       NodeChain[F] = Into;
       Addr[F] = CurAddr;
       CurAddr += Cg.size(F);
     }

     // Merge edges
     Into->mergeEdges(From);
     From->clear();

     // Update cached scores for the new chain
     Into->ShortCalls = shortCalls(Into);
     Into->Score = score(Into);

     // Remove chain From From the list of active chains
     llvm::erase_value(HotChains, From);
   }

 private:
   // The call graph
   const CallGraph &Cg;

   // All chains of functions
   std::vector<Chain> AllChains;

   // Active chains. The vector gets updated at runtime when chains are merged
   std::vector<Chain *> HotChains;

   // All edges between chains
   std::vector<Edge> AllEdges;

   // Node_id => chain
   std::vector<Chain *> NodeChain;

   // Current address of the function From the beginning of its chain
   std::vector<uint64_t> Addr;

   // Total weight of outgoing arcs for each function
   std::vector<double> OutWeight;

   // Total weight of incoming arcs for each function
   std::vector<double> InWeight;
   // The total number of samples in the graph
   double TotalSamples{0};
 };

 } // end anonymous namespace

 std::vector<Cluster> hfsortPlus(CallGraph &Cg) {
   // It is required that the sum of incoming arc weights is not greater
   // than the number of samples for every function.
   // Ensuring the call graph obeys the property before running the algorithm.
   Cg.adjustArcWeights();
   return HFSortPlus(Cg).run();
 }

 } // namespace bolt
 } // namespace llvm
	//===- bolt/Passes/HFSortPlus.cpp - Order functions by hotness ------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// hfsort+ - layout of hot functions with i-TLB cache optimization.
	//
	// Given an ordering of hot functions (and hence, their assignment to the
	// i-TLB pages), we can divide all functions calls Into two categories:
	// - 'short' ones that have a caller-callee distance less than a page;
	// - 'long' ones where the distance exceeds a page.
	// The short calls are likely to result in a i-TLB cache hit. For the long ones,
	// the hit/miss result depends on the 'hotness' of the page (i.e., how often
	// the page is accessed). Assuming that functions are sent to the i-TLB cache
	// in a random order, the probability that a page is present in the cache is
	// proportional to the number of samples corresponding to the functions on the
	// page. The following algorithm detects short and long calls, and optimizes
	// the expected number of cache misses for the long ones.
	//
	//===----------------------------------------------------------------------===//

	#include "bolt/Passes/HFSort.h"
	#include "llvm/Support/CommandLine.h"
	#include <cmath>
	#include <set>
	#include <vector>

	#define DEBUG_TYPE "hfsort"

	using namespace llvm;
	using namespace bolt;

	namespace opts {

	extern cl::OptionCategory BoltOptCategory;

	cl::opt<unsigned> ITLBPageSize("itlb-page-size",
	cl::desc("The size of i-tlb cache page"),
	cl::init(4096), cl::ReallyHidden,
	cl::cat(BoltOptCategory));

	cl::opt<unsigned> ITLBEntries("itlb-entries",
	cl::desc("The number of entries in i-tlb cache"),
	cl::init(16), cl::ReallyHidden,
	cl::cat(BoltOptCategory));

	static cl::opt<unsigned> ITLBDensity("itlb-density",
	cl::desc("The density of i-tlb cache"),
	cl::init(4096), cl::ReallyHidden,
	cl::cat(BoltOptCategory));

	static cl::opt<double> MergeProbability(
	"merge-probability",
	cl::desc("The minimum probability of a call for merging two clusters"),
	cl::init(0.9), cl::ReallyHidden, cl::cat(BoltOptCategory));

	static cl::opt<double> ArcThreshold(
	"arc-threshold",
	cl::desc("The threshold for ignoring arcs with a small relative weight"),
	cl::init(0.00000001), cl::ReallyHidden, cl::cat(BoltOptCategory));

	} // namespace opts

	namespace llvm {
	namespace bolt {

	using NodeId = CallGraph::NodeId;
	using Arc = CallGraph::Arc;

	namespace {

	class Edge;
	using ArcList = std::vector<const Arc *>;

	// A chain (ordered sequence) of nodes (functions) in the call graph
	class Chain {
	public:
	Chain(const Chain &) = delete;
	Chain(Chain &&) = default;
	Chain &operator=(const Chain &) = delete;
	Chain &operator=(Chain &&) = default;

	explicit Chain(size_t Id_, NodeId Node, size_t Samples_, size_t Size_)
	: Id(Id_), Samples(Samples_), Size(Size_), Nodes(1, Node) {}

	double density() const { return static_cast<double>(Samples) / Size; }

	Edge getEdge(Chain Other) const {
	for (std::pair<Chain , Edge > It : Edges)
	if (It.first == Other)
	return It.second;
	return nullptr;
	}

	void removeEdge(Chain *Other) {
	auto It = Edges.begin();
	while (It != Edges.end()) {
	if (It->first == Other) {
	Edges.erase(It);
	return;
	}
	It++;
	}
	}

	void addEdge(Chain Other, Edge Edge) { Edges.emplace_back(Other, Edge); }

	void merge(Chain *Other) {
	Nodes.insert(Nodes.end(), Other->Nodes.begin(), Other->Nodes.end());
	Samples += Other->Samples;
	Size += Other->Size;
	}

	void mergeEdges(Chain *Other);

	void clear() {
	Nodes.clear();
	Edges.clear();
	}

	public:
	size_t Id;
	uint64_t Samples;
	uint64_t Size;
	// Cached score for the chain
	double Score{0};
	// Cached short-calls for the chain
	double ShortCalls{0};
	// Nodes in the chain
	std::vector<NodeId> Nodes;
	// Adjacent chains and corresponding edges (lists of arcs)
	std::vector<std::pair<Chain , Edge >> Edges;
	};

	// An edge in the call graph representing Arcs between two Chains.
	// When functions are merged Into chains, the edges are combined too so that
	// there is always at most one edge between a pair of chains
	class Edge {
	public:
	Edge(const Edge &) = delete;
	Edge(Edge &&) = default;
	Edge &operator=(const Edge &) = delete;
	Edge &operator=(Edge &&) = default;

	explicit Edge(Chain SrcChain_, Chain DstChain_, const Arc *A)
	: SrcChain(SrcChain_), DstChain(DstChain_), Arcs(1, A) {}

	void changeEndpoint(Chain From, Chain To) {
	if (From == SrcChain)
	SrcChain = To;
	if (From == DstChain)
	DstChain = To;
	}

	void moveArcs(Edge *Other) {
	Arcs.insert(Arcs.end(), Other->Arcs.begin(), Other->Arcs.end());
	Other->Arcs.clear();
	}

	void setMergeGain(Chain *PredChain, double ForwardGain, double BackwardGain) {
	// When forward and backward gains are the same, prioritize merging that
	// preserves the original order of the functions in the binary
	if (std::abs(ForwardGain - BackwardGain) < 1e-8) {
	if (SrcChain->Id < DstChain->Id) {
	IsGainForward = true;
	CachedGain = PredChain == SrcChain ? ForwardGain : BackwardGain;
	} else {
	IsGainForward = false;
	CachedGain = PredChain == SrcChain ? BackwardGain : ForwardGain;
	}
	} else if (ForwardGain > BackwardGain) {
	IsGainForward = PredChain == SrcChain;
	CachedGain = ForwardGain;
	} else {
	IsGainForward = PredChain != SrcChain;
	CachedGain = BackwardGain;
	}
	}

	double gain() const { return CachedGain; }

	Chain *predChain() const { return IsGainForward ? SrcChain : DstChain; }

	Chain *succChain() const { return IsGainForward ? DstChain : SrcChain; }

	private:
	Chain *SrcChain{nullptr};
	Chain *DstChain{nullptr};

	public:
	// Original arcs in the binary with corresponding execution counts
	ArcList Arcs;
	// Cached gain of merging the pair of chains
	double CachedGain{-1.0};
	// Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
	// we store a flag indicating which of the options results in a higher gain
	bool IsGainForward;
	};

	void Chain::mergeEdges(Chain *Other) {
	// Update edges adjacent to chain other
	for (auto EdgeIt : Other->Edges) {
	Chain *const DstChain = EdgeIt.first;
	Edge *const DstEdge = EdgeIt.second;
	Chain *const TargetChain = DstChain == Other ? this : DstChain;

	// Find the corresponding edge in the current chain
	Edge *CurEdge = getEdge(TargetChain);
	if (CurEdge == nullptr) {
	DstEdge->changeEndpoint(Other, this);
	this->addEdge(TargetChain, DstEdge);
	if (DstChain != this && DstChain != Other)
	DstChain->addEdge(this, DstEdge);
	} else {
	CurEdge->moveArcs(DstEdge);
	}
	// Cleanup leftover edge
	if (DstChain != Other)
	DstChain->removeEdge(Other);
	}
	}

	class HFSortPlus {
	public:
	explicit HFSortPlus(const CallGraph &Cg) : Cg(Cg) { initialize(); }

	/// Run the algorithm and return ordered set of function clusters.
	std::vector<Cluster> run() {
	// Pass 1
	runPassOne();

	// Pass 2
	runPassTwo();

	outs() << "BOLT-INFO: hfsort+ reduced the number of chains from "
	<< Cg.numNodes() << " to " << HotChains.size() << "\n";

	// Sorting chains by density in decreasing order
	auto DensityComparator = [](const Chain L, const Chain R) {
	if (L->density() != R->density())
	return L->density() > R->density();
	// Making sure the comparison is deterministic
	return L->Id < R->Id;
	};
	llvm::stable_sort(HotChains, DensityComparator);

	// Return the set of clusters that are left, which are the ones that
	// didn't get merged (so their first func is its original func)
	std::vector<Cluster> Clusters;
	Clusters.reserve(HotChains.size());
	for (Chain *Chain : HotChains)
	Clusters.emplace_back(Cluster(Chain->Nodes, Cg));
	return Clusters;
	}

	private:
	/// Initialize the set of active chains, function id to chain mapping,
	/// total number of samples and function addresses.
	void initialize() {
	OutWeight.resize(Cg.numNodes(), 0);
	InWeight.resize(Cg.numNodes(), 0);
	AllChains.reserve(Cg.numNodes());
	HotChains.reserve(Cg.numNodes());
	NodeChain.resize(Cg.numNodes(), nullptr);
	Addr.resize(Cg.numNodes(), 0);

	// Initialize chains
	for (NodeId F = 0; F < Cg.numNodes(); ++F) {
	AllChains.emplace_back(F, F, Cg.samples(F), Cg.size(F));
	HotChains.push_back(&AllChains.back());
	NodeChain[F] = &AllChains.back();
	TotalSamples += Cg.samples(F);
	for (NodeId Succ : Cg.successors(F)) {
	if (F == Succ)
	continue;
	const Arc &Arc = *Cg.findArc(F, Succ);
	OutWeight[F] += Arc.weight();
	InWeight[Succ] += Arc.weight();
	}
	}

	AllEdges.reserve(Cg.numArcs());
	for (NodeId F = 0; F < Cg.numNodes(); ++F) {
	for (NodeId Succ : Cg.successors(F)) {
	if (F == Succ)
	continue;
	const Arc &Arc = *Cg.findArc(F, Succ);
	if (Arc.weight() == 0.0 \|\|
	Arc.weight() / TotalSamples < opts::ArcThreshold) {
	continue;
	}

	Edge *CurEdge = NodeChain[F]->getEdge(NodeChain[Succ]);
	if (CurEdge != nullptr) {
	// This edge is already present in the graph
	assert(NodeChain[Succ]->getEdge(NodeChain[F]) != nullptr);
	CurEdge->Arcs.push_back(&Arc);
	} else {
	// This is a new edge
	AllEdges.emplace_back(NodeChain[F], NodeChain[Succ], &Arc);
	NodeChain[F]->addEdge(NodeChain[Succ], &AllEdges.back());
	NodeChain[Succ]->addEdge(NodeChain[F], &AllEdges.back());
	}
	}
	}

	for (Chain *&Chain : HotChains) {
	Chain->ShortCalls = shortCalls(Chain);
	Chain->Score = score(Chain);
	}
	}

	/// The probability that a page with a given density is not in the cache.
	///
	/// Assume that the hot functions are called in a random order; then the
	/// probability of an i-TLB page being accessed after a function call is
	/// p = pageSamples / TotalSamples. The probability that the page is not
	/// accessed is (1 - p), and the probability that it is not in the cache
	/// (i.e. not accessed during the last kCacheEntries function calls)
	/// is (1 - p)^kCacheEntries
	double missProbability(double ChainDensity) const {
	double PageSamples = ChainDensity * opts::ITLBDensity;

	if (PageSamples >= TotalSamples)
	return 0;

	double P = PageSamples / TotalSamples;
	return pow(1.0 - P, double(opts::ITLBEntries));
	}

	/// The expected number of calls on different i-TLB pages for an arc of the
	/// call graph with a specified weight
	double expectedCalls(uint64_t SrcAddr, uint64_t DstAddr,
	double Weight) const {
	uint64_t Dist = SrcAddr >= DstAddr ? SrcAddr - DstAddr : DstAddr - SrcAddr;
	if (Dist >= opts::ITLBPageSize)
	return 0;

	double D = double(Dist) / double(opts::ITLBPageSize);
	// Increasing the importance of shorter calls
	return (1.0 - D * D) * Weight;
	}

	/// The expected number of calls within a given chain with both endpoints on
	/// the same cache page
	double shortCalls(Chain *Chain) const {
	Edge *Edge = Chain->getEdge(Chain);
	if (Edge == nullptr)
	return 0;

	double Calls = 0;
	for (const Arc *Arc : Edge->Arcs) {
	uint64_t SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
	uint64_t DstAddr = Addr[Arc->dst()];
	Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
	}
	return Calls;
	}

	/// The number of calls between the two chains with both endpoints on
	/// the same i-TLB page, assuming that a given pair of chains gets merged
	double shortCalls(Chain ChainPred, Chain ChainSucc, Edge *Edge) const {
	double Calls = 0;
	for (const Arc *Arc : Edge->Arcs) {
	Chain *SrcChain = NodeChain[Arc->src()];
	uint64_t SrcAddr;
	uint64_t DstAddr;
	if (SrcChain == ChainPred) {
	SrcAddr = Addr[Arc->src()] + uint64_t(Arc->avgCallOffset());
	DstAddr = Addr[Arc->dst()] + ChainPred->Size;
	} else {
	SrcAddr =
	Addr[Arc->src()] + uint64_t(Arc->avgCallOffset()) + ChainPred->Size;
	DstAddr = Addr[Arc->dst()];
	}
	Calls += expectedCalls(SrcAddr, DstAddr, Arc->weight());
	}

	Calls += ChainPred->ShortCalls;
	Calls += ChainSucc->ShortCalls;

	return Calls;
	}

	double score(Chain *Chain) const {
	double LongCalls = Chain->Samples - Chain->ShortCalls;
	return LongCalls * missProbability(Chain->density());
	}

	/// The gain of merging two chains.
	///
	/// We assume that the final chains are sorted by their density, and hence
	/// every chain is likely to be adjacent with chains of the same density.
	/// Thus, the 'hotness' of every chain can be estimated by density*pageSize,
	/// which is used to compute the probability of cache misses for long calls
	/// of a given chain.
	/// The result is also scaled by the size of the resulting chain in order to
	/// increase the chance of merging short chains, which is helpful for
	/// the i-cache performance.
	double mergeGain(Chain ChainPred, Chain ChainSucc, Edge *Edge) const {
	// Cache misses on the chains before merging
	double CurScore = ChainPred->Score + ChainSucc->Score;

	// Cache misses on the merged chain
	double LongCalls = ChainPred->Samples + ChainSucc->Samples -
	shortCalls(ChainPred, ChainSucc, Edge);
	const double MergedSamples = ChainPred->Samples + ChainSucc->Samples;
	const double MergedSize = ChainPred->Size + ChainSucc->Size;
	double NewScore = LongCalls * missProbability(MergedSamples / MergedSize);

	double Gain = CurScore - NewScore;
	// Scale the result to increase the importance of merging short chains
	Gain /= std::min(ChainPred->Size, ChainSucc->Size);

	return Gain;
	}

	/// Run the first optimization pass of the algorithm:
	/// Merge chains that call each other with a high probability.
	void runPassOne() {
	// Find candidate pairs of chains for merging
	std::vector<const Arc *> ArcsToMerge;
	for (Chain *ChainPred : HotChains) {
	NodeId F = ChainPred->Nodes.back();
	for (NodeId Succ : Cg.successors(F)) {
	if (F == Succ)
	continue;

	const Arc &Arc = *Cg.findArc(F, Succ);
	if (Arc.weight() == 0.0 \|\|
	Arc.weight() / TotalSamples < opts::ArcThreshold)
	continue;

	const double CallsFromPred = OutWeight[F];
	const double CallsToSucc = InWeight[Succ];
	const double CallsPredSucc = Arc.weight();

	// Probability that the first chain is calling the second one
	const double ProbOut =
	CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0;
	assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect out-probability");

	// Probability that the second chain is called From the first one
	const double ProbIn = CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0;
	assert(0.0 <= ProbIn && ProbIn <= 1.0 && "incorrect in-probability");

	if (std::min(ProbOut, ProbIn) >= opts::MergeProbability)
	ArcsToMerge.push_back(&Arc);
	}
	}

	// Sort the pairs by the weight in reverse order
	llvm::sort(ArcsToMerge, [](const Arc L, const Arc R) {
	return L->weight() > R->weight();
	});

	// Merge the pairs of chains
	for (const Arc *Arc : ArcsToMerge) {
	Chain *ChainPred = NodeChain[Arc->src()];
	Chain *ChainSucc = NodeChain[Arc->dst()];
	if (ChainPred == ChainSucc)
	continue;
	if (ChainPred->Nodes.back() == Arc->src() &&
	ChainSucc->Nodes.front() == Arc->dst())
	mergeChains(ChainPred, ChainSucc);
	}
	}

	/// Run the second optimization pass of the hfsort+ algorithm:
	/// Merge pairs of chains while there is an improvement in the
	/// expected cache miss ratio.
	void runPassTwo() {
	// Creating a priority queue containing all edges ordered by the merge gain
	auto GainComparator = [](Edge L, Edge R) {
	if (std::abs(L->gain() - R->gain()) > 1e-8)
	return L->gain() > R->gain();

	// Making sure the comparison is deterministic
	if (L->predChain()->Id != R->predChain()->Id)
	return L->predChain()->Id < R->predChain()->Id;

	return L->succChain()->Id < R->succChain()->Id;
	};
	std::set<Edge *, decltype(GainComparator)> Queue(GainComparator);

	// Inserting the edges Into the queue
	for (Chain *ChainPred : HotChains) {
	for (auto EdgeIt : ChainPred->Edges) {
	Chain *ChainSucc = EdgeIt.first;
	Edge *ChainEdge = EdgeIt.second;
	// Ignore loop edges
	if (ChainPred == ChainSucc)
	continue;
	// Ignore already processed edges
	if (ChainEdge->gain() != -1.0)
	continue;

	// Compute the gain of merging the two chains
	auto ForwardGain = mergeGain(ChainPred, ChainSucc, ChainEdge);
	auto BackwardGain = mergeGain(ChainSucc, ChainPred, ChainEdge);
	ChainEdge->setMergeGain(ChainPred, ForwardGain, BackwardGain);
	if (ChainEdge->gain() > 0.0)
	Queue.insert(ChainEdge);
	}
	}

	// Merge the chains while the gain of merging is positive
	while (!Queue.empty()) {
	// Extract the best (top) edge for merging
	Edge It = Queue.begin();
	Queue.erase(Queue.begin());
	Edge *BestEdge = It;
	Chain *BestChainPred = BestEdge->predChain();
	Chain *BestChainSucc = BestEdge->succChain();
	if (BestChainPred == BestChainSucc \|\| BestEdge->gain() <= 0.0)
	continue;

	// Remove outdated edges
	for (std::pair<Chain , Edge > EdgeIt : BestChainPred->Edges)
	Queue.erase(EdgeIt.second);
	for (std::pair<Chain , Edge > EdgeIt : BestChainSucc->Edges)
	Queue.erase(EdgeIt.second);

	// Merge the best pair of chains
	mergeChains(BestChainPred, BestChainSucc);

	// Insert newly created edges Into the queue
	for (auto EdgeIt : BestChainPred->Edges) {
	Chain *ChainSucc = EdgeIt.first;
	Edge *ChainEdge = EdgeIt.second;
	// Ignore loop edges
	if (BestChainPred == ChainSucc)
	continue;

	// Compute the gain of merging the two chains
	auto ForwardGain = mergeGain(BestChainPred, ChainSucc, ChainEdge);
	auto BackwardGain = mergeGain(ChainSucc, BestChainPred, ChainEdge);
	ChainEdge->setMergeGain(BestChainPred, ForwardGain, BackwardGain);
	if (ChainEdge->gain() > 0.0)
	Queue.insert(ChainEdge);
	}
	}
	}

	/// Merge chain From into chain Into and update the list of active chains.
	void mergeChains(Chain Into, Chain From) {
	assert(Into != From && "cannot merge a chain with itself");
	Into->merge(From);

	// Update the chains and addresses for functions merged from From
	size_t CurAddr = 0;
	for (NodeId F : Into->Nodes) {
	NodeChain[F] = Into;
	Addr[F] = CurAddr;
	CurAddr += Cg.size(F);
	}

	// Merge edges
	Into->mergeEdges(From);
	From->clear();

	// Update cached scores for the new chain
	Into->ShortCalls = shortCalls(Into);
	Into->Score = score(Into);

	// Remove chain From From the list of active chains
	llvm::erase_value(HotChains, From);
	}

	private:
	// The call graph
	const CallGraph &Cg;

	// All chains of functions
	std::vector<Chain> AllChains;

	// Active chains. The vector gets updated at runtime when chains are merged
	std::vector<Chain *> HotChains;

	// All edges between chains
	std::vector<Edge> AllEdges;

	// Node_id => chain
	std::vector<Chain *> NodeChain;

	// Current address of the function From the beginning of its chain
	std::vector<uint64_t> Addr;

	// Total weight of outgoing arcs for each function
	std::vector<double> OutWeight;

	// Total weight of incoming arcs for each function
	std::vector<double> InWeight;
	// The total number of samples in the graph
	double TotalSamples{0};
	};

	} // end anonymous namespace

	std::vector<Cluster> hfsortPlus(CallGraph &Cg) {
	// It is required that the sum of incoming arc weights is not greater
	// than the number of samples for every function.
	// Ensuring the call graph obeys the property before running the algorithm.
	Cg.adjustArcWeights();
	return HFSortPlus(Cg).run();
	}

	} // namespace bolt
	} // namespace llvm