llvm/lib/ProfileData/MemProfRadixTree.cpp - llvm-project - Git at Google

 //===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 // This file contains logic that implements a space efficient radix tree
 // encoding for callstacks used by MemProf.
 //
 //===----------------------------------------------------------------------===//

 #include "llvm/ProfileData/MemProfRadixTree.h"

 namespace llvm {
 namespace memprof {
 // Encode a call stack into RadixArray.  Return the starting index within
 // RadixArray.  For each call stack we encode, we emit two or three components
 // into RadixArray.  If a given call stack doesn't have a common prefix relative
 // to the previous one, we emit:
 //
 // - the frames in the given call stack in the root-to-leaf order
 //
 // - the length of the given call stack
 //
 // If a given call stack has a non-empty common prefix relative to the previous
 // one, we emit:
 //
 // - the relative location of the common prefix, encoded as a negative number.
 //
 // - a portion of the given call stack that's beyond the common prefix
 //
 // - the length of the given call stack, including the length of the common
 //   prefix.
 //
 // The resulting RadixArray requires a somewhat unintuitive backward traversal
 // to reconstruct a call stack -- read the call stack length and scan backward
 // while collecting frames in the leaf to root order.  build, the caller of this
 // function, reverses RadixArray in place so that we can reconstruct a call
 // stack as if we were deserializing an array in a typical way -- the call stack
 // length followed by the frames in the leaf-to-root order except that we need
 // to handle pointers to parents along the way.
 //
 // To quickly determine the location of the common prefix within RadixArray,
 // Indexes caches the indexes of the previous call stack's frames within
 // RadixArray.
 template <typename FrameIdTy>
 LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
     const llvm::SmallVector<FrameIdTy> *CallStack,
     const llvm::SmallVector<FrameIdTy> *Prev,
     const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) {
   // Compute the length of the common root prefix between Prev and CallStack.
   uint32_t CommonLen = 0;
   if (Prev) {
     auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
                              CallStack->rend());
     CommonLen = std::distance(CallStack->rbegin(), Pos.second);
   }

   // Drop the portion beyond CommonLen.
   assert(CommonLen <= Indexes.size());
   Indexes.resize(CommonLen);

   // Append a pointer to the parent.
   if (CommonLen) {
     uint32_t CurrentIndex = RadixArray.size();
     uint32_t ParentIndex = Indexes.back();
     // The offset to the parent must be negative because we are pointing to an
     // element we've already added to RadixArray.
     assert(ParentIndex < CurrentIndex);
     RadixArray.push_back(ParentIndex - CurrentIndex);
   }

   // Copy the part of the call stack beyond the common prefix to RadixArray.
   assert(CommonLen <= CallStack->size());
   for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
     // Remember the index of F in RadixArray.
     Indexes.push_back(RadixArray.size());
     RadixArray.push_back(
         MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
   }
   assert(CallStack->size() == Indexes.size());

   // End with the call stack length.
   RadixArray.push_back(CallStack->size());

   // Return the index within RadixArray where we can start reconstructing a
   // given call stack from.
   return RadixArray.size() - 1;
 }

 template <typename FrameIdTy>
 void CallStackRadixTreeBuilder<FrameIdTy>::build(
     llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
         &&MemProfCallStackData,
     const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes,
     llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
   // Take the vector portion of MemProfCallStackData.  The vector is exactly
   // what we need to sort.  Also, we no longer need its lookup capability.
   llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();

   // Return early if we have no work to do.
   if (CallStacks.empty()) {
     RadixArray.clear();
     CallStackPos.clear();
     return;
   }

   // Sorting the list of call stacks in the dictionary order is sufficient to
   // maximize the length of the common prefix between two adjacent call stacks
   // and thus minimize the length of RadixArray.  However, we go one step
   // further and try to reduce the number of times we follow pointers to parents
   // during deserilization.  Consider a poorly encoded radix tree:
   //
   // CallStackId 1:  f1 -> f2 -> f3
   //                  |
   // CallStackId 2:   +--- f4 -> f5
   //                        |
   // CallStackId 3:         +--> f6
   //
   // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
   // Once we encode CallStackId 1 into RadixArray, every other call stack with
   // common prefix f1 ends up pointing to CallStackId 1.  Since CallStackId 3
   // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
   // parents twice.
   //
   // We try to alleviate the situation by sorting the list of call stacks by
   // comparing the popularity of frames rather than the integer values of
   // FrameIds.  In the example above, f4 is more popular than f2, so we sort the
   // call stacks and encode them as:
   //
   // CallStackId 2:  f1 -- f4 -> f5
   //                  |     |
   // CallStackId 3:   |     +--> f6
   //                  |
   // CallStackId 1:   +--> f2 -> f3
   //
   // Notice that CallStackId 3 follows a pointer to a parent only once.
   //
   // All this is a quick-n-dirty trick to reduce the number of jumps.  The
   // proper way would be to compute the weight of each radix tree node -- how
   // many call stacks use a given radix tree node, and encode a radix tree from
   // the heaviest node first.  We do not do so because that's a lot of work.
   llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
     // Call stacks are stored from leaf to root.  Perform comparisons from the
     // root.
     return std::lexicographical_compare(
         L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
         [&](FrameIdTy F1, FrameIdTy F2) {
           uint64_t H1 = FrameHistogram[F1].Count;
           uint64_t H2 = FrameHistogram[F2].Count;
           // Popular frames should come later because we encode call stacks from
           // the last one in the list.
           if (H1 != H2)
             return H1 < H2;
           // For sort stability.
           return F1 < F2;
         });
   });

   // Reserve some reasonable amount of storage.
   RadixArray.clear();
   RadixArray.reserve(CallStacks.size() * 8);

   // Indexes will grow as long as the longest call stack.
   Indexes.clear();
   Indexes.reserve(512);

   // CallStackPos will grow to exactly CallStacks.size() entries.
   CallStackPos.clear();
   CallStackPos.reserve(CallStacks.size());

   // Compute the radix array.  We encode one call stack at a time, computing the
   // longest prefix that's shared with the previous call stack we encode.  For
   // each call stack we encode, we remember a mapping from CallStackId to its
   // position within RadixArray.
   //
   // As an optimization, we encode from the last call stack in CallStacks to
   // reduce the number of times we follow pointers to the parents.  Consider the
   // list of call stacks that has been sorted in the dictionary order:
   //
   // Call Stack 1: F1
   // Call Stack 2: F1 -> F2
   // Call Stack 3: F1 -> F2 -> F3
   //
   // If we traversed CallStacks in the forward order, we would end up with a
   // radix tree like:
   //
   // Call Stack 1:  F1
   //                |
   // Call Stack 2:  +---> F2
   //                      |
   // Call Stack 3:        +---> F3
   //
   // Notice that each call stack jumps to the previous one.  However, if we
   // traverse CallStacks in the reverse order, then Call Stack 3 has the
   // complete call stack encoded without any pointers.  Call Stack 1 and 2 point
   // to appropriate prefixes of Call Stack 3.
   const llvm::SmallVector<FrameIdTy> *Prev = nullptr;
   for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
     LinearCallStackId Pos =
         encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
     CallStackPos.insert({CSId, Pos});
     Prev = &CallStack;
   }

   // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
   assert(!RadixArray.empty());

   // Reverse the radix array in place.  We do so mostly for intuitive
   // deserialization where we would read the length field and then the call
   // stack frames proper just like any other array deserialization, except
   // that we have occasional jumps to take advantage of prefixes.
   for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
     std::swap(RadixArray[I], RadixArray[J]);

   // "Reverse" the indexes stored in CallStackPos.
   for (auto &[K, V] : CallStackPos)
     V = RadixArray.size() - 1 - V;
 }

 // Explicitly instantiate class with the utilized FrameIdTy.
 template class CallStackRadixTreeBuilder<FrameId>;
 template class CallStackRadixTreeBuilder<LinearFrameId>;

 template <typename FrameIdTy>
 llvm::DenseMap<FrameIdTy, FrameStat>
 computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
                           &MemProfCallStackData) {
   llvm::DenseMap<FrameIdTy, FrameStat> Histogram;

   for (const auto &KV : MemProfCallStackData) {
     const auto &CS = KV.second;
     for (unsigned I = 0, E = CS.size(); I != E; ++I) {
       auto &S = Histogram[CS[I]];
       ++S.Count;
       S.PositionSum += I;
     }
   }
   return Histogram;
 }

 // Explicitly instantiate function with the utilized FrameIdTy.
 template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>(
     llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
         &MemProfCallStackData);
 template llvm::DenseMap<LinearFrameId, FrameStat>
 computeFrameHistogram<LinearFrameId>(
     llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
         &MemProfCallStackData);
 } // namespace memprof
 } // namespace llvm
	//===- MemProfRadixTree.cpp - Radix tree encoded callstacks ---------------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	// This file contains logic that implements a space efficient radix tree
	// encoding for callstacks used by MemProf.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ProfileData/MemProfRadixTree.h"

	namespace llvm {
	namespace memprof {
	// Encode a call stack into RadixArray. Return the starting index within
	// RadixArray. For each call stack we encode, we emit two or three components
	// into RadixArray. If a given call stack doesn't have a common prefix relative
	// to the previous one, we emit:
	//
	// - the frames in the given call stack in the root-to-leaf order
	//
	// - the length of the given call stack
	//
	// If a given call stack has a non-empty common prefix relative to the previous
	// one, we emit:
	//
	// - the relative location of the common prefix, encoded as a negative number.
	//
	// - a portion of the given call stack that's beyond the common prefix
	//
	// - the length of the given call stack, including the length of the common
	// prefix.
	//
	// The resulting RadixArray requires a somewhat unintuitive backward traversal
	// to reconstruct a call stack -- read the call stack length and scan backward
	// while collecting frames in the leaf to root order. build, the caller of this
	// function, reverses RadixArray in place so that we can reconstruct a call
	// stack as if we were deserializing an array in a typical way -- the call stack
	// length followed by the frames in the leaf-to-root order except that we need
	// to handle pointers to parents along the way.
	//
	// To quickly determine the location of the common prefix within RadixArray,
	// Indexes caches the indexes of the previous call stack's frames within
	// RadixArray.
	template <typename FrameIdTy>
	LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
	const llvm::SmallVector<FrameIdTy> *CallStack,
	const llvm::SmallVector<FrameIdTy> *Prev,
	const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) {
	// Compute the length of the common root prefix between Prev and CallStack.
	uint32_t CommonLen = 0;
	if (Prev) {
	auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
	CallStack->rend());
	CommonLen = std::distance(CallStack->rbegin(), Pos.second);
	}

	// Drop the portion beyond CommonLen.
	assert(CommonLen <= Indexes.size());
	Indexes.resize(CommonLen);

	// Append a pointer to the parent.
	if (CommonLen) {
	uint32_t CurrentIndex = RadixArray.size();
	uint32_t ParentIndex = Indexes.back();
	// The offset to the parent must be negative because we are pointing to an
	// element we've already added to RadixArray.
	assert(ParentIndex < CurrentIndex);
	RadixArray.push_back(ParentIndex - CurrentIndex);
	}

	// Copy the part of the call stack beyond the common prefix to RadixArray.
	assert(CommonLen <= CallStack->size());
	for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
	// Remember the index of F in RadixArray.
	Indexes.push_back(RadixArray.size());
	RadixArray.push_back(
	MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
	}
	assert(CallStack->size() == Indexes.size());

	// End with the call stack length.
	RadixArray.push_back(CallStack->size());

	// Return the index within RadixArray where we can start reconstructing a
	// given call stack from.
	return RadixArray.size() - 1;
	}

	template <typename FrameIdTy>
	void CallStackRadixTreeBuilder<FrameIdTy>::build(
	llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
	&&MemProfCallStackData,
	const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes,
	llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
	// Take the vector portion of MemProfCallStackData. The vector is exactly
	// what we need to sort. Also, we no longer need its lookup capability.
	llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();

	// Return early if we have no work to do.
	if (CallStacks.empty()) {
	RadixArray.clear();
	CallStackPos.clear();
	return;
	}

	// Sorting the list of call stacks in the dictionary order is sufficient to
	// maximize the length of the common prefix between two adjacent call stacks
	// and thus minimize the length of RadixArray. However, we go one step
	// further and try to reduce the number of times we follow pointers to parents
	// during deserilization. Consider a poorly encoded radix tree:
	//
	// CallStackId 1: f1 -> f2 -> f3
	// \|
	// CallStackId 2: +--- f4 -> f5
	// \|
	// CallStackId 3: +--> f6
	//
	// Here, f2 and f4 appear once and twice, respectively, in the call stacks.
	// Once we encode CallStackId 1 into RadixArray, every other call stack with
	// common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
	// share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
	// parents twice.
	//
	// We try to alleviate the situation by sorting the list of call stacks by
	// comparing the popularity of frames rather than the integer values of
	// FrameIds. In the example above, f4 is more popular than f2, so we sort the
	// call stacks and encode them as:
	//
	// CallStackId 2: f1 -- f4 -> f5
	// \| \|
	// CallStackId 3: \| +--> f6
	// \|
	// CallStackId 1: +--> f2 -> f3
	//
	// Notice that CallStackId 3 follows a pointer to a parent only once.
	//
	// All this is a quick-n-dirty trick to reduce the number of jumps. The
	// proper way would be to compute the weight of each radix tree node -- how
	// many call stacks use a given radix tree node, and encode a radix tree from
	// the heaviest node first. We do not do so because that's a lot of work.
	llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
	// Call stacks are stored from leaf to root. Perform comparisons from the
	// root.
	return std::lexicographical_compare(
	L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
	[&](FrameIdTy F1, FrameIdTy F2) {
	uint64_t H1 = FrameHistogram[F1].Count;
	uint64_t H2 = FrameHistogram[F2].Count;
	// Popular frames should come later because we encode call stacks from
	// the last one in the list.
	if (H1 != H2)
	return H1 < H2;
	// For sort stability.
	return F1 < F2;
	});
	});

	// Reserve some reasonable amount of storage.
	RadixArray.clear();
	RadixArray.reserve(CallStacks.size() * 8);

	// Indexes will grow as long as the longest call stack.
	Indexes.clear();
	Indexes.reserve(512);

	// CallStackPos will grow to exactly CallStacks.size() entries.
	CallStackPos.clear();
	CallStackPos.reserve(CallStacks.size());

	// Compute the radix array. We encode one call stack at a time, computing the
	// longest prefix that's shared with the previous call stack we encode. For
	// each call stack we encode, we remember a mapping from CallStackId to its
	// position within RadixArray.
	//
	// As an optimization, we encode from the last call stack in CallStacks to
	// reduce the number of times we follow pointers to the parents. Consider the
	// list of call stacks that has been sorted in the dictionary order:
	//
	// Call Stack 1: F1
	// Call Stack 2: F1 -> F2
	// Call Stack 3: F1 -> F2 -> F3
	//
	// If we traversed CallStacks in the forward order, we would end up with a
	// radix tree like:
	//
	// Call Stack 1: F1
	// \|
	// Call Stack 2: +---> F2
	// \|
	// Call Stack 3: +---> F3
	//
	// Notice that each call stack jumps to the previous one. However, if we
	// traverse CallStacks in the reverse order, then Call Stack 3 has the
	// complete call stack encoded without any pointers. Call Stack 1 and 2 point
	// to appropriate prefixes of Call Stack 3.
	const llvm::SmallVector<FrameIdTy> *Prev = nullptr;
	for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
	LinearCallStackId Pos =
	encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
	CallStackPos.insert({CSId, Pos});
	Prev = &CallStack;
	}

	// "RadixArray.size() - 1" below is problematic if RadixArray is empty.
	assert(!RadixArray.empty());

	// Reverse the radix array in place. We do so mostly for intuitive
	// deserialization where we would read the length field and then the call
	// stack frames proper just like any other array deserialization, except
	// that we have occasional jumps to take advantage of prefixes.
	for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
	std::swap(RadixArray[I], RadixArray[J]);

	// "Reverse" the indexes stored in CallStackPos.
	for (auto &[K, V] : CallStackPos)
	V = RadixArray.size() - 1 - V;
	}

	// Explicitly instantiate class with the utilized FrameIdTy.
	template class CallStackRadixTreeBuilder<FrameId>;
	template class CallStackRadixTreeBuilder<LinearFrameId>;

	template <typename FrameIdTy>
	llvm::DenseMap<FrameIdTy, FrameStat>
	computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
	&MemProfCallStackData) {
	llvm::DenseMap<FrameIdTy, FrameStat> Histogram;

	for (const auto &KV : MemProfCallStackData) {
	const auto &CS = KV.second;
	for (unsigned I = 0, E = CS.size(); I != E; ++I) {
	auto &S = Histogram[CS[I]];
	++S.Count;
	S.PositionSum += I;
	}
	}
	return Histogram;
	}

	// Explicitly instantiate function with the utilized FrameIdTy.
	template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>(
	llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
	&MemProfCallStackData);
	template llvm::DenseMap<LinearFrameId, FrameStat>
	computeFrameHistogram<LinearFrameId>(
	llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
	&MemProfCallStackData);
	} // namespace memprof
	} // namespace llvm