blob: b19d3670d34ccdc2760e22125e496945868665a8 [file] [log] [blame]
//===- BPSectionOrdererBase.inc ---------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the common BPSectionOrderer interface using the Curiously
// Recurring Template Pattern and dispatches to the BalancedPartitioning
// algorithm implemented in LLVMSupport. The optimized section layout attempts
// to group similar sections together (resulting in a smaller compressed app
// size) and utilize a temporal profile file to reduce page faults during
// program startup.
//
// Clients should derive from BPOrderer, providing concrete implementations for
// section and symbol representations. Include this file in a .cpp file to
// specialize the template for the derived class.
//
//===----------------------------------------------------------------------===//
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/CachedHashString.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/VirtualFileSystem.h"
#include <memory>
#include <optional>
#include <set>
#define DEBUG_TYPE "bp-section-orderer"
using namespace llvm;
using namespace lld;
namespace lld {
template <class D> struct BPOrdererTraits;
template <class D> struct BPOrderer {
using Section = typename BPOrdererTraits<D>::Section;
using Defined = typename BPOrdererTraits<D>::Defined;
// Compute a section order using the Balanced Partitioning algorithm.
//
// * for*Compresion: Improve Lempel-Ziv compression by grouping
// similar sections together.
// * profilePath: Utilize a temporal profile file to reduce page faults during
// program startup.
// * compressionSortStartupFunctions: if profilePath is specified, allocate
// extra utility vertices to prioritize nearby function similarity.
auto computeOrder(llvm::StringRef profilePath, bool forFunctionCompression,
bool forDataCompression,
bool compressionSortStartupFunctions, bool verbose,
llvm::ArrayRef<Section *> sections,
const DenseMap<CachedHashStringRef, std::set<unsigned>>
&rootSymbolToSectionIdxs)
-> llvm::DenseMap<const Section *, int>;
std::optional<StringRef> static getResolvedLinkageName(StringRef name) {
return {};
}
};
} // namespace lld
using UtilityNodes = SmallVector<BPFunctionNode::UtilityNodeT>;
template <class D>
static SmallVector<std::pair<unsigned, UtilityNodes>> getUnsForCompression(
ArrayRef<const typename D::Section *> sections,
const DenseMap<const void *, uint64_t> &sectionToIdx,
ArrayRef<unsigned> sectionIdxs,
DenseMap<unsigned, SmallVector<unsigned, 0>> *duplicateSectionIdxs,
BPFunctionNode::UtilityNodeT &maxUN) {
TimeTraceScope timeScope("Build nodes for compression");
SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
sectionHashes.reserve(sectionIdxs.size());
SmallVector<uint64_t> hashes;
for (unsigned sectionIdx : sectionIdxs) {
const auto *isec = sections[sectionIdx];
D::getSectionHashes(*isec, hashes, sectionToIdx);
sectionHashes.emplace_back(sectionIdx, std::move(hashes));
hashes.clear();
}
MapVector<uint64_t, unsigned> hashFrequency;
for (auto &[sectionIdx, hashes] : sectionHashes)
for (auto hash : hashes)
++hashFrequency[hash];
if (duplicateSectionIdxs) {
// Merge sections that are nearly identical
SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
unsigned threshold = sectionHashes.size() > 10000 ? 5 : 0;
for (auto &[sectionIdx, hashes] : sectionHashes) {
uint64_t wholeHash = 0;
for (auto hash : hashes)
if (hashFrequency[hash] > threshold)
wholeHash ^= hash;
auto [it, wasInserted] =
wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
if (wasInserted) {
newSectionHashes.emplace_back(sectionIdx, hashes);
} else {
(*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx);
}
}
sectionHashes = newSectionHashes;
// Recompute hash frequencies
hashFrequency.clear();
for (auto &[sectionIdx, hashes] : sectionHashes)
for (auto hash : hashes)
++hashFrequency[hash];
}
// Filter rare and common hashes and assign each a unique utility node that
// doesn't conflict with the trace utility nodes
DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
for (auto &[hash, frequency] : hashFrequency) {
if (frequency <= 1 || frequency * 2 > sectionHashes.size())
continue;
hashToUN[hash] = ++maxUN;
}
SmallVector<std::pair<unsigned, UtilityNodes>> sectionUns;
for (auto &[sectionIdx, hashes] : sectionHashes) {
UtilityNodes uns;
for (auto &hash : hashes) {
auto it = hashToUN.find(hash);
if (it != hashToUN.end())
uns.push_back(it->second);
}
sectionUns.emplace_back(sectionIdx, uns);
}
return sectionUns;
}
/// Symbols can be appended with "(.__uniq.xxxx)?(.llvm.yyyy)?(.Tgm)?" where
/// "xxxx" and "yyyy" are numbers that could change between builds, and .Tgm is
/// the global merge functions suffix
/// (see GlobalMergeFunc::MergingInstanceSuffix). We need to use the root symbol
/// name before this suffix so these symbols can be matched with profiles which
/// may have different suffixes.
inline StringRef getRootSymbol(StringRef name) {
name.consume_back(".Tgm");
auto [P0, S0] = name.rsplit(".llvm.");
auto [P1, S1] = P0.rsplit(".__uniq.");
return P1;
}
template <class D>
auto BPOrderer<D>::computeOrder(
StringRef profilePath, bool forFunctionCompression, bool forDataCompression,
bool compressionSortStartupFunctions, bool verbose,
ArrayRef<Section *> sections,
const DenseMap<CachedHashStringRef, std::set<unsigned>>
&rootSymbolToSectionIdxs) -> DenseMap<const Section *, int> {
TimeTraceScope timeScope("Setup Balanced Partitioning");
DenseMap<const void *, uint64_t> sectionToIdx;
for (auto [i, isec] : llvm::enumerate(sections))
sectionToIdx.try_emplace(isec, i);
BPFunctionNode::UtilityNodeT maxUN = 0;
DenseMap<unsigned, UtilityNodes> startupSectionIdxUNs;
// Used to define the initial order for startup functions.
DenseMap<unsigned, size_t> sectionIdxToTimestamp;
std::unique_ptr<InstrProfReader> reader;
if (!profilePath.empty()) {
auto fs = vfs::getRealFileSystem();
auto readerOrErr = InstrProfReader::create(profilePath, *fs);
lld::checkError(readerOrErr.takeError());
reader = std::move(readerOrErr.get());
for (auto &entry : *reader) {
// Read all entries
(void)entry;
}
auto &traces = reader->getTemporalProfTraces();
DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
uint64_t currentSize = 0, cutoffSize = 1;
size_t cutoffTimestamp = 1;
auto &trace = traces[traceIdx].FunctionNameRefs;
for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
auto [_, parsedFuncName] = getParsedIRPGOName(
reader->getSymtab().getFuncOrVarName(trace[timestamp]));
parsedFuncName = getRootSymbol(parsedFuncName);
auto sectionIdxsIt =
rootSymbolToSectionIdxs.find(CachedHashStringRef(parsedFuncName));
if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
continue;
auto &sectionIdxs = sectionIdxsIt->second;
// If the same symbol is found in multiple sections, they might be
// identical, so we arbitrarily use the size from the first section.
currentSize += D::getSize(*sections[*sectionIdxs.begin()]);
// Since BalancedPartitioning is sensitive to the initial order, we need
// to explicitly define it to be ordered by earliest timestamp.
for (unsigned sectionIdx : sectionIdxs) {
auto [it, wasInserted] =
sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
if (!wasInserted)
it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
}
if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
++maxUN;
cutoffSize = 2 * currentSize;
cutoffTimestamp = 2 * cutoffTimestamp;
}
for (unsigned sectionIdx : sectionIdxs)
sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
}
for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
for (auto un = firstUN; un <= maxUN; ++un)
startupSectionIdxUNs[sectionIdx].push_back(un);
++maxUN;
sectionIdxToFirstUN.clear();
}
}
SmallVector<unsigned> sectionIdxsForFunctionCompression,
sectionIdxsForDataCompression;
for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
if (startupSectionIdxUNs.count(sectionIdx))
continue;
const auto *isec = sections[sectionIdx];
if (D::isCodeSection(*isec)) {
if (forFunctionCompression)
sectionIdxsForFunctionCompression.push_back(sectionIdx);
} else {
if (forDataCompression)
sectionIdxsForDataCompression.push_back(sectionIdx);
}
}
if (compressionSortStartupFunctions) {
SmallVector<unsigned> startupIdxs;
for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
startupIdxs.push_back(sectionIdx);
auto unsForStartupFunctionCompression =
getUnsForCompression<D>(sections, sectionToIdx, startupIdxs,
/*duplicateSectionIdxs=*/nullptr, maxUN);
for (auto &[sectionIdx, compressionUns] :
unsForStartupFunctionCompression) {
auto &uns = startupSectionIdxUNs[sectionIdx];
uns.append(compressionUns);
llvm::sort(uns);
uns.erase(std::unique(uns.begin(), uns.end()), uns.end());
}
}
// Map a section index (order directly) to a list of duplicate section indices
// (not ordered directly).
DenseMap<unsigned, SmallVector<unsigned, 0>> duplicateSectionIdxs;
auto unsForFunctionCompression = getUnsForCompression<D>(
sections, sectionToIdx, sectionIdxsForFunctionCompression,
&duplicateSectionIdxs, maxUN);
auto unsForDataCompression = getUnsForCompression<D>(
sections, sectionToIdx, sectionIdxsForDataCompression,
&duplicateSectionIdxs, maxUN);
std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
nodesForDataCompression;
for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
nodesForStartup.emplace_back(sectionIdx, uns);
for (auto &[sectionIdx, uns] : unsForFunctionCompression)
nodesForFunctionCompression.emplace_back(sectionIdx, uns);
for (auto &[sectionIdx, uns] : unsForDataCompression)
nodesForDataCompression.emplace_back(sectionIdx, uns);
// Use the first timestamp to define the initial order for startup nodes.
llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
});
// Sort compression nodes by their Id (which is the section index) because the
// input linker order tends to be not bad.
llvm::sort(nodesForFunctionCompression,
[](auto &L, auto &R) { return L.Id < R.Id; });
llvm::sort(nodesForDataCompression,
[](auto &L, auto &R) { return L.Id < R.Id; });
{
TimeTraceScope timeScope("Balanced Partitioning");
BalancedPartitioningConfig config;
BalancedPartitioning bp(config);
bp.run(nodesForStartup);
bp.run(nodesForFunctionCompression);
bp.run(nodesForDataCompression);
}
unsigned numStartupSections = 0;
unsigned numCodeCompressionSections = 0;
unsigned numDuplicateCodeSections = 0;
unsigned numDataCompressionSections = 0;
unsigned numDuplicateDataSections = 0;
SetVector<const Section *> orderedSections;
// Order startup functions,
for (auto &node : nodesForStartup) {
const auto *isec = sections[node.Id];
if (orderedSections.insert(isec))
++numStartupSections;
}
// then functions for compression,
for (auto &node : nodesForFunctionCompression) {
const auto *isec = sections[node.Id];
if (orderedSections.insert(isec))
++numCodeCompressionSections;
auto It = duplicateSectionIdxs.find(node.Id);
if (It == duplicateSectionIdxs.end())
continue;
for (auto dupSecIdx : It->getSecond()) {
const auto *dupIsec = sections[dupSecIdx];
if (orderedSections.insert(dupIsec))
++numDuplicateCodeSections;
}
}
// then data for compression.
for (auto &node : nodesForDataCompression) {
const auto *isec = sections[node.Id];
if (orderedSections.insert(isec))
++numDataCompressionSections;
auto It = duplicateSectionIdxs.find(node.Id);
if (It == duplicateSectionIdxs.end())
continue;
for (auto dupSecIdx : It->getSecond()) {
const auto *dupIsec = sections[dupSecIdx];
if (orderedSections.insert(dupIsec))
++numDuplicateDataSections;
}
}
if (verbose) {
unsigned numTotalOrderedSections =
numStartupSections + numCodeCompressionSections +
numDuplicateCodeSections + numDataCompressionSections +
numDuplicateDataSections;
dbgs()
<< "Ordered " << numTotalOrderedSections
<< " sections using balanced partitioning:\n Functions for startup: "
<< numStartupSections
<< "\n Functions for compression: " << numCodeCompressionSections
<< "\n Duplicate functions: " << numDuplicateCodeSections
<< "\n Data for compression: " << numDataCompressionSections
<< "\n Duplicate data: " << numDuplicateDataSections << "\n";
if (!profilePath.empty()) {
// Evaluate this function order for startup
StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
const uint64_t pageSize = (1 << 14);
uint64_t currentAddress = 0;
for (const auto *isec : orderedSections) {
for (auto *sym : static_cast<D *>(this)->getSymbols(*isec)) {
uint64_t startAddress = currentAddress + D::getSymValue(*sym);
uint64_t endAddress = startAddress + D::getSymSize(*sym);
uint64_t firstPage = startAddress / pageSize;
// I think the kernel might pull in a few pages when one it touched,
// so it might be more accurate to force lastPage to be aligned by
// 4?
uint64_t lastPage = endAddress / pageSize;
StringRef rootSymbol = D::getSymName(*sym);
rootSymbol = getRootSymbol(rootSymbol);
symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
if (auto resolvedLinkageName = D::getResolvedLinkageName(rootSymbol))
symbolToPageNumbers.try_emplace(resolvedLinkageName.value(),
firstPage, lastPage);
}
currentAddress += D::getSize(*isec);
}
// The area under the curve F where F(t) is the total number of page
// faults at step t.
unsigned area = 0;
for (auto &trace : reader->getTemporalProfTraces()) {
SmallSet<uint64_t, 0> touchedPages;
for (unsigned step = 0; step < trace.FunctionNameRefs.size(); step++) {
auto traceId = trace.FunctionNameRefs[step];
auto [Filename, ParsedFuncName] =
getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
ParsedFuncName = getRootSymbol(ParsedFuncName);
auto it = symbolToPageNumbers.find(ParsedFuncName);
if (it != symbolToPageNumbers.end()) {
auto &[firstPage, lastPage] = it->getValue();
for (uint64_t i = firstPage; i <= lastPage; i++)
touchedPages.insert(i);
}
area += touchedPages.size();
}
}
dbgs() << "Total area under the page fault curve: " << (float)area
<< "\n";
}
}
DenseMap<const Section *, int> sectionPriorities;
int prio = -orderedSections.size();
for (const auto *isec : orderedSections)
sectionPriorities[isec] = prio++;
return sectionPriorities;
}