[𝘀𝗽𝗿] changes to main this commit is based on
Created using spr 1.3.4
[skip ci]
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index a529985..e82b857 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -386,8 +386,8 @@
/// Profile match ratio.
float ProfileMatchRatio{0.0f};
- /// Raw branch count for this function in the profile.
- uint64_t RawBranchCount{0};
+ /// Raw sample/branch count for this function in the profile.
+ uint64_t RawSampleCount{0};
/// Dynamically executed function bytes, used for density computation.
uint64_t SampleCountInBytes{0};
@@ -1880,13 +1880,12 @@
/// Return COUNT_NO_PROFILE if there's no profile info.
uint64_t getExecutionCount() const { return ExecutionCount; }
- /// Return the raw profile information about the number of branch
- /// executions corresponding to this function.
- uint64_t getRawBranchCount() const { return RawBranchCount; }
+ /// Return the raw profile information about the number of samples (basic
+ /// profile) or branch executions (branch profile) recorded in this function.
+ uint64_t getRawSampleCount() const { return RawSampleCount; }
- /// Set the profile data about the number of branch executions corresponding
- /// to this function.
- void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; }
+ /// Set raw count of samples or branches recorded in this function.
+ void setRawSampleCount(uint64_t Count) { RawSampleCount = Count; }
/// Return the number of dynamically executed bytes, from raw perf data.
uint64_t getSampleCountInBytes() const { return SampleCountInBytes; }
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 79a9186..ff2ea35 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -197,10 +197,6 @@
BoltAddressTranslation *BAT{nullptr};
- /// Whether pre-aggregated profile needs to convert branch profile into call
- /// to continuation fallthrough profile.
- bool NeedsConvertRetProfileToCallCont{false};
-
/// Update function execution profile with a recorded trace.
/// A trace is region of code executed between two LBR entries supplied in
/// execution order.
@@ -285,8 +281,8 @@
/// everything
bool hasData() const { return !ParsingBuf.empty(); }
- /// Print heat map based on LBR samples.
- std::error_code printLBRHeatMap();
+ /// Print heat map based on collected samples.
+ std::error_code printHeatMap();
/// Parse a single perf sample containing a PID associated with a sequence of
/// LBR entries. If the PID does not correspond to the binary we are looking
diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h
index 314dcc9..a7a0933 100644
--- a/bolt/include/bolt/Profile/DataReader.h
+++ b/bolt/include/bolt/Profile/DataReader.h
@@ -252,6 +252,9 @@
/// Get the number of samples recorded in [Start, End)
uint64_t getSamples(uint64_t Start, uint64_t End) const;
+ /// Returns the total number of samples recorded in this function.
+ uint64_t getSamples() const;
+
/// Aggregation helper
DenseMap<uint64_t, size_t> Index;
diff --git a/bolt/include/bolt/Profile/Heatmap.h b/bolt/include/bolt/Profile/Heatmap.h
index 74d7eed..fc1e2cd 100644
--- a/bolt/include/bolt/Profile/Heatmap.h
+++ b/bolt/include/bolt/Profile/Heatmap.h
@@ -57,9 +57,9 @@
}
- /// Register a single sample at \p Address.
- void registerAddress(uint64_t Address) {
+ /// Register \p Count samples at \p Address.
+ void registerAddress(uint64_t Address, uint64_t Count) {
if (!ignoreAddress(Address))
- ++Map[Address / BucketSize];
+ Map[Address / BucketSize] += Count;
}
/// Register \p Count samples at [\p StartAddress, \p EndAddress ].
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 4624aba..fea5101 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -471,7 +471,7 @@
OS << "\n Image : 0x" << Twine::utohexstr(getImageAddress());
if (ExecutionCount != COUNT_NO_PROFILE) {
OS << "\n Exec Count : " << ExecutionCount;
- OS << "\n Branch Count: " << RawBranchCount;
+ OS << "\n Sample Count: " << RawSampleCount;
OS << "\n Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f);
}
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d8628c6..420ffc8 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1445,7 +1445,7 @@
if (!Function.hasProfile())
continue;
- uint64_t SampleCount = Function.getRawBranchCount();
+ uint64_t SampleCount = Function.getRawSampleCount();
TotalSampleCount += SampleCount;
if (Function.hasValidProfile()) {
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index a8a1879..da260e0 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -499,18 +499,15 @@
filterBinaryMMapInfo();
prepareToParse("events", MainEventsPPI, ErrorCallback);
+ if (opts::BasicAggregation ? parseBasicEvents() : parseBranchEvents())
+ errs() << "PERF2BOLT: failed to parse samples\n";
+
if (opts::HeatmapMode) {
- if (std::error_code EC = printLBRHeatMap()) {
- errs() << "ERROR: failed to print heat map: " << EC.message() << '\n';
- exit(1);
- }
+ if (std::error_code EC = printHeatMap())
+ return errorCodeToError(EC);
exit(0);
}
- if ((!opts::BasicAggregation && parseBranchEvents()) ||
- (opts::BasicAggregation && parseBasicEvents()))
- errs() << "PERF2BOLT: failed to parse samples\n";
-
// Special handling for memory events
if (prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback))
return Error::success();
@@ -567,15 +564,14 @@
processMemEvents();
// Mark all functions with registered events as having a valid profile.
- const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE
- : BinaryFunction::PF_LBR;
for (auto &BFI : BC.getBinaryFunctions()) {
BinaryFunction &BF = BFI.second;
- FuncBranchData *FBD = getBranchData(BF);
- if (FBD || getFuncSampleData(BF.getNames())) {
- BF.markProfiled(Flags);
- if (FBD)
- BF.RawBranchCount = FBD->getNumExecutedBranches();
+ if (FuncBranchData *FBD = getBranchData(BF)) {
+ BF.markProfiled(BinaryFunction::PF_LBR);
+ BF.RawSampleCount = FBD->getNumExecutedBranches();
+ } else if (FuncSampleData *FSD = getFuncSampleData(BF.getNames())) {
+ BF.markProfiled(BinaryFunction::PF_SAMPLE);
+ BF.RawSampleCount = FSD->getSamples();
}
}
@@ -632,10 +628,18 @@
bool DataAggregator::doSample(BinaryFunction &OrigFunc, uint64_t Address,
uint64_t Count) {
+ // To record executed bytes, use basic block size as is regardless of BAT.
+ uint64_t BlockSize = 0;
+ if (BinaryBasicBlock *BB = OrigFunc.getBasicBlockContainingOffset(
+ Address - OrigFunc.getAddress()))
+ BlockSize = BB->getOriginalSize();
+
BinaryFunction *ParentFunc = getBATParentFunction(OrigFunc);
BinaryFunction &Func = ParentFunc ? *ParentFunc : OrigFunc;
- if (ParentFunc || (BAT && !BAT->isBATFunction(OrigFunc.getAddress())))
+ if (ParentFunc || (BAT && !BAT->isBATFunction(Func.getAddress())))
NumColdSamples += Count;
+ // Attach executed bytes to parent function in case of cold fragment.
+ Func.SampleCountInBytes += Count * BlockSize;
auto I = NamesToSamples.find(Func.getOneName());
if (I == NamesToSamples.end()) {
@@ -720,23 +724,6 @@
: isReturn(Func.disassembleInstructionAtOffset(Offset));
};
- // Returns whether \p Offset in \p Func may be a call continuation excluding
- // entry points and landing pads.
- auto checkCallCont = [&](const BinaryFunction &Func, const uint64_t Offset) {
- // No call continuation at a function start.
- if (!Offset)
- return false;
-
- // FIXME: support BAT case where the function might be in empty state
- // (split fragments declared non-simple).
- if (!Func.hasCFG())
- return false;
-
- // The offset should not be an entry point or a landing pad.
- const BinaryBasicBlock *ContBB = Func.getBasicBlockAtOffset(Offset);
- return ContBB && !ContBB->isEntryPoint() && !ContBB->isLandingPad();
- };
-
// Mutates \p Addr to an offset into the containing function, performing BAT
// offset translation and parent lookup.
//
@@ -749,8 +736,7 @@
Addr -= Func->getAddress();
- bool IsRetOrCallCont =
- IsFrom ? checkReturn(*Func, Addr) : checkCallCont(*Func, Addr);
+ bool IsRet = IsFrom && checkReturn(*Func, Addr);
if (BAT)
Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
@@ -761,24 +747,16 @@
NumColdSamples += Count;
if (!ParentFunc)
- return std::pair{Func, IsRetOrCallCont};
+ return std::pair{Func, IsRet};
- return std::pair{ParentFunc, IsRetOrCallCont};
+ return std::pair{ParentFunc, IsRet};
};
- uint64_t ToOrig = To;
auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true);
- auto [ToFunc, IsCallCont] = handleAddress(To, /*IsFrom*/ false);
+ auto [ToFunc, _] = handleAddress(To, /*IsFrom*/ false);
if (!FromFunc && !ToFunc)
return false;
- // Record call to continuation trace.
- if (NeedsConvertRetProfileToCallCont && FromFunc != ToFunc &&
- (IsReturn || IsCallCont)) {
- LBREntry First{ToOrig - 1, ToOrig - 1, false};
- LBREntry Second{ToOrig, ToOrig, false};
- return doTrace(First, Second, Count);
- }
// Ignore returns.
if (IsReturn)
return true;
@@ -1235,21 +1213,14 @@
ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator);
if (std::error_code EC = TypeOrErr.getError())
return EC;
- // Pre-aggregated profile with branches and fallthroughs needs to convert
- // return profile into call to continuation fall-through.
- auto Type = AggregatedLBREntry::BRANCH;
- if (TypeOrErr.get() == "B") {
- NeedsConvertRetProfileToCallCont = true;
+ auto Type = AggregatedLBREntry::TRACE;
+ if (LLVM_LIKELY(TypeOrErr.get() == "T")) {
+ } else if (TypeOrErr.get() == "B") {
Type = AggregatedLBREntry::BRANCH;
} else if (TypeOrErr.get() == "F") {
- NeedsConvertRetProfileToCallCont = true;
Type = AggregatedLBREntry::FT;
} else if (TypeOrErr.get() == "f") {
- NeedsConvertRetProfileToCallCont = true;
Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
- } else if (TypeOrErr.get() == "T") {
- // Trace is expanded into B and [Ff]
- Type = AggregatedLBREntry::TRACE;
} else {
reportError("expected T, B, F or f");
return make_error_code(llvm::errc::io_error);
@@ -1323,7 +1294,7 @@
(LBR.From >= KernelBaseAddr || LBR.To >= KernelBaseAddr);
}
-std::error_code DataAggregator::printLBRHeatMap() {
+std::error_code DataAggregator::printHeatMap() {
outs() << "PERF2BOLT: parse branch events...\n";
NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
TimerGroupDesc, opts::TimeAggregator);
@@ -1334,53 +1305,6 @@
}
Heatmap HM(opts::HeatmapBlock, opts::HeatmapMinAddress,
opts::HeatmapMaxAddress, getTextSections(BC));
- uint64_t NumTotalSamples = 0;
-
- if (opts::BasicAggregation) {
- while (hasData()) {
- ErrorOr<PerfBasicSample> SampleRes = parseBasicSample();
- if (std::error_code EC = SampleRes.getError()) {
- if (EC == errc::no_such_process)
- continue;
- return EC;
- }
- PerfBasicSample &Sample = SampleRes.get();
- HM.registerAddress(Sample.PC);
- NumTotalSamples++;
- }
- outs() << "HEATMAP: read " << NumTotalSamples << " basic samples\n";
- } else {
- while (hasData()) {
- ErrorOr<PerfBranchSample> SampleRes = parseBranchSample();
- if (std::error_code EC = SampleRes.getError()) {
- if (EC == errc::no_such_process)
- continue;
- return EC;
- }
-
- PerfBranchSample &Sample = SampleRes.get();
-
- // LBRs are stored in reverse execution order. NextLBR refers to the next
- // executed branch record.
- const LBREntry *NextLBR = nullptr;
- for (const LBREntry &LBR : Sample.LBR) {
- if (NextLBR) {
- // Record fall-through trace.
- const uint64_t TraceFrom = LBR.To;
- const uint64_t TraceTo = NextLBR->From;
- ++FallthroughLBRs[Trace(TraceFrom, TraceTo)].InternCount;
- }
- NextLBR = &LBR;
- }
- if (!Sample.LBR.empty()) {
- HM.registerAddress(Sample.LBR.front().To);
- HM.registerAddress(Sample.LBR.back().From);
- }
- NumTotalSamples += Sample.LBR.size();
- }
- outs() << "HEATMAP: read " << NumTotalSamples << " LBR samples\n";
- outs() << "HEATMAP: " << FallthroughLBRs.size() << " unique traces\n";
- }
if (!NumTotalSamples) {
if (opts::BasicAggregation) {
@@ -1396,10 +1320,14 @@
outs() << "HEATMAP: building heat map...\n";
- for (const auto &LBR : FallthroughLBRs) {
- const Trace &Trace = LBR.first;
- const FTInfo &Info = LBR.second;
- HM.registerAddressRange(Trace.From, Trace.To, Info.InternCount);
+ if (opts::BasicAggregation) {
+ for (const auto &[PC, Hits] : BasicSamples)
+ HM.registerAddress(PC, Hits);
+ } else {
+ for (const auto &[Trace, Info] : FallthroughLBRs)
+ HM.registerAddressRange(Trace.From, Trace.To, Info.InternCount);
+ for (const auto &[Trace, Info] : BranchLBRs)
+ HM.registerAddress(Trace.From, Info.TakenCount);
}
if (HM.getNumInvalidRanges())
diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp
index f2e999b..4a92c9e 100644
--- a/bolt/lib/Profile/DataReader.cpp
+++ b/bolt/lib/Profile/DataReader.cpp
@@ -128,6 +128,13 @@
return Result;
}
+uint64_t FuncSampleData::getSamples() const {
+ uint64_t Result = 0;
+ for (const SampleInfo &I : Data)
+ Result += I.Hits;
+ return Result;
+}
+
void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) {
auto Iter = Index.find(Offset);
if (Iter == Index.end()) {
@@ -407,12 +414,12 @@
FuncBranchData *FBD = getBranchData(BF);
if (FBD) {
BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD);
- BF.RawBranchCount = FBD->getNumExecutedBranches();
+ BF.RawSampleCount = FBD->getNumExecutedBranches();
if (BF.ProfileMatchRatio == 1.0f) {
if (fetchProfileForOtherEntryPoints(BF)) {
BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD);
BF.ExecutionCount = FBD->ExecutionCount;
- BF.RawBranchCount = FBD->getNumExecutedBranches();
+ BF.RawSampleCount = FBD->getNumExecutedBranches();
}
return;
}
diff --git a/bolt/lib/Profile/Heatmap.cpp b/bolt/lib/Profile/Heatmap.cpp
index 5fc3e06..74ec885 100644
--- a/bolt/lib/Profile/Heatmap.cpp
+++ b/bolt/lib/Profile/Heatmap.cpp
@@ -43,7 +43,8 @@
}
+ // [StartAddress, EndAddress] is inclusive: count EndAddress's bucket too.
for (uint64_t Bucket = StartAddress / BucketSize;
Bucket <= EndAddress / BucketSize; ++Bucket)
Map[Bucket] += Count;
}
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index f5636bf..88b806c 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -177,11 +177,11 @@
BF.setExecutionCount(YamlBF.ExecCount);
- uint64_t FuncRawBranchCount = 0;
+ uint64_t FuncRawSampleCount = 0;
for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks)
for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors)
- FuncRawBranchCount += YamlSI.Count;
- BF.setRawBranchCount(FuncRawBranchCount);
+ FuncRawSampleCount += YamlSI.Count;
+ BF.setRawSampleCount(FuncRawSampleCount);
if (BF.empty())
return true;
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index ee72d8f..44e3bf2 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -4,31 +4,12 @@
# RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
## Link against a DSO to ensure PLT entries.
# RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
-# RUN: link_fdata %s %t %t.pa1 PREAGG1
-# RUN: link_fdata %s %t %t.pa2 PREAGG2
-# RUN: link_fdata %s %t %t.pa3 PREAGG3
# RUN: link_fdata %s %t %t.pat PREAGGT1
# RUN: link_fdata %s %t %t.pat2 PREAGGT2
# RUN: link_fdata %s %t %t.patplt PREAGGPLT
-## Check normal case: fallthrough is not LP or secondary entry.
# RUN: llvm-strip --strip-unneeded %t -o %t.strip
# RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
-# RUN: llvm-bolt %t.strip --pa -p %t.pa1 -o %t.out \
-# RUN: --print-cfg --print-only=main | FileCheck %s
-
-## Check that getFallthroughsInTrace correctly handles a trace starting at plt
-## call continuation
-# RUN: llvm-bolt %t.strip --pa -p %t.pa2 -o %t.out2 \
-# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2
-
-## Check that we don't treat secondary entry points as call continuation sites.
-# RUN: llvm-bolt %t --pa -p %t.pa3 -o %t.out \
-# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
-
-## Check fallthrough to a landing pad case.
-# RUN: llvm-bolt %t.strip --pa -p %t.pa3 -o %t.out \
-# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
## Check pre-aggregated traces attach call continuation fallthrough count
# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
@@ -77,7 +58,6 @@
## Check PLT traces are accepted
# PREAGGPLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
## Target is an external-origin call continuation
-# PREAGG1: B X:0 #Ltmp1# 2 0
# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
# CHECK: callq puts@PLT
# CHECK-NEXT: count: 2
@@ -97,18 +77,15 @@
movl $0xa, -0x18(%rbp)
callq foo
## Target is a binary-local call continuation
-# PREAGG1: B #Lfoo_ret# #Ltmp3# 1 0
# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
# CHECK: callq foo
# CHECK-NEXT: count: 1
## PLT call continuation fallthrough spanning the call
-# PREAGG2: F #Ltmp1# #Ltmp3_br# 3
# CHECK2: callq foo
# CHECK2-NEXT: count: 3
## Target is a secondary entry point (unstripped) or a landing pad (stripped)
-# PREAGG3: B X:0 #Ltmp3# 2 0
# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
# CHECK3: callq foo
# CHECK3-NEXT: count: 0
diff --git a/bolt/test/perf2bolt/perf_test.test b/bolt/test/perf2bolt/perf_test.test
index 7bec442..44111de 100644
--- a/bolt/test/perf2bolt/perf_test.test
+++ b/bolt/test/perf2bolt/perf_test.test
@@ -8,6 +8,7 @@
CHECK-NOT: PERF2BOLT-ERROR
CHECK-NOT: !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection.
+CHECK: BOLT-INFO: Functions with density >= {{.*}} account for 99.00% total sample counts.
RUN: %clang %S/Inputs/perf_test.c -no-pie -fuse-ld=lld -o %t4
RUN: perf record -Fmax -e cycles:u -o %t5 -- %t4