[𝘀𝗽𝗿] changes to main this commit is based on

Created using spr 1.3.4

[skip ci]
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index a529985..e82b857 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -386,8 +386,8 @@
   /// Profile match ratio.
   float ProfileMatchRatio{0.0f};
 
-  /// Raw branch count for this function in the profile.
-  uint64_t RawBranchCount{0};
+  /// Raw sample/branch count for this function in the profile.
+  uint64_t RawSampleCount{0};
 
   /// Dynamically executed function bytes, used for density computation.
   uint64_t SampleCountInBytes{0};
@@ -1880,13 +1880,12 @@
   /// Return COUNT_NO_PROFILE if there's no profile info.
   uint64_t getExecutionCount() const { return ExecutionCount; }
 
-  /// Return the raw profile information about the number of branch
-  /// executions corresponding to this function.
-  uint64_t getRawBranchCount() const { return RawBranchCount; }
+  /// Return the raw profile information about the number of samples (basic
+  /// profile) or branch executions (branch profile) recorded in this function.
+  uint64_t getRawSampleCount() const { return RawSampleCount; }
 
-  /// Set the profile data about the number of branch executions corresponding
-  /// to this function.
-  void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; }
+  /// Set raw count of samples or branches recorded in this function.
+  void setRawSampleCount(uint64_t Count) { RawSampleCount = Count; }
 
   /// Return the number of dynamically executed bytes, from raw perf data.
   uint64_t getSampleCountInBytes() const { return SampleCountInBytes; }
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 79a9186..ff2ea35 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -197,10 +197,6 @@
 
   BoltAddressTranslation *BAT{nullptr};
 
-  /// Whether pre-aggregated profile needs to convert branch profile into call
-  /// to continuation fallthrough profile.
-  bool NeedsConvertRetProfileToCallCont{false};
-
   /// Update function execution profile with a recorded trace.
   /// A trace is region of code executed between two LBR entries supplied in
   /// execution order.
@@ -285,8 +281,8 @@
   /// everything
   bool hasData() const { return !ParsingBuf.empty(); }
 
-  /// Print heat map based on LBR samples.
-  std::error_code printLBRHeatMap();
+  /// Print heat map based on collected samples.
+  std::error_code printHeatMap();
 
   /// Parse a single perf sample containing a PID associated with a sequence of
   /// LBR entries. If the PID does not correspond to the binary we are looking
diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h
index 314dcc9..a7a0933 100644
--- a/bolt/include/bolt/Profile/DataReader.h
+++ b/bolt/include/bolt/Profile/DataReader.h
@@ -252,6 +252,9 @@
   /// Get the number of samples recorded in [Start, End)
   uint64_t getSamples(uint64_t Start, uint64_t End) const;
 
+  /// Returns the total number of samples recorded in this function.
+  uint64_t getSamples() const;
+
   /// Aggregation helper
   DenseMap<uint64_t, size_t> Index;
 
diff --git a/bolt/include/bolt/Profile/Heatmap.h b/bolt/include/bolt/Profile/Heatmap.h
index 74d7eed..fc1e2cd 100644
--- a/bolt/include/bolt/Profile/Heatmap.h
+++ b/bolt/include/bolt/Profile/Heatmap.h
@@ -57,9 +57,9 @@
   }
 
-  /// Register a single sample at \p Address.
-  void registerAddress(uint64_t Address) {
+  /// Register \p Count samples at \p Address.
+  void registerAddress(uint64_t Address, uint64_t Count) {
     if (!ignoreAddress(Address))
-      ++Map[Address / BucketSize];
+      Map[Address / BucketSize] += Count;
   }
 
   /// Register \p Count samples at [\p StartAddress, \p EndAddress ].
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 4624aba..fea5101 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -471,7 +471,7 @@
     OS << "\n  Image       : 0x" << Twine::utohexstr(getImageAddress());
   if (ExecutionCount != COUNT_NO_PROFILE) {
     OS << "\n  Exec Count  : " << ExecutionCount;
-    OS << "\n  Branch Count: " << RawBranchCount;
+    OS << "\n  Branch Count: " << RawSampleCount;
     OS << "\n  Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f);
   }
 
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d8628c6..420ffc8 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1445,7 +1445,7 @@
     if (!Function.hasProfile())
       continue;
 
-    uint64_t SampleCount = Function.getRawBranchCount();
+    uint64_t SampleCount = Function.getRawSampleCount();
     TotalSampleCount += SampleCount;
 
     if (Function.hasValidProfile()) {
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index a8a1879..da260e0 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -499,18 +499,15 @@
   filterBinaryMMapInfo();
   prepareToParse("events", MainEventsPPI, ErrorCallback);
 
+  if (opts::BasicAggregation ? parseBasicEvents() : parseBranchEvents())
+    errs() << "PERF2BOLT: failed to parse samples\n";
+
   if (opts::HeatmapMode) {
-    if (std::error_code EC = printLBRHeatMap()) {
-      errs() << "ERROR: failed to print heat map: " << EC.message() << '\n';
-      exit(1);
-    }
+    if (std::error_code EC = printHeatMap())
+      return errorCodeToError(EC);
     exit(0);
   }
 
-  if ((!opts::BasicAggregation && parseBranchEvents()) ||
-      (opts::BasicAggregation && parseBasicEvents()))
-    errs() << "PERF2BOLT: failed to parse samples\n";
-
   // Special handling for memory events
   if (prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback))
     return Error::success();
@@ -567,15 +564,14 @@
   processMemEvents();
 
   // Mark all functions with registered events as having a valid profile.
-  const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE
-                                            : BinaryFunction::PF_LBR;
   for (auto &BFI : BC.getBinaryFunctions()) {
     BinaryFunction &BF = BFI.second;
-    FuncBranchData *FBD = getBranchData(BF);
-    if (FBD || getFuncSampleData(BF.getNames())) {
-      BF.markProfiled(Flags);
-      if (FBD)
-        BF.RawBranchCount = FBD->getNumExecutedBranches();
+    if (FuncBranchData *FBD = getBranchData(BF)) {
+      BF.markProfiled(BinaryFunction::PF_LBR);
+      BF.RawSampleCount = FBD->getNumExecutedBranches();
+    } else if (FuncSampleData *FSD = getFuncSampleData(BF.getNames())) {
+      BF.markProfiled(BinaryFunction::PF_SAMPLE);
+      BF.RawSampleCount = FSD->getSamples();
     }
   }
 
@@ -632,10 +628,18 @@
 
 bool DataAggregator::doSample(BinaryFunction &OrigFunc, uint64_t Address,
                               uint64_t Count) {
+  // To record executed bytes, use basic block size as is regardless of BAT.
+  uint64_t BlockSize = 0;
+  if (BinaryBasicBlock *BB = OrigFunc.getBasicBlockContainingOffset(
+          Address - OrigFunc.getAddress()))
+    BlockSize = BB->getOriginalSize();
+
   BinaryFunction *ParentFunc = getBATParentFunction(OrigFunc);
   BinaryFunction &Func = ParentFunc ? *ParentFunc : OrigFunc;
-  if (ParentFunc || (BAT && !BAT->isBATFunction(OrigFunc.getAddress())))
+  if (ParentFunc || (BAT && !BAT->isBATFunction(Func.getAddress())))
     NumColdSamples += Count;
+  // Attach executed bytes to parent function in case of cold fragment.
+  Func.SampleCountInBytes += Count * BlockSize;
 
   auto I = NamesToSamples.find(Func.getOneName());
   if (I == NamesToSamples.end()) {
@@ -720,23 +724,6 @@
                : isReturn(Func.disassembleInstructionAtOffset(Offset));
   };
 
-  // Returns whether \p Offset in \p Func may be a call continuation excluding
-  // entry points and landing pads.
-  auto checkCallCont = [&](const BinaryFunction &Func, const uint64_t Offset) {
-    // No call continuation at a function start.
-    if (!Offset)
-      return false;
-
-    // FIXME: support BAT case where the function might be in empty state
-    // (split fragments declared non-simple).
-    if (!Func.hasCFG())
-      return false;
-
-    // The offset should not be an entry point or a landing pad.
-    const BinaryBasicBlock *ContBB = Func.getBasicBlockAtOffset(Offset);
-    return ContBB && !ContBB->isEntryPoint() && !ContBB->isLandingPad();
-  };
-
   // Mutates \p Addr to an offset into the containing function, performing BAT
   // offset translation and parent lookup.
   //
@@ -749,8 +736,7 @@
 
     Addr -= Func->getAddress();
 
-    bool IsRetOrCallCont =
-        IsFrom ? checkReturn(*Func, Addr) : checkCallCont(*Func, Addr);
+    bool IsRet = IsFrom && checkReturn(*Func, Addr);
 
     if (BAT)
       Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
@@ -761,24 +747,16 @@
       NumColdSamples += Count;
 
     if (!ParentFunc)
-      return std::pair{Func, IsRetOrCallCont};
+      return std::pair{Func, IsRet};
 
-    return std::pair{ParentFunc, IsRetOrCallCont};
+    return std::pair{ParentFunc, IsRet};
   };
 
-  uint64_t ToOrig = To;
   auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true);
-  auto [ToFunc, IsCallCont] = handleAddress(To, /*IsFrom*/ false);
+  auto [ToFunc, _] = handleAddress(To, /*IsFrom*/ false);
   if (!FromFunc && !ToFunc)
     return false;
 
-  // Record call to continuation trace.
-  if (NeedsConvertRetProfileToCallCont && FromFunc != ToFunc &&
-      (IsReturn || IsCallCont)) {
-    LBREntry First{ToOrig - 1, ToOrig - 1, false};
-    LBREntry Second{ToOrig, ToOrig, false};
-    return doTrace(First, Second, Count);
-  }
   // Ignore returns.
   if (IsReturn)
     return true;
@@ -1235,21 +1213,14 @@
   ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator);
   if (std::error_code EC = TypeOrErr.getError())
     return EC;
-  // Pre-aggregated profile with branches and fallthroughs needs to convert
-  // return profile into call to continuation fall-through.
-  auto Type = AggregatedLBREntry::BRANCH;
-  if (TypeOrErr.get() == "B") {
-    NeedsConvertRetProfileToCallCont = true;
+  auto Type = AggregatedLBREntry::TRACE;
+  if (LLVM_LIKELY(TypeOrErr.get() == "T")) {
+  } else if (TypeOrErr.get() == "B") {
     Type = AggregatedLBREntry::BRANCH;
   } else if (TypeOrErr.get() == "F") {
-    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::FT;
   } else if (TypeOrErr.get() == "f") {
-    NeedsConvertRetProfileToCallCont = true;
     Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
-  } else if (TypeOrErr.get() == "T") {
-    // Trace is expanded into B and [Ff]
-    Type = AggregatedLBREntry::TRACE;
   } else {
     reportError("expected T, B, F or f");
     return make_error_code(llvm::errc::io_error);
@@ -1323,7 +1294,7 @@
          (LBR.From >= KernelBaseAddr || LBR.To >= KernelBaseAddr);
 }
 
-std::error_code DataAggregator::printLBRHeatMap() {
+std::error_code DataAggregator::printHeatMap() {
   outs() << "PERF2BOLT: parse branch events...\n";
   NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
                      TimerGroupDesc, opts::TimeAggregator);
@@ -1334,53 +1305,6 @@
   }
   Heatmap HM(opts::HeatmapBlock, opts::HeatmapMinAddress,
              opts::HeatmapMaxAddress, getTextSections(BC));
-  uint64_t NumTotalSamples = 0;
-
-  if (opts::BasicAggregation) {
-    while (hasData()) {
-      ErrorOr<PerfBasicSample> SampleRes = parseBasicSample();
-      if (std::error_code EC = SampleRes.getError()) {
-        if (EC == errc::no_such_process)
-          continue;
-        return EC;
-      }
-      PerfBasicSample &Sample = SampleRes.get();
-      HM.registerAddress(Sample.PC);
-      NumTotalSamples++;
-    }
-    outs() << "HEATMAP: read " << NumTotalSamples << " basic samples\n";
-  } else {
-    while (hasData()) {
-      ErrorOr<PerfBranchSample> SampleRes = parseBranchSample();
-      if (std::error_code EC = SampleRes.getError()) {
-        if (EC == errc::no_such_process)
-          continue;
-        return EC;
-      }
-
-      PerfBranchSample &Sample = SampleRes.get();
-
-      // LBRs are stored in reverse execution order. NextLBR refers to the next
-      // executed branch record.
-      const LBREntry *NextLBR = nullptr;
-      for (const LBREntry &LBR : Sample.LBR) {
-        if (NextLBR) {
-          // Record fall-through trace.
-          const uint64_t TraceFrom = LBR.To;
-          const uint64_t TraceTo = NextLBR->From;
-          ++FallthroughLBRs[Trace(TraceFrom, TraceTo)].InternCount;
-        }
-        NextLBR = &LBR;
-      }
-      if (!Sample.LBR.empty()) {
-        HM.registerAddress(Sample.LBR.front().To);
-        HM.registerAddress(Sample.LBR.back().From);
-      }
-      NumTotalSamples += Sample.LBR.size();
-    }
-    outs() << "HEATMAP: read " << NumTotalSamples << " LBR samples\n";
-    outs() << "HEATMAP: " << FallthroughLBRs.size() << " unique traces\n";
-  }
 
   if (!NumTotalSamples) {
     if (opts::BasicAggregation) {
@@ -1396,10 +1320,14 @@
 
   outs() << "HEATMAP: building heat map...\n";
 
-  for (const auto &LBR : FallthroughLBRs) {
-    const Trace &Trace = LBR.first;
-    const FTInfo &Info = LBR.second;
-    HM.registerAddressRange(Trace.From, Trace.To, Info.InternCount);
+  if (opts::BasicAggregation) {
+    for (const auto &[PC, Hits] : BasicSamples)
+      HM.registerAddress(PC, Hits);
+  } else {
+    for (const auto &[Trace, Info] : FallthroughLBRs)
+      HM.registerAddressRange(Trace.From, Trace.To, Info.InternCount);
+    for (const auto &[Trace, Info] : BranchLBRs)
+      HM.registerAddress(Trace.From, Info.TakenCount);
   }
 
   if (HM.getNumInvalidRanges())
diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp
index f2e999b..4a92c9e 100644
--- a/bolt/lib/Profile/DataReader.cpp
+++ b/bolt/lib/Profile/DataReader.cpp
@@ -128,6 +128,13 @@
   return Result;
 }
 
+uint64_t FuncSampleData::getSamples() const {
+  uint64_t Result = 0;
+  for (const SampleInfo &I : Data)
+    Result += I.Hits;
+  return Result;
+}
+
 void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) {
   auto Iter = Index.find(Offset);
   if (Iter == Index.end()) {
@@ -407,12 +414,12 @@
   FuncBranchData *FBD = getBranchData(BF);
   if (FBD) {
     BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD);
-    BF.RawBranchCount = FBD->getNumExecutedBranches();
+    BF.RawSampleCount = FBD->getNumExecutedBranches();
     if (BF.ProfileMatchRatio == 1.0f) {
       if (fetchProfileForOtherEntryPoints(BF)) {
         BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD);
         BF.ExecutionCount = FBD->ExecutionCount;
-        BF.RawBranchCount = FBD->getNumExecutedBranches();
+        BF.RawSampleCount = FBD->getNumExecutedBranches();
       }
       return;
     }
diff --git a/bolt/lib/Profile/Heatmap.cpp b/bolt/lib/Profile/Heatmap.cpp
index 5fc3e06..74ec885 100644
--- a/bolt/lib/Profile/Heatmap.cpp
+++ b/bolt/lib/Profile/Heatmap.cpp
@@ -43,7 +43,9 @@
   }
 
+  // Both ends of the range are inclusive: the bucket containing EndAddress
+  // must be counted too, including when it is also the start bucket.
   for (uint64_t Bucket = StartAddress / BucketSize;
        Bucket <= EndAddress / BucketSize; ++Bucket)
     Map[Bucket] += Count;
 }
 
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index f5636bf..88b806c 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -177,11 +177,11 @@
 
   BF.setExecutionCount(YamlBF.ExecCount);
 
-  uint64_t FuncRawBranchCount = 0;
+  uint64_t FuncRawSampleCount = 0;
   for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks)
     for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors)
-      FuncRawBranchCount += YamlSI.Count;
-  BF.setRawBranchCount(FuncRawBranchCount);
+      FuncRawSampleCount += YamlSI.Count;
+  BF.setRawSampleCount(FuncRawSampleCount);
 
   if (BF.empty())
     return true;
diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s
index ee72d8f..44e3bf2 100644
--- a/bolt/test/X86/callcont-fallthru.s
+++ b/bolt/test/X86/callcont-fallthru.s
@@ -4,31 +4,12 @@
 # RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
 ## Link against a DSO to ensure PLT entries.
 # RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
-# RUN: link_fdata %s %t %t.pa1 PREAGG1
-# RUN: link_fdata %s %t %t.pa2 PREAGG2
-# RUN: link_fdata %s %t %t.pa3 PREAGG3
 # RUN: link_fdata %s %t %t.pat PREAGGT1
 # RUN: link_fdata %s %t %t.pat2 PREAGGT2
 # RUN: link_fdata %s %t %t.patplt PREAGGPLT
 
-## Check normal case: fallthrough is not LP or secondary entry.
 # RUN: llvm-strip --strip-unneeded %t -o %t.strip
 # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
-# RUN: llvm-bolt %t.strip --pa -p %t.pa1 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s
-
-## Check that getFallthroughsInTrace correctly handles a trace starting at plt
-## call continuation
-# RUN: llvm-bolt %t.strip --pa -p %t.pa2 -o %t.out2 \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2
-
-## Check that we don't treat secondary entry points as call continuation sites.
-# RUN: llvm-bolt %t --pa -p %t.pa3 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
-
-## Check fallthrough to a landing pad case.
-# RUN: llvm-bolt %t.strip --pa -p %t.pa3 -o %t.out \
-# RUN:   --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
 
 ## Check pre-aggregated traces attach call continuation fallthrough count
 # RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
@@ -77,7 +58,6 @@
 ## Check PLT traces are accepted
 # PREAGGPLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
 ## Target is an external-origin call continuation
-# PREAGG1: B X:0 #Ltmp1# 2 0
 # PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
 # CHECK:      callq puts@PLT
 # CHECK-NEXT: count: 2
@@ -97,18 +77,15 @@
 	movl	$0xa, -0x18(%rbp)
 	callq	foo
 ## Target is a binary-local call continuation
-# PREAGG1: B #Lfoo_ret# #Ltmp3# 1 0
 # PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
 # CHECK:      callq foo
 # CHECK-NEXT: count: 1
 
 ## PLT call continuation fallthrough spanning the call
-# PREAGG2: F #Ltmp1# #Ltmp3_br# 3
 # CHECK2:      callq foo
 # CHECK2-NEXT: count: 3
 
 ## Target is a secondary entry point (unstripped) or a landing pad (stripped)
-# PREAGG3: B X:0 #Ltmp3# 2 0
 # PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
 # CHECK3:      callq foo
 # CHECK3-NEXT: count: 0
diff --git a/bolt/test/perf2bolt/perf_test.test b/bolt/test/perf2bolt/perf_test.test
index 7bec442..44111de 100644
--- a/bolt/test/perf2bolt/perf_test.test
+++ b/bolt/test/perf2bolt/perf_test.test
@@ -8,6 +8,7 @@
 
 CHECK-NOT: PERF2BOLT-ERROR
 CHECK-NOT: !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection.
+CHECK: BOLT-INFO: Functions with density >= {{.*}} account for 99.00% total sample counts.
 
 RUN: %clang %S/Inputs/perf_test.c -no-pie -fuse-ld=lld -o %t4
 RUN: perf record -Fmax -e cycles:u -o %t5 -- %t4