[𝘀𝗽𝗿] changes to main this commit is based on

Created using spr 1.3.4

[skip ci]
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index b99cf8b..f902a8c 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -430,6 +430,17 @@
     return Analysis->isIndirectBranch(Inst);
   }
 
+  /// Returns true if the instruction unconditionally transfers control to
+  /// another program point, interrupting sequential code execution, e.g. via
+  /// a call, return, or unconditional jump. This explicitly leaves out
+  /// conditional branches, as they may not be taken, but does allow a
+  /// transfer to the next instruction (zero-displacement jump/call).
+  bool isUnconditionalControlTransfer(const MCInst &Inst) const {
+    const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+    // Barrier captures returns and unconditional branches.
+    return Desc.isBarrier() || Desc.isCall();
+  }
+
   /// Returns true if the instruction is memory indirect call or jump
   virtual bool isBranchOnMem(const MCInst &Inst) const {
     llvm_unreachable("not implemented");
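
The new predicate relies on two MCInstrDesc flags: isBarrier() is set for instructions after which execution never falls through sequentially (returns, unconditional jumps), while isCall() covers calls, which divert control but resume at the following instruction. A minimal self-contained sketch of the same classification, using a made-up opcode table rather than the real MCInstrInfo (all names hypothetical):

```cpp
#include <cassert>

// Hypothetical per-opcode flags mirroring MCInstrDesc::isBarrier() and
// MCInstrDesc::isCall(); the real flags come from the target's TableGen
// instruction descriptions.
struct OpcodeDesc {
  bool Barrier; // no sequential fall-through (e.g. ret, unconditional jmp)
  bool Call;    // transfers control but resumes at the next instruction
};

enum Opcode : unsigned { RET, JMP, JCC, CALL, ADD };

constexpr OpcodeDesc Descs[] = {
    /* RET  */ {true, false},
    /* JMP  */ {true, false},
    /* JCC  */ {false, false}, // conditional: may fall through, so excluded
    /* CALL */ {false, true},
    /* ADD  */ {false, false},
};

bool isUnconditionalControlTransfer(Opcode Op) {
  const OpcodeDesc &D = Descs[Op];
  return D.Barrier || D.Call;
}

int main() {
  assert(isUnconditionalControlTransfer(RET));
  assert(isUnconditionalControlTransfer(JMP));
  assert(isUnconditionalControlTransfer(CALL));
  assert(!isUnconditionalControlTransfer(JCC)); // may not be taken
  assert(!isUnconditionalControlTransfer(ADD)); // ordinary sequential insn
  return 0;
}
```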
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index c7400b0..db0f690 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -137,7 +137,7 @@
   std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
   /// Pre-populated addresses of returns, coming from pre-aggregated data or
   /// disassembly. Used to disambiguate call-continuation fall-throughs.
-  std::unordered_set<uint64_t> Returns;
+  std::unordered_map<uint64_t, bool> Returns;
   std::unordered_map<uint64_t, uint64_t> BasicSamples;
   std::vector<PerfMemSample> MemSamples;
 
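Rationale for changing Returns from a set to a map: with std::unordered_set, an address that was checked and found not to be a return is indistinguishable from one that was never checked, so every negative query would repeat the function lookup or disassembly. The bool value memoizes both outcomes. A small illustration of the distinction (addresses are made up):

```cpp
#include <cstdint>
#include <unordered_map>
#include <unordered_set>

int main() {
  // Set: can only record "known return" addresses.
  std::unordered_set<uint64_t> Set{0x1000};
  // Map: records both outcomes, so negative answers are cached too.
  std::unordered_map<uint64_t, bool> Map{{0x1000, true}, {0x2000, false}};

  // With the set, 0x2000 (checked, not a return) is indistinguishable from
  // 0x3000 (never checked), forcing re-disassembly on every negative query.
  bool Ambiguous = Set.count(0x2000) == 0 && Set.count(0x3000) == 0;

  // With the map, "checked, not a return" is a cache hit.
  auto It = Map.find(0x2000);
  bool KnownNonReturn = It != Map.end() && !It->second;

  return Ambiguous && KnownNonReturn ? 0 : 1;
}
```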
@@ -498,6 +498,10 @@
   /// If \p FileBuildID has no match, then issue an error and exit.
   void processFileBuildID(StringRef FileBuildID);
 
+  /// Infer missing fall-throughs for branch-only traces (LBR top-of-stack
+  /// entries).
+  void imputeFallThroughs();
+
   /// Debugging dump methods
   void dump() const;
   void dump(const PerfBranchSample &Sample) const;
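Background on why such traces exist: a trace's fall-through runs from the target of one taken branch to the source of the next recorded branch. The most recent (top-of-stack) LBR entry has no successor entry to close that range, so its trace is recorded as branch-only, and imputeFallThroughs() fills in a plausible end address for it.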
@@ -509,6 +513,43 @@
   void printBasicSamplesDiagnostics(uint64_t OutOfRangeSamples) const;
   void printBranchStacksDiagnostics(uint64_t IgnoredSamples) const;
 
+  /// Get the instruction at \p Addr, either from the containing binary
+  /// function or by disassembling it in place, and invoke \p Callback on the
+  /// resulting MCInst. Returns the callback's result, or std::nullopt if no
+  /// instruction could be obtained.
+  template <typename T>
+  std::optional<T>
+  testInstructionAt(const uint64_t Addr,
+                    std::function<T(const MCInst &)> Callback) const {
+    BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
+    if (!Func)
+      return std::nullopt;
+    const uint64_t Offset = Addr - Func->getAddress();
+    if (Func->hasInstructions()) {
+      if (auto *MI = Func->getInstructionAtOffset(Offset))
+        return Callback(*MI);
+    } else {
+      if (auto MI = Func->disassembleInstructionAtOffset(Offset))
+        return Callback(*MI);
+    }
+    return std::nullopt;
+  }
+
+  /// Apply \p Callback to the instruction at \p Addr, memoizing the result
+  /// in \p Map.
+  template <typename T>
+  std::optional<T> testAndSet(const uint64_t Addr,
+                              std::function<T(const MCInst &)> Callback,
+                              std::unordered_map<uint64_t, T> &Map) {
+    auto It = Map.find(Addr);
+    if (It != Map.end())
+      return It->second;
+    if (std::optional<T> Res = testInstructionAt<T>(Addr, Callback)) {
+      Map.emplace(Addr, *Res);
+      return *Res;
+    }
+    return std::nullopt;
+  }
+
 public:
   /// If perf.data was collected without build ids, the buildid-list may contain
   /// incomplete entries. Return true if the buffer containing
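
testAndSet is a memoizing wrapper around testInstructionAt: consult the cache first, otherwise evaluate the callback on the instruction and record the answer. The pattern in isolation, with a stand-in for BOLT's function lookup and disassembly (all names hypothetical):

```cpp
#include <cstdint>
#include <functional>
#include <optional>
#include <unordered_map>

// Stand-in for testInstructionAt: produce a value for "known" addresses,
// std::nullopt when no containing function is found (hypothetical model).
template <typename T>
std::optional<T> lookupInstruction(uint64_t Addr,
                                   std::function<T(uint64_t)> Callback) {
  if (Addr == 0) // model an address outside any known function
    return std::nullopt;
  return Callback(Addr);
}

// Mirrors testAndSet: consult the cache first, else compute and memoize.
template <typename T>
std::optional<T> testAndSet(uint64_t Addr, std::function<T(uint64_t)> Callback,
                            std::unordered_map<uint64_t, T> &Map) {
  auto It = Map.find(Addr);
  if (It != Map.end())
    return It->second;
  if (std::optional<T> Res = lookupInstruction<T>(Addr, Callback)) {
    Map.emplace(Addr, *Res);
    return *Res;
  }
  return std::nullopt;
}

int main() {
  std::unordered_map<uint64_t, bool> Cache;
  auto IsOdd = [](uint64_t Addr) { return (Addr & 1) != 0; };
  testAndSet<bool>(0x1001, IsOdd, Cache); // computed, then cached
  testAndSet<bool>(0x1001, IsOdd, Cache); // served from Cache
  testAndSet<bool>(0, IsOdd, Cache);      // nullopt: nothing cached
  return Cache.size() == 1 ? 0 : 1;
}
```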
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index c60591c..905728de 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -77,6 +77,11 @@
   cl::Optional,
   cl::cat(AggregatorCategory));
 
+static cl::opt<bool> ImputeTraceFallthrough(
+    "impute-trace-fall-through",
+    cl::desc("impute missing fall-throughs for branch-only traces"),
+    cl::Optional, cl::cat(AggregatorCategory));
+
 static cl::opt<bool>
 IgnoreBuildID("ignore-build-id",
   cl::desc("continue even if build-ids in input binary and perf.data mismatch"),
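Usage note: the flag is off by default and is passed to the aggregator like any other option, e.g. `perf2bolt -p perf.data -o out.fdata --impute-trace-fall-through <binary>` (invocation shown for illustration; only the flag itself is introduced by this change).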
@@ -513,6 +518,69 @@
   deleteTempFiles();
 }
 
+void DataAggregator::imputeFallThroughs() {
+  if (Traces.empty())
+    return;
+
+  std::pair PrevBranch(Trace::EXTERNAL, Trace::EXTERNAL);
+  uint64_t AggregateCount = 0;
+  uint64_t AggregateFallthroughSize = 0;
+  uint64_t InferredTraces = 0;
+
+  // Helper map caching whether the instruction at a given address is a
+  // call/return/unconditional branch.
+  std::unordered_map<uint64_t, bool> IsUncondCTMap;
+  auto checkUnconditionalControlTransfer = [&](const uint64_t Addr) {
+    auto isUncondCT = [&](const MCInst &MI) -> bool {
+      return BC->MIB->isUnconditionalControlTransfer(MI);
+    };
+    return testAndSet<bool>(Addr, isUncondCT, IsUncondCTMap).value_or(true);
+  };
+
+  // Traces are sorted by their component addresses (Branch, From, To) by the
+  // llvm::sort call in preprocessProfile.
+
+  // Traces corresponding to the top-of-stack branch entry with a missing
+  // fall-through have BR_ONLY (-1ULL/UINT64_MAX) in the To field, meaning
+  // that for fixed values of Branch and From, branch-only traces are stored
+  // after all traces with a valid fall-through.
+  //
+  // Group traces by (Branch, From) and compute weighted average fall-through
+  // length for the top-of-stack trace (closing the group) by accumulating the
+  // fall-through lengths of traces with valid fall-throughs earlier in the
+  // group.
+  for (auto &[Trace, Info] : Traces) {
+    // Skip fall-throughs in external code.
+    if (Trace.From == Trace::EXTERNAL)
+      continue;
+    std::pair CurrentBranch(Trace.Branch, Trace.From);
+    // BR_ONLY must be the last trace in the group.
+    if (Trace.To == Trace::BR_ONLY) {
+      // If the group is not empty, use the aggregate values; otherwise use a
+      // 0-byte length for unconditional control transfers (call/ret/uncond
+      // branch) or a 1-byte length for everything else.
+      uint64_t InferredBytes =
+          PrevBranch == CurrentBranch
+              ? AggregateFallthroughSize / AggregateCount
+              : !checkUnconditionalControlTransfer(Trace.From);
+      Trace.To = Trace.From + InferredBytes;
+      LLVM_DEBUG(dbgs() << "imputed " << Trace << " (" << InferredBytes
+                        << " bytes)\n");
+      ++InferredTraces;
+    } else {
+      // Trace with a valid fall-through.
+      // New group: reset aggregates.
+      if (CurrentBranch != PrevBranch)
+        AggregateCount = AggregateFallthroughSize = 0;
+      // Only accumulate sizes of valid (non-external) fall-throughs.
+      if (Trace.To != Trace::EXTERNAL)
+        AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount;
+      AggregateCount += Info.TakenCount;
+    }
+    PrevBranch = CurrentBranch;
+  }
+  if (opts::Verbosity >= 1)
+    outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n";
+}
+
 Error DataAggregator::preprocessProfile(BinaryContext &BC) {
   this->BC = &BC;
 
@@ -525,6 +593,9 @@
   // Sort parsed traces for faster processing.
   llvm::sort(Traces, llvm::less_first());
 
+  if (opts::ImputeTraceFallthrough)
+    imputeFallThroughs();
+
   if (opts::HeatmapMode) {
     if (std::error_code EC = printLBRHeatMap())
       return errorCodeToError(EC);
@@ -726,22 +797,10 @@
 }
 
 bool DataAggregator::checkReturn(uint64_t Addr) {
-  auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
-  if (llvm::is_contained(Returns, Addr))
-    return true;
-
-  BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
-  if (!Func)
-    return false;
-
-  const uint64_t Offset = Addr - Func->getAddress();
-  if (Func->hasInstructions()
-          ? isReturn(Func->getInstructionAtOffset(Offset))
-          : isReturn(Func->disassembleInstructionAtOffset(Offset))) {
-    Returns.emplace(Addr);
-    return true;
-  }
-  return false;
+  auto isReturn = [&](const MCInst &MI) -> bool {
+    return BC->MIB->isReturn(MI);
+  };
+  return testAndSet<bool>(Addr, isReturn, Returns).value_or(false);
 }
 
 bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
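With the helpers in place, checkReturn collapses to a single testAndSet call. Behavior is preserved (value_or(false) still treats addresses outside any known function as non-returns), but unlike the old set-based code, a negative answer is now cached in Returns instead of being recomputed on every query.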
@@ -1331,7 +1390,7 @@
     if (!Addr[0]->Offset)
       Addr[0]->Offset = Trace::FT_EXTERNAL_RETURN;
     else
-      Returns.emplace(Addr[0]->Offset);
+      Returns.emplace(Addr[0]->Offset, true);
   }
 
   /// Record a trace.
@@ -1592,7 +1651,7 @@
   NamedRegionTimer T("processBranch", "Processing branch events",
                      TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
 
-  Returns.emplace(Trace::FT_EXTERNAL_RETURN);
+  Returns.emplace(Trace::FT_EXTERNAL_RETURN, true);
   for (const auto &[Trace, Info] : Traces) {
     bool IsReturn = checkReturn(Trace.Branch);
     // Ignore returns.