[BOLT] Write and parse BF/BB hashes in BAT

This increases BAT section size to:
- large binary: 34832976 bytes (0.90x original),
- medium binary: 3586800 bytes (0.60x original),
- small binary: 816 bytes (0.57x original).

Test Plan: Updated bolt/test/X86/bolt-address-translation.test

Reviewers: rafaelauler, dcci, ayermolo, maksfb

Reviewed By: rafaelauler

Pull Request: https://github.com/llvm/llvm-project/pull/76907
diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index d1cab98..060fc63 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -79,6 +79,7 @@
 | ------ | ------| ----------- |
 | `Address` | Continuous, Delta, ULEB128 | Function address in the output binary |
 | `HotIndex` | Delta, ULEB128 | Cold functions only: index of corresponding hot function in hot functions table |
+| `FuncHash` | 8b | Hot functions only: function hash for input function |
 | `NumEntries` | ULEB128 | Number of address translation entries for a function |
 | `EqualElems` | ULEB128 | Hot functions only: number of equal offsets in the beginning of a function |
 | `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | Hot functions only: if `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit |
@@ -94,6 +95,7 @@
 | ------ | ------| ----------- |
 | `OutputOffset` | Continuous, Delta, ULEB128 | Function offset in output binary |
 | `InputOffset` | Optional, Delta, SLEB128 | Function offset in input binary with `BRANCHENTRY` LSB bit |
+| `BBHash` | Optional, 8b | Basic block entries only: basic block hash in input binary |
 
 `BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
 (branch or call instruction). If not set, it signifies a control flow target
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index 844f0c5..5f2f095 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -115,6 +115,13 @@
   /// Save function and basic block hashes used for metadata dump.
   void saveMetadata(BinaryContext &BC);
 
+  /// Returns BB hash by function output address (after BOLT) and basic block
+  /// input offset.
+  size_t getBBHash(uint64_t FuncOutputAddress, uint32_t BBInputOffset) const;
+
+  /// Returns BF hash by function output address (after BOLT).
+  size_t getBFHash(uint64_t OutputAddress) const;
+
 private:
   /// Helper to update \p Map by inserting one or more BAT entries reflecting
   /// \p BB for function located at \p FuncAddress. At least one entry will be
@@ -150,6 +157,9 @@
   /// Links outlined cold bocks to their original function
   std::map<uint64_t, uint64_t> ColdPartSource;
 
+  /// Links output address of a main fragment back to input address.
+  std::unordered_map<uint64_t, uint64_t> ReverseMap;
+
   /// Identifies the address of a control-flow changing instructions in a
   /// translation map entry
   const static uint32_t BRANCHENTRY = 0x1;
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 5477d3b..e279852 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -23,6 +23,9 @@
 void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
                                                const BinaryBasicBlock &BB,
                                                uint64_t FuncAddress) {
+  uint64_t HotFuncAddress = ColdPartSource.count(FuncAddress)
+                                ? ColdPartSource[FuncAddress]
+                                : FuncAddress;
   const uint64_t BBOutputOffset =
       BB.getOutputAddressRange().first - FuncAddress;
   const uint32_t BBInputOffset = BB.getInputOffset();
@@ -39,6 +42,8 @@
   LLVM_DEBUG(dbgs() << "BB " << BB.getName() << "\n");
   LLVM_DEBUG(dbgs() << "  Key: " << Twine::utohexstr(BBOutputOffset)
                     << " Val: " << Twine::utohexstr(BBInputOffset) << "\n");
+  LLVM_DEBUG(dbgs() << formatv(" Hash: {0:x}\n",
+                               getBBHash(HotFuncAddress, BBInputOffset)));
   // In case of conflicts (same Key mapping to different Vals), the last
   // update takes precedence. Of course it is not ideal to have conflicts and
   // those happen when we have an empty BB that either contained only
@@ -72,20 +77,28 @@
   LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");
   for (auto &BFI : BC.getBinaryFunctions()) {
     const BinaryFunction &Function = BFI.second;
+    const uint64_t InputAddress = Function.getAddress();
+    const uint64_t OutputAddress = Function.getOutputAddress();
     // We don't need a translation table if the body of the function hasn't
     // changed
     if (Function.isIgnored() || (!BC.HasRelocations && !Function.isSimple()))
       continue;
 
+    // TBD: handle BAT functions w/multiple entry points.
+    if (Function.isMultiEntry())
+      continue;
+
     LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
     LLVM_DEBUG(dbgs() << " Address reference: 0x"
                       << Twine::utohexstr(Function.getOutputAddress()) << "\n");
+    LLVM_DEBUG(dbgs() << formatv(" Hash: {0:x}\n", getBFHash(OutputAddress)));
 
     MapTy Map;
     for (const BinaryBasicBlock *const BB :
          Function.getLayout().getMainFragment())
       writeEntriesForBB(Map, *BB, Function.getOutputAddress());
     Maps.emplace(Function.getOutputAddress(), std::move(Map));
+    ReverseMap.emplace(OutputAddress, InputAddress);
 
     if (!Function.isSplit())
       continue;
@@ -94,12 +107,12 @@
     LLVM_DEBUG(dbgs() << " Cold part\n");
     for (const FunctionFragment &FF :
          Function.getLayout().getSplitFragments()) {
+      ColdPartSource.emplace(FF.getAddress(), Function.getOutputAddress());
       Map.clear();
       for (const BinaryBasicBlock *const BB : FF)
         writeEntriesForBB(Map, *BB, FF.getAddress());
 
       Maps.emplace(FF.getAddress(), std::move(Map));
-      ColdPartSource.emplace(FF.getAddress(), Function.getOutputAddress());
     }
   }
 
@@ -109,6 +122,11 @@
   writeMaps</*Cold=*/true>(Maps, PrevAddress, OS);
 
   BC.outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
+  const uint64_t NumBBHashes = std::accumulate(
+      FuncHashes.begin(), FuncHashes.end(), 0ull,
+      [](size_t Acc, const auto &B) { return Acc + B.second.second.size(); });
+  BC.outs() << "BOLT-INFO: Wrote " << FuncHashes.size() << " function and "
+            << NumBBHashes << " basic block hashes\n";
 }
 
 APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map,
@@ -155,6 +173,11 @@
     // Only process cold fragments in cold mode, and vice versa.
     if (Cold != ColdPartSource.count(Address))
       continue;
+    // NB: here we use the input address because hashes are saved early (in
+    // `saveMetadata`) before output addresses are assigned.
+    const uint64_t HotInputAddress =
+        ReverseMap[Cold ? ColdPartSource[Address] : Address];
+    std::pair<size_t, BBHashMap> &FuncHashPair = FuncHashes[HotInputAddress];
     MapTy &Map = MapEntry.second;
     const uint32_t NumEntries = Map.size();
     LLVM_DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
@@ -166,6 +189,10 @@
           std::distance(ColdPartSource.begin(), ColdPartSource.find(Address));
       encodeULEB128(HotIndex - PrevIndex, OS);
       PrevIndex = HotIndex;
+    } else {
+      // Function hash
+      LLVM_DEBUG(dbgs() << "Hash: " << formatv("{0:x}\n", FuncHashPair.first));
+      OS.write(reinterpret_cast<char *>(&FuncHashPair.first), 8);
     }
     encodeULEB128(NumEntries, OS);
     // For hot fragments only: encode the number of equal offsets
@@ -197,6 +224,13 @@
       if (Index++ >= EqualElems)
         encodeSLEB128(KeyVal.second - InOffset, OS);
       InOffset = KeyVal.second; // Keeping InOffset as if BRANCHENTRY is encoded
+      if ((InOffset & BRANCHENTRY) == 0) {
+        // Basic block hash
+        size_t BBHash = FuncHashPair.second[InOffset >> 1];
+        OS.write(reinterpret_cast<char *>(&BBHash), 8);
+        LLVM_DEBUG(dbgs() << formatv("{0:x} -> {1:x} {2:x}\n", KeyVal.first,
+                                     InOffset >> 1, BBHash));
+      }
     }
   }
 }
@@ -239,12 +273,18 @@
   size_t HotIndex = 0;
   for (uint32_t I = 0; I < NumFunctions; ++I) {
     const uint64_t Address = PrevAddress + DE.getULEB128(&Offset, &Err);
+    uint64_t HotAddress = Cold ? 0 : Address;
     PrevAddress = Address;
     if (Cold) {
       HotIndex += DE.getULEB128(&Offset, &Err);
-      ColdPartSource.emplace(Address, HotFuncs[HotIndex]);
+      HotAddress = HotFuncs[HotIndex];
+      ColdPartSource.emplace(Address, HotAddress);
     } else {
       HotFuncs.push_back(Address);
+      // Function hash
+      const size_t FuncHash = DE.getU64(&Offset, &Err);
+      FuncHashes[Address].first = FuncHash;
+      LLVM_DEBUG(dbgs() << formatv("{0:x}: hash {1:x}\n", Address, FuncHash));
     }
     const uint32_t NumEntries = DE.getULEB128(&Offset, &Err);
     // Equal offsets, hot fragments only.
@@ -288,12 +328,22 @@
         InputOffset += InputDelta;
       }
       Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset, InputOffset));
-      LLVM_DEBUG(
-          dbgs() << formatv("{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b), {6:x}\n",
-                            OutputOffset, InputOffset, OutputDelta,
-                            getULEB128Size(OutputDelta), InputDelta,
-                            (J < EqualElems) ? 0 : getSLEB128Size(InputDelta),
-                            OutputAddress));
+      size_t BBHash = 0;
+      const bool IsBranchEntry = InputOffset & BRANCHENTRY;
+      if (!IsBranchEntry) {
+        BBHash = DE.getU64(&Offset, &Err);
+        // Map basic block hash to hot fragment by input offset
+        FuncHashes[HotAddress].second.emplace(InputOffset >> 1, BBHash);
+      }
+      LLVM_DEBUG({
+        dbgs() << formatv(
+            "{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b), {6:x}", OutputOffset,
+            InputOffset, OutputDelta, getULEB128Size(OutputDelta), InputDelta,
+            (J < EqualElems) ? 0 : getSLEB128Size(InputDelta), OutputAddress);
+        if (BBHash)
+          dbgs() << formatv(" {0:x}", BBHash);
+        dbgs() << '\n';
+      });
     }
     Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
   }
@@ -303,7 +353,12 @@
   const size_t NumTables = Maps.size();
   OS << "BAT tables for " << NumTables << " functions:\n";
   for (const auto &MapEntry : Maps) {
-    OS << "Function Address: 0x" << Twine::utohexstr(MapEntry.first) << "\n";
+    const uint64_t Address = MapEntry.first;
+    const uint64_t HotAddress = fetchParentAddress(Address);
+    OS << "Function Address: 0x" << Twine::utohexstr(Address);
+    if (HotAddress == 0)
+      OS << formatv(", hash: {0:x}", getBFHash(Address));
+    OS << "\n";
     OS << "BB mappings:\n";
     for (const auto &Entry : MapEntry.second) {
       const bool IsBranch = Entry.second & BRANCHENTRY;
@@ -312,6 +367,9 @@
          << "0x" << Twine::utohexstr(Val);
       if (IsBranch)
         OS << " (branch)";
+      else
+        OS << formatv(" hash: {0:x}",
+                      getBBHash(HotAddress ? HotAddress : Address, Val));
       OS << "\n";
     }
     OS << "\n";
@@ -439,5 +497,15 @@
                                                  BB.getHash());
   }
 }
+
+size_t BoltAddressTranslation::getBBHash(uint64_t FuncOutputAddress,
+                                         uint32_t BBInputOffset) const {
+  return FuncHashes.at(FuncOutputAddress).second.at(BBInputOffset);
+}
+
+size_t BoltAddressTranslation::getBFHash(uint64_t OutputAddress) const {
+  return FuncHashes.at(OutputAddress).first;
+}
+
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/test/X86/bolt-address-translation.test b/bolt/test/X86/bolt-address-translation.test
index f2020af..4277b4e 100644
--- a/bolt/test/X86/bolt-address-translation.test
+++ b/bolt/test/X86/bolt-address-translation.test
@@ -36,7 +36,8 @@
 #
 # CHECK:      BOLT: 3 out of 7 functions were overwritten.
 # CHECK:      BOLT-INFO: Wrote 6 BAT maps
-# CHECK:      BOLT-INFO: BAT section size (bytes): 336
+# CHECK:      BOLT-INFO: Wrote 3 function and 58 basic block hashes
+# CHECK:      BOLT-INFO: BAT section size (bytes): 816
 #
 # usqrt mappings (hot part). We match against any key (left side containing
 # the bolted binary offsets) because BOLT may change where it puts instructions
@@ -44,13 +45,13 @@
 # binary offsets (right side) should be the same because these addresses are
 # hardcoded in the blarge.yaml file.
 #
-# CHECK-BAT-DUMP:      Function Address: 0x401170
+# CHECK-BAT-DUMP:      Function Address: 0x401170, hash: 0xace6cbc638b31983
 # CHECK-BAT-DUMP-NEXT: BB mappings:
-# CHECK-BAT-DUMP-NEXT: 0x0 -> 0x0
+# CHECK-BAT-DUMP-NEXT: 0x0 -> 0x0 hash: 0x36007ba1d80c0000
 # CHECK-BAT-DUMP-NEXT: 0x8 -> 0x8 (branch)
-# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x39
+# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x39 hash: 0x5c06705524800039
 # CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x3d (branch)
-# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x10
+# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x10 hash: 0xd70d7a64320e0010
 # CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x30 (branch)
 #
 # CHECK-BAT-DUMP: 3 cold mappings