[BOLT] Add BB index to BAT

Add input basic block index to BAT metadata. This addresses the case
where some basic blocks are eliminated, and output index is not equal
to the input block index. These indices are used in non-stale-matching
mode.

Increases BAT section size to:
- large binary: 39521512 bytes (1.02x original),
- medium binary: 3799988 bytes (0.64x),
- small binary: 920 bytes (0.64x).

Test Plan:
Updated bolt-address-translation{,-yaml}.test

Pull Request: https://github.com/llvm/llvm-project/pull/86044
diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index 186b0e5..4365934 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -90,11 +90,12 @@
 ### Address translation table
 Delta encoding means that only the difference with the previous corresponding
 entry is encoded. Input offsets implicitly start at zero.
-| Entry  | Encoding | Description |
-| ------ | ------| ----------- |
-| `OutputOffset` | Continuous, Delta, ULEB128 | Function offset in output binary |
-| `InputOffset` | Optional, Delta, SLEB128 | Function offset in input binary with `BRANCHENTRY` LSB bit |
-| `BBHash` | Optional, 8b | Basic block entries only: basic block hash in input binary |
+| Entry  | Encoding | Description | Branch/BB |
+| ------ | ------| ----------- | ------ |
+| `OutputOffset` | Continuous, Delta, ULEB128 | Function offset in output binary | Both |
+| `InputOffset` | Optional, Delta, SLEB128 | Function offset in input binary with `BRANCHENTRY` LSB bit | Both |
+| `BBHash` | Optional, 8b | Basic block hash in input binary | BB |
+| `BBIdx`  | Optional, Delta, ULEB128 | Basic block index in input binary | BB |
 
 `BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
 (branch or call instruction). If not set, it signifies a control flow target
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index 1f53f6d..eda2b31 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -122,6 +122,10 @@
   /// Returns BF hash by function output address (after BOLT).
   size_t getBFHash(uint64_t OutputAddress) const;
 
+  /// Returns BB index by function output address (after BOLT) and basic block
+  /// input offset.
+  unsigned getBBIndex(uint64_t FuncOutputAddress, uint32_t BBInputOffset) const;
+
   /// True if a given \p Address is a function with translation table entry.
   bool isBATFunction(uint64_t Address) const { return Maps.count(Address); }
 
@@ -154,7 +158,8 @@
 
   std::map<uint64_t, MapTy> Maps;
 
-  using BBHashMap = std::unordered_map<uint32_t, size_t>;
+  /// Map basic block input offset to a basic block index and hash pair.
+  using BBHashMap = std::unordered_map<uint32_t, std::pair<unsigned, size_t>>;
   std::unordered_map<uint64_t, std::pair<size_t, BBHashMap>> FuncHashes;
 
   /// Links outlined cold bocks to their original function
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 1d61a1b..8fe976c 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -45,6 +45,8 @@
   LLVM_DEBUG(dbgs() << formatv(" Hash: {0:x}\n",
                                getBBHash(HotFuncAddress, BBInputOffset)));
   (void)HotFuncAddress;
+  LLVM_DEBUG(dbgs() << formatv(" Index: {0}\n",
+                               getBBIndex(HotFuncAddress, BBInputOffset)));
   // In case of conflicts (same Key mapping to different Vals), the last
   // update takes precedence. Of course it is not ideal to have conflicts and
   // those happen when we have an empty BB that either contained only
@@ -217,6 +219,7 @@
     }
     size_t Index = 0;
     uint64_t InOffset = 0;
+    size_t PrevBBIndex = 0;
     // Output and Input addresses and delta-encoded
     for (std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
       const uint64_t OutputAddress = KeyVal.first + Address;
@@ -226,11 +229,15 @@
         encodeSLEB128(KeyVal.second - InOffset, OS);
       InOffset = KeyVal.second; // Keeping InOffset as if BRANCHENTRY is encoded
       if ((InOffset & BRANCHENTRY) == 0) {
-        // Basic block hash
-        size_t BBHash = FuncHashPair.second[InOffset >> 1];
+        unsigned BBIndex;
+        size_t BBHash;
+        std::tie(BBIndex, BBHash) = FuncHashPair.second[InOffset >> 1];
         OS.write(reinterpret_cast<char *>(&BBHash), 8);
-        LLVM_DEBUG(dbgs() << formatv("{0:x} -> {1:x} {2:x}\n", KeyVal.first,
-                                     InOffset >> 1, BBHash));
+        // Basic block index in the input binary
+        encodeULEB128(BBIndex - PrevBBIndex, OS);
+        PrevBBIndex = BBIndex;
+        LLVM_DEBUG(dbgs() << formatv("{0:x} -> {1:x} {2:x} {3}\n", KeyVal.first,
+                                     InOffset >> 1, BBHash, BBIndex));
       }
     }
   }
@@ -316,6 +323,7 @@
     LLVM_DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
                       << Twine::utohexstr(Address) << "\n");
     uint64_t InputOffset = 0;
+    size_t BBIndex = 0;
     for (uint32_t J = 0; J < NumEntries; ++J) {
       const uint64_t OutputDelta = DE.getULEB128(&Offset, &Err);
       const uint64_t OutputAddress = PrevAddress + OutputDelta;
@@ -330,19 +338,25 @@
       }
       Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset, InputOffset));
       size_t BBHash = 0;
+      size_t BBIndexDelta = 0;
       const bool IsBranchEntry = InputOffset & BRANCHENTRY;
       if (!IsBranchEntry) {
         BBHash = DE.getU64(&Offset, &Err);
+        BBIndexDelta = DE.getULEB128(&Offset, &Err);
+        BBIndex += BBIndexDelta;
         // Map basic block hash to hot fragment by input offset
-        FuncHashes[HotAddress].second.emplace(InputOffset >> 1, BBHash);
+        FuncHashes[HotAddress].second.emplace(InputOffset >> 1,
+                                              std::pair(BBIndex, BBHash));
       }
       LLVM_DEBUG({
         dbgs() << formatv(
             "{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b), {6:x}", OutputOffset,
             InputOffset, OutputDelta, getULEB128Size(OutputDelta), InputDelta,
             (J < EqualElems) ? 0 : getSLEB128Size(InputDelta), OutputAddress);
-        if (BBHash)
-          dbgs() << formatv(" {0:x}", BBHash);
+        if (!IsBranchEntry) {
+          dbgs() << formatv(" {0:x} {1}/{2}b", BBHash, BBIndex,
+                            getULEB128Size(BBIndexDelta));
+        }
         dbgs() << '\n';
       });
     }
@@ -494,14 +508,19 @@
     FuncHashes[BF.getAddress()].first = BF.computeHash();
     BF.computeBlockHashes();
     for (const BinaryBasicBlock &BB : BF)
-      FuncHashes[BF.getAddress()].second.emplace(BB.getInputOffset(),
-                                                 BB.getHash());
+      FuncHashes[BF.getAddress()].second.emplace(
+          BB.getInputOffset(), std::pair(BB.getIndex(), BB.getHash()));
   }
 }
 
+unsigned BoltAddressTranslation::getBBIndex(uint64_t FuncOutputAddress,
+                                            uint32_t BBInputOffset) const {
+  return FuncHashes.at(FuncOutputAddress).second.at(BBInputOffset).first;
+}
+
 size_t BoltAddressTranslation::getBBHash(uint64_t FuncOutputAddress,
                                          uint32_t BBInputOffset) const {
-  return FuncHashes.at(FuncOutputAddress).second.at(BBInputOffset);
+  return FuncHashes.at(FuncOutputAddress).second.at(BBInputOffset).second;
 }
 
 size_t BoltAddressTranslation::getBFHash(uint64_t OutputAddress) const {
diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test
index 25ff4e7..4516a662 100644
--- a/bolt/test/X86/bolt-address-translation-yaml.test
+++ b/bolt/test/X86/bolt-address-translation-yaml.test
@@ -18,7 +18,7 @@
 
 WRITE-BAT-CHECK: BOLT-INFO: Wrote 5 BAT maps
 WRITE-BAT-CHECK: BOLT-INFO: Wrote 4 function and 22 basic block hashes
-WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 344
+WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 376
 
 READ-BAT-CHECK-NOT: BOLT-ERROR: unable to save profile in YAML format for input file processed by BOLT
 READ-BAT-CHECK: BOLT-INFO: Parsed 5 BAT entries
diff --git a/bolt/test/X86/bolt-address-translation.test b/bolt/test/X86/bolt-address-translation.test
index 4277b4e..5c1db89 100644
--- a/bolt/test/X86/bolt-address-translation.test
+++ b/bolt/test/X86/bolt-address-translation.test
@@ -37,7 +37,7 @@
 # CHECK:      BOLT: 3 out of 7 functions were overwritten.
 # CHECK:      BOLT-INFO: Wrote 6 BAT maps
 # CHECK:      BOLT-INFO: Wrote 3 function and 58 basic block hashes
-# CHECK:      BOLT-INFO: BAT section size (bytes): 816
+# CHECK:      BOLT-INFO: BAT section size (bytes): 920
 #
 # usqrt mappings (hot part). We match against any key (left side containing
 # the bolted binary offsets) because BOLT may change where it puts instructions