[𝘀𝗽𝗿] changes to main this commit is based on

Created using spr 1.3.4

[skip ci]
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 88313a6..77595bd 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -230,6 +230,12 @@
   /// Functions injected by BOLT
   std::vector<BinaryFunction *> InjectedBinaryFunctions;
 
+  /// Thunk functions.
+  std::vector<BinaryFunction *> ThunkBinaryFunctions;
+
+  /// Function that precedes thunks in the binary.
+  const BinaryFunction *ThunkLocation{nullptr};
+
   /// Jump tables for all functions mapped by address.
   std::map<uint64_t, JumpTable *> JumpTables;
 
@@ -435,7 +441,18 @@
 
   /// Return size of an entry for the given jump table \p Type.
   uint64_t getJumpTableEntrySize(JumpTable::JumpTableType Type) const {
-    return Type == JumpTable::JTT_PIC ? 4 : AsmInfo->getCodePointerSize();
+    switch (Type) {
+    case JumpTable::JTT_X86_64_PIC4:
+      return 4;
+    case JumpTable::JTT_X86_64_ABS:
+      return AsmInfo->getCodePointerSize();
+    case JumpTable::JTT_AARCH64_REL1:
+      return 1;
+    case JumpTable::JTT_AARCH64_REL2:
+      return 2;
+    case JumpTable::JTT_AARCH64_REL4:
+      return 4;
+    }
+    llvm_unreachable("invalid jump table type");
   }
 
   /// Return JumpTable containing a given \p Address.
@@ -553,6 +570,16 @@
     return InjectedBinaryFunctions;
   }
 
+  /// Create a BOLT-inserted thunk function with the given \p Name.
+  BinaryFunction *createThunkBinaryFunction(const std::string &Name);
+
+  std::vector<BinaryFunction *> &getThunkBinaryFunctions() {
+    return ThunkBinaryFunctions;
+  }
+
+  const BinaryFunction *getThunkLocation() const { return ThunkLocation; }
+
+  void setThunkLocation(const BinaryFunction *BF) { ThunkLocation = BF; }
+
   /// Return vector with all functions, i.e. include functions from the input
   /// binary and functions created by BOLT.
   std::vector<BinaryFunction *> getAllBinaryFunctions();
@@ -574,14 +601,13 @@
   /// If \p NextJTAddress is different from zero, it is used as an upper
   /// bound for jump table memory layout.
   ///
-  /// Optionally, populate \p Address from jump table entries. The entries
-  /// could be partially populated if the jump table detection fails.
+  /// If \p JT is set, populate it with jump table entries. The entries could be
+  /// partially populated if the jump table detection fails.
   bool analyzeJumpTable(const uint64_t Address,
                         const JumpTable::JumpTableType Type,
                         const BinaryFunction &BF,
                         const uint64_t NextJTAddress = 0,
-                        JumpTable::AddressesType *EntriesAsAddress = nullptr,
-                        bool *HasEntryInFragment = nullptr) const;
+                        JumpTable *JT = nullptr) const;
 
   /// After jump table locations are established, this function will populate
   /// their EntriesAsAddress based on memory contents.
@@ -1372,6 +1398,10 @@
   uint64_t
   computeInstructionSize(const MCInst &Inst,
                          const MCCodeEmitter *Emitter = nullptr) const {
+    // FIXME: hack for faster size computation on aarch64.
+    if (isAArch64())
+      return MIB->isPseudo(Inst) ? 0 : 4;
+
     if (std::optional<uint32_t> Size = MIB->getSize(Inst))
       return *Size;
 
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index d3d11f8..c18a43f 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -363,6 +363,10 @@
   /// True if the function should not have an associated symbol table entry.
   bool IsAnonymous{false};
 
+  /// True if the function is used for remapping hot text and shall not be
+  /// placed on a huge page.
+  bool IsHotTextMover{false};
+
   /// Name for the section this function code should reside in.
   std::string CodeSectionName;
 
@@ -1385,6 +1389,8 @@
   /// Return true if the function uses ORC format for stack unwinding.
   bool hasORC() const { return HasORC; }
 
+  bool isHotTextMover() const { return IsHotTextMover; }
+
   const JumpTable *getJumpTable(const MCInst &Inst) const {
     const uint64_t Address = BC.MIB->getJumpTable(Inst);
     return getJumpTableContainingAddress(Address);
@@ -1735,6 +1741,8 @@
   /// Mark function that should not be emitted.
   void setIgnored();
 
+  void setHotTextMover(bool V) { IsHotTextMover = V; }
+
   void setHasIndirectTargetToSplitFragment(bool V) {
     HasIndirectTargetToSplitFragment = V;
   }
diff --git a/bolt/include/bolt/Core/JumpTable.h b/bolt/include/bolt/Core/JumpTable.h
index 52b9cce..c76e2a9 100644
--- a/bolt/include/bolt/Core/JumpTable.h
+++ b/bolt/include/bolt/Core/JumpTable.h
@@ -16,6 +16,7 @@
 
 #include "bolt/Core/BinaryData.h"
 #include <map>
+#include <variant>
 #include <vector>
 
 namespace llvm {
@@ -40,6 +41,7 @@
 /// a different label at a different offset in this jump table.
 class JumpTable : public BinaryData {
   friend class BinaryContext;
+  friend class JumpTableInfoReader;
 
   JumpTable() = delete;
   JumpTable(const JumpTable &) = delete;
@@ -47,10 +49,34 @@
 
 public:
   enum JumpTableType : char {
-    JTT_NORMAL,
-    JTT_PIC,
+    JTT_X86_64_FIRST = 0,
+    JTT_X86_64_ABS = JTT_X86_64_FIRST,
+    JTT_X86_64_PIC4,
+    JTT_X86_64_LAST = JTT_X86_64_PIC4,
+    JTT_AARCH64_FIRST,
+    JTT_AARCH64_REL1 = JTT_AARCH64_FIRST,
+    JTT_AARCH64_REL2,
+    JTT_AARCH64_REL4,
+    JTT_AARCH64_LAST = JTT_AARCH64_REL4
   };
 
+  static StringRef getTypeStr(JumpTableType Type) {
+    switch (Type) {
+    case JTT_X86_64_ABS:
+      return "X86_64_ABS";
+    case JTT_X86_64_PIC4:
+      return "X86_64_PIC4";
+    case JTT_AARCH64_REL1:
+      return "AARCH64_REL1";
+    case JTT_AARCH64_REL2:
+      return "AARCH64_REL2";
+    case JTT_AARCH64_REL4:
+      return "AARCH64_REL4";
+    }
+    llvm_unreachable("invalid jump table type");
+  }
+
+  StringRef getTypeStr() const { return getTypeStr(Type); }
+
   /// Branch statistics for jump table entries.
   struct JumpInfo {
     uint64_t Mispreds{0};
@@ -92,6 +118,16 @@
   /// BinaryFunction this jump tables belongs to.
   SmallVector<BinaryFunction *, 1> Parents;
 
+  ///
+  /// AArch64-specific fields
+  ///
+
+  /// Entries are offsets relative to an arbitrary function location.
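+  /// Holds the raw base address until jump table entries are labeled, after
+  /// which it is replaced with a local label within the parent function.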
+  std::variant<uint64_t, MCSymbol *> BaseAddress;
+
+  /// Address of the instruction referencing the jump table (MemLocInstr).
+  uint64_t MemLocInstrAddress{0};
+
 private:
   /// Constructor should only be called by a BinaryContext.
   JumpTable(MCSymbol &Symbol, uint64_t Address, size_t EntrySize,
diff --git a/bolt/include/bolt/Passes/LongJmp.h b/bolt/include/bolt/Passes/LongJmp.h
index df3ea96..5388d7e 100644
--- a/bolt/include/bolt/Passes/LongJmp.h
+++ b/bolt/include/bolt/Passes/LongJmp.h
@@ -76,6 +76,28 @@
   /// 128MB of each other.
   void relaxLocalBranches(BinaryFunction &BF);
 
+  struct FunctionCluster {
+    DenseSet<BinaryFunction *> Functions;
+
+    // Functions that this cluster of functions is calling. Note that it
+    // excludes all functions in the cluster itself.
+    DenseSet<BinaryFunction *> Callees;
+
+    uint64_t Size{0};
+
+    // Last function in the cluster.
+    BinaryFunction *LastBF{nullptr};
+  };
+
+  /// Maximum size of a function cluster. Kept below 128MB so that the
+  /// cluster plus the thunk island still fits in the +/-128MB branch range.
+  static constexpr uint64_t MaxClusterSize = 125 * 1024 * 1024;
+
+  /// Relax calls for medium code model where code is < 256MB.
+  /// A thunk island will be introduced between two clusters of functions to
+  /// enable calls over 128MB.
+  void relaxCalls(BinaryContext &BC);
+
   ///                 -- Layout estimation methods --
   /// Try to do layout before running the emitter, by looking at BinaryFunctions
   /// and MCInsts -- this is an estimation. To be correct for longjmp inserter
diff --git a/bolt/include/bolt/Rewrite/MetadataManager.h b/bolt/include/bolt/Rewrite/MetadataManager.h
index 6001b70..cc6e3f9 100644
--- a/bolt/include/bolt/Rewrite/MetadataManager.h
+++ b/bolt/include/bolt/Rewrite/MetadataManager.h
@@ -31,6 +31,10 @@
   /// Run initializers after sections are discovered.
   void runSectionInitializers();
 
+  /// Execute metadata initializers when functions are discovered but not yet
+  /// disassembled.
+  void runInitializersPreDisasm();
+
   /// Execute initialization of rewriters while functions are disassembled, but
   /// CFG is not yet built.
   void runInitializersPreCFG();
diff --git a/bolt/include/bolt/Rewrite/MetadataRewriter.h b/bolt/include/bolt/Rewrite/MetadataRewriter.h
index 6ff8f0a..d39500c 100644
--- a/bolt/include/bolt/Rewrite/MetadataRewriter.h
+++ b/bolt/include/bolt/Rewrite/MetadataRewriter.h
@@ -49,6 +49,10 @@
   /// but before functions are discovered.
   virtual Error sectionInitializer() { return Error::success(); }
 
+  /// Run initialization after the functions are identified but not yet
+  /// disassembled.
+  virtual Error preDisasmInitializer() { return Error::success(); }
+
   /// Interface for modifying/annotating functions in the binary based on the
   /// contents of the section. Functions are in pre-cfg state.
   virtual Error preCFGInitializer() { return Error::success(); }
diff --git a/bolt/include/bolt/Rewrite/MetadataRewriters.h b/bolt/include/bolt/Rewrite/MetadataRewriters.h
index b71bd6c..ae34194 100644
--- a/bolt/include/bolt/Rewrite/MetadataRewriters.h
+++ b/bolt/include/bolt/Rewrite/MetadataRewriters.h
@@ -27,6 +27,8 @@
 
 std::unique_ptr<MetadataRewriter> createSDTRewriter(BinaryContext &);
 
+std::unique_ptr<MetadataRewriter> createJumpTableInfoReader(BinaryContext &);
+
 } // namespace bolt
 } // namespace llvm
 
diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h
index 94dd06e..8fd6a77 100644
--- a/bolt/include/bolt/Rewrite/RewriteInstance.h
+++ b/bolt/include/bolt/Rewrite/RewriteInstance.h
@@ -181,6 +181,9 @@
   /// Process metadata in sections before functions are discovered.
   void processSectionMetadata();
 
+  /// Process metadata in special sections before functions are disassembled.
+  void processMetadataPreDisasm();
+
   /// Process metadata in special sections before CFG is built for functions.
   void processMetadataPreCFG();
 
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 80b15d7..25a39f1 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -497,7 +497,7 @@
     const MemoryContentsType MemType = analyzeMemoryAt(Address, BF);
     if (MemType == MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE && IsPCRel) {
       const MCSymbol *Symbol =
-          getOrCreateJumpTable(BF, Address, JumpTable::JTT_PIC);
+          getOrCreateJumpTable(BF, Address, JumpTable::JTT_X86_64_PIC4);
 
       return std::make_pair(Symbol, 0);
     }
@@ -541,10 +541,10 @@
 
   // Start with checking for PIC jump table. We expect non-PIC jump tables
   // to have high 32 bits set to 0.
-  if (analyzeJumpTable(Address, JumpTable::JTT_PIC, BF))
+  if (analyzeJumpTable(Address, JumpTable::JTT_X86_64_PIC4, BF))
     return MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE;
 
-  if (analyzeJumpTable(Address, JumpTable::JTT_NORMAL, BF))
+  if (analyzeJumpTable(Address, JumpTable::JTT_X86_64_ABS, BF))
     return MemoryContentsType::POSSIBLE_JUMP_TABLE;
 
   return MemoryContentsType::UNKNOWN;
@@ -554,8 +554,7 @@
                                      const JumpTable::JumpTableType Type,
                                      const BinaryFunction &BF,
                                      const uint64_t NextJTAddress,
-                                     JumpTable::AddressesType *EntriesAsAddress,
-                                     bool *HasEntryInFragment) const {
+                                     JumpTable *JT) const {
   // Target address of __builtin_unreachable.
   const uint64_t UnreachableAddress = BF.getAddress() + BF.getSize();
 
@@ -572,11 +571,11 @@
   size_t TrimmedSize = 0;
 
   auto addEntryAddress = [&](uint64_t EntryAddress, bool Unreachable = false) {
-    if (!EntriesAsAddress)
+    if (!JT)
       return;
-    EntriesAsAddress->emplace_back(EntryAddress);
+    JT->EntriesAsAddress.emplace_back(EntryAddress);
     if (!Unreachable)
-      TrimmedSize = EntriesAsAddress->size();
+      TrimmedSize = JT->EntriesAsAddress.size();
   };
 
   ErrorOr<const BinarySection &> Section = getSectionForAddress(Address);
@@ -595,12 +594,9 @@
   if (NextJTAddress)
     UpperBound = std::min(NextJTAddress, UpperBound);
 
-  LLVM_DEBUG({
-    using JTT = JumpTable::JumpTableType;
-    dbgs() << formatv("BOLT-DEBUG: analyzeJumpTable @{0:x} in {1}, JTT={2}\n",
-                      Address, BF.getPrintName(),
-                      Type == JTT::JTT_PIC ? "PIC" : "Normal");
-  });
+  LLVM_DEBUG(
+      dbgs() << formatv("BOLT-DEBUG: analyzeJumpTable @{0:x} in {1}, JTT={2}\n",
+                        Address, BF, JumpTable::getTypeStr(Type)));
   const uint64_t EntrySize = getJumpTableEntrySize(Type);
   for (uint64_t EntryAddress = Address; EntryAddress <= UpperBound - EntrySize;
        EntryAddress += EntrySize) {
@@ -608,13 +604,13 @@
                       << " -> ");
     // Check if there's a proper relocation against the jump table entry.
     if (HasRelocations) {
-      if (Type == JumpTable::JTT_PIC &&
+      if (Type == JumpTable::JTT_X86_64_PIC4 &&
           !DataPCRelocations.count(EntryAddress)) {
         LLVM_DEBUG(
             dbgs() << "FAIL: JTT_PIC table, no relocation for this address\n");
         break;
       }
-      if (Type == JumpTable::JTT_NORMAL && !getRelocationAt(EntryAddress)) {
+      if (Type == JumpTable::JTT_X86_64_ABS && !getRelocationAt(EntryAddress)) {
         LLVM_DEBUG(
             dbgs()
             << "FAIL: JTT_NORMAL table, no relocation for this address\n");
@@ -622,10 +618,24 @@
       }
     }
 
-    const uint64_t Value =
-        (Type == JumpTable::JTT_PIC)
-            ? Address + *getSignedValueAtAddress(EntryAddress, EntrySize)
-            : *getPointerAtAddress(EntryAddress);
+    uint64_t Value = 0;
+    switch (Type) {
+    case JumpTable::JTT_X86_64_PIC4:
+      Value = Address + *getSignedValueAtAddress(EntryAddress, EntrySize);
+      break;
+    case JumpTable::JTT_X86_64_ABS:
+      Value = *getPointerAtAddress(EntryAddress);
+      break;
+    case JumpTable::JTT_AARCH64_REL1:
+    case JumpTable::JTT_AARCH64_REL2:
+    case JumpTable::JTT_AARCH64_REL4:
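+      // 1- and 2-byte entries store (Target - Base) >> 2; 4-byte entries
+      // store the unscaled offset.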
+      unsigned ShiftAmt = Type == JumpTable::JTT_AARCH64_REL4 ? 0 : 2;
+      assert(JT &&
+             "jump table must be non-null for AArch64 in analyzeJumpTable");
+      Value = std::get<uint64_t>(JT->BaseAddress) +
+              (*getUnsignedValueAtAddress(EntryAddress, EntrySize) << ShiftAmt);
+      break;
+    }
 
     // __builtin_unreachable() case.
     if (Value == UnreachableAddress) {
@@ -646,24 +656,19 @@
 
     // Function or one of its fragments.
     const BinaryFunction *TargetBF = getBinaryFunctionContainingAddress(Value);
-    const bool DoesBelongToFunction =
-        BF.containsAddress(Value) ||
-        (TargetBF && areRelatedFragments(TargetBF, &BF));
-    if (!DoesBelongToFunction) {
+    if (!TargetBF || !areRelatedFragments(TargetBF, &BF)) {
       LLVM_DEBUG({
-        if (!BF.containsAddress(Value)) {
-          dbgs() << "FAIL: function doesn't contain this address\n";
-          if (TargetBF) {
-            dbgs() << "  ! function containing this address: "
-                   << TargetBF->getPrintName() << '\n';
-            if (TargetBF->isFragment()) {
-              dbgs() << "  ! is a fragment";
-              for (BinaryFunction *Parent : TargetBF->ParentFragments)
-                dbgs() << ", parent: " << Parent->getPrintName();
-              dbgs() << '\n';
-            }
-          }
-        }
+        dbgs() << "FAIL: function doesn't contain this address\n";
+        if (!TargetBF)
+          break;
+        dbgs() << "  ! function containing this address: " << *TargetBF << '\n';
+        if (!TargetBF->isFragment())
+          break;
+        dbgs() << "  ! is a fragment with parents: ";
+        ListSeparator LS;
+        for (BinaryFunction *Parent : TargetBF->ParentFragments)
+          dbgs() << LS << *Parent;
+        dbgs() << '\n';
       });
       break;
     }
@@ -678,17 +683,17 @@
     ++NumRealEntries;
     LLVM_DEBUG(dbgs() << formatv("OK: {0:x} real entry\n", Value));
 
-    if (TargetBF != &BF && HasEntryInFragment)
-      *HasEntryInFragment = true;
+    if (TargetBF != &BF && JT)
+      JT->IsSplit = true;
     addEntryAddress(Value);
   }
 
   // Trim direct/normal jump table to exclude trailing unreachable entries that
   // can collide with a function address.
-  if (Type == JumpTable::JTT_NORMAL && EntriesAsAddress &&
-      TrimmedSize != EntriesAsAddress->size() &&
+  if (Type == JumpTable::JTT_X86_64_ABS && JT &&
+      TrimmedSize != JT->EntriesAsAddress.size() &&
       getBinaryFunctionAtAddress(UnreachableAddress))
-    EntriesAsAddress->resize(TrimmedSize);
+    JT->EntriesAsAddress.resize(TrimmedSize);
 
   // It's a jump table if the number of real entries is more than 1, or there's
   // one real entry and one or more special targets. If there are only multiple
@@ -703,20 +708,20 @@
        ++JTI) {
     JumpTable *JT = JTI->second;
 
-    bool NonSimpleParent = false;
-    for (BinaryFunction *BF : JT->Parents)
-      NonSimpleParent |= !BF->isSimple();
-    if (NonSimpleParent)
+    auto isSimple = [](const BinaryFunction *BF) { return BF->isSimple(); };
+    if (!llvm::all_of(JT->Parents, isSimple))
       continue;
 
     uint64_t NextJTAddress = 0;
     auto NextJTI = std::next(JTI);
-    if (NextJTI != JTE)
+    if (isAArch64()) {
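+      // Placeholder entries from metadata only convey the table size; drop
+      // them before re-populating.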
+      NextJTAddress = JT->getAddress() + JT->getSize();
+      JT->Entries.clear();
+    } else if (NextJTI != JTE)
       NextJTAddress = NextJTI->second->getAddress();
 
-    const bool Success =
-        analyzeJumpTable(JT->getAddress(), JT->Type, *(JT->Parents[0]),
-                         NextJTAddress, &JT->EntriesAsAddress, &JT->IsSplit);
+    const bool Success = analyzeJumpTable(
+        JT->getAddress(), JT->Type, *JT->Parents.front(), NextJTAddress, JT);
     if (!Success) {
       LLVM_DEBUG({
         dbgs() << "failed to analyze ";
@@ -744,7 +749,7 @@
 
     // In strict mode, erase PC-relative relocation record. Later we check that
     // all such records are erased and thus have been accounted for.
-    if (opts::StrictMode && JT->Type == JumpTable::JTT_PIC) {
+    if (opts::StrictMode && JT->Type == JumpTable::JTT_X86_64_PIC4) {
       for (uint64_t Address = JT->getAddress();
            Address < JT->getAddress() + JT->getSize();
            Address += JT->EntrySize) {
@@ -840,33 +845,26 @@
     assert(JT->Type == Type && "jump table types have to match");
     assert(Address == JT->getAddress() && "unexpected non-empty jump table");
 
-    // Prevent associating a jump table to a specific fragment twice.
-    if (!llvm::is_contained(JT->Parents, &Function)) {
-      assert(llvm::all_of(JT->Parents,
-                          [&](const BinaryFunction *BF) {
-                            return areRelatedFragments(&Function, BF);
-                          }) &&
-             "cannot re-use jump table of a different function");
-      // Duplicate the entry for the parent function for easy access
-      JT->Parents.push_back(&Function);
-      if (opts::Verbosity > 2) {
-        this->outs() << "BOLT-INFO: Multiple fragments access same jump table: "
-                     << JT->Parents[0]->getPrintName() << "; "
-                     << Function.getPrintName() << "\n";
-        JT->print(this->outs());
-      }
-      Function.JumpTables.emplace(Address, JT);
-      for (BinaryFunction *Parent : JT->Parents)
-        Parent->setHasIndirectTargetToSplitFragment(true);
-    }
+    if (llvm::is_contained(JT->Parents, &Function))
+      return JT->getFirstLabel();
 
-    bool IsJumpTableParent = false;
-    (void)IsJumpTableParent;
-    for (BinaryFunction *Frag : JT->Parents)
-      if (Frag == &Function)
-        IsJumpTableParent = true;
-    assert(IsJumpTableParent &&
+    // Prevent associating a jump table to a specific fragment twice.
+    auto isSibling = [&](const BinaryFunction *BF) {
+      return areRelatedFragments(&Function, BF);
+    };
+    assert(llvm::all_of(JT->Parents, isSibling) &&
            "cannot re-use jump table of a different function");
+    if (opts::Verbosity > 2) {
+      this->outs() << "BOLT-INFO: Multiple fragments access same jump table: "
+                   << JT->Parents[0]->getPrintName() << "; "
+                   << Function.getPrintName() << "\n";
+      JT->print(this->outs());
+    }
+    if (JT->Parents.size() == 1)
+      JT->Parents.front()->setHasIndirectTargetToSplitFragment(true);
+    Function.setHasIndirectTargetToSplitFragment(true);
+    // Duplicate the entry for the parent function for easy access
+    JT->Parents.push_back(&Function);
+    Function.JumpTables.emplace(Address, JT);
     return JT->getFirstLabel();
   }
 
@@ -1611,7 +1609,21 @@
                   SortedFunctions.begin(),
                   [](BinaryFunction &BF) { return &BF; });
 
-  llvm::stable_sort(SortedFunctions, compareBinaryFunctionByIndex);
+  llvm::stable_sort(SortedFunctions,
+                    [](const BinaryFunction *A, const BinaryFunction *B) {
+                      // Place hot text movers at the start.
+                      if (A->isHotTextMover() != B->isHotTextMover())
+                        return A->isHotTextMover();
+                      if (A->hasValidIndex() && B->hasValidIndex())
+                        return A->getIndex() < B->getIndex();
+                      return opts::HotFunctionsAtEnd ? B->hasValidIndex()
+                                                     : A->hasValidIndex();
+                    });
   return SortedFunctions;
 }
 
@@ -2434,6 +2446,15 @@
   return PBF;
 }
 
+BinaryFunction *
+BinaryContext::createThunkBinaryFunction(const std::string &Name) {
+  BinaryFunction *BF = new BinaryFunction(Name, *this, /*IsSimple=*/true);
+  ThunkBinaryFunctions.push_back(BF);
+  setSymbolToFunctionMap(BF->getSymbol(), BF);
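+  // Thunk bodies are constructed directly in CFG form, so mark the state
+  // accordingly.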
+  BF->CurrentState = BinaryFunction::State::CFG;
+  return BF;
+}
+
 std::pair<size_t, size_t>
 BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
   // Adjust branch instruction to match the current layout.
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 1aad252..db0f11b 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -19,6 +19,7 @@
 #include "bolt/Utils/CommandLineOpts.h"
 #include "bolt/Utils/Utils.h"
 #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/CommandLine.h"
@@ -271,6 +272,14 @@
 
       if (Emitted)
         Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics);
+
+      // Emit thunks right after the function designated as the thunk
+      // location.
+      if (BC.getThunkLocation() != Function)
+        continue;
+
+      for (BinaryFunction *Thunk : BC.getThunkBinaryFunctions()) {
+        emitFunction(*Thunk, Thunk->getLayout().getMainFragment());
+      }
     }
   };
 
@@ -809,57 +818,71 @@
     Streamer.switchSection(JT.Count > 0 ? HotSection : ColdSection);
     Streamer.emitValueToAlignment(Align(JT.EntrySize));
   }
-  MCSymbol *LastLabel = nullptr;
+  MCSymbol *JTLabel = nullptr;
+  MCContext &Context = Streamer.getContext();
   uint64_t Offset = 0;
   for (MCSymbol *Entry : JT.Entries) {
     auto LI = JT.Labels.find(Offset);
-    if (LI != JT.Labels.end()) {
-      LLVM_DEBUG({
-        dbgs() << "BOLT-DEBUG: emitting jump table " << LI->second->getName()
-               << " (originally was at address 0x"
-               << Twine::utohexstr(JT.getAddress() + Offset)
-               << (Offset ? ") as part of larger jump table\n" : ")\n");
-      });
-      if (!LabelCounts.empty()) {
-        LLVM_DEBUG(dbgs() << "BOLT-DEBUG: jump table count: "
-                          << LabelCounts[LI->second] << '\n');
-        if (LabelCounts[LI->second] > 0)
-          Streamer.switchSection(HotSection);
-        else
-          Streamer.switchSection(ColdSection);
-        Streamer.emitValueToAlignment(Align(JT.EntrySize));
-      }
-      // Emit all labels registered at the address of this jump table
-      // to sync with our global symbol table.  We may have two labels
-      // registered at this address if one label was created via
-      // getOrCreateGlobalSymbol() (e.g. LEA instructions referencing
-      // this location) and another via getOrCreateJumpTable().  This
-      // creates a race where the symbols created by these two
-      // functions may or may not be the same, but they are both
-      // registered in our symbol table at the same address. By
-      // emitting them all here we make sure there is no ambiguity
-      // that depends on the order that these symbols were created, so
-      // whenever this address is referenced in the binary, it is
-      // certain to point to the jump table identified at this
-      // address.
-      if (BinaryData *BD = BC.getBinaryDataByName(LI->second->getName())) {
-        for (MCSymbol *S : BD->getSymbols())
-          Streamer.emitLabel(S);
-      } else {
-        Streamer.emitLabel(LI->second);
-      }
-      LastLabel = LI->second;
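+    // Entries without a label of their own jump straight to emitEntry and
+    // reuse the previously emitted label as their base (former LastLabel).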
+    if (LI == JT.Labels.end())
+      goto emitEntry;
+    JTLabel = LI->second;
+    LLVM_DEBUG({
+      dbgs() << "BOLT-DEBUG: emitting jump table " << JTLabel->getName()
+             << " (originally was at address 0x"
+             << Twine::utohexstr(JT.getAddress() + Offset)
+             << (Offset ? ") as part of larger jump table\n" : ")\n");
+    });
+    if (!LabelCounts.empty()) {
+      uint64_t JTCount = LabelCounts[JTLabel];
+      LLVM_DEBUG(dbgs() << "BOLT-DEBUG: jump table count: " << JTCount << '\n');
+      Streamer.switchSection(JTCount ? HotSection : ColdSection);
+      Streamer.emitValueToAlignment(Align(JT.EntrySize));
     }
-    if (JT.Type == JumpTable::JTT_NORMAL) {
+    // Emit all labels registered at the address of this jump table
+    // to sync with our global symbol table.  We may have two labels
+    // registered at this address if one label was created via
+    // getOrCreateGlobalSymbol() (e.g. LEA instructions referencing
+    // this location) and another via getOrCreateJumpTable().  This
+    // creates a race where the symbols created by these two
+    // functions may or may not be the same, but they are both
+    // registered in our symbol table at the same address. By
+    // emitting them all here we make sure there is no ambiguity
+    // that depends on the order that these symbols were created, so
+    // whenever this address is referenced in the binary, it is
+    // certain to point to the jump table identified at this
+    // address.
+    if (BinaryData *BD = BC.getBinaryDataByName(JTLabel->getName())) {
+      for (MCSymbol *S : BD->getSymbols())
+        Streamer.emitLabel(S);
+    } else {
+      Streamer.emitLabel(JTLabel);
+    }
+  emitEntry:
+    switch (JT.Type) {
+    case JumpTable::JTT_X86_64_ABS:
       Streamer.emitSymbolValue(Entry, JT.OutputEntrySize);
-    } else { // JTT_PIC
-      const MCSymbolRefExpr *JTExpr =
-          MCSymbolRefExpr::create(LastLabel, Streamer.getContext());
-      const MCSymbolRefExpr *E =
-          MCSymbolRefExpr::create(Entry, Streamer.getContext());
-      const MCBinaryExpr *Value =
-          MCBinaryExpr::createSub(E, JTExpr, Streamer.getContext());
+      break;
+    case JumpTable::JTT_X86_64_PIC4: {
+      const MCSymbolRefExpr *JTExpr = MCSymbolRefExpr::create(JTLabel, Context);
+      const MCSymbolRefExpr *E = MCSymbolRefExpr::create(Entry, Context);
+      const MCBinaryExpr *Value = MCBinaryExpr::createSub(E, JTExpr, Context);
       Streamer.emitValue(Value, JT.EntrySize);
+      break;
+    }
+    case JumpTable::JTT_AARCH64_REL1:
+    case JumpTable::JTT_AARCH64_REL2:
+    case JumpTable::JTT_AARCH64_REL4: {
+      MCSymbol *BaseSym = std::get<MCSymbol *>(JT.BaseAddress);
+      const MCExpr *Base = MCSymbolRefExpr::create(BaseSym, Context);
+      const MCExpr *E = MCSymbolRefExpr::create(Entry, Context);
+      const MCBinaryExpr *Value = MCBinaryExpr::createSub(E, Base, Context);
+      if (JT.EntrySize != 4)
+        Value = MCBinaryExpr::createLShr(
+            Value, MCConstantExpr::create(2, Context), Context);
+
+      Streamer.emitValue(Value, JT.EntrySize);
+      break;
+    }
     }
     Offset += JT.EntrySize;
   }
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index d1b293a..678b944 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -112,6 +112,10 @@
                             cl::desc("try to preserve basic block alignment"),
                             cl::cat(BoltOptCategory));
 
+static cl::opt<bool> PrintOffsets("print-offsets",
+                                  cl::desc("print basic block offsets"),
+                                  cl::Hidden, cl::cat(BoltOptCategory));
+
 static cl::opt<bool> PrintOutputAddressRange(
     "print-output-address-range",
     cl::desc(
@@ -556,6 +560,11 @@
       if (BB->isLandingPad())
         OS << "  Landing Pad\n";
 
+      if (opts::PrintOffsets && BB->getOutputStartAddress()) {
+        OS << "  OutputOffset: 0x"
+           << Twine::utohexstr(BB->getOutputStartAddress()) << '\n';
+      }
+
       uint64_t BBExecCount = BB->getExecutionCount();
       if (hasValidProfile()) {
         OS << "  Exec Count : ";
@@ -909,7 +918,7 @@
            "Invalid memory instruction");
     const MCExpr *FixedEntryDispExpr = FixedEntryDispOperand->getExpr();
     const uint64_t EntryAddress = getExprValue(FixedEntryDispExpr);
-    uint64_t EntrySize = BC.getJumpTableEntrySize(JumpTable::JTT_PIC);
+    uint64_t EntrySize = BC.getJumpTableEntrySize(JumpTable::JTT_X86_64_PIC4);
     ErrorOr<int64_t> Value =
         BC.getSignedValueAtAddress(EntryAddress, EntrySize);
     if (!Value)
@@ -979,12 +988,14 @@
   MemoryContentsType MemType;
   if (JumpTable *JT = BC.getJumpTableContainingAddress(ArrayStart)) {
     switch (JT->Type) {
-    case JumpTable::JTT_NORMAL:
+    case JumpTable::JTT_X86_64_ABS:
       MemType = MemoryContentsType::POSSIBLE_JUMP_TABLE;
       break;
-    case JumpTable::JTT_PIC:
+    case JumpTable::JTT_X86_64_PIC4:
       MemType = MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE;
       break;
+    default:
+      llvm_unreachable("Unhandled jump table type");
     }
   } else {
     MemType = BC.analyzeMemoryAt(ArrayStart, *this);
@@ -995,7 +1006,7 @@
   if (BranchType == IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE) {
     if (MemType != MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE)
       return IndirectBranchType::UNKNOWN;
-    JTType = JumpTable::JTT_PIC;
+    JTType = JumpTable::JTT_X86_64_PIC4;
   } else {
     if (MemType == MemoryContentsType::POSSIBLE_PIC_JUMP_TABLE)
       return IndirectBranchType::UNKNOWN;
@@ -1004,7 +1015,7 @@
       return IndirectBranchType::POSSIBLE_TAIL_CALL;
 
     BranchType = IndirectBranchType::POSSIBLE_JUMP_TABLE;
-    JTType = JumpTable::JTT_NORMAL;
+    JTType = JumpTable::JTT_X86_64_ABS;
   }
 
   // Convert the instruction into jump table branch.
@@ -1908,7 +1919,8 @@
   // Create labels for all entries.
   for (auto &JTI : JumpTables) {
     JumpTable &JT = *JTI.second;
-    if (JT.Type == JumpTable::JTT_PIC && opts::JumpTables == JTS_BASIC) {
+    if ((JT.Type == JumpTable::JTT_X86_64_PIC4 || BC.isAArch64()) &&
+        opts::JumpTables == JTS_BASIC) {
       opts::JumpTables = JTS_MOVE;
       BC.outs() << "BOLT-INFO: forcing -jump-tables=move as PIC jump table was "
                    "detected in function "
@@ -1953,6 +1965,12 @@
       }
       JT.Entries.push_back(Label);
     }
+    // Register the jump table base address as a local symbol.
+    if (uint64_t BaseAddress = std::get<uint64_t>(JT.BaseAddress)) {
+      BinaryFunction *BF = BC.getBinaryFunctionContainingAddress(BaseAddress);
+      assert(BF && "must have a valid jump table base address");
+      JT.BaseAddress = BF->getOrCreateLocalLabel(BaseAddress);
+    }
   }
 
   // Add TakenBranches from JumpTables.
@@ -2103,7 +2121,7 @@
           BC.MIB->unsetJumpTable(Instr);
 
           JumpTable *JT = BC.getJumpTableContainingAddress(LastJT);
-          if (JT->Type == JumpTable::JTT_NORMAL) {
+          if (JT->Type == JumpTable::JTT_X86_64_ABS) {
             // Invalidating the jump table may also invalidate other jump table
             // boundaries. Until we have/need a support for this, mark the
             // function as non-simple.
diff --git a/bolt/lib/Core/JumpTable.cpp b/bolt/lib/Core/JumpTable.cpp
index 6f588d2..e780c73 100644
--- a/bolt/lib/Core/JumpTable.cpp
+++ b/bolt/lib/Core/JumpTable.cpp
@@ -84,10 +84,10 @@
   const uint64_t BaseOffset = getAddress() - getSection().getAddress();
   uint64_t EntryOffset = BaseOffset;
   for (MCSymbol *Entry : Entries) {
-    const uint32_t RelType =
-        Type == JTT_NORMAL ? ELF::R_X86_64_64 : ELF::R_X86_64_PC32;
+    const uint64_t RelType =
+        Type == JTT_X86_64_ABS ? ELF::R_X86_64_64 : ELF::R_X86_64_PC32;
     const uint64_t RelAddend =
-        Type == JTT_NORMAL ? 0 : EntryOffset - BaseOffset;
+        Type == JTT_X86_64_ABS ? 0 : EntryOffset - BaseOffset;
     // Replace existing relocation with the new one to allow any modifications
     // to the original jump table.
     if (BC.HasRelocations)
@@ -99,7 +99,7 @@
 
 void bolt::JumpTable::print(raw_ostream &OS) const {
   uint64_t Offset = 0;
-  if (Type == JTT_PIC)
+  if (Type == JTT_X86_64_PIC4)
     OS << "PIC ";
   ListSeparator LS;
 
diff --git a/bolt/lib/Passes/Aligner.cpp b/bolt/lib/Passes/Aligner.cpp
index c3ddeda..1b499ac 100644
--- a/bolt/lib/Passes/Aligner.cpp
+++ b/bolt/lib/Passes/Aligner.cpp
@@ -77,6 +77,11 @@
   size_t HotSize = 0;
   size_t ColdSize = 0;
 
+  if (!Function.hasProfile() && BC.isAArch64()) {
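+    // Use the minimum alignment for functions without profile data to keep
+    // code compact.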
+    Function.setAlignment(Function.getMinAlignment());
+    return;
+  }
+
   for (const BinaryBasicBlock &BB : Function)
     if (BB.isSplit())
       ColdSize += BC.computeCodeSize(BB.begin(), BB.end(), Emitter);
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d8628c6..6b5e08b 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1287,6 +1287,8 @@
     if (opts::isHotTextMover(Function)) {
       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
+      // TODO: find a better place to mark a function as a mover.
+      Function.setHotTextMover(true);
       continue;
     }
 
diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp
index 2b5a591..d70fd0e 100644
--- a/bolt/lib/Passes/IndirectCallPromotion.cpp
+++ b/bolt/lib/Passes/IndirectCallPromotion.cpp
@@ -246,7 +246,7 @@
 
   if (const JumpTable *JT = BF.getJumpTable(Inst)) {
     // Don't support PIC jump tables for now
-    if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_PIC)
+    if (!opts::ICPJumpTablesByTarget && JT->Type == JumpTable::JTT_X86_64_PIC4)
       return Targets;
     const Location From(BF.getSymbol());
     const std::pair<size_t, size_t> Range =
@@ -256,7 +256,7 @@
     const JumpTable::JumpInfo *JI =
         JT->Counts.empty() ? &DefaultJI : &JT->Counts[Range.first];
     const size_t JIAdj = JT->Counts.empty() ? 0 : 1;
-    assert(JT->Type == JumpTable::JTT_PIC ||
+    assert(JT->Type == JumpTable::JTT_X86_64_PIC4 ||
            JT->EntrySize == BC.AsmInfo->getCodePointerSize());
     for (size_t I = Range.first; I < Range.second; ++I, JI += JIAdj) {
       MCSymbol *Entry = JT->Entries[I];
diff --git a/bolt/lib/Passes/JTFootprintReduction.cpp b/bolt/lib/Passes/JTFootprintReduction.cpp
index 71bdbba..13b37dc 100644
--- a/bolt/lib/Passes/JTFootprintReduction.cpp
+++ b/bolt/lib/Passes/JTFootprintReduction.cpp
@@ -202,7 +202,7 @@
 
   JumpTable->OutputEntrySize = 4;
   // DePICify
-  JumpTable->Type = JumpTable::JTT_NORMAL;
+  JumpTable->Type = JumpTable::JTT_X86_64_ABS;
 
   BB.replaceInstruction(Inst, NewFrag.begin(), NewFrag.end());
   return true;
diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp
index e6bd417..75227da 100644
--- a/bolt/lib/Passes/LongJmp.cpp
+++ b/bolt/lib/Passes/LongJmp.cpp
@@ -31,6 +31,11 @@
                      cl::desc("generate code for binaries <128MB on AArch64"),
                      cl::init(false), cl::cat(BoltCategory));
 
+static cl::opt<bool>
+    ExperimentalRelaxation("relax-exp",
+                           cl::desc("run experimental relaxation pass"),
+                           cl::init(false), cl::cat(BoltOptCategory));
+
 static cl::opt<bool> GroupStubs("group-stubs",
                                 cl::desc("share stubs across functions"),
                                 cl::init(true), cl::cat(BoltOptCategory));
@@ -897,12 +902,185 @@
   }
 }
 
+void LongJmpPass::relaxCalls(BinaryContext &BC) {
+  uint64_t EstimatedSize = 0;
+  // Conservatively estimate emitted function size.
+  auto estimateFunctionSize = [&](const BinaryFunction &BF) -> uint64_t {
+    if (!BC.shouldEmit(BF))
+      return 0;
+    uint64_t Size = BF.estimateSize();
+    if (BF.hasValidIndex())
+      Size += BF.getAlignment();
+    if (BF.hasIslandsInfo()) {
+      Size += BF.estimateConstantIslandSize();
+      Size += BF.getConstantIslandAlignment();
+    }
+
+    return Size;
+  };
+
+  // Map every function to its direct callees. Note that this is different
+  // from a typical call graph as we completely ignore indirect calls.
+  std::unordered_map<BinaryFunction *, std::set<BinaryFunction *>> CallMap;
+  for (BinaryFunction &BF : llvm::make_second_range(BC.getBinaryFunctions())) {
+    if (!BC.shouldEmit(BF))
+      continue;
+
+    EstimatedSize += estimateFunctionSize(BF);
+
+    for (const BinaryBasicBlock &BB : BF) {
+      for (const MCInst &Inst : BB) {
+        if (!BC.MIB->isCall(Inst) || BC.MIB->isIndirectCall(Inst) ||
+            BC.MIB->isIndirectBranch(Inst))
+          continue;
+        const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Inst);
+        assert(TargetSymbol);
+
+        BinaryFunction *Callee = BC.getFunctionForSymbol(TargetSymbol);
+        // Ignore calls to targets without an associated function.
+        if (!Callee)
+          continue;
+
+        CallMap[&BF].insert(Callee);
+      }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "LongJmp: estimated code size : " << EstimatedSize
+                    << '\n');
+
+  // Build clusters in the order the functions will appear in the output.
+  std::vector<FunctionCluster> Clusters;
+  Clusters.emplace_back();
+
+  for (BinaryFunction *BF : BC.getSortedFunctions()) {
+    if (!BC.shouldEmit(*BF))
+      continue;
+
+    const uint64_t BFSize = estimateFunctionSize(*BF);
+    if (Clusters.back().Size + BFSize > MaxClusterSize)
+      Clusters.emplace_back();
+
+    FunctionCluster &FC = Clusters.back();
+    FC.Functions.insert(BF);
+    FC.Callees.erase(BF);
+    FC.Size += BFSize;
+    FC.LastBF = BF;
+
+    for (BinaryFunction *Callee : CallMap[BF])
+      if (!FC.Functions.count(Callee))
+        FC.Callees.insert(Callee);
+  }
+
+  // Print cluster stats.
+  LLVM_DEBUG({
+    dbgs() << "Built " << Clusters.size() << " clusters\n";
+    uint64_t Index = 0;
+    for (const FunctionCluster &FC : Clusters) {
+      dbgs() << "  Cluster: " << Index++ << '\n';
+      dbgs() << "    " << FC.Functions.size() << " functions\n";
+      dbgs() << "    " << FC.Callees.size() << " callees\n";
+      dbgs() << "    " << FC.Size << " bytes\n";
+    }
+  });
+
+  if (Clusters.size() > 2) {
+    BC.errs() << "Large code model is unsupported\n";
+    exit(1);
+  }
+
+  if (Clusters.size() == 1)
+    return;
+
+  // Populate one of the clusters with PLT functions based on the proximity of
+  // the PLT section to avoid unneeded thunk redirection.
+  // FIXME: this part is extremely fragile as it depends on the placement
+  //        of PLT section and its proximity to old or new .text.
+  // FIXME: a slightly better approach will be to always use thunks for PLT and
+  //        eliminate redirection later using final addresses in address maps.
+  const size_t PLTClusterNum = opts::UseOldText ? 1 : 0;
+  for (BinaryFunction &BF : llvm::make_second_range(BC.getBinaryFunctions())) {
+    if (BF.isPLTFunction()) {
+      FunctionCluster &PLTCluster = Clusters[PLTClusterNum];
+      PLTCluster.Functions.insert(&BF);
+      PLTCluster.Callees.erase(&BF);
+    }
+  }
+
+  // FIXME: section name to use for thunks.
+  std::string SectionName = Clusters[0].LastBF->getCodeSectionName().str();
+
+  // Build thunk functions.
+  auto createSmallThunk = [&](BinaryFunction &Callee) {
+    BinaryFunction *ThunkBF =
+        BC.createThunkBinaryFunction("__BThunk__" + Callee.getOneName().str());
+    MCInst Inst;
+    BC.MIB->createTailCall(Inst, Callee.getSymbol(), BC.Ctx.get());
+    ThunkBF->addBasicBlock()->addInstruction(Inst);
+    ThunkBF->setCodeSectionName(SectionName);
+
+    return ThunkBF;
+  };
+
+  DenseMap<BinaryFunction *, BinaryFunction *> Thunks;
+  for (FunctionCluster &FC : Clusters) {
+    SmallVector<BinaryFunction *, 16> Callees(FC.Callees.begin(),
+                                              FC.Callees.end());
+    llvm::sort(Callees, compareBinaryFunctionByIndex);
+    for (BinaryFunction *Callee : Callees)
+      Thunks[Callee] = createSmallThunk(*Callee);
+  }
+
+  BC.outs() << "BOLT-INFO: " << Thunks.size() << " thunks created\n";
+
+  // Replace callees with thunks.
+  for (FunctionCluster &FC : Clusters) {
+    for (BinaryFunction *BF : FC.Functions) {
+      if (!CallMap.count(BF))
+        continue;
+
+      for (BinaryBasicBlock &BB : *BF) {
+        for (MCInst &Inst : BB) {
+          if (!BC.MIB->isCall(Inst) || BC.MIB->isIndirectCall(Inst) ||
+              BC.MIB->isIndirectBranch(Inst))
+            continue;
+          const MCSymbol *TargetSymbol = BC.MIB->getTargetSymbol(Inst);
+          assert(TargetSymbol);
+
+          BinaryFunction *Callee = BC.getFunctionForSymbol(TargetSymbol);
+          // Ignore calls to targets without an associated function.
+          if (!Callee)
+            continue;
+
+          // Skip callees in the same cluster -- they are reachable directly.
+          if (!FC.Callees.count(Callee))
+            continue;
+
+          // Use thunk as the call destination.
+          BC.MIB->replaceBranchTarget(Inst, Thunks[Callee]->getSymbol(),
+                                      BC.Ctx.get());
+        }
+      }
+    }
+  }
+
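+  // The emitter places all thunks immediately after this function, forming
+  // a thunk island between the two clusters.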
+  BC.setThunkLocation(Clusters[0].LastBF);
+}
+
 Error LongJmpPass::runOnFunctions(BinaryContext &BC) {
 
-  if (opts::CompactCodeModel) {
+  if (opts::CompactCodeModel || opts::ExperimentalRelaxation) {
     BC.outs()
         << "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";
 
+    // TODO: set the correct code model based on the total size of split code.
     ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
       relaxLocalBranches(BF);
     };
@@ -916,6 +1094,12 @@
         BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
         SkipPredicate, "RelaxLocalBranches");
 
+    if (!opts::ExperimentalRelaxation)
+      return Error::success();
+
+    BC.outs() << "BOLT-INFO: starting experimental relaxation pass\n";
+    relaxCalls(BC);
+
     return Error::success();
   }
 
diff --git a/bolt/lib/Passes/PatchEntries.cpp b/bolt/lib/Passes/PatchEntries.cpp
index 8a2f0a3..a37ee33 100644
--- a/bolt/lib/Passes/PatchEntries.cpp
+++ b/bolt/lib/Passes/PatchEntries.cpp
@@ -36,16 +36,20 @@
   if (!opts::ForcePatch) {
     // Mark the binary for patching if we did not create external references
     // for original code in any of functions we are not going to emit.
-    bool NeedsPatching = llvm::any_of(
-        llvm::make_second_range(BC.getBinaryFunctions()),
-        [&](BinaryFunction &BF) {
-          return !BC.shouldEmit(BF) && !BF.hasExternalRefRelocations();
-        });
+    bool NeedsPatching =
+        llvm::any_of(llvm::make_second_range(BC.getBinaryFunctions()),
+                     [&](BinaryFunction &BF) {
+                       return !BF.isPseudo() && !BC.shouldEmit(BF) &&
+                              !BF.hasExternalRefRelocations();
+                     });
 
     if (!NeedsPatching)
       return Error::success();
   }
 
+  assert(!opts::UseOldText &&
+         "Cannot patch entries while overwriting original .text");
+
   if (opts::Verbosity >= 1)
     BC.outs() << "BOLT-INFO: patching entries in original code\n";
 
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index dd48653..8304693 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -497,6 +497,9 @@
   // memory profiling data.
   Manager.registerPass(std::make_unique<ReorderData>());
 
+  // Assign each function an output section.
+  Manager.registerPass(std::make_unique<AssignSections>());
+
   if (BC.isAArch64()) {
     Manager.registerPass(
         std::make_unique<ADRRelaxationPass>(PrintAdrRelaxation));
@@ -521,9 +524,6 @@
   Manager.registerPass(
       std::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));
 
-  // Assign each function an output section.
-  Manager.registerPass(std::make_unique<AssignSections>());
-
   // Patch original function entries
   if (BC.HasRelocations)
     Manager.registerPass(std::make_unique<PatchEntries>());
diff --git a/bolt/lib/Rewrite/CMakeLists.txt b/bolt/lib/Rewrite/CMakeLists.txt
index c83cf36..d27bd22 100644
--- a/bolt/lib/Rewrite/CMakeLists.txt
+++ b/bolt/lib/Rewrite/CMakeLists.txt
@@ -17,6 +17,7 @@
   DWARFRewriter.cpp
   ExecutableFileMemoryManager.cpp
   JITLinkLinker.cpp
+  JumpTableInfoReader.cpp
   LinuxKernelRewriter.cpp
   MachORewriteInstance.cpp
   MetadataManager.cpp
diff --git a/bolt/lib/Rewrite/JumpTableInfoReader.cpp b/bolt/lib/Rewrite/JumpTableInfoReader.cpp
new file mode 100644
index 0000000..98230a2
--- /dev/null
+++ b/bolt/lib/Rewrite/JumpTableInfoReader.cpp
@@ -0,0 +1,91 @@
+//===- bolt/Rewrite/JumpTableInfoReader.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Read .llvm_jump_table_info section and register jump tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Core/JumpTable.h"
+#include "bolt/Rewrite/MetadataRewriter.h"
+#include "bolt/Rewrite/MetadataRewriters.h"
+#include "llvm/Support/DataExtractor.h"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace {
+class JumpTableInfoReader final : public MetadataRewriter {
+
+public:
+  JumpTableInfoReader(StringRef Name, BinaryContext &BC)
+      : MetadataRewriter(Name, BC) {}
+  Error preDisasmInitializer() override;
+};
+
+Error JumpTableInfoReader::preDisasmInitializer() {
+  if (!BC.isAArch64())
+    return Error::success();
+
+  ErrorOr<BinarySection &> ErrorOrJTInfoSection =
+      BC.getUniqueSectionByName(".llvm_jump_table_info");
+  if (std::error_code E = ErrorOrJTInfoSection.getError())
+    return Error::success();
+  BinarySection &JTInfoSection = *ErrorOrJTInfoSection;
+  StringRef Buf = JTInfoSection.getContents();
+  DataExtractor DE(Buf, BC.AsmInfo->isLittleEndian(),
+                   BC.AsmInfo->getCodePointerSize());
+  DataExtractor::Cursor Cursor(0);
+  while (Cursor && !DE.eof(Cursor)) {
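+    // Each record: format byte, jump table address, base address, address of
+    // the entry-load instruction, branch address, ULEB128 entry count.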
+    const uint8_t Format = DE.getU8(Cursor);
+    const uint64_t JTAddr = DE.getAddress(Cursor);
+    const uint64_t JTBase = DE.getAddress(Cursor);
+    const uint64_t JTLoad = DE.getAddress(Cursor);
+    const uint64_t Branch = DE.getAddress(Cursor);
+    const uint64_t NumEntries = DE.getULEB128(Cursor);
+
+    JumpTable::JumpTableType Type = JumpTable::JTT_AARCH64_LAST;
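+    // The format value encodes the entry width: 2 => 1 byte, 3 => 2 bytes,
+    // 4 => 4 bytes.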
+    switch (Format) {
+    case 2:
+      Type = JumpTable::JTT_AARCH64_REL1;
+      break;
+    case 3:
+      Type = JumpTable::JTT_AARCH64_REL2;
+      break;
+    case 4:
+      Type = JumpTable::JTT_AARCH64_REL4;
+      break;
+    default:
+      errs() << "BOLT-WARNING: unknown jump table info type " << Format
+                << " for jump table " << Twine::utohexstr(JTAddr) << '\n';
+      continue;
+    }
+
+    BinaryFunction *BF = BC.getBinaryFunctionContainingAddress(Branch);
+    if (!BF) {
+      BC.errs() << "BOLT-WARNING: binary function not found for jump table "
+                   "with address "
+                << Twine::utohexstr(JTAddr) << " and branch "
+                << Twine::utohexstr(Branch) << '\n';
+      continue;
+    }
+    const MCSymbol *JTSym = BC.getOrCreateJumpTable(*BF, JTAddr, Type);
+    assert(JTSym && "failed to create a jump table");
+    JumpTable *JT = BC.getJumpTableContainingAddress(JTAddr);
+    assert(JT && "internal error creating jump table");
+    JT->BaseAddress = JTBase;
+    JT->MemLocInstrAddress = JTLoad;
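+    // Pre-size Entries to record the table size; the actual entries are
+    // re-populated during analysis.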
+    JT->Entries.resize(NumEntries);
+  }
+  return Cursor.takeError();
+}
+} // namespace
+
+std::unique_ptr<MetadataRewriter>
+llvm::bolt::createJumpTableInfoReader(BinaryContext &BC) {
+  return std::make_unique<JumpTableInfoReader>("jump-table-info-reader", BC);
+}
diff --git a/bolt/lib/Rewrite/MetadataManager.cpp b/bolt/lib/Rewrite/MetadataManager.cpp
index 713d2e4..8114e15 100644
--- a/bolt/lib/Rewrite/MetadataManager.cpp
+++ b/bolt/lib/Rewrite/MetadataManager.cpp
@@ -32,6 +32,18 @@
   }
 }
 
+void MetadataManager::runInitializersPreDisasm() {
+  for (auto &Rewriter : Rewriters) {
+    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: invoking " << Rewriter->getName()
+                      << " after reading sections\n");
+    if (Error E = Rewriter->preDisasmInitializer()) {
+      errs() << "BOLT-ERROR: while running " << Rewriter->getName()
+             << " in pre-disasm state: " << toString(std::move(E)) << '\n';
+      exit(1);
+    }
+  }
+}
+
 void MetadataManager::runInitializersPreCFG() {
   for (auto &Rewriter : Rewriters) {
     LLVM_DEBUG(dbgs() << "BOLT-DEBUG: invoking " << Rewriter->getName()
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index f204aa3..33526b0 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -695,7 +695,7 @@
 
   selectFunctionsToProcess();
 
-  readDebugInfo();
+  processMetadataPreDisasm();
 
   disassembleFunctions();
 
@@ -2479,6 +2479,13 @@
       exit(1);
     }
 
+    // Workaround for AArch64 issue with hot text.
+    if (BC->isAArch64() &&
+        (SymbolName == "__hot_start" || SymbolName == "__hot_end")) {
+      BC->addRelocation(Rel.getOffset(), Symbol, ELF::R_AARCH64_ABS64, Addend);
+      continue;
+    }
+
     BC->addDynamicRelocation(Rel.getOffset(), Symbol, RType, Addend);
   }
 }
@@ -3249,6 +3256,8 @@
   MetadataManager.registerRewriter(createPseudoProbeRewriter(*BC));
 
   MetadataManager.registerRewriter(createSDTRewriter(*BC));
+
+  MetadataManager.registerRewriter(createJumpTableInfoReader(*BC));
 }
 
 void RewriteInstance::processSectionMetadata() {
@@ -3259,6 +3268,14 @@
   MetadataManager.runSectionInitializers();
 }
 
+void RewriteInstance::processMetadataPreDisasm() {
+  NamedRegionTimer T("processmetadata-predisasm", "process metadata pre-disasm",
+                     TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
+  MetadataManager.runInitializersPreDisasm();
+
+  readDebugInfo();
+}
+
 void RewriteInstance::processMetadataPreCFG() {
   NamedRegionTimer T("processmetadata-precfg", "process metadata pre-CFG",
                      TimerGroupName, TimerGroupDesc, opts::TimeRewrite);
@@ -3854,15 +3871,41 @@
       return Address;
     };
 
+    // Try to allocate sections before the given \p Address and return the
+    // start address of the allocation, or 0 if \p Address is not big enough.
+      for (auto SI = CodeSections.rbegin(), SE = CodeSections.rend(); SI != SE;
+           ++SI) {
+        BinarySection *Section = *SI;
+        if (Section->getOutputSize() > Address)
+          return 0;
+        Address -= Section->getOutputSize();
+        Address = alignDown(Address, Section->getAlignment());
+        Section->setOutputAddress(Address);
+      }
+      return Address;
+    };
+
     // Check if we can fit code in the original .text
     bool AllocationDone = false;
     if (opts::UseOldText) {
-      const uint64_t CodeSize =
-          allocateAt(BC->OldTextSectionAddress) - BC->OldTextSectionAddress;
+      uint64_t StartAddress;
+      uint64_t EndAddress;
+      if (opts::HotFunctionsAtEnd) {
+        EndAddress = BC->OldTextSectionAddress + BC->OldTextSectionSize;
+        StartAddress = allocateBefore(EndAddress);
+      } else {
+        StartAddress = BC->OldTextSectionAddress;
+        EndAddress = allocateAt(BC->OldTextSectionAddress);
+      }
 
+      const uint64_t CodeSize = EndAddress - StartAddress;
       if (CodeSize <= BC->OldTextSectionSize) {
         BC->outs() << "BOLT-INFO: using original .text for new code with 0x"
-                   << Twine::utohexstr(opts::AlignText) << " alignment\n";
+                   << Twine::utohexstr(opts::AlignText) << " alignment";
+        if (StartAddress != BC->OldTextSectionAddress)
+          BC->outs() << " at 0x" << Twine::utohexstr(StartAddress);
+        BC->outs() << '\n';
         AllocationDone = true;
       } else {
         BC->errs()
@@ -4133,6 +4176,11 @@
     NewWritableSegmentSize = NextAvailableAddress - NewWritableSegmentAddress;
   }
 
+  if (!NewTextSegmentSize && !NewWritableSegmentSize) {
+    BC->outs() << "BOLT-INFO: not adding new segments\n";
+    return;
+  }
+
   const uint64_t SavedPos = OS.tell();
   OS.seek(PHDRTableOffset);
 
@@ -4487,6 +4535,11 @@
   if (opts::RemoveSymtab && Section.sh_type == ELF::SHT_SYMTAB)
     return true;
 
+  // Strip jump table metadata by default.
+  // TODO: add a flag to rewrite it.
+  if (SectionName == ".llvm_jump_table_info")
+    return true;
+
   return false;
 }
 
diff --git a/bolt/test/AArch64/Inputs/jump-table.c b/bolt/test/AArch64/Inputs/jump-table.c
new file mode 100644
index 0000000..198c483
--- /dev/null
+++ b/bolt/test/AArch64/Inputs/jump-table.c
@@ -0,0 +1,20 @@
+volatile int g;
+void switchy(int x) {
+  switch (x) {
+  case 0: g--; break;
+  case 1: g++; break;
+  case 2: g = 42; break;
+  case 3: g += 17; break;
+  case 4: g -= 66; break;
+  case 5: g++; g--; break;
+  case 6: g--; g++; break;
+  case 66: g-=3; g++; break;
+  case 8: g+=5; g--; break;
+  case 10: g+=5; g--; break;
+  case 12: g+=42; g--; break;
+  case 15: g+=99; g--; break;
+  case 20: switchy(g); break;
+  case 21: g -= 1234; break;
+  default: g = 0; break;
+  }
+}
diff --git a/bolt/test/AArch64/jump-table-info.s b/bolt/test/AArch64/jump-table-info.s
new file mode 100644
index 0000000..e2b67c6
--- /dev/null
+++ b/bolt/test/AArch64/jump-table-info.s
@@ -0,0 +1,186 @@
+## Check parsing of a .llvm_jump_table_info section
+## The assembly is produced from bolt/test/AArch64/Inputs/jump-table.c
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
+# RUN: llvm-bolt %t.exe -o %t.null -print-jump-tables | FileCheck %s
+
+# Confirm 67 entries are parsed:
+# CHECK:      jump tables for function _Z7switchyi:
+# CHECK-NEXT: Jump table {{.*}} for function _Z7switchyi
+# CHECK:      0x0042 : .Ltmp16
+
+	.text
+	.globl	_Z7switchyi                     // -- Begin function _Z7switchyi
+	.p2align	2
+	.type	_Z7switchyi,@function
+_Z7switchyi:                            // @_Z7switchyi
+	.cfi_startproc
+// %bb.0:                               // %entry
+	adrp	x8, g
+	cmp	w0, #20
+	b.ne	.LBB0_2
+.LBB0_1:                                // %sw.bb26
+                                        // =>This Inner Loop Header: Depth=1
+	ldr	w0, [x8, :lo12:g]
+	cmp	w0, #20
+	b.eq	.LBB0_1
+.LBB0_2:                                // %tailrecurse
+	cmp	w0, #66
+	b.hi	.LBB0_18
+// %bb.3:                               // %tailrecurse
+	mov	w9, w0
+	adrp	x10, .LJTI0_0
+	add	x10, x10, :lo12:.LJTI0_0
+	adr	x11, .LBB0_4
+.Ltmp0:
+	ldrb	w12, [x10, x9]
+	add	x11, x11, x12, lsl #2
+.Ltmp1:
+	br	x11
+.LBB0_4:                                // %sw.bb17
+	ldr	w9, [x8, :lo12:g]
+	add	w9, w9, #5
+	b	.LBB0_13
+.LBB0_5:                                // %sw.bb11
+	ldr	w9, [x8, :lo12:g]
+	sub	w9, w9, #3
+	b	.LBB0_10
+.LBB0_6:                                // %sw.bb5
+	ldr	w9, [x8, :lo12:g]
+	add	w9, w9, #1
+	b	.LBB0_13
+.LBB0_7:                                // %sw.bb3
+	ldr	w9, [x8, :lo12:g]
+	add	w9, w9, #17
+	str	w9, [x8, :lo12:g]
+	ret
+.LBB0_8:                                // %sw.bb23
+	ldr	w9, [x8, :lo12:g]
+	add	w9, w9, #99
+	b	.LBB0_13
+.LBB0_9:                                // %sw.bb8
+	ldr	w9, [x8, :lo12:g]
+	sub	w9, w9, #1
+.LBB0_10:                               // %sw.epilog
+	str	w9, [x8, :lo12:g]
+.LBB0_11:                               // %sw.bb1
+	ldr	w9, [x8, :lo12:g]
+	add	w9, w9, #1
+	str	w9, [x8, :lo12:g]
+	ret
+.LBB0_12:                               // %sw.bb20
+	ldr	w9, [x8, :lo12:g]
+	add	w9, w9, #42
+.LBB0_13:                               // %sw.epilog
+	str	w9, [x8, :lo12:g]
+.LBB0_14:                               // %sw.bb
+	ldr	w9, [x8, :lo12:g]
+	sub	w9, w9, #1
+	str	w9, [x8, :lo12:g]
+	ret
+.LBB0_15:                               // %sw.epilog.loopexit
+	mov	w9, #42                         // =0x2a
+	str	w9, [x8, :lo12:g]
+	ret
+.LBB0_16:                               // %sw.bb27
+	ldr	w9, [x8, :lo12:g]
+	sub	w9, w9, #1234
+	str	w9, [x8, :lo12:g]
+	ret
+.LBB0_17:                               // %sw.bb4
+	ldr	w9, [x8, :lo12:g]
+	sub	w9, w9, #66
+	str	w9, [x8, :lo12:g]
+	ret
+.LBB0_18:                               // %sw.epilog.loopexit29
+	str	wzr, [x8, :lo12:g]
+	ret
+.Lfunc_end0:
+	.size	_Z7switchyi, .Lfunc_end0-_Z7switchyi
+	.cfi_endproc
+	.section	.rodata,"a",@progbits
+.LJTI0_0:
+	.byte	(.LBB0_14-.LBB0_4)>>2
+	.byte	(.LBB0_11-.LBB0_4)>>2
+	.byte	(.LBB0_15-.LBB0_4)>>2
+	.byte	(.LBB0_7-.LBB0_4)>>2
+	.byte	(.LBB0_17-.LBB0_4)>>2
+	.byte	(.LBB0_6-.LBB0_4)>>2
+	.byte	(.LBB0_9-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_4-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_4-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_12-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_8-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_16-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_18-.LBB0_4)>>2
+	.byte	(.LBB0_5-.LBB0_4)>>2
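+// 0x6fff4c0e is a section type value in the LLVM-specific (SHT_LLVM_*) range.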
+	.section	.llvm_jump_table_info,"",@0x6fff4c0e
+	.byte	2                               // Format 2: 1-byte entries, (Target - Base) >> 2
+	.xword	.LJTI0_0
+	.xword	.LBB0_4                         // Base
+	.xword	.Ltmp0                          // Load Instruction
+	.xword	.Ltmp1                          // Branch Instruction
+	.byte	67                              // Number of Entries
+                                        // -- End function
+	.type	g,@object                       // @g
+	.bss
+	.globl	g
+	.p2align	2, 0x0
+g:
+	.word	0                               // 0x0
+	.size	g, 4
+	.section	".note.GNU-stack","",@progbits
diff --git a/bolt/test/program-header.test b/bolt/test/program-header.test
new file mode 100644
index 0000000..4552303
--- /dev/null
+++ b/bolt/test/program-header.test
@@ -0,0 +1,16 @@
+# Check that llvm-bolt does not add new segments when writing code in-place.
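+# With --use-old-text and relaxed alignment, the rewritten code fits back
+# into the original .text, so no new segments are required.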
+
+REQUIRES: system-linux
+
+RUN: %clang %cflags %p/Inputs/hello.c -o %t -no-pie -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --use-old-text --align-functions=1 \
+RUN:   --no-huge-pages --align-text=1 --use-gnu-stack \
+RUN:   | FileCheck %s --check-prefix=CHECK-BOLT
+RUN: llvm-readelf -WS %t.bolt | FileCheck %s
+
+CHECK-BOLT: rewriting .eh_frame_hdr in-place
+CHECK-BOLT: not adding new segments
+
+CHECK-NOT: .bolt.org.eh_frame_hdr
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 4c01088..d3e038d 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5730,6 +5730,10 @@
   MarshallingInfoFlag<CodeGenOpts<"InstrumentForProfiling">>;
 def pipe : Flag<["-", "--"], "pipe">,
   HelpText<"Use pipes between commands, when possible">;
+// Facebook T92898286
+def post_link_optimize : Flag<["--"], "post-link-optimize">,
+  HelpText<"Apply post-link optimizations using BOLT">;
+// End Facebook T92898286
 def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">;
 def prebind : Flag<["-"], "prebind">;
 def preload : Flag<["-"], "preload">;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index a0fa3c6..9ffff3e 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -671,12 +671,41 @@
     }
   }
 
+  // Facebook T92898286
+  if (Args.hasArg(options::OPT_post_link_optimize))
+    CmdArgs.push_back("-q");
+  // End Facebook T92898286
+
   Args.addAllArgs(CmdArgs, {options::OPT_T, options::OPT_t});
 
   const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath());
   C.addCommand(std::make_unique<Command>(JA, *this,
                                          ResponseFileSupport::AtFileCurCP(),
                                          Exec, CmdArgs, Inputs, Output));
+  // Facebook T92898286
+  if (!Args.hasArg(options::OPT_post_link_optimize) || !Output.isFilename())
+    return;
+
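+  // Post-link optimization: move the linked output aside and run llvm-bolt
+  // on it, writing the optimized binary to the original output path.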
+  const char *MvExec = Args.MakeArgString(ToolChain.GetProgramPath("mv"));
+  ArgStringList MoveCmdArgs;
+  MoveCmdArgs.push_back(Output.getFilename());
+  const char *PreBoltBin =
+      Args.MakeArgString(Twine(Output.getFilename()) + ".pre-bolt");
+  MoveCmdArgs.push_back(PreBoltBin);
+  C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+                                         MvExec, MoveCmdArgs, std::nullopt));
+
+  ArgStringList BoltCmdArgs;
+  const char *BoltExec =
+      Args.MakeArgString(ToolChain.GetProgramPath("llvm-bolt"));
+  BoltCmdArgs.push_back(PreBoltBin);
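+  // Reversed block order is a deliberately disruptive layout, presumably
+  // meant to exercise BOLT's debug info updates in testing.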
+  BoltCmdArgs.push_back("-reorder-blocks=reverse");
+  BoltCmdArgs.push_back("-update-debug-sections");
+  BoltCmdArgs.push_back("-o");
+  BoltCmdArgs.push_back(Output.getFilename());
+  C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+                                         BoltExec, BoltCmdArgs, std::nullopt));
+  // End Facebook T92898286
 }
 
 void tools::gnutools::Assembler::ConstructJob(Compilation &C,
diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py
index ccd3d01..d38df0e 100644
--- a/cross-project-tests/lit.cfg.py
+++ b/cross-project-tests/lit.cfg.py
@@ -92,7 +92,16 @@
 # use_clang() and use_lld() respectively, so set them to "", if needed.
 if not hasattr(config, "clang_src_dir"):
     config.clang_src_dir = ""
-llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects))
+# Facebook T92898286
+should_test_bolt = get_required_attr(config, "llvm_test_bolt")
+if should_test_bolt:
+    llvm_config.use_clang(
+        required=("clang" in config.llvm_enabled_projects),
+        additional_flags=["--post-link-optimize"],
+    )
+else:
+    llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects))
+# End Facebook T92898286
 
 if not hasattr(config, "lld_src_dir"):
     config.lld_src_dir = ""
@@ -305,3 +311,9 @@
 # Allow 'REQUIRES: XXX-registered-target' in tests.
 for arch in config.targets_to_build:
     config.available_features.add(arch.lower() + "-registered-target")
+
+# Facebook T92898286
+# Ensure the user's PYTHONPATH is included.
+if "PYTHONPATH" in os.environ:
+    config.environment["PYTHONPATH"] = os.environ["PYTHONPATH"]
+# End Facebook T92898286
diff --git a/cross-project-tests/lit.site.cfg.py.in b/cross-project-tests/lit.site.cfg.py.in
index 39458df..2d53cd3 100644
--- a/cross-project-tests/lit.site.cfg.py.in
+++ b/cross-project-tests/lit.site.cfg.py.in
@@ -21,6 +21,10 @@
 
 config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
 
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
 
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 08cf11c..077fb4c 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -271,6 +271,17 @@
 if is_configured("lldb_framework_dir"):
     dotest_cmd += ["--framework", config.lldb_framework_dir]
 
+# Facebook T92898286
+if is_configured("llvm_test_bolt"):
+    dotest_cmd += ["-E", '"--post-link-optimize"']
+# End Facebook T92898286
+
+if (
+    "lldb-repro-capture" in config.available_features
+    or "lldb-repro-replay" in config.available_features
+):
+    dotest_cmd += ["--skip-category=lldb-dap", "--skip-category=std-module"]
+
 if "lldb-simulator-ios" in config.available_features:
     dotest_cmd += ["--apple-sdk", "iphonesimulator", "--platform-name", "ios-simulator"]
 elif "lldb-simulator-watchos" in config.available_features:
diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in
index ecebc44..1c2ab4d 100644
--- a/lldb/test/API/lit.site.cfg.py.in
+++ b/lldb/test/API/lit.site.cfg.py.in
@@ -1,5 +1,9 @@
 @LIT_SITE_CFG_IN_HEADER@
 
+# Facebook T92898286
+import lit.util
+# End Facebook T92898286
+
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
@@ -44,6 +48,10 @@
 config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api")
 config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-api")
 
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
 # Plugins
 lldb_build_intel_pt = '@LLDB_BUILD_INTEL_PT@'
 if lldb_build_intel_pt == '1':
diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py
index 42968128..ac895e8d 100644
--- a/lldb/test/Shell/helper/toolchain.py
+++ b/lldb/test/Shell/helper/toolchain.py
@@ -241,6 +241,11 @@
             "-lc++",
         ]
 
+    # Facebook T92898286
+    if config.llvm_test_bolt:
+        host_flags += ["--post-link-optimize"]
+    # End Facebook T92898286
+
     host_flags = " ".join(host_flags)
     config.substitutions.append(("%clang_host", "%clang " + host_flags))
     config.substitutions.append(("%clangxx_host", "%clangxx " + host_flags))
diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in
index 31a6d68..8b37d98 100644
--- a/lldb/test/Shell/lit.site.cfg.py.in
+++ b/lldb/test/Shell/lit.site.cfg.py.in
@@ -1,5 +1,9 @@
 @LIT_SITE_CFG_IN_HEADER@
 
+# Facebook T92898286
+import lit.util
+# End Facebook T92898286
+
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
@@ -36,6 +41,10 @@
 config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell")
 config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell")
 
+# Facebook T92898286
+config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
 
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index cfd1a08..1478eea 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -729,6 +729,10 @@
 option(LLVM_USE_SPLIT_DWARF
   "Use -gsplit-dwarf when compiling llvm and --gdb-index when linking." OFF)
 
+# Facebook T92898286
+option(LLVM_TEST_BOLT "Enable BOLT testing in non-BOLT tests that use clang" OFF)
+# End Facebook T92898286
+
 # Define an option controlling whether we should build for 32-bit on 64-bit
 # platforms, where supported.
 if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX"))