[BOLT][AArch64] Add partial support for lite mode (#133014)

In lite mode, we emit code for only a subset of functions while
preserving the original code in .bolt.org.text. This requires updating
code references in non-emitted functions to ensure that:

* Non-optimized versions of the optimized code never execute.
* Function pointer comparison semantics are preserved (see the sketch
  below).
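
For instance (a sketch; func is a hypothetical function), the address
materialized in non-emitted code must compare equal to the address used
by optimized code:

  // Non-optimized code takes the address of func:
  adrp  x0, func
  add   x0, x0, :lo12:func
  // x1 holds the address of func obtained in optimized code; if the
  // reference above is not updated, this comparison breaks:
  cmp   x0, x1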

On x86-64, we can update code references in-place using "pending
relocations" added in scanExternalRefs(). However, on AArch64, this is
not always possible due to address range limitations and linker address
"relaxation".

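For context, a sketch of the reach of the relevant AArch64 instructions
(target is a hypothetical symbol):

  bl    target      // direct call: +/-128 MiB range
  adr   x0, target  // PC-relative address: +/-1 MiB range
  adrp  x0, target  // page-granular address: +/-4 GiB range
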
There are two types of code-to-code references: control transfer (e.g.,
calls and branches) and function pointer materialization.
AArch64-specific control transfer instructions are covered by #116964.

For function pointer materialization, simply changing the immediate
field of an instruction is not always sufficient. In some cases, we
need to modify a pair of instructions, e.g., undoing linker relaxation
by converting NOP+ADR back into an ADRP+ADD sequence, as shown below.

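For example, when the target of an ADRP+ADD pair is within ADR range,
the linker may have relaxed the pair, and BOLT must reverse that
transformation before moving the target (a sketch; func is
hypothetical):

  // Compiler/assembler output:
  adrp  x1, func
  add   x1, x1, :lo12:func

  // After linker relaxation, with func within +/-1 MiB:
  nop
  adr   x1, func

  // BOLT patch undoing the relaxation, since the moved func may no
  // longer be reachable by ADR:
  adrp  x1, func
  add   x1, x1, :lo12:func
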
To achieve this, we use the instruction patch mechanism instead of
pending relocations. Instruction patches are emitted via the regular MC
layer, just like ordinary functions, but they have a fixed address and
no associated symbol table entry. This lets us make more complex
changes to the code and ensures that function pointers are correctly
updated. The mechanism should also be portable to RISC-V and other
architectures.

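Conceptually (the addresses below are hypothetical), a patch is emitted
at the fixed address of the instructions it replaces and overwrites
only those bytes of the non-emitted function:

  // Original bytes in .bolt.org.text:
  //   0x21000c: nop
  //   0x210010: adr  x1, _start
  // Anonymous patch function placed at the fixed address 0x21000c:
  adrp  x1, _start            // overwrites the nop
  add   x1, x1, :lo12:_start  // overwrites the adr
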
To summarize, for AArch64, we extend the scanExternalRefs() process to
undo linker relaxation and use instruction patches to partially
overwrite unoptimized code.
diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h
index 9a04859..88313a6 100644
--- a/bolt/include/bolt/Core/BinaryContext.h
+++ b/bolt/include/bolt/Core/BinaryContext.h
@@ -544,9 +544,10 @@
   ///
   /// Optional \p Name can be assigned to the patch. The name will be emitted to
   /// the symbol table at \p Address.
-  BinaryFunction *createInstructionPatch(uint64_t Address,
-                                         InstructionListType &Instructions,
-                                         const Twine &Name = "");
+  BinaryFunction *
+  createInstructionPatch(uint64_t Address,
+                         const InstructionListType &Instructions,
+                         const Twine &Name = "");
 
   std::vector<BinaryFunction *> &getInjectedBinaryFunctions() {
     return InjectedBinaryFunctions;
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index a92cb46..d3d11f8 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -357,6 +357,12 @@
   /// True if another function body was merged into this one.
   bool HasFunctionsFoldedInto{false};
 
+  /// True if the function is used for patching code at a fixed address.
+  bool IsPatch{false};
+
+  /// True if the function should not have an associated symbol table entry.
+  bool IsAnonymous{false};
+
   /// Name for the section this function code should reside in.
   std::string CodeSectionName;
 
@@ -1358,6 +1364,12 @@
   /// Return true if other functions were folded into this one.
   bool hasFunctionsFoldedInto() const { return HasFunctionsFoldedInto; }
 
+  /// Return true if this function is used for patching existing code.
+  bool isPatch() const { return IsPatch; }
+
+  /// Return true if the function should not have an associated symbol table entry.
+  bool isAnonymous() const { return IsAnonymous; }
+
   /// If this function was folded, return the function it was folded into.
   BinaryFunction *getFoldedIntoFunction() const { return FoldedIntoFunction; }
 
@@ -1734,6 +1746,18 @@
   /// Indicate that another function body was merged with this function.
   void setHasFunctionsFoldedInto() { HasFunctionsFoldedInto = true; }
 
+  /// Indicate that this function is a patch.
+  void setIsPatch(bool V) {
+    assert(isInjected() && "Only injected functions can be used as patches");
+    IsPatch = V;
+  }
+
+  /// Indicate whether the function should have a name in the symbol table.
+  void setAnonymous(bool V) {
+    assert(isInjected() && "Only injected functions can be anonymous");
+    IsAnonymous = V;
+  }
+
   void setHasSDTMarker(bool V) { HasSDTMarker = V; }
 
   /// Mark the function as using ORC format for stack unwinding.
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 76ea248..6860f02 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1264,9 +1264,12 @@
     return nullptr;
   }
 
-  /// Return MCSymbol extracted from a target expression
+  /// Return MCSymbol extracted from the expression.
   virtual const MCSymbol *getTargetSymbol(const MCExpr *Expr) const {
-    return &cast<const MCSymbolRefExpr>(Expr)->getSymbol();
+    if (auto *SymbolRefExpr = dyn_cast<const MCSymbolRefExpr>(Expr))
+      return &SymbolRefExpr->getSymbol();
+
+    return nullptr;
   }
 
   /// Return addend that represents an offset from MCSymbol target
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 7466d2f..80b15d7 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -2401,8 +2401,10 @@
   return BF;
 }
 
-BinaryFunction *BinaryContext::createInstructionPatch(
-    uint64_t Address, InstructionListType &Instructions, const Twine &Name) {
+BinaryFunction *
+BinaryContext::createInstructionPatch(uint64_t Address,
+                                      const InstructionListType &Instructions,
+                                      const Twine &Name) {
   ErrorOr<BinarySection &> Section = getSectionForAddress(Address);
   assert(Section && "cannot get section for patching");
   assert(Section->hasSectionRef() && Section->isText() &&
@@ -2423,6 +2425,11 @@
   PBF->setFileOffset(FileOffset);
   PBF->setOriginSection(&Section.get());
   PBF->addBasicBlock()->addInstructions(Instructions);
+  PBF->setIsPatch(true);
+
+  // Don't create a symbol table entry if the name wasn't specified.
+  if (Name.str().empty())
+    PBF->setAnonymous(true);
 
   return PBF;
 }
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index acf04ce..5ee33f5 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -1583,13 +1583,18 @@
   assert(FunctionData.size() == getMaxSize() &&
          "function size does not match raw data size");
 
-  if (BC.isX86())
-    BC.SymbolicDisAsm->setSymbolizer(
-        BC.MIB->createTargetSymbolizer(*this, /*CreateSymbols*/ false));
+  BC.SymbolicDisAsm->setSymbolizer(
+      BC.MIB->createTargetSymbolizer(*this, /*CreateSymbols*/ false));
+
+  // A list of patches for this function.
+  using PatchTy = std::pair<uint64_t, MCInst>;
+  std::vector<PatchTy> InstructionPatches;
 
   // Disassemble contents of the function. Detect code entry points and create
   // relocations for references to code that will be moved.
   uint64_t Size = 0; // instruction size
+  MCInst Instruction;
+  MCInst PrevInstruction;
   for (uint64_t Offset = 0; Offset < getSize(); Offset += Size) {
     // Check for data inside code and ignore it
     if (const size_t DataInCodeSize = getSizeOfDataInCodeAt(Offset)) {
@@ -1598,7 +1603,7 @@
     }
 
     const uint64_t AbsoluteInstrAddr = getAddress() + Offset;
-    MCInst Instruction;
+    PrevInstruction = Instruction;
     if (!BC.SymbolicDisAsm->getInstruction(Instruction, Size,
                                            FunctionData.slice(Offset),
                                            AbsoluteInstrAddr, nulls())) {
@@ -1673,12 +1678,108 @@
     if (BranchTargetSymbol) {
       BC.MIB->replaceBranchTarget(Instruction, BranchTargetSymbol,
                                   Emitter.LocalCtx.get());
-    } else if (!llvm::any_of(Instruction,
-                             [](const MCOperand &Op) { return Op.isExpr(); })) {
-      // Skip assembly if the instruction may not have any symbolic operands.
-      continue;
     } else {
       analyzeInstructionForFuncReference(Instruction);
+      const bool NeedsPatch = llvm::any_of(
+          MCPlus::primeOperands(Instruction), [&](const MCOperand &Op) {
+            return Op.isExpr() &&
+                   !ignoreReference(BC.MIB->getTargetSymbol(Op.getExpr()));
+          });
+      if (!NeedsPatch)
+        continue;
+    }
+
+    // For AArch64, we need to undo relaxation done by the linker if the target
+    // of the instruction is a function that we plan to move.
+    //
+    // Linker relaxation is documented at:
+    // https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst
+    // under #relocation-optimization.
+    if (const Relocation *Rel;
+        BC.isAArch64() && (Rel = getRelocationAt(Offset))) {
+      // A NOP+ADR sequence can originate from either ADRP+ADD or ADRP+LDR.
+      // In either case, we convert it into ADRP+ADD.
+      if (BC.MIB->isADR(Instruction) &&
+          (Rel->Type == ELF::R_AARCH64_ADD_ABS_LO12_NC ||
+           Rel->Type == ELF::R_AARCH64_LD64_GOT_LO12_NC)) {
+        if (!BC.MIB->isNoop(PrevInstruction)) {
+          // In case of unexpected conversion from the linker, skip target
+          // optimization.
+          const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Instruction);
+          BC.errs() << "BOLT-WARNING: cannot undo linker relaxation for "
+                       "instruction at 0x"
+                    << Twine::utohexstr(AbsoluteInstrAddr) << " referencing "
+                    << Symbol->getName() << '\n';
+          if (BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol))
+            TargetBF->setIgnored();
+          continue;
+        }
+
+        InstructionListType AdrpAdd =
+            BC.MIB->undoAdrpAddRelaxation(Instruction, BC.Ctx.get());
+        assert(AdrpAdd.size() == 2 && "Two instructions expected");
+        LLVM_DEBUG({
+          dbgs() << "BOLT-DEBUG: linker relaxation undone for instruction "
+                    "at 0x"
+                 << Twine::utohexstr(AbsoluteInstrAddr) << '\n';
+        });
+        InstructionPatches.push_back({AbsoluteInstrAddr - 4, AdrpAdd[0]});
+        InstructionPatches.push_back({AbsoluteInstrAddr, AdrpAdd[1]});
+        continue;
+      }
+
+      // If ADR was emitted by the compiler/assembler to reference a nearby
+      // local function, we cannot move that function away due to the ADR
+      // address span limitation. Hence, we skip the optimization.
+      if (BC.MIB->isADR(Instruction) &&
+          Rel->Type == ELF::R_AARCH64_ADR_PREL_LO21) {
+        BC.errs() << "BOLT-WARNING: unable to convert ADR that references "
+                  << Rel->Symbol->getName()
+                  << ". Will not optimize the target\n";
+        if (BinaryFunction *TargetBF = BC.getFunctionForSymbol(Rel->Symbol))
+          TargetBF->setIgnored();
+        continue;
+      }
+
+      // In the case of a GOT load, ADRP+LDR can also be converted into
+      // ADRP+ADD. When this happens, it's not always possible to properly
+      // symbolize the ADRP operand, and we might have to adjust it based on
+      // the next instruction.
+      if (BC.MIB->isAddXri(Instruction) &&
+          Rel->Type == ELF::R_AARCH64_LD64_GOT_LO12_NC) {
+        if (!BC.MIB->matchAdrpAddPair(PrevInstruction, Instruction)) {
+          BC.errs() << "BOLT-ERROR: cannot find matching ADRP for relaxed LDR "
+                       "instruction at 0x"
+                    << Twine::utohexstr(AbsoluteInstrAddr) << '\n';
+          exit(1);
+        }
+
+        // Check if ADRP was already patched. If not, add a new patch for it.
+        if (InstructionPatches.empty() ||
+            InstructionPatches.back().first != AbsoluteInstrAddr - 4)
+          InstructionPatches.push_back(
+              {AbsoluteInstrAddr - 4, PrevInstruction});
+
+        // Adjust the operand for ADRP from the patch.
+        MCInst &ADRPInst = InstructionPatches.back().second;
+        const MCSymbol *ADRPSymbol = BC.MIB->getTargetSymbol(ADRPInst);
+        const MCSymbol *ADDSymbol = BC.MIB->getTargetSymbol(Instruction);
+        if (ADRPSymbol != ADDSymbol) {
+          const int64_t Addend = BC.MIB->getTargetAddend(Instruction);
+          BC.MIB->setOperandToSymbolRef(ADRPInst, /*OpNum*/ 1, ADDSymbol,
+                                        Addend, BC.Ctx.get(),
+                                        ELF::R_AARCH64_NONE);
+        }
+      }
+    }
+
+    // On AArch64, we use instruction patches for fixing references. We make an
+    // exception for branch instructions since they require optional
+    // relocations.
+    if (BC.isAArch64() && !BranchTargetSymbol) {
+      LLVM_DEBUG(BC.printInstruction(dbgs(), Instruction, AbsoluteInstrAddr));
+      InstructionPatches.push_back({AbsoluteInstrAddr, Instruction});
+      continue;
     }
 
     // Emit the instruction using temp emitter and generate relocations.
@@ -1720,6 +1821,23 @@
     for (Relocation &Rel : FunctionRelocations)
       getOriginSection()->addPendingRelocation(Rel);
 
+  // Add the collected patches, grouping adjacent instructions together.
+  if (!InstructionPatches.empty()) {
+    uint64_t PatchGroupAddress;
+    InstructionListType PatchGroup;
+    for (auto PI = InstructionPatches.begin(), PE = InstructionPatches.end();
+         PI != PE; ++PI) {
+      auto &Patch = *PI;
+      if (PatchGroup.empty())
+        PatchGroupAddress = Patch.first;
+      PatchGroup.push_back(Patch.second);
+      if (std::next(PI) == PE || std::next(PI)->first != Patch.first + 4) {
+        BC.createInstructionPatch(PatchGroupAddress, PatchGroup);
+        PatchGroup.clear();
+      }
+    }
+  }
+
   // Inform BinaryContext that this function symbols will not be defined and
   // relocations should not be created against them.
   if (BC.HasRelocations) {
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index 03d3dd7..d8628c6 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1269,8 +1269,10 @@
 
 Error AssignSections::runOnFunctions(BinaryContext &BC) {
   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
-    Function->setCodeSectionName(BC.getInjectedCodeSectionName());
-    Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
+    if (!Function->isPatch()) {
+      Function->setCodeSectionName(BC.getInjectedCodeSectionName());
+      Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
+    }
   }
 
   // In non-relocation mode functions have pre-assigned section names.
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index 409c2bb..f204aa3 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -5078,6 +5078,8 @@
 
   // Add symbols of injected functions
   for (BinaryFunction *Function : BC->getInjectedBinaryFunctions()) {
+    if (Function->isAnonymous())
+      continue;
     ELFSymTy NewSymbol;
     BinarySection *OriginSection = Function->getOriginSection();
     NewSymbol.st_shndx =
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 5155085..4016ffe 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1803,12 +1803,6 @@
     return &SymExpr->getSymbol();
   }
 
-  // This is the same as the base class, but since we are overriding one of
-  // getTargetSymbol's signatures above, we need to override all of them.
-  const MCSymbol *getTargetSymbol(const MCExpr *Expr) const override {
-    return &cast<const MCSymbolRefExpr>(Expr)->getSymbol();
-  }
-
   bool analyzeBranch(InstructionIterator Begin, InstructionIterator End,
                      const MCSymbol *&TBB, const MCSymbol *&FBB,
                      MCInst *&CondBranch,
diff --git a/bolt/test/AArch64/lite-mode.s b/bolt/test/AArch64/lite-mode.s
new file mode 100644
index 0000000..d812060
--- /dev/null
+++ b/bolt/test/AArch64/lite-mode.s
@@ -0,0 +1,108 @@
+## Check that in lite mode llvm-bolt updates function references in
+## non-optimized code.
+
+# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
+# RUN: link_fdata %s %t.o %t.fdata
+# RUN: llvm-strip --strip-unneeded %t.o
+# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -static
+# RUN: llvm-bolt %t.exe -o %t.bolt --data %t.fdata --lite
+# RUN: llvm-objdump -d --disassemble-symbols=cold_function %t.exe \
+# RUN:   | FileCheck %s --check-prefix=CHECK-INPUT
+# RUN: llvm-objdump -d --disassemble-symbols=cold_function %t.bolt \
+# RUN:   | FileCheck %s
+
+## In lite mode, optimized code will be separated from the original .text by
+## over 128MB, making it impossible for call/bl instructions in cold functions
+## to reach optimized functions directly.
+
+  .text
+  .globl _start
+  .type _start, %function
+_start:
+# FDATA: 0 [unknown] 0 1 _start 0 0 100
+  .cfi_startproc
+  cmp  x0, 1
+  b.eq  .L0
+  bl cold_function
+.L0:
+  ret  x30
+  .cfi_endproc
+.size _start, .-_start
+
+## Cold non-optimized function with a reference to a hot function (_start).
+# CHECK: Disassembly of section .bolt.org.text:
+# CHECK-LABEL: <cold_function>
+  .globl cold_function
+  .type cold_function, %function
+cold_function:
+  .cfi_startproc
+
+## Absolute 64-bit function pointer reference.
+## We check for the lower 16 bits of _start to be zeros after update.
+  movz    x0, :abs_g3:_start
+  movk    x0, :abs_g2_nc:_start
+  movk    x0, :abs_g1_nc:_start
+# CHECK-INPUT-NOT: movk x0, #0x0{{$}}
+# CHECK: movk x0, #0x0{{$}}
+  movk    x0, :abs_g0_nc:_start
+
+## Relaxable address reference.
+# CHECK-INPUT:      nop
+# CHECK-INPUT-NEXT: adr x1
+# CHECK-NEXT:       adrp x1, [[ADDR:0x[0-9a-f]+]] <{{.*}}>
+# CHECK-NEXT:       add  x1
+  adrp    x1, _start
+  add     x1, x1, :lo12:_start
+
+## Non-relaxable address reference.
+# CHECK-INPUT-NEXT: adrp x2
+# CHECK-INPUT-NEXT: add  x2
+# CHECK-NEXT:       adrp x2, [[ADDR]]
+# CHECK-NEXT:       add  x2
+  adrp    x2, far_func
+  add     x2, x2, :lo12:far_func
+
+## Check that fully-relaxed GOT reference is converted into ADRP+ADD.
+  adrp    x3, :got:_start
+  ldr     x3, [x3, #:got_lo12:_start]
+# CHECK-INPUT-NEXT: nop
+# CHECK-INPUT-NEXT: adr x3
+# CHECK-NEXT:       adrp x3, [[ADDR]]
+# CHECK-NEXT:       add  x3
+
+## Check that partially-relaxed GOT reference is converted into ADRP+ADD.
+  adrp    x4, :got:far_func
+  ldr     x4, [x4, #:got_lo12:far_func]
+# CHECK-INPUT-NEXT: adrp x4
+# CHECK-INPUT-NEXT: add x4
+# CHECK-NEXT:       adrp x4, [[ADDR]]
+# CHECK-NEXT:       add  x4
+
+## Check that non-relaxable GOT load is left intact.
+  adrp    x5, :got:far_func
+  nop
+  ldr     x5, [x5, #:got_lo12:far_func]
+# CHECK-INPUT-NEXT: adrp x5
+# CHECK-INPUT-NEXT: nop
+# CHECK-INPUT-NEXT: ldr x5
+# CHECK-NEXT:       adrp x5
+# CHECK-NOT: [[ADDR]]
+# CHECK-NEXT:       nop
+# CHECK-NEXT:       ldr x5
+
+  .cfi_endproc
+.size cold_function, .-cold_function
+
+## Reserve 1MB of space to make functions that follow unreachable by ADRs in
+## code that precedes this gap.
+.space 0x100000
+
+  .globl far_func
+  .type far_func, %function
+far_func:
+# FDATA: 0 [unknown] 0 1 far_func 0 0 100
+  .cfi_startproc
+  ret  x30
+  .cfi_endproc
+.size far_func, .-far_func
+