[lld][MachO] Key branch-extension thunks on (referent, addend) (#191808)
TextOutputSection::finalize ignored branch relocation addends. Two call
sites branching to the same symbol with different addends therefore
collapsed onto a single thunk.
Key thunkMap on (referent, addend) — canonicalizing Defined referents by
(isec, value) — so two call sites with different
addends get independent thunks. The addend is encoded in the thunk's
relocs and is zeroed at the call site after the callee is redirected to
the thunk. Thunk names carry a `+N` suffix when the addend is non-zero.
diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp
index 04da702..7acf48c 100644
--- a/lld/MachO/Arch/ARM64.cpp
+++ b/lld/MachO/Arch/ARM64.cpp
@@ -33,7 +33,8 @@
void writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr,
uint64_t &stubOffset, uint64_t selrefVA,
Symbol *objcMsgSend) const override;
- void populateThunk(InputSection *thunk, Symbol *funcSym) override;
+ void populateThunk(InputSection *thunk, Symbol *funcSym,
+ int64_t addend) override;
void initICFSafeThunkBody(InputSection *thunk,
Symbol *targetSym) const override;
@@ -160,17 +161,18 @@
0xd61f0200, // 08: br x16
};
-void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym) {
+void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym,
+ int64_t addend) {
thunk->align = 4;
thunk->data = {reinterpret_cast<const uint8_t *>(thunkCode),
sizeof(thunkCode)};
thunk->relocs.emplace_back(/*type=*/ARM64_RELOC_PAGEOFF12,
/*pcrel=*/false, /*length=*/2,
- /*offset=*/4, /*addend=*/0,
+ /*offset=*/4, /*addend=*/addend,
/*referent=*/funcSym);
thunk->relocs.emplace_back(/*type=*/ARM64_RELOC_PAGE21,
/*pcrel=*/true, /*length=*/2,
- /*offset=*/0, /*addend=*/0,
+ /*offset=*/0, /*addend=*/addend,
/*referent=*/funcSym);
}
// Just a single direct branch to the target function.
diff --git a/lld/MachO/ConcatOutputSection.cpp b/lld/MachO/ConcatOutputSection.cpp
index fe67308..753dea9 100644
--- a/lld/MachO/ConcatOutputSection.cpp
+++ b/lld/MachO/ConcatOutputSection.cpp
@@ -75,8 +75,8 @@
//
// Data:
//
-// * DenseMap<Symbol *, ThunkInfo> thunkMap: Maps the function symbol
-// to its thunk bookkeeper.
+// * DenseMap<ThunkKey, ThunkInfo> thunkMap: Maps each (referent, addend)
+// pair seen on a branch relocation to its thunk bookkeeper.
//
// * struct ThunkInfo (bookkeeper): Call instructions have limited range, and
// distant call sites might be unable to reach the same thunk, so multiple
@@ -112,7 +112,7 @@
// the inputs and thunks vectors (both ordered by ascending address), which
// is simple and cheap.
-DenseMap<Symbol *, ThunkInfo, ThunkMapKeyInfo> lld::macho::thunkMap;
+DenseMap<ThunkKey, ThunkInfo, ThunkMapKeyInfo> lld::macho::thunkMap;
// Determine whether we need thunks, which depends on the target arch -- RISC
// (i.e., ARM) generally does because it has limited-range branch/call
@@ -155,7 +155,7 @@
// Pre-populate the thunkMap and memoize call site counts for every
// InputSection and ThunkInfo. We do this for the benefit of
// estimateBranchTargetThresholdVA().
- ThunkInfo &thunkInfo = thunkMap[sym];
+ ThunkInfo &thunkInfo = thunkMap[ThunkKey{sym, r.addend}];
// Knowing ThunkInfo call site count will help us know whether or not we
// might need to create more for this referent at the time we are
// estimating distance to __stubs in estimateBranchTargetThresholdVA().
@@ -351,19 +351,27 @@
uint64_t highVA = callVA + forwardBranchRange;
// Calculate our call referent address
auto *funcSym = cast<Symbol *>(r.referent);
- ThunkInfo &thunkInfo = thunkMap[funcSym];
- // The referent is not reachable, so we need to use a thunk ...
- if ((funcSym->isInStubs() ||
+ ThunkInfo &thunkInfo = thunkMap[ThunkKey{funcSym, r.addend}];
+ // The referent is not reachable, so we need to use a thunk... unless we
+ // are close enough to the end that branch target sections (__stubs,
+ // __objc_stubs) are now within range of a simple forward branch -- BUT
+ // only for zero-addend branches. The writer's resolveSymbolOffsetVA()
+ // resolves non-zero-addend branches against the symbol body rather than
+ // the stub, so __stubs reachability says nothing about whether such a
+ // call can be emitted directly. Hence the `r.addend == 0` guard below.
+ // See INTERP check lines in arm64-thunk-branch-addend.s.
+ if (r.addend == 0 &&
+ (funcSym->isInStubs() ||
(in.objcStubs && in.objcStubs->isNeeded() &&
ObjCStubsSection::isObjCStubSymbol(funcSym))) &&
callVA >= branchTargetThresholdVA) {
assert(callVA != TargetInfo::outOfRangeVA);
- // ... Oh, wait! We are close enough to the end that branch target
- // sections (__stubs, __objc_stubs) are now within range of a simple
- // forward branch.
continue;
}
- uint64_t funcVA = funcSym->resolveBranchVA();
+ // Use the same resolution rules as the writer: for non-zero addends this
+ // goes directly to the symbol body rather than any stub trampoline.
+ // See INTERP check lines in arm64-thunk-branch-addend.s.
+ uint64_t funcVA = resolveSymbolOffsetVA(funcSym, r.type, r.addend);
++thunkInfo.callSitesUsed;
if (lowVA <= funcVA && funcVA <= highVA) {
// The referent is reachable with a simple call instruction.
@@ -376,6 +384,9 @@
uint64_t thunkVA = thunkInfo.isec->getVA();
if (lowVA <= thunkVA && thunkVA <= highVA) {
r.referent = thunkInfo.sym;
+ // The thunk itself bakes in the addend, so the call-site reloc must
+ // branch to the thunk start with no extra offset.
+ r.addend = 0;
continue;
}
}
@@ -394,8 +405,12 @@
thunkInfo.isec->parent = this;
assert(thunkInfo.isec->live);
- StringRef thunkName = saver().save(funcSym->getName() + ".thunk." +
- std::to_string(thunkInfo.sequence++));
+ std::string addendSuffix;
+ if (r.addend != 0)
+ addendSuffix = "+" + std::to_string(r.addend);
+ StringRef thunkName =
+ saver().save(funcSym->getName() + addendSuffix + ".thunk." +
+ std::to_string(thunkInfo.sequence++));
if (!isa<Defined>(funcSym) || cast<Defined>(funcSym)->isExternal()) {
r.referent = thunkInfo.sym = symtab->addDefined(
thunkName, /*file=*/nullptr, thunkInfo.isec, /*value=*/0, thunkSize,
@@ -410,7 +425,10 @@
/*noDeadStrip=*/false, /*isWeakDefCanBeHidden=*/false);
}
thunkInfo.sym->used = true;
- target->populateThunk(thunkInfo.isec, funcSym);
+ target->populateThunk(thunkInfo.isec, funcSym, r.addend);
+ // The thunk itself bakes in the addend, so the call-site reloc must
+ // branch to the thunk start with no extra offset.
+ r.addend = 0;
finalizeOne(thunkInfo.isec);
thunks.push_back(thunkInfo.isec);
++thunkCount;
diff --git a/lld/MachO/ConcatOutputSection.h b/lld/MachO/ConcatOutputSection.h
index f09d1cd..dd27743 100644
--- a/lld/MachO/ConcatOutputSection.h
+++ b/lld/MachO/ConcatOutputSection.h
@@ -112,32 +112,60 @@
// of ConcatOutputSection, so must have deterministic iteration order.
extern llvm::MapVector<NamePair, ConcatOutputSection *> concatOutputSections;
-// After ICF, multiple Defined symbols may point to the same (isec, value) yet
-// remain as distinct Symbol pointers. Using bare Symbol* as thunkMap keys
-// would create redundant thunks for the same target. This DenseMapInfo
+// Branch-extension thunks are keyed by both the target referent and the
+// branch relocation's addend. Two call sites that branch to the same
+// symbol with different addends (e.g. `bl _func` and `bl _func+8`) target
+// distinct addresses and therefore need distinct thunks.
+//
+// After ICF, multiple Defined symbols may point to the same (isec, value)
+// yet remain as distinct Symbol pointers. The equality predicate below
// canonicalizes Defined symbols by (isec, value) so that ICF-folded copies
-// share a single thunkMap entry.
-struct ThunkMapKeyInfo : llvm::DenseMapInfo<Symbol *> {
- static unsigned getHashValue(const Symbol *sym) {
- if (const auto *d = dyn_cast<Defined>(sym))
- return llvm::hash_combine(d->isec(), d->value);
- return llvm::DenseMapInfo<Symbol *>::getHashValue(sym);
+// still share a single thunkMap entry when their addends match.
+struct ThunkKey {
+ Symbol *sym;
+ int64_t addend;
+
+ static ThunkKey getEmptyKey() {
+ return {llvm::DenseMapInfo<Symbol *>::getEmptyKey(), 0};
}
- static bool isEqual(const Symbol *lhs, const Symbol *rhs) {
- if (lhs == rhs)
- return true;
- if (lhs == getEmptyKey() || lhs == getTombstoneKey() ||
- rhs == getEmptyKey() || rhs == getTombstoneKey())
+ static ThunkKey getTombstoneKey() {
+ return {llvm::DenseMapInfo<Symbol *>::getTombstoneKey(), 0};
+ }
+ bool isSentinel() const {
+ return sym == llvm::DenseMapInfo<Symbol *>::getEmptyKey() ||
+ sym == llvm::DenseMapInfo<Symbol *>::getTombstoneKey();
+ }
+ bool operator==(const ThunkKey &other) const {
+ if (addend != other.addend)
return false;
- const auto *dl = dyn_cast<Defined>(lhs);
- const auto *dr = dyn_cast<Defined>(rhs);
+ if (sym == other.sym)
+ return true;
+ if (isSentinel() || other.isSentinel())
+ return false;
+ const auto *dl = dyn_cast<Defined>(sym);
+ const auto *dr = dyn_cast<Defined>(other.sym);
if (dl && dr)
return dl->isec() == dr->isec() && dl->value == dr->value;
return false;
}
};
-extern llvm::DenseMap<Symbol *, ThunkInfo, ThunkMapKeyInfo> thunkMap;
+struct ThunkMapKeyInfo {
+ static ThunkKey getEmptyKey() { return ThunkKey::getEmptyKey(); }
+ static ThunkKey getTombstoneKey() { return ThunkKey::getTombstoneKey(); }
+ static unsigned getHashValue(const ThunkKey &k) {
+ if (k.isSentinel())
+ return llvm::hash_value(k.sym);
+ if (const auto *d = dyn_cast<Defined>(k.sym))
+ return llvm::hash_combine(d->isec(), d->value, k.addend);
+ return llvm::hash_combine(k.sym, k.addend);
+ }
+ static bool isEqual(const ThunkKey &lhs, const ThunkKey &rhs) {
+ return lhs == rhs;
+ }
+};
+
+extern llvm::DenseMap<ThunkKey, ThunkInfo, ThunkMapKeyInfo> thunkMap;
} // namespace lld::macho
diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp
index 34847ad..4c4f644 100644
--- a/lld/MachO/InputSection.cpp
+++ b/lld/MachO/InputSection.cpp
@@ -89,7 +89,7 @@
return parent->addr + getOffset(off);
}
-static uint64_t resolveSymbolOffsetVA(const Symbol *sym, uint8_t type,
+uint64_t macho::resolveSymbolOffsetVA(const Symbol *sym, uint8_t type,
int64_t offset) {
const RelocAttrs &relocAttrs = target->getRelocAttrs(type);
uint64_t symVA;
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index e0a90a2..2ecc198 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -377,6 +377,8 @@
} // namespace section_names
void addInputSection(InputSection *inputSection);
+
+uint64_t resolveSymbolOffsetVA(const Symbol *sym, uint8_t type, int64_t offset);
} // namespace macho
std::string toString(const macho::InputSection *);
diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h
index 145c140..0c7c0c3 100644
--- a/lld/MachO/Target.h
+++ b/lld/MachO/Target.h
@@ -99,7 +99,8 @@
virtual uint64_t getPageSize() const = 0;
- virtual void populateThunk(InputSection *thunk, Symbol *funcSym) {
+ virtual void populateThunk(InputSection *thunk, Symbol *funcSym,
+ int64_t addend) {
llvm_unreachable("target does not use thunks");
}
diff --git a/lld/test/MachO/arm64-thunk-branch-addend.s b/lld/test/MachO/arm64-thunk-branch-addend.s
new file mode 100644
index 0000000..5423198
--- /dev/null
+++ b/lld/test/MachO/arm64-thunk-branch-addend.s
@@ -0,0 +1,80 @@
+# REQUIRES: aarch64
+
+# RUN: rm -rf %t; mkdir %t
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t/input.o
+
+## When two call sites branch to the same target symbol with different addends
+## and each needs a branch-extension thunk, the thunks must be distinct.
+# RUN: %lld -arch arm64 -lSystem -o %t/base %t/input.o
+# RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t/base \
+# RUN: | FileCheck %s --check-prefixes=CHECK,BASE
+
+## Regression test for the narrow corner case where the referent of a
+## branch relocation is a Defined symbol that *also* has a __stubs entry
+## (here forced via -interposable, which creates a stub for every extern
+## Defined so callers can be interposed at runtime).
+# RUN: %lld -arch arm64 -lSystem -interposable -o %t/interp %t/input.o
+# RUN: llvm-objdump --no-print-imm-hex -d --no-show-raw-insn %t/interp \
+# RUN: | FileCheck %s --check-prefixes=CHECK,INTERP
+
+.subsections_via_symbols
+
+.text
+
+## _target will become an interposable extern Defined under `-interposable`: it
+## has a real body *and* a __stubs entry.
+.globl _target
+## Align it to a 4K boundary to make the test a little bit easier to write.
+.p2align 12
+_target:
+ mov w0, #1
+ ret
+ mov w0, #2
+ ret
+
+## Spacer to push _main out of branch range of _target.
+.globl _spacer
+.p2align 2
+_spacer:
+ .space 0x8000000
+ ret
+
+.globl _main
+.p2align 2
+_main:
+ bl _target
+ bl _target+8
+ ret
+
+## Capture _target's full VA.
+# CHECK-LABEL: <_target>:
+# CHECK-NEXT: [[#%x, TARGET_VA:]]: mov w0, #1
+
+# BASE-LABEL: <_main>:
+# BASE-NEXT: bl 0x{{[0-9a-f]+}} <_target.thunk.0>
+# BASE-NEXT: bl 0x{{[0-9a-f]+}} <_target+8.thunk.0>
+# BASE-NEXT: ret
+
+## For -interposable: The addend=0 call takes the __stubs fast-path, so its bl
+## target is an address inside the __stubs section. The addend=8 call must
+## resolve against the local body.
+# INTERP-LABEL: <_main>:
+# INTERP-NEXT: bl 0x[[#%x, STUB_CALL:]]
+# INTERP-NEXT: bl 0x{{[0-9a-f]+}} <_target+8.thunk.0>
+# INTERP-NEXT: ret
+
+## The +0 thunk should only be present in the BASE test.
+# BASE-LABEL: <_target.thunk.0>:
+# BASE-NEXT: adrp x16, 0x[[#TARGET_VA]]
+# BASE-NEXT: add x16, x16, #0
+# BASE-NEXT: br x16
+
+## The +8 thunk must load _target+8 into x16: adrp loads _target (which is
+## 4K-aligned), add supplies the +8.
+# CHECK-LABEL: <_target+8.thunk.0>:
+# CHECK-NEXT: adrp x16, 0x[[#TARGET_VA]]
+# CHECK-NEXT: add x16, x16, #8
+# CHECK-NEXT: br x16
+
+# INTERP-LABEL: <__stubs>:
+# INTERP: [[#STUB_CALL]]: adrp x16,