[lld-macho] LTO: Unset VisibleToRegularObj where possible

This allows LLVM's LTO to internalize symbols that are not referenced
directly by regular objects. Naturally, this means we need to track
which symbols are referenced by regular objects. The approach taken here
is similar to LLD-COFF's: like the COFF port, we extend
`SymbolTable::insert()` to set the isVisibleToRegularObj bit. (LLD-ELF
relies on the Symbol constructor and `Symbol::mergeProperties()`, but
the Mach-O port does not have a `mergeProperties()` equivalent.)

From what I can tell, ld64 (which uses libLTO) doesn't do this
optimization at all. I'm not even sure libLTO provides a way to do this.
Not having ld64's behavior as a reference implementation is unfortunate;
instead, I am relying on LLD-ELF/COFF's behavior as references while
erring on the conservative side. In particular, LLD-MachO will only do
this optimization for executables right now.

We also don't attempt it when `-flat_namespace` is used -- otherwise
we'd need scan the symbol table to find matches for every un-namespaced
symbol reference, which is expensive.

internalize.ll is based off the LLD-ELF tests `internalize-basic.ll` and
`internalize-undef.ll`. Looks like @davide added some of LLD-ELF's internalize
tests, so adding him as a reviewer...

Reviewed By: #lld-macho, gkm

Differential Revision: https://reviews.llvm.org/D99105

GitOrigin-RevId: eb5b7d4497e323cf6214eb3e008dc37bc5ed1fd7
diff --git a/MachO/LTO.cpp b/MachO/LTO.cpp
index 70c555f..f04b950 100644
--- a/MachO/LTO.cpp
+++ b/MachO/LTO.cpp
@@ -25,6 +25,7 @@
 using namespace lld;
 using namespace lld::macho;
 using namespace llvm;
+using namespace llvm::MachO;
 using namespace llvm::sys;
 
 static lto::Config createConfig() {
@@ -39,6 +40,9 @@
   };
   c.TimeTraceEnabled = config->timeTraceEnabled;
   c.TimeTraceGranularity = config->timeTraceGranularity;
+  if (config->saveTemps)
+    checkError(c.addSaveTemps(config->outputFile.str() + ".",
+                              /*UseInputModulePath=*/true));
   return c;
 }
 
@@ -67,6 +71,12 @@
     // be removed.
     r.Prevailing = !objSym.isUndefined() && sym->getFile() == &f;
 
+    // FIXME: What about other output types? And we can probably be less
+    // restrictive with -flat_namespace, but it's an infrequent use case.
+    r.VisibleToRegularObj = config->outputType != MH_EXECUTE ||
+                            config->namespaceKind == NamespaceKind::flat ||
+                            sym->isUsedInRegularObj;
+
     // Un-define the symbol so that we don't get duplicate symbol errors when we
     // load the ObjFile emitted by LTO compilation.
     if (r.Prevailing)
@@ -74,7 +84,6 @@
                                RefState::Strong);
 
     // TODO: set the other resolution configs properly
-    r.VisibleToRegularObj = true;
   }
   checkError(ltoObj->add(std::move(f.obj), resols));
 }
diff --git a/MachO/SymbolTable.cpp b/MachO/SymbolTable.cpp
index 76d4289..1fd7704 100644
--- a/MachO/SymbolTable.cpp
+++ b/MachO/SymbolTable.cpp
@@ -24,17 +24,22 @@
   return symVector[it->second];
 }
 
-std::pair<Symbol *, bool> SymbolTable::insert(StringRef name) {
+std::pair<Symbol *, bool> SymbolTable::insert(StringRef name,
+                                              const InputFile *file) {
   auto p = symMap.insert({CachedHashStringRef(name), (int)symVector.size()});
 
-  // Name already present in the symbol table.
-  if (!p.second)
-    return {symVector[p.first->second], false};
+  Symbol *sym;
+  if (!p.second) {
+    // Name already present in the symbol table.
+    sym = symVector[p.first->second];
+  } else {
+    // Name is a new symbol.
+    sym = reinterpret_cast<Symbol *>(make<SymbolUnion>());
+    symVector.push_back(sym);
+  }
 
-  // Name is a new symbol.
-  Symbol *sym = reinterpret_cast<Symbol *>(make<SymbolUnion>());
-  symVector.push_back(sym);
-  return {sym, true};
+  sym->isUsedInRegularObj |= !file || isa<ObjFile>(file);
+  return {sym, p.second};
 }
 
 Defined *SymbolTable::addDefined(StringRef name, InputFile *file,
@@ -44,7 +49,7 @@
   Symbol *s;
   bool wasInserted;
   bool overridesWeakDef = false;
-  std::tie(s, wasInserted) = insert(name);
+  std::tie(s, wasInserted) = insert(name, file);
 
   if (!wasInserted) {
     if (auto *defined = dyn_cast<Defined>(s)) {
@@ -77,7 +82,7 @@
                                   bool isWeakRef) {
   Symbol *s;
   bool wasInserted;
-  std::tie(s, wasInserted) = insert(name);
+  std::tie(s, wasInserted) = insert(name, file);
 
   RefState refState = isWeakRef ? RefState::Weak : RefState::Strong;
 
@@ -96,7 +101,7 @@
                                uint32_t align, bool isPrivateExtern) {
   Symbol *s;
   bool wasInserted;
-  std::tie(s, wasInserted) = insert(name);
+  std::tie(s, wasInserted) = insert(name, file);
 
   if (!wasInserted) {
     if (auto *common = dyn_cast<CommonSymbol>(s)) {
@@ -117,7 +122,7 @@
                               bool isTlv) {
   Symbol *s;
   bool wasInserted;
-  std::tie(s, wasInserted) = insert(name);
+  std::tie(s, wasInserted) = insert(name, file);
 
   RefState refState = RefState::Unreferenced;
   if (!wasInserted) {
@@ -149,7 +154,7 @@
                              const object::Archive::Symbol &sym) {
   Symbol *s;
   bool wasInserted;
-  std::tie(s, wasInserted) = insert(name);
+  std::tie(s, wasInserted) = insert(name, file);
 
   if (wasInserted)
     replaceSymbol<LazySymbol>(s, file, sym);
diff --git a/MachO/SymbolTable.h b/MachO/SymbolTable.h
index 29aa03e..10d17f6 100644
--- a/MachO/SymbolTable.h
+++ b/MachO/SymbolTable.h
@@ -60,7 +60,7 @@
   Symbol *find(StringRef name) { return find(llvm::CachedHashStringRef(name)); }
 
 private:
-  std::pair<Symbol *, bool> insert(StringRef name);
+  std::pair<Symbol *, bool> insert(StringRef name, const InputFile *);
   llvm::DenseMap<llvm::CachedHashStringRef, int> symMap;
   std::vector<Symbol *> symVector;
 };
diff --git a/MachO/Symbols.h b/MachO/Symbols.h
index 0548342..962aad3 100644
--- a/MachO/Symbols.h
+++ b/MachO/Symbols.h
@@ -85,12 +85,17 @@
 
 protected:
   Symbol(Kind k, StringRefZ name, InputFile *file)
-      : symbolKind(k), nameData(name.data), nameSize(name.size), file(file) {}
+      : symbolKind(k), nameData(name.data), nameSize(name.size), file(file),
+        isUsedInRegularObj(!file || isa<ObjFile>(file)) {}
 
   Kind symbolKind;
   const char *nameData;
   mutable uint32_t nameSize;
   InputFile *file;
+
+public:
+  // True if this symbol was referenced by a regular (non-bitcode) object.
+  bool isUsedInRegularObj;
 };
 
 class Defined : public Symbol {
@@ -249,7 +254,10 @@
   assert(static_cast<Symbol *>(static_cast<T *>(nullptr)) == nullptr &&
          "Not a Symbol");
 
-  return new (s) T(std::forward<ArgT>(arg)...);
+  bool isUsedInRegularObj = s->isUsedInRegularObj;
+  T *sym = new (s) T(std::forward<ArgT>(arg)...);
+  sym->isUsedInRegularObj |= isUsedInRegularObj;
+  return sym;
 }
 
 } // namespace macho
diff --git a/test/MachO/internalize.ll b/test/MachO/internalize.ll
new file mode 100644
index 0000000..74c39be
--- /dev/null
+++ b/test/MachO/internalize.ll
@@ -0,0 +1,72 @@
+; REQUIRES: x86
+
+;; Check that we internalize bitcode symbols (only) where possible, i.e. when
+;; they are not referenced by undefined symbols originating from non-bitcode
+;; files.
+
+; RUN: rm -rf %t; split-file %s %t
+; RUN: llvm-as %t/test.s -o %t/test.o
+; RUN: llvm-as %t/baz.s -o %t/baz.o
+; RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/regular.s -o %t/regular.o
+; RUN: %lld -pie -lSystem %t/test.o %t/baz.o %t/regular.o -o %t/test -save-temps
+; RUN: llvm-dis < %t/test.0.2.internalize.bc | FileCheck %s
+; RUN: llvm-objdump --macho --syms %t/test | FileCheck %s --check-prefix=SYMTAB
+
+;; Check that main is not internalized. This covers the case of bitcode symbols
+;; referenced by undefined symbols that don't belong to any InputFile.
+; CHECK: define void @main()
+
+;; Check that the foo and bar functions are correctly internalized.
+; CHECK: define internal void @bar()
+; CHECK: define internal void @foo()
+
+;; Check that a bitcode symbol that is referenced by a regular object file isn't
+;; internalized.
+; CHECK: define void @used_in_regular_obj()
+
+;; Check that a bitcode symbol that is defined in another bitcode file gets
+;; internalized.
+; CHECK: define internal void @baz()
+
+; Check foo and bar are not emitted to the .symtab
+; SYMTAB-LABEL: SYMBOL TABLE:
+; SYMTAB-NEXT:  g     F __TEXT,__text _main
+; SYMTAB-NEXT:  g     F __TEXT,__text _used_in_regular_obj
+; SYMTAB-NEXT:  g     F __TEXT,__text __mh_execute_header
+; SYMTAB-EMPTY:
+
+;--- test.s
+target triple = "x86_64-apple-macosx10.15.0"
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @baz()
+
+define void @main() {
+  call void @bar()
+  call void @baz()
+  ret void
+}
+
+define void @bar() {
+  ret void
+}
+
+define hidden void @foo() {
+  ret void
+}
+
+define void @used_in_regular_obj() {
+  ret void
+}
+
+;--- baz.s
+target triple = "x86_64-apple-macosx10.15.0"
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @baz() {
+  ret void
+}
+
+;--- regular.s
+.data
+.quad _used_in_regular_obj
diff --git a/test/MachO/lto-save-temps.ll b/test/MachO/lto-save-temps.ll
index 63c196b..d1b2cad 100644
--- a/test/MachO/lto-save-temps.ll
+++ b/test/MachO/lto-save-temps.ll
@@ -5,15 +5,15 @@
 
 ; RUN: rm -rf %t; split-file %s %t
 ; RUN: llvm-as %t/foo.ll -o %t/foo.o
-; RUN: llvm-as %t/test.ll -o %t/test.o
-; RUN: %lld -save-temps %t/foo.o %t/test.o -o %t/test
+; RUN: llvm-as %t/bar.ll -o %t/bar.o
+; RUN: %lld -dylib -save-temps %t/foo.o %t/bar.o -o %t/test
 ; RUN: llvm-objdump -d --no-show-raw-insn %t/test.lto.o | FileCheck %s --check-prefix=ALL
 ; RUN: llvm-objdump -d --no-show-raw-insn %t/test | FileCheck %s --check-prefix=ALL
 
 ; RUN: rm -rf %t; split-file %s %t
 ; RUN: opt -module-summary %t/foo.ll -o %t/foo.o
-; RUN: opt -module-summary %t/test.ll -o %t/test.o
-; RUN: %lld -save-temps %t/foo.o %t/test.o -o %t/test
+; RUN: opt -module-summary %t/bar.ll -o %t/bar.o
+; RUN: %lld -dylib -save-temps %t/foo.o %t/bar.o -o %t/test
 ; RUN: llvm-objdump -d --no-show-raw-insn %t/test1.lto.o | FileCheck %s --check-prefix=FOO
 ; RUN: llvm-objdump -d --no-show-raw-insn %t/test2.lto.o | FileCheck %s --check-prefix=MAIN
 ; RUN: llvm-objdump -d --no-show-raw-insn %t/test | FileCheck %s --check-prefix=ALL
@@ -21,12 +21,12 @@
 ; FOO:      <_foo>:
 ; FOO-NEXT: retq
 
-; MAIN:      <_main>:
+; MAIN:      <_bar>:
 ; MAIN-NEXT: retq
 
 ; ALL:      <_foo>:
 ; ALL-NEXT: retq
-; ALL:      <_main>:
+; ALL:      <_bar>:
 ; ALL-NEXT: retq
 
 ;--- foo.ll
@@ -38,11 +38,11 @@
   ret void
 }
 
-;--- test.ll
+;--- bar.ll
 
 target triple = "x86_64-apple-macosx10.15.0"
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
-define void @main() {
+define void @bar() {
   ret void
 }