[lld-macho] Support Obj-C symbols in order files

Obj-C symbols may have spaces and colons, which our previous order file
parser would be confused by. The order file format has made the very unfortunate
choice of using colons for its delimiters, which means that we have to use
heuristics to determine if a given colon is part of a symbol or not...

Reviewed By: #lld-macho, thakis

Differential Revision: https://reviews.llvm.org/D93567

GitOrigin-RevId: 5f9896d3b23c31beecf225d90194e1bf4e097677
diff --git a/MachO/Driver.cpp b/MachO/Driver.cpp
index 63d1012..9838e67 100644
--- a/MachO/Driver.cpp
+++ b/MachO/Driver.cpp
@@ -393,19 +393,15 @@
     addFile(path, false);
 }
 
-static std::array<StringRef, 6> archNames{"arm",    "arm64", "i386",
-                                          "x86_64", "ppc",   "ppc64"};
-static bool isArchString(StringRef s) {
-  static DenseSet<StringRef> archNamesSet(archNames.begin(), archNames.end());
-  return archNamesSet.find(s) != archNamesSet.end();
-}
-
 // An order file has one entry per line, in the following format:
 //
-//   <arch>:<object file>:<symbol name>
+//   <cpu>:<object file>:<symbol name>
 //
-// <arch> and <object file> are optional. If not specified, then that entry
-// matches any symbol of that name.
+// <cpu> and <object file> are optional. If not specified, then that entry
+// matches any symbol of that name. Parsing this format is not quite
+// straightforward because the symbol name itself can contain colons, so when
+// encountering a colon, we consider the preceding characters to decide if it
+// can be a valid CPU type or file path.
 //
 // If a symbol is matched by multiple entries, then it takes the lowest-ordered
 // entry (the one nearest to the front of the list.)
@@ -420,59 +416,37 @@
 
   MemoryBufferRef mbref = *buffer;
   size_t priority = std::numeric_limits<size_t>::max();
-  for (StringRef rest : args::getLines(mbref)) {
-    StringRef arch, objectFile, symbol;
+  for (StringRef line : args::getLines(mbref)) {
+    StringRef objectFile, symbol;
+    line = line.take_until([](char c) { return c == '#'; }); // ignore comments
+    line = line.ltrim();
 
-    std::array<StringRef, 3> fields;
-    uint8_t fieldCount = 0;
-    while (rest != "" && fieldCount < 3) {
-      std::pair<StringRef, StringRef> p = getToken(rest, ": \t\n\v\f\r");
-      StringRef tok = p.first;
-      rest = p.second;
-
-      // Check if we have a comment
-      if (tok == "" || tok[0] == '#')
-        break;
-
-      fields[fieldCount++] = tok;
-    }
-
-    switch (fieldCount) {
-    case 3:
-      arch = fields[0];
-      objectFile = fields[1];
-      symbol = fields[2];
-      break;
-    case 2:
-      (isArchString(fields[0]) ? arch : objectFile) = fields[0];
-      symbol = fields[1];
-      break;
-    case 1:
-      symbol = fields[0];
-      break;
-    case 0:
-      break;
-    default:
-      llvm_unreachable("too many fields in order file");
-    }
-
-    if (!arch.empty()) {
-      if (!isArchString(arch)) {
-        error("invalid arch \"" + arch + "\" in order file: expected one of " +
-              llvm::join(archNames, ", "));
-        continue;
-      }
-
-      // TODO: Update when we extend support for other archs
-      if (arch != "x86_64")
-        continue;
-    }
-
-    if (!objectFile.empty() && !objectFile.endswith(".o")) {
-      error("invalid object file name \"" + objectFile +
-            "\" in order file: should end with .o");
+    CPUType cpuType = StringSwitch<CPUType>(line)
+                          .StartsWith("i386:", CPU_TYPE_I386)
+                          .StartsWith("x86_64:", CPU_TYPE_X86_64)
+                          .StartsWith("arm:", CPU_TYPE_ARM)
+                          .StartsWith("arm64:", CPU_TYPE_ARM64)
+                          .StartsWith("ppc:", CPU_TYPE_POWERPC)
+                          .StartsWith("ppc64:", CPU_TYPE_POWERPC64)
+                          .Default(CPU_TYPE_ANY);
+    // Drop the CPU type as well as the colon
+    if (cpuType != CPU_TYPE_ANY)
+      line = line.drop_until([](char c) { return c == ':'; }).drop_front();
+    // TODO: Update when we extend support for other CPUs
+    if (cpuType != CPU_TYPE_ANY && cpuType != CPU_TYPE_X86_64)
       continue;
+
+    constexpr std::array<StringRef, 2> fileEnds = {".o:", ".o):"};
+    for (StringRef fileEnd : fileEnds) {
+      size_t pos = line.find(fileEnd);
+      if (pos != StringRef::npos) {
+        // Split the string around the colon
+        objectFile = line.take_front(pos + fileEnd.size() - 1);
+        line = line.drop_front(pos + fileEnd.size());
+        break;
+      }
     }
+    symbol = line.trim();
 
     if (!symbol.empty()) {
       SymbolPriorityEntry &entry = config->priorities[symbol];
diff --git a/test/MachO/invalid/order-file-bad-arch.test b/test/MachO/invalid/order-file-bad-arch.test
deleted file mode 100644
index 06b37b0..0000000
--- a/test/MachO/invalid/order-file-bad-arch.test
+++ /dev/null
@@ -1,9 +0,0 @@
-# REQUIRES: x86
-# RUN: echo ".globl _main; .text; _main: ret" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t.o
-# RUN: not %lld -o %t %t.o -order_file %s 2>&1 | FileCheck %s
-# CHECK: error: invalid arch "sparc" in order file:  expected one of arm, arm64, i386, x86_64, ppc, ppc64
-# CHECK-EMPTY:
-
-_barsymbol
-sparc:hello.o:_foosymbol
-i386:hello.o:_foosymbol
diff --git a/test/MachO/invalid/order-file-bad-objfile.test b/test/MachO/invalid/order-file-bad-objfile.test
deleted file mode 100644
index 546e688..0000000
--- a/test/MachO/invalid/order-file-bad-objfile.test
+++ /dev/null
@@ -1,10 +0,0 @@
-# REQUIRES: x86
-# RUN: echo ".globl _main; .text; _main: ret" | llvm-mc -filetype=obj -triple=x86_64-apple-darwin -o %t.o
-# RUN: not %lld -o %t %t.o -order_file %s 2>&1 | FileCheck %s
-# CHECK: invalid object file name "helloo" in order file: should end with .o
-# CHECK: invalid object file name "z80" in order file: should end with .o
-# CHECK-EMPTY:
-
-_barsymbol
-x86_64:helloo:_foosymbol
-z80:_foosymbol
diff --git a/test/MachO/order-file.s b/test/MachO/order-file.s
index 6733439..a13abe7 100644
--- a/test/MachO/order-file.s
+++ b/test/MachO/order-file.s
@@ -5,11 +5,11 @@
 # RUN: rm -f %t/foo.a
 # RUN: llvm-ar rcs %t/foo.a %t/foo.o
 
-# FOO-FIRST: <_foo>:
+# FOO-FIRST: <_bar>:
 # FOO-FIRST: <_main>:
 
 # FOO-SECOND: <_main>:
-# FOO-SECOND: <_foo>:
+# FOO-SECOND: <_bar>:
 
 # RUN: %lld -lSystem -o %t/test-1 %t/test.o %t/foo.o -order_file %t/ord-1
 # RUN: llvm-objdump -d %t/test-1 | FileCheck %s --check-prefix=FOO-FIRST
@@ -83,9 +83,9 @@
 # RUN: %lld -lSystem -o %t/test-4 %t/foo.o %t/test.o -order_file %t/ord-multiple-4
 # RUN: llvm-objdump -d %t/test-4 | FileCheck %s --check-prefix=FOO-FIRST
 
-## _foo and _bar both point to the same location. When both symbols appear in
-## an order file, the location in question should be ordered according to the
-## lowest-ordered symbol that references it.
+## -[Foo doFoo:andBar:] and _bar both point to the same location. When both
+## symbols appear in an order file, the location in question should be ordered
+## according to the lowest-ordered symbol that references it.
 
 # RUN: %lld -lSystem -o %t/test-alias %t/test.o %t/foo.o -order_file %t/ord-alias
 # RUN: llvm-objdump -d %t/test-alias | FileCheck %s --check-prefix=FOO-FIRST
@@ -93,63 +93,63 @@
 # RUN: llvm-objdump -d %t/test-alias | FileCheck %s --check-prefix=FOO-FIRST
 
 #--- ord-1
-_foo # just a comment
+-[Foo doFoo:andBar:] # just a comment
 _main # another comment
 
 #--- ord-2
 _main # just a comment
-_foo # another comment
+-[Foo doFoo:andBar:] # another comment
 
 #--- ord-file-match
-foo.o:_foo
+foo.o:-[Foo doFoo:andBar:]
 _main
 
 #--- ord-file-nomatch
-bar.o:_foo
+bar.o:-[Foo doFoo:andBar:]
 _main
-_foo
+-[Foo doFoo:andBar:]
 
 #--- ord-arch-match
-x86_64:_foo
+x86_64:-[Foo doFoo:andBar:]
 _main
 
 #--- ord-arch-nomatch
-ppc:_foo
+ppc:-[Foo doFoo:andBar:]
 _main
-_foo
+-[Foo doFoo:andBar:]
 
 #--- ord-arch-file-match
-x86_64:bar.o:_foo
+x86_64:bar.o:-[Foo doFoo:andBar:]
 _main
 
 #--- ord-multiple-1
-_foo
+-[Foo doFoo:andBar:]
 _main
-foo.o:_foo
+foo.o:-[Foo doFoo:andBar:]
 
 #--- ord-multiple-2
-foo.o:_foo
+foo.o:-[Foo doFoo:andBar:]
 _main
-_foo
+-[Foo doFoo:andBar:]
 
 #--- ord-multiple-3
-_foo
+-[Foo doFoo:andBar:]
 _main
-_foo
+-[Foo doFoo:andBar:]
 
 #--- ord-multiple-4
-foo.o:_foo
+foo.o:-[Foo doFoo:andBar:]
 _main
-foo.o:_foo
+foo.o:-[Foo doFoo:andBar:]
 
 #--- ord-alias
 _bar
 _main
-_foo
+-[Foo doFoo:andBar:]
 
 #--- foo.s
-.globl _foo
-_foo:
+.globl "-[Foo doFoo:andBar:]"
+"-[Foo doFoo:andBar:]":
 _bar:
   ret
 
@@ -157,5 +157,5 @@
 .globl _main
 
 _main:
-  callq _foo
+  callq "-[Foo doFoo:andBar:]"
   ret