[lld-macho] Teach ICF to dedup functions with identical unwind info

Dedup'ing unwind info is tricky because each CUE contains a different
function address, if ICF operated naively and compared the entire
contents of each CUE, entries with identical unwind info but belonging
to different functions would never be considered identical. To work
around this problem, we slice away the function address before
performing ICF. We rely on `relocateCompactUnwind()` to correctly handle
these truncated input sections.

Here are the numbers before and after D109944, D109945, and this diff
were applied, as tested on my 3.2 GHz 16-Core Intel Xeon W:

Without any optimizations:

             base           diff           difference (95% CI)
  sys_time   0.849 ± 0.015  0.896 ± 0.012  [  +4.8% ..   +6.2%]
  user_time  3.357 ± 0.030  3.512 ± 0.023  [  +4.3% ..   +5.0%]
  wall_time  3.944 ± 0.039  4.032 ± 0.031  [  +1.8% ..   +2.6%]
  samples    40             38

With `-dead_strip`:

             base           diff           difference (95% CI)
  sys_time   0.847 ± 0.010  0.896 ± 0.012  [  +5.2% ..   +6.5%]
  user_time  3.377 ± 0.014  3.532 ± 0.015  [  +4.4% ..   +4.8%]
  wall_time  3.962 ± 0.024  4.060 ± 0.030  [  +2.1% ..   +2.8%]
  samples    47             30

With `-dead_strip` and `--icf=all`:

             base           diff           difference (95% CI)
  sys_time   0.935 ± 0.013  0.957 ± 0.018  [  +1.5% ..   +3.2%]
  user_time  3.472 ± 0.022  6.531 ± 0.046  [ +87.6% ..  +88.7%]
  wall_time  4.080 ± 0.040  5.329 ± 0.060  [ +30.0% ..  +31.2%]
  samples    37             30

Unsurprisingly, ICF is now a lot slower, likely due to the much larger
number of input sections it needs to process. But the rest of the
linker only suffers a mild slowdown.

Note that the compact-unwind-bad-reloc.s test was expanded because we
now handle the relocation for CUE's function address in a separate code
path from the rest of the CUE relocations. The extended test covers both
code paths.

Reviewed By: #lld-macho, oontvoo

Differential Revision: https://reviews.llvm.org/D109946

GitOrigin-RevId: d9b6f7e312c11ff69779939bb2be0bd53936a333
diff --git a/MachO/ICF.cpp b/MachO/ICF.cpp
index 57e1da3..6c3a565 100644
--- a/MachO/ICF.cpp
+++ b/MachO/ICF.cpp
@@ -180,8 +180,31 @@
     }
     return isecA->icfEqClass[icfPass % 2] == isecB->icfEqClass[icfPass % 2];
   };
-  return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(),
-                    f);
+  if (!std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), f))
+    return false;
+
+  // If there are symbols with associated unwind info, check that the unwind
+  // info matches. For simplicity, we only handle the case where there are only
+  // symbols at offset zero within the section (which is typically the case with
+  // .subsections_via_symbols.)
+  auto hasCU = [](Defined *d) { return d->compactUnwind; };
+  auto itA = std::find_if(ia->symbols.begin(), ia->symbols.end(), hasCU);
+  auto itB = std::find_if(ib->symbols.begin(), ib->symbols.end(), hasCU);
+  if (itA == ia->symbols.end())
+    return itB == ib->symbols.end();
+  if (itB == ib->symbols.end())
+    return false;
+  const Defined *da = *itA;
+  const Defined *db = *itB;
+  if (da->compactUnwind->icfEqClass[icfPass % 2] !=
+          db->compactUnwind->icfEqClass[icfPass % 2] ||
+      da->value != 0 || db->value != 0)
+    return false;
+  auto isZero = [](Defined *d) { return d->value == 0; };
+  return std::find_if_not(std::next(itA), ia->symbols.end(), isZero) ==
+             ia->symbols.end() &&
+         std::find_if_not(std::next(itB), ib->symbols.end(), isZero) ==
+             ib->symbols.end();
 }
 
 // Find the first InputSection after BEGIN whose equivalence class differs
@@ -337,18 +360,14 @@
     // FIXME: consider non-code __text sections as hashable?
     bool isHashable = (isCodeSection(isec) || isCfStringSection(isec)) &&
                       !isec->shouldOmitFromOutput() && isec->isHashableForICF();
-    // ICF can't fold functions with unwind info
-    if (isHashable)
-      for (Defined *d : isec->symbols)
-        if (d->compactUnwind) {
-          isHashable = false;
-          break;
-        }
-
-    if (isHashable)
+    if (isHashable) {
       hashable.push_back(isec);
-    else
+      for (Defined *d : isec->symbols)
+        if (d->compactUnwind)
+          hashable.push_back(d->compactUnwind);
+    } else {
       isec->icfEqClass[0] = ++icfUniqueID;
+    }
   }
   parallelForEach(hashable,
                   [](ConcatInputSection *isec) { isec->hashForICF(); });
diff --git a/MachO/InputFiles.cpp b/MachO/InputFiles.cpp
index 6e6a61e..1b23392 100644
--- a/MachO/InputFiles.cpp
+++ b/MachO/InputFiles.cpp
@@ -904,16 +904,31 @@
 
   for (SubsectionEntry &entry : *cuSubsecMap) {
     ConcatInputSection *isec = cast<ConcatInputSection>(entry.isec);
+    // Hack!! Since each CUE contains a different function address, if ICF
+    // operated naively and compared the entire contents of each CUE, entries
+    // with identical unwind info but belonging to different functions would
+    // never be considered equivalent. To work around this problem, we slice
+    // away the function address here. (Note that we do not adjust the offsets
+    // of the corresponding relocations.) We rely on `relocateCompactUnwind()`
+    // to correctly handle these truncated input sections.
+    isec->data = isec->data.slice(target->wordSize);
+
     ConcatInputSection *referentIsec;
-    for (const Reloc &r : isec->relocs) {
-      if (r.offset != 0)
+    for (auto it = isec->relocs.begin(); it != isec->relocs.end();) {
+      Reloc &r = *it;
+      // We only wish to handle the relocation for CUE::functionAddress.
+      if (r.offset != 0) {
+        ++it;
         continue;
+      }
       uint64_t add = r.addend;
       if (auto *sym = cast_or_null<Defined>(r.referent.dyn_cast<Symbol *>())) {
         // Check whether the symbol defined in this file is the prevailing one.
         // Skip if it is e.g. a weak def that didn't prevail.
-        if (sym->getFile() != this)
+        if (sym->getFile() != this) {
+          ++it;
           continue;
+        }
         add += sym->value;
         referentIsec = cast<ConcatInputSection>(sym->isec);
       } else {
@@ -926,16 +941,23 @@
       // The functionAddress relocations are typically section relocations.
       // However, unwind info operates on a per-symbol basis, so we search for
       // the function symbol here.
-      auto it = llvm::lower_bound(
+      auto symIt = llvm::lower_bound(
           referentIsec->symbols, add,
           [](Defined *d, uint64_t add) { return d->value < add; });
       // The relocation should point at the exact address of a symbol (with no
       // addend).
-      if (it == referentIsec->symbols.end() || (*it)->value != add) {
+      if (symIt == referentIsec->symbols.end() || (*symIt)->value != add) {
         assert(referentIsec->wasCoalesced);
+        ++it;
         continue;
       }
-      (*it)->compactUnwind = isec;
+      (*symIt)->compactUnwind = isec;
+      // Since we've sliced away the functionAddress, we should remove the
+      // corresponding relocation too. Given that clang emits relocations in
+      // reverse order of address, this relocation should be at the end of the
+      // vector for most of our input object files, so this is typically an O(1)
+      // operation.
+      it = isec->relocs.erase(it);
     }
   }
 }
diff --git a/MachO/UnwindInfoSection.cpp b/MachO/UnwindInfoSection.cpp
index c499826..06f8d44 100644
--- a/MachO/UnwindInfoSection.cpp
+++ b/MachO/UnwindInfoSection.cpp
@@ -295,7 +295,8 @@
       return;
 
     // Write the rest of the CUE.
-    memcpy(buf, d->compactUnwind->data.data(), d->compactUnwind->data.size());
+    memcpy(buf + sizeof(Ptr), d->compactUnwind->data.data(),
+           d->compactUnwind->data.size());
     for (const Reloc &r : d->compactUnwind->relocs) {
       uint64_t referentVA = 0;
       if (auto *referentSym = r.referent.dyn_cast<Symbol *>()) {
@@ -309,9 +310,8 @@
         }
       } else {
         auto *referentIsec = r.referent.get<InputSection *>();
-        ConcatInputSection *concatIsec = checkTextSegment(referentIsec);
-        if (!concatIsec->shouldOmitFromOutput())
-          referentVA = referentIsec->getVA(r.addend);
+        checkTextSegment(referentIsec);
+        referentVA = referentIsec->getVA(r.addend);
       }
       writeAddress(buf + r.offset, referentVA, r.length);
     }
diff --git a/test/MachO/icf.s b/test/MachO/icf.s
index 2fc2405..5e61e94 100644
--- a/test/MachO/icf.s
+++ b/test/MachO/icf.s
@@ -32,8 +32,9 @@
 # CHECK: [[#%x,CALL_RECURSIVE_2:]]          l     F __TEXT,__text _call_recursive_2
 # CHECK: [[#%x,CHECK_LENGTH_1:]]            l     F __TEXT,__text _check_length_1
 # CHECK: [[#%x,CHECK_LENGTH_2:]]            l     F __TEXT,__text _check_length_2
-# CHECK: [[#%x,HAS_UNWIND_1:]]              l     F __TEXT,__text _has_unwind_1
+# CHECK: [[#%x,HAS_UNWIND_2:]]              l     F __TEXT,__text _has_unwind_1
 # CHECK: [[#%x,HAS_UNWIND_2:]]              l     F __TEXT,__text _has_unwind_2
+# CHECK: [[#%x,HAS_UNWIND_3:]]              l     F __TEXT,__text _has_unwind_3
 # CHECK: [[#%x,MUTALLY_RECURSIVE_2:]]       l     F __TEXT,__text _mutually_recursive_1
 # CHECK: [[#%x,MUTALLY_RECURSIVE_2:]]       l     F __TEXT,__text _mutually_recursive_2
 # CHECK: [[#%x,INIT_2:]]                    l     F __TEXT,__text _init_1
@@ -64,8 +65,9 @@
 # CHECK: callq 0x[[#%x,CALL_RECURSIVE_2:]]          <_call_recursive_2>
 # CHECK: callq 0x[[#%x,CHECK_LENGTH_1:]]            <_check_length_1>
 # CHECK: callq 0x[[#%x,CHECK_LENGTH_2:]]            <_check_length_2>
-# CHECK: callq 0x[[#%x,HAS_UNWIND_1:]]              <_has_unwind_1>
 # CHECK: callq 0x[[#%x,HAS_UNWIND_2:]]              <_has_unwind_2>
+# CHECK: callq 0x[[#%x,HAS_UNWIND_2:]]              <_has_unwind_2>
+# CHECK: callq 0x[[#%x,HAS_UNWIND_3:]]              <_has_unwind_3>
 # CHECK: callq 0x[[#%x,MUTALLY_RECURSIVE_2:]]       <_mutually_recursive_2>
 # CHECK: callq 0x[[#%x,MUTALLY_RECURSIVE_2:]]       <_mutually_recursive_2>
 ## FIXME: Mutually-recursive functions with identical bodies (see below)
@@ -170,8 +172,7 @@
 _my_personality:
   mov $1345, %rax
 
-## No fold: functions have unwind info.
-## FIXME: Fold functions with identical unwind info.
+## Functions with identical unwind info should be folded.
 _has_unwind_1:
   .cfi_startproc
   .cfi_personality 155, _my_personality
@@ -186,6 +187,15 @@
   ret
   .cfi_endproc
 
+## This function has different unwind info from the preceding two, and therefore
+## should not be folded.
+_has_unwind_3:
+  .cfi_startproc
+  .cfi_personality 155, _my_personality
+  .cfi_def_cfa_offset 8
+  ret
+  .cfi_endproc
+
 ## Fold: Mutually-recursive functions with symmetric bodies
 _mutually_recursive_1:
   callq _mutually_recursive_1 # call myself
@@ -253,6 +263,7 @@
   callq _check_length_2
   callq _has_unwind_1
   callq _has_unwind_2
+  callq _has_unwind_3
   callq _mutually_recursive_1
   callq _mutually_recursive_2
   callq _asymmetric_recursive_1