[lld-macho] Teach ICF to dedup functions with identical unwind info Dedup'ing unwind info is tricky because each CUE contains a different function address, if ICF operated naively and compared the entire contents of each CUE, entries with identical unwind info but belonging to different functions would never be considered identical. To work around this problem, we slice away the function address before performing ICF. We rely on `relocateCompactUnwind()` to correctly handle these truncated input sections. Here are the numbers before and after D109944, D109945, and this diff were applied, as tested on my 3.2 GHz 16-Core Intel Xeon W: Without any optimizations: base diff difference (95% CI) sys_time 0.849 ± 0.015 0.896 ± 0.012 [ +4.8% .. +6.2%] user_time 3.357 ± 0.030 3.512 ± 0.023 [ +4.3% .. +5.0%] wall_time 3.944 ± 0.039 4.032 ± 0.031 [ +1.8% .. +2.6%] samples 40 38 With `-dead_strip`: base diff difference (95% CI) sys_time 0.847 ± 0.010 0.896 ± 0.012 [ +5.2% .. +6.5%] user_time 3.377 ± 0.014 3.532 ± 0.015 [ +4.4% .. +4.8%] wall_time 3.962 ± 0.024 4.060 ± 0.030 [ +2.1% .. +2.8%] samples 47 30 With `-dead_strip` and `--icf=all`: base diff difference (95% CI) sys_time 0.935 ± 0.013 0.957 ± 0.018 [ +1.5% .. +3.2%] user_time 3.472 ± 0.022 6.531 ± 0.046 [ +87.6% .. +88.7%] wall_time 4.080 ± 0.040 5.329 ± 0.060 [ +30.0% .. +31.2%] samples 37 30 Unsurprisingly, ICF is now a lot slower, likely due to the much larger number of input sections it needs to process. But the rest of the linker only suffers a mild slowdown. Note that the compact-unwind-bad-reloc.s test was expanded because we now handle the relocation for CUE's function address in a separate code path from the rest of the CUE relocations. The extended test covers both code paths. Reviewed By: #lld-macho, oontvoo Differential Revision: https://reviews.llvm.org/D109946 GitOrigin-RevId: d9b6f7e312c11ff69779939bb2be0bd53936a333

commit: 98107511e6eea7c1187e20a3fd9c93c283cd7fb3 [log] [tgz]
author: Jez Ng <jezng@fb.com> Fri Nov 12 16:01:25 2021 -0500
committer: Copybara-Service <copybara-worker@google.com> Fri Nov 12 13:05:52 2021 -0800
tree: 775645bb1f0474ccdd700fb81c97f34cf318743b
parent: b5ae575c95f8e6e9553be17cbdfbb3a8bda60dc7 [diff]
diff --git a/MachO/ICF.cpp b/MachO/ICF.cpp
index 57e1da3..6c3a565 100644
--- a/MachO/ICF.cpp
+++ b/MachO/ICF.cpp

@@ -180,8 +180,31 @@
     }
     return isecA->icfEqClass[icfPass % 2] == isecB->icfEqClass[icfPass % 2];
   };
-  return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(),
-                    f);
+  if (!std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), f))
+    return false;
+
+  // If there are symbols with associated unwind info, check that the unwind
+  // info matches. For simplicity, we only handle the case where there are only
+  // symbols at offset zero within the section (which is typically the case with
+  // .subsections_via_symbols.)
+  auto hasCU = [](Defined *d) { return d->compactUnwind; };
+  auto itA = std::find_if(ia->symbols.begin(), ia->symbols.end(), hasCU);
+  auto itB = std::find_if(ib->symbols.begin(), ib->symbols.end(), hasCU);
+  if (itA == ia->symbols.end())
+    return itB == ib->symbols.end();
+  if (itB == ib->symbols.end())
+    return false;
+  const Defined *da = *itA;
+  const Defined *db = *itB;
+  if (da->compactUnwind->icfEqClass[icfPass % 2] !=
+          db->compactUnwind->icfEqClass[icfPass % 2] ||
+      da->value != 0 || db->value != 0)
+    return false;
+  auto isZero = [](Defined *d) { return d->value == 0; };
+  return std::find_if_not(std::next(itA), ia->symbols.end(), isZero) ==
+             ia->symbols.end() &&
+         std::find_if_not(std::next(itB), ib->symbols.end(), isZero) ==
+             ib->symbols.end();
 }
 
 // Find the first InputSection after BEGIN whose equivalence class differs
@@ -337,18 +360,14 @@
     // FIXME: consider non-code __text sections as hashable?
     bool isHashable = (isCodeSection(isec) || isCfStringSection(isec)) &&
                       !isec->shouldOmitFromOutput() && isec->isHashableForICF();
-    // ICF can't fold functions with unwind info
-    if (isHashable)
-      for (Defined *d : isec->symbols)
-        if (d->compactUnwind) {
-          isHashable = false;
-          break;
-        }
-
-    if (isHashable)
+    if (isHashable) {
       hashable.push_back(isec);
-    else
+      for (Defined *d : isec->symbols)
+        if (d->compactUnwind)
+          hashable.push_back(d->compactUnwind);
+    } else {
       isec->icfEqClass[0] = ++icfUniqueID;
+    }
   }
   parallelForEach(hashable,
                   [](ConcatInputSection *isec) { isec->hashForICF(); });

diff --git a/MachO/InputFiles.cpp b/MachO/InputFiles.cpp
index 6e6a61e..1b23392 100644
--- a/MachO/InputFiles.cpp
+++ b/MachO/InputFiles.cpp

@@ -904,16 +904,31 @@
 
   for (SubsectionEntry &entry : *cuSubsecMap) {
     ConcatInputSection *isec = cast<ConcatInputSection>(entry.isec);
+    // Hack!! Since each CUE contains a different function address, if ICF
+    // operated naively and compared the entire contents of each CUE, entries
+    // with identical unwind info but belonging to different functions would
+    // never be considered equivalent. To work around this problem, we slice
+    // away the function address here. (Note that we do not adjust the offsets
+    // of the corresponding relocations.) We rely on `relocateCompactUnwind()`
+    // to correctly handle these truncated input sections.
+    isec->data = isec->data.slice(target->wordSize);
+
     ConcatInputSection *referentIsec;
-    for (const Reloc &r : isec->relocs) {
-      if (r.offset != 0)
+    for (auto it = isec->relocs.begin(); it != isec->relocs.end();) {
+      Reloc &r = *it;
+      // We only wish to handle the relocation for CUE::functionAddress.
+      if (r.offset != 0) {
+        ++it;
         continue;
+      }
       uint64_t add = r.addend;
       if (auto *sym = cast_or_null<Defined>(r.referent.dyn_cast<Symbol *>())) {
         // Check whether the symbol defined in this file is the prevailing one.
         // Skip if it is e.g. a weak def that didn't prevail.
-        if (sym->getFile() != this)
+        if (sym->getFile() != this) {
+          ++it;
           continue;
+        }
         add += sym->value;
         referentIsec = cast<ConcatInputSection>(sym->isec);
       } else {
@@ -926,16 +941,23 @@
       // The functionAddress relocations are typically section relocations.
       // However, unwind info operates on a per-symbol basis, so we search for
       // the function symbol here.
-      auto it = llvm::lower_bound(
+      auto symIt = llvm::lower_bound(
           referentIsec->symbols, add,
           [](Defined *d, uint64_t add) { return d->value < add; });
       // The relocation should point at the exact address of a symbol (with no
       // addend).
-      if (it == referentIsec->symbols.end() || (*it)->value != add) {
+      if (symIt == referentIsec->symbols.end() || (*symIt)->value != add) {
         assert(referentIsec->wasCoalesced);
+        ++it;
         continue;
       }
-      (*it)->compactUnwind = isec;
+      (*symIt)->compactUnwind = isec;
+      // Since we've sliced away the functionAddress, we should remove the
+      // corresponding relocation too. Given that clang emits relocations in
+      // reverse order of address, this relocation should be at the end of the
+      // vector for most of our input object files, so this is typically an O(1)
+      // operation.
+      it = isec->relocs.erase(it);
     }
   }
 }

diff --git a/MachO/UnwindInfoSection.cpp b/MachO/UnwindInfoSection.cpp
index c499826..06f8d44 100644
--- a/MachO/UnwindInfoSection.cpp
+++ b/MachO/UnwindInfoSection.cpp

@@ -295,7 +295,8 @@
       return;
 
     // Write the rest of the CUE.
-    memcpy(buf, d->compactUnwind->data.data(), d->compactUnwind->data.size());
+    memcpy(buf + sizeof(Ptr), d->compactUnwind->data.data(),
+           d->compactUnwind->data.size());
     for (const Reloc &r : d->compactUnwind->relocs) {
       uint64_t referentVA = 0;
       if (auto *referentSym = r.referent.dyn_cast<Symbol *>()) {
@@ -309,9 +310,8 @@
         }
       } else {
         auto *referentIsec = r.referent.get<InputSection *>();
-        ConcatInputSection *concatIsec = checkTextSegment(referentIsec);
-        if (!concatIsec->shouldOmitFromOutput())
-          referentVA = referentIsec->getVA(r.addend);
+        checkTextSegment(referentIsec);
+        referentVA = referentIsec->getVA(r.addend);
       }
       writeAddress(buf + r.offset, referentVA, r.length);
     }

diff --git a/test/MachO/icf.s b/test/MachO/icf.s
index 2fc2405..5e61e94 100644
--- a/test/MachO/icf.s
+++ b/test/MachO/icf.s

@@ -32,8 +32,9 @@
 # CHECK: [[#%x,CALL_RECURSIVE_2:]]          l     F __TEXT,__text _call_recursive_2
 # CHECK: [[#%x,CHECK_LENGTH_1:]]            l     F __TEXT,__text _check_length_1
 # CHECK: [[#%x,CHECK_LENGTH_2:]]            l     F __TEXT,__text _check_length_2
-# CHECK: [[#%x,HAS_UNWIND_1:]]              l     F __TEXT,__text _has_unwind_1
+# CHECK: [[#%x,HAS_UNWIND_2:]]              l     F __TEXT,__text _has_unwind_1
 # CHECK: [[#%x,HAS_UNWIND_2:]]              l     F __TEXT,__text _has_unwind_2
+# CHECK: [[#%x,HAS_UNWIND_3:]]              l     F __TEXT,__text _has_unwind_3
 # CHECK: [[#%x,MUTALLY_RECURSIVE_2:]]       l     F __TEXT,__text _mutually_recursive_1
 # CHECK: [[#%x,MUTALLY_RECURSIVE_2:]]       l     F __TEXT,__text _mutually_recursive_2
 # CHECK: [[#%x,INIT_2:]]                    l     F __TEXT,__text _init_1
@@ -64,8 +65,9 @@
 # CHECK: callq 0x[[#%x,CALL_RECURSIVE_2:]]          <_call_recursive_2>
 # CHECK: callq 0x[[#%x,CHECK_LENGTH_1:]]            <_check_length_1>
 # CHECK: callq 0x[[#%x,CHECK_LENGTH_2:]]            <_check_length_2>
-# CHECK: callq 0x[[#%x,HAS_UNWIND_1:]]              <_has_unwind_1>
 # CHECK: callq 0x[[#%x,HAS_UNWIND_2:]]              <_has_unwind_2>
+# CHECK: callq 0x[[#%x,HAS_UNWIND_2:]]              <_has_unwind_2>
+# CHECK: callq 0x[[#%x,HAS_UNWIND_3:]]              <_has_unwind_3>
 # CHECK: callq 0x[[#%x,MUTALLY_RECURSIVE_2:]]       <_mutually_recursive_2>
 # CHECK: callq 0x[[#%x,MUTALLY_RECURSIVE_2:]]       <_mutually_recursive_2>
 ## FIXME: Mutually-recursive functions with identical bodies (see below)
@@ -170,8 +172,7 @@
 _my_personality:
   mov $1345, %rax
 
-## No fold: functions have unwind info.
-## FIXME: Fold functions with identical unwind info.
+## Functions with identical unwind info should be folded.
 _has_unwind_1:
   .cfi_startproc
   .cfi_personality 155, _my_personality
@@ -186,6 +187,15 @@
   ret
   .cfi_endproc
 
+## This function has different unwind info from the preceding two, and therefore
+## should not be folded.
+_has_unwind_3:
+  .cfi_startproc
+  .cfi_personality 155, _my_personality
+  .cfi_def_cfa_offset 8
+  ret
+  .cfi_endproc
+
 ## Fold: Mutually-recursive functions with symmetric bodies
 _mutually_recursive_1:
   callq _mutually_recursive_1 # call myself
@@ -253,6 +263,7 @@
   callq _check_length_2
   callq _has_unwind_1
   callq _has_unwind_2
+  callq _has_unwind_3
   callq _mutually_recursive_1
   callq _mutually_recursive_2
   callq _asymmetric_recursive_1
commit	98107511e6eea7c1187e20a3fd9c93c283cd7fb3	[log] [tgz]
author	Jez Ng <jezng@fb.com>	Fri Nov 12 16:01:25 2021 -0500
committer	Copybara-Service <copybara-worker@google.com>	Fri Nov 12 13:05:52 2021 -0800
tree	775645bb1f0474ccdd700fb81c97f34cf318743b
parent	b5ae575c95f8e6e9553be17cbdfbb3a8bda60dc7 [diff]