[AA] No synchronization effects for never-escaping identified local (#193939)

Fences and other synchronizing operations (such as atomic accesses
stronger than monotonic) are modelled as reading and writing all memory,
in order to enforce their implied ordering constraints.

Currently, this happens even for identified function locals that do not
escape. This patch excludes those objects.

Notably, we can *not* reason based on captures-before here, because the
synchronizing operation still has an effect even if the object only
escapes *later*.

The hope here is that with this restriction in place, it may be viable
to respect potential synchronization inside non-nosync function calls.
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index 1449a54..ffb2dc0 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -458,12 +458,34 @@
 // Helper method implementation
 //===----------------------------------------------------------------------===//
 
+/// Get ModRefInfo for a synchronizing operation, such as a fence or an
+/// atomic load/store that is stronger than monotonic.
+static ModRefInfo getSyncEffects(AAResults *AA, const MemoryLocation &Loc,
+                                 AAQueryInfo &AAQI) {
+  if (!Loc.Ptr)
+    return ModRefInfo::ModRef;
+
+  // If the location is *never* captured, it cannot be affected by
+  // synchronizing operations. However, we cannot ignore locations that are
+  // only captured after the operation, as the synchronization may still have
+  // an effect if the object is only captured *later*. As such, set I to null
+  // and ReturnCaptures to true here.
+  const Value *Obj = getUnderlyingObject(Loc.Ptr);
+  if (capturesNothing(AAQI.CA->getCapturesBefore(
+          Obj, /*I=*/nullptr, /*OrAt=*/true, /*ReturnCaptures=*/true)))
+    return ModRefInfo::NoModRef;
+
+  // If Loc is a constant memory location, the synchronization operation
+  // definitely could not modify it.
+  return AA->getModRefInfoMask(Loc);
+}
+
 ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
                                     const MemoryLocation &Loc,
                                     AAQueryInfo &AAQI) {
   // Be conservative in the face of atomic.
   if (isStrongerThan(L->getOrdering(), AtomicOrdering::Unordered))
-    return ModRefInfo::ModRef;
+    return getSyncEffects(this, Loc, AAQI);
 
   // If the load address doesn't alias the given address, it doesn't read
   // or write the specified memory.
@@ -481,7 +503,7 @@
                                     AAQueryInfo &AAQI) {
   // Be conservative in the face of atomic.
   if (isStrongerThan(S->getOrdering(), AtomicOrdering::Unordered))
-    return ModRefInfo::ModRef;
+    return getSyncEffects(this, Loc, AAQI);
 
   if (Loc.Ptr) {
     AliasResult AR = alias(MemoryLocation::get(S), Loc, AAQI, S);
@@ -515,14 +537,9 @@
         return ModRefInfo::NoModRef;
     }
 
-    // Apply the ModRef mask. This ensures that if Loc is a constant memory
-    // location, we take into account the fact that the fence definitely could
-    // not modify the memory location.
-    if (!isNoModRef(Result))
-      Result &= getModRefInfoMask(Loc);
-
-    return Result;
+    return Result & getSyncEffects(this, Loc, AAQI);
   }
+
   return ModRefInfo::ModRef;
 }
 
@@ -576,7 +593,7 @@
                                     AAQueryInfo &AAQI) {
   // Acquire/Release cmpxchg has properties that matter for arbitrary addresses.
   if (isStrongerThanMonotonic(CX->getSuccessOrdering()))
-    return ModRefInfo::ModRef;
+    return getSyncEffects(this, Loc, AAQI);
 
   if (Loc.Ptr) {
     AliasResult AR = alias(MemoryLocation::get(CX), Loc, AAQI, CX);
@@ -594,7 +611,7 @@
                                     AAQueryInfo &AAQI) {
   // Acquire/Release atomicrmw has properties that matter for arbitrary addresses.
   if (isStrongerThanMonotonic(RMW->getOrdering()))
-    return ModRefInfo::ModRef;
+    return getSyncEffects(this, Loc, AAQI);
 
   if (Loc.Ptr) {
     AliasResult AR = alias(MemoryLocation::get(RMW), Loc, AAQI, RMW);
diff --git a/llvm/test/Analysis/BasicAA/atomics.ll b/llvm/test/Analysis/BasicAA/atomics.ll
index db0417c..1101466 100644
--- a/llvm/test/Analysis/BasicAA/atomics.ll
+++ b/llvm/test/Analysis/BasicAA/atomics.ll
@@ -8,29 +8,29 @@
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %1 = atomicrmw add ptr %x, i32 1 monotonic, align 4
 ; CHECK:  NoModRef:  Ptr: i32* %a	<->  %2 = cmpxchg ptr %x, i32 0, i32 1 monotonic monotonic, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %2 = cmpxchg ptr %x, i32 0, i32 1 monotonic monotonic, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %3 = load atomic i32, ptr %x monotonic, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %3 = load atomic i32, ptr %x monotonic, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %3 = load atomic i32, ptr %x monotonic, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x monotonic, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x monotonic, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  store atomic i32 0, ptr %x monotonic, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  fence release
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  fence release
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  fence release
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %4 = atomicrmw add ptr %x, i32 1 acq_rel, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %5 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %6 = load atomic i32, ptr %x acquire, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %6 = load atomic i32, ptr %x acquire, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %6 = load atomic i32, ptr %x acquire, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x release, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x release, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  store atomic i32 0, ptr %x release, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  fence seq_cst
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  fence seq_cst
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  fence seq_cst
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %7 = atomicrmw add ptr %x, i32 1 seq_cst, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %8 = cmpxchg ptr %x, i32 0, i32 1 seq_cst seq_cst, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %9 = load atomic i32, ptr %x seq_cst, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %9 = load atomic i32, ptr %x seq_cst, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %9 = load atomic i32, ptr %x seq_cst, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x seq_cst, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x seq_cst, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  store atomic i32 0, ptr %x seq_cst, align 4
 define void @alloca_no_escape(ptr %x) {
   %a = alloca i32
@@ -83,15 +83,15 @@
 }
 
 ; CHECK-LABEL: Function: noalias_no_escape:
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  fence release
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  fence release
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  fence release
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %3 = load atomic i32, ptr %x acquire, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %3 = load atomic i32, ptr %x acquire, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %3 = load atomic i32, ptr %x acquire, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x release, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x release, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  store atomic i32 0, ptr %x release, align 4
 define void @noalias_no_escape(ptr noalias %a, ptr %x) {
   store i32 0, ptr %a
@@ -125,21 +125,23 @@
   load atomic i32, ptr %x acquire, align 4
   store atomic i32 0, ptr %x release, align 4
 
+  call void @escape(ptr %a)
+
   ret void
 }
 
 ; CHECK-LABEL: Function: malloc_no_escape:
 ; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %a = call ptr @malloc(i64 4)
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %a = call ptr @malloc(i64 4)
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  fence release
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  fence release
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  fence release
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %1 = atomicrmw add ptr %x, i32 1 acq_rel, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %2 = cmpxchg ptr %x, i32 0, i32 1 acq_rel monotonic, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  %3 = load atomic i32, ptr %x acquire, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  %3 = load atomic i32, ptr %x acquire, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  %3 = load atomic i32, ptr %x acquire, align 4
-; CHECK:  Both ModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x release, align 4
+; CHECK:  NoModRef:  Ptr: i32* %a	<->  store atomic i32 0, ptr %x release, align 4
 ; CHECK:  Both ModRef:  Ptr: i32* %x	<->  store atomic i32 0, ptr %x release, align 4
 define void @malloc_no_escape(ptr %x) {
   %a = call ptr @malloc(i64 4)
diff --git a/llvm/test/Analysis/MemorySSA/atomic-clobber.ll b/llvm/test/Analysis/MemorySSA/atomic-clobber.ll
index 326ec8b..86708ec 100644
--- a/llvm/test/Analysis/MemorySSA/atomic-clobber.ll
+++ b/llvm/test/Analysis/MemorySSA/atomic-clobber.ll
@@ -106,7 +106,7 @@
 ; If AA gets more aggressive, we can find another way.
 ;
 ; CHECK-LABEL: define void @check_aa_is_sane
-define void @check_aa_is_sane(ptr noalias %a, ptr noalias %b) {
+define void @check_aa_is_sane(ptr noalias %a, ptr %b) {
 ; CHECK: 1 = MemoryDef(liveOnEntry)
 ; CHECK-NEXT: cmpxchg ptr %a, i32 0, i32 1 acquire acquire
   cmpxchg ptr %a, i32 0, i32 1 acquire acquire
diff --git a/llvm/test/Transforms/DeadStoreElimination/fence.ll b/llvm/test/Transforms/DeadStoreElimination/fence.ll
index b619b00..3c02d71 100644
--- a/llvm/test/Transforms/DeadStoreElimination/fence.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/fence.ll
@@ -54,7 +54,6 @@
   ret void
 }
 
-; TODO:
 ; We DSE stack alloc'ed and byval locations, in the presence of fences.
 ; Fence does not make an otherwise thread local store visible.
 ; Right now the DSE in presence of fence is only done in end blocks (with no successors),
@@ -63,7 +62,6 @@
 define void @test3(ptr byval(i32) %addr.i) {
 ; CHECK-LABEL: define void @test3(
 ; CHECK-SAME: ptr byval(i32) [[ADDR_I:%.*]]) {
-; CHECK-NEXT:    store i32 5, ptr [[ADDR_I]], align 4
 ; CHECK-NEXT:    fence release
 ; CHECK-NEXT:    ret void
 ;
@@ -76,13 +74,11 @@
 
 declare noalias ptr @malloc(i32)
 
-; TODO:
 ; DSE of stores in locations allocated through library calls.
 define void @test_nocapture() {
 ; CHECK-LABEL: define void @test_nocapture() {
 ; CHECK-NEXT:    [[M:%.*]] = call ptr @malloc(i32 24)
 ; CHECK-NEXT:    call void @foo(ptr [[M]])
-; CHECK-NEXT:    store i8 4, ptr [[M]], align 1
 ; CHECK-NEXT:    fence release
 ; CHECK-NEXT:    ret void
 ;
@@ -93,14 +89,10 @@
   ret void
 }
 
-
-; TODO:
 ; This is a full fence, but it does not make a thread local store visible.
 ; We can DSE the store in presence of the fence.
 define void @fence_seq_cst() {
 ; CHECK-LABEL: define void @fence_seq_cst() {
-; CHECK-NEXT:    [[P1:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 0, ptr [[P1]], align 4
 ; CHECK-NEXT:    fence seq_cst
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/GVN/fence.ll b/llvm/test/Transforms/GVN/fence.ll
index f2b1538..16c6b51 100644
--- a/llvm/test/Transforms/GVN/fence.ll
+++ b/llvm/test/Transforms/GVN/fence.ll
@@ -37,9 +37,9 @@
 ; ordering property (though it is that too), but a liveness
 ; property.  We expect to eventually see the value of store by
 ; another thread when spinning on that location.
-define i32 @test3(ptr noalias %addr.i, ptr noalias %otheraddr) {
+define i32 @test3(ptr %addr.i) {
 ; CHECK-LABEL: define i32 @test3
-; CHECK-SAME: (ptr noalias [[ADDR_I:%.*]], ptr noalias [[OTHERADDR:%.*]]) {
+; CHECK-SAME: (ptr [[ADDR_I:%.*]]) {
 ; CHECK-NEXT:    fence acquire
 ; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[ADDR_I]], align 4
 ; CHECK-NEXT:    fence acquire
diff --git a/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll b/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll
index f4a4155..fc85048 100644
--- a/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll
+++ b/llvm/test/Transforms/GVN/simplify-icf-cache-invalidation.ll
@@ -28,8 +28,6 @@
 ; CHECK-NEXT:    br label %[[BB9]]
 ; CHECK:       [[BB9]]:
 ; CHECK-NEXT:    tail call void @quux(ptr [[ARG]], i1 [[ARG2]])
-; CHECK-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP]], align 8
-; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq ptr [[TMP17]], null
 ; CHECK-NEXT:    ret void
 ;
 bb:
diff --git a/llvm/test/Transforms/LICM/atomics.ll b/llvm/test/Transforms/LICM/atomics.ll
index 2b3435b..1d29b03 100644
--- a/llvm/test/Transforms/LICM/atomics.ll
+++ b/llvm/test/Transforms/LICM/atomics.ll
@@ -239,6 +239,7 @@
 ; CHECK-LABEL: define i32 @test7b(
 ; CHECK-SAME: ptr noalias captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]], ptr noalias captures(none) [[Z:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    store i32 5, ptr [[X]], align 4
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[VALA:%.*]] = load atomic i32, ptr [[Y]] monotonic, align 4
@@ -247,7 +248,6 @@
 ; CHECK:       [[END]]:
 ; CHECK-NEXT:    [[VALA_LCSSA1:%.*]] = phi i32 [ [[VALA]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[VALA_LCSSA:%.*]] = phi i32 [ [[VALA]], %[[LOOP]] ]
-; CHECK-NEXT:    store i32 5, ptr [[X]], align 4
 ; CHECK-NEXT:    store atomic i32 [[VALA_LCSSA1]], ptr [[Z]] unordered, align 4
 ; CHECK-NEXT:    ret i32 [[VALA_LCSSA]]
 ;
@@ -266,9 +266,9 @@
 }
 
 
-define i32 @test8(ptr nocapture noalias %x, ptr nocapture %y) {
+define i32 @test8(ptr nocapture %x, ptr nocapture noalias %y) {
 ; CHECK-LABEL: define i32 @test8(
-; CHECK-SAME: ptr noalias captures(none) [[X:%.*]], ptr captures(none) [[Y:%.*]]) {
+; CHECK-SAME: ptr captures(none) [[X:%.*]], ptr noalias captures(none) [[Y:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]: