[MemProf] Fix the option to disable memprof ICP (#112917)

The -enable-memprof-indirect-call-support meant to guard the recently
added memprof ICP support was not used in enough places. Specifically,
it was not checked in mayHaveMemprofSummary, which is called from the
ThinLTO backend applyImports. This led to failures when checking the
callsite records, as we incorrectly expected records for indirect calls.

Fix the option to be checked in all necessary locations, and add
testing.
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 1bd9ee6..0f4e85f 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -503,6 +503,10 @@
       if (!IsThinLTO)
         continue;
 
+      // Skip indirect calls if we haven't enabled memprof ICP.
+      if (!CalledFunction && !EnableMemProfIndirectCallSupport)
+        continue;
+
       // Ensure we keep this analysis in sync with the handling in the ThinLTO
       // backend (see MemProfContextDisambiguation::applyImport). Save this call
       // so that we can skip it in checking the reverse case later.
@@ -561,7 +565,8 @@
           auto CalleeValueInfo =
               Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
           Callsites.push_back({CalleeValueInfo, StackIdIndices});
-        } else if (EnableMemProfIndirectCallSupport) {
+        } else {
+          assert(EnableMemProfIndirectCallSupport);
           // For indirect callsites, create multiple Callsites, one per target.
           // This enables having a different set of clone versions per target,
           // and we will apply the cloning decisions while speculatively
@@ -1223,6 +1228,9 @@
     if (CI && CalledFunction->isIntrinsic())
       return false;
   } else {
+    // Skip indirect calls if we haven't enabled memprof ICP.
+    if (!EnableMemProfIndirectCallSupport)
+      return false;
     // Skip inline assembly calls.
     if (CI && CI->isInlineAsm())
       return false;
diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll
index 5c6d4e3..e19c56b 100644
--- a/llvm/test/ThinLTO/X86/memprof-icp.ll
+++ b/llvm/test/ThinLTO/X86/memprof-icp.ll
@@ -76,17 +76,18 @@
 ;; for each profiled target in the VP metadata. They will have the same stackIds
 ;; since the debug information for the callsite is the same.
 ; RUN: llvm-dis %t/foo.o -o - | FileCheck %s --check-prefix=CALLSITES
-; CALLSITES: gv: (name: "_Z3fooR2B0j", {{.*}} callsites: ((callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)), (callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)))
+; CALLSITES: gv: (name: "_Z3fooR2B0j", {{.*}} callsites: ((callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235)), (callee: ^{{[0-9]+}}, clones: (0), stackIds: (16345663650247127235))
 
 ;; Make sure that we don't get the synthesized callsite records if the
 ;; -enable-memprof-indirect-call-support flag is false.
-; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=false -o - \
-; RUN: 	| llvm-dis -o - | FileCheck %s --implicit-check-not callsites
+; RUN: opt -thinlto-bc %t/foo.ll -enable-memprof-indirect-call-support=false >%t/foo.noicp.o
+; RUN: llvm-dis %t/foo.noicp.o -o - | FileCheck %s --implicit-check-not "stackIds: (16345663650247127235)"
 
 ;; First perform in-process ThinLTO
 ; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \
 ; RUN:  -supports-hot-cold-new \
 ; RUN:  -r=%t/foo.o,_Z3fooR2B0j,plx \
+; RUN:  -r=%t/foo.o,_Z3xyzR2B0j, \
 ; RUN:  -r=%t/main.o,_Z3fooR2B0j, \
 ; RUN:  -r=%t/main.o,_Znwm, \
 ; RUN:  -r=%t/main.o,_ZdlPvm, \
@@ -116,6 +117,7 @@
 ; RUN:  -supports-hot-cold-new \
 ; RUN:  -thinlto-distributed-indexes \
 ; RUN:  -r=%t/foo.o,_Z3fooR2B0j,plx \
+; RUN:  -r=%t/foo.o,_Z3xyzR2B0j, \
 ; RUN:  -r=%t/main.o,_Z3fooR2B0j, \
 ; RUN:  -r=%t/main.o,_Znwm, \
 ; RUN:  -r=%t/main.o,_ZdlPvm, \
@@ -141,6 +143,36 @@
 ; RUN:  %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR \
 ; RUN:  --check-prefix=STATS-BE-DISTRIB --check-prefix=REMARKS-FOO
 
+;; Retry with the ICP-disabled object file, and make sure we disable it again
+;; so we don't look for the synthesized callsite records when applying imports.
+;; We should not get any cloning.
+; RUN: llvm-lto2 run %t/main.o %t/foo.noicp.o -enable-memprof-context-disambiguation \
+; RUN:	-enable-memprof-indirect-call-support=false \
+; RUN:  -supports-hot-cold-new \
+; RUN:  -r=%t/foo.noicp.o,_Z3fooR2B0j,plx \
+; RUN:  -r=%t/foo.noicp.o,_Z3xyzR2B0j, \
+; RUN:  -r=%t/main.o,_Z3fooR2B0j, \
+; RUN:  -r=%t/main.o,_Znwm, \
+; RUN:  -r=%t/main.o,_ZdlPvm, \
+; RUN:  -r=%t/main.o,_Z8externalPi, \
+; RUN:  -r=%t/main.o,main,plx \
+; RUN:  -r=%t/main.o,_ZN2B03barEj,plx \
+; RUN:  -r=%t/main.o,_ZN1B3barEj,plx \
+; RUN:  -r=%t/main.o,_ZTV1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS1B,plx \
+; RUN:  -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \
+; RUN:  -r=%t/main.o,_ZTS2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI2B0,plx \
+; RUN:  -r=%t/main.o,_ZTI1B,plx \
+; RUN:  -r=%t/main.o,_ZTV2B0,plx \
+; RUN:	-thinlto-threads=1 \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes \
+; RUN:  -pass-remarks=. -save-temps \
+; RUN:  -o %t.noicp.out 2>&1 | FileCheck %s --implicit-check-not "created clone"
+
+; RUN: llvm-dis %t.noicp.out.2.4.opt.bc -o - | FileCheck %s --implicit-check-not "_Z3fooR2B0j.memprof"
+
 ; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
 ; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1
 ; REMARKS-MAIN: created clone _ZN2B03barEj.memprof.1
@@ -215,15 +247,22 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+declare i32 @_Z3xyzR2B0j(ptr %b)
+
 define i32 @_Z3fooR2B0j(ptr %b) {
 entry:
   %0 = load ptr, ptr %b, align 8
   %call = tail call i32 %0(ptr null, i32 0), !prof !0, !callsite !1
+  ;; Add a dummy call to ensure that we have some callsite metadata,
+  ;; which triggers callsite record checking in the ThinLTO backend
+  ;; even with -enable-memprof-indirect-call-support=false.
+  %call2 = call i32 @_Z3xyzR2B0j(ptr null, i32 0), !callsite !2
   ret i32 0
 }
 
 !0 = !{!"VP", i32 0, i64 4, i64 4445083295448962937, i64 2, i64 -2718743882639408571, i64 2}
 !1 = !{i64 -2101080423462424381}
+!2 = !{i64 1234}
 
 ;--- main.ll
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"