[CodeExtractor] fix use list iterator invalidation (#197986)

Fix a crash in HotColdSplit, which uses CodeExtractor to outline cold
code into separate functions. The crash stems from
https://github.com/llvm/llvm-project/pull/191824.

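When `CodeExtractor::insertReplacerCall` replaces uses of the outlined
function's return value, calling `replaceUsesOfWith` inside the loop over
`FuncRetVal->users()` invalidates the users iterator, causing the loop to
exit early without having replaced all of the original users of
`FuncRetVal`. Use `replaceUsesWithIf` instead, restricted to users that
live in the original function. Before this fix, the users that were left
behind made the verifier fail with: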
```
Referring to an instruction in another function!
  %s.sroa.0.0 = phi ptr [ %call.i, %codeRepl ], [ undef, %entry ]
LLVM ERROR: Broken module found, compilation aborted!
```

Reproducer: https://godbolt.org/z/G5qv35nnq
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 949aae3..7ffa998 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -2125,11 +2125,9 @@
   }
 
   if (FuncRetVal)
-    for (User *U : FuncRetVal->users()) {
-      Instruction *inst = cast<Instruction>(U);
-      if (inst->getParent()->getParent() == oldFunction)
-        inst->replaceUsesOfWith(FuncRetVal, ReplacerCall);
-    }
+    FuncRetVal->replaceUsesWithIf(ReplacerCall, [&](Use &U) {
+      return cast<Instruction>(U.getUser())->getFunction() == oldFunction;
+    });
 
   // Update the branch weights for the exit block.
   if (BFI && ExtractedFuncRetVals.size() > 1)
diff --git a/llvm/test/Transforms/HotColdSplit/issue-197982.ll b/llvm/test/Transforms/HotColdSplit/issue-197982.ll
new file mode 100644
index 0000000..fa0d4aa
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/issue-197982.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=hotcoldsplit < %s | FileCheck %s
+
+; Reproducer for https://github.com/llvm/llvm-project/issues/197982
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx14.0.0"
+
+define void @dtor(ptr captures(none) %this) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @dtor(
+; CHECK-SAME: ptr captures(none) [[THIS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[OLD_I:%.*]] = load ptr, ptr [[THIS]], align 8
+; CHECK-NEXT:    store ptr null, ptr [[THIS]], align 8
+; CHECK-NEXT:    store volatile i32 0, ptr [[OLD_I]], align 4
+; CHECK-NEXT:    ret void
+;
+  %old.i = load ptr, ptr %this, align 8
+  store ptr null, ptr %this, align 8
+  store volatile i32 0, ptr %old.i, align 4
+  ret void
+}
+
+define noalias noundef ptr @assign(ptr captures(none) %this) local_unnamed_addr #0 {
+; CHECK-LABEL: define noalias noundef ptr @assign(
+; CHECK-SAME: ptr captures(none) [[THIS:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = load volatile ptr, ptr null, align 4294967296
+; CHECK-NEXT:    [[OLD_I:%.*]] = load ptr, ptr [[THIS]], align 8
+; CHECK-NEXT:    store ptr [[CALL]], ptr [[THIS]], align 8
+; CHECK-NEXT:    store volatile i32 0, ptr [[OLD_I]], align 4
+; CHECK-NEXT:    ret ptr null
+;
+  %call = load volatile ptr, ptr null, align 4294967296
+  %old.i = load ptr, ptr %this, align 8
+  store ptr %call, ptr %this, align 8
+  store volatile i32 0, ptr %old.i, align 4
+  ret ptr null
+}
+
+define void @ctor(ptr writeonly captures(none) initializes((0, 8), (16, 24)) %this) local_unnamed_addr !prof !14 {
+; CHECK-LABEL: define void @ctor(
+; CHECK-SAME: ptr writeonly captures(none) initializes((0, 8), (16, 24)) [[THIS:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; CHECK-NEXT:    tail call void @ext()
+; CHECK-NEXT:    store ptr null, ptr [[THIS]], align 16
+; CHECK-NEXT:    [[P:%.*]] = getelementptr i8, ptr [[THIS]], i64 16
+; CHECK-NEXT:    store ptr null, ptr [[P]], align 16
+; CHECK-NEXT:    store i8 0, ptr [[THIS]], align 16
+; CHECK-NEXT:    tail call void @ext()
+; CHECK-NEXT:    store i32 0, ptr [[THIS]], align 16
+; CHECK-NEXT:    ret void
+;
+  tail call void @ext()
+  store ptr null, ptr %this, align 16
+  %p = getelementptr i8, ptr %this, i64 16
+  store ptr null, ptr %p, align 16
+  store i8 0, ptr %this, align 16
+  tail call void @ext()
+  store i32 0, ptr %this, align 16
+  ret void
+}
+
+declare void @ext() local_unnamed_addr
+
+define void @writeAsOperandInternal(ptr readonly captures(none) %WriterCtx, i1 %cond) local_unnamed_addr {
+; CHECK-LABEL: define void @writeAsOperandInternal(
+; CHECK-SAME: ptr readonly captures(none) [[WRITERCTX:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[COND]], label %[[CODEREPL:.*]], label %[[IF_END:.*]]
+; CHECK:       [[CODEREPL]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call ptr @writeAsOperandInternal.cold.1() #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    br label %[[IF_END]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[S_SROA_0_0:%.*]] = phi ptr [ [[TMP0]], %[[CODEREPL]] ], [ poison, %[[ENTRY]] ]
+; CHECK-NEXT:    [[P:%.*]] = phi ptr [ [[TMP0]], %[[CODEREPL]] ], [ null, %[[ENTRY]] ]
+; CHECK-NEXT:    [[FP:%.*]] = load ptr, ptr [[WRITERCTX]], align 8
+; CHECK-NEXT:    [[R:%.*]] = tail call i32 [[FP]](ptr [[P]], ptr null)
+; CHECK-NEXT:    store volatile i32 0, ptr [[S_SROA_0_0]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %cond, label %if.then, label %if.end
+
+if.then:
+  tail call void @ctor(ptr null)
+  %call.i = load volatile ptr, ptr null, align 4294967296
+  store volatile i32 0, ptr poison, align 4
+  br label %if.end
+
+if.end:
+  %s.sroa.0.0 = phi ptr [ %call.i, %if.then ], [ poison, %entry ]
+  %p = phi ptr [ %call.i, %if.then ], [ null, %entry ]
+  %fp = load ptr, ptr %WriterCtx, align 8
+  %r = tail call i32 %fp(ptr %p, ptr null)
+  store volatile i32 0, ptr %s.sroa.0.0, align 4
+  ret void
+}
+
+define void @reset(ptr captures(none) %this, ptr %p) local_unnamed_addr #0 {
+; CHECK-LABEL: define void @reset(
+; CHECK-SAME: ptr captures(none) [[THIS:%.*]], ptr [[P:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[OLD:%.*]] = load ptr, ptr [[THIS]], align 8
+; CHECK-NEXT:    store ptr [[P]], ptr [[THIS]], align 8
+; CHECK-NEXT:    store volatile i32 0, ptr [[OLD]], align 4
+; CHECK-NEXT:    ret void
+;
+  %old = load ptr, ptr %this, align 8
+  store ptr %p, ptr %this, align 8
+  store volatile i32 0, ptr %old, align 4
+  ret void
+}
+
+attributes #0 = { nofree norecurse nounwind memory(readwrite, target_mem: none) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 436329363820}
+!4 = !{!"MaxCount", i64 7373450795}
+!5 = !{!"MaxInternalCount", i64 502022957}
+!6 = !{!"MaxFunctionCount", i64 7373450795}
+!7 = !{!"NumCounts", i64 3070596}
+!8 = !{!"NumFunctions", i64 874091}
+!9 = !{!"IsPartialProfile", i64 0}
+!10 = !{!"PartialProfileRatio", double 0.000000e+00}
+!11 = !{!"DetailedSummary", !12}
+!12 = !{!13}
+!13 = !{i32 999999, i64 24, i32 423785}
+!14 = !{!"function_entry_count", i64 0}
+;.
+; CHECK: [[PROF14]] = !{!"function_entry_count", i64 0}
+;.
diff --git a/llvm/test/Transforms/HotColdSplit/single-output-multiple-users.ll b/llvm/test/Transforms/HotColdSplit/single-output-multiple-users.ll
new file mode 100644
index 0000000..3197331
--- /dev/null
+++ b/llvm/test/Transforms/HotColdSplit/single-output-multiple-users.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=hotcoldsplit -hotcoldsplit-threshold=0 < %s | FileCheck %s
+
+define void @two_phis_consume_cold_value(ptr %ctx, i1 %cond) {
+; CHECK-LABEL: define void @two_phis_consume_cold_value(
+; CHECK-SAME: ptr [[CTX:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 [[COND]], label %[[CODEREPL:.*]], label %[[EXIT:.*]]
+; CHECK:       [[CODEREPL]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = call ptr @two_phis_consume_cold_value.cold.1() #[[ATTR2:[0-9]+]]
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[A:%.*]] = phi ptr [ [[TMP0]], %[[CODEREPL]] ], [ poison, %[[ENTRY]] ]
+; CHECK-NEXT:    [[B:%.*]] = phi ptr [ [[TMP0]], %[[CODEREPL]] ], [ null, %[[ENTRY]] ]
+; CHECK-NEXT:    [[FP:%.*]] = load ptr, ptr [[CTX]], align 8
+; CHECK-NEXT:    [[R:%.*]] = tail call i32 [[FP]](ptr [[B]], ptr null)
+; CHECK-NEXT:    store volatile i32 0, ptr [[A]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %cond, label %cold, label %exit
+
+cold:
+  call void @sink()
+  %v = load volatile ptr, ptr null, align 8
+  br label %exit
+
+exit:
+  %a = phi ptr [ %v, %cold ], [ poison, %entry ]
+  %b = phi ptr [ %v, %cold ], [ null, %entry ]
+  %fp = load ptr, ptr %ctx, align 8
+  %r = tail call i32 %fp(ptr %b, ptr null)
+  store volatile i32 0, ptr %a, align 4
+  ret void
+}
+
+declare void @sink() cold