[AMDGPU] Report only local per-function resource usage when object linking is enabled (#192594)

With object linking, the linker aggregates resource usage across TUs, so
compile-time pessimism and call-graph propagation duplicate the linker's
work or pollute its inputs.

In this mode, skip the per-callsite conservative bumps in
`AMDGPUResourceUsageAnalysis` and assign each resource symbol in
`AMDGPUMCResourceInfo` a concrete local constant instead of building
call-graph max/or expressions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 83012c4..e3a85de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUMCResourceInfo.h"
+#include "AMDGPUTargetMachine.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -267,6 +268,31 @@
 
   LLVM_DEBUG(dbgs() << "MCResUse: Gathering resource information for "
                     << FnSym->getName() << '\n');
+
+  auto SetToLocal = [&](int64_t Value, ResourceInfoKind RIK) {
+    MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext);
+    Sym->setVariableValue(MCConstantExpr::create(Value, OutContext));
+  };
+
+  // When object linking is enabled, the linker aggregates resource usage, so
+  // set every resource symbol to this function's concrete local value.
+  if (AMDGPUTargetMachine::EnableObjectLinking) {
+    LLVM_DEBUG(dbgs() << "MCResUse:   object linking enabled, no call-graph "
+                         "propagation; emitting local resource values only\n");
+    SetToLocal(FRI.NumVGPR, RIK_NumVGPR);
+    SetToLocal(FRI.NumAGPR, RIK_NumAGPR);
+    SetToLocal(FRI.NumExplicitSGPR, RIK_NumSGPR);
+    SetToLocal(FRI.NumNamedBarrier, RIK_NumNamedBarrier);
+    SetToLocal(FRI.PrivateSegmentSize, RIK_PrivateSegSize);
+    SetToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
+    SetToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
+    SetToLocal(FRI.HasDynamicallySizedStack,
+               ResourceInfoKind::RIK_HasDynSizedStack);
+    SetToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
+    SetToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
+    return;
+  }
+
   LLVM_DEBUG({
     if (!FRI.Callees.empty()) {
       dbgs() << "MCResUse: Callees:\n";
@@ -347,14 +373,6 @@
     Sym->setVariableValue(localConstExpr);
   }
 
-  auto SetToLocal = [&](int64_t LocalValue, ResourceInfoKind RIK) {
-    MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext);
-    LLVM_DEBUG(
-        dbgs() << "MCResUse:   " << Sym->getName() << ": Adding " << LocalValue
-               << ", no further propagation as indirect callee found within\n");
-    Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext));
-  };
-
   if (!FRI.HasIndirectCall) {
     assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC,
                            AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 5c1f596..3cb063e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -102,6 +102,10 @@
   /// transitive maximum or accumulative. For example, if A calls B and B's VGPR
   /// usage exceeds A's, A should be assigned B's VGPR usage. Furthermore,
   /// functions with indirect calls should be assigned the module level maximum.
+  ///
+  /// When object linking is enabled, all call-transitive propagation is
+  /// skipped and every resource symbol is set to a concrete per-function
+  /// value. Cross-TU aggregation is then the linker's responsibility.
   void gatherResourceInfo(
       const MachineFunction &MF,
       const AMDGPUResourceUsageAnalysisWrapperPass::FunctionResourceInfo &FRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 4e664e0..51bebef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -17,6 +17,7 @@
 
 #include "AMDGPUResourceUsageAnalysis.h"
 #include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -272,6 +273,15 @@
           Info.Callees.push_back(Callee);
 
         bool IsIndirect = !Callee || Callee->isDeclaration();
+        Info.HasIndirectCall |= IsIndirect;
+
+        // In object linking mode the linker has the full cross-TU view. It
+        // propagates resource usage across both direct calls to external
+        // declarations and true indirect calls. Skip the compile-time
+        // conservative assumptions so that the locally emitted metadata
+        // describes this function's own usage only.
+        if (AMDGPUTargetMachine::EnableObjectLinking)
+          continue;
 
         // FIXME: Call site could have norecurse on it
         if (!Callee || !Callee->doesNotRecurse()) {
@@ -301,7 +311,6 @@
           Info.UsesVCC = true;
           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
           Info.HasDynamicallySizedStack = true;
-          Info.HasIndirectCall = true;
         }
       }
     }
diff --git a/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll b/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll
new file mode 100644
index 0000000..95214bc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll
@@ -0,0 +1,109 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=asm < %s | FileCheck %s --check-prefix=OL
+
+declare void @extern_callee()
+
+define void @calls_extern() {
+  call void @extern_callee()
+  ret void
+}
+
+define void @calls_indirect(ptr %fptr) {
+  call void %fptr()
+  ret void
+}
+
+define void @calls_local() {
+  ret void
+}
+
+define amdgpu_kernel void @my_kernel(ptr %fptr) {
+  call void @calls_extern()
+  call void @calls_indirect(ptr %fptr)
+  call void @calls_local()
+  ret void
+}
+
+; COM: Default mode: direct-to-extern triggers the conservative "unknown
+; COM: callee" path. Register/stack-size symbols include the module-level
+; COM: sinks; boolean flags are all forced to 1; HasIndirectCall is set too
+; COM: (IsIndirect covers calls to declarations).
+; DEFAULT:       .set .Lcalls_extern.num_vgpr, max({{[0-9]+}}, amdgpu.max_num_vgpr)
+; DEFAULT:       .set .Lcalls_extern.num_agpr, max({{[0-9]+}}, amdgpu.max_num_agpr)
+; DEFAULT:       .set .Lcalls_extern.numbered_sgpr, max({{[0-9]+}}, amdgpu.max_num_sgpr)
+; DEFAULT:       .set .Lcalls_extern.num_named_barrier, max({{[0-9]+}}, amdgpu.max_num_named_barrier)
+; DEFAULT:       .set .Lcalls_extern.uses_vcc, 1
+; DEFAULT:       .set .Lcalls_extern.uses_flat_scratch, 1
+; DEFAULT:       .set .Lcalls_extern.has_dyn_sized_stack, 1
+; DEFAULT:       .set .Lcalls_extern.has_recursion, 1
+; DEFAULT:       .set .Lcalls_extern.has_indirect_call, 1
+
+; COM: Object linking: the same function reports only its own local usage.
+; COM: The sinks drop out of the register/stack-size expressions and the
+; COM: pessimized boolean flags collapse to the true local values (UsesVCC is
+; COM: still 1 here because the call-site lowering on gfx900 genuinely uses
+; COM: VCC).
+; OL:            .set .Lcalls_extern.num_vgpr, {{[0-9]+}}
+; OL:            .set .Lcalls_extern.num_agpr, {{[0-9]+}}
+; OL:            .set .Lcalls_extern.numbered_sgpr, {{[0-9]+}}
+; OL:            .set .Lcalls_extern.num_named_barrier, {{[0-9]+}}
+; OL:            .set .Lcalls_extern.uses_vcc, 1
+; OL:            .set .Lcalls_extern.uses_flat_scratch, 0
+; OL:            .set .Lcalls_extern.has_dyn_sized_stack, 0
+; OL:            .set .Lcalls_extern.has_recursion, 0
+; OL:            .set .Lcalls_extern.has_indirect_call, 1
+
+; COM: True indirect call: same DEFAULT-vs-OL behavior as the direct-to-extern
+; COM: case above. In DEFAULT mode all the flags are pessimized; with object
+; COM: linking they keep their local values and HasIndirectCall stays set (the
+; COM: linker sees the call site's typeid and address-taken set and propagates).
+; DEFAULT:       .set .Lcalls_indirect.uses_vcc, 1
+; DEFAULT:       .set .Lcalls_indirect.uses_flat_scratch, 1
+; DEFAULT:       .set .Lcalls_indirect.has_dyn_sized_stack, 1
+; DEFAULT:       .set .Lcalls_indirect.has_recursion, 1
+; DEFAULT:       .set .Lcalls_indirect.has_indirect_call, 1
+
+; OL:            .set .Lcalls_indirect.uses_vcc, 1
+; OL:            .set .Lcalls_indirect.uses_flat_scratch, 0
+; OL:            .set .Lcalls_indirect.has_dyn_sized_stack, 0
+; OL:            .set .Lcalls_indirect.has_recursion, 0
+; OL:            .set .Lcalls_indirect.has_indirect_call, 1
+
+; COM: Baseline: a function that makes no calls at all reports the same
+; COM: all-zero local flags in both modes.
+; DEFAULT:       .set .Lcalls_local.uses_vcc, 0
+; DEFAULT:       .set .Lcalls_local.uses_flat_scratch, 0
+; DEFAULT:       .set .Lcalls_local.has_dyn_sized_stack, 0
+; DEFAULT:       .set .Lcalls_local.has_recursion, 0
+; DEFAULT:       .set .Lcalls_local.has_indirect_call, 0
+
+; OL:            .set .Lcalls_local.uses_vcc, 0
+; OL:            .set .Lcalls_local.uses_flat_scratch, 0
+; OL:            .set .Lcalls_local.has_dyn_sized_stack, 0
+; OL:            .set .Lcalls_local.has_recursion, 0
+; OL:            .set .Lcalls_local.has_indirect_call, 0
+
+; COM: Kernel side of the DEFAULT-vs-OL comparison. DEFAULT mode emits
+; COM: call-graph-propagation expressions (max()/or() over every callee's
+; COM: symbols) so the kernel picks up its callees' pessimized values; object
+; COM: linking emits concrete literals and leaves cross-TU aggregation to the
+; COM: linker.
+; DEFAULT:       .set .Lmy_kernel.num_vgpr, max({{[0-9]+}}, .Lcalls_extern.num_vgpr, .Lcalls_indirect.num_vgpr, .Lcalls_local.num_vgpr)
+; DEFAULT:       .set .Lmy_kernel.num_agpr, max({{[0-9]+}}, .Lcalls_extern.num_agpr, .Lcalls_indirect.num_agpr, .Lcalls_local.num_agpr)
+; DEFAULT:       .set .Lmy_kernel.num_named_barrier, max({{[0-9]+}}, .Lcalls_extern.num_named_barrier, .Lcalls_indirect.num_named_barrier, .Lcalls_local.num_named_barrier)
+; DEFAULT:       .set .Lmy_kernel.private_seg_size, {{[0-9]+}}+max(.Lcalls_extern.private_seg_size, .Lcalls_indirect.private_seg_size, .Lcalls_local.private_seg_size)
+; DEFAULT:       .set .Lmy_kernel.uses_vcc, or({{[0-9]+}}, .Lcalls_extern.uses_vcc, .Lcalls_indirect.uses_vcc, .Lcalls_local.uses_vcc)
+; DEFAULT:       .set .Lmy_kernel.uses_flat_scratch, or({{[0-9]+}}, .Lcalls_extern.uses_flat_scratch, .Lcalls_indirect.uses_flat_scratch, .Lcalls_local.uses_flat_scratch)
+; DEFAULT:       .set .Lmy_kernel.has_dyn_sized_stack, or({{[0-9]+}}, .Lcalls_extern.has_dyn_sized_stack, .Lcalls_indirect.has_dyn_sized_stack, .Lcalls_local.has_dyn_sized_stack)
+; DEFAULT:       .set .Lmy_kernel.has_recursion, or({{[0-9]+}}, .Lcalls_extern.has_recursion, .Lcalls_indirect.has_recursion, .Lcalls_local.has_recursion)
+; DEFAULT:       .set .Lmy_kernel.has_indirect_call, or({{[0-9]+}}, .Lcalls_extern.has_indirect_call, .Lcalls_indirect.has_indirect_call, .Lcalls_local.has_indirect_call)
+
+; OL:            .set .Lmy_kernel.num_vgpr, {{[0-9]+}}
+; OL:            .set .Lmy_kernel.num_agpr, {{[0-9]+}}
+; OL:            .set .Lmy_kernel.num_named_barrier, {{[0-9]+}}
+; OL:            .set .Lmy_kernel.private_seg_size, {{[0-9]+}}
+; OL:            .set .Lmy_kernel.uses_vcc, {{[01]}}
+; OL:            .set .Lmy_kernel.uses_flat_scratch, {{[01]}}
+; OL:            .set .Lmy_kernel.has_dyn_sized_stack, 0
+; OL:            .set .Lmy_kernel.has_recursion, 0
+; OL:            .set .Lmy_kernel.has_indirect_call, 0