[AMDGPU] Report only local per-function resource usage when object linking is enabled (#192594)
With object linking, the linker aggregates resource usage across TUs, so
compile-time pessimism and call-graph propagation duplicate the linker's
work or pollute its inputs.
In this mode, skip the per-callsite conservative bumps in
`AMDGPUResourceUsageAnalysis` and assign each resource symbol in
`AMDGPUMCResourceInfo` a concrete local constant instead of building
call-graph max/or expressions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 83012c4..e3a85de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCResourceInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -267,6 +268,31 @@
LLVM_DEBUG(dbgs() << "MCResUse: Gathering resource information for "
<< FnSym->getName() << '\n');
+
+ auto SetToLocal = [&](int64_t Value, ResourceInfoKind RIK) {
+ MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext);
+ Sym->setVariableValue(MCConstantExpr::create(Value, OutContext));
+ };
+
+ // When link-time object linking is enabled, set all resource symbols to
+ // concrete local values.
+ if (AMDGPUTargetMachine::EnableObjectLinking) {
+ LLVM_DEBUG(dbgs() << "MCResUse: object linking enabled, no call-graph "
+ "propagation; emitting local resource values only\n");
+ SetToLocal(FRI.NumVGPR, RIK_NumVGPR);
+ SetToLocal(FRI.NumAGPR, RIK_NumAGPR);
+ SetToLocal(FRI.NumExplicitSGPR, RIK_NumSGPR);
+ SetToLocal(FRI.NumNamedBarrier, RIK_NumNamedBarrier);
+ SetToLocal(FRI.PrivateSegmentSize, RIK_PrivateSegSize);
+ SetToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
+ SetToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
+ SetToLocal(FRI.HasDynamicallySizedStack,
+ ResourceInfoKind::RIK_HasDynSizedStack);
+ SetToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
+ SetToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
+ return;
+ }
+
LLVM_DEBUG({
if (!FRI.Callees.empty()) {
dbgs() << "MCResUse: Callees:\n";
@@ -347,14 +373,6 @@
Sym->setVariableValue(localConstExpr);
}
- auto SetToLocal = [&](int64_t LocalValue, ResourceInfoKind RIK) {
- MCSymbol *Sym = getSymbol(FnSym->getName(), RIK, OutContext);
- LLVM_DEBUG(
- dbgs() << "MCResUse: " << Sym->getName() << ": Adding " << LocalValue
- << ", no further propagation as indirect callee found within\n");
- Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext));
- };
-
if (!FRI.HasIndirectCall) {
assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC,
AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 5c1f596..3cb063e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -102,6 +102,10 @@
/// transitive maximum or accumulative. For example, if A calls B and B's VGPR
/// usage exceeds A's, A should be assigned B's VGPR usage. Furthermore,
/// functions with indirect calls should be assigned the module level maximum.
+ ///
+ /// When link-time object linking is enabled, skip all call-transitive
+ /// propagation and emit concrete per-function values for every resource
+ /// symbol. Cross-TU aggregation is then the linker's responsibility.
void gatherResourceInfo(
const MachineFunction &MF,
const AMDGPUResourceUsageAnalysisWrapperPass::FunctionResourceInfo &FRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 4e664e0..51bebef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -272,6 +273,15 @@
Info.Callees.push_back(Callee);
bool IsIndirect = !Callee || Callee->isDeclaration();
+ Info.HasIndirectCall |= IsIndirect;
+
+ // In object linking mode the linker has the full cross-TU view. It
+ // propagates resource usage across both direct calls to external
+ // declarations and true indirect calls. Skip the compile-time
+ // conservative assumptions so that the locally emitted metadata
+ // describes this function's own usage only.
+ if (AMDGPUTargetMachine::EnableObjectLinking)
+ continue;
// FIXME: Call site could have norecurse on it
if (!Callee || !Callee->doesNotRecurse()) {
@@ -301,7 +311,6 @@
Info.UsesVCC = true;
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
- Info.HasIndirectCall = true;
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll b/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll
new file mode 100644
index 0000000..95214bc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/object-linking-local-resources.ll
@@ -0,0 +1,109 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-object-linking -filetype=asm < %s | FileCheck %s --check-prefix=OL
+
+declare void @extern_callee()
+
+define void @calls_extern() {
+ call void @extern_callee()
+ ret void
+}
+
+define void @calls_indirect(ptr %fptr) {
+ call void %fptr()
+ ret void
+}
+
+define void @calls_local() {
+ ret void
+}
+
+define amdgpu_kernel void @my_kernel(ptr %fptr) {
+ call void @calls_extern()
+ call void @calls_indirect(ptr %fptr)
+ call void @calls_local()
+ ret void
+}
+
+; COM: Default mode: direct-to-extern triggers the conservative "unknown
+; COM: callee" path. Register/stack-size symbols include the module-level
+; COM: sinks; boolean flags are all forced to 1; HasIndirectCall is set too
+; COM: (IsIndirect covers calls to declarations).
+; DEFAULT: .set .Lcalls_extern.num_vgpr, max({{[0-9]+}}, amdgpu.max_num_vgpr)
+; DEFAULT: .set .Lcalls_extern.num_agpr, max({{[0-9]+}}, amdgpu.max_num_agpr)
+; DEFAULT: .set .Lcalls_extern.numbered_sgpr, max({{[0-9]+}}, amdgpu.max_num_sgpr)
+; DEFAULT: .set .Lcalls_extern.num_named_barrier, max({{[0-9]+}}, amdgpu.max_num_named_barrier)
+; DEFAULT: .set .Lcalls_extern.uses_vcc, 1
+; DEFAULT: .set .Lcalls_extern.uses_flat_scratch, 1
+; DEFAULT: .set .Lcalls_extern.has_dyn_sized_stack, 1
+; DEFAULT: .set .Lcalls_extern.has_recursion, 1
+; DEFAULT: .set .Lcalls_extern.has_indirect_call, 1
+
+; COM: Object linking: the same function reports only its own local usage.
+; COM: The sinks drop out of the register/stack-size expressions and the
+; COM: pessimized boolean flags collapse to the true local values (UsesVCC is
+; COM: still 1 here because the call-site lowering on gfx900 genuinely uses
+; COM: VCC).
+; OL: .set .Lcalls_extern.num_vgpr, {{[0-9]+}}
+; OL: .set .Lcalls_extern.num_agpr, {{[0-9]+}}
+; OL: .set .Lcalls_extern.numbered_sgpr, {{[0-9]+}}
+; OL: .set .Lcalls_extern.num_named_barrier, {{[0-9]+}}
+; OL: .set .Lcalls_extern.uses_vcc, 1
+; OL: .set .Lcalls_extern.uses_flat_scratch, 0
+; OL: .set .Lcalls_extern.has_dyn_sized_stack, 0
+; OL: .set .Lcalls_extern.has_recursion, 0
+; OL: .set .Lcalls_extern.has_indirect_call, 1
+
+; COM: True indirect call: same DEFAULT-vs-OL behavior as the direct-to-extern
+; COM: case above. In DEFAULT mode all the flags are pessimized; with object
+; COM: linking only the local HasIndirectCall and UsesVCC remain set (the
+; COM: linker sees the call site's typeid and address-taken set and propagates).
+; DEFAULT: .set .Lcalls_indirect.uses_vcc, 1
+; DEFAULT: .set .Lcalls_indirect.uses_flat_scratch, 1
+; DEFAULT: .set .Lcalls_indirect.has_dyn_sized_stack, 1
+; DEFAULT: .set .Lcalls_indirect.has_recursion, 1
+; DEFAULT: .set .Lcalls_indirect.has_indirect_call, 1
+
+; OL: .set .Lcalls_indirect.uses_vcc, 1
+; OL: .set .Lcalls_indirect.uses_flat_scratch, 0
+; OL: .set .Lcalls_indirect.has_dyn_sized_stack, 0
+; OL: .set .Lcalls_indirect.has_recursion, 0
+; OL: .set .Lcalls_indirect.has_indirect_call, 1
+
+; COM: Baseline: a function that makes no calls at all reports the same
+; COM: all-zero local flags in both modes.
+; DEFAULT: .set .Lcalls_local.uses_vcc, 0
+; DEFAULT: .set .Lcalls_local.uses_flat_scratch, 0
+; DEFAULT: .set .Lcalls_local.has_dyn_sized_stack, 0
+; DEFAULT: .set .Lcalls_local.has_recursion, 0
+; DEFAULT: .set .Lcalls_local.has_indirect_call, 0
+
+; OL: .set .Lcalls_local.uses_vcc, 0
+; OL: .set .Lcalls_local.uses_flat_scratch, 0
+; OL: .set .Lcalls_local.has_dyn_sized_stack, 0
+; OL: .set .Lcalls_local.has_recursion, 0
+; OL: .set .Lcalls_local.has_indirect_call, 0
+
+; COM: Kernel side of the DEFAULT-vs-OL comparison. DEFAULT mode emits
+; COM: call-graph-propagation expressions (max()/or() over every callee's
+; COM: symbols) so the kernel picks up its callees' pessimized values; object
+; COM: linking emits concrete literals and leaves cross-TU aggregation to the
+; COM: linker.
+; DEFAULT: .set .Lmy_kernel.num_vgpr, max({{[0-9]+}}, .Lcalls_extern.num_vgpr, .Lcalls_indirect.num_vgpr, .Lcalls_local.num_vgpr)
+; DEFAULT: .set .Lmy_kernel.num_agpr, max({{[0-9]+}}, .Lcalls_extern.num_agpr, .Lcalls_indirect.num_agpr, .Lcalls_local.num_agpr)
+; DEFAULT: .set .Lmy_kernel.num_named_barrier, max({{[0-9]+}}, .Lcalls_extern.num_named_barrier, .Lcalls_indirect.num_named_barrier, .Lcalls_local.num_named_barrier)
+; DEFAULT: .set .Lmy_kernel.private_seg_size, {{[0-9]+}}+max(.Lcalls_extern.private_seg_size, .Lcalls_indirect.private_seg_size, .Lcalls_local.private_seg_size)
+; DEFAULT: .set .Lmy_kernel.uses_vcc, or({{[0-9]+}}, .Lcalls_extern.uses_vcc, .Lcalls_indirect.uses_vcc, .Lcalls_local.uses_vcc)
+; DEFAULT: .set .Lmy_kernel.uses_flat_scratch, or({{[0-9]+}}, .Lcalls_extern.uses_flat_scratch, .Lcalls_indirect.uses_flat_scratch, .Lcalls_local.uses_flat_scratch)
+; DEFAULT: .set .Lmy_kernel.has_dyn_sized_stack, or({{[0-9]+}}, .Lcalls_extern.has_dyn_sized_stack, .Lcalls_indirect.has_dyn_sized_stack, .Lcalls_local.has_dyn_sized_stack)
+; DEFAULT: .set .Lmy_kernel.has_recursion, or({{[0-9]+}}, .Lcalls_extern.has_recursion, .Lcalls_indirect.has_recursion, .Lcalls_local.has_recursion)
+; DEFAULT: .set .Lmy_kernel.has_indirect_call, or({{[0-9]+}}, .Lcalls_extern.has_indirect_call, .Lcalls_indirect.has_indirect_call, .Lcalls_local.has_indirect_call)
+
+; OL: .set .Lmy_kernel.num_vgpr, {{[0-9]+}}
+; OL: .set .Lmy_kernel.num_agpr, {{[0-9]+}}
+; OL: .set .Lmy_kernel.num_named_barrier, {{[0-9]+}}
+; OL: .set .Lmy_kernel.private_seg_size, {{[0-9]+}}
+; OL: .set .Lmy_kernel.uses_vcc, {{[01]}}
+; OL: .set .Lmy_kernel.uses_flat_scratch, {{[01]}}
+; OL: .set .Lmy_kernel.has_dyn_sized_stack, 0
+; OL: .set .Lmy_kernel.has_recursion, 0
+; OL: .set .Lmy_kernel.has_indirect_call, 0