blob: 433144a60d12077d1a1bd3ec1290b1e82008df4c [file] [log] [blame]
//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CycleAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"
#define DEBUG_TYPE "amdgpu-attributor"
using namespace llvm;
// Command-line option "amdgpu-indirect-call-specialization-threshold"; its
// default value is 3.
// NOTE(review): nothing in this part of the file reads the option, so how the
// threshold is compared is not documented here -- see its users.
static cl::opt<unsigned> IndirectCallSpecializationThreshold(
"amdgpu-indirect-call-specialization-threshold",
cl::desc(
"A threshold controls whether an indirect call will be specialized"),
cl::init(3));
// X-macro expansion: every AMDGPU_ATTRIBUTE(Name, Str) entry of
// AMDGPUAttributes.def becomes an enumerator Name_POS. The enumerators carry
// no explicit values, so they are numbered in the order they are listed, and
// LAST_ARG_POS comes right after the last entry.
// (Presumably the .def file contains nothing but AMDGPU_ATTRIBUTE entries --
// confirm against the file.)
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
enum ImplicitArgumentPositions {
#include "AMDGPUAttributes.def"
LAST_ARG_POS
};
// X-macro expansion: every AMDGPU_ATTRIBUTE(Name, Str) entry becomes a
// single-bit mask, Name = 1 << Name_POS.
// NOTE(review): AMDGPU_ATTRIBUTE is #defined again here without a visible
// #undef; presumably AMDGPUAttributes.def undefines it at its end -- confirm.
#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
enum ImplicitArgumentMask {
// Value used when no implicit input is involved.
NOT_IMPLICIT_INPUT = 0,
#include "AMDGPUAttributes.def"
// All bits below LAST_ARG_POS set, i.e. the union of the masks above.
ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};
// Table pairing each implicit-argument mask bit with the string of the same
// AMDGPU_ATTRIBUTE entry. The code below uses the string as the name of a
// function attribute (hasFnAttribute / Attribute::get).
#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};
/// Translate the intrinsic \p ID into the implicit argument mask bit it relies
/// on, or NOT_IMPLICIT_INPUT when it relies on none.
///
/// \param NonKernelOnly set to true for the x workitem/workgroup id
///        intrinsics; left untouched for every other intrinsic.
/// \param NeedsImplicit assigned (true iff the code object version is at least
///        V5) for queue_ptr and for traps that go through the queue pointer;
///        left untouched otherwise.
/// \param HasApertureRegs whether the subtarget has aperture registers.
/// \param SupportsGetDoorBellID whether the subtarget supports GetDoorbellID.
/// \param CodeObjectVersion the AMDHSA code object version in use.
//
// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  const bool IsCOV5OrLater = CodeObjectVersion >= AMDGPU::AMDHSA_COV5;

  switch (ID) {
  // The x ids only matter outside of kernels.
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;

  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;

  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;

  case Intrinsic::amdgcn_queue_ptr:
    // queue_ptr is needed in any case; from V5 on it is reached through
    // implicitarg_ptr, so that one is needed as well.
    NeedsImplicit = IsCOV5OrLater;
    return QUEUE_PTR;

  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // From V5 on, private_base / shared_base are read at offsets from
    // implicitarg_ptr; before V5 they are read at offsets from queue_ptr.
    if (IsCOV5OrLater)
      return IMPLICIT_ARG_PTR;
    return QUEUE_PTR;

  case Intrinsic::trap:
  case Intrinsic::debugtrap:
  case Intrinsic::ubsantrap:
    if (SupportsGetDoorBellID) {
      // GetDoorbellID support implemented since V4.
      if (CodeObjectVersion >= AMDGPU::AMDHSA_COV4)
        return NOT_IMPLICIT_INPUT;
      return QUEUE_PTR;
    }
    NeedsImplicit = IsCOV5OrLater;
    return QUEUE_PTR;

  default:
    break;
  }

  return NOT_IMPLICIT_INPUT;
}
/// Returns true if an address space cast whose source address space is
/// \p SrcAS may require the queue pointer, i.e. the source is the local or
/// the private address space.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  const bool FromLocal = SrcAS == AMDGPUAS::LOCAL_ADDRESS;
  const bool FromPrivate = SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
  return FromLocal || FromPrivate;
}
static bool isDSAddress(const Constant *C) {
const GlobalValue *GV = dyn_cast<GlobalValue>(C);
if (!GV)
return false;
unsigned AS = GV->getAddressSpace();
return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}
/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
// Sanitizers require the hostcall buffer passed in the implicit arguments.
return F.hasFnAttribute(Attribute::SanitizeAddress) ||
F.hasFnAttribute(Attribute::SanitizeThread) ||
F.hasFnAttribute(Attribute::SanitizeMemory) ||
F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
F.hasFnAttribute(Attribute::SanitizeMemTag);
}
namespace {
/// InformationCache extended with AMDGPU specific queries. It forwards
/// subtarget questions to the GCNSubtarget obtained from the TargetMachine,
/// parses a few "amdgpu-*" function attributes, and classifies constants by
/// the DS globals / address space casts they contain.
class AMDGPUInformationCache : public InformationCache {
public:
/// Stores \p TM and reads the AMDHSA code object version of \p M once; all
/// other arguments are forwarded to InformationCache.
AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
BumpPtrAllocator &Allocator,
SetVector<Function *> *CGSCC, TargetMachine &TM)
: InformationCache(M, AG, Allocator, CGSCC), TM(TM),
CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
/// Target machine used to look up the per-function subtarget.
TargetMachine &TM;
/// Bit flags describing what was found in a constant or in its operands.
enum ConstantStatus : uint8_t {
NONE = 0,
// The constant is a global value in the local or region address space.
DS_GLOBAL = 1 << 0,
// A constant expression casts a private address space pointer.
ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
// A constant expression casts a local address space pointer.
ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
// Either of the two cast flags above.
ADDR_SPACE_CAST_BOTH_TO_FLAT =
ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
};
/// Check if the subtarget has aperture regs.
bool hasApertureRegs(Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.hasApertureRegs();
}
/// Check if the subtarget supports GetDoorbellID.
bool supportsGetDoorbellID(Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.supportsGetDoorbellID();
}
/// Returns the "amdgpu-flat-work-group-size" attribute of \p F as a
/// (first, second) pair, or std::nullopt if getIntegerPairAttribute yields
/// no value.
std::optional<std::pair<unsigned, unsigned>>
getFlatWorkGroupSizeAttr(const Function &F) const {
auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
if (!R)
return std::nullopt;
// NOTE(review): dereferences the second element unconditionally; this
// assumes it is always present when the call above succeeds -- confirm.
return std::make_pair(R->first, *(R->second));
}
/// Returns the subtarget's default flat work group size for the calling
/// convention of \p F.
std::pair<unsigned, unsigned>
getDefaultFlatWorkGroupSize(const Function &F) const {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
}
/// Returns {min, max} flat work group size reported by the subtarget of
/// \p F.
std::pair<unsigned, unsigned>
getMaximumFlatWorkGroupRange(const Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
}
/// Returns what the subtarget reports as the maximum number of work groups
/// for \p F. Callers in this file read elements 0, 1 and 2 as x, y and z.
SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.getMaxNumWorkGroups(F);
}
/// Get code object version.
unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
/// Get the effective value of "amdgpu-waves-per-eu" for the function,
/// accounting for the interaction with the passed value to use for
/// "amdgpu-flat-work-group-size".
std::pair<unsigned, unsigned>
getWavesPerEU(const Function &F,
std::pair<unsigned, unsigned> FlatWorkGroupSize) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
}
/// Returns the "amdgpu-waves-per-eu" attribute of \p F, or std::nullopt if
/// it cannot be obtained. Only the first value is required; a missing second
/// value is replaced by the subtarget's maximum waves per EU.
std::optional<std::pair<unsigned, unsigned>>
getWavesPerEUAttr(const Function &F) {
auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
/*OnlyFirstRequired=*/true);
if (!Val)
return std::nullopt;
if (!Val->second) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
Val->second = ST.getMaxWavesPerEU();
}
return std::make_pair(Val->first, *(Val->second));
}
/// Forwards to GCNSubtarget::getEffectiveWavesPerEU with the given
/// \p WavesPerEU and \p FlatWorkGroupSize and the LDS size read from \p F.
std::pair<unsigned, unsigned>
getEffectiveWavesPerEU(const Function &F,
std::pair<unsigned, unsigned> WavesPerEU,
std::pair<unsigned, unsigned> FlatWorkGroupSize) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
getLDSSize(F));
}
/// Returns the maximum waves per EU of the subtarget of \p F.
unsigned getMaxWavesPerEU(const Function &F) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
return ST.getMaxWavesPerEU();
}
private:
/// Check if the ConstantExpr \p CE uses an addrspacecast from private or
/// local to flat. These casts may require the queue pointer.
/// Only the source address space is inspected; the result is a combination
/// of ConstantStatus flags (NONE if \p CE is not such a cast).
static uint8_t visitConstExpr(const ConstantExpr *CE) {
uint8_t Status = NONE;
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
}
return Status;
}
/// Returns the minimum amount of LDS space used by a workgroup running
/// function \p F.
/// This is the first value of the "amdgpu-lds-size" attribute, with
/// {0, UINT32_MAX} passed as the default pair.
static unsigned getLDSSize(const Function &F) {
return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
{0, UINT32_MAX}, true)
.first;
}
/// Get the constant access bitmap for \p C.
/// The result ORs the flags of \p C itself with those of all its constant
/// operands, recursively. \p Visited holds the constants already walked so
/// that each operand is descended into at most once per query.
uint8_t getConstantAccess(const Constant *C,
SmallPtrSetImpl<const Constant *> &Visited) {
// NOTE(review): the map is consulted here, but nothing in this class
// inserts into it, so as far as this file shows the lookup never hits.
auto It = ConstantStatus.find(C);
if (It != ConstantStatus.end())
return It->second;
uint8_t Result = 0;
if (isDSAddress(C))
Result = DS_GLOBAL;
if (const auto *CE = dyn_cast<ConstantExpr>(C))
Result |= visitConstExpr(CE);
for (const Use &U : C->operands()) {
const auto *OpC = dyn_cast<Constant>(U);
// Skip non-constant operands and operands that were already walked.
if (!OpC || !Visited.insert(OpC).second)
continue;
Result |= getConstantAccess(OpC, Visited);
}
return Result;
}
public:
/// Returns true if \p Fn needs the queue pointer because of \p C.
bool needsQueuePtr(const Constant *C, Function &Fn) {
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
bool HasAperture = hasApertureRegs(Fn);
// No need to explore the constants.
if (!IsNonEntryFunc && HasAperture)
return false;
SmallPtrSet<const Constant *, 8> Visited;
uint8_t Access = getConstantAccess(C, Visited);
// We need to trap on DS globals in non-entry functions.
if (IsNonEntryFunc && (Access & DS_GLOBAL))
return true;
// Without aperture registers, a cast from private or local needs it too.
return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
}
/// Returns true if \p C or one of its constant operands is a constant
/// expression that casts a pointer out of the private address space.
bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
SmallPtrSet<const Constant *, 8> Visited;
uint8_t Access = getConstantAccess(C, Visited);
return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
}
private:
/// Used to determine if the Constant needs the queue pointer.
DenseMap<const Constant *, uint8_t> ConstantStatus;
/// AMDHSA code object version of the module, read at construction.
const unsigned CodeObjectVersion;
};
/// Abstract attribute whose state is a bit set over ImplicitArgumentMask
/// (at most ALL_ARGUMENT_MASK, initially 0 known bits). The function-position
/// implementation is created through createForPosition().
struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Build the concrete attribute for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// Returns true exactly when \p AA is an AAAMDAttributes, which is decided
  /// by comparing the address of its ID.
  static bool classof(const AbstractAttribute *AA) {
    return AA->getIdAddr() == &ID;
  }

  /// Only the address of this member matters; it identifies the class.
  static const char ID;
};

const char AAAMDAttributes::ID = 0;
/// Abstract attribute with a boolean state used to deduce the
/// "uniform-work-group-size" function attribute. The function-position
/// implementation is created through createForPosition().
struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;

  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Build the concrete attribute for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// Returns true exactly when \p AA is an AAUniformWorkGroupSize, which is
  /// decided by comparing the address of its ID.
  static bool classof(const AbstractAttribute *AA) {
    return AA->getIdAddr() == &ID;
  }

  /// Only the address of this member matters; it identifies the class.
  static const char ID;
};

const char AAUniformWorkGroupSize::ID = 0;
struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
: AAUniformWorkGroupSize(IRP, A) {}
void initialize(Attributor &A) override {
Function *F = getAssociatedFunction();
CallingConv::ID CC = F->getCallingConv();
if (CC != CallingConv::AMDGPU_KERNEL)
return;
bool InitialValue = false;
if (F->hasFnAttribute("uniform-work-group-size"))
InitialValue =
F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
"true";
if (InitialValue)
indicateOptimisticFixpoint();
else
indicatePessimisticFixpoint();
}
ChangeStatus updateImpl(Attributor &A) override {
ChangeStatus Change = ChangeStatus::UNCHANGED;
auto CheckCallSite = [&](AbstractCallSite CS) {
Function *Caller = CS.getInstruction()->getFunction();
LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
<< "->" << getAssociatedFunction()->getName() << "\n");
const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
*this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
if (!CallerInfo || !CallerInfo->isValidState())
return false;
Change = Change | clampStateAndIndicateChange(this->getState(),
CallerInfo->getState());
return true;
};
bool AllCallSitesKnown = true;
if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
return indicatePessimisticFixpoint();
return Change;
}
ChangeStatus manifest(Attributor &A) override {
SmallVector<Attribute, 8> AttrList;
LLVMContext &Ctx = getAssociatedFunction()->getContext();
AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
getAssumed() ? "true" : "false"));
return A.manifestAttrs(getIRPosition(), AttrList,
/* ForceReplace */ true);
}
bool isValidState() const override {
// This state is always valid, even when the state is false.
return true;
}
const std::string getAsStr(Attributor *) const override {
return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
}
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {}
};
/// Allocates the function-position implementation in the Attributor's
/// allocator. Any other position kind is a programming error.
AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() != IRPosition::IRP_FUNCTION)
    llvm_unreachable(
        "AAUniformWorkGroupSize is only valid for function position");
  return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
}
/// Function-position implementation of AAAMDAttributes. Each state bit
/// corresponds to one ImplicitAttrs entry. updateImpl() removes the assumed
/// bit of every implicit input the function (or one of its callees) is found
/// to use; manifest() attaches the attribute string of every bit that is
/// known.
struct AAAMDAttributesFunction : public AAAMDAttributes {
AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
: AAAMDAttributes(IRP, A) {}
/// Seeds the known bits from the attributes already present on the function
/// and fixes the state pessimistically for graphics calling conventions.
void initialize(Attributor &A) override {
Function *F = getAssociatedFunction();
// If the function requires the implicit arg pointer due to sanitizers,
// assume it's needed even if explicitly marked as not requiring it.
const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
if (NeedsHostcall) {
removeAssumedBits(IMPLICIT_ARG_PTR);
removeAssumedBits(HOSTCALL_PTR);
}
for (auto Attr : ImplicitAttrs) {
// Do not re-add the two bits that were just removed for sanitizers.
if (NeedsHostcall &&
(Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
continue;
if (F->hasFnAttribute(Attr.second))
addKnownBits(Attr.first);
}
if (F->isDeclaration())
return;
// Ignore functions with graphics calling conventions, these are currently
// not allowed to have kernel arguments.
if (AMDGPU::isGraphics(F->getCallingConv())) {
indicatePessimisticFixpoint();
return;
}
}
/// Removes the assumed bit of every implicit input that is needed, based on
/// the callees (intrinsics and other functions) and on the function body.
/// Reports CHANGED when the assumed bits differ from their value on entry.
ChangeStatus updateImpl(Attributor &A) override {
Function *F = getAssociatedFunction();
// The current assumed state used to determine a change.
auto OrigAssumed = getAssumed();
// Check for Intrinsics and propagate attributes.
const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
*this, this->getIRPosition(), DepClassTy::REQUIRED);
if (!AAEdges || !AAEdges->isValidState() ||
AAEdges->hasNonAsmUnknownCallee())
return indicatePessimisticFixpoint();
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
bool NeedsImplicit = false;
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
unsigned COV = InfoCache.getCodeObjectVersion();
for (Function *Callee : AAEdges->getOptimisticEdges()) {
Intrinsic::ID IID = Callee->getIntrinsicID();
if (IID == Intrinsic::not_intrinsic) {
// Ordinary callee: merge in the callee's state.
const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
*this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
if (!AAAMD || !AAAMD->isValidState())
return indicatePessimisticFixpoint();
*this &= *AAAMD;
continue;
}
bool NonKernelOnly = false;
ImplicitArgumentMask AttrMask =
intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
HasApertureRegs, SupportsGetDoorbellID, COV);
if (AttrMask != NOT_IMPLICIT_INPUT) {
// Inputs flagged NonKernelOnly are only recorded for non-entry functions.
if ((IsNonEntryFunc || !NonKernelOnly))
removeAssumedBits(AttrMask);
}
}
// Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
if (NeedsImplicit)
removeAssumedBits(IMPLICIT_ARG_PTR);
if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
// Under V5, we need implicitarg_ptr + offsets to access private_base or
// shared_base. We do not actually need queue_ptr.
if (COV >= 5)
removeAssumedBits(IMPLICIT_ARG_PTR);
else
removeAssumedBits(QUEUE_PTR);
}
if (funcRetrievesMultigridSyncArg(A, COV)) {
assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"multigrid_sync_arg needs implicitarg_ptr");
removeAssumedBits(MULTIGRID_SYNC_ARG);
}
if (funcRetrievesHostcallPtr(A, COV)) {
assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
removeAssumedBits(HOSTCALL_PTR);
}
if (funcRetrievesHeapPtr(A, COV)) {
assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
removeAssumedBits(HEAP_PTR);
}
if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
removeAssumedBits(QUEUE_PTR);
}
if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
removeAssumedBits(LDS_KERNEL_ID);
}
if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
removeAssumedBits(DEFAULT_QUEUE);
if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
removeAssumedBits(COMPLETION_ACTION);
if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
removeAssumedBits(FLAT_SCRATCH_INIT);
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
: ChangeStatus::UNCHANGED;
}
/// Attaches the attribute string of every known bit to the function,
/// replacing attributes that already exist.
ChangeStatus manifest(Attributor &A) override {
SmallVector<Attribute, 8> AttrList;
LLVMContext &Ctx = getAssociatedFunction()->getContext();
for (auto Attr : ImplicitAttrs) {
if (isKnown(Attr.first))
AttrList.push_back(Attribute::get(Ctx, Attr.second));
}
return A.manifestAttrs(getIRPosition(), AttrList,
/* ForceReplace */ true);
}
/// Prints "AMDInfo[ <attr> <attr> ... ]" listing the attribute string of
/// every assumed bit.
const std::string getAsStr(Attributor *) const override {
std::string Str;
raw_string_ostream OS(Str);
OS << "AMDInfo[";
for (auto Attr : ImplicitAttrs)
if (isAssumed(Attr.first))
OS << ' ' << Attr.second;
OS << " ]";
return OS.str();
}
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {}
private:
/// Returns true if the function body needs the queue pointer: either an
/// addrspacecast instruction from the local or private address space on a
/// subtarget without aperture registers, or a constant operand for which
/// AMDGPUInformationCache::needsQueuePtr() says so.
bool checkForQueuePtr(Attributor &A) {
Function *F = getAssociatedFunction();
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
bool NeedsQueuePtr = false;
// Returning false stops the walk as soon as one such cast is found.
auto CheckAddrSpaceCasts = [&](Instruction &I) {
unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
if (castRequiresQueuePtr(SrcAS)) {
NeedsQueuePtr = true;
return false;
}
return true;
};
bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
// `checkForAllInstructions` is much cheaper than going through all
// instructions, try it first.
// The queue pointer is not needed if aperture regs is present.
if (!HasApertureRegs) {
bool UsedAssumedInformation = false;
A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
{Instruction::AddrSpaceCast},
UsedAssumedInformation);
}
// If we found that we need the queue pointer, nothing else to do.
if (NeedsQueuePtr)
return true;
// Same early exit as in needsQueuePtr(): constants cannot require it here.
if (!IsNonEntryFunc && HasApertureRegs)
return false;
// Otherwise inspect every constant operand of every instruction.
for (BasicBlock &BB : *F) {
for (Instruction &I : BB) {
for (const Use &U : I.operands()) {
if (const auto *C = dyn_cast<Constant>(U)) {
if (InfoCache.needsQueuePtr(C, *F))
return true;
}
}
}
}
return false;
}
// Each helper below asks whether the function may read the 8 bytes of one
// particular implicit kernel argument through implicitarg_ptr. The offset is
// obtained for the given code object version \p COV.
bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
AA::RangeTy Range(Pos, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
AA::RangeTy Range(Pos, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
AA::RangeTy Range(Pos, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
AA::RangeTy Range(Pos, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
// The heap pointer is only looked for when COV is at least 5.
bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
if (COV < 5)
return false;
AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
// The queue pointer is only looked for in the implicit arguments when COV is
// at least 5.
bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
if (COV < 5)
return false;
AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
/// Returns true if the function may access the bytes in \p Range of the
/// implicit kernel arguments.
bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
// Check every call to the implicitarg_ptr intrinsic and whether its result
// is used to access the implicit argument located at Range. The argument
// counts as unused only if every access that interferes with Range is
// made by a droppable instruction. We check this by tracing all the uses
// of the initial call to the implicitarg_ptr intrinsic through
// AAPointerInfo; if that information is unavailable or invalid, the
// argument is treated as used.
auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
auto &Call = cast<CallBase>(I);
if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
return true;
const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
*this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
return false;
return PointerInfoAA->forallInterferingAccesses(
Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
return Acc.getRemoteInst()->isDroppable();
});
};
bool UsedAssumedInformation = false;
return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
UsedAssumedInformation);
}
/// Returns true unless every call-like instruction could be checked and none
/// of them calls the amdgcn_lds_kernel_id intrinsic.
bool funcRetrievesLDSKernelId(Attributor &A) {
auto DoesNotRetrieve = [&](Instruction &I) {
auto &Call = cast<CallBase>(I);
return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
};
bool UsedAssumedInformation = false;
return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
UsedAssumedInformation);
}
// Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
// not to be set.
bool needFlatScratchInit(Attributor &A) {
assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
// Check all AddrSpaceCast instructions. FlatScratchInit is needed if
// there is a cast from PRIVATE_ADDRESS.
auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
AMDGPUAS::PRIVATE_ADDRESS;
};
bool UsedAssumedInformation = false;
if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
{Instruction::AddrSpaceCast},
UsedAssumedInformation))
return true;
// Check for addrSpaceCast from PRIVATE_ADDRESS in constant expressions
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
Function *F = getAssociatedFunction();
for (Instruction &I : instructions(F)) {
for (const Use &U : I.operands()) {
if (const auto *C = dyn_cast<Constant>(U)) {
if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
return true;
}
}
}
// Finally check callees.
// This is called on each callee; false means callee shouldn't have
// no-flat-scratch-init.
auto CheckForNoFlatScratchInit = [&](Instruction &I) {
const auto &CB = cast<CallBase>(I);
const Function *Callee = CB.getCalledFunction();
// Callee == 0 for inline asm or indirect call with known callees.
// In the latter case, updateImpl() already checked the callees and we
// know their FLAT_SCRATCH_INIT bit is set.
// If function has indirect call with unknown callees, the bit is
// already removed in updateImpl() and execution won't reach here.
if (!Callee)
return true;
return Callee->getIntrinsicID() !=
Intrinsic::amdgcn_addrspacecast_nonnull;
};
UsedAssumedInformation = false;
// If any callee is false (i.e. need FlatScratchInit),
// checkForAllCallLikeInstructions returns false, in which case this
// function returns true.
return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
UsedAssumedInformation);
}
};
/// Allocates the function-position implementation in the Attributor's
/// allocator. Any other position kind is a programming error.
AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() != IRPosition::IRP_FUNCTION)
    llvm_unreachable("AAAMDAttributes is only valid for function position");
  return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
}
/// Common base for attributes whose state is a 32-bit integer range and that
/// manifest as a "min,max" string attribute named \c AttrName.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  /// Name of the function attribute that is written by the derived class.
  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Shared update step: clamp this state with the \p AttributeImpl state of
  /// every caller. If a caller has no valid state, or not all call sites can
  /// be visited, the state becomes a pessimistic fixpoint.
  template <class AttributeImpl> ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Result = ChangeStatus::UNCHANGED;

    auto VisitCallSite = [&](AbstractCallSite ACS) {
      Function *Caller = ACS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerAA = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (CallerAA && CallerAA->isValidState()) {
        Result |= clampStateAndIndicateChange(this->getState(),
                                              CallerAA->getState());
        return true;
      }
      return false;
    };

    bool AllCallSitesKnown = true;
    const bool VisitedAll =
        A.checkForAllCallSites(VisitCallSite, *this,
                               /*RequireAllCallSites=*/true, AllCallSitesKnown);
    return VisitedAll ? Result : indicatePessimisticFixpoint();
  }

  /// Restrict the assumed range to \p Default (an inclusive [Min, Max] pair)
  /// and write the attribute as "lower,upper" with inclusive bounds. Nothing
  /// is written when the restricted range equals \p Default or is inverted.
  ChangeStatus
  emitAttributeIfNotDefaultAfterClamp(Attributor &A,
                                      std::pair<unsigned, unsigned> Default) {
    // The assumed range has an exclusive upper bound, so compare against
    // Max + 1.
    const unsigned DefaultMin = Default.first;
    const unsigned DefaultEnd = Default.second + 1;

    unsigned Lo = getAssumed().getLower().getZExtValue();
    unsigned End = getAssumed().getUpper().getZExtValue();
    Lo = Lo < DefaultMin ? DefaultMin : Lo;
    End = End > DefaultEnd ? DefaultEnd : End;

    const bool IsDefault = Lo == DefaultMin && End == DefaultEnd;
    const bool IsInverted = End < Lo;
    if (IsDefault || IsInverted)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Text;
    raw_svector_ostream OS(Text);
    OS << Lo << ',' << End - 1;

    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /*ForceReplace=*/true);
  }

  /// Prints "<name>[lower,upper]" with an inclusive upper bound.
  const std::string getAsStr(Attributor *) const override {
    std::string Text;
    raw_string_ostream OS(Text);
    OS << getName() << '[' << getAssumed().getLower() << ','
       << getAssumed().getUpper() - 1 << ']';
    return OS.str();
  }
};
/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
: AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
void initialize(Attributor &A) override {
Function *F = getAssociatedFunction();
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
bool HasAttr = false;
auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);
if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
// We only consider an attribute that is not max range because the front
// end always emits the attribute, unfortunately, and sometimes it emits
// the max range.
if (*Attr != MaxRange) {
Range = *Attr;
HasAttr = true;
}
}
// We don't want to directly clamp the state if it's the max range because
// that is basically the worst state.
if (Range == MaxRange)
return;
auto [Min, Max] = Range;
ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
IntegerRangeState IRS(CR);
clampStateAndIndicateChange(this->getState(), IRS);
if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
indicateOptimisticFixpoint();
}
ChangeStatus updateImpl(Attributor &A) override {
return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
}
/// Create an abstract attribute view for the position \p IRP.
static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
Attributor &A);
ChangeStatus manifest(Attributor &A) override {
Function *F = getAssociatedFunction();
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
return emitAttributeIfNotDefaultAfterClamp(
A, InfoCache.getMaximumFlatWorkGroupRange(*F));
}
/// See AbstractAttribute::getName()
const std::string getName() const override {
return "AAAMDFlatWorkGroupSize";
}
/// See AbstractAttribute::getIdAddr()
const char *getIdAddr() const override { return &ID; }
/// This function should return true if the type of the \p AA is
/// AAAMDFlatWorkGroupSize
static bool classof(const AbstractAttribute *AA) {
return (AA->getIdAddr() == &ID);
}
/// Unique ID (due to the unique address)
static const char ID;
};
const char AAAMDFlatWorkGroupSize::ID = 0;
/// Allocates the attribute in the Attributor's allocator. Any position kind
/// other than a function position is a programming error.
AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() != IRPosition::IRP_FUNCTION)
    llvm_unreachable(
        "AAAMDFlatWorkGroupSize is only valid for function position");
  return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
}
/// Abstract state made of three independent DecIntegerState<uint32_t>
/// components X, Y and Z. Every query is answered component-wise: the tuple
/// is valid / at a fixpoint only if all three components are.
struct TupleDecIntegerRangeState : public AbstractState {
  DecIntegerState<uint32_t> X, Y, Z;

  bool isValidState() const override {
    return X.isValidState() && Y.isValidState() && Z.isValidState();
  }

  bool isAtFixpoint() const override {
    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
  }

  /// Fixes all three components optimistically; `|` (not `||`) is used so
  /// that every component is updated.
  ChangeStatus indicateOptimisticFixpoint() override {
    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
           Z.indicateOptimisticFixpoint();
  }

  /// Fixes all three components pessimistically; `|` (not `||`) is used so
  /// that every component is updated.
  ChangeStatus indicatePessimisticFixpoint() override {
    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
           Z.indicatePessimisticFixpoint();
  }

  /// Applies `^=` to each component with the matching component of \p Other.
  /// Returns a reference to this state, as compound assignment operators
  /// conventionally do, so that no copy of the whole tuple is made per call.
  TupleDecIntegerRangeState &
  operator^=(const TupleDecIntegerRangeState &Other) {
    X ^= Other.X;
    Y ^= Other.Y;
    Z ^= Other.Z;
    return *this;
  }

  bool operator==(const TupleDecIntegerRangeState &Other) const {
    return X == Other.X && Y == Other.Y && Z == Other.Z;
  }

  /// The tuple is its own "assumed" value.
  TupleDecIntegerRangeState &getAssumed() { return *this; }
  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};
// NOTE(review): this alias is not referenced in the visible part of the file,
// and it passes an extra uint32_t argument that the StateWrapper base used by
// AAAMDMaxNumWorkgroups does not have -- confirm whether it is still needed.
using AAAMDMaxNumWorkgroupsState =
StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
/// Deduces the "amdgpu-max-num-workgroups" attribute by propagating the
/// per-dimension values from callers to callees.
struct AAAMDMaxNumWorkgroups
    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Seeds X, Y and Z with the three values the information cache reports for
  /// the function. Entry functions are fixed pessimistically right away.
  void initialize(Attributor &A) override {
    Function *Fn = getAssociatedFunction();
    auto &Cache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    const SmallVector<unsigned> Limits = Cache.getMaxNumWorkGroups(*Fn);
    X.takeKnownMinimum(Limits[0]);
    Y.takeKnownMinimum(Limits[1]);
    Z.takeKnownMinimum(Limits[2]);

    if (AMDGPU::isEntryFunctionCC(Fn->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  /// Clamps this state with the state of every caller. If a caller has no
  /// valid state, or not all call sites can be visited, the state becomes a
  /// pessimistic fixpoint.
  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Result = ChangeStatus::UNCHANGED;

    auto VisitCallSite = [&](AbstractCallSite ACS) {
      Function *Caller = ACS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerAA = A.getAAFor<AAAMDMaxNumWorkgroups>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (CallerAA && CallerAA->isValidState()) {
        Result |= clampStateAndIndicateChange(this->getState(),
                                              CallerAA->getState());
        return true;
      }
      return false;
    };

    bool AllCallSitesKnown = true;
    const bool VisitedAll =
        A.checkForAllCallSites(VisitCallSite, *this,
                               /*RequireAllCallSites=*/true, AllCallSitesKnown);
    return VisitedAll ? Result : indicatePessimisticFixpoint();
  }

  /// Build the concrete attribute for the position \p IRP.
  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
                                                  Attributor &A);

  /// Writes the attribute as "x,y,z" from the assumed values, replacing an
  /// existing attribute.
  ChangeStatus manifest(Attributor &A) override {
    SmallString<32> Text;
    raw_svector_ostream OS(Text);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

    // TODO: Should annotate loads of the group size for this to do anything
    // useful.
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
        /* ForceReplace= */ true);
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDMaxNumWorkgroups"; }

  /// Prints "AAAMDMaxNumWorkgroupsState[x,y,z]" from the assumed values.
  const std::string getAsStr(Attributor *) const override {
    std::string Text = "AAAMDMaxNumWorkgroupsState[";
    raw_string_ostream OS(Text);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// Returns true exactly when \p AA is an AAAMDMaxNumWorkgroups, which is
  /// decided by comparing the address of its ID.
  static bool classof(const AbstractAttribute *AA) {
    return AA->getIdAddr() == &ID;
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Only the address of this member matters; it identifies the class.
  static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;
/// Allocate an AAAMDMaxNumWorkgroups for \p IRP in the Attributor's allocator.
/// Only function positions are supported; anything else is unreachable.
AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() != IRPosition::IRP_FUNCTION)
    llvm_unreachable(
        "AAAMDMaxNumWorkgroups is only valid for function position");

  return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
}
/// Propagate amdgpu-waves-per-eu attribute.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
: AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
void initialize(Attributor &A) override {
Function *F = getAssociatedFunction();
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
auto TakeRange = [&](std::pair<unsigned, unsigned> R) {
auto [Min, Max] = R;
ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
IntegerRangeState RangeState(Range);
clampStateAndIndicateChange(this->getState(), RangeState);
indicateOptimisticFixpoint();
};
std::pair<unsigned, unsigned> MaxWavesPerEURange{
1U, InfoCache.getMaxWavesPerEU(*F)};
// If the attribute exists, we will honor it if it is not the default.
if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
if (*Attr != MaxWavesPerEURange) {
TakeRange(*Attr);
return;
}
}
// Unlike AAAMDFlatWorkGroupSize, it's getting trickier here. Since the
// calculation of waves per EU involves flat work group size, we can't
// simply use an assumed flat work group size as a start point, because the
// update of flat work group size is in an inverse direction of waves per
// EU. However, we can still do something if it is an entry function. Since
// an entry function is a terminal node, and flat work group size either
// from attribute or default will be used anyway, we can take that value and
// calculate the waves per EU based on it. This result can't be updated by
// no means, but that could still allow us to propagate it.
if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
std::pair<unsigned, unsigned> FlatWorkGroupSize;
if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F))
FlatWorkGroupSize = *Attr;
else
FlatWorkGroupSize = InfoCache.getDefaultFlatWorkGroupSize(*F);
TakeRange(InfoCache.getEffectiveWavesPerEU(*F, MaxWavesPerEURange,
FlatWorkGroupSize));
}
}
ChangeStatus updateImpl(Attributor &A) override {
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
ChangeStatus Change = ChangeStatus::UNCHANGED;
auto CheckCallSite = [&](AbstractCallSite CS) {
Function *Caller = CS.getInstruction()->getFunction();
Function *Func = getAssociatedFunction();
LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
<< "->" << Func->getName() << '\n');
const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
*this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
*this, IRPosition::function(*Func), DepClassTy::REQUIRED);
if (!CallerInfo || !AssumedGroupSize || !CallerInfo->isValidState() ||
!AssumedGroupSize->isValidState())
return false;
unsigned Min, Max;
std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
*Caller,
{CallerInfo->getAssumed().getLower().getZExtValue(),
CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
{AssumedGroupSize->getAssumed().getLower().getZExtValue(),
AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
IntegerRangeState CallerRangeState(CallerRange);
Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);
return true;
};
bool AllCallSitesKnown = true;
if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
return indicatePessimisticFixpoint();
return Change;
}
/// Create an abstract attribute view for the position \p IRP.
static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
Attributor &A);
ChangeStatus manifest(Attributor &A) override {
Function *F = getAssociatedFunction();
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
return emitAttributeIfNotDefaultAfterClamp(
A, {1U, InfoCache.getMaxWavesPerEU(*F)});
}
/// See AbstractAttribute::getName()
const std::string getName() const override { return "AAAMDWavesPerEU"; }
/// See AbstractAttribute::getIdAddr()
const char *getIdAddr() const override { return &ID; }
/// This function should return true if the type of the \p AA is
/// AAAMDWavesPerEU
static bool classof(const AbstractAttribute *AA) {
return (AA->getIdAddr() == &ID);
}
/// Unique ID (due to the unique address)
static const char ID;
};
const char AAAMDWavesPerEU::ID = 0;
/// Allocate an AAAMDWavesPerEU for \p IRP in the Attributor's allocator.
/// Only function positions are supported; anything else is unreachable.
AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() != IRPosition::IRP_FUNCTION)
    llvm_unreachable("AAAMDWavesPerEU is only valid for function position");

  return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
}
static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
for (const auto &CI : IA->ParseConstraints()) {
for (StringRef Code : CI.Codes) {
Code.consume_front("{");
if (Code.starts_with("a"))
return true;
}
}
return false;
}
// TODO: Migrate to range merge of amdgpu-agpr-alloc.
// FIXME: Why is this using Attribute::NoUnwind?
/// Boolean attribute whose assumed-true state is printed as "amdgpu-no-agpr"
/// and manifested as "amdgpu-agpr-alloc"="0".
struct AAAMDGPUNoAGPR
    : public IRAttribute<Attribute::NoUnwind,
                         StateWrapper<BooleanState, AbstractAttribute>,
                         AAAMDGPUNoAGPR> {
  AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}

  /// Allocate an AAAMDGPUNoAGPR for \p IRP; only function positions are valid.
  static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
                                           Attributor &A) {
    if (IRP.getPositionKind() != IRPosition::IRP_FUNCTION)
      llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");

    return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
  }

  /// A function whose "amdgpu-agpr-alloc" minimum is already 0 is settled.
  void initialize(Attributor &A) override {
    Function *Fn = getAssociatedFunction();
    const auto AGPRAlloc =
        AMDGPU::getIntegerPairAttribute(*Fn, "amdgpu-agpr-alloc", {~0u, ~0u},
                                        /*OnlyFirstRequired=*/true);
    if (AGPRAlloc.first == 0)
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    if (getAssumed())
      return "amdgpu-no-agpr";
    return "amdgpu-maybe-agpr";
  }

  void trackStatistics() const override {}

  /// Check every call-like instruction. If the check fails for any of them,
  /// move to a pessimistic fixpoint; otherwise the state is unchanged.
  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
    auto CallAvoidsAGPRs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();

      // Inline asm is judged purely by its constraint codes.
      if (const auto *IA = dyn_cast<InlineAsm>(CalleeOp))
        return !inlineAsmUsesAGPRs(IA);

      // Any other callee that is not a known function fails the check.
      const auto *Callee = dyn_cast<Function>(CalleeOp);
      if (!Callee)
        return false;

      // Some intrinsics may use AGPRs, but if we have a choice, we are not
      // required to use AGPRs.
      if (Callee->isIntrinsic())
        return true;

      // TODO: Handle callsite attributes
      const auto *CalleeAA = A.getAAFor<AAAMDGPUNoAGPR>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
      if (!CalleeAA || !CalleeAA->isValidState())
        return false;
      return CalleeAA->getAssumed();
    };

    bool UsedAssumedInformation = false;
    const bool AllCallsOK = A.checkForAllCallLikeInstructions(
        CallAvoidsAGPRs, *this, UsedAssumedInformation);
    if (AllCallsOK)
      return ChangeStatus::UNCHANGED;
    return indicatePessimisticFixpoint();
  }

  /// When the assumed state is true, manifest "amdgpu-agpr-alloc"="0".
  ChangeStatus manifest(Attributor &A) override {
    if (getAssumed()) {
      LLVMContext &Ctx = getAssociatedFunction()->getContext();
      return A.manifestAttrs(getIRPosition(),
                             {Attribute::get(Ctx, "amdgpu-agpr-alloc", "0")});
    }
    return ChangeStatus::UNCHANGED;
  }

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAAMDGPUNoAGPR"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUNoAGPRs
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDGPUNoAGPR::ID = 0;
/// Configure an Attributor for \p M, seed the AMDGPU abstract attributes for
/// every non-intrinsic function, and run it.
///
/// \returns true if the Attributor run reported ChangeStatus::CHANGED.
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                    AMDGPUAttributorOptions Options,
                    ThinOrFullLTOPhase LTOPhase) {
  // Every function except intrinsics takes part in the analysis.
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (F.isIntrinsic())
      continue;
    Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);

  // The set of abstract attribute kinds handed to the Attributor as Allowed.
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID,
       &AACallEdges::ID, &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
       &AAUnderlyingObjects::ID, &AAAddressSpace::ID, &AAIndirectCallInfo::ID,
       &AAInstanceInfo::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;

  // Specialize an indirect call only for non-entry callees, and only while
  // the number of assumed callees stays within the command-line threshold.
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        if (AMDGPU::isEntryFunctionCC(Callee.getCallingConv()))
          return false;
        return NumAssumedCallees <= IndirectCallSpecializationThreshold;
      };

  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  LLVM_DEBUG({
    StringRef LTOPhaseStr = to_string(LTOPhase);
    dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
           << "[AMDGPUAttributor] Module " << M.getName() << " is "
           << (AC.IsClosedWorldModule ? "" : "not ")
           << "assumed to be a closed world.\n";
  });

  for (Function *F : Functions) {
    const IRPosition FnPos = IRPosition::function(*F);
    A.getOrCreateAAFor<AAAMDAttributes>(FnPos);
    A.getOrCreateAAFor<AAUniformWorkGroupSize>(FnPos);
    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(FnPos);
    A.getOrCreateAAFor<AAAMDGPUNoAGPR>(FnPos);

    // These two are only seeded for non-entry functions.
    if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(FnPos);
      A.getOrCreateAAFor<AAAMDWavesPerEU>(FnPos);
    }

    // Seed address-space deduction for the pointer operand of every load,
    // store, atomicrmw and cmpxchg.
    for (Instruction &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(&I))
        Ptr = LI->getPointerOperand();
      else if (auto *SI = dyn_cast<StoreInst>(&I))
        Ptr = SI->getPointerOperand();
      else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
        Ptr = RMW->getPointerOperand();
      else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
        Ptr = CmpX->getPointerOperand();

      if (Ptr)
        A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
    }
  }

  return A.run() == ChangeStatus::CHANGED;
}
/// Legacy pass-manager wrapper that runs runImpl() on a module with default
/// options and ThinOrFullLTOPhase::None.
class AMDGPUAttributorLegacy : public ModulePass {
public:
  AMDGPUAttributorLegacy() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  ///
  /// Captures the TargetMachine from TargetPassConfig and aborts with a fatal
  /// error if TargetPassConfig is not available. Always returns false.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  /// Run the attributor on \p M; returns true if it reported a change.
  bool runOnModule(Module &M) override {
    AnalysisGetter AG(this);
    return runImpl(M, AG, *TM, /*Options=*/{},
                   /*LTOPhase=*/ThinOrFullLTOPhase::None);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<CycleInfoWrapperPass>();
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }

  /// Target machine captured by doInitialization(). Initialized to null so
  /// the member never holds an indeterminate pointer before that call.
  TargetMachine *TM = nullptr;
  static char ID;
};
} // namespace
/// New pass-manager entry point: run the attributor on \p M and report that
/// no analyses are preserved when it changed anything, all of them otherwise.
PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
                                                  ModuleAnalysisManager &AM) {
  auto &FAMProxy = AM.getResult<FunctionAnalysisManagerModuleProxy>(M);
  FunctionAnalysisManager &FAM = FAMProxy.getManager();
  AnalysisGetter AG(FAM);

  // TODO: Probably preserves CFG
  if (!runImpl(M, AG, TM, Options, LTOPhase))
    return PreservedAnalyses::all();
  return PreservedAnalyses::none();
}
/// Pass identifier for AMDGPUAttributorLegacy.
char AMDGPUAttributorLegacy::ID = 0;

/// Create a new heap-allocated AMDGPUAttributorLegacy instance.
Pass *llvm::createAMDGPUAttributorLegacyPass() {
  ModulePass *LegacyPass = new AMDGPUAttributorLegacy();
  return LegacyPass;
}
// Legacy pass registration for AMDGPUAttributorLegacy: the pass is registered
// under the DEBUG_TYPE string with the display name "AMDGPU Attributor", and
// CycleInfoWrapperPass is declared as a dependency (matching the requirement
// stated in AMDGPUAttributorLegacy::getAnalysisUsage).
INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
false, false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
false, false)