//===-- SIProgramInfo.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The SIProgramInfo tracks resource usage and hardware flags for kernels and
/// entry functions.
//
//===----------------------------------------------------------------------===//
//

#include "SIProgramInfo.h"
#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCExpr.h"

using namespace llvm;
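
// Reset all tracked values for a new function. Fields that are emitted as
// MCExprs are reset to a constant zero expression in MF's MCContext so they
// can later be combined with symbolic resource counts.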
void SIProgramInfo::reset(const MachineFunction &MF) {
  MCContext &Ctx = MF.getContext();

  const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);

  CodeSizeInBytes.reset();

  VGPRBlocks = ZeroExpr;
  SGPRBlocks = ZeroExpr;
  Priority = 0;
  FloatMode = 0;
  Priv = 0;
  DX10Clamp = 0;
  DebugMode = 0;
  IEEEMode = 0;
  WgpMode = 0;
  MemOrdered = 0;
  FwdProgress = 0;
  RrWgMode = 0;
  ScratchSize = ZeroExpr;

  LDSBlocks = 0;
  ScratchBlocks = ZeroExpr;

  ScratchEnable = ZeroExpr;
  UserSGPR = 0;
  TrapHandlerEnable = 0;
  TGIdXEnable = 0;
  TGIdYEnable = 0;
  TGIdZEnable = 0;
  TGSizeEnable = 0;
  TIdIGCompCount = 0;
  EXCPEnMSB = 0;
  LdsSize = 0;
  EXCPEnable = 0;

  ComputePGMRSrc3 = ZeroExpr;

  NumVGPR = ZeroExpr;
  NumArchVGPR = ZeroExpr;
  NumAccVGPR = ZeroExpr;
  AccumOffset = ZeroExpr;
  TgSplit = 0;
  NumSGPR = ZeroExpr;
  SGPRSpill = 0;
  VGPRSpill = 0;
  LDSSize = 0;
  FlatUsed = ZeroExpr;

  NumSGPRsForWavesPerEU = ZeroExpr;
  NumVGPRsForWavesPerEU = ZeroExpr;
  Occupancy = ZeroExpr;
  DynamicCallStack = ZeroExpr;
  VCCUsed = ZeroExpr;
}
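
// Pack the already-resolved scalar fields of COMPUTE_PGM_RSRC1 for a compute
// entry point. Fields that are not present on every subtarget (DX10_CLAMP,
// IEEE_MODE, RR_WG_MODE) are only emitted when the target supports them;
// FWD_PROGRESS is currently limited to AMDHSA.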
static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
                                      const GCNSubtarget &ST) {
  uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
                 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
                 S_00B848_PRIV(ProgInfo.Priv) |
                 S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
                 S_00B848_WGP_MODE(ProgInfo.WgpMode) |
                 S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);

  if (ST.hasDX10ClampMode())
    Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);

  if (ST.hasIEEEMode())
    Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);

  // TODO: in the long run we will want to enable this unconditionally.
  if (ST.getTargetTriple().getOS() == Triple::OSType::AMDHSA)
    Reg |= S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress);

  if (ST.hasRrWGMode())
    Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);

  return Reg;
}
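
// Same packing for graphics calling conventions: the common fields use the
// compute macros, while WGP_MODE and MEM_ORDERED are encoded with the
// stage-specific macro for the given calling convention.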
static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo,
                               CallingConv::ID CC, const GCNSubtarget &ST) {
  uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
                 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
                 S_00B848_PRIV(ProgInfo.Priv) |
                 S_00B848_DEBUG_MODE(ProgInfo.DebugMode);

  if (ST.hasDX10ClampMode())
    Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);

  if (ST.hasIEEEMode())
    Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);

  if (ST.hasRrWGMode())
    Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);

  switch (CC) {
  case CallingConv::AMDGPU_PS:
    Reg |= S_00B028_MEM_ORDERED(ProgInfo.MemOrdered);
    break;
  case CallingConv::AMDGPU_VS:
    Reg |= S_00B128_MEM_ORDERED(ProgInfo.MemOrdered);
    break;
  case CallingConv::AMDGPU_GS:
    Reg |= S_00B228_WGP_MODE(ProgInfo.WgpMode) |
           S_00B228_MEM_ORDERED(ProgInfo.MemOrdered);
    break;
  case CallingConv::AMDGPU_HS:
    Reg |= S_00B428_WGP_MODE(ProgInfo.WgpMode) |
           S_00B428_MEM_ORDERED(ProgInfo.MemOrdered);
    break;
  default:
    break;
  }

  return Reg;
}
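
// Pack the scalar fields of COMPUTE_PGM_RSRC2. The scratch-enable bit is left
// out here because it is tracked as a symbolic expression (ScratchEnable);
// getComputePGMRSrc2() ORs it in.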
static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) {
  uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) |
                 S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) |
                 S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) |
                 S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) |
                 S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) |
                 S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) |
                 S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) |
                 S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) |
                 S_00B84C_LDS_SIZE(ProgInfo.LdsSize) |
                 S_00B84C_EXCP_EN(ProgInfo.EXCPEnable);

  return Reg;
}
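
// Build the expression (Val & Mask) << Shift, skipping the AND when Mask is
// zero and the SHL when Shift is zero.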
static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift,
                               MCContext &Ctx) {
  if (Mask) {
    const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
    Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
  }
  if (Shift) {
    const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
    Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
  }
  return Val;
}
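
// Combine the constant RSRC1 fields with the still-symbolic block counts:
// VGPR blocks occupy the low 6 bits, SGPR blocks the next 4 bits.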
const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST,
                                                MCContext &Ctx) const {
  uint64_t Reg = getComputePGMRSrc1Reg(*this, ST);

  const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
  const MCExpr *Res = MCBinaryExpr::createOr(
      MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
      MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);

  return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
}
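
// Compute entry points defer to getComputePGMRSrc1(); graphics calling
// conventions use the stage-specific scalar fields with the same VGPR/SGPR
// block layout.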
const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
                                         const GCNSubtarget &ST,
                                         MCContext &Ctx) const {
  if (AMDGPU::isCompute(CC)) {
    return getComputePGMRSrc1(ST, Ctx);
  }

  uint64_t Reg = getPGMRSrc1Reg(*this, CC, ST);

  const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
  const MCExpr *Res = MCBinaryExpr::createOr(
      MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
      MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);

  return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
}
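
// OR the still-symbolic ScratchEnable expression into the otherwise constant
// COMPUTE_PGM_RSRC2 value.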
const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const {
  uint64_t Reg = getComputePGMRSrc2Reg(*this);
  const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
  return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx);
}
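
// Only compute calling conventions have an RSRC2 value tracked here; all
// others get a constant zero.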
const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
                                         MCContext &Ctx) const {
  if (AMDGPU::isCompute(CC))
    return getComputePGMRSrc2(Ctx);

  return MCConstantExpr::create(0, Ctx);
}
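
// Estimate the function's code size in bytes. With IsLowerBound set, alignment
// padding and inline asm are ignored so the result never overestimates;
// otherwise a best-effort estimate is computed and cached for later queries.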
uint64_t SIProgramInfo::getFunctionCodeSize(const MachineFunction &MF,
                                            bool IsLowerBound) {
  if (!IsLowerBound && CodeSizeInBytes.has_value())
    return *CodeSizeInBytes;

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();

  uint64_t CodeSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    // The amount of padding needed to align code can be both underestimated
    // and overestimated. For inline asm, getInstSizeInBytes() returns the
    // maximum possible size of a single instruction, while the real size may
    // differ. At this point CodeSize may already be off.
    if (!IsLowerBound)
      CodeSize = alignTo(CodeSize, MBB.getAlignment());

    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.
      if (MI.isMetaInstruction())
        continue;

      // We cannot properly estimate inline asm size. It can be as small as
      // zero if it is just a comment.
      if (IsLowerBound && MI.isInlineAsm())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);
    }
  }

  CodeSizeInBytes = CodeSize;

  return CodeSize;
}