blob: 0d89fc8819929482afe9027428d88e3304895fb6 [file] [log] [blame] [edit]
//===- AMDGPULowerVGPREncoding.cpp - lower VGPRs above v255 ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Lower VGPRs above first 256 on gfx1250.
///
/// The pass scans used VGPRs and inserts S_SET_VGPR_MSB instructions to switch
/// VGPR addressing mode. The mode change is effective until the next change.
/// This instruction provides high bits of a VGPR address for four of the
/// operands: vdst, src0, src1, and src2, or other 4 operands depending on the
/// instruction encoding. If bits are set they are added as MSB to the
/// corresponding operand VGPR number.
///
/// There is no need to replace actual register operands because encoding of the
/// high and low VGPRs is the same. I.e. v0 has the encoding 0x100, so does
/// v256. v1 has the encoding 0x101 and v257 has the same encoding. So high
/// VGPRs will survive until actual encoding and will result in a same actual
/// bit encoding.
///
/// As a result the pass only inserts S_SET_VGPR_MSB to provide an actual offset
/// to a VGPR address of the subsequent instructions. The InstPrinter will take
/// care of printing a low VGPR instead of a high one. In principle it
/// would be viable to print actual high VGPR numbers, but that would disagree
/// with the disassembler's printing and create a situation where asm text is
/// not deterministic.
///
/// This pass creates a convention where non-fall through basic blocks shall
/// start with all 4 MSBs zero. Otherwise a disassembly would not be readable.
/// An optimization here is possible but deemed not desirable because of the
/// readability concerns.
///
/// Consequentially the ABI is set to expect all 4 MSBs to be zero on entry.
/// The pass must run very late in the pipeline to make sure no changes to VGPR
/// operands will be made after it.
//
//===----------------------------------------------------------------------===//
#include "AMDGPULowerVGPREncoding.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/PackedVector.h"
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;
#define DEBUG_TYPE "amdgpu-lower-vgpr-encoding"
namespace {
/// Pass-manager independent implementation of the VGPR MSB lowering. Both the
/// legacy and the new pass manager wrappers construct one instance per
/// function and call run().
///
/// The object keeps per-function scanning state (current mode, the most recent
/// mode-setting instruction, and hard clause bookkeeping). Every member has a
/// default initializer so the state is well defined no matter how the object
/// is constructed.
class AMDGPULowerVGPREncoding {
  /// Number of entries in each operand table handed to computeMode().
  static constexpr unsigned OpNum = 4;
  /// Width in bits of one packed mode field.
  static constexpr unsigned BitsPerField = 2;
  /// Number of packed mode fields.
  static constexpr unsigned NumFields = 4;
  /// All-ones value of a single field.
  static constexpr unsigned FieldMask = (1 << BitsPerField) - 1;
  /// Total width in bits of the packed mode.
  static constexpr unsigned ModeWidth = NumFields * BitsPerField;
  /// All-ones value of the whole packed mode.
  static constexpr unsigned ModeMask = (1 << ModeWidth) - 1;

  using ModeType = PackedVector<unsigned, BitsPerField,
                                std::bitset<BitsPerField * NumFields>>;

  /// Bit position of the VGPR MSB field inside the S_SETREG_IMM32_B32
  /// immediate, derived from the lowest set bit of DST_VGPR_MSB.
  static constexpr unsigned VGPRMSBShift =
      llvm::countr_zero_constexpr<unsigned>(AMDGPU::Hwreg::DST_VGPR_MSB);

  /// Packed vector of NumFields fields with helpers to read it as an integer
  /// and to build an all-ones mask.
  class ModeTy : public ModeType {
  public:
    // bitset constructor will set all bits to zero
    ModeTy() : ModeType(0) {}

    /// Raw packed bits as an integer, as used for instruction immediates.
    operator int64_t() const { return raw_bits().to_ulong(); }

    /// \returns a value with every bit of every field set.
    static ModeTy fullMask() {
      ModeTy M;
      M.raw_bits().flip();
      return M;
    }
  };

public:
  /// Process \p MF. \returns true if any instruction was inserted or modified.
  bool run(MachineFunction &MF);

private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  // Current basic block.
  MachineBasicBlock *MBB = nullptr;

  /// Most recent s_set_* instruction.
  MachineInstr *MostRecentModeSet = nullptr;

  /// Current mode bits.
  ModeTy CurrentMode;

  /// Current mask of mode bits that instructions since MostRecentModeSet care
  /// about.
  ModeTy CurrentMask;

  /// Number of current hard clause instructions.
  unsigned ClauseLen = 0;

  /// Number of hard clause instructions remaining.
  unsigned ClauseRemaining = 0;

  /// Clause group breaks.
  unsigned ClauseBreaks = 0;

  /// Last hard clause instruction.
  MachineInstr *Clause = nullptr;

  /// Insert mode change before \p I. \returns true if mode was changed.
  bool setMode(ModeTy NewMode, ModeTy Mask,
               MachineBasicBlock::instr_iterator I);

  /// Reset mode to default.
  void resetMode(MachineBasicBlock::instr_iterator I) {
    setMode(ModeTy(), ModeTy::fullMask(), I);
  }

  /// If \p MO references VGPRs, return the MSBs. Otherwise, return nullopt.
  std::optional<unsigned> getMSBs(const MachineOperand &MO) const;

  /// Handle single \p MI. \return true if changed.
  bool runOnMachineInstr(MachineInstr &MI);

  /// Compute the mode and mode mask for a single \p MI given \p Ops operands
  /// bit mapping. Optionally takes second array \p Ops2 for VOPD.
  /// If provided and an operand from \p Ops is not a VGPR, then \p Ops2
  /// is checked.
  void computeMode(ModeTy &NewMode, ModeTy &Mask, MachineInstr &MI,
                   const AMDGPU::OpName Ops[OpNum],
                   const AMDGPU::OpName *Ops2 = nullptr);

  /// Check if an instruction \p I is within a clause and returns a suitable
  /// iterator to insert mode change. It may also modify the S_CLAUSE
  /// instruction to extend it or drop the clause if it cannot be adjusted.
  MachineBasicBlock::instr_iterator
  handleClause(MachineBasicBlock::instr_iterator I);

  /// Check if an instruction \p I is immediately after another program state
  /// instruction which it cannot coissue with. If so, insert before that
  /// instruction to encourage more coissuing.
  MachineBasicBlock::instr_iterator
  handleCoissue(MachineBasicBlock::instr_iterator I);

  /// Handle S_SETREG_IMM32_B32 targeting MODE register. On certain hardware,
  /// this instruction clobbers VGPR MSB bits[12:19], so we need to restore
  /// the current mode. \returns true if the instruction was modified or a
  /// new one was inserted.
  bool handleSetregMode(MachineInstr &MI);

  /// Update bits[12:19] of the imm operand in S_SETREG_IMM32_B32 to contain
  /// the VGPR MSB mode value. \returns true if the immediate was changed.
  bool updateSetregModeImm(MachineInstr &MI, int64_t ModeValue);
};
/// Make the VGPR MSB mode agree with \p NewMode on the fields selected by
/// \p Mask for the instruction at \p I.
///
/// One of three things happens:
///  - The current mode already matches on the requested fields: the fields
///    are merged into CurrentMask and nothing is emitted.
///  - The fields that differ are not in CurrentMask and there is a
///    MostRecentModeSet: that instruction's immediate is rewritten in place.
///  - Otherwise a new S_SET_VGPR_MSB is built at the insertion point returned
///    by handleClause() followed by handleCoissue().
///
/// \returns true if an instruction was created or rewritten.
bool AMDGPULowerVGPREncoding::setMode(ModeTy NewMode, ModeTy Mask,
                                      MachineBasicBlock::instr_iterator I) {
  assert((NewMode.raw_bits() & ~Mask.raw_bits()).none());

  const auto DifferingBits = NewMode.raw_bits() ^ CurrentMode.raw_bits();

  // None of the requested fields differ: only widen the set of fields in use.
  if (!(DifferingBits & Mask.raw_bits()).any()) {
    CurrentMask |= Mask;
    return false;
  }

  const bool CanPatchPrevious =
      MostRecentModeSet && (DifferingBits & CurrentMask.raw_bits()).none();

  if (!CanPatchPrevious) {
    // Record previous mode into high 8 bits of the immediate.
    int64_t PrevModeBits = CurrentMode << ModeWidth;

    I = handleClause(I);
    I = handleCoissue(I);
    MostRecentModeSet = BuildMI(*MBB, I, {}, TII->get(AMDGPU::S_SET_VGPR_MSB))
                            .addImm(NewMode | PrevModeBits);

    CurrentMode = NewMode;
    CurrentMask = Mask;
    return true;
  }

  // Fold the new requirement into the existing mode-setting instruction. It
  // can be either S_SET_VGPR_MSB or S_SETREG_IMM32_B32 (with Size <= 12).
  CurrentMode |= NewMode;
  CurrentMask |= Mask;

  if (MostRecentModeSet->getOpcode() != AMDGPU::S_SET_VGPR_MSB) {
    assert(MostRecentModeSet->getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
           "unexpected MostRecentModeSet opcode");
    updateSetregModeImm(*MostRecentModeSet, CurrentMode);
    return true;
  }

  // Keep the "old mode" bits already stored in the upper part of the
  // immediate and replace only the low mode bits.
  MachineOperand &ImmOp = MostRecentModeSet->getOperand(0);
  int64_t CarriedOldBits = ImmOp.getImm() & (ModeMask << ModeWidth);
  ImmOp.setImm(CurrentMode | CarriedOldBits);
  return true;
}
std::optional<unsigned>
AMDGPULowerVGPREncoding::getMSBs(const MachineOperand &MO) const {
if (!MO.isReg())
return std::nullopt;
MCRegister Reg = MO.getReg();
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
if (!RC || !TRI->isVGPRClass(RC))
return std::nullopt;
unsigned Idx = TRI->getHWRegIndex(Reg);
return Idx >> 8;
}
/// Fill \p NewMode and \p Mask with the MSB requirements of \p MI.
///
/// For each of the OpNum slots the operand named by \p Ops is inspected; when
/// it yields no VGPR MSBs and \p Ops2 is given, the operand named by \p Ops2
/// for the same slot is tried instead. A slot that yields MSBs gets its field
/// in \p NewMode set to those bits and its field in \p Mask set to FieldMask;
/// every other slot stays zero in both outputs.
void AMDGPULowerVGPREncoding::computeMode(ModeTy &NewMode, ModeTy &Mask,
                                          MachineInstr &MI,
                                          const AMDGPU::OpName Ops[OpNum],
                                          const AMDGPU::OpName *Ops2) {
  NewMode = {};
  Mask = {};

  for (unsigned Slot = 0; Slot != OpNum; ++Slot) {
    MachineOperand *Operand = TII->getNamedOperand(MI, Ops[Slot]);

    std::optional<unsigned> HighBits;
    if (Operand)
      HighBits = getMSBs(*Operand);

#if !defined(NDEBUG)
    // Debug-only consistency check: when both tables name a VGPR for this
    // slot, the two must agree on the MSBs.
    if (HighBits.has_value() && Ops2) {
      if (auto *Other = TII->getNamedOperand(MI, Ops2[Slot])) {
        std::optional<unsigned> OtherHighBits = getMSBs(*Other);
        if (OtherHighBits.has_value() && HighBits != OtherHighBits)
          llvm_unreachable("Invalid VOPD pair was created");
      }
    }
#endif

    // Fall back to the second table when the first gave nothing.
    if (!HighBits.has_value() && Ops2) {
      Operand = TII->getNamedOperand(MI, Ops2[Slot]);
      if (Operand)
        HighBits = getMSBs(*Operand);
    }

    if (!HighBits.has_value())
      continue;

    // Skip tied uses of src2 of VOP2, these will be handled along with defs and
    // only vdst bit affects these operands. We cannot skip tied uses of VOP3,
    // these uses are real even if must match the vdst.
    const bool IsTiedSrc2Use = Ops[Slot] == AMDGPU::OpName::src2 &&
                               !Operand->isDef() && Operand->isTied();
    if (IsTiedSrc2Use &&
        (SIInstrInfo::isVOP2(MI) ||
         (SIInstrInfo::isVOP3(MI) &&
          TII->hasVALU32BitEncoding(MI.getOpcode()))))
      continue;

    NewMode[Slot] = HighBits.value();
    Mask[Slot] = FieldMask;
  }
}
/// Compute the mode \p MI requires and apply it with setMode().
///
/// Instructions for which no operand table is available are left alone; in
/// asserts builds such an instruction must have no VGPR uses unless it is a
/// meta or pseudo instruction.
///
/// \returns the result of setMode(), or false when there is no operand table.
bool AMDGPULowerVGPREncoding::runOnMachineInstr(MachineInstr &MI) {
  auto Tables = AMDGPU::getVGPRLoweringOperandTables(MI.getDesc());

  if (!Tables.first) {
    assert(!TII->hasVGPRUses(MI) || MI.isMetaInstruction() || MI.isPseudo());
    return false;
  }

  ModeTy RequiredMode, RequiredMask;
  computeMode(RequiredMode, RequiredMask, MI, Tables.first, Tables.second);
  return setMode(RequiredMode, RequiredMask, MI.getIterator());
}
/// Choose where a mode change intended for \p I may go with respect to the
/// hard clause currently being scanned, adjusting or dropping the S_CLAUSE
/// when needed.
///
/// \returns \p I unchanged, except when no clause instruction has been
/// consumed yet, in which case the node preceding the S_CLAUSE is returned.
MachineBasicBlock::instr_iterator
AMDGPULowerVGPREncoding::handleClause(MachineBasicBlock::instr_iterator I) {
  // Not inside a clause: nothing to adjust.
  if (ClauseRemaining == 0)
    return I;

  // A clause cannot start with a special instruction, place it right before
  // the clause.
  const bool AtClauseStart = ClauseRemaining == ClauseLen;
  if (AtClauseStart) {
    MachineBasicBlock::instr_iterator BeforeClause =
        Clause->getPrevNode()->getIterator();
    assert(BeforeClause->isBundle());
    return BeforeClause;
  }

  // If a clause defines breaks each group cannot start with a mode change.
  // just drop the clause.
  if (ClauseBreaks != 0) {
    Clause->eraseFromBundle();
    ClauseRemaining = 0;
    return I;
  }

  // Otherwise adjust a number of instructions in the clause if it fits.
  // If it does not clause will just become shorter. Since the length
  // recorded in the clause is one less, increment the length after the
  // update. Note that SIMM16[5:0] must be 1-62, not 0 or 63.
  if (ClauseLen < 63) {
    MachineOperand &LengthOp = Clause->getOperand(0);
    LengthOp.setImm(ClauseLen | (ClauseBreaks << 8));
  }
  ++ClauseLen;
  return I;
}
/// Move the insertion point \p I for a mode change backwards over the run of
/// program state instructions that immediately precedes it.
///
/// An instruction counts as program state here when it is a barrier, a wait
/// count instruction, or a program state SALU other than S_SET_VGPR_MSB.
///
/// The returned iterator always refers either to \p I itself or to one of the
/// program state instructions directly in front of it. It never moves past an
/// instruction of any other kind, so no earlier instruction ends up executing
/// under the new mode.
///
/// NOTE(review): if S_SETREG_IMM32_B32 is classified as a program state SALU,
/// a mode change may be hoisted above a setreg that overwrites the MSB bits;
/// confirm against SIInstrInfo::isProgramStateSALU.
///
/// \returns \p I if it is the end iterator, the first instruction of the
/// block, or not preceded by a program state instruction; otherwise the first
/// instruction of the preceding contiguous run of program state instructions.
MachineBasicBlock::instr_iterator
AMDGPULowerVGPREncoding::handleCoissue(MachineBasicBlock::instr_iterator I) {
  if (I.isEnd())
    return I;

  auto isProgramStateSALU = [this](MachineInstr &MI) {
    const unsigned Opc = MI.getOpcode();
    return TII->isBarrier(Opc) || TII->isWaitcnt(Opc) ||
           (SIInstrInfo::isProgramStateSALU(MI) &&
            Opc != AMDGPU::S_SET_VGPR_MSB);
  };

  MachineBasicBlock *Parent = I->getParent();

  // Step back only while the instruction directly in front of the insertion
  // point is program state. Stopping on that instruction (rather than on the
  // one before it) keeps the mode change after every unrelated instruction.
  while (I != Parent->begin() && isProgramStateSALU(*std::prev(I)))
    --I;

  return I;
}
/// Translate an 8-bit mode from the S_SET_VGPR_MSB field order to the MODE
/// register field order.
///   S_SET_VGPR_MSB: (src0[0-1], src1[2-3], src2[4-5], dst[6-7])
///   MODE register:  (dst[0-1], src0[2-3], src1[4-5], src2[6-7])
/// The two layouts differ by a left rotation by 2 bits of the 8-bit value.
static int64_t convertModeToSetregFormat(int64_t Mode) {
  assert(isUInt<8>(Mode) && "Mode expected to be 8-bit");
  const uint8_t MSBLayout = static_cast<uint8_t>(Mode);
  const uint8_t ModeRegLayout = llvm::rotl<uint8_t>(MSBLayout, /*R=*/2);
  return ModeRegLayout;
}
/// Rewrite the VGPR MSB field of the `imm` operand of the S_SETREG_IMM32_B32
/// instruction \p MI so that it holds \p ModeValue.
///
/// \param MI        Must be an S_SETREG_IMM32_B32 with an `imm` operand.
/// \param ModeValue Mode in S_SET_VGPR_MSB field order; it is converted to
///                  MODE register field order before being stored.
/// \returns true if the immediate value changed.
bool AMDGPULowerVGPREncoding::updateSetregModeImm(MachineInstr &MI,
                                                  int64_t ModeValue) {
  assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32);

  // Convert from S_SET_VGPR_MSB format to MODE register format
  int64_t SetregMode = convertModeToSetregFormat(ModeValue);

  MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
  // Same check handleSetregMode() performs before touching this operand.
  assert(ImmOp && "ImmOp must be present");

  int64_t OldImm = ImmOp->getImm();
  // Clear the VGPR MSB field, then place the converted mode at its position.
  int64_t NewImm =
      (OldImm & ~AMDGPU::Hwreg::VGPR_MSB_MASK) | (SetregMode << VGPRMSBShift);
  ImmOp->setImm(NewImm);
  return NewImm != OldImm;
}
/// Keep the VGPR MSB mode intact across an S_SETREG_IMM32_B32 that writes the
/// MODE register.
///
/// Instructions targeting any other hardware register are ignored. For MODE
/// writes the decoded field size decides what happens:
///  - Size <= 12: imm32[12:19] is patched to the current mode and \p MI
///    becomes MostRecentModeSet.
///  - Size > 12 and imm32[12:19] already equals the current mode: nothing is
///    changed, and MostRecentModeSet is cleared.
///  - Size > 12 otherwise: an S_SET_VGPR_MSB with the current mode is inserted
///    right after \p MI and becomes MostRecentModeSet.
///
/// \returns true if \p MI was modified or a new instruction was inserted.
bool AMDGPULowerVGPREncoding::handleSetregMode(MachineInstr &MI) {
  using namespace AMDGPU::Hwreg;

  assert(MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
         "only S_SETREG_IMM32_B32 needs to be handled");

  MachineOperand *SIMM16Op = TII->getNamedOperand(MI, AMDGPU::OpName::simm16);
  assert(SIMM16Op && "SIMM16Op must be present");

  auto [HwRegId, Offset, Size] = HwregEncoding::decode(SIMM16Op->getImm());
  (void)Offset;
  if (HwRegId != ID_MODE)
    return false;

  int64_t MSBFormatMode = static_cast<int64_t>(CurrentMode);

  if (Size > VGPRMSBShift) {
    // The original instruction uses bits beyond 11, so we cannot arbitrarily
    // modify imm32[12:19]. Check if it already matches VGPR MSBs. Note:
    // imm32[12:19] is in MODE register format, while MSBFormatMode is in
    // S_SET_VGPR_MSB format, so we need to convert before comparing.
    MachineOperand *ImmOp = TII->getNamedOperand(MI, AMDGPU::OpName::imm);
    assert(ImmOp && "ImmOp must be present");

    int64_t EncodedMSBs = (ImmOp->getImm() & VGPR_MSB_MASK) >> VGPRMSBShift;
    int64_t ExpectedMSBs = convertModeToSetregFormat(MSBFormatMode);

    if (EncodedMSBs != ExpectedMSBs) {
      // imm32[12:19] doesn't match VGPR MSBs - insert s_set_vgpr_msb after
      // the original instruction to restore the correct value.
      MachineBasicBlock::iterator InsertPt = std::next(MI.getIterator());
      MostRecentModeSet = BuildMI(*MBB, InsertPt, MI.getDebugLoc(),
                                  TII->get(AMDGPU::S_SET_VGPR_MSB))
                              .addImm(MSBFormatMode);
      return true;
    }

    // Already correct, but we must invalidate MostRecentModeSet because this
    // instruction will overwrite mode[12:19]. We can't update this instruction
    // via piggybacking (bits[12:19] are meaningful), so if CurrentMode changes,
    // a new s_set_vgpr_msb will be inserted after this instruction.
    MostRecentModeSet = nullptr;
    return false;
  }

  // Size <= 12 - the original instruction uses imm32[0:Size-1], so
  // imm32[12:19] is unused. Safe to set imm32[12:19] to the correct VGPR
  // MSBs. This instruction now acts as MostRecentModeSet so it can be updated
  // if CurrentMode changes via piggybacking.
  MostRecentModeSet = &MI;
  return updateSetregModeImm(MI, MSBFormatMode);
}
/// Scan every instruction of \p MF and insert or patch mode-setting
/// instructions so each instruction sees the VGPR MSBs it requires.
///
/// Does nothing when the subtarget does not report 1024 addressable VGPRs.
/// The mode is reset before terminators and calls (other than S_ENDPGM and
/// S_ENDPGM_SAVED), before inline asm with VGPR uses, and at the end of each
/// basic block.
///
/// \returns true if handleSetregMode() or runOnMachineInstr() reported a
/// change.
bool AMDGPULowerVGPREncoding::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.has1024AddressableVGPRs())
    return false;

  TII = ST.getInstrInfo();
  TRI = ST.getRegisterInfo();

  // Start from a clean state: no open clause and an all-zero mode.
  ClauseRemaining = 0;
  ClauseLen = 0;
  CurrentMode.reset();
  CurrentMask.reset();

  bool Changed = false;
  for (MachineBasicBlock &Block : MF) {
    MostRecentModeSet = nullptr;
    MBB = &Block;

    for (MachineInstr &MI : llvm::make_early_inc_range(Block.instrs())) {
      if (MI.isMetaInstruction())
        continue;

      if (MI.isTerminator() || MI.isCall()) {
        const bool EndsProgram = MI.getOpcode() == AMDGPU::S_ENDPGM ||
                                 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED;
        // At program end only the tracked mode is cleared; no instruction is
        // emitted.
        if (EndsProgram)
          CurrentMode.reset();
        else
          resetMode(MI.getIterator());
        continue;
      }

      if (MI.isInlineAsm()) {
        if (TII->hasVGPRUses(MI))
          resetMode(MI.getIterator());
        continue;
      }

      if (MI.getOpcode() == AMDGPU::S_CLAUSE) {
        assert(!ClauseRemaining && "Nested clauses are not supported");
        // Bits [11:8] of the immediate are the group breaks; bits [5:0] hold
        // the clause length minus one.
        const unsigned ClauseImm = MI.getOperand(0).getImm();
        ClauseBreaks = (ClauseImm >> 8) & 15;
        ClauseRemaining = (ClauseImm & 63) + 1;
        ClauseLen = ClauseRemaining;
        Clause = &MI;
        continue;
      }

      if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 &&
          ST.hasSetregVGPRMSBFixup()) {
        Changed |= handleSetregMode(MI);
        continue;
      }

      Changed |= runOnMachineInstr(MI);

      if (ClauseRemaining != 0)
        --ClauseRemaining;
    }

    // Reset the mode if we are falling through.
    resetMode(Block.instr_end());
  }

  return Changed;
}
/// Legacy pass manager wrapper: builds a fresh AMDGPULowerVGPREncoding for
/// each function and forwards to its run() method.
class AMDGPULowerVGPREncodingLegacy : public MachineFunctionPass {
public:
  static char ID;

  AMDGPULowerVGPREncodingLegacy() : MachineFunctionPass(ID) {}

  /// Declares that the CFG is preserved, then defers to the base class.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  /// \returns true if \p MF was modified.
  bool runOnMachineFunction(MachineFunction &MF) override {
    const bool Modified = AMDGPULowerVGPREncoding().run(MF);
    return Modified;
  }
};
} // namespace
// Definition of the legacy pass's static ID member declared in the class.
char AMDGPULowerVGPREncodingLegacy::ID = 0;
// Reference to the same ID object, defined in the llvm namespace so it can be
// named from outside this file's anonymous namespace.
char &llvm::AMDGPULowerVGPREncodingLegacyID = AMDGPULowerVGPREncodingLegacy::ID;
// Registers the legacy pass under the DEBUG_TYPE name
// ("amdgpu-lower-vgpr-encoding") with a human-readable description.
INITIALIZE_PASS(AMDGPULowerVGPREncodingLegacy, DEBUG_TYPE,
"AMDGPU Lower VGPR Encoding", false, false)
/// New pass manager entry point. Runs the lowering on \p MF; when nothing was
/// changed all analyses are preserved, otherwise the machine function pass
/// default set plus the CFG analyses are preserved.
PreservedAnalyses
AMDGPULowerVGPREncodingPass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  const bool Modified = AMDGPULowerVGPREncoding().run(MF);
  if (Modified)
    return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
  return PreservedAnalyses::all();
}