blob: 5988403c0a29cb3a51e75c91e6c67ce3d5b78968 [file] [log] [blame]
//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
///
/// \par
///
/// AMDGPU has unique register bank constraints that require special high level
/// strategies to deal with. There are two main true physical register banks
/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
/// sort of pseudo-register bank needed to represent SGPRs used in a vector
/// boolean context. There is also the AGPR bank, which is a special purpose
/// physical register bank present on some subtargets.
///
/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
/// be uniform. It is generally not valid to legalize operands by inserting
/// copies as on other targets. Operations which require uniform, SGPR operands
/// generally require scalarization by repeatedly executing the instruction,
/// activating each set of lanes using a unique set of input values. This is
/// referred to as a waterfall loop.
///
/// \par Booleans
///
/// Booleans (s1 values) require special consideration. A vector compare result
/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
/// register. These are represented with the VCC bank. During selection, we need
/// to be able to unambiguously go back from a register class to a register
/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
/// bank, we need to know the use context type. An SGPR s1 value always means a
/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
/// a 32-bit virtual register. Taken together, this means we need to adjust the
/// type of boolean operations to be regbank legal. All SALU booleans need to be
/// widened to 32-bits, and all VALU booleans need to be s1 values.
///
/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
/// bank. A non-boolean source (such as a truncate from a 1-bit load from
/// memory) will require a copy to the VCC bank which will require clearing the
/// high bits and inserting a compare.
///
/// \par Constant bus restriction
///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination of
/// operands should be legal as an SGPR, but this is contextually dependent on
/// the SGPR operands all being the same register. It is therefore optimal to
/// choose the SGPR with the most uses to minimize the number of copies.
///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
///
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"
// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"
using namespace llvm;
using namespace MIPatternMatch;
namespace {
// Observer to apply a register bank to new registers created by LegalizerHelper.
//
// Instructions reported through createdInstr() are only recorded; their
// registers get a bank assigned by applyBank() when this observer is
// destroyed.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
const AMDGPURegisterBankInfo &RBI;
MachineRegisterInfo &MRI;
// Bank assigned to registers that have neither a register class nor a bank.
const RegisterBank *NewBank;
// Instructions seen by createdInstr(), processed in the destructor.
SmallVector<MachineInstr *, 4> NewInsts;
public:
ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
MachineRegisterInfo &MRI_, const RegisterBank *RB)
: RBI(RBI_), MRI(MRI_), NewBank(RB) {}
// Bank assignment is deferred to destruction because an instruction has no
// operands yet at the time createdInstr() is called.
~ApplyRegBankMapping() {
for (MachineInstr *MI : NewInsts)
applyBank(*MI);
}
/// Set any registers of \p MI that don't have a register class or bank yet
/// to NewBank (s1 registers are put in the VCC bank instead).
///
/// G_ANYEXT/G_ZEXT/G_SEXT whose source is in the VCC bank are replaced by a
/// G_SELECT between two constants; \p MI is erased in that case.
void applyBank(MachineInstr &MI) {
const unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
Opc == AMDGPU::G_SEXT) {
// LegalizerHelper wants to use the basic legalization artifacts when
// widening etc. We don't handle selection with vcc in artifact sources,
// so we need to use a select instead to handle these properly.
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
if (SrcBank == &AMDGPU::VCCRegBank) {
const LLT S32 = LLT::scalar(32);
assert(MRI.getType(SrcReg) == LLT::scalar(1));
assert(MRI.getType(DstReg) == S32);
assert(NewBank == &AMDGPU::VGPRRegBank);
// Replace the extension with a select, which really uses the boolean
// source.
MachineIRBuilder B(MI);
// The "true" value is all ones for a sign extension and 1 otherwise.
auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
auto False = B.buildConstant(S32, 0);
B.buildSelect(DstReg, SrcReg, True, False);
MRI.setRegBank(True.getReg(0), *NewBank);
MRI.setRegBank(False.getReg(0), *NewBank);
// MI must not be touched after this point; only DstReg is used below.
MI.eraseFromParent();
}
assert(!MRI.getRegClassOrRegBank(DstReg));
MRI.setRegBank(DstReg, *NewBank);
return;
}
#ifndef NDEBUG
// Debug-only sanity check: a G_TRUNC result is never expected in the VCC
// bank.
if (Opc == AMDGPU::G_TRUNC) {
Register DstReg = MI.getOperand(0).getReg();
const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
assert(DstBank != &AMDGPU::VCCRegBank);
}
#endif
for (MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;
// We may see physical registers if building a real MI
Register Reg = Op.getReg();
// Leave physical registers and registers that already have a class or bank
// untouched.
if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
continue;
const RegisterBank *RB = NewBank;
// s1 registers are treated as vector booleans and go to the VCC bank.
if (MRI.getType(Reg) == LLT::scalar(1)) {
assert(NewBank == &AMDGPU::VGPRRegBank &&
"s1 operands should only be used for vector bools");
assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
MI.getOpcode() != AMDGPU::G_ANYEXT) &&
"not expecting legalization artifacts here");
RB = &AMDGPU::VCCRegBank;
}
MRI.setRegBank(Reg, *RB);
}
}
// Erasures are intentionally ignored.
void erasingInstr(MachineInstr &MI) override {}
void createdInstr(MachineInstr &MI) override {
// At this point, the instruction was just inserted and has no operands.
NewInsts.push_back(&MI);
}
// Nothing to do before an instruction is changed.
void changingInstr(MachineInstr &MI) override {}
void changedInstr(MachineInstr &MI) override {
// FIXME: In principle we should probably add the instruction to NewInsts,
// but the way the LegalizerHelper uses the observer, we will always see the
// registers we need to set the regbank on also referenced in a new
// instruction.
}
};
}
/// Construct the register bank info for subtarget \p ST, caching the
/// subtarget's register info and instruction info.
///
/// The first construction also checks (in assert-enabled builds) that the
/// generated register bank table agrees with the AMDGPU bank objects.
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {
  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegisterBankFlag;

  // Deliberately a plain local rather than a function-local static: a static
  // closure would keep the `this` of the first object ever constructed in
  // static storage, where it dangles once that object is destroyed. The
  // once_flag alone guarantees the check runs a single time.
  auto InitializeRegisterBankOnce = [this]() {
    assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
           &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
           &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
    // Silence the unused-capture warning when asserts are compiled out.
    (void)this;
  };

  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}
/// Return true if \p Bank is the VGPR or the AGPR register bank.
static bool isVectorRegisterBank(const RegisterBank &Bank) {
  switch (Bank.getID()) {
  case AMDGPU::VGPRRegBankID:
  case AMDGPU::AGPRRegBankID:
    return true;
  default:
    return false;
  }
}
/// Cost of copying a \p Size bit value from bank \p Src to bank \p Dst.
///
/// Returns the maximum unsigned value for copies that must never be chosen,
/// a fixed cost for AGPR to AGPR copies, and the generic RegisterBankInfo cost
/// otherwise.
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  constexpr unsigned Impossible = std::numeric_limits<unsigned>::max();
  const unsigned DstID = Dst.getID();
  const unsigned SrcID = Src.getID();

  if (DstID == AMDGPU::SGPRRegBankID) {
    // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
    if (isVectorRegisterBank(Src) || SrcID == AMDGPU::VCCRegBankID)
      return Impossible;

    // Bool values are tricky, because the meaning is based on context. The SCC
    // and VCC banks are for the natural scalar and vector conditions produced
    // by a compare.
    //
    // Legalization doesn't know about the necessary context, so an s1 use may
    // have been a truncate from an arbitrary value, in which case a copy
    // (lowered as a compare with 0) needs to be inserted. The vector and VCC
    // sources were already rejected above, so only the SGPR source remains.
    if (Size == 1 && SrcID == AMDGPU::SGPRRegBankID)
      return Impossible;
  }

  // There is no direct copy between AGPRs.
  if (DstID == AMDGPU::AGPRRegBankID && SrcID == AMDGPU::AGPRRegBankID)
    return 4;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}
/// Cost of breaking the value described by \p ValMapping into its parts.
/// \p CurBank is not consulted by this implementation.
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
    const ValueMapping &ValMapping,
    const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  const bool IsExpensive = ValMapping.NumBreakDowns >= 2 ||
                           ValMapping.BreakDown[0].Length >= 64;
  if (IsExpensive)
    return 10; // This is expensive.

  // NOTE(review): this point is only reached with NumBreakDowns < 2, yet the
  // assertion below requires NumBreakDowns == 2, so it cannot hold here.
  // Presumably the early return above was added later and left this check
  // stale -- confirm the intended condition.
  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.
  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}
/// Map register class \p RC (holding a value of type \p Ty) to its register
/// bank.
const RegisterBank &
AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
                                               LLT Ty) const {
  if (&RC == &AMDGPU::SReg_1RegClass)
    return AMDGPU::VCCRegBank;

  // Everything that is not an SGPR class is either an AGPR or a VGPR.
  if (!TRI->isSGPRClass(&RC)) {
    if (TRI->isAGPRClass(&RC))
      return AMDGPU::AGPRRegBank;
    return AMDGPU::VGPRRegBank;
  }

  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
  // VCC-like use.
  //
  // FIXME: An invalid type probably came from a copy from a physical register,
  // which should be inferable from the copied to-type. We don't have many
  // boolean physical register constraints so just assume a normal SGPR for
  // now.
  if (Ty.isValid() && Ty == LLT::scalar(1))
    return AMDGPU::VCCRegBank;
  return AMDGPU::SGPRRegBank;
}
/// Build one alternative instruction mapping for \p MI per row of \p Table.
///
/// \p RegSrcOpIdx lists the operand indices of \p MI that the table columns
/// describe. Explicit defs default to the VGPR bank unless a table column
/// overrides them; operands covered by neither are left unmapped (null).
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {
  SmallVector<const ValueMapping *, 10> OpdMappings(MI.getNumOperands());

  // Size in bits of every operand the table describes.
  unsigned TableOpSizes[NumOps];
  for (unsigned Col = 0; Col != NumOps; ++Col) {
    const Register TableReg = MI.getOperand(RegSrcOpIdx[Col]).getReg();
    TableOpSizes[Col] = getSizeInBits(TableReg, MRI, *TRI);
  }

  const unsigned NumDefs = MI.getNumExplicitDefs();
  for (unsigned DefIdx = 0; DefIdx < NumDefs; ++DefIdx) {
    const unsigned DefSize =
        getSizeInBits(MI.getOperand(DefIdx).getReg(), MRI, *TRI);
    OpdMappings[DefIdx] =
        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DefSize);
  }

  InstructionMappings Result;

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned NextID = 2;
  for (const OpRegBankEntry<NumOps> &Row : Table) {
    for (unsigned Col = 0; Col != NumOps; ++Col)
      OpdMappings[RegSrcOpIdx[Col]] =
          AMDGPU::getValueMapping(Row.RegBanks[Col], TableOpSizes[Col]);

    Result.push_back(&getInstructionMapping(NextID, Row.Cost,
                                            getOperandsMapping(OpdMappings),
                                            OpdMappings.size()));
    ++NextID;
  }

  return Result;
}
/// Alternative register bank mappings for a side-effect free intrinsic \p MI.
///
/// amdgcn_readlane and amdgcn_writelane get table-driven alternatives; every
/// other intrinsic is forwarded to the generic RegisterBankInfo
/// implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  const auto IID = MI.getIntrinsicID();

  if (IID == Intrinsic::amdgcn_readlane) {
    static const OpRegBankEntry<3> ReadlaneTable[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Need a readfirstlane for the index.
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    // Operand indices described by the table columns, in column order.
    const std::array<unsigned, 3> MappedOps = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, MappedOps,
                                  makeArrayRef(ReadlaneTable));
  }

  if (IID == Intrinsic::amdgcn_writelane) {
    static const OpRegBankEntry<4> WritelaneTable[4] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need readfirstlane of first op
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of second op
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },

      // Need readfirstlane of both ops
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
    };

    // Operand indices described by the table columns, in column order.
    const std::array<unsigned, 4> MappedOps = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, MappedOps,
                                  makeArrayRef(WritelaneTable));
  }

  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
/// Alternative register bank mappings for an intrinsic with side effects.
///
/// Handles amdgcn_s_buffer_load, amdgcn_ds_ordered_add/swap and
/// amdgcn_s_sendmsg/sendmsghalt with table-driven alternatives; every other
/// intrinsic is forwarded to the generic RegisterBankInfo implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  const auto IID = MI.getIntrinsicID();

  if (IID == Intrinsic::amdgcn_s_buffer_load) {
    static const OpRegBankEntry<2> BufferLoadTable[4] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },

      // Only need 1 register in loop
      { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },

      // Have to waterfall the resource.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> MappedOps = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, MappedOps,
                                  makeArrayRef(BufferLoadTable));
  }

  if (IID == Intrinsic::amdgcn_ds_ordered_add ||
      IID == Intrinsic::amdgcn_ds_ordered_swap) {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> OrderedTable[2] = {
      // Perfectly legal.
      { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },

      // Need a readfirstlane for m0
      { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
    };

    const std::array<unsigned, 3> MappedOps = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, MappedOps,
                                  makeArrayRef(OrderedTable));
  }

  if (IID == Intrinsic::amdgcn_s_sendmsg ||
      IID == Intrinsic::amdgcn_s_sendmsghalt) {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> SendMsgTable[2] = {
      // Perfectly legal.
      { { AMDGPU::SGPRRegBankID }, 1 },

      // Need readlane
      { { AMDGPU::VGPRRegBankID }, 3 }
    };

    const std::array<unsigned, 1> MappedOps = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, MappedOps,
                                  makeArrayRef(SendMsgTable));
  }

  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
return I && I->getMetadata("amdgpu.noclobber");
}
// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isScalarLoadLegal(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())
return false;
const MachineMemOperand *MMO = *MI.memoperands_begin();
const unsigned AS = MMO->getAddrSpace();
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
// Require 4-byte alignment.
return MMO->getAlign() >= Align(4) &&
// Can't do a scalar atomic load.
!MMO->isAtomic() &&
// Don't use scalar loads for volatile accesses to non-constant address
// spaces.
(IsConst || !MMO->isVolatile()) &&
// Memory must be known constant, or not written before this load.
(IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
AMDGPUInstrInfo::isUniformMMO(MMO);
}
/// Return the alternative register bank mappings available for \p MI.
///
/// Opcodes handled here get explicit scalar and vector (or table-driven)
/// alternatives; anything else, including the cases that break out of the
/// switch, is forwarded to the generic RegisterBankInfo implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
InstructionMappings AltMappings;
switch (MI.getOpcode()) {
case TargetOpcode::G_CONSTANT: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
// A 1-bit constant may additionally live in the VCC bank.
if (Size == 1) {
static const OpRegBankEntry<1> Table[3] = {
{ { AMDGPU::VGPRRegBankID }, 1 },
{ { AMDGPU::SGPRRegBankID }, 1 },
{ { AMDGPU::VCCRegBankID }, 1 }
};
return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
}
// Wider constants share the VGPR/SGPR table of the cases below.
LLVM_FALLTHROUGH;
}
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_FRAME_INDEX:
case TargetOpcode::G_GLOBAL_VALUE: {
static const OpRegBankEntry<1> Table[2] = {
{ { AMDGPU::VGPRRegBankID }, 1 },
{ { AMDGPU::SGPRRegBankID }, 1 }
};
return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
}
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
// 1-bit operations: either a 32-bit SGPR mapping or an all-VCC mapping.
if (Size == 1) {
// s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
const InstructionMapping &SCCMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
3); // Num Operands
AltMappings.push_back(&SCCMapping);
const InstructionMapping &VCCMapping0 = getInstructionMapping(
2, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&VCCMapping0);
return AltMappings;
}
// Only 64-bit operations get explicit alternatives; other sizes use the
// generic implementation after the switch.
if (Size != 64)
break;
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&SSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(
2, 2, getOperandsMapping(
{AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&VVMapping);
// Note: the mappings collected above are not returned here; control leaves
// the switch and the generic implementation's result is returned instead.
break;
}
case TargetOpcode::G_LOAD:
case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_SEXTLOAD: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
unsigned PtrSize = PtrTy.getSizeInBits();
unsigned AS = PtrTy.getAddressSpace();
// The all-SGPR mapping is only offered outside the local, region and private
// address spaces, and only when isScalarLoadLegal() accepts the load.
if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
AS != AMDGPUAS::PRIVATE_ADDRESS) &&
isScalarLoadLegal(MI)) {
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&SSMapping);
}
const InstructionMapping &VVMapping = getInstructionMapping(
2, 1,
getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);
// It may be possible to have a vgpr = load sgpr mapping here, because
// the mubuf instructions support this kind of load, but probably for only
// gfx7 and older. However, the addressing mode matching in the instruction
// selector should be able to do a better job of detecting and selecting
// these kinds of loads from the vgpr = load vgpr mapping.
return AltMappings;
}
case TargetOpcode::G_SELECT: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
// Scalar form: every operand, including the 1-bit condition, in SGPRs.
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&SSMapping);
// Vector form: values in VGPRs, condition in the VCC bank.
const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&VVMapping);
return AltMappings;
}
case TargetOpcode::G_UADDE:
case TargetOpcode::G_USUBE:
case TargetOpcode::G_SADDE:
case TargetOpcode::G_SSUBE: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
// Scalar form: values and both 1-bit operands in SGPRs.
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
5); // Num Operands
AltMappings.push_back(&SSMapping);
// Vector form: values in VGPRs, both 1-bit operands in the VCC bank.
const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
5); // Num Operands
AltMappings.push_back(&VVMapping);
return AltMappings;
}
case AMDGPU::G_BRCOND: {
assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
// NOTE(review): both alternatives below are registered with mapping ID 1,
// whereas the other cases in this function number theirs 1 and 2 -- confirm
// this is intended.
// TODO: Change type to 32 for scalar
const InstructionMapping &SMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
2); // Num Operands
AltMappings.push_back(&SMapping);
const InstructionMapping &VMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
2); // Num Operands
AltMappings.push_back(&VMapping);
return AltMappings;
}
case AMDGPU::G_INTRINSIC:
return getInstrAlternativeMappingsIntrinsic(MI, MRI);
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
default:
break;
}
return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
/// Split \p Reg into two registers of type \p HalfTy (which must be 32 bits
/// wide) with a G_UNMERGE_VALUES built through \p B.
///
/// The two new registers are given the register bank of \p Reg and appended
/// to \p Regs, low part first.
void AMDGPURegisterBankInfo::split64BitValueForMapping(
    MachineIRBuilder &B,
    SmallVector<Register, 2> &Regs,
    LLT HalfTy,
    Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);

  MachineRegisterInfo &RegInfo = *B.getMRI();
  const Register LoHalf = RegInfo.createGenericVirtualRegister(HalfTy);
  const Register HiHalf = RegInfo.createGenericVirtualRegister(HalfTy);

  // Both halves inherit the bank of the register being split.
  const RegisterBank *SrcBank = getRegBank(Reg, RegInfo, *TRI);
  for (Register Half : {LoHalf, HiHalf}) {
    RegInfo.setRegBank(Half, *SrcBank);
    Regs.push_back(Half);
  }

  auto Unmerge = B.buildInstr(AMDGPU::G_UNMERGE_VALUES);
  Unmerge.addDef(LoHalf);
  Unmerge.addDef(HiHalf);
  Unmerge.addUse(Reg);
}
/// Give every register in \p Regs the type \p NewTy. The new type is expected
/// to have the same size in bits as the type it replaces (asserted).
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (unsigned Idx = 0, NumRegs = Regs.size(); Idx != NumRegs; ++Idx) {
    const Register Current = Regs[Idx];
    assert(MRI.getType(Current).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(Current, NewTy);
  }
}
/// Return a type half the size of \p Ty: a scalar of half the bit width for
/// scalars, or a vector (or scalar) with half the element count for vectors.
static LLT getHalfSizedType(LLT Ty) {
  if (!Ty.isVector()) {
    const unsigned ScalarBits = Ty.getScalarSizeInBits();
    assert(ScalarBits % 2 == 0);
    return LLT::scalar(ScalarBits / 2);
  }

  const auto NumElts = Ty.getElementCount();
  assert(NumElts.isKnownMultipleOf(2));
  return LLT::scalarOrVector(NumElts.divideCoefficientBy(2),
                             Ty.getElementType());
}
/// Legalize the instructions in \p Range where the registers in
/// \p SGPROperandRegs must be SGPRs. If any of the required SGPR operands are
/// VGPRs, perform a waterfall loop to execute the instructions for each unique
/// combination of values in all lanes in the wave. The block will be split
/// such that the rest of the instructions are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
/// Enable Lane, Disable all other lanes
/// SGPR = read SGPR value for current lane from VGPR
/// VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in comparing the values across lanes to
/// identify the unique values used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MachineIRBuilder &B,
iterator_range<MachineBasicBlock::iterator> Range,
SmallSet<Register, 4> &SGPROperandRegs,
MachineRegisterInfo &MRI) const {
SmallVector<Register, 4> ResultRegs;
SmallVector<Register, 4> InitResultRegs;
SmallVector<Register, 4> PhiRegs;
// Track use registers which have already been expanded with a readfirstlane
// sequence. This may have multiple uses if moving a sequence.
DenseMap<Register, Register> WaterfalledRegMap;
MachineBasicBlock &MBB = B.getMBB();
MachineFunction *MF = &B.getMF();
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
const unsigned WaveAndOpc = Subtarget.isWave32() ?
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
const unsigned MovTermOpc = Subtarget.isWave32() ?
AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
const unsigned XorTermOpc = Subtarget.isWave32() ?
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
const unsigned ExecReg = Subtarget.isWave32() ?
AMDGPU::EXEC_LO : AMDGPU::EXEC;
#ifndef NDEBUG
const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif
Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
// Don't bother using generic instructions/registers for the exec mask.
B.buildInstr(TargetOpcode::IMPLICIT_DEF)
.addDef(InitSaveExecReg);
Register PhiExec = MRI.createVirtualRegister(WaveRC);
Register NewExec = MRI.createVirtualRegister(WaveRC);
// To insert the loop we need to split the block. Move everything before this
// point to a new block, and insert a new empty block before this instruction.
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
MachineFunction::iterator MBBI(MBB);
++MBBI;
MF->insert(MBBI, LoopBB);
MF->insert(MBBI, RestoreExecBB);
MF->insert(MBBI, RemainderBB);
LoopBB->addSuccessor(RestoreExecBB);
LoopBB->addSuccessor(LoopBB);
// Move the rest of the block into a new block.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
MBB.addSuccessor(LoopBB);
RestoreExecBB->addSuccessor(RemainderBB);
B.setInsertPt(*LoopBB, LoopBB->end());
B.buildInstr(TargetOpcode::PHI)
.addDef(PhiExec)
.addReg(InitSaveExecReg)
.addMBB(&MBB)
.addReg(NewExec)
.addMBB(LoopBB);
for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
B.buildInstr(TargetOpcode::G_PHI)
.addDef(std::get<2>(Result))
.addReg(std::get<0>(Result)) // Initial value / implicit_def
.addMBB(&MBB)
.addReg(std::get<1>(Result)) // Mid-loop value.
.addMBB(LoopBB);
}
const DebugLoc &DL = B.getDL();
MachineInstr &FirstInst = *Range.begin();
// Move the instruction into the loop. Note we moved everything after
// Range.end() already into a new block, so Range.end() is no longer valid.
LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
// Figure out the iterator range after splicing the instructions.
MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
auto NewEnd = LoopBB->end();
MachineBasicBlock::iterator I = Range.begin();
B.setInsertPt(*LoopBB, I);
Register CondReg;
assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
for (MachineOperand &Op : MI.uses()) {
if (!Op.isReg() || Op.isDef())
continue;
Register OldReg = Op.getReg();
if (!SGPROperandRegs.count(OldReg))
continue;
// See if we already processed this register in another instruction in the
// sequence.
auto OldVal = WaterfalledRegMap.find(OldReg);
if (OldVal != WaterfalledRegMap.end()) {
Op.setReg(OldVal->second);
continue;
}
Register OpReg = Op.getReg();
LLT OpTy = MRI.getType(OpReg);
const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
if (OpBank != &AMDGPU::VGPRRegBank) {
// Insert copy from AGPR to VGPR before the loop.
B.setMBB(MBB);
OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
B.setInstr(*I);
}
unsigned OpSize = OpTy.getSizeInBits();
// Can only do a readlane of 32-bit pieces.
if (OpSize == 32) {
// Avoid extra copies in the simple case of one 32-bit register.
Register CurrentLaneOpReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MRI.setType(CurrentLaneOpReg, OpTy);
constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpReg)
.addReg(OpReg);
Register NewCondReg = MRI.createVirtualRegister(WaveRC);
bool First = CondReg == AMDGPU::NoRegister;
if (First)
CondReg = NewCondReg;
// Compare the just read M0 value to all possible Idx values.
B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
.addReg(OpReg);
Op.setReg(CurrentLaneOpReg);
if (!First) {
Register AndReg = MRI.createVirtualRegister(WaveRC);
// If there are multiple operands to consider, and the conditions.
B.buildInstr(WaveAndOpc)
.addDef(AndReg)
.addReg(NewCondReg)
.addReg(CondReg);
CondReg = AndReg;
}
} else {
LLT S32 = LLT::scalar(32);
SmallVector<Register, 8> ReadlanePieces;
// The compares can be done as 64-bit, but the extract needs to be done
// in 32-bit pieces.
bool Is64 = OpSize % 64 == 0;
unsigned UnmergeTySize = Is64 ? 64 : 32;
unsigned CmpOp =
Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;
// Insert the unmerge before the loop.
B.setMBB(MBB);
unsigned NumPieces = OpSize / UnmergeTySize;
SmallVector<Register, 8> UnmergePieces;
if (NumPieces == 1) {
UnmergePieces.push_back(OpReg);
} else {
LLT UnmergeTy = LLT::scalar(UnmergeTySize);
MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
}
B.setInstr(*I);
for (Register UnmergePiece : UnmergePieces) {
Register CurrentLaneOpReg;
if (Is64) {
Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpRegLo)
.addReg(UnmergePiece, 0, AMDGPU::sub0);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpRegHi)
.addReg(UnmergePiece, 0, AMDGPU::sub1);
CurrentLaneOpReg =
B.buildMerge(LLT::scalar(64),
{CurrentLaneOpRegLo, CurrentLaneOpRegHi})
.getReg(0);
MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
if (OpTy.getScalarSizeInBits() == 64) {
// If we need to produce a 64-bit element vector, so use the
// merged pieces
ReadlanePieces.push_back(CurrentLaneOpReg);
} else {
// 32-bit element type.
ReadlanePieces.push_back(CurrentLaneOpRegLo);
ReadlanePieces.push_back(CurrentLaneOpRegHi);
}
} else {
CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpReg)
.addReg(UnmergePiece);
ReadlanePieces.push_back(CurrentLaneOpReg);
}
Register NewCondReg = MRI.createVirtualRegister(WaveRC);
bool First = CondReg == AMDGPU::NoRegister;
if (First)
CondReg = NewCondReg;
B.buildInstr(CmpOp)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
.addReg(UnmergePiece);
if (!First) {
Register AndReg = MRI.createVirtualRegister(WaveRC);
// If there are multiple operands to consider, and the conditions.
B.buildInstr(WaveAndOpc)
.addDef(AndReg)
.addReg(NewCondReg)
.addReg(CondReg);
CondReg = AndReg;
}
}
// FIXME: Build merge seems to switch to CONCAT_VECTORS but not
// BUILD_VECTOR
if (OpTy.isVector()) {
auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
Op.setReg(Merge.getReg(0));
MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
} else if (ReadlanePieces.size() > 1) {
auto Merge = B.buildMerge(OpTy, ReadlanePieces);
Op.setReg(Merge.getReg(0));
MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
} else {
Op.setReg(ReadlanePieces[0]);
}
}
// Make sure we don't re-process this register again.
WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
}
}
// Update EXEC, save the original EXEC value to VCC.
B.buildInstr(AndSaveExecOpc)
.addDef(NewExec)
.addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
B.setInsertPt(*LoopBB, LoopBB->end());
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
B.buildInstr(XorTermOpc)
.addDef(ExecReg)
.addReg(ExecReg)
.addReg(NewExec);
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
// Save the EXEC mask before the loop.
BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
.addReg(ExecReg);
// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
B.buildInstr(MovTermOpc)
.addDef(ExecReg)
.addReg(SaveExecReg);
// Set the insert point after the original instruction, so any new
// instructions will be in the remainder.
B.setInsertPt(*RemainderBB, RemainderBB->begin());
return true;
}
// Collect every register used by \p MI at the operand positions listed in
// \p OpIndices that is not assigned to the SGPR bank and therefore has to be
// handled by a waterfall loop. The registers are accumulated in
// \p SGPROperandRegs, so a register shared by several operands is recorded
// only once. Returns true if \p SGPROperandRegs is non-empty afterwards,
// i.e. a waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
    SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
    MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned OpIdx : OpIndices) {
    const auto &MO = MI.getOperand(OpIdx);
    assert(MO.isUse());

    Register Reg = MO.getReg();
    // Operands that already sit in the SGPR bank need no replacement.
    if (getRegBank(Reg, MRI, *TRI)->getID() == AMDGPU::SGPRRegBankID)
      continue;

    SGPROperandRegs.insert(Reg);
  }

  // With nothing collected there is no operand to replace and no loop needed.
  const bool NeedsWaterfall = !SGPROperandRegs.empty();
  return NeedsWaterfall;
}
// Wrap the single instruction \p MI in a waterfall loop for the operands at
// \p OpIndices that are not in the SGPR bank. Returns false without touching
// anything when no listed operand needs handling; otherwise forwards to the
// range-based overload and returns its result.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
    ArrayRef<unsigned> OpIndices) const {
  // A set is used so that an identical register appearing in several operands
  // does not get more than one readfirstlane.
  SmallSet<Register, 4> SGPROperandRegs;

  const bool NeedsLoop =
      collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices);
  if (!NeedsLoop)
    return false;

  // The range to execute in the loop is exactly [MI, next(MI)).
  MachineBasicBlock::iterator Begin = MI.getIterator();
  MachineBasicBlock::iterator End = std::next(Begin);
  return executeInWaterfallLoop(B, make_range(Begin, End), SGPROperandRegs,
                                MRI);
}
// Convenience overload: creates a MachineIRBuilder positioned at \p MI and
// defers to the builder-taking overload.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder Builder(MI);
  const bool Changed = executeInWaterfallLoop(Builder, MI, MRI, OpIndices);
  return Changed;
}
// Legalize operand \p OpIdx of \p MI, which must be an SGPR, by reading it
// through V_READFIRSTLANE_B32. Nothing is done when the operand is already in
// the SGPR bank. A value in any bank other than VGPR is first copied into a
// VGPR before the readfirstlane is emitted.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register SrcReg = MI.getOperand(OpIdx).getReg();
  const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
  if (SrcBank == &AMDGPU::SGPRRegBank)
    return;

  const LLT SrcTy = MRI.getType(SrcReg);
  MachineIRBuilder Builder(MI);

  const bool IsVGPR = SrcBank == &AMDGPU::VGPRRegBank;
  if (!IsVGPR) {
    // We need to copy from AGPR to VGPR
    auto VGPRCopy = Builder.buildCopy(SrcTy, SrcReg);
    SrcReg = VGPRCopy.getReg(0);
    MRI.setRegBank(SrcReg, AMDGPU::VGPRRegBank);
  }

  Register UniformReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Builder.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
      .addDef(UniformReg)
      .addReg(SrcReg);

  // The new SGPR carries the same low-level type as the value it replaces.
  MRI.setType(UniformReg, SrcTy);

  // The readfirstlane source has to be a 32-bit VGPR.
  const TargetRegisterClass *Constrained =
      constrainGenericRegister(SrcReg, AMDGPU::VGPR_32RegClass, MRI);
  (void)Constrained;
  assert(Constrained && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(UniformReg);
}
/// Break \p Ty into two types: the first is \p FirstSize bits wide and the
/// second covers whatever bits are left. For a vector \p Ty both results keep
/// the element type, and \p FirstSize must be a multiple of the element size.
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
  const unsigned TotalSize = Ty.getSizeInBits();
  const unsigned RestSize = TotalSize - FirstSize;

  if (Ty.isVector()) {
    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    assert(FirstSize % EltSize == 0);

    const unsigned FirstElts = FirstSize / EltSize;
    const unsigned RestElts = RestSize / EltSize;
    // scalarOrVector is used so a one-element piece comes back as a scalar.
    return {LLT::scalarOrVector(ElementCount::getFixed(FirstElts), EltTy),
            LLT::scalarOrVector(ElementCount::getFixed(RestElts), EltTy)};
  }

  return {LLT::scalar(FirstSize), LLT::scalar(RestSize)};
}
/// Return the 128-bit type corresponding to \p Ty: a 128-bit scalar for a
/// scalar input, or a 128-bit fixed vector with the same element type for a
/// vector input (the element size must divide 128).
static LLT widen96To128(LLT Ty) {
  if (Ty.isVector()) {
    const LLT EltTy = Ty.getElementType();
    assert(128 % EltTy.getSizeInBits() == 0);
    const unsigned WideNumElts = 128 / EltTy.getSizeInBits();
    return LLT::fixed_vector(WideNumElts, EltTy);
  }

  return LLT::scalar(128);
}
/// Rewrite the load \p MI when the mapping selected in \p OpdMapper cannot be
/// used as-is.
///
/// For an SGPR destination, only 32-bit results and 96-bit results are
/// considered: a 32-bit result with a narrower memory access is widened to a
/// 32-bit load, and a 96-bit result is either split into a 64-bit and a 32-bit
/// load or widened to a 128-bit load, depending on the alignment. For every
/// other destination bank, results wider than 128 bits are broken into 128-bit
/// pieces with LegalizerHelper and the result is assigned to the VGPR bank.
///
/// \returns true if \p MI was rewritten by this function; false if nothing
/// needed to be done here or the LegalizerHelper split did not report
/// Legalized.
bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI) const {
Register DstReg = MI.getOperand(0).getReg();
const LLT LoadTy = MRI.getType(DstReg);
unsigned LoadSize = LoadTy.getSizeInBits();
// Widest result, in bits, that is not split on the non-SGPR path below.
const unsigned MaxNonSmrdLoadSize = 128;
// Bank chosen for the result by the selected instruction mapping.
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::SGPRRegBank) {
// There are some special cases that we need to look at for 32 bit and 96
// bit SGPR loads otherwise we have nothing to do.
if (LoadSize != 32 && LoadSize != 96)
return false;
MachineMemOperand *MMO = *MI.memoperands_begin();
// Size of the memory access in bits (getSize() is scaled by 8 here).
const unsigned MemSize = 8 * MMO->getSize();
// Scalar loads of size 8 or 16 bit with proper alignment may be widened to
// 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
// scalar loads should have a load size of 32 but memory access size of less
// than 32.
if (LoadSize == 32 &&
(MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
return false;
Register PtrReg = MI.getOperand(1).getReg();
// The builder is created with an observer constructed for the SGPR bank.
ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
MachineIRBuilder B(MI, O);
if (LoadSize == 32) {
// This is an extending load from a sub-dword size. Widen the memory
// access size to 4 bytes and clear the extra high bits appropriately
const LLT S32 = LLT::scalar(32);
if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
// Must extend the sign bit into higher bits for a G_SEXTLOAD
auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
} else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
// Must extend zero into higher bits with an AND for a G_ZEXTLOAD
auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
} else
// We do not need to touch the higher bits for regular loads.
B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
} else {
// 96-bit loads are only available for vector loads. We need to split this
// into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
if (MMO->getAlign() < Align(16)) {
// Under-aligned: load 64 bits at offset 0 and 32 bits at byte offset 8,
// then insert both pieces into an undef value of the original type.
LLT Part64, Part32;
std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
auto Undef = B.buildUndef(LoadTy);
auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
} else {
// 16-byte aligned: load 128 bits and extract the low 96 bits.
LLT WiderTy = widen96To128(LoadTy);
auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
B.buildExtract(MI.getOperand(0), WideLoad, 0);
}
}
// The original load has been replaced by the sequence built above.
MI.eraseFromParent();
return true;
}
// 128-bit loads are supported for all instruction types.
if (LoadSize <= MaxNonSmrdLoadSize)
return false;
SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
// If the mapper created no new register for the pointer, use the original
// pointer operand.
if (SrcRegs.empty())
SrcRegs.push_back(MI.getOperand(1).getReg());
assert(LoadSize % MaxNonSmrdLoadSize == 0);
// RegBankSelect only emits scalar types, so we need to reset the pointer
// operand to a pointer type.
Register BasePtrReg = SrcRegs[0];
LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
MRI.setType(BasePtrReg, PtrTy);
// Split the result into NumSplitParts pieces of MaxNonSmrdLoadSize bits.
unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
MachineIRBuilder B(MI, Observer);
LegalizerHelper Helper(B.getMF(), Observer, B);
// Vectors are split by element count, scalars by narrowing; bail out if the
// helper does not report success.
if (LoadTy.isVector()) {
if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
return false;
} else {
if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
return false;
}
MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
return true;
}
// Expand the dynamic stack allocation \p MI (operands: result pointer,
// allocation size, alignment immediate) into SGPR-bank operations: the size
// is shifted left by the wavefront-size log2, added to a copy of the stack
// pointer offset register, and the low bits are masked off when the requested
// alignment exceeds the stack alignment.
//
// Returns false, leaving \p MI untouched, if the stack grows down or the size
// operand is not in the SGPR bank; returns true after replacing \p MI.
bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
    MachineInstr &MI,
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI) const {
  const MachineFunction &MF = *MI.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const auto &TFI = *ST.getFrameLowering();

  // Guard in case the stack growth direction ever changes with scratch
  // instructions.
  const bool GrowsDown =
      TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
  if (GrowsDown)
    return false;

  Register Dst = MI.getOperand(0).getReg();
  Register AllocSize = MI.getOperand(1).getReg();
  Align Alignment = assumeAligned(MI.getOperand(2).getImm());

  // TODO: Need to emit a wave reduction to get the maximum size.
  if (getRegBank(AllocSize, MRI, *TRI) != &AMDGPU::SGPRRegBank)
    return false;

  const LLT PtrTy = MRI.getType(Dst);
  const LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
  Register SPReg = MF.getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg();

  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  const unsigned WaveSizeLog2 = ST.getWavefrontSizeLog2();
  auto WaveSize = B.buildConstant(LLT::scalar(32), WaveSizeLog2);
  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
  auto SPCopy = B.buildCopy(PtrTy, SPReg);

  if (Alignment <= TFI.getStackAlign()) {
    // The default stack alignment already satisfies the request.
    B.buildPtrAdd(Dst, SPCopy, ScaledSize);
  } else {
    // Over-aligned request: clear the low pointer bits after the add.
    auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
    B.buildMaskLowPtrBits(Dst, PtrAdd, Log2(Alignment) + WaveSizeLog2);
  }

  MI.eraseFromParent();
  return true;
}
// Apply the default mapping to the image instruction \p MI and then run it in
// a waterfall loop for the resource operand and the operand right after it
// (the sampler, when the intrinsic has one). \p RsrcIdx is the index of the
// resource relative to the IR intrinsic call arguments. Always returns true.
bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call arguments,
  // so we need to shift by the number of defs and the intrinsic ID.
  const int RsrcOpIdx = RsrcIdx + NumDefs + 1;
  // If this intrinsic has a sampler, it immediately follows rsrc.
  const int SamplerOpIdx = RsrcOpIdx + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Fixup any SGPR arguments.
  SmallVector<unsigned, 4> SGPRIndexes;
  const int NumOps = MI.getNumOperands();
  for (int I = NumDefs; I != NumOps; ++I) {
    const bool IsRsrcOrSampler = I == RsrcOpIdx || I == SamplerOpIdx;
    // Only register operands at those two positions are candidates.
    if (IsRsrcOrSampler && MI.getOperand(I).isReg())
      SGPRIndexes.push_back(I);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}
// Return the register defined by operand 0 of the instruction that
// getDefIgnoringCopies finds for \p Reg, or \p Reg itself when no such
// instruction is found.
static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
                                        Register Reg) {
  // TODO: Guard against this being an implicit def
  if (MachineInstr *DefMI = getDefIgnoringCopies(Reg, MRI))
    return DefMI->getOperand(0).getReg();

  return Reg;
}
// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset)
//
// \p CombinedOffset is the register holding the full offset. On return
// \p VOffsetReg and \p SOffsetReg hold the registers to use for voffset and
// soffset. \p InstOffsetVal receives the immediate offset in the cases that
// extract one and is left unmodified otherwise. Registers created here are
// assigned to the VGPR bank (voffset) or the SGPR bank (soffset).
// \p Alignment is forwarded to AMDGPU::splitMUBUFOffset.
//
// Returns SOffset + ImmOffset when the whole offset is a known constant that
// splitMUBUFOffset accepts, and 0 in every other case.
static unsigned setBufferOffsets(MachineIRBuilder &B,
const AMDGPURegisterBankInfo &RBI,
Register CombinedOffset, Register &VOffsetReg,
Register &SOffsetReg, int64_t &InstOffsetVal,
Align Alignment) {
const LLT S32 = LLT::scalar(32);
MachineRegisterInfo *MRI = B.getMRI();
// Case 1: the combined offset is entirely a known constant.
if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
uint32_t SOffset, ImmOffset;
if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
Alignment)) {
VOffsetReg = B.buildConstant(S32, 0).getReg(0);
SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
InstOffsetVal = ImmOffset;
B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
return SOffset + ImmOffset;
}
}
// Case 2: base register plus a positive constant that can be split.
Register Base;
unsigned Offset;
std::tie(Base, Offset) =
AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
uint32_t SOffset, ImmOffset;
if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
&RBI.Subtarget, Alignment)) {
// A VGPR base becomes voffset; the split constant goes to soffset and the
// immediate.
if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
VOffsetReg = Base;
SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
InstOffsetVal = ImmOffset;
return 0; // XXX - Why is this 0?
}
// If we have SGPR base, we can use it for soffset.
if (SOffset == 0) {
VOffsetReg = B.buildConstant(S32, 0).getReg(0);
B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
SOffsetReg = Base;
InstOffsetVal = ImmOffset;
return 0; // XXX - Why is this 0?
}
}
// Handle the variable sgpr + vgpr case.
MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
if (Add && (int)Offset >= 0) {
Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
// Whichever addend is in a VGPR becomes voffset and the SGPR one becomes
// soffset; InstOffsetVal is not written on these paths.
if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
VOffsetReg = Src0;
SOffsetReg = Src1;
return 0;
}
if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
VOffsetReg = Src1;
SOffsetReg = Src0;
return 0;
}
}
// Fallback: put the whole combined offset in voffset and use a zero soffset.
// Ensure we have a VGPR for the combined offset. This could be an issue if we
// have an SGPR offset and a VGPR resource.
if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
VOffsetReg = CombinedOffset;
} else {
VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
}
SOffsetReg = B.buildConstant(S32, 0).getReg(0);
B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
return 0;
}
bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
const RegisterBank *RSrcBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
const RegisterBank *OffsetBank =
OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
if (RSrcBank == &AMDGPU::SGPRRegBank &&
OffsetBank == &AMDGPU::SGPRRegBank)
return true; // Legal mapping
// FIXME: 96-bit case was widened during legalize. We need to narrow it back
// here but don't have an MMO.
unsigned LoadSize = Ty.getSizeInBits();
int NumLoads = 1;
if (LoadSize == 256 || LoadSize == 512) {
NumLoads = LoadSize / 128;
Ty = Ty.divide(NumLoads);
}
// Use the alignment to ensure that the required offsets will fit into the
// immediate offsets.
const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
MachineIRBuilder B(MI);
MachineFunction &MF = B.getMF();
Register SOffset;
Register VOffset;
int64_t ImmOffset = 0;
unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
VOffset, SOffset, ImmOffset, Alignment);
// TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
// can, but we need to track an MMO for that.
const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
const Align MemAlign(4); // FIXME: ABI type alignment?
MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
MemSize, MemAlign);
if (MMOOffset != 0)
BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
// If only the offset is divergent, emit a MUBUF buffer load instead. We can
// assume that the buffer is unswizzled.
Register RSrc = MI.getOperand(1).getReg();
Register VIndex = B.buildConstant(S32, 0).getReg(0);
B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
SmallVector<Register, 4> LoadParts(NumLoads);
MachineBasicBlock::iterator MII = MI.getIterator();
MachineInstrSpan Span(MII, &B.getMBB());
for (int i = 0; i < NumLoads; ++i) {
if (NumLoads == 1) {
LoadParts[i] = Dst;
} else {
LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
}
MachineMemOperand *MMO = BaseMMO;
if (i != 0)
BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
.addDef(LoadParts[i]) // vdata
.addUse(RSrc) // rsrc
.addUse(VIndex) // vindex
.addUse(VOffset) // voffset
.addUse(SOffset) // soffset
.addImm(ImmOffset + 16 * i) // offset(imm)
.addImm(0) // cachepolicy, swizzled buffer(imm)
.addImm(0) // idxen(imm)
.addMemOperand(MMO);
}
// TODO: If only the resource is a VGPR, it may be better to execute the
// scalar load in the waterfall loop if the resource is expected to frequently
// be dynamically uniform.
if (RSrcBank != &AMDGPU::SGPRRegBank) {
// Remove the original instruction to avoid potentially confusing the
// waterfall loop logic.
B.setInstr(*Span.begin());
MI.eraseFromParent();
SmallSet<Register, 4> OpsToWaterfall;
OpsToWaterfall.insert(RSrc);
executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
OpsToWaterfall, MRI);
}
if (NumLoads != 1) {
if (Ty.isVector())
B.buildConcatVectors(Dst, LoadParts);
else
B.buildMerge(Dst, LoadParts);
}
// We removed the instruction earlier with a waterfall loop.
if (RSrcBank == &AMDGPU::SGPRRegBank)
MI.eraseFromParent();
return true;
}
/// Apply the mapping for a bitfield extract. \p Signed selects the
/// sign-extending form. The source, offset and width operands start at index
/// 2 for G_INTRINSIC and at index 1 otherwise.
///
/// VGPR result:
///  - 32-bit: nothing further to do after the default mapping.
///  - 64-bit with constant width: shift the source right by the offset, then
///    use a 32-bit bitfield extract on the low or the high half.
///  - 64-bit with variable width: expand to shifts,
///    Src >> Offset << (64 - Width) >> (64 - Width).
/// Otherwise (scalar form): pack offset and width into one operand and emit
/// S_BFE_{I,U}{32,64} directly.
///
/// Always returns true; the original instruction is erased on every path
/// except the 32-bit VGPR one.
bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
                                             bool Signed) const {
  MachineInstr &MI = OpdMapper.getMI();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();

  // Insert basic copies
  applyDefaultMapping(OpdMapper);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);

  const LLT S32 = LLT::scalar(32);

  unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();

  const RegisterBank *DstBank =
    OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::VGPRRegBank) {
    if (Ty == S32)
      return true;

    // There is no 64-bit vgpr bitfield extract instructions so the operation
    // is expanded to a sequence of instructions that implement the operation.
    ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyBank);

    const LLT S64 = LLT::scalar(64);
    // Shift the source operand so that extracted bits start at bit 0.
    auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
                              : B.buildLShr(S64, SrcReg, OffsetReg);
    auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);

    // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
    // if the width is a constant.
    if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Use the 32-bit bitfield extract instruction if the width is a constant.
      // Depending on the width size, use either the low or high 32-bits.
      auto Zero = B.buildConstant(S32, 0);
      auto WidthImm = ConstWidth->Value.getZExtValue();
      if (WidthImm <= 32) {
        // Use bitfield extract on the lower 32-bit source, and then sign-extend
        // or clear the upper 32-bits.
        auto Extract =
            Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
                   : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
        auto Extend =
            Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
        B.buildMerge(DstReg, {Extract, Extend});
      } else {
        // Use bitfield extract on upper 32-bit source, and combine with lower
        // 32-bit source.
        auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
        auto Extract =
            Signed
                ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
                : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
        B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
      }
      MI.eraseFromParent();
      return true;
    }

    // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
    // operations.
    auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
    auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
    // The final shift must define DstReg: MI, its only other definition, is
    // erased right below, so writing to a fresh register would leave every
    // user of the result without a definition.
    if (Signed)
      B.buildAShr(DstReg, SignBit, ExtShift);
    else
      B.buildLShr(DstReg, SignBit, ExtShift);
    MI.eraseFromParent();
    return true;
  }

  // The scalar form packs the offset and width in a single operand.

  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
  MachineIRBuilder B(MI, ApplyBank);

  // Ensure the high bits are clear to insert the offset.
  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);

  // Zeros out the low bits, so don't bother clamping the input value.
  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));

  // Transformation function, pack the offset and width of a BFE into
  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
  // source, bits [5:0] contain the offset and bits [22:16] the width.
  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);

  // TODO: It might be worth using a pseudo here to avoid scc clobber and
  // register class constraints.
  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
                             (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);

  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
    llvm_unreachable("failed to constrain BFE");

  MI.eraseFromParent();
  return true;
}
// Pick the extension opcode to apply to the operands of \p Opc when the
// operation is widened: G_SEXT for G_ASHR/G_SMIN/G_SMAX, G_ZEXT for
// G_LSHR/G_UMIN/G_UMAX, and G_ANYEXT for every other opcode.
static unsigned getExtendOp(unsigned Opc) {
  const bool WantsSignExt = Opc == TargetOpcode::G_ASHR ||
                            Opc == TargetOpcode::G_SMIN ||
                            Opc == TargetOpcode::G_SMAX;
  if (WantsSignExt)
    return TargetOpcode::G_SEXT;

  const bool WantsZeroExt = Opc == TargetOpcode::G_LSHR ||
                            Opc == TargetOpcode::G_UMIN ||
                            Opc == TargetOpcode::G_UMAX;
  if (WantsZeroExt)
    return TargetOpcode::G_ZEXT;

  return TargetOpcode::G_ANYEXT;
}
// Split the <2 x s16> value \p Src into two s32 values (low element first),
// extended according to \p ExtOpcode, using only a bitcast plus 32-bit shifts
// and masks so that no illegal vector extend or unmerge is produced.
static std::pair<Register, Register>
unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
  const LLT S32 = LLT::scalar(32);
  auto Packed = B.buildBitcast(S32, Src);

  if (ExtOpcode == TargetOpcode::G_SEXT) {
    // Low half: sign-extend in place. High half: arithmetic shift down.
    auto Lo = B.buildSExtInReg(S32, Packed, 16);
    auto Sixteen = B.buildConstant(S32, 16);
    auto Hi = B.buildAShr(S32, Packed, Sixteen);
    return std::make_pair(Lo.getReg(0), Hi.getReg(0));
  }

  // Both remaining cases take the high half with a logical shift.
  auto Sixteen = B.buildConstant(S32, 16);
  auto Hi = B.buildLShr(S32, Packed, Sixteen);

  if (ExtOpcode != TargetOpcode::G_ZEXT) {
    // Any-extend: the low half may keep whatever is in the upper bits.
    assert(ExtOpcode == TargetOpcode::G_ANYEXT);
    return std::make_pair(Packed.getReg(0), Hi.getReg(0));
  }

  // Zero-extend: mask off the upper 16 bits for the low half.
  auto LowMask = B.buildConstant(S32, 0xffff);
  auto Lo = B.buildAnd(S32, Packed, LowMask);
  return std::make_pair(Lo.getReg(0), Hi.getReg(0));
}
// For operands where matching the register bank required just one inserted
// copy: if \p OpdMapper holds a new register for operand \p OpIdx, rewrite the
// instruction operand to use it and return true. Returns false when the
// mapper has no new register for that operand.
static bool substituteSimpleCopyRegs(
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> NewRegs(OpdMapper.getVRegs(OpIdx));
  if (NewRegs.empty())
    return false;

  assert(NewRegs.size() == 1);
  OpdMapper.getMI().getOperand(OpIdx).setReg(NewRegs[0]);
  return true;
}
/// Adjust the register layout of f16 image data on subtargets that report
/// hasUnpackedD16VMem(). A vector of s16 in \p Reg is unmerged into its s16
/// elements, which are then merged into a vector with the same element count
/// and s32 elements. \p Reg is returned unchanged on other subtargets or when
/// it is not a vector of s16.
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT StoreVT = MRI.getType(Reg);
  const bool IsS16Vector = StoreVT.isVector() && StoreVT.getElementType() == S16;
  if (!IsS16Vector)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  // All operands of the unmerge except the last one are read back as results.
  const int NumResults = Unmerge->getNumOperands() - 1;
  SmallVector<Register, 4> WideRegs;
  for (int Idx = 0; Idx != NumResults; ++Idx)
    WideRegs.push_back(Unmerge.getReg(Idx));

  const LLT S32 = LLT::scalar(32);
  const int NumElts = StoreVT.getNumElements();
  const LLT WideVT = LLT::fixed_vector(NumElts, S32);
  return B.buildMerge(WideVT, WideRegs).getReg(0);
}
// Decompose \p Reg into a base register and a constant offset:
//  - a plain integer constant yields an invalid (default) base register and
//    the constant;
//  - a G_ADD of a register and an integer constant yields that register and
//    the constant;
//  - anything else yields \p Reg itself with offset 0.
static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t ConstVal;
  const bool IsPureConstant = mi_match(Reg, MRI, m_ICst(ConstVal));
  if (IsPureConstant)
    return std::make_pair(Register(), ConstVal);

  Register AddBase;
  const bool IsBasePlusConstant =
      mi_match(Reg, MRI, m_GAdd(m_Reg(AddBase), m_ICst(ConstVal)));
  if (IsBasePlusConstant)
    return std::make_pair(AddBase, ConstVal);

  // TODO: Handle G_OR used for add case
  return std::make_pair(Reg, 0);
}
// Split the buffer offset in \p OrigOffset into a register part and an
// immediate part that is at most 4095. Returns {register, immediate}; the
// register is always valid (a zero constant is materialized when there is no
// other base).
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  const LLT S32 = LLT::scalar(32);

  Register BaseReg;
  unsigned ConstOffset;
  std::tie(BaseReg, ConstOffset) =
      getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  //
  // A zero constant falls through all of this with both parts equal to zero.
  unsigned Overflow = ConstOffset & ~MaxImm;
  unsigned ImmPart = ConstOffset - Overflow;
  if (static_cast<int32_t>(Overflow) < 0) {
    Overflow += ImmPart;
    ImmPart = 0;
  }

  if (Overflow != 0) {
    // The part that does not fit the immediate is folded into the register.
    auto OverflowVal = B.buildConstant(S32, Overflow);
    if (!BaseReg)
      BaseReg = OverflowVal.getReg(0);
    else
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return {BaseReg, ImmPart};
}
// Copy \p SrcReg into \p DstReg with V_MOV_B32: one move for a 32-bit source,
// otherwise one move per 32-bit half (sub0, sub1) combined with a
// REG_SEQUENCE. Both registers are then constrained to the matching SGPR/VGPR
// classes; returns whether constraining succeeded.
bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
                                        Register SrcReg) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT SrcTy = MRI.getType(SrcReg);
  const bool Is32Bit = SrcTy.getSizeInBits() == 32;

  if (Is32Bit) {
    // Use a v_mov_b32 here to make the exec dependency explicit.
    B.buildInstr(AMDGPU::V_MOV_B32_e32).addDef(DstReg).addUse(SrcReg);

    if (!constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI))
      return false;
    return constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
  }

  Register LoCopy = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register HiCopy = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // Move each 32-bit half separately ...
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(LoCopy)
      .addUse(SrcReg, 0, AMDGPU::sub0);
  B.buildInstr(AMDGPU::V_MOV_B32_e32)
      .addDef(HiCopy)
      .addUse(SrcReg, 0, AMDGPU::sub1);

  // ... and stitch the halves back together in the destination.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
      .addDef(DstReg)
      .addUse(LoCopy)
      .addImm(AMDGPU::sub0)
      .addUse(HiCopy)
      .addImm(AMDGPU::sub1);

  if (!constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI))
    return false;
  return constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}
/// Utility function for pushing dynamic vector indexes with a constant offset
/// into waterfall loops: directly before \p IdxUseInstr, re-add
/// \p ConstOffset to the index register found in operand \p OpIdx and make
/// the operand use the sum. Both the constant and the sum are assigned to the
/// SGPR bank.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
                                   MachineInstr &IdxUseInstr,
                                   unsigned OpIdx,
                                   unsigned ConstOffset) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S32 = LLT::scalar(32);

  Register BaseIdx = IdxUseInstr.getOperand(OpIdx).getReg();
  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());

  auto OffsetCst = B.buildConstant(S32, ConstOffset);
  auto IdxPlusOffset = B.buildAdd(S32, BaseIdx, OffsetCst);

  MRI.setRegBank(OffsetCst.getReg(0), AMDGPU::SGPRRegBank);
  Register NewIdx = IdxPlusOffset.getReg(0);
  MRI.setRegBank(NewIdx, AMDGPU::SGPRRegBank);

  IdxUseInstr.getOperand(OpIdx).setReg(NewIdx);
}
/// Produce the high half of a 32-bit to 64-bit extension. \p Lo32Reg is the
/// original 32-bit source value (it becomes the low part of the combined
/// 64-bit result) and \p Hi32Reg is the register to define as the high part.
/// \p ExtOpc selects zero, sign or any extension. \p RegBank is applied to the
/// shift-amount constant built for sign extension. \p IsBooleanSrc tells a
/// sign extension that the source was an s1, in which case the high half is
/// just a copy of the low half.
static void extendLow32IntoHigh32(MachineIRBuilder &B,
                                  Register Hi32Reg, Register Lo32Reg,
                                  unsigned ExtOpc,
                                  const RegisterBank &RegBank,
                                  bool IsBooleanSrc = false) {
  switch (ExtOpc) {
  case AMDGPU::G_ZEXT:
    B.buildConstant(Hi32Reg, 0);
    return;

  case AMDGPU::G_SEXT: {
    if (IsBooleanSrc) {
      // If we know the original source was an s1, the high half is the same as
      // the low.
      B.buildCopy(Hi32Reg, Lo32Reg);
      return;
    }

    // Replicate sign bit from 32-bit extended part.
    auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
    B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
    B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
    return;
  }

  default:
    assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
    B.buildUndef(Hi32Reg);
    return;
  }
}
/// Replace the dynamic vector element extract \p MI (operands: result, vector,
/// index) with a chain of index compares and selects over the vector's
/// elements, when SITargetLowering::shouldExpandVectorDynExt asks for that
/// expansion.
///
/// \returns false, leaving \p MI untouched, if the expansion is not wanted;
/// true after \p MI has been replaced and erased.
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
MachineInstr &MI, MachineRegisterInfo &MRI,
const OperandsMapper &OpdMapper) const {
Register VecReg = MI.getOperand(1).getReg();
Register Idx = MI.getOperand(2).getReg();
const RegisterBank &IdxBank =
*OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
// An index in any bank other than SGPR is treated as divergent.
bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
LLT VecTy = MRI.getType(VecReg);
unsigned EltSize = VecTy.getScalarSizeInBits();
unsigned NumElem = VecTy.getNumElements();
if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
IsDivergentIdx))
return false;
MachineIRBuilder B(MI);
LLT S32 = LLT::scalar(32);
const RegisterBank &DstBank =
*OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
const RegisterBank &SrcBank =
*OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
// The compare result goes in the SGPR bank (as an s32) only when the result,
// vector and index are all SGPR; otherwise it is an s1 in the VCC bank.
const RegisterBank &CCBank =
(DstBank == AMDGPU::SGPRRegBank &&
SrcBank == AMDGPU::SGPRRegBank &&
IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
: AMDGPU::VCCRegBank;
LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
// For a VCC compare with an SGPR index, compare against a VGPR copy of the
// index instead.
if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
}
LLT EltTy = VecTy.getScalarType();
SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
// If the mapper split the result into several registers, each vector element
// is handled as NumLanes pieces of the split register type.
unsigned NumLanes = DstRegs.size();
if (!NumLanes)
NumLanes = 1;
else
EltTy = MRI.getType(DstRegs[0]);
auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
// Start with element 0 as the running result.
SmallVector<Register, 2> Res(NumLanes);
for (unsigned L = 0; L < NumLanes; ++L)
Res[L] = UnmergeToEltTy.getReg(L);
// For every further element I: Res = (Idx == I) ? element I : Res.
for (unsigned I = 1; I < NumElem; ++I) {
auto IC = B.buildConstant(S32, I);
MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
for (unsigned L = 0; L < NumLanes; ++L) {
auto S = B.buildSelect(EltTy, Cmp,
UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
// Operands 0, 2 and 3 of the select (result and both values) get the
// destination bank; operand 1 is the condition set up above.
for (unsigned N : { 0, 2, 3 })
MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
Res[L] = S->getOperand(0).getReg();
}
}
// Copy the final selected pieces into the destination register(s).
for (unsigned L = 0; L < NumLanes; ++L) {
Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
B.buildCopy(DstReg, Res[L]);
MRI.setRegBank(DstReg, DstBank);
}
MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
MI.eraseFromParent();
return true;
}
// Return a register carrying the value of \p Reg that is assigned to \p Bank.
// If \p Reg has no bank yet, or is already in \p Bank, it is assigned to
// \p Bank and returned as is. If it already lives in a different bank, a copy
// of it is built with \p B, the copy is assigned to \p Bank, and the copy is
// returned instead.
static Register constrainRegToBank(MachineRegisterInfo &MRI,
                                   MachineIRBuilder &B, Register &Reg,
                                   const RegisterBank &Bank) {
  const RegisterBank *ExistingBank = MRI.getRegBankOrNull(Reg);
  const bool NeedsCrossBankCopy = ExistingBank && *ExistingBank != Bank;

  Register Result = Reg;
  if (NeedsCrossBankCopy)
    Result = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);

  MRI.setRegBank(Result, Bank);
  return Result;
}
/// Rewrite a dynamically indexed vector insert as one compare + select per
/// vector element, when SITargetLowering::shouldExpandVectorDynExt asks for it.
///
/// Operand 1 of \p MI is the source vector, operand 2 the value being inserted
/// and operand 3 the index. The register banks come from the instruction
/// mapping held by \p OpdMapper.
///
/// \returns false, leaving \p MI untouched, if the expansion is not wanted.
/// Otherwise the replacement is emitted through a MachineIRBuilder positioned
/// at \p MI, \p MI is erased, and true is returned.
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    const OperandsMapper &OpdMapper) const {
  // Bank of the first breakdown piece of operand \p OpIdx in the mapping.
  auto BankOfOperand = [&OpdMapper](unsigned OpIdx) -> const RegisterBank & {
    return *OpdMapper.getInstrMapping()
                .getOperandMapping(OpIdx)
                .BreakDown[0]
                .RegBank;
  };

  Register SrcVec = MI.getOperand(1).getReg();
  Register IdxReg = MI.getOperand(3).getReg();
  const RegisterBank &IdxBank = BankOfOperand(3);

  LLT VecTy = MRI.getType(SrcVec);
  unsigned NumElem = VecTy.getNumElements();
  const bool IdxIsDivergent = IdxBank != AMDGPU::SGPRRegBank;
  if (!SITargetLowering::shouldExpandVectorDynExt(VecTy.getScalarSizeInBits(),
                                                  NumElem, IdxIsDivergent))
    return false;

  MachineIRBuilder B(MI);
  LLT S32 = LLT::scalar(32);

  const RegisterBank &DstBank = BankOfOperand(0);
  const RegisterBank &SrcBank = BankOfOperand(1);
  const RegisterBank &InsBank = BankOfOperand(2);

  // The compare result only stays in the SGPR bank when every operand is an
  // SGPR; any other combination uses the VCC bank.
  const bool AllSGPR =
      DstBank == AMDGPU::SGPRRegBank && SrcBank == AMDGPU::SGPRRegBank &&
      InsBank == AMDGPU::SGPRRegBank && IdxBank == AMDGPU::SGPRRegBank;
  const RegisterBank &CCBank =
      AllSGPR ? AMDGPU::SGPRRegBank : AMDGPU::VCCRegBank;

  // SGPR-bank conditions are typed s32, VCC-bank conditions s1.
  LLT CCTy = LLT::scalar(1);
  if (CCBank == AMDGPU::SGPRRegBank)
    CCTy = S32;

  // A VCC-bank compare with an SGPR index: compare against a copy of the index
  // that is assigned to the VGPR bank.
  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
    IdxReg = B.buildCopy(S32, IdxReg).getReg(0);
    MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
  }

  // If the mapping produced new registers for the inserted value, work on
  // pieces of that type; otherwise use the original operand as a single lane
  // of the vector's scalar type.
  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
  LLT EltTy = VecTy.getScalarType();
  if (InsRegs.empty())
    InsRegs.push_back(MI.getOperand(2).getReg());
  else
    EltTy = MRI.getType(InsRegs[0]);
  const unsigned NumLanes = InsRegs.size();

  auto SrcPieces = B.buildUnmerge(EltTy, SrcVec);

  SmallVector<Register, 16> Ops;
  Ops.reserve(NumElem * NumLanes);
  for (unsigned Elt = 0; Elt != NumElem; ++Elt) {
    auto EltIdxCst = B.buildConstant(S32, Elt);
    MRI.setRegBank(EltIdxCst.getReg(0), AMDGPU::SGPRRegBank);

    auto IsThisElt = B.buildICmp(CmpInst::ICMP_EQ, CCTy, IdxReg, EltIdxCst);
    MRI.setRegBank(IsThisElt.getReg(0), CCBank);

    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
      // Both select inputs must be in the destination bank.
      Register NewVal = constrainRegToBank(MRI, B, InsRegs[Lane], DstBank);
      Register OldVal = SrcPieces.getReg(Elt * NumLanes + Lane);
      OldVal = constrainRegToBank(MRI, B, OldVal, DstBank);

      Register Picked =
          B.buildSelect(EltTy, IsThisElt, NewVal, OldVal).getReg(0);
      MRI.setRegBank(Picked, DstBank);
      Ops.push_back(Picked);
    }
  }

  // Reassemble the result. If the vector of pieces does not have the result
  // type, build it in the piece type and bitcast to the result register.
  Register DstReg = MI.getOperand(0).getReg();
  LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
  if (MergeTy == MRI.getType(DstReg)) {
    B.buildBuildVector(DstReg, Ops);
  } else {
    auto PieceVec = B.buildBuildVector(MergeTy, Ops);
    MRI.setRegBank(PieceVec.getReg(0), DstBank);
    B.buildBitcast(DstReg, PieceVec);
  }

  MRI.setRegBank(DstReg, DstBank);
  MI.eraseFromParent();
  return true;
}
void AMDGPURegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
unsigned Opc = MI.getOpcode();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
switch (Opc) {
case AMDGPU::G_PHI: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
if (DstTy != LLT::scalar(1))
break;
const LLT S32 = LLT::scalar(32);
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VCCRegBank) {
applyDefaultMapping(OpdMapper);
// The standard handling only considers the result register bank for
// phis. For VCC, blindly inserting a copy when the phi is lowered will
// produce an invalid copy. We can only copy with some kind of compare to
// get a vector boolean result. Insert a register bank copy that will be
// correctly lowered to a compare.
MachineIRBuilder B(*MI.getParent()->getParent());
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
Register SrcReg = MI.getOperand(I).getReg();
const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
if (SrcBank != &AMDGPU::VCCRegBank) {
MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
MI.getOperand(I).setReg(Copy.getReg(0));
}
}
return;
}
// Phi handling is strange and only considers the bank of the destination.
substituteSimpleCopyRegs(OpdMapper, 0);
// Promote SGPR/VGPR booleans to s32
MachineFunction *MF = MI.getParent()->getParent();
ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
MachineIRBuilder B(MI, ApplyBank);
LegalizerHelper Helper(*MF, ApplyBank, B);
if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
llvm_unreachable("widen scalar should have succeeded");
return;
}
case AMDGPU::G_ICMP:
case AMDGPU::G_UADDO:
case AMDGPU::G_USUBO:
case AMDGPU::G_UADDE:
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE: {
unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
Register DstReg = MI.getOperand(BoolDstOp).getReg();
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank != &AMDGPU::SGPRRegBank)
break;
const bool HasCarryIn = MI.getNumOperands() == 5;
// If this is a scalar compare, promote the result to s32, as the selection
// will end up using a copy to a 32-bit vreg.
const LLT S32 = LLT::scalar(32);
Register NewDstReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
MI.getOperand(BoolDstOp).setReg(NewDstReg);
MachineIRBuilder B(MI);
if (HasCarryIn) {
Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
MI.getOperand(4).setReg(NewSrcReg);
}
MachineBasicBlock *MBB = MI.getParent();
B.setInsertPt(*MBB, std::next(MI.getIterator()));
// If we had a constrained VCC result register, a copy was inserted to VCC
// from SGPR.
SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
if (DefRegs.empty())
DefRegs.push_back(DstReg);
B.buildTrunc(DefRegs[0], NewDstReg);
return;
}
case AMDGPU::G_SELECT: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
if (CondRegs.empty())
CondRegs.push_back(MI.getOperand(1).getReg());
else {
assert(CondRegs.size() == 1);
}
const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
if (CondBank == &AMDGPU::SGPRRegBank) {
MachineIRBuilder B(MI);
const LLT S32 = LLT::scalar(32);
Register NewCondReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
MI.getOperand(1).setReg(NewCondReg);
B.buildZExt(NewCondReg, CondRegs[0]);
}
if (DstTy.getSizeInBits() != 64)
break;
MachineIRBuilder B(MI);
LLT HalfTy = getHalfSizedType(DstTy);
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
// All inputs are SGPRs, nothing special to do.
if (DefRegs.empty()) {
assert(Src1Regs.empty() && Src2Regs.empty());
break;
}
if (Src1Regs.empty())
split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
else {
setRegsToType(MRI, Src1Regs, HalfTy);
}
if (Src2Regs.empty())
split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
else
setRegsToType(MRI, Src2Regs, HalfTy);
setRegsToType(MRI, DefRegs, HalfTy);
B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
MI.eraseFromParent();
return;
}
case AMDGPU::G_BRCOND: {
Register CondReg = MI.getOperand(0).getReg();
// FIXME: Should use legalizer helper, but should change bool ext type.
const RegisterBank *CondBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (CondBank == &AMDGPU::SGPRRegBank) {
MachineIRBuilder B(MI);
const LLT S32 = LLT::scalar(32);
Register NewCondReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
MI.getOperand(0).setReg(NewCondReg);
B.buildZExt(NewCondReg, CondReg);
return;
}
break;
}
case AMDGPU::G_AND:
case AMDGPU::G_OR:
case AMDGPU::G_XOR: {
// 64-bit and is only available on the SALU, so split into 2 32-bit ops if
// there is a VGPR input.
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
if (DstTy.getSizeInBits() == 1) {
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VCCRegBank)
break;
MachineFunction *MF = MI.getParent()->getParent();
ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
MachineIRBuilder B(MI, ApplyBank);
LegalizerHelper Helper(*MF, ApplyBank, B);
if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
LegalizerHelper::Legalized)
llvm_unreachable("widen scalar should have succeeded");
return;
}
if (DstTy.getSizeInBits() != 64)
break;
LLT HalfTy = getHalfSizedType(DstTy);
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
// All inputs are SGPRs, nothing special to do.
if (DefRegs.empty()) {
assert(Src0Regs.empty() && Src1Regs.empty());
break;
}
assert(DefRegs.size() == 2);
assert(Src0Regs.size() == Src1Regs.size() &&
(Src0Regs.empty() || Src0Regs.size() == 2));
// Depending on where the source registers came from, the generic code may
// have decided to split the inputs already or not. If not, we still need to
// extract the values.
MachineIRBuilder B(MI);
if (Src0Regs.empty())
split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
else
setRegsToType(MRI, Src0Regs, HalfTy);
if (Src1Regs.empty())
split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
else
setRegsToType(MRI, Src1Regs, HalfTy);
setRegsToType(MRI, DefRegs, HalfTy);
B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
MI.eraseFromParent();
return;
}
case AMDGPU::G_ABS: {
Register SrcReg = MI.getOperand(1).getReg();
const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
// There is no VALU abs instruction so we need to replace it with a sub and
// max combination.
if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
MachineFunction *MF = MI.getParent()->getParent();
ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
MachineIRBuilder B(MI, Apply);
LegalizerHelper Helper(*MF, Apply, B);
if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
return;
}
LLVM_FALLTHROUGH;
}
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
case AMDGPU::G_SHL:
case AMDGPU::G_LSHR:
case AMDGPU::G_ASHR:
case AMDGPU::G_SMIN:
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
// Packed 16-bit operations need to be scalarized and promoted.
if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
break;
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::VGPRRegBank)
break;
const LLT S32 = LLT::scalar(32);
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
MachineIRBuilder B(MI, ApplySALU);
if (DstTy.isVector()) {
Register WideSrc0Lo, WideSrc0Hi;
Register WideSrc1Lo, WideSrc1Hi;
unsigned ExtendOp = getExtendOp(MI.getOpcode());
std::tie(WideSrc0Lo, WideSrc0Hi)
= unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
std::tie(WideSrc1Lo, WideSrc1Hi)
= unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
MI.eraseFromParent();
} else {
LegalizerHelper Helper(*MF, ApplySALU, B);
if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
llvm_unreachable("widen scalar should have succeeded");
// FIXME: s16 shift amounts should be legal.
if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
Opc == AMDGPU::G_ASHR) {
B.setInsertPt(*MBB, MI.getIterator());
if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
llvm_unreachable("widen scalar should have succeeded");
}
}
return;
}
case AMDGPU::G_SEXT_INREG: {
SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
if (SrcRegs.empty())
break; // Nothing to repair
const LLT S32 = LLT::scalar(32);
MachineIRBuilder B(MI);
ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
GISelObserverWrapper Observer(&O);
B.setChangeObserver(Observer);
// Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
// we would need to further expand, and doesn't let us directly set the
// result registers.
SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
int Amt = MI.getOperand(2).getImm();
if (Amt <= 32) {
if (Amt == 32) {
// The low bits are unchanged.
B.buildCopy(DstRegs[0], SrcRegs[0]);
} else {
// Extend in the low bits and propagate the sign bit to the high half.
B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
}
B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
} else {
// The low bits are unchanged, and extend in the high bits.
B.buildCopy(DstRegs[0], SrcRegs[0]);
B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
}
Register DstReg = MI.getOperand(0).getReg();
MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
MI.eraseFromParent();
return;
}
case AMDGPU::G_CTPOP:
case AMDGPU::G_BITREVERSE: {
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::SGPRRegBank)
break;
Register SrcReg = MI.getOperand(1).getReg();
const LLT S32 = LLT::scalar(32);
LLT Ty = MRI.getType(SrcReg);
if (Ty == S32)
break;
ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
MachineIRBuilder B(MI, ApplyVALU);
MachineFunction &MF = B.getMF();
LegalizerHelper Helper(MF, ApplyVALU, B);
if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
llvm_unreachable("narrowScalar should have succeeded");
return;
}
case AMDGPU::G_AMDGPU_FFBH_U32:
case AMDGPU::G_AMDGPU_FFBL_B32:
case AMDGPU::G_CTLZ_ZERO_UNDEF:
case AMDGPU::G_CTTZ_ZERO_UNDEF: {
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::SGPRRegBank)
break;
Register SrcReg = MI.getOperand(1).getReg();
const LLT S32 = LLT::scalar(32);
LLT Ty = MRI.getType(SrcReg);
if (Ty == S32)
break;
// We can narrow this more efficiently than Helper can by using ffbh/ffbl
// which return -1 when the input is zero:
// (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
// (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
// (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
// (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
MachineIRBuilder B(MI, ApplyVALU);
SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
: Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
: Opc;
unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
unsigned AddOpc =
Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
? AMDGPU::G_ADD
: AMDGPU::G_UADDSAT;
Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
Register DstReg = MI.getOperand(0).getReg();
B.buildUMin(DstReg, X, Y);
MI.eraseFromParent();
return;
}
case AMDGPU::G_SEXT:
case AMDGPU::G_ZEXT:
case AMDGPU::G_ANYEXT: {
Register SrcReg = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(SrcReg);
const bool Signed = Opc == AMDGPU::G_SEXT;
assert(empty(OpdMapper.getVRegs(1)));
MachineIRBuilder B(MI);
const RegisterBank *SrcBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
if (DstTy.isScalar() &&
SrcBank != &AMDGPU::SGPRRegBank &&
SrcBank != &AMDGPU::VCCRegBank &&
// FIXME: Should handle any type that round to s64 when irregular
// breakdowns supported.
DstTy.getSizeInBits() == 64 &&
SrcTy.getSizeInBits() <= 32) {
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
// Extend to 32-bit, and then extend the low half.
if (Signed) {
// TODO: Should really be buildSExtOrCopy
B.buildSExtOrTrunc(DefRegs[0], SrcReg);
} else if (Opc == AMDGPU::G_ZEXT) {
B.buildZExtOrTrunc(DefRegs[0], SrcReg);
} else {
B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
}
extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
MRI.setRegBank(DstReg, *SrcBank);
MI.eraseFromParent();
return;
}
if (SrcTy != LLT::scalar(1))
return;
// It is not legal to have a legalization artifact with a VCC source. Rather
// than introducing a copy, insert the select we would have to select the
// copy to.
if (SrcBank == &AMDGPU::VCCRegBank) {
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
unsigned DstSize = DstTy.getSizeInBits();
// 64-bit select is SGPR only
const bool UseSel64 = DstSize > 32 &&
SrcBank->getID() == AMDGPU::SGPRRegBankID;
// TODO: Should s16 select be legal?
LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
auto True = B.buildConstant(SelType, Signed ? -1 : 1);
auto False = B.buildConstant(SelType, 0);
MRI.setRegBank(True.getReg(0), *DstBank);
MRI.setRegBank(False.getReg(0), *DstBank);
MRI.setRegBank(DstReg, *DstBank);
if (DstSize > 32) {
B.buildSelect(DefRegs[0], SrcReg, True, False);
extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
} else if (DstSize < 32) {
auto Sel = B.buildSelect(SelType, SrcReg, True, False);
MRI.setRegBank(Sel.getReg(0), *DstBank);
B.buildTrunc(DstReg, Sel);
} else {
B.buildSelect(DstReg, SrcReg, True, False);
}
MI.eraseFromParent();
return;
}
break;
}
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
if (DstTy != LLT::fixed_vector(2, 16))
break;
assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
substituteSimpleCopyRegs(OpdMapper, 1);
substituteSimpleCopyRegs(OpdMapper, 2);
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank == &AMDGPU::SGPRRegBank)
break</