blob: 4d78188b3dc36522a655b711b3e7d5fa6f571997 [file] [log] [blame]
//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
#define GET_TARGET_REGBANK_IMPL
#include "AMDGPUGenRegisterBank.inc"
// This file will be TableGen'ed at some point.
#include "AMDGPUGenRegisterBankInfo.def"
using namespace llvm;
using namespace MIPatternMatch;
namespace {
// Observer to apply a register bank to new registers created by LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
MachineRegisterInfo &MRI;
const RegisterBank *NewBank;
SmallVector<MachineInstr *, 4> NewInsts;
public:
ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
: MRI(MRI_), NewBank(RB) {}
~ApplyRegBankMapping() {
for (MachineInstr *MI : NewInsts)
applyBank(*MI);
}
/// Set any registers that don't have a set register class or bank to SALU.
void applyBank(MachineInstr &MI) {
for (MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;
Register Reg = Op.getReg();
if (MRI.getRegClassOrRegBank(Reg))
continue;
const RegisterBank *RB = NewBank;
// FIXME: This might not be enough to detect when SCC should be used.
if (MRI.getType(Reg) == LLT::scalar(1))
RB = (NewBank == &AMDGPU::SGPRRegBank ?
&AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
MRI.setRegBank(Reg, *RB);
}
}
void erasingInstr(MachineInstr &MI) override {}
void createdInstr(MachineInstr &MI) override {
// At this point, the instruction was just inserted and has no operands.
NewInsts.push_back(&MI);
}
void changingInstr(MachineInstr &MI) override {}
void changedInstr(MachineInstr &MI) override {}
};
}
/// Construct the register bank info for \p ST, caching the subtarget's
/// register info and instruction info.
///
/// On the first construction only, asserts builds verify that the SGPR and
/// VGPR bank IDs resolve to the expected generated bank objects.
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterBankInfo(),
      Subtarget(ST),
      TRI(Subtarget.getRegisterInfo()),
      TII(Subtarget.getInstrInfo()) {
  // HACK: Until this is fully tablegen'd.
  static bool AlreadyInit = false;
  if (!AlreadyInit) {
    AlreadyInit = true;

    const RegisterBank &SGPRBank = getRegBank(AMDGPU::SGPRRegBankID);
    (void)SGPRBank;
    assert(&SGPRBank == &AMDGPU::SGPRRegBank);

    const RegisterBank &VGPRBank = getRegBank(AMDGPU::VGPRRegBankID);
    (void)VGPRBank;
    assert(&VGPRBank == &AMDGPU::VGPRRegBank);
  }
}
/// Return the cost of copying a \p Size bit value from bank \p Src to bank
/// \p Dst. Copies this function rejects are reported with the maximum
/// unsigned value; everything else uses the generic RegisterBankInfo cost.
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
                                          const RegisterBank &Src,
                                          unsigned Size) const {
  const unsigned Rejected = std::numeric_limits<unsigned>::max();

  const unsigned DstID = Dst.getID();
  const unsigned SrcID = Src.getID();

  const bool DstIsSGPR = DstID == AMDGPU::SGPRRegBankID;
  const bool DstIsSCC = DstID == AMDGPU::SCCRegBankID;
  const bool SrcIsSGPR = SrcID == AMDGPU::SGPRRegBankID;
  const bool SrcIsVGPR = SrcID == AMDGPU::VGPRRegBankID;
  const bool SrcIsVCC = SrcID == AMDGPU::VCCRegBankID;

  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
  if (DstIsSGPR && SrcIsVGPR)
    return Rejected;

  // Bool values are tricky, because the meaning is based on context. The SCC
  // and VCC banks are for the natural scalar and vector conditions produced
  // by a compare.
  //
  // Legalization doesn't know about the necessary context, so an s1 use may
  // have been a truncate from an arbitrary value, in which case a copy
  // (lowered as a compare with 0) needs to be inserted.
  const bool IsBool = Size == 1;
  if (IsBool && (DstIsSCC || DstIsSGPR) &&
      (SrcIsSGPR || SrcIsVGPR || SrcIsVCC))
    return Rejected;

  if (DstIsSCC && SrcIsVCC)
    return Rejected;

  return RegisterBankInfo::copyCost(Dst, Src, Size);
}
/// Return the cost of splitting a value as described by \p ValMapping.
///
/// A mapping with two or more pieces, or whose first piece is at least 64
/// bits wide, is reported as expensive (10). Anything else costs 1.
/// \p CurBank is not consulted.
///
/// NOTE(review): the assert below requires NumBreakDowns == 2, but that case
/// has already taken the early return, so the assert cannot hold if it is
/// ever reached in an asserts build. The intended early-return condition
/// should be confirmed.
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
    const ValueMapping &ValMapping,
    const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  const bool HasMultiplePieces = ValMapping.NumBreakDowns >= 2;
  if (HasMultiplePieces || ValMapping.BreakDown[0].Length >= 64) {
    // This is expensive.
    return 10;
  }

  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // A 32-bit extract of a 64-bit value is just a subregister access, so it
  // would ideally be free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.
  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  const unsigned SubRegAccessCost = 1;
  return SubRegAccessCost;
}
/// Map register class \p RC to a register bank: SReg_1 maps to VCC, any
/// other class the register info reports as an SGPR class maps to SGPR, and
/// everything else maps to VGPR.
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
    const TargetRegisterClass &RC) const {
  const bool IsSReg1 = &RC == &AMDGPU::SReg_1RegClass;
  if (IsSReg1)
    return AMDGPU::VCCRegBank;

  if (TRI->isSGPRClass(&RC))
    return AMDGPU::SGPRRegBank;

  return AMDGPU::VGPRRegBank;
}
/// Build one alternative instruction mapping per row of \p Table.
///
/// \p RegSrcOpIdx lists the operand indices of \p MI that the table columns
/// describe. Explicit defs are pre-assigned to the VGPR bank; a table column
/// that names a def's index replaces that default. Operands that are neither
/// explicit defs nor listed in \p RegSrcOpIdx keep a null value mapping.
/// Mapping IDs are handed out sequentially starting at 2.
template <unsigned NumOps>
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::addMappingFromTable(
    const MachineInstr &MI, const MachineRegisterInfo &MRI,
    const std::array<unsigned, NumOps> RegSrcOpIdx,
    ArrayRef<OpRegBankEntry<NumOps>> Table) const {
  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());

  // Bit widths of the table-controlled operands, in table column order.
  std::array<unsigned, NumOps> Sizes;
  for (unsigned Col = 0; Col != NumOps; ++Col) {
    const MachineOperand &MO = MI.getOperand(RegSrcOpIdx[Col]);
    Sizes[Col] = getSizeInBits(MO.getReg(), MRI, *TRI);
  }

  // Explicit defs default to VGPR.
  const unsigned NumDefs = MI.getNumExplicitDefs();
  for (unsigned DefIdx = 0; DefIdx != NumDefs; ++DefIdx) {
    const Register DefReg = MI.getOperand(DefIdx).getReg();
    const unsigned DefSize = getSizeInBits(DefReg, MRI, *TRI);
    Operands[DefIdx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DefSize);
  }

  InstructionMappings AltMappings;

  // getInstrMapping's default mapping uses ID 1, so start at 2.
  unsigned MappingID = 2;
  for (const OpRegBankEntry<NumOps> &Row : Table) {
    for (unsigned Col = 0; Col != NumOps; ++Col) {
      const int OpIdx = RegSrcOpIdx[Col];
      Operands[OpIdx] = AMDGPU::getValueMapping(Row.RegBanks[Col], Sizes[Col]);
    }

    const InstructionMapping &Mapping = getInstructionMapping(
        MappingID, Row.Cost, getOperandsMapping(Operands), Operands.size());
    AltMappings.push_back(&Mapping);
    ++MappingID;
  }

  return AltMappings;
}
/// Return the alternative bank mappings for the G_INTRINSIC instruction
/// \p MI. Only amdgcn_readlane and amdgcn_writelane get custom tables; every
/// other intrinsic defers to the generic implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  constexpr unsigned SBank = AMDGPU::SGPRRegBankID;
  constexpr unsigned VBank = AMDGPU::VGPRRegBankID;

  const auto IntrinsicID = MI.getIntrinsicID();

  if (IntrinsicID == Intrinsic::amdgcn_readlane) {
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { SBank, VBank, SBank }, 1 },

      // Need a readfirstlane for the index.
      { { SBank, VBank, VBank }, 2 }
    };

    // Table columns describe operands 0, 2 and 3 of the instruction.
    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }

  if (IntrinsicID == Intrinsic::amdgcn_writelane) {
    static const OpRegBankEntry<4> Table[4] = {
      // Perfectly legal.
      { { VBank, SBank, SBank, VBank }, 1 },

      // Need readfirstlane of first op
      { { VBank, VBank, SBank, VBank }, 2 },

      // Need readfirstlane of second op
      { { VBank, SBank, VBank, VBank }, 2 },

      // Need readfirstlane of both ops
      { { VBank, VBank, VBank, VBank }, 3 }
    };

    // Table columns describe operands 0, 2, 3 and 4 of the instruction.
    const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
    return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }

  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
/// Return the alternative bank mappings for the G_INTRINSIC_W_SIDE_EFFECTS
/// instruction \p MI. Buffer loads, scalar buffer loads, ds_ordered_add/swap
/// and s_sendmsg/s_sendmsghalt get custom tables; every other intrinsic
/// defers to the generic implementation.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
    const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  constexpr unsigned SBank = AMDGPU::SGPRRegBankID;
  constexpr unsigned VBank = AMDGPU::VGPRRegBankID;

  const auto IntrinsicID = MI.getIntrinsicID();

  if (IntrinsicID == Intrinsic::amdgcn_buffer_load) {
    static const OpRegBankEntry<3> Table[4] = {
      // Perfectly legal.
      { { SBank, VBank, SBank }, 1 },
      { { SBank, VBank, VBank }, 1 },

      // Waterfall loop needed for rsrc. In the worst case this will execute
      // approximately an extra 10 * wavesize + 2 instructions.
      { { VBank, VBank, SBank }, 1000 },
      { { VBank, VBank, VBank }, 1000 }
    };

    // rsrc, voffset, offset
    const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }

  if (IntrinsicID == Intrinsic::amdgcn_s_buffer_load) {
    static const OpRegBankEntry<2> Table[4] = {
      // Perfectly legal.
      { { SBank, SBank }, 1 },

      // Only need 1 register in loop
      { { SBank, VBank }, 300 },

      // Have to waterfall the resource.
      { { VBank, SBank }, 1000 },

      // Have to waterfall the resource, and the offset.
      { { VBank, VBank }, 1500 }
    };

    // rsrc, offset
    const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
    return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }

  if (IntrinsicID == Intrinsic::amdgcn_ds_ordered_add ||
      IntrinsicID == Intrinsic::amdgcn_ds_ordered_swap) {
    // VGPR = M0, VGPR
    static const OpRegBankEntry<3> Table[2] = {
      // Perfectly legal.
      { { VBank, SBank, VBank }, 1 },

      // Need a readfirstlane for m0
      { { VBank, VBank, VBank }, 2 }
    };

    const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
    return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }

  if (IntrinsicID == Intrinsic::amdgcn_s_sendmsg ||
      IntrinsicID == Intrinsic::amdgcn_s_sendmsghalt) {
    // FIXME: Should have no register for immediate
    static const OpRegBankEntry<1> Table[2] = {
      // Perfectly legal.
      { { SBank }, 1 },

      // Need readlane
      { { VBank }, 3 }
    };

    const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
    return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
  }

  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
// FIXME: Returns uniform if there's no source value information. This is
// probably wrong.
static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())
return false;
const MachineMemOperand *MMO = *MI.memoperands_begin();
return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
AMDGPUInstrInfo::isUniformMMO(MMO);
}
/// Return the alternative register bank mappings available for \p MI.
///
/// Opcodes with a case below get a hand-written list of mappings; each
/// mapping is created with getInstructionMapping(ID, Cost, Operands,
/// NumOperands). Generic intrinsics are forwarded to the two intrinsic
/// helpers. Any path that leaves the switch with `break` ends at the final
/// return, which yields the generic RegisterBankInfo result.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
InstructionMappings AltMappings;
switch (MI.getOpcode()) {
case TargetOpcode::G_CONSTANT: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
// A 1-bit constant may additionally be placed in the VCC or SCC bank.
if (Size == 1) {
static const OpRegBankEntry<1> Table[4] = {
{ { AMDGPU::VGPRRegBankID }, 1 },
{ { AMDGPU::SGPRRegBankID }, 1 },
{ { AMDGPU::VCCRegBankID }, 1 },
{ { AMDGPU::SCCRegBankID }, 1 }
};
return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
}
// Wider constants share the VGPR/SGPR table of the cases below.
LLVM_FALLTHROUGH;
}
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_FRAME_INDEX:
case TargetOpcode::G_GLOBAL_VALUE: {
static const OpRegBankEntry<1> Table[2] = {
{ { AMDGPU::VGPRRegBankID }, 1 },
{ { AMDGPU::SGPRRegBankID }, 1 }
};
return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
}
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
if (Size == 1) {
// s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
// NOTE(review): SCCMapping and SGPRMapping below are both created with
// mapping ID 1 - confirm whether distinct IDs were intended.
const InstructionMapping &SCCMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&SCCMapping);
const InstructionMapping &SGPRMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&SGPRMapping);
const InstructionMapping &VCCMapping0 = getInstructionMapping(
2, 10, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&VCCMapping0);
return AltMappings;
}
// Only 64-bit operations get the custom mappings below; other sizes use the
// generic result.
if (Size != 64)
break;
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&SSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(
2, 2, getOperandsMapping(
{AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&VVMapping);
const InstructionMapping &SVMapping = getInstructionMapping(
3, 3, getOperandsMapping(
{AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&SVMapping);
// SGPR in LHS is slightly preferable, so make VS more expensive than
// SV.
// NOTE(review): VSMapping reuses mapping ID 3, the same ID as SVMapping -
// confirm whether that is intended.
const InstructionMapping &VSMapping = getInstructionMapping(
3, 4, getOperandsMapping(
{AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&VSMapping);
// NOTE(review): this `break` reaches the final return of the function, which
// returns the generic mappings rather than AltMappings, so the four 64-bit
// mappings collected above are not returned to the caller - confirm whether
// `return AltMappings;` was intended.
break;
}
case TargetOpcode::G_LOAD:
case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_SEXTLOAD: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
unsigned PtrSize = PtrTy.getSizeInBits();
unsigned AS = PtrTy.getAddressSpace();
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
// The all-SGPR mapping is only offered outside the local, region and
// private address spaces, and only when isInstrUniformNonExtLoadAlign4()
// accepts the load.
if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
AS != AMDGPUAS::PRIVATE_ADDRESS) &&
isInstrUniformNonExtLoadAlign4(MI)) {
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&SSMapping);
}
// The all-VGPR mapping is always available.
const InstructionMapping &VVMapping = getInstructionMapping(
2, 1, getOperandsMapping(
{AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);
// It may be possible to have a vgpr = load sgpr mapping here, because
// the mubuf instructions support this kind of load, but probably for only
// gfx7 and older. However, the addressing mode matching in the instruction
// selector should be able to do a better job of detecting and selecting
// these kinds of loads from the vgpr = load vgpr mapping.
return AltMappings;
}
case TargetOpcode::G_ICMP: {
// The compared values are operands 2 and 3; operand 1 is the predicate and
// gets no mapping.
unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
nullptr, // Predicate operand.
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&SSMapping);
const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
nullptr, // Predicate operand.
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&SVMapping);
const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
nullptr, // Predicate operand.
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&VSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
nullptr, // Predicate operand.
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&VVMapping);
return AltMappings;
}
case TargetOpcode::G_SELECT: {
// Either all-SGPR with an SCC condition, or all-VGPR with a VCC condition.
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&SSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
4); // Num Operands
AltMappings.push_back(&VVMapping);
return AltMappings;
}
case TargetOpcode::G_SMIN:
case TargetOpcode::G_SMAX:
case TargetOpcode::G_UMIN:
case TargetOpcode::G_UMAX: {
static const OpRegBankEntry<3> Table[4] = {
{ { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
{ { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
{ { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
// Scalar requires cmp+select, and extends if 16-bit.
// FIXME: Should there be separate costs for 32 and 16-bit
{ { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
};
const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
}
case TargetOpcode::G_UADDE:
case TargetOpcode::G_USUBE:
case TargetOpcode::G_SADDE:
case TargetOpcode::G_SSUBE: {
// Operands 1 and 4 are mapped as 1-bit values: SCC in the scalar mapping,
// VCC in the vector mapping.
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
5); // Num Operands
AltMappings.push_back(&SSMapping);
const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
5); // Num Operands
AltMappings.push_back(&VVMapping);
return AltMappings;
}
case AMDGPU::G_BRCOND: {
assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
// NOTE(review): SMapping and VMapping below are both created with mapping
// ID 1 - confirm whether distinct IDs were intended.
const InstructionMapping &SMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
2); // Num Operands
AltMappings.push_back(&SMapping);
const InstructionMapping &VMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
2); // Num Operands
AltMappings.push_back(&VMapping);
return AltMappings;
}
case AMDGPU::G_INTRINSIC:
return getInstrAlternativeMappingsIntrinsic(MI, MRI);
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
default:
break;
}
// Reached by `default` and by every case that leaves the switch via `break`.
return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
/// Split \p Reg into two \p HalfTy pieces (asserted to be 32 bits each) with
/// a G_UNMERGE_VALUES built through \p B. The new low and high registers
/// are given the register bank of \p Reg and appended to \p Regs in that
/// order.
void AMDGPURegisterBankInfo::split64BitValueForMapping(
    MachineIRBuilder &B,
    SmallVector<Register, 2> &Regs,
    LLT HalfTy,
    Register Reg) const {
  assert(HalfTy.getSizeInBits() == 32);

  MachineRegisterInfo &RegInfo = *B.getMRI();
  const Register LoHalf = RegInfo.createGenericVirtualRegister(HalfTy);
  const Register HiHalf = RegInfo.createGenericVirtualRegister(HalfTy);

  // Both halves live in the same bank as the value being split.
  const RegisterBank &SrcBank = *getRegBank(Reg, RegInfo, *TRI);
  RegInfo.setRegBank(LoHalf, SrcBank);
  RegInfo.setRegBank(HiHalf, SrcBank);

  Regs.push_back(LoHalf);
  Regs.push_back(HiHalf);

  auto Unmerge = B.buildInstr(AMDGPU::G_UNMERGE_VALUES);
  Unmerge.addDef(LoHalf);
  Unmerge.addDef(HiHalf);
  Unmerge.addUse(Reg);
}
/// Retype every register in \p Regs to \p NewTy. Each register's current
/// type must have the same size in bits as \p NewTy (checked by assertion).
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
                          LLT NewTy) {
  for (unsigned Idx = 0, NumRegs = Regs.size(); Idx != NumRegs; ++Idx) {
    const Register R = Regs[Idx];
    assert(MRI.getType(R).getSizeInBits() == NewTy.getSizeInBits());
    MRI.setType(R, NewTy);
  }
}
/// Return a type half the size of \p Ty: a scalar of half the bit width for
/// non-vector types, or the same element type with half the element count
/// for vectors. The width or element count must be even (checked by
/// assertion).
static LLT getHalfSizedType(LLT Ty) {
  if (!Ty.isVector()) {
    assert(Ty.getSizeInBits() % 2 == 0);
    return LLT::scalar(Ty.getSizeInBits() / 2);
  }

  assert(Ty.getNumElements() % 2 == 0);
  const unsigned HalfNumElts = Ty.getNumElements() / 2;
  return LLT::scalarOrVector(HalfNumElts, Ty.getElementType());
}
/// Legalize the instructions in \p Range whose register operands listed in
/// \p SGPROperandRegs must be SGPRs. If any of the required SGPR operands are
/// VGPRs, perform a waterfall loop to execute the instruction for each unique
/// combination of values in all lanes in the wave. The block will be split
/// such that rest of the instructions are moved to a new block.
///
/// Essentially performs this loop:
///
/// Save Execution Mask
/// For (Lane : Wavefront) {
/// Enable Lane, Disable all other lanes
/// SGPR = read SGPR value for current lane from VGPR
/// VGPRResult[Lane] = use_op SGPR
/// }
/// Restore Execution Mask
///
/// There is additional complexity in comparing values to identify the
/// unique values used.
///
/// \param B builder positioned in the block containing \p Range; on return
/// its insert point is the end of that (now truncated) block.
/// \param Range the instructions to move into the loop.
/// \param SGPROperandRegs registers that are replaced by per-iteration
/// readfirstlane results inside the loop.
/// \returns true; this function has no other return path.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
MachineIRBuilder &B,
iterator_range<MachineBasicBlock::iterator> Range,
SmallSet<Register, 4> &SGPROperandRegs,
MachineRegisterInfo &MRI) const {
SmallVector<Register, 4> ResultRegs;
SmallVector<Register, 4> InitResultRegs;
SmallVector<Register, 4> PhiRegs;
MachineBasicBlock &MBB = B.getMBB();
MachineFunction *MF = &B.getMF();
// Pick the 32-bit or 64-bit flavour of every exec-mask opcode and of the
// exec register itself, depending on the subtarget's wave size.
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
const unsigned WaveAndOpc = Subtarget.isWave32() ?
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
const unsigned MovTermOpc = Subtarget.isWave32() ?
AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
const unsigned XorTermOpc = Subtarget.isWave32() ?
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
const unsigned ExecReg = Subtarget.isWave32() ?
AMDGPU::EXEC_LO : AMDGPU::EXEC;
// For every def in the range, build an undef initial value and create a phi
// register of the same type, both on the def's register bank. The three
// vectors stay index-aligned and are zipped into G_PHIs further down.
for (MachineInstr &MI : Range) {
for (MachineOperand &Def : MI.defs()) {
LLT ResTy = MRI.getType(Def.getReg());
const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
ResultRegs.push_back(Def.getReg());
Register InitReg = B.buildUndef(ResTy).getReg(0);
Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
InitResultRegs.push_back(InitReg);
PhiRegs.push_back(PhiReg);
MRI.setRegBank(PhiReg, *DefBank);
MRI.setRegBank(InitReg, *DefBank);
}
}
Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
// Don't bother using generic instructions/registers for the exec mask.
B.buildInstr(TargetOpcode::IMPLICIT_DEF)
.addDef(InitSaveExecReg);
Register PhiExec = MRI.createVirtualRegister(WaveRC);
Register NewExec = MRI.createVirtualRegister(WaveRC);
// To insert the loop we need to split the block. Move everything before this
// point to a new block, and insert a new empty block before this instruction.
//
// Resulting control flow:
//   MBB -> LoopBB -> { LoopBB, RestoreExecBB } ; RestoreExecBB -> RemainderBB
// with RemainderBB inheriting MBB's original successors.
MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
MachineFunction::iterator MBBI(MBB);
++MBBI;
MF->insert(MBBI, LoopBB);
MF->insert(MBBI, RestoreExecBB);
MF->insert(MBBI, RemainderBB);
LoopBB->addSuccessor(RestoreExecBB);
LoopBB->addSuccessor(LoopBB);
// Move the rest of the block into a new block.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
MBB.addSuccessor(LoopBB);
RestoreExecBB->addSuccessor(RemainderBB);
B.setInsertPt(*LoopBB, LoopBB->end());
// Exec-mask phi: InitSaveExecReg on entry from MBB, NewExec on the back edge.
B.buildInstr(TargetOpcode::PHI)
.addDef(PhiExec)
.addReg(InitSaveExecReg)
.addMBB(&MBB)
.addReg(NewExec)
.addMBB(LoopBB);
for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
B.buildInstr(TargetOpcode::G_PHI)
.addDef(std::get<2>(Result))
.addReg(std::get<0>(Result)) // Initial value / implicit_def
.addMBB(&MBB)
.addReg(std::get<1>(Result)) // Mid-loop value.
.addMBB(LoopBB);
}
const DebugLoc &DL = B.getDL();
// Figure out the iterator range after splicing the instructions.
// NOTE(review): at this point the last instruction in LoopBB is the last phi
// built above, so NewBegin refers to that phi and the operand scan below
// visits it as well as the spliced instructions.
auto NewBegin = std::prev(LoopBB->end());
// Move the instruction into the loop. Note we moved everything after
// Range.end() already into a new block, so Range.end() is no longer valid.
LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
auto NewEnd = LoopBB->end();
// New loop-body code is inserted in front of the first moved instruction.
MachineBasicBlock::iterator I = Range.begin();
B.setInsertPt(*LoopBB, I);
// Accumulated "all waterfall operands match the current lane" condition.
// Stays NoRegister until the first operand has been processed.
Register CondReg;
for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
for (MachineOperand &Op : MI.uses()) {
if (!Op.isReg() || Op.isDef())
continue;
if (SGPROperandRegs.count(Op.getReg())) {
LLT OpTy = MRI.getType(Op.getReg());
unsigned OpSize = OpTy.getSizeInBits();
// Can only do a readlane of 32-bit pieces.
if (OpSize == 32) {
// Avoid extra copies in the simple case of one 32-bit register.
Register CurrentLaneOpReg
= MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MRI.setType(CurrentLaneOpReg, OpTy);
constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpReg)
.addReg(Op.getReg());
Register NewCondReg = MRI.createVirtualRegister(WaveRC);
bool First = CondReg == AMDGPU::NoRegister;
if (First)
CondReg = NewCondReg;
// Compare the just read M0 value to all possible Idx values.
B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
.addReg(Op.getReg());
// The compare above still used the original VGPR; only now is the operand
// rewritten to the value read for the current lane.
Op.setReg(CurrentLaneOpReg);
if (!First) {
Register AndReg = MRI.createVirtualRegister(WaveRC);
// If there are multiple operands to consider, and the conditions.
B.buildInstr(WaveAndOpc)
.addDef(AndReg)
.addReg(NewCondReg)
.addReg(CondReg);
CondReg = AndReg;
}
} else {
LLT S32 = LLT::scalar(32);
SmallVector<Register, 8> ReadlanePieces;
// The compares can be done as 64-bit, but the extract needs to be done
// in 32-bit pieces.
bool Is64 = OpSize % 64 == 0;
LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
: AMDGPU::V_CMP_EQ_U32_e64;
// The compares can be done as 64-bit, but the extract needs to be done
// in 32-bit pieces.
// Insert the unmerge before the loop.
B.setMBB(MBB);
auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
// Switch the builder back to inserting in front of the moved instruction.
B.setInstr(*I);
// Every operand of the unmerge except the final source is a piece.
unsigned NumPieces = Unmerge->getNumOperands() - 1;
for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
Register UnmergePiece = Unmerge.getReg(PieceIdx);
Register CurrentLaneOpReg;
if (Is64) {
Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpRegLo)
.addReg(UnmergePiece, 0, AMDGPU::sub0);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpRegHi)
.addReg(UnmergePiece, 0, AMDGPU::sub1);
CurrentLaneOpReg =
B.buildMerge(LLT::scalar(64),
{CurrentLaneOpRegLo, CurrentLaneOpRegHi})
.getReg(0);
MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
if (OpTy.getScalarSizeInBits() == 64) {
// If we need to produce a 64-bit element vector, use the merged
// pieces.
ReadlanePieces.push_back(CurrentLaneOpReg);
} else {
// 32-bit element type.
ReadlanePieces.push_back(CurrentLaneOpRegLo);
ReadlanePieces.push_back(CurrentLaneOpRegHi);
}
} else {
CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
// Read the next variant <- also loop target.
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
CurrentLaneOpReg)
.addReg(UnmergePiece);
ReadlanePieces.push_back(CurrentLaneOpReg);
}
Register NewCondReg = MRI.createVirtualRegister(WaveRC);
bool First = CondReg == AMDGPU::NoRegister;
if (First)
CondReg = NewCondReg;
// Compare the value read for the current lane against this piece.
B.buildInstr(CmpOp)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
.addReg(UnmergePiece);
if (!First) {
Register AndReg = MRI.createVirtualRegister(WaveRC);
// If there are multiple operands to consider, and the conditions.
B.buildInstr(WaveAndOpc)
.addDef(AndReg)
.addReg(NewCondReg)
.addReg(CondReg);
CondReg = AndReg;
}
}
// FIXME: Build merge seems to switch to CONCAT_VECTORS but not
// BUILD_VECTOR
if (OpTy.isVector()) {
auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
Op.setReg(Merge.getReg(0));
} else {
auto Merge = B.buildMerge(OpTy, ReadlanePieces);
Op.setReg(Merge.getReg(0));
}
// The reassembled operand is assigned to the SGPR bank.
MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
}
}
}
}
B.setInsertPt(*LoopBB, LoopBB->end());
// Update EXEC, save the original EXEC value to VCC.
// NOTE(review): CondReg is still NoRegister here if no operand in the range
// matched SGPROperandRegs - presumably callers guarantee at least one match;
// confirm.
B.buildInstr(AndSaveExecOpc)
.addDef(NewExec)
.addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
B.buildInstr(XorTermOpc)
.addDef(ExecReg)
.addReg(ExecReg)
.addReg(NewExec);
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
// s_cbranch_scc0?
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
.addMBB(LoopBB);
// Save the EXEC mask before the loop.
BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
.addReg(ExecReg);
// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
B.buildInstr(MovTermOpc)
.addDef(ExecReg)
.addReg(SaveExecReg);
// Restore the insert point before the original instruction.
B.setInsertPt(MBB, MBB.end());
return true;
}
// Collect the unique registers used by \p MI at \p OpIndices that are
// currently assigned to the VGPR bank and therefore need to be handled in a
// waterfall loop. The registers are added to \p SGPROperandRegs. Returns true
// if \p SGPROperandRegs is non-empty afterwards, i.e. there are operands to
// handle and a waterfall loop is necessary.
bool AMDGPURegisterBankInfo::collectWaterfallOperands(
    SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
    MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
  for (unsigned OpIdx : OpIndices) {
    assert(MI.getOperand(OpIdx).isUse());

    const Register OpReg = MI.getOperand(OpIdx).getReg();
    const RegisterBank *Bank = getRegBank(OpReg, MRI, *TRI);
    const bool IsVGPR = Bank->getID() == AMDGPU::VGPRRegBankID;
    if (IsVGPR)
      SGPROperandRegs.insert(OpReg);
  }

  // With nothing collected there is nothing to replace, so no loop is needed.
  const bool NeedsWaterfall = !SGPROperandRegs.empty();
  return NeedsWaterfall;
}
/// Wrap the single instruction \p MI in a waterfall loop if any of the
/// operands listed in \p OpIndices is in the VGPR bank.
///
/// \returns false without changing anything when no listed operand needs
/// handling; otherwise forwards the result of the range-based overload.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
    ArrayRef<unsigned> OpIndices) const {
  // A set is used so that an operand register that appears several times only
  // gets a single readfirstlane.
  SmallSet<Register, 4> WaterfallRegs;
  const bool NeedsLoop =
      collectWaterfallOperands(WaterfallRegs, MI, MRI, OpIndices);
  if (!NeedsLoop)
    return false;

  // The range to execute in the loop is just [MI, next(MI)).
  MachineBasicBlock::iterator First = MI.getIterator();
  MachineBasicBlock::iterator Last = std::next(First);
  return executeInWaterfallLoop(B, make_range(First, Last), WaterfallRegs,
                                MRI);
}
/// Convenience overload: constructs a MachineIRBuilder from \p MI and
/// delegates to the overload that takes an explicit builder.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    ArrayRef<unsigned> OpIndices) const {
  MachineIRBuilder Builder(MI);
  const bool Result = executeInWaterfallLoop(Builder, MI, MRI, OpIndices);
  return Result;
}
/// Legalize operand \p OpIdx of \p MI, which must be an SGPR, by inserting a
/// V_READFIRSTLANE_B32 when the operand is currently in the VGPR bank.
///
/// Operands in any other bank are left untouched. Otherwise a new
/// SReg_32_XM0 virtual register is defined by the readfirstlane (built with a
/// builder constructed from \p MI), the original register is constrained to
/// VGPR_32, and the operand is rewritten to use the new register.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
    MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
  Register VGPRSrc = MI.getOperand(OpIdx).getReg();
  if (getRegBank(VGPRSrc, MRI, *TRI) != &AMDGPU::VGPRRegBank)
    return;

  MachineIRBuilder B(MI);
  Register UniformReg =
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  auto ReadFirstLane = B.buildInstr(AMDGPU::V_READFIRSTLANE_B32);
  ReadFirstLane.addDef(UniformReg);
  ReadFirstLane.addReg(VGPRSrc);

  // The readfirstlane source has to be a 32-bit VGPR.
  const TargetRegisterClass *SrcRC =
      constrainGenericRegister(VGPRSrc, AMDGPU::VGPR_32RegClass, MRI);
  (void)SrcRC;
  assert(SrcRC && "Failed to constrain readfirstlane src reg");

  MI.getOperand(OpIdx).setReg(UniformReg);
}
/// Find an instruction other than \p MI that defines \p Reg.
///
/// When regbankselect repairs a register, it inserts a repair instruction
/// that defines the repaired register. It then calls applyMapping and expects
/// the target to either delete or rewrite the instruction that originally
/// wrote to the repaired register. Because of this, we can end up in a
/// situation where two instructions define the same register.
///
/// \returns the first defining instruction of \p Reg that is not \p MI, or
/// nullptr if there is none.
static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
                                     Register Reg,
                                     const MachineInstr &MI) {
  // Is there some way we can assert that there are exactly 2 def instructions?
  MachineInstr *OtherDef = nullptr;
  for (MachineInstr &Candidate : MRI.def_instructions(Reg)) {
    if (&Candidate == &MI)
      continue;
    OtherDef = &Candidate;
    break;
  }
  return OtherDef;
}
/// Split a load that is wider than 128 bits and whose pointer is not in the
/// SGPR bank into 128-bit vector pieces using LegalizerHelper.
///
/// \param MI        The load instruction being mapped.
/// \param OpdMapper Provides the new virtual registers for operand 0 (the
///                  result pieces) and operand 1 (the pointer).
/// \param MRI       Register info used to query and update types and banks.
///
/// \returns true if the load was rewritten here; false if nothing was done
/// (the load is 128 bits or narrower, the pointer is in the SGPR bank, or
/// LegalizerHelper::fewerElementsVector did not report Legalized).
bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI) const {
Register DstReg = MI.getOperand(0).getReg();
const LLT LoadTy = MRI.getType(DstReg);
unsigned LoadSize = LoadTy.getSizeInBits();
const unsigned MaxNonSmrdLoadSize = 128;
// 128-bit loads are supported for all instruction types.
if (LoadSize <= MaxNonSmrdLoadSize)
return false;
SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
// If the pointer is an SGPR, we have nothing to do.
if (SrcRegs.empty()) {
Register PtrReg = MI.getOperand(1).getReg();
const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
if (PtrBank == &AMDGPU::SGPRRegBank)
return false;
// No new pointer register was created by the mapper; use the original one.
SrcRegs.push_back(PtrReg);
}
// The load is split into equally sized 128-bit pieces below.
assert(LoadSize % MaxNonSmrdLoadSize == 0);
// We want to get the repair instruction now, because it will help us
// determine which instruction the legalizer inserts that will also
// write to DstReg.
MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
// RegBankSelect only emits scalar types, so we need to reset the pointer
// operand to a pointer type.
Register BasePtrReg = SrcRegs[0];
LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
MRI.setType(BasePtrReg, PtrTy);
MachineIRBuilder B(MI);
// Number of elements of the load's scalar type that fit in one 128-bit piece.
unsigned SplitElts =
MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
// Observer that is constructed with the VGPR bank; it is attached to the
// builder and the legalizer helper used for the split.
ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
GISelObserverWrapper Observer(&O);
B.setChangeObserver(Observer);
LegalizerHelper Helper(B.getMF(), Observer, B);
if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
return false;
// At this point, the legalizer has split the original load into smaller
// loads. At the end of lowering, it inserts an instruction (LegalizedInst)
// that combines the outputs of the lower loads and writes it to DstReg.
// The register bank selector has also added the RepairInst which writes to
// DstReg as well.
// NOTE(review): RepairInst is dereferenced here, but getOtherVRegDef returns
// nullptr when no other def exists. This assumes regbankselect always
// inserted a repair instruction before this point -- confirm.
MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
// Replace the output of the LegalizedInst with a temporary register, since
// RepairInst already defines DstReg.
Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
LegalizedInst->getOperand(0).setReg(TmpReg);
B.setInsertPt(*RepairInst->getParent(), RepairInst);
// Define each mapped result register by extracting element DefIdx from the
// temporary, using a constant index that is assigned to the VGPR bank.
for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
B.buildConstant(IdxReg, DefIdx);
MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
}
MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
return true;
}
/// Apply the mapping for an image intrinsic.
///
/// The default mapping is applied first (inserting copies for the VGPR
/// arguments); afterwards the resource operand and the operand immediately
/// following it (the sampler, if the intrinsic has one) are run through a
/// waterfall loop if they are registers.
///
/// \param RsrcIdx Index of the resource argument relative to the IR intrinsic
///                call arguments.
/// \returns true unconditionally.
bool AMDGPURegisterBankInfo::applyMappingImage(
    MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
    MachineRegisterInfo &MRI, int RsrcIdx) const {
  const int NumDefs = MI.getNumExplicitDefs();

  // The reported argument index is relative to the IR intrinsic call
  // arguments, so shift it by the number of defs and the intrinsic ID operand.
  const int RsrcOpIdx = RsrcIdx + NumDefs + 1;
  // If this intrinsic has a sampler, it immediately follows rsrc.
  const int SamplerOpIdx = RsrcOpIdx + 1;

  // Insert copies to VGPR arguments.
  applyDefaultMapping(OpdMapper);

  // Gather the operands that have to end up in SGPRs.
  SmallVector<unsigned, 4> SGPRIndexes;
  const int NumOps = MI.getNumOperands();
  for (int OpIdx = NumDefs; OpIdx != NumOps; ++OpIdx) {
    const bool IsRsrcOrSampler = OpIdx == RsrcOpIdx || OpIdx == SamplerOpIdx;
    if (IsRsrcOrSampler && MI.getOperand(OpIdx).isReg())
      SGPRIndexes.push_back(OpIdx);
  }

  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
  return true;
}
/// For cases where only a single copy is inserted to match register banks:
/// replace the register of operand \p OpIdx in the mapped instruction with the
/// new register provided by \p OpdMapper. Does nothing if the mapper created
/// no new register for that operand.
static void substituteSimpleCopyRegs(
    const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
  SmallVector<unsigned, 1> NewRegs(OpdMapper.getVRegs(OpIdx));
  if (NewRegs.empty())
    return;

  assert(NewRegs.size() == 1);
  MachineInstr &MappedMI = OpdMapper.getMI();
  MappedMI.getOperand(OpIdx).setReg(NewRegs[0]);
}
/// Handle register layout difference for f16 images for some subtargets.
///
/// On subtargets with unpacked D16 memory instructions, a vector of s16
/// elements is rewritten so that every element occupies its own 32-bit lane:
/// the value is unmerged into s16 pieces, each piece is any-extended to s32,
/// and the pieces are recombined into a vector of s32 with the same element
/// count.
///
/// \param B   Builder used to emit the unmerge / extend / build_vector.
/// \param MRI Used to query the type of \p Reg.
/// \param Reg The data register to convert.
/// \returns \p Reg unchanged if the subtarget does not use the unpacked
/// layout or \p Reg is not a vector of s16; otherwise the new <N x s32>
/// register.
///
/// NOTE(review): no register banks are assigned to the registers created
/// here; presumably the caller (or an observer attached to \p B) is
/// responsible for that -- confirm.
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
                                                MachineRegisterInfo &MRI,
                                                Register Reg) const {
  if (!Subtarget.hasUnpackedD16VMem())
    return Reg;

  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
    return Reg;

  auto Unmerge = B.buildUnmerge(S16, Reg);

  // Widen every s16 piece to s32. Merging the s16 pieces directly into an
  // <N x s32> result is ill-formed, because the sources would only cover half
  // of the destination bits.
  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  int NumElts = StoreVT.getNumElements();
  return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}
/// Decompose \p Reg into a base register and a constant offset.
///
/// \returns
///   - {invalid Register, C} if \p Reg is itself the integer constant C,
///   - {Base, C} if \p Reg is defined by a G_ADD of Base and the constant C,
///   - {Reg, 0} otherwise.
static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
  int64_t Imm;

  // The whole value is a constant: there is no base register.
  if (mi_match(Reg, MRI, m_ICst(Imm)))
    return std::make_pair(Register(), Imm);

  Register AddBase;
  const bool IsAddOfConstant =
      mi_match(Reg, MRI, m_GAdd(m_Reg(AddBase), m_ICst(Imm)));

  // TODO: Handle G_OR used for add case
  if (!IsAddOfConstant)
    return std::make_pair(Reg, 0);

  return std::make_pair(AddBase, Imm);
}
/// Split the buffer offset \p OrigOffset into a register part and an
/// immediate part that is at most 4095.
///
/// \returns a pair of (register holding the variable part of the offset,
/// immediate offset). A register is always returned: if there is no variable
/// part, a constant (the overflow amount, or zero) is materialized with \p B.
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
                                           Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  const LLT S32 = LLT::scalar(32);

  std::pair<Register, unsigned> BaseAndImm =
      getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
  Register VOffsetReg = BaseAndImm.first;
  unsigned Imm = BaseAndImm.second;

  if (Imm != 0) {
    // If the immediate is too big for the immoffset field, keep only its low
    // 12 bits there, so that the part that is copied/added for the voffset
    // field is a multiple of 4096 and stands more chance of being CSEd with
    // the copy/add for another similar load/store.
    unsigned HighPart = Imm & ~MaxImm;
    Imm -= HighPart;

    // However, do not do that rounding down to a multiple of 4096 if that is
    // a negative number, as it appears to be illegal to have a negative
    // offset in the vgpr, even if adding the immediate offset makes it
    // positive. In that case everything goes into the register part.
    if (static_cast<int32_t>(HighPart) < 0) {
      HighPart += Imm;
      Imm = 0;
    }

    if (HighPart != 0) {
      if (!VOffsetReg) {
        VOffsetReg = B.buildConstant(S32, HighPart).getReg(0);
      } else {
        auto HighPartCst = B.buildConstant(S32, HighPart);
        VOffsetReg = B.buildAdd(S32, VOffsetReg, HighPartCst).getReg(0);
      }
    }
  }

  // Always hand back a valid register for the variable part.
  if (!VOffsetReg)
    VOffsetReg = B.buildConstant(S32, 0).getReg(0);

  return {VOffsetReg, Imm};
}
/// \returns true if \p Reg matches an integer constant whose value is zero.
static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
  int64_t Value;
  if (!mi_match(Reg, MRI, m_ICst(Value)))
    return false;
  return Value == 0;
}
/// \returns bit 0 of \p CachePolicy (0 or 1).
static unsigned extractGLC(unsigned CachePolicy) {
  constexpr unsigned GLCBit = 0;
  return (CachePolicy >> GLCBit) & 1u;
}
/// \returns bit 1 of \p CachePolicy (0 or 1).
static unsigned extractSLC(unsigned CachePolicy) {
  constexpr unsigned SLCBit = 1;
  const unsigned Shifted = CachePolicy >> SLCBit;
  return Shifted & 1u;
}
/// \returns bit 2 of \p CachePolicy (0 or 1).
static unsigned extractDLC(unsigned CachePolicy) {
  constexpr unsigned DLCBit = 2;
  const unsigned Shifted = CachePolicy >> DLCBit;
  return Shifted & 1u;
}
/// Build a selected BUFFER_STORE_*_exact instruction for the store intrinsic
/// \p MI.
///
/// Operands 2 and 4 of \p MI are first run through a waterfall loop. The
/// opcode is chosen from the memory operand size (byte / short / dword, with
/// wider dword variants for data larger than 32 bits) and from whether the
/// split voffset is a non-zero register (OFFEN) or not (OFFSET).
///
/// Reports a fatal error if the data's scalar size is not 32 bits or if the
/// operands of the new instruction cannot be constrained.
///
/// \returns the newly built instruction. \p MI itself is not erased here.
MachineInstr *
AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
                                             MachineInstr &MI) const {
  MachineRegisterInfo &MRI = *B.getMRI();

  // Handle the operands at indices 2 and 4 before any operand is read below.
  executeInWaterfallLoop(B, MI, MRI, {2, 4});

  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
  Register VData = MI.getOperand(1).getReg();
  const LLT DataTy = MRI.getType(VData);
  const int EltBits = DataTy.getScalarSizeInBits();
  const int DataBits = DataTy.getSizeInBits();

  // FIXME: Broken integer truncstore.
  if (EltBits != 32)
    report_fatal_error("unhandled intrinsic store");

  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  const int MemSize = (*MI.memoperands_begin())->getSize();

  Register RSrc = MI.getOperand(2).getReg();
  Register VOffset = MI.getOperand(3).getReg();
  Register SOffset = MI.getOperand(4).getReg();
  unsigned CachePolicy = MI.getOperand(5).getImm();

  // Separate the voffset into a register part and an immediate part.
  std::pair<Register, unsigned> SplitOffset = splitBufferOffsets(B, VOffset);
  VOffset = SplitOffset.first;
  const unsigned ImmOffset = SplitOffset.second;

  // Only use the register-offset form when the register part is non-zero.
  const bool Offen = !isZero(VOffset, MRI);

  unsigned Opc;
  const int MemBits = 8 * MemSize;
  if (MemBits == 8) {
    Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact
                : AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
  } else if (MemBits == 16) {
    Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact
                : AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
  } else {
    Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact
                : AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
    // Pick the multi-dword variant for data wider than one dword.
    if (DataBits > 32)
      Opc = AMDGPU::getMUBUFOpcode(Opc, DataBits / 32);
  }

  // Set the insertion point back to the instruction in case it was moved into
  // a loop.
  B.setInstr(MI);

  MachineInstrBuilder MIB = B.buildInstr(Opc);
  MIB.addUse(VData);
  if (Offen)
    MIB.addUse(VOffset);
  MIB.addUse(RSrc);
  MIB.addUse(SOffset);
  MIB.addImm(ImmOffset);
  MIB.addImm(extractGLC(CachePolicy));
  MIB.addImm(extractSLC(CachePolicy));
  MIB.addImm(0); // tfe: FIXME: Remove from inst
  MIB.addImm(extractDLC(CachePolicy));
  MIB.cloneMemRefs(MI);

  // FIXME: We need a way to report failure from applyMappingImpl.
  // Insert constrain copies before inserting the loop.
  const bool Constrained =
      constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this);
  if (!Constrained)
    report_fatal_error("failed to constrain selected store intrinsic");

  return MIB;
}
/// Apply the register bank mapping described by \p OpdMapper to its
/// instruction, with AMDGPU-specific handling for the opcodes listed in the
/// switch below.
///
/// Cases that fully handle the instruction `return` directly (often after
/// erasing the original instruction). Cases that `break` out of the switch
/// fall through to applyDefaultMapping() at the end of the function.
void AMDGPURegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
unsigned Opc = MI.getOpcode();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
switch (Opc) {
case AMDGPU::G_SELECT: {
// Only 64-bit selects get special treatment: when the mapping split the
// result into pieces, emit one select per half-sized piece.
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
if (DstTy.getSizeInBits() != 64)
break;
LLT HalfTy = getHalfSizedType(DstTy);
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
// All inputs are SGPRs, nothing special to do.
if (DefRegs.empty()) {
assert(Src1Regs.empty() && Src2Regs.empty());
break;
}
MachineIRBuilder B(MI);
// The condition is not split; reuse the original register if the mapper
// did not create a new one.
if (Src0Regs.empty())
Src0Regs.push_back(MI.getOperand(1).getReg());
else {
assert(Src0Regs.size() == 1);
}
// For each value operand: split it here if the mapper did not already, or
// otherwise just give the existing pieces the half-sized type.
if (Src1Regs.empty())
split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
else {
setRegsToType(MRI, Src1Regs, HalfTy);
}
if (Src2Regs.empty())
split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
else
setRegsToType(MRI, Src2Regs, HalfTy);
setRegsToType(MRI, DefRegs, HalfTy);
B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);
MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
MI.eraseFromParent();
return;
}
case AMDGPU::G_AND:
case AMDGPU::G_OR:
case AMDGPU::G_XOR: {
// 64-bit and is only available on the SALU, so split into 2 32-bit ops if
// there is a VGPR input.
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
if (DstTy.getSizeInBits() != 64)
break;
LLT HalfTy = getHalfSizedType(DstTy);
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
// All inputs are SGPRs, nothing special to do.
if (DefRegs.empty()) {
assert(Src0Regs.empty() && Src1Regs.empty());
break;
}
assert(DefRegs.size() == 2);
assert(Src0Regs.size() == Src1Regs.size() &&
(Src0Regs.empty() || Src0Regs.size() == 2));
// Depending on where the source registers came from, the generic code may
// have decided to split the inputs already or not. If not, we still need to
// extract the values.
MachineIRBuilder B(MI);
if (Src0Regs.empty())
split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
else
setRegsToType(MRI, Src0Regs, HalfTy);
if (Src1Regs.empty())
split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
else
setRegsToType(MRI, Src1Regs, HalfTy);
setRegsToType(MRI, DefRegs, HalfTy);
// Emit the same opcode once for each half.
B.buildInstr(Opc)
.addDef(DefRegs[0])
.addUse(Src0Regs[0])
.addUse(Src1Regs[0]);
B.buildInstr(Opc)
.addDef(DefRegs[1])
.addUse(Src0Regs[1])
.addUse(Src1Regs[1]);
MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
MI.eraseFromParent();
return;
}
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL: {
// Only s16 operations whose result is not in the VGPR bank are handled
// here; everything else uses the default mapping.
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
if (DstTy != LLT::scalar(16))
break;
const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
if (DstBank == &AMDGPU::VGPRRegBank)
break;
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);
if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
LegalizerHelper::Legalized)
llvm_unreachable("widen scalar should have succeeded");
return;
}
case AMDGPU::G_SMIN:
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX: {
// Min/max with a VGPR result uses the default mapping; otherwise the
// operation is expanded with LegalizerHelper below.
Register DstReg = MI.getOperand(0).getReg();
const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
if (DstBank == &AMDGPU::VGPRRegBank)
break;
MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);
// Turn scalar min/max into a compare and select.
LLT Ty = MRI.getType(DstReg);
LLT S32 = LLT::scalar(32);
LLT S16 = LLT::scalar(16);
if (Ty == S16) {
// Need to widen to s32, and expand as cmp + select.
if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
llvm_unreachable("widenScalar should have succeeded");
// FIXME: This is relying on widenScalar leaving MI in place.
if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
llvm_unreachable("lower should have succeeded");
} else {
if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
llvm_unreachable("lower should have succeeded");
}
return;
}
case AMDGPU::G_SEXT:
case AMDGPU::G_ZEXT: {
Register SrcReg = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(SrcReg);
bool Signed = Opc == AMDGPU::G_SEXT;
MachineIRBuilder B(MI);
const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
// Case 1: extension to a 64-bit scalar from a source of at most 32 bits
// whose bank is none of SGPR, SCC or VCC. The two result halves are built
// separately.
if (DstTy.isScalar() &&
SrcBank != &AMDGPU::SGPRRegBank &&
SrcBank != &AMDGPU::SCCRegBank &&
SrcBank != &AMDGPU::VCCRegBank &&
// FIXME: Should handle any type that round to s64 when irregular
// breakdowns supported.
DstTy.getSizeInBits() == 64 &&
SrcTy.getSizeInBits() <= 32) {
const LLT S32 = LLT::scalar(32);
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
// Extend to 32-bit, and then extend the low half.
if (Signed) {
// TODO: Should really be buildSExtOrCopy
B.buildSExtOrTrunc(DefRegs[0], SrcReg);
// Replicate sign bit from 32-bit extended part.
auto ShiftAmt = B.buildConstant(S32, 31);
MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
} else {
B.buildZExtOrTrunc(DefRegs[0], SrcReg);
B.buildConstant(DefRegs[1], 0);
}
MRI.setRegBank(DstReg, *SrcBank);
MI.eraseFromParent();
return;
}
// NOTE(review): this returns without calling applyDefaultMapping, unlike
// the `break` paths of the other cases -- confirm this is intended.
if (SrcTy != LLT::scalar(1))
return;
// Case 2: s1 source in a condition bank (SCC or VCC). Expand to a select
// between the extended "true" value (-1 or 1) and 0.
if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
&AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;
unsigned DstSize = DstTy.getSizeInBits();
// 64-bit select is SGPR only
const bool UseSel64 = DstSize > 32 &&
SrcBank->getID() == AMDGPU::SCCRegBankID;
// TODO: Should s16 select be legal?
LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
auto True = B.buildConstant(SelType, Signed ? -1 : 1);
auto False = B.buildConstant(SelType, 0);
MRI.setRegBank(True.getReg(0), *DstBank);
MRI.setRegBank(False.getReg(0), *DstBank);
MRI.setRegBank(DstReg, *DstBank);
if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
// Wide result from a non-SCC condition: select into the first piece and
// copy that piece into the second one.
B.buildSelect(DefRegs[0], SrcReg, True, False);
B.buildCopy(DefRegs[1], DefRegs[0]);
} else if (DstSize < 32) {
// Narrow result: select at SelType and truncate to the destination.
auto Sel = B.buildSelect(SelType, SrcReg, True, False);
MRI.setRegBank(Sel.getReg(0), *DstBank);
B.buildTrunc(DstReg, Sel);
} else {
B.buildSelect(DstReg, SrcReg, True, False);
}
MI.eraseFromParent();
return;
}
// Fixup the case with an s1 src that isn't a condition register. Use shifts
// instead of introducing a compare to avoid an unnecessary condition
// register (and since there's no scalar 16-bit compares).
auto Ext = B.buildAnyExt(DstTy, SrcReg);
auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
if (MI.getOpcode() == AMDGPU::G_SEXT)
B.buildAShr(DstReg, Shl, ShiftAmt);
else
B.buildLShr(DstReg, Shl, ShiftAmt);
MRI.setRegBank(DstReg, *SrcBank);
MRI.setRegBank(Ext.getReg(0), *SrcBank);
MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
MRI.setRegBank(Shl.getReg(0), *SrcBank);
MI.eraseFromParent();
return;
}
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
// Only <2 x s16> results are handled specially. When the result is not in
// the SGPR bank, the vector is assembled in a 32-bit scalar as
// (lo & 0xffff) | (hi << 16) and bitcast to the destination.
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
if (DstTy != LLT::vector(2, 16))
break;
assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
substituteSimpleCopyRegs(OpdMapper, 1);
substituteSimpleCopyRegs(OpdMapper, 2);
const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
if (DstBank == &AMDGPU::SGPRRegBank)
break; // Can use S_PACK_* instructions.
MachineIRBuilder B(MI);
Register Lo = MI.getOperand(1).getReg();
Register Hi = MI.getOperand(2).getReg();
const LLT S32 = LLT::scalar(32);
const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI);
const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI);
Register ZextLo;
Register ShiftHi;
if (Opc == AMDGPU::G_BUILD_VECTOR) {
// Sources are zero-extended to s32 before being combined.
ZextLo = B.buildZExt(S32, Lo).getReg(0);
MRI.setRegBank(ZextLo, *BankLo);
Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
MRI.setRegBank(ZextHi, *BankHi);
auto ShiftAmt = B.buildConstant(S32, 16);
MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
MRI.setRegBank(ShiftHi, *BankHi);
} else {
// G_BUILD_VECTOR_TRUNC: sources are used directly at s32; the low one is
// masked to 16 bits and the high one is shifted into place.
Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
MRI.setRegBank(MaskLo, *BankLo);
auto ShiftAmt = B.buildConstant(S32, 16);
MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
MRI.setRegBank(ShiftHi, *BankHi);
ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
MRI.setRegBank(ZextLo, *BankLo);
}
auto Or = B.buildOr(S32, ZextLo, ShiftHi);
MRI.setRegBank(Or.getReg(0), *DstBank);
B.buildBitcast(DstReg, Or);
MI.eraseFromParent();
return;
}
case AMDGPU::G_EXTRACT_VECTOR_ELT: {
SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
// Result was not split: default mapping plus a waterfall loop over the
// index operand.
if (DstRegs.empty()) {
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, { 2 });
return;
}
// Result was split: extract the two 32-bit halves of the 64-bit element
// from the source viewed as a vector of 32-bit elements.
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
Register IdxReg = MI.getOperand(2).getReg();
LLT DstTy = MRI.getType(DstReg);
(void)DstTy;
assert(DstTy.getSizeInBits() == 64);
LLT SrcTy = MRI.getType(SrcReg);
const LLT S32 = LLT::scalar(32);
LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
MachineIRBuilder B(MI);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
auto One = B.buildConstant(S32, 1);
// Split the vector index into 32-bit pieces. Prepare to move all of the
// new instructions into a waterfall loop if necessary.
//
// Don't put the bitcast or constant in the loop.
MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
// Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
auto IdxLo = B.buildShl(S32, IdxReg, One);
auto IdxHi = B.buildAdd(S32, IdxLo, One);
B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
const ValueMapping &DstMapping
= OpdMapper.getInstrMapping().getOperandMapping(0);
// FIXME: Should be getting from mapping or not?
const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank);
MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
// If the index operand does not need a waterfall loop, we are done once the
// original instruction is removed.
SmallSet<Register, 4> OpsToWaterfall;
if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
MI.eraseFromParent();
return;
}
// Remove the original instruction to avoid potentially confusing the
// waterfall loop logic.
B.setInstr(*Span.begin());
MI.eraseFromParent();
executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
OpsToWaterfall, MRI);
return;
}
case AMDGPU::G_INSERT_VECTOR_ELT: {
SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
assert(OpdMapper.getVRegs(0).empty());
assert(OpdMapper.getVRegs(1).empty());
assert(OpdMapper.getVRegs(3).empty());
// Inserted value was not split: default mapping plus a waterfall loop over
// the index operand.
if (InsRegs.empty()) {
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, { 3 });
return;
}
// Inserted value was split: insert its two 32-bit halves into the source
// viewed as a vector of 32-bit elements, then bitcast back.
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
Register InsReg = MI.getOperand(2).getReg();
Register IdxReg = MI.getOperand(3).getReg();
LLT SrcTy = MRI.getType(SrcReg);
LLT InsTy = MRI.getType(InsReg);
(void)InsTy;
assert(InsTy.getSizeInBits() == 64);
const LLT S32 = LLT::scalar(32);
LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
MachineIRBuilder B(MI);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
auto One = B.buildConstant(S32, 1);
// Split the vector index into 32-bit pieces. Prepare to move all of the
// new instructions into a waterfall loop if necessary.
//
// Don't put the bitcast or constant in the loop.
MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
// Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
auto IdxLo = B.buildShl(S32, IdxReg, One);
auto IdxHi = B.buildAdd(S32, IdxLo, One);
auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
B.buildBitcast(DstReg, InsHi);
const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI);
MRI.setRegBank(InsReg, *InsSrcBank);
MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
MRI.setRegBank(InsLo.getReg(0), *DstBank);
MRI.setRegBank(InsHi.getReg(0), *DstBank);
MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
// If the index operand does not need a waterfall loop, we are done once the
// original instruction is removed.
SmallSet<Register, 4> OpsToWaterfall;
if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
MI.eraseFromParent();
return;
}
// Erase the original instruction before moving the new ones into the loop.
B.setInstr(*Span.begin());
MI.eraseFromParent();
executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
OpsToWaterfall, MRI);
return;
}
case AMDGPU::G_INTRINSIC: {
switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_s_buffer_load: {
// FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
executeInWaterfallLoop(MI, MRI, { 2, 3 });
return;
}
case Intrinsic::amdgcn_readlane: {
substituteSimpleCopyRegs(OpdMapper, 2);
assert(OpdMapper.getVRegs(0).empty());
assert(OpdMapper.getVRegs(3).empty());
// Make sure the index is an SGPR. It doesn't make sense to run this in a
// waterfall loop, so assume it's a uniform value.
constrainOpWithReadfirstlane(MI, MRI, 3); // Index
return;
}
case Intrinsic::amdgcn_writelane: {
assert(OpdMapper.getVRegs(0).empty());
assert(OpdMapper.getVRegs(2).empty());
assert(OpdMapper.getVRegs(3).empty());
substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
constrainOpWithReadfirstlane(MI, MRI, 3); // Index
return;
}
default:
break;
}
break;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
auto IntrID = MI.getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_buffer_load: {
executeInWaterfallLoop(MI, MRI, { 2 });
return;
}
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
// This is only allowed to execute with 1 lane, so readfirstlane is safe.
assert(OpdMapper.getVRegs(0).empty());
substituteSimpleCopyRegs(OpdMapper, 3);
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;
}
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_br: {
// Only the first lane executes, so readfirstlane is safe.
substituteSimpleCopyRegs(OpdMapper, 1);
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;
}
case Intrinsic::amdgcn_ds_gws_sema_v:
case Intrinsic::amdgcn_ds_gws_sema_p:
case Intrinsic::amdgcn_ds_gws_sema_release_all: {
// Only the first lane executes, so readfirstlane is safe.
constrainOpWithReadfirstlane(MI, MRI, 1); // M0
return;
}
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// FIXME: Should this use a waterfall loop?
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_raw_tbuffer_store: {
// Default mapping, then waterfall over operands 2 and 4.
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, {2, 4});
return;
}
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_store:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_tbuffer_store: {
// Default mapping, then waterfall over operands 2 and 5.
applyDefaultMapping(OpdMapper);
executeInWaterfallLoop(MI, MRI, {2, 5});
return;
}
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
// Non-images can have complications from operands that allow both SGPR
// and VGPR. For now it's too complicated to figure out the final opcode
// to derive the register bank from the MCInstrDesc.
if (RSrcIntrin->IsImage) {
applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
return;
}
}
break;
}
}
break;
}
case AMDGPU::G_LOAD:
case AMDGPU::G_ZEXTLOAD:
case AMDGPU::G_SEXTLOAD: {
// Wide loads may be split; if not, use the default mapping.
if (applyMappingWideLoad(MI, OpdMapper, MRI))
return;
break;
}
default:
break;
}
// Everything not fully handled above gets the default mapping.
return applyDefaultMapping(OpdMapper);
}
bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
if (!MI.getOperand(i).isReg())
continue;
Register Reg = MI.getOperand(i).getReg();
if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
if (Bank->getID() == AMDGPU::VGPRRegBankID)
return false;
assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
Bank->getID() == AMDGPU::SCCRegBankID);
}
}
return true;
}
/// Build the default scalar (SALU) instruction mapping for \p MI.
///
/// Every register operand is mapped to the SCC bank if it is 1 bit wide and
/// to the SGPR bank otherwise. Non-register operands have no register bank
/// and keep a null entry, which matches how getDefaultMappingVOP and
/// getDefaultMappingAllVGPR treat them.
///
/// \returns the instruction mapping (ID 1, cost 1) covering all operands.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    // getReg() is only valid on register operands; leave the mapping of
    // anything else (immediates, intrinsic IDs, ...) null.
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
    OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
  }
  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
/// Build the default vector (VALU) instruction mapping for \p MI.
///
/// Operand 0 is mapped to the VGPR bank. If operand 1 is an intrinsic ID it
/// gets a null mapping and is skipped. The first source register keeps its
/// current bank if it has one (falling back to VCC for 1-bit values and VGPR
/// otherwise); all remaining register operands are mapped to VCC (1-bit) or
/// VGPR. Non-register operands keep a null mapping.
///
/// \returns the instruction mapping (ID 1, cost 1) covering all operands.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI =
      MI.getParent()->getParent()->getRegInfo();
  const unsigned NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping*, 8> OpdsMapping(NumOps);

  // Result operand: always VGPR.
  unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);

  // An intrinsic ID operand, if present, has no register bank.
  unsigned FirstSrcIdx = 1;
  if (MI.getOperand(1).isIntrinsicID()) {
    OpdsMapping[1] = nullptr;
    FirstSrcIdx = 2;
  }

  // First source: keep whatever bank it already has, if any.
  Register FirstSrc = MI.getOperand(FirstSrcIdx).getReg();
  unsigned FirstSrcSize = getSizeInBits(FirstSrc, MRI, *TRI);
  unsigned FallbackBankID = AMDGPU::VGPRRegBankID;
  if (FirstSrcSize == 1)
    FallbackBankID = AMDGPU::VCCRegBankID;
  unsigned FirstSrcBank = getRegBankID(FirstSrc, MRI, *TRI, FallbackBankID);
  OpdsMapping[FirstSrcIdx] =
      AMDGPU::getValueMapping(FirstSrcBank, FirstSrcSize);

  // Remaining operands: VCC for 1-bit values, VGPR for everything else.
  for (unsigned Idx = FirstSrcIdx + 1; Idx != NumOps; ++Idx) {
    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    unsigned BankID = AMDGPU::VGPRRegBankID;
    if (Size == 1)
      BankID = AMDGPU::VCCRegBankID;
    OpdsMapping[Idx] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
/// Build an instruction mapping for \p MI in which every register operand is
/// mapped to the VGPR bank at its own size. Non-register operands keep a null
/// mapping.
///
/// \returns the instruction mapping (ID 1, cost 1) covering all operands.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI =
      MI.getParent()->getParent()->getRegInfo();
  const unsigned NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping*, 8> OpdsMapping(NumOps);

  unsigned NextIdx = 0;
  for (const MachineOperand &MO : MI.operands()) {
    const unsigned Idx = NextIdx++;
    if (!MO.isReg())
      continue;

    unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
    OpdsMapping[Idx] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
/// Build the instruction mapping for an image intrinsic.
///
/// The resource operand and the operand immediately after it (the sampler, if
/// the intrinsic has one) must be SGPRs: they are reported with whatever bank
/// they currently have, defaulting to SGPR. All other register operands are
/// mapped to the VGPR bank. Non-register operands keep a null mapping.
///
/// \param RsrcIdx Index of the resource argument relative to the IR intrinsic
///                call arguments.
/// \returns the instruction mapping (ID 1, cost 1) covering all operands.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
                                        const MachineInstr &MI,
                                        int RsrcIdx) const {
  // The reported argument index is relative to the IR intrinsic call
  // arguments, so we need to shift by the number of defs and the intrinsic ID.
  const int RsrcOpIdx = RsrcIdx + MI.getNumExplicitDefs() + 1;
  // If this has a sampler, it immediately follows rsrc.
  const int SamplerOpIdx = RsrcOpIdx + 1;

  const int NumOps = MI.getNumOperands();
  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);

  // TODO: Should packed/unpacked D16 difference be reported here as part of
  // the value mapping?
  for (int OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
    const MachineOperand &MO = MI.getOperand(OpIdx);
    if (!MO.isReg())
      continue;

    Register OpReg = MO.getReg();
    unsigned Size = getSizeInBits(OpReg, MRI, *TRI);

    // FIXME: Probably need a new intrinsic register bank searchable table to
    // handle arbitrary intrinsics easily.

    // Some operands must be VGPR, and these are easy to copy to.
    unsigned BankID = AMDGPU::VGPRRegBankID;

    // Operands that must be SGPRs are reported as legal with whatever bank
    // they already have.
    if (OpIdx == RsrcOpIdx || OpIdx == SamplerOpIdx)
      BankID = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);

    OpdsMapping[OpIdx] = AMDGPU::getValueMapping(BankID, Size);
  }

  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
/// Compute the mapping of a load's result (operand 0) and pointer (operand 1).
///
/// When the pointer is already in the SGPR bank, the address space is none of
/// local, region or private, and isInstrUniformNonExtLoadAlign4() accepts the
/// instruction, both operands are mapped to SGPR so an SMRD load can be used.
/// Otherwise both operands are mapped to VGPR.
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  const Register DstReg = MI.getOperand(0).getReg();
  const unsigned DstSize = getSizeInBits(DstReg, MRI, *TRI);
  const LLT DstTy = MRI.getType(DstReg);

  const Register PtrReg = MI.getOperand(1).getReg();
  const LLT PtrTy = MRI.getType(PtrReg);
  const unsigned AddrSpace = PtrTy.getAddressSpace();
  const unsigned PtrSize = PtrTy.getSizeInBits();

  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);

  const bool IsExcludedAddrSpace = AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
                                   AddrSpace == AMDGPUAS::REGION_ADDRESS ||
                                   AddrSpace == AMDGPUAS::PRIVATE_ADDRESS;
  // The uniformity query stays last so it only runs once the cheaper bank and
  // address space checks have passed.
  const bool UseScalarLoad = PtrBank == &AMDGPU::SGPRRegBank &&
                             !IsExcludedAddrSpace &&
                             isInstrUniformNonExtLoadAlign4(MI);

  SmallVector<const ValueMapping *, 2> Mappings(2);
  if (UseScalarLoad) {
    // We have a uniform instruction so we want to use an SMRD load
    Mappings[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
    Mappings[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
  } else {
    Mappings[0] =
        AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, DstTy);
    Mappings[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
  }

  // FIXME: Do we want to add a mapping for FLAT load, or should we just
  // handle that during instruction selection?
  return getInstructionMapping(/*ID*/ 1, /*Cost*/ 1,
                               getOperandsMapping(Mappings),
                               MI.getNumOperands());
}
/// Return the ID of the register bank currently assigned to \p Reg, or
/// \p Default when getRegBank() reports no bank for it.
unsigned
AMDGPURegisterBankInfo::getRegBankID(Register Reg,
                                     const MachineRegisterInfo &MRI,
                                     const TargetRegisterInfo &TRI,
                                     unsigned Default) const {
  if (const RegisterBank *AssignedBank = getRegBank(Reg, MRI, TRI))
    return AssignedBank->getID();
  return Default;
}
/// Combine two bank IDs: the result is SGPR only when both inputs are SGPR;
/// any other combination yields VGPR.
static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
  if (RB0 != AMDGPU::SGPRRegBankID || RB1 != AMDGPU::SGPRRegBankID)
    return AMDGPU::VGPRRegBankID;
  return AMDGPU::SGPRRegBankID;
}
/// Return the value mapping for an operand that is required to be an SGPR.
///
/// The mapping deliberately reports the bank \p Reg already has (SGPR only if
/// it has none), i.e. it claims any current bank is legal even though an SGPR
/// is needed. applyMapping is then expected to handle a non-SGPR input with a
/// waterfall loop.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  const unsigned CurrentBankID =
      getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
  const unsigned BitWidth = getSizeInBits(Reg, MRI, TRI);
  return AMDGPU::getValueMapping(CurrentBankID, BitWidth);
}
/// Return the VGPR value mapping for \p Reg at its size in bits, regardless
/// of any bank the register may already have.
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
                                         const MachineRegisterInfo &MRI,
                                         const TargetRegisterInfo &TRI) const {
  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID,
                                 getSizeInBits(Reg, MRI, TRI));
}
///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
///
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (MI.isRegSequence()) {
// If any input is a VGPR, the result must be a VGPR. The default handling
// assumes any copy between banks is legal.
unsigned BankID = AMDGPU::SGPRRegBankID;
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
// It doesn't make sense to use vcc or scc banks here, so just ignore
// them.
if (OpBank != AMDGPU::SGPRRegBankID) {
BankID = AMDGPU::VGPRRegBankID;
break;
}
}
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
return getInstructionMapping(
1, /*Cost*/ 1,
/*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
}
// The default handling is broken and doesn't handle illegal SGPR->VGPR copies
// properly.
//
// TODO: There are additional exec masking dependencies to analyze.
if (MI.getOpcode() == TargetOpcode::G_PHI) {
// TODO: Generate proper invalid bank enum.
int ResultBank = -1;
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
Register Reg = MI.getOperand(I).getReg();
const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
// FIXME: Assuming VGPR for any undetermined inputs.
if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
ResultBank = AMDGPU::VGPRRegBankID;
break;
}
unsigned OpBank = Bank->getID();
// scc, scc -> sgpr
if (OpBank == AMDGPU::SCCRegBankID) {
// There's only one SCC register, so a phi requires copying to SGPR.
OpBank = AMDGPU::SGPRRegBankID;
} else if (OpBank == AMDGPU::VCCRegBankID) {
// vcc, vcc -> vcc
// vcc, sgpr -> vgpr
if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
ResultBank = AMDGPU::VGPRRegBankID;
break;
}
}
ResultBank = OpBank;
}
assert(ResultBank != -1);
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
const ValueMapping &ValMap =
getValueMapping(0, Size, getRegBank(ResultBank));
return getInstructionMapping(
1, /*Cost*/ 1,
/*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
}
const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
if (Mapping.isValid())
return Mapping;
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
switch (MI.getOpcode()) {
default:
return getInvalidInstructionMapping();
case AMDGPU::G_AND:
case AMDGPU::G_OR:
case AMDGPU::G_XOR: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (Size == 1) {
const RegisterBank *DstBank
= getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
unsigned TargetBankID = -1;
unsigned BankLHS = -1;
unsigned BankRHS = -1;
if (DstBank) {
TargetBankID = DstBank->getID();
if (DstBank == &AMDGPU::VCCRegBank) {
TargetBankID = AMDGPU::VCCRegBankID;
BankLHS = AMDGPU::VCCRegBankID;
BankRHS = AMDGPU::VCCRegBankID;
} else if (DstBank == &AMDGPU::SCCRegBank) {
TargetBankID = AMDGPU::SCCRegBankID;
BankLHS = AMDGPU::SGPRRegBankID;
BankRHS = AMDGPU::SGPRRegBankID;
} else {
BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
AMDGPU::SGPRRegBankID);
BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
AMDGPU::SGPRRegBankID);
}
} else {
BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
AMDGPU::VCCRegBankID);
BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
AMDGPU::VCCRegBankID);
// Both inputs should be true booleans to produce a boolean result.
if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
TargetBankID = AMDGPU::VGPRRegBankID;
} else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
TargetBankID = AMDGPU::VCCRegBankID;
BankLHS = AMDGPU::VCCRegBankID;
BankRHS = AMDGPU::VCCRegBankID;
} else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
TargetBankID = AMDGPU::SGPRRegBankID;
} else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
// The operation must be done on a 32-bit register, but it will set
// scc. The result type could interchangeably be SCC or SGPR, since
// both values will be produced.
TargetBankID = AMDGPU::SCCRegBankID;
BankLHS = AMDGPU::SGPRRegBankID;
BankRHS = AMDGPU::SGPRRegBankID;
}
}
OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
break;
}
if (Size == 64) {
if (isSALUMapping(MI)) {
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
} else {
OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
}
break;
}
LLVM_FALLTHROUGH;
}
case AMDGPU::G_GEP:
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
case AMDGPU::G_SHL:
case AMDGPU::G_LSHR:
case AMDGPU::G_ASHR:
case AMDGPU::G_UADDO:
case AMDGPU::G_USUBO:
case AMDGPU::G_UADDE:
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE:
case AMDGPU::G_SMIN:
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
LLVM_FALLTHROUGH;
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FPTOSI:
case AMDGPU::G_FPTOUI:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
case AMDGPU::G_FMAD:
case AMDGPU::G_FSQRT:
case AMDGPU::G_FFLOOR:
case AMDGPU::G_FCEIL:
case AMDGPU::G_FRINT:
case AMDGPU::G_SITOFP:
case AMDGPU::G_UITOFP:
case AMDGPU::G_FPTRUNC:
case AMDGPU::G_FPEXT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2:
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FCANONICALIZE:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_INTRINSIC_ROUND:
case AMDGPU::G_AMDGPU_FFBH_U32:
return getDefaultMappingVOP(MI);
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
}
case AMDGPU::G_IMPLICIT_DEF: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
case AMDGPU::G_FCONSTANT:
case AMDGPU::G_CONSTANT:
case AMDGPU::G_GLOBAL_VALUE:
case AMDGPU::G_BLOCK_ADDR: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
case AMDGPU::G_FRAME_INDEX: {
// TODO: This should be the same as other constants, but eliminateFrameIndex
// currently assumes VALU uses.
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);