//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI Implementation of TargetInstrInfo.
//
//===----------------------------------------------------------------------===//
#include "SIInstrInfo.h"
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "si-instr-info"
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"
namespace llvm {
class AAResults;
namespace AMDGPU {
#define GET_D16ImageDimIntrinsics_IMPL
#define GET_ImageDimIntrinsicTable_IMPL
#define GET_RsrcIntrinsics_IMPL
#include "AMDGPUGenSearchableTables.inc"
}
}
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
static cl::opt<bool> Fix16BitCopies(
"amdgpu-fix-16-bit-physreg-copies",
cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
cl::init(true),
cl::ReallyHidden);
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
RI(ST), ST(ST) {
SchedModel.init(&ST);
}
//===----------------------------------------------------------------------===//
// TargetInstrInfo callbacks
//===----------------------------------------------------------------------===//
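/// Return the number of operands of \p Node, not counting any trailing glue
/// operands.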
static unsigned getNumOperandsNoGlue(SDNode *Node) {
unsigned N = Node->getNumOperands();
while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
--N;
return N;
}
/// Returns true if both nodes have the same value for the given
/// operand \p OpName, or if neither node has this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
unsigned Opc0 = N0->getMachineOpcode();
unsigned Opc1 = N1->getMachineOpcode();
int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
if (Op0Idx == -1 && Op1Idx == -1)
return true;
if ((Op0Idx == -1 && Op1Idx != -1) ||
(Op1Idx == -1 && Op0Idx != -1))
return false;
// getNamedOperandIdx returns the index for the MachineInstr's operands,
// which includes the result as the first operand. We are indexing into the
// MachineSDNode's operands, so we need to skip the result operand to get
// the real index.
--Op0Idx;
--Op1Idx;
return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}
bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const {
if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
// Normally a VALU use of exec would block rematerialization, but the
// implicit exec read that every VALU instruction has is fine here.
// We want all of the generic logic for this, except for that one check.
// Another potential implicit use is the mode register. The core RA logic
// will not attempt rematerialization if mode is set anywhere in the
// function; otherwise it is safe, since mode is not changed.
// This differs from the generic method, which does not allow
// rematerialization if there are virtual register uses. We allow it,
// which is why this method also covers SOP instructions.
return !MI.hasImplicitDef() &&
MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() &&
!MI.mayRaiseFPException();
}
return false;
}
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
// Any implicit use of exec by VALU is not a real register read.
return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
isVALU(*MO.getParent());
}
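/// Return true if \p Load0 and \p Load1 are machine loads sharing the same
/// base operand, and report their immediate offsets in \p Offset0 and
/// \p Offset1. Handles DS, SMRD, and MUBUF/MTBUF loads.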
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
int64_t &Offset0,
int64_t &Offset1) const {
if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
return false;
unsigned Opc0 = Load0->getMachineOpcode();
unsigned Opc1 = Load1->getMachineOpcode();
// Make sure both are actually loads.
if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
return false;
if (isDS(Opc0) && isDS(Opc1)) {
// FIXME: Handle this case:
if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
return false;
// Check base reg.
if (Load0->getOperand(0) != Load1->getOperand(0))
return false;
// Skip read2 / write2 variants for simplicity.
// TODO: We should report true if the used offsets are adjacent (excluding
// st64 versions).
int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
if (Offset0Idx == -1 || Offset1Idx == -1)
return false;
// XXX - be careful of dataless loads
// getNamedOperandIdx returns the index for MachineInstrs. Since they
// include the output in the operand list, but SDNodes don't, we need to
// subtract one from the index.
Offset0Idx -= get(Opc0).NumDefs;
Offset1Idx -= get(Opc1).NumDefs;
Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
return true;
}
if (isSMRD(Opc0) && isSMRD(Opc1)) {
// Skip time and cache invalidation instructions.
if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
return false;
assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
// Check base reg.
if (Load0->getOperand(0) != Load1->getOperand(0))
return false;
const ConstantSDNode *Load0Offset =
dyn_cast<ConstantSDNode>(Load0->getOperand(1));
const ConstantSDNode *Load1Offset =
dyn_cast<ConstantSDNode>(Load1->getOperand(1));
if (!Load0Offset || !Load1Offset)
return false;
Offset0 = Load0Offset->getZExtValue();
Offset1 = Load1Offset->getZExtValue();
return true;
}
// MUBUF and MTBUF can access the same addresses.
if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
// MUBUF and MTBUF have vaddr at different indices.
if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
return false;
int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
if (OffIdx0 == -1 || OffIdx1 == -1)
return false;
// getNamedOperandIdx returns the index for MachineInstrs. Since they
// include the output in the operand list, but SDNodes don't, we need to
// subtract one from the index.
OffIdx0 -= get(Opc0).NumDefs;
OffIdx1 -= get(Opc1).NumDefs;
SDValue Off0 = Load0->getOperand(OffIdx0);
SDValue Off1 = Load1->getOperand(OffIdx1);
// The offset might be a FrameIndexSDNode.
if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
return false;
Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
return true;
}
return false;
}
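/// Return true for the DS read2st64/write2st64 opcodes, whose offsets are
/// scaled by an extra factor of 64.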
static bool isStride64(unsigned Opc) {
switch (Opc) {
case AMDGPU::DS_READ2ST64_B32:
case AMDGPU::DS_READ2ST64_B64:
case AMDGPU::DS_WRITE2ST64_B32:
case AMDGPU::DS_WRITE2ST64_B64:
return true;
default:
return false;
}
}
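/// Collect the base operands, immediate offset, and access width of the
/// memory access \p LdSt. Handles DS, MUBUF/MTBUF, MIMG, SMRD, and FLAT
/// instructions; returns false for anything else.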
bool SIInstrInfo::getMemOperandsWithOffsetWidth(
const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const {
if (!LdSt.mayLoadOrStore())
return false;
unsigned Opc = LdSt.getOpcode();
OffsetIsScalable = false;
const MachineOperand *BaseOp, *OffsetOp;
int DataOpIdx;
if (isDS(LdSt)) {
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
if (OffsetOp) {
// Normal, single offset LDS instruction.
if (!BaseOp) {
// DS_CONSUME/DS_APPEND use M0 for the base address.
// TODO: find the implicit use operand for M0 and use that as BaseOp?
return false;
}
BaseOps.push_back(BaseOp);
Offset = OffsetOp->getImm();
// Get appropriate operand, and compute width accordingly.
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
if (DataOpIdx == -1)
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
Width = getOpSize(LdSt, DataOpIdx);
} else {
// The 2 offset instructions use offset0 and offset1 instead. We can treat
// these as a load with a single offset if the 2 offsets are consecutive.
// We will use this for some partially aligned loads.
const MachineOperand *Offset0Op =
getNamedOperand(LdSt, AMDGPU::OpName::offset0);
const MachineOperand *Offset1Op =
getNamedOperand(LdSt, AMDGPU::OpName::offset1);
unsigned Offset0 = Offset0Op->getImm();
unsigned Offset1 = Offset1Op->getImm();
if (Offset0 + 1 != Offset1)
return false;
// Each of these offsets is in element sized units, so we need to convert
// to bytes of the individual reads. For a load, the destination register
// covers both elements, so half of its size in bytes (bits / 16) gives the
// element size.
unsigned EltSize;
if (LdSt.mayLoad())
EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
else {
assert(LdSt.mayStore());
int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
}
if (isStride64(Opc))
EltSize *= 64;
BaseOps.push_back(BaseOp);
Offset = EltSize * Offset0;
// Get appropriate operand(s), and compute width accordingly.
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
if (DataOpIdx == -1) {
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
Width = getOpSize(LdSt, DataOpIdx);
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
Width += getOpSize(LdSt, DataOpIdx);
} else {
Width = getOpSize(LdSt, DataOpIdx);
}
}
return true;
}
if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
return false;
BaseOps.push_back(RSrc);
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (BaseOp && !BaseOp->isFI())
BaseOps.push_back(BaseOp);
const MachineOperand *OffsetImm =
getNamedOperand(LdSt, AMDGPU::OpName::offset);
Offset = OffsetImm->getImm();
const MachineOperand *SOffset =
getNamedOperand(LdSt, AMDGPU::OpName::soffset);
if (SOffset) {
if (SOffset->isReg())
BaseOps.push_back(SOffset);
else
Offset += SOffset->getImm();
}
// Get appropriate operand, and compute width accordingly.
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
if (DataOpIdx == -1)
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
if (isMIMG(LdSt)) {
int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
if (VAddr0Idx >= 0) {
// Possible GFX10 NSA encoding, which uses multiple vaddr operands.
for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
BaseOps.push_back(&LdSt.getOperand(I));
} else {
BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
}
Offset = 0;
// Get appropriate operand, and compute width accordingly.
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
if (isSMRD(LdSt)) {
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
if (!BaseOp) // e.g. S_MEMTIME
return false;
BaseOps.push_back(BaseOp);
OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
Offset = OffsetOp ? OffsetOp->getImm() : 0;
// Get appropriate operand, and compute width accordingly.
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
if (isFLAT(LdSt)) {
// Instructions have either vaddr or saddr or both or none.
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (BaseOp)
BaseOps.push_back(BaseOp);
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
if (BaseOp)
BaseOps.push_back(BaseOp);
Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
// Get appropriate operand, and compute width accordingly.
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
if (DataOpIdx == -1)
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
return false;
}
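/// Return true if the two memory operations share a base address, either via
/// identical first base operands or by tracing their memory operands back to
/// the same underlying IR object.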
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
ArrayRef<const MachineOperand *> BaseOps1,
const MachineInstr &MI2,
ArrayRef<const MachineOperand *> BaseOps2) {
// Only examine the first "base" operand of each instruction, on the
// assumption that it represents the real base address of the memory access.
// Other operands are typically offsets or indices from this base address.
if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
return true;
if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
return false;
auto MO1 = *MI1.memoperands_begin();
auto MO2 = *MI2.memoperands_begin();
if (MO1->getAddrSpace() != MO2->getAddrSpace())
return false;
auto Base1 = MO1->getValue();
auto Base2 = MO2->getValue();
if (!Base1 || !Base2)
return false;
Base1 = getUnderlyingObject(Base1);
Base2 = getUnderlyingObject(Base2);
if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
return false;
return Base1 == Base2;
}
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
ArrayRef<const MachineOperand *> BaseOps2,
unsigned NumLoads,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
// should not be clustered
if (!BaseOps1.empty() && !BaseOps2.empty()) {
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
} else if (!BaseOps1.empty() || !BaseOps2.empty()) {
// If only one base op is empty, they do not have the same base ptr
return false;
}
// In order to avoid register pressure, on average, the number of DWORDS
// loaded together by all clustered mem ops should not exceed 8. This is an
// empirical value based on certain observations and performance related
// experiments.
// The good thing about this heuristic is - it avoids clustering of too many
// sub-word loads, and also avoids clustering of wide loads. Below is the
// brief summary of how the heuristic behaves for various `LoadSize`.
// (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
// (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
// (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
// (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
// (5) LoadSize >= 17: do not cluster
const unsigned LoadSize = NumBytes / NumLoads;
const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
return NumDWORDs <= 8;
}
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16
// stores.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we end up saying that loads from different
// address spaces should be clustered, as should ones which might cause bank
// conflicts.
//
// This might be deprecated so it might not be worth that much effort to fix.
bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
int64_t Offset0, int64_t Offset1,
unsigned NumLoads) const {
assert(Offset1 > Offset0 &&
"Second offset should be larger than first offset!");
// If we have less than 16 loads in a row, and the offsets are within 64
// bytes, then schedule together.
// A cacheline is 64 bytes (for global memory).
return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
}
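/// Diagnose an unsupported register copy and emit SI_ILLEGAL_COPY in its
/// place so that code generation can continue.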
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc,
const char *Msg = "illegal SGPR to VGPR copy") {
MachineFunction *MF = MBB.getParent();
DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
LLVMContext &C = MF->getFunction().getContext();
C.diagnose(IllegalCopy);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
/// to directly copy, so an intermediate VGPR needs to be used.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc,
RegScavenger &RS,
Register ImpDefSuperReg = Register(),
Register ImpUseSuperReg = Register()) {
const SIRegisterInfo &RI = TII.getRegisterInfo();
assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
AMDGPU::AGPR_32RegClass.contains(SrcReg));
// First try to find defining accvgpr_write to avoid temporary registers.
for (auto Def = MI, E = MBB.begin(); Def != E; ) {
--Def;
if (!Def->definesRegister(SrcReg, &RI))
continue;
if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
break;
MachineOperand &DefOp = Def->getOperand(1);
assert(DefOp.isReg() || DefOp.isImm());
if (DefOp.isReg()) {
// Check that the register source operand is not clobbered before MI.
// Immediate operands are always safe to propagate.
bool SafeToPropagate = true;
for (auto I = Def; I != MI && SafeToPropagate; ++I)
if (I->modifiesRegister(DefOp.getReg(), &RI))
SafeToPropagate = false;
if (!SafeToPropagate)
break;
DefOp.setIsKill(false);
}
MachineInstrBuilder Builder =
BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
.add(DefOp);
if (ImpDefSuperReg)
Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
if (ImpUseSuperReg) {
Builder.addReg(ImpUseSuperReg,
getKillRegState(KillSrc) | RegState::Implicit);
}
return;
}
RS.enterBasicBlock(MBB);
RS.forward(MI);
// Ideally we want to have three registers for a long reg_sequence copy
// to hide 2 waitstates between v_mov_b32 and accvgpr_write.
unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
*MBB.getParent());
// Registers in the sequence are allocated contiguously so we can just
// use register number to pick one of three round-robin temps.
unsigned RegNo = DestReg % 3;
Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
if (!Tmp)
report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
RS.setRegUsed(Tmp);
if (!TII.getSubtarget().hasGFX90AInsts()) {
// Only loop through if there are any free registers left; otherwise the
// scavenger may report a fatal error if there is no emergency spill slot,
// or it may spill using the slot.
while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
break;
Tmp = Tmp2;
RS.setRegUsed(Tmp);
}
}
// Insert copy to temporary VGPR.
unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
} else {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
}
MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
.addReg(SrcReg, getKillRegState(KillSrc));
if (ImpUseSuperReg) {
UseBuilder.addReg(ImpUseSuperReg,
getKillRegState(KillSrc) | RegState::Implicit);
}
MachineInstrBuilder DefBuilder
= BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
.addReg(Tmp, RegState::Kill);
if (ImpDefSuperReg)
DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
}
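/// Expand a wide SGPR copy into a sequence of S_MOV_B32s, combining aligned
/// adjacent subregisters into S_MOV_B64 where possible.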
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, const DebugLoc &DL,
MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
const TargetRegisterClass *RC, bool Forward) {
const SIRegisterInfo &RI = TII.getRegisterInfo();
ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
MachineBasicBlock::iterator I = MI;
MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
int16_t SubIdx = BaseIndices[Idx];
Register Reg = RI.getSubReg(DestReg, SubIdx);
unsigned Opcode = AMDGPU::S_MOV_B32;
// Is SGPR aligned? If so try to combine with next.
Register Src = RI.getSubReg(SrcReg, SubIdx);
bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
// Can use SGPR64 copy
unsigned Channel = RI.getChannelFromSubReg(SubIdx);
SubIdx = RI.getSubRegFromChannel(Channel, 2);
Opcode = AMDGPU::S_MOV_B64;
Idx++;
}
LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
.addReg(RI.getSubReg(SrcReg, SubIdx))
.addReg(SrcReg, RegState::Implicit);
if (!FirstMI)
FirstMI = LastMI;
if (!Forward)
I--;
}
assert(FirstMI && LastMI);
if (!Forward)
std::swap(FirstMI, LastMI);
FirstMI->addOperand(
MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
if (KillSrc)
LastMI->addRegisterKilled(SrcReg, &RI);
}
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc) const {
const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
// FIXME: This is a hack to resolve copies between 16 bit and 32 bit
// registers until all patterns are fixed.
if (Fix16BitCopies &&
((RI.getRegSizeInBits(*RC) == 16) ^
(RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
MCRegister Super = RI.get32BitRegister(RegToFix);
assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
RegToFix = Super;
if (DestReg == SrcReg) {
// Insert empty bundle since ExpandPostRA expects an instruction here.
BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
return;
}
RC = RI.getPhysRegClass(DestReg);
}
if (RC == &AMDGPU::VGPR_32RegClass) {
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
AMDGPU::SReg_32RegClass.contains(SrcReg) ||
AMDGPU::AGPR_32RegClass.contains(SrcReg));
unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
BuildMI(MBB, MI, DL, get(Opc), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (RC == &AMDGPU::SReg_32_XM0RegClass ||
RC == &AMDGPU::SReg_32RegClass) {
if (SrcReg == AMDGPU::SCC) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
.addImm(1)
.addImm(0);
return;
}
if (DestReg == AMDGPU::VCC_LO) {
if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
.addImm(0)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (RC == &AMDGPU::SReg_64RegClass) {
if (SrcReg == AMDGPU::SCC) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
.addImm(1)
.addImm(0);
return;
}
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
// FIXME: Hack until VReg_1 removed.
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
.addImm(0)
.addReg(SrcReg, getKillRegState(KillSrc));
}
return;
}
if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (DestReg == AMDGPU::SCC) {
// Copying 64-bit or 32-bit sources to SCC barely makes sense,
// but SelectionDAG emits such copies for i1 sources.
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
// This copy can only be produced by patterns
// with explicit SCC, which are known to be enabled
// only for subtargets with S_CMP_LG_U64 present.
assert(ST.hasScalarCompareEq64());
BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0);
} else {
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0);
}
return;
}
if (RC == &AMDGPU::AGPR_32RegClass) {
if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
// FIXME: The pass should maintain the scavenger to avoid scanning through
// the block on every AGPR spill.
RegScavenger RS;
indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS);
return;
}
const unsigned Size = RI.getRegSizeInBits(*RC);
if (Size == 16) {
assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
AMDGPU::AGPR_LO16RegClass.contains(DestReg);
bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
MCRegister NewDestReg = RI.get32BitRegister(DestReg);
MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
if (IsSGPRDst) {
if (!IsSGPRSrc) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
.addReg(NewSrcReg, getKillRegState(KillSrc));
return;
}
if (IsAGPRDst || IsAGPRSrc) {
if (!DstLow || !SrcLow) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
"Cannot use hi16 subreg with an AGPR!");
}
copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
return;
}
if (IsSGPRSrc && !ST.hasSDWAScalar()) {
if (!DstLow || !SrcLow) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
"Cannot use hi16 subreg on VI!");
}
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
.addReg(NewSrcReg, getKillRegState(KillSrc));
return;
}
auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
.addImm(0) // src0_modifiers
.addReg(NewSrcReg)
.addImm(0) // clamp
.addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
: AMDGPU::SDWA::SdwaSel::WORD_1)
.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
.addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
: AMDGPU::SDWA::SdwaSel::WORD_1)
.addReg(NewDestReg, RegState::Implicit | RegState::Undef);
// First implicit operand is $exec.
MIB->tieOperands(0, MIB->getNumOperands() - 1);
return;
}
const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
if (ST.hasPackedFP32Ops()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
.addImm(SISrcMods::OP_SEL_1)
.addReg(SrcReg)
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
.addReg(SrcReg)
.addImm(0) // op_sel_lo
.addImm(0) // op_sel_hi
.addImm(0) // neg_lo
.addImm(0) // neg_hi
.addImm(0) // clamp
.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
return;
}
}
const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
if (RI.isSGPRClass(RC)) {
if (!RI.isSGPRClass(SrcRC)) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
return;
}
expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward);
return;
}
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isAGPRClass(RC)) {
Opcode = (RI.hasVGPRs(SrcRC)) ?
AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
} else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
} else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
(RI.isProperlyAlignedRC(*RC) &&
(SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
// TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
if (ST.hasPackedFP32Ops()) {
Opcode = AMDGPU::V_PK_MOV_B32;
EltSize = 8;
}
}
// For the cases where we need an intermediate instruction/temporary register
// (destination is an AGPR), we need a scavenger.
//
// FIXME: The pass should maintain this for us so we don't have to re-scan the
// whole block for every handled copy.
std::unique_ptr<RegScavenger> RS;
if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
RS.reset(new RegScavenger());
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
// If there is an overlap, we can't kill the super-register on the last
// instruction, since it will also kill the components made live by this def.
const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
unsigned SubIdx;
if (Forward)
SubIdx = SubIndices[Idx];
else
SubIdx = SubIndices[SubIndices.size() - Idx - 1];
bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register();
Register ImpUseSuper = SrcReg;
indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
ImpDefSuper, ImpUseSuper);
} else if (Opcode == AMDGPU::V_PK_MOV_B32) {
Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
MachineInstrBuilder MIB =
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
.addImm(SISrcMods::OP_SEL_1)
.addReg(SrcSubReg)
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
.addReg(SrcSubReg)
.addImm(0) // op_sel_lo
.addImm(0) // op_sel_hi
.addImm(0) // neg_lo
.addImm(0) // neg_hi
.addImm(0) // clamp
.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
if (Idx == 0)
MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
} else {
MachineInstrBuilder Builder =
BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
.addReg(RI.getSubReg(SrcReg, SubIdx));
if (Idx == 0)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
}
}
}
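/// Map \p Opcode to its commuted form (or back from the commuted form),
/// returning -1 if the mapped opcode is not available on this subtarget and
/// \p Opcode itself if no mapping exists.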
int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
int NewOpc;
// Try to map original to commuted opcode
NewOpc = AMDGPU::getCommuteRev(Opcode);
if (NewOpc != -1)
// Check if the commuted (REV) opcode exists on the target.
return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
// Try to map commuted to original opcode
NewOpc = AMDGPU::getCommuteOrig(Opcode);
if (NewOpc != -1)
// Check if the original (non-REV) opcode exists on the target.
return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
return Opcode;
}
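/// Materialize the immediate \p Value into \p DestReg, choosing a scalar or
/// vector move based on the destination register class. Wider classes are
/// split into 32-bit or 64-bit pieces, with \p Value written to the first
/// piece and zero to the rest.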
void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
int64_t Value) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
if (RegClass == &AMDGPU::SReg_32RegClass ||
RegClass == &AMDGPU::SGPR_32RegClass ||
RegClass == &AMDGPU::SReg_32_XM0RegClass ||
RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
.addImm(Value);
return;
}
if (RegClass == &AMDGPU::SReg_64RegClass ||
RegClass == &AMDGPU::SGPR_64RegClass ||
RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
.addImm(Value);
return;
}
if (RegClass == &AMDGPU::VGPR_32RegClass) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
.addImm(Value);
return;
}
if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
.addImm(Value);
return;
}
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RegClass)) {
if (RI.getRegSizeInBits(*RegClass) > 32) {
Opcode = AMDGPU::S_MOV_B64;
EltSize = 8;
} else {
Opcode = AMDGPU::S_MOV_B32;
EltSize = 4;
}
}
ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
int64_t IdxValue = Idx == 0 ? Value : 0;
MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
Builder.addImm(IdxValue);
}
}
const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
return &AMDGPU::VGPR_32RegClass;
}
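/// Emit a V_CNDMASK_B32 that selects between \p TrueReg and \p FalseReg into
/// \p DstReg according to the branch predicate described by \p Cond.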
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond,
Register TrueReg,
Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *BoolXExecRC =
RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
"Not a VGPR32 reg");
if (Cond.size() == 1) {
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(Cond[0]);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
.addImm(0)
.addReg(TrueReg)
.addReg(SReg);
} else if (Cond.size() == 2) {
assert(Cond[0].isImm() && "Cond[0] is not an immediate");
switch (Cond[0].getImm()) {
case SIInstrInfo::SCC_TRUE: {
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64), SReg)
.addImm(1)
.addImm(0);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
.addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
}
case SIInstrInfo::SCC_FALSE: {
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64), SReg)
.addImm(0)
.addImm(1);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
.addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
}
case SIInstrInfo::VCCNZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
.addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
}
case SIInstrInfo::VCCZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(TrueReg)
.addImm(0)
.addReg(FalseReg)
.addReg(SReg);
break;
}
case SIInstrInfo::EXECNZ: {
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64), SReg)
.addImm(1)
.addImm(0);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
.addImm(0)
.addReg(TrueReg)
.addReg(SReg);
break;
}
case SIInstrInfo::EXECZ: {
Register SReg = MRI.createVirtualRegister(BoolXExecRC);
Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64), SReg)
.addImm(0)
.addImm(1);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
.addImm(0)
.addReg(FalseReg)
.addImm(0)
.addReg(TrueReg)
.addReg(SReg);
llvm_unreachable("Unhandled branch predicate EXECZ");
break;
}
default:
llvm_unreachable("invalid branch predicate");
}
} else {
llvm_unreachable("Can only handle Cond size 1 or 2");
}
}
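/// Emit a V_CMP_EQ_I32 comparing \p SrcReg against \p Value and return the
/// virtual register holding the boolean result.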
Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
Register SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
return Reg;
}
Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
Register SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
return Reg;
}
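/// Return the preferred move opcode for the destination class \p DstRC:
/// S_MOV or V_MOV for 32- and 64-bit classes, and COPY otherwise (including
/// all AGPR classes).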
unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
if (RI.isAGPRClass(DstRC))
return AMDGPU::COPY;
if (RI.getRegSizeInBits(*DstRC) == 32) {
return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
} else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
return AMDGPU::S_MOV_B64;
} else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
return AMDGPU::V_MOV_B64_PSEUDO;
}
return AMDGPU::COPY;
}
const MCInstrDesc &
SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
bool IsIndirectSrc) const {
if (IsIndirectSrc) {
if (VecSize <= 32) // 4 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
if (VecSize <= 64) // 8 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
if (VecSize <= 96) // 12 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
if (VecSize <= 128) // 16 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
if (VecSize <= 160) // 20 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
if (VecSize <= 256) // 32 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
if (VecSize <= 512) // 64 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
if (VecSize <= 1024) // 128 bytes
return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
}
if (VecSize <= 32) // 4 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
if (VecSize <= 64) // 8 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
if (VecSize <= 96) // 12 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
if (VecSize <= 128) // 16 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
if (VecSize <= 160) // 20 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
if (VecSize <= 256) // 32 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
if (VecSize <= 512) // 64 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
if (VecSize <= 1024) // 128 bytes
return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
}
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
if (VecSize <= 32) // 4 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
if (VecSize <= 64) // 8 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
if (VecSize <= 96) // 12 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
if (VecSize <= 128) // 16 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
if (VecSize <= 160) // 20 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
if (VecSize <= 256) // 32 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
if (VecSize <= 512) // 64 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
if (VecSize <= 1024) // 128 bytes
return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
if (VecSize <= 32) // 4 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
if (VecSize <= 64) // 8 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
if (VecSize <= 96) // 12 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
if (VecSize <= 128) // 16 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
if (VecSize <= 160) // 20 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
if (VecSize <= 256) // 32 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
if (VecSize <= 512) // 64 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
if (VecSize <= 1024) // 128 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
if (VecSize <= 64) // 8 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
if (VecSize <= 128) // 16 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
if (VecSize <= 256) // 32 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
if (VecSize <= 512) // 64 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
if (VecSize <= 1024) // 128 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
}
const MCInstrDesc &
SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
bool IsSGPR) const {
if (IsSGPR) {
switch (EltSize) {
case 32:
return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
case 64:
return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
default:
llvm_unreachable("invalid reg indexing elt size");
}
}
assert(EltSize == 32 && "invalid reg indexing elt size");
return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
}
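// The following helpers map a spill size in bytes to the matching SGPR, VGPR,
// or AGPR spill/restore pseudo instruction.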
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
return AMDGPU::SI_SPILL_S32_SAVE;
case 8:
return AMDGPU::SI_SPILL_S64_SAVE;
case 12:
return AMDGPU::SI_SPILL_S96_SAVE;
case 16:
return AMDGPU::SI_SPILL_S128_SAVE;
case 20:
return AMDGPU::SI_SPILL_S160_SAVE;
case 24:
return AMDGPU::SI_SPILL_S192_SAVE;
case 28:
return AMDGPU::SI_SPILL_S224_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
return AMDGPU::SI_SPILL_S512_SAVE;
case 128:
return AMDGPU::SI_SPILL_S1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
}
static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
return AMDGPU::SI_SPILL_V32_SAVE;
case 8:
return AMDGPU::SI_SPILL_V64_SAVE;
case 12:
return AMDGPU::SI_SPILL_V96_SAVE;
case 16:
return AMDGPU::SI_SPILL_V128_SAVE;
case 20:
return AMDGPU::SI_SPILL_V160_SAVE;
case 24:
return AMDGPU::SI_SPILL_V192_SAVE;
case 28:
return AMDGPU::SI_SPILL_V224_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
return AMDGPU::SI_SPILL_V512_SAVE;
case 128:
return AMDGPU::SI_SPILL_V1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
}
static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
return AMDGPU::SI_SPILL_A32_SAVE;
case 8:
return AMDGPU::SI_SPILL_A64_SAVE;
case 12:
return AMDGPU::SI_SPILL_A96_SAVE;
case 16:
return AMDGPU::SI_SPILL_A128_SAVE;
case 20:
return AMDGPU::SI_SPILL_A160_SAVE;
case 24:
return AMDGPU::SI_SPILL_A192_SAVE;
case 28:
return AMDGPU::SI_SPILL_A224_SAVE;
case 32:
return AMDGPU::SI_SPILL_A256_SAVE;
case 64:
return AMDGPU::SI_SPILL_A512_SAVE;
case 128:
return AMDGPU::SI_SPILL_A1024_SAVE;
default:
llvm_unreachable("unknown register size");
}
}
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register SrcReg, bool isKill,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
MachineMemOperand *MMO = MF->getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
FrameInfo.getObjectAlign(FrameIndex));
unsigned SpillSize = TRI->getSpillSize(*RC);
MachineRegisterInfo &MRI = MF->getRegInfo();
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
SrcReg != AMDGPU::EXEC && "exec should not be spilled");
// We are only allowed to create one new instruction when spilling
// registers, so we need to use a pseudo instruction for spilling SGPRs.
const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
// The SGPR spill/restore instructions only work on numbered SGPRs, so we
// need to make sure we are using the correct register class.
if (SrcReg.isVirtual() && SpillSize == 4) {
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
BuildMI(MBB, MI, DL, OpDesc)
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
return;
}
unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize)
: getVGPRSpillSaveOpcode(SpillSize);
MFI->setHasSpilledVGPRs();
if (RI.isVectorSuperClass(RC)) {
// Convert an AV spill into a VGPR spill. Introduce a copy from the AV
// register to an equivalent VGPR register beforehand. Regalloc may introduce
// AV spills that are only relevant until the rewriter, at which point they
// become spills of either VGPRs or AGPRs.
Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg)
.addReg(SrcReg, RegState::Kill);
SrcReg = TmpVReg;
}
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
}
static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
switch (Size) {
case 4:
return AMDGPU::SI_SPILL_S32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_S64_RESTORE;
case 12:
return AMDGPU::SI_SPILL_S96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_S128_RESTORE;
case 20:
return AMDGPU::SI_SPILL_S160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_S192_RESTORE;
case 28:
return AMDGPU::SI_SPILL_S224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_S512_RESTORE;
case 128:
return AMDGPU::SI_SPILL_S1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
}
static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
switch (Size) {
case 4:
return AMDGPU::SI_SPILL_V32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_V64_RESTORE;
case 12:
return AMDGPU::SI_SPILL_V96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_V128_RESTORE;
case 20:
return AMDGPU::SI_SPILL_V160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_V192_RESTORE;
case 28:
return AMDGPU::SI_SPILL_V224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_V512_RESTORE;
case 128:
return AMDGPU::SI_SPILL_V1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
}
static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
switch (Size) {
case 4:
return AMDGPU::SI_SPILL_A32_RESTORE;
case 8:
return AMDGPU::SI_SPILL_A64_RESTORE;
case 12:
return AMDGPU::SI_SPILL_A96_RESTORE;
case 16:
return AMDGPU::SI_SPILL_A128_RESTORE;
case 20:
return AMDGPU::SI_SPILL_A160_RESTORE;
case 24:
return AMDGPU::SI_SPILL_A192_RESTORE;
case 28:
return AMDGPU::SI_SPILL_A224_RESTORE;
case 32:
return AMDGPU::SI_SPILL_A256_RESTORE;
case 64:
return AMDGPU::SI_SPILL_A512_RESTORE;
case 128:
return AMDGPU::SI_SPILL_A1024_RESTORE;
default:
llvm_unreachable("unknown register size");
}
}
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
unsigned SpillSize = TRI->getSpillSize(*RC);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
MachineMemOperand *MMO = MF->getMachineMemOperand(
PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
FrameInfo.getObjectAlign(FrameIndex));
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
DestReg != AMDGPU::EXEC && "exec should not be spilled");
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
if (DestReg.isVirtual() && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
return;
}
unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
: getVGPRSpillRestoreOpcode(SpillSize);
bool IsVectorSuperClass = RI.isVectorSuperClass(RC);
Register TmpReg = DestReg;
if (IsVectorSuperClass) {
// For AV classes, insert the spill restore to a VGPR followed by a copy
// into an equivalent AV register.
MachineRegisterInfo &MRI = MF->getRegInfo();
DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC));
}
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
.addImm(0) // offset
.addMemOperand(MMO);
if (IsVectorSuperClass)
BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg)
.addReg(DestReg, RegState::Kill);
}
void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const {
insertNoops(MBB, MI, 1);
}
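/// Insert \p Quantity wait states using S_NOP instructions, each of which can
/// encode up to 8 wait states.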
void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned Quantity) const {
DebugLoc DL = MBB.findDebugLoc(MI);
while (Quantity > 0) {
unsigned Arg = std::min(Quantity, 8u);
Quantity -= Arg;
BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
}
}
void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
auto MF = MBB.getParent();
SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
assert(Info->isEntryFunction());
if (MBB.succ_empty()) {
bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
if (HasNoTerminator) {
if (Info->returnsVoid()) {
BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
} else {
BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
}
}
}
}
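/// Return the number of wait states that \p MI occupies: S_NOP counts as its
/// immediate plus one, meta instructions and wave barriers count as zero, and
/// everything else counts as one.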
unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
switch (MI.getOpcode()) {
default:
if (MI.isMetaInstruction())
return 0;
return 1; // FIXME: Do wait states equal cycles?
case AMDGPU::S_NOP:
return MI.getOperand(0).getImm() + 1;
// FIXME: Any other pseudo instruction?
// SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
// hazard, even if one exists, won't really be visible. Should we handle it?
case AMDGPU::SI_MASKED_UNREACHABLE:
case AMDGPU::WAVE_BARRIER:
return 0;
}
}
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
switch (MI.getOpcode()) {
default: return TargetInstrInfo::expandPostRAPseudo(MI);
case AMDGPU::S_MOV_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
case AMDGPU::S_MOV_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_MOV_B32));
break;
case AMDGPU::S_XOR_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_XOR_B64));
break;
case AMDGPU::S_XOR_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_XOR_B32));
break;
case AMDGPU::S_OR_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_OR_B64));
break;
case AMDGPU::S_OR_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_OR_B32));
break;
case AMDGPU::S_ANDN2_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_ANDN2_B64));
break;
case AMDGPU::S_ANDN2_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_ANDN2_B32));
break;
case AMDGPU::S_AND_B64_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_AND_B64));
break;
case AMDGPU::S_AND_B32_term:
// This is only a terminator to get the correct spill code placement during
// register allocation.
MI.setDesc(get(AMDGPU::S_AND_B32));
break;
case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
if (SrcOp.isImm()) {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
.addImm(SISrcMods::OP_SEL_1)
.addImm(Lo.getSExtValue())
.addImm(SISrcMods::OP_SEL_1)
.addImm(Lo.getSExtValue())
.addImm(0) // op_sel_lo
.addImm(0) // op_sel_hi
.addImm(0) // neg_lo
.addImm(0) // neg_hi
.addImm(0); // clamp
} else {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
.addImm(Lo.getSExtValue())
.addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
.addImm(Hi.getSExtValue())
.addReg(Dst, RegState::Implicit | RegState::Define);
}
} else {
assert(SrcOp.isReg());
if (ST.hasPackedFP32Ops() &&
!RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
.addImm(SISrcMods::OP_SEL_1) // src0_mod
.addReg(SrcOp.getReg())
.addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
.addReg(SrcOp.getReg())
.addImm(0) // op_sel_lo
.addImm(0) // op_sel_hi
.addImm(0) // neg_lo
.addImm(0) // neg_hi
.addImm(0); // clamp
} else {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
.addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
.addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
.addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
.addReg(Dst, RegState::Implicit | RegState::Define);
}
}
MI.eraseFromParent();
break;
}
case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
expandMovDPP64(MI);
break;
}
case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
const MachineOperand &SrcOp = MI.getOperand(1);
assert(!SrcOp.isFPImm());
APInt Imm(64, SrcOp.getImm());
if (Imm.isIntN(32) || isInlineConstant(Imm)) {
MI.setDesc(get(AMDGPU::S_MOV_B64));
break;
}
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
.addImm(Lo.getSExtValue())
.addReg(Dst, RegState::Implicit | RegState::Define);
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
.addImm(Hi.getSExtValue())
.addReg(Dst, RegState::Implicit | RegState::Define);
MI.eraseFromParent();
break;
}
case AMDGPU::V_SET_INACTIVE_B32: {
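// Write the "inactive lanes" value (operand 2) by temporarily inverting
// EXEC: the lanes that were inactive become active for the V_MOV, then EXEC
// is restored.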
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
.add(MI.getOperand(2));
BuildMI(MBB, MI, DL, get(NotOpc), Exec)
.addReg(Exec);
MI.eraseFromParent();
break;
}
case AMDGPU::V_SET_INACTIVE_B64: {
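// Same expansion as the 32-bit case, except the inner move is a
// V_MOV_B64_PSEUDO that is expanded immediately.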
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
MI.getOperand(0).getReg())
.add(MI.getOperand(2));
expandPostRAPseudo(*Copy);
BuildMI(MBB, MI, DL, get(NotOpc), Exec)
.addReg(Exec);
MI.eraseFromParent();
break;
}
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
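// Expand into a MOVRELD write of the selected subregister (the MOVREL
// instructions take the index from M0). The full vector register is added
// as an implicit def and an implicit use so the unwritten elements stay
// live.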
const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
unsigned Opc;
if (RI.hasVGPRs(EltRC)) {
Opc = AMDGPU::V_MOVRELD_B32_e32;
} else {
Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
: AMDGPU::S_MOVRELD_B32;
}
const MCInstrDesc &OpDesc = get(Opc);
Register VecReg = MI.getOperand(0).getReg();
bool IsUndef = MI.getOperand(1).isUndef();
unsigned SubReg = MI.getOperand(3).getImm();
assert(VecReg == MI.getOperand(1).getReg());
MachineInstrBuilder MIB =
BuildMI(MBB, MI, DL, OpDesc)
.addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
.add(MI.getOperand(2))
.addReg(VecReg, RegState::ImplicitDefine)
.addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
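// Tie the implicit VecReg use to the implicit VecReg def added above so
// this is treated as a read-modify-write of the whole vector register.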
const int ImpDefIdx =
OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
const int ImpUseIdx = ImpDefIdx + 1;
MIB->tieOperands(ImpDefIdx, ImpUseIdx);
MI.eraseFromParent();
break;
}
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
assert(ST.useVGPRIndexMode());
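// In VGPR-index mode the write goes through V_MOV_B32_indirect_write,
// bracketed by S_SET_GPR_IDX_ON/OFF. The sequence is bundled so nothing is
// scheduled into the index-mode window.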
Register VecReg = MI.getOperand(0).getReg();
bool IsUndef = MI.getOperand(1).isUndef();
Register Idx = MI.getOperand(3).getReg();
unsigned SubReg = MI.getOperand(4).getImm();
MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
.addReg(Idx)
.addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
SetOn->getOperand(3).setIsUndef();
const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
MachineInstrBuilder MIB =
BuildMI(MBB, MI, DL, OpDesc)
.addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
.add(MI.getOperand(2))
.addReg(VecReg, RegState::ImplicitDefine)
.addReg(VecReg,
RegState::Implicit | (IsUndef ? RegState::Undef : 0));
const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
const int ImpUseIdx = ImpDefIdx + 1;
MIB->tieOperands(ImpDefIdx, ImpUseIdx);
MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
MI.eraseFromParent();
break;
}
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
assert(ST.useVGPRIndexMode());
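// The indirect read mirrors the write expansion above, but uses SRC0
// indexing and V_MOV_B32_indirect_read.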
Register Dst = MI.getOperand(0).getReg();
Register VecReg = MI.getOperand(1).getReg();
bool IsUndef = MI.getOperand(1).isUndef();
Register Idx = MI.getOperand(2).getReg();
unsigned SubReg = MI.getOperand(3).getImm();
MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
.addReg(Idx)
.addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
SetOn->getOperand(3).setIsUndef();
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
.addDef(Dst)
.addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
.addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
MI.eraseFromParent();
break;
}
case AMDGPU::SI_PC_ADD_REL_OFFSET: {
MachineFunction &MF = *MBB.getParent();
Register Reg = MI.getOperand(0).getReg();
Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
// Create a bundle so these instructions won't be re-ordered by the
// post-RA scheduler.
MIBundleBuilder Bundler(MBB, MI);
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
// Add 32-bit offset from this instruction to the start of the
// constant data.
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
.addReg(RegLo)
.add(MI.getOperand(1)));
MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi);
MIB.add(MI.getOperand(2));
Bundler.append(MIB);
finalizeBundle(MBB, Bundler.begin());
MI.eraseFromParent();
break;
}
case AMDGPU::ENTER_STRICT_WWM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
// Whole Wave Mode is entered.
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64));
break;
}
case AMDGPU::ENTER_STRICT_WQM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
// STRICT_WQM is entered.
const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
MI.eraseFromParent();
break;
}
case AMDGPU::EXIT_STRICT_WWM:
case AMDGPU::EXIT_STRICT_WQM: {
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
// WWM/STRICT_WQM is exited.
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
}
return true;
}
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
Register Dst = MI.getOperand(0).getReg();
unsigned Part = 0;
MachineInstr *Split[2];
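// Emit one V_MOV_B32_dpp per 32-bit half. Immediate operands are split into
// their low and high words; register operands use the corresponding
// subregister. Virtual destinations are recombined with a REG_SEQUENCE
// below.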
for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
if (Dst.isPhysical()) {
MovDPP.addDef(RI.getSubReg(Dst, Sub));
} else {
assert(MRI.isSSA());
auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MovDPP.addDef(Tmp);
}
for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
const MachineOperand &SrcOp = MI.getOperand(I);
assert(!SrcOp.isFPImm());
if (SrcOp.isImm()) {
APInt Imm(64, SrcOp.getImm());
Imm.ashrInPlace(Part * 32);
MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
} else {
assert(SrcOp.isReg());
Register Src = SrcOp.getReg();
if (Src.isPhysical())
MovDPP.addReg(RI.getSubReg(Src, Sub));
else
MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
}
}
for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
MovDPP.addImm(MI.getOperand(I).getImm());
Split[Part] = MovDPP;
++Part;
}
if (Dst.isVirtual())
BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(Split[0]->getOperand(0).getReg())
.addImm(AMDGPU::sub0)
.addReg(Split[1]->getOperand(0).getReg())
.addImm(AMDGPU::sub1);
MI.eraseFromParent();
return std::make_pair(Split[0], Split[1]);
}
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0,
unsigned Src0OpName,
MachineOperand &Src1,
unsigned Src1OpName) const {
MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
if (!Src0Mods)
return false;
MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
assert(Src1Mods &&
"All commutable instructions have both src0 and src1 modifiers");
int Src0ModsVal = Src0Mods->getImm();
int Src1ModsVal = Src1Mods->getImm();
Src1Mods->setImm(Src0ModsVal);
Src0Mods->setImm(Src1ModsVal);
return true;
}
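// Swap a register operand with an immediate, frame-index or global-address
// operand in place, preserving the register flags and subregister index.
// Returns nullptr if the non-register operand kind is not handled.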
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
MachineOperand &RegOp,
MachineOperand &NonRegOp) {
Register Reg = RegOp.getReg();
unsigned SubReg = RegOp.getSubReg();
bool IsKill = RegOp.isKill();
bool IsDead = RegOp.isDead();
bool IsUndef = RegOp.isUndef();
bool IsDebug = RegOp.isDebug();
if (NonRegOp.isImm())
RegOp.ChangeToImmediate(NonRegOp.getImm());
else if (NonRegOp.isFI())
RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
else if (NonRegOp.isGlobal()) {
RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
NonRegOp.getTargetFlags());
} else
return nullptr;
// Make sure we don't reinterpret a subreg index in the target flags.
RegOp.setTargetFlags(NonRegOp.getTargetFlags());
NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
NonRegOp.setSubReg(SubReg);
return &MI;
}
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned Src0Idx,
unsigned Src1Idx) const {
assert(!NewMI && "this should never be used");
unsigned Opc = MI.getOpcode();
int CommutedOpcode = commuteOpcode(Opc);
if (CommutedOpcode == -1)
return nullptr;
assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
static_cast<int>(Src0Idx) &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
static_cast<int>(Src1Idx) &&
"inconsistency with findCommutedOpIndices");
MachineOperand &Src0 = MI.getOperand(Src0Idx);
MachineOperand &Src1 = MI.getOperand(Src1Idx);
MachineInstr *CommutedMI = nullptr;
if (Src0.isReg() && Src1.isReg()) {
if (isOperandLegal(MI, Src1Idx, &Src0)) {
// Be sure to copy the source modifiers to the right place.
CommutedMI
= TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
}
} else if (Src0.isReg() && !Src1.isReg()) {
// src0 should always be able to support any operand type, so no need to
// check operand legality.
CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
} else if (!Src0.isReg() && Src1.isReg()) {
if (isOperandLegal(MI, Src1Idx, &Src0))
CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
} else {
// FIXME: Found two non registers to commute. This does happen.
return nullptr;
}
if (CommutedMI) {
swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
Src1, AMDGPU::OpName::src1_modifiers);
CommutedMI->setDesc(get(CommutedOpcode));
}
return CommutedMI;
}
// This override is needed because source-modifier operands may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction relies on this hook to locate them.
bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx0,
unsigned &SrcOpIdx1) const {
return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}
bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
unsigned &SrcOpIdx1) const {
if (!Desc.isCommutable())
return false;
unsigned Opc = Desc.getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
return false;
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return false;
return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
}
bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
int64_t BrOffset) const {
// BranchRelaxation should never have to check s_setpc_b64 because its dest
// block is unanalyzable.
assert(BranchOp != AMDGPU::S_SETPC_B64);
// Convert to dwords.
BrOffset /= 4;
// The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
// from the next instruction.
BrOffset -= 1;
return isIntN(BranchOffsetBits, BrOffset);
}
MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
const MachineInstr &MI) const {
if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
// Analyzing the destination of an indirect branch would be difficult, and the
// branch is always legal (it has no offset range to exceed), so there is no
// need to analyze it.
return nullptr;
}
return MI.getOperand(0).getMBB();
}
void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MachineBasicBlock &DestBB,
MachineBasicBlock &RestoreBB,
const DebugLoc &DL, int64_t BrOffset,
RegScavenger *RS) const {
assert(RS && "RegScavenger required for long branching");
assert(MBB.empty() &&
"new block should be inserted for expanding unconditional branch");
assert(MBB.pred_size() == 1);
assert(RestoreBB.empty() &&
"restore block should be inserted for restoring clobbered registers");
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
// FIXME: Virtual register workaround for RegScavenger not working with empty
// blocks.
Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
auto I = MBB.end();
// We need to compute the offset relative to the instruction immediately after
// s_getpc_b64. Insert the pc arithmetic code at the end of the (still empty)
// block, ahead of the indirect branch added below.
MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
auto &MCCtx = MF->getContext();
MCSymbol *PostGetPCLabel =
MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
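// s_getpc_b64 returns the address of the instruction that follows it, so
// the post-instruction symbol marks the base from which the branch offsets
// below are measured.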
MCSymbol *OffsetLo =
MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
MCSymbol *OffsetHi =
MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub0)
.addReg(PCReg, 0, AMDGPU::sub0)
.addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
.addReg(PCReg, RegState::Define, AMDGPU::sub1)
.addReg(PCReg, 0, AMDGPU::sub1)
.addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
// Insert the indirect branch at the end of the block, after the offset
// computation.
BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
.addReg(PCReg);
// FIXME: If spilling is necessary, this will fail because this scavenger has
// no emergency stack slots. It is non-trivial to spill in this situation,
// because the restore code needs to be specially placed after the
// jump. BranchRelaxation then needs to be made aware of the newly inserted
// block.
//
// If a spill is needed for the pc register pair, we need to insert a spill
// restore block right before the destination block, and insert a short branch
// into the old destination block's fallthrough predecessor.
// e.g.:
//
// s_cbranch_scc0 skip_long_branch:
//
// long_branch_bb:
// spill s[8:9]
// s_getpc_b64 s[8:9]
// s_add_u32 s8, s8, restore_bb
// s_addc_u32 s9, s9, 0
// s_setpc_b64 s[8:9]
//
// skip_long_branch:
// foo;
//
// .....
//
// dest_bb_fallthrough_predecessor:
// bar;
// s_branch dest_bb
//
// restore_bb:
// restore s[8:9]
// fallthrough dest_bb
//
// dest_bb:
// buzz;
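// Try to scavenge a free SGPR pair to hold the PC. If none is available,
// fall back to SGPR0_SGPR1, spilling it before the PC computation and
// restoring it in RestoreBB.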
RS->enterBasicBlockEnd(MBB);
Register Scav = RS->scavengeRegisterBackwards(
AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
/* RestoreAfter */ false, 0, /* AllowSpill */ false);
if (Scav) {
RS->setRegUsed(Scav);
MRI.replaceRegWith(PCReg, Scav);
MRI.clearVirtRegs();
} else {
// Spilling an SGPR requires copying it through a temporary VGPR, so the
// temporary VGPR's stack slot is reused for the SGPR spill.
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
MRI.clearVirtRegs();
}
MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
// Now that the destination label is known, the offset expressions can be
// defined.
auto *Offset = MCBinaryExpr::createSub(
MCSymbolRefExpr::create(DestLabel, MCCtx),
MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
// Add offset assignments.
auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
auto *ShAmt = MCConstantExpr::create(32, MCCtx);
OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
return;
}
unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
switch (Cond) {
case SIInstrInfo::SCC_TRUE:
return AMDGPU::S_CBRANCH_SCC1;
case SIInstrInfo::SCC_FALSE:
return AMDGPU::S_CBRANCH_SCC0;
case SIInstrInfo::VCCNZ:
return AMDGPU::S_CBRANCH_VCCNZ;
case SIInstrInfo::VCCZ:
return AMDGPU::S_CBRANCH_VCCZ;
case SIInstrInfo::EXECNZ:
return AMDGPU::S_CBRANCH_EXECNZ;
case SIInstrInfo::EXECZ:
return AMDGPU::S_CBRANCH_EXECZ;
default:
llvm_unreachable("invalid branch predicate");
}
}
SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
switch (Opcode) {
case AMDGPU::S_CBRANCH_SCC0:
return SCC_FALSE;
case AMDGPU::S_CBRANCH_SCC1:
return SCC_TRUE;
case AMDGPU::S_CBRANCH_VCCNZ:
return VCCNZ;
case AMDGPU::S_CBRANCH_VCCZ:
return VCCZ;
case AMDGPU::S_CBRANCH_EXECNZ:
return EXECNZ;
case AMDGPU::S_CBRANCH_EXECZ:
return EXECZ;
default:
return INVALID_BR;
}
}
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
if (I->getOpcode() == AMDGPU::S_BRANCH) {
// Unconditional Branch
TBB = I->getOperand(0).getMBB();
return false;
}
MachineBasicBlock *CondBB = nullptr;
if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
CondBB = I->getOperand(1).getMBB();
Cond.push_back(I->getOperand(0));
} else {
BranchPredicate Pred = getBranchPredicate(I->getOpcode());
if (Pred == INVALID_BR)
return true;
CondBB = I->getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(Pred));
Cond.push_back(I->getOperand(1)); // Save the branch register.
}
++I;
if (I == MBB.end()) {
// Conditional branch followed by fall-through.
TBB = CondBB;
return false;
}
if (I->getOpcode() == AMDGPU::S_BRANCH) {
TBB = CondBB;
FBB = I->getOperand(0).getMBB();
return false;
}
return true;
}
bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
MachineBasicBlock::iterator I = MBB.getFirstTerminator();
auto E = MBB.end();
if (I == E)
return false;
// Skip over the instructions that are only artificial terminators for special
// exec-mask management.
while (I != E && !I->isBranch() && !I->isReturn()) {
switch (I->getOpcode()) {
case AMDGPU::S_MOV_B64_term:
case AMDGPU::S_XOR_B64_term:
case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
case AMDGPU::S_AND_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
case AMDGPU::S_AND_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
case AMDGPU::SI_KILL_I1_TERMINATOR:
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
// FIXME: It's messy that these need to be considered here at all.
return true;
default:
llvm_unreachable("unexpected non-branch terminator inst");
}
++I;
}
if (I == E)
return false;
return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
}
unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
unsigned Count = 0;
unsigned RemovedSize = 0;
for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
// Skip over artificial terminators when removing instructions.
if (MI.isBranch() || MI.isReturn()) {
RemovedSize += getInstSizeInBytes(MI);
MI.eraseFromParent();
++Count;
}
}
if (BytesRemoved)
*BytesRemoved = RemovedSize;
return Count;
}
// Copy the flags onto the implicit condition register operand.
static void preserveCondRegFlags(MachineOperand &CondReg,
const MachineOperand &OrigCond) {
CondReg.setIsUndef(OrigCond.isUndef());
CondReg.setIsKill(OrigCond.isKill());
}
unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded) const {
if (!FBB && Cond.empty()) {
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(TBB);
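// On subtargets with the branch offset 0x3f hardware bug the branch may
// need a workaround instruction, so account for the worst-case size of 8
// bytes.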
if (BytesAdded)
*BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
return 1;
}
if (Cond.size() == 1 && Cond[0].isReg()) {
BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
.add(Cond[0])
.addMBB(TBB);
return 1;
}
assert(TBB && Cond[0].isImm());
unsigned Opcode
= getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
if (!FBB) {
MachineInstr *CondBr =
BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
// Copy the flags onto the implicit condition register operand.
preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
fixImplicitOperands(*CondBr);
if (BytesAdded)
*BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
return 1;
}
assert(TBB && FBB);
MachineInstr *CondBr =
BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
fixImplicitOperands(*CondBr);
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(FBB);
preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
if (BytesAdded)
*BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
return 2;
}
bool SIInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
if (Cond.size() != 2) {
return true;
}