| //===- SIInstrInfo.cpp - SI Instruction Information ----------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| /// \file |
| /// SI Implementation of TargetInstrInfo. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "SIInstrInfo.h" |
| #include "AMDGPU.h" |
| #include "AMDGPUInstrInfo.h" |
| #include "GCNHazardRecognizer.h" |
| #include "GCNSubtarget.h" |
| #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "llvm/Analysis/ValueTracking.h" |
| #include "llvm/CodeGen/LiveIntervals.h" |
| #include "llvm/CodeGen/LiveVariables.h" |
| #include "llvm/CodeGen/MachineDominators.h" |
| #include "llvm/CodeGen/MachineScheduler.h" |
| #include "llvm/CodeGen/RegisterScavenging.h" |
| #include "llvm/CodeGen/ScheduleDAG.h" |
| #include "llvm/IR/DiagnosticInfo.h" |
| #include "llvm/IR/IntrinsicsAMDGPU.h" |
| #include "llvm/MC/MCContext.h" |
| #include "llvm/Support/CommandLine.h" |
| #include "llvm/Target/TargetMachine.h" |
| |
| using namespace llvm; |
| |
| #define DEBUG_TYPE "si-instr-info" |
| |
| #define GET_INSTRINFO_CTOR_DTOR |
| #include "AMDGPUGenInstrInfo.inc" |
| |
| namespace llvm { |
| |
| class AAResults; |
| |
| namespace AMDGPU { |
| #define GET_D16ImageDimIntrinsics_IMPL |
| #define GET_ImageDimIntrinsicTable_IMPL |
| #define GET_RsrcIntrinsics_IMPL |
| #include "AMDGPUGenSearchableTables.inc" |
| } |
| } |
| |
| |
| // Must be at least 4 to be able to branch over the minimum unconditional |
| // branch code. This is only for making it possible to write reasonably small |
| // tests for long branches. |
| static cl::opt<unsigned> |
| BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), |
| cl::desc("Restrict range of branch instructions (DEBUG)")); |
| |
| static cl::opt<bool> Fix16BitCopies( |
| "amdgpu-fix-16-bit-physreg-copies", |
| cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), |
| cl::init(true), |
| cl::ReallyHidden); |
| |
| SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) |
| : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), |
| RI(ST), ST(ST) { |
| SchedModel.init(&ST); |
| } |
| |
| //===----------------------------------------------------------------------===// |
| // TargetInstrInfo callbacks |
| //===----------------------------------------------------------------------===// |
| |
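| /// Returns the number of operands of \p Node, excluding any trailing glue |
| /// operands. |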
| static unsigned getNumOperandsNoGlue(SDNode *Node) { |
| unsigned N = Node->getNumOperands(); |
| while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) |
| --N; |
| return N; |
| } |
| |
| /// Returns true if both nodes have the same value for the given |
| /// operand \p OpName, or if neither node has this operand. |
| static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { |
| unsigned Opc0 = N0->getMachineOpcode(); |
| unsigned Opc1 = N1->getMachineOpcode(); |
| |
| int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName); |
| int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName); |
| |
| if (Op0Idx == -1 && Op1Idx == -1) |
| return true; |
| |
| if ((Op0Idx == -1 && Op1Idx != -1) || |
| (Op1Idx == -1 && Op0Idx != -1)) |
| return false; |
| |
| // getNamedOperandIdx returns the index for the MachineInstr's operands, |
| // which includes the result as the first operand. We are indexing into the |
| // MachineSDNode's operands, so we need to skip the result operand to get |
| // the real index. |
| --Op0Idx; |
| --Op1Idx; |
| |
| return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); |
| } |
| |
| bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, |
| AAResults *AA) const { |
| if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) { |
| // Normally a VALU use of exec would block rematerialization, but the |
| // implicit exec read that every VALU instruction has is OK here. |
| // We want all of the generic rematerialization logic except for that check. |
| |
| // Another potential implicit use is the mode register. The core RA logic |
| // will not attempt rematerialization if the mode is set anywhere in the |
| // function; otherwise it is safe, since the mode is not changed. |
| |
| // This differs from the generic method, which does not allow |
| // rematerialization if there are virtual register uses. We allow that, |
| // which is why this override handles SOP instructions as well. |
| return !MI.hasImplicitDef() && |
| MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses() && |
| !MI.mayRaiseFPException(); |
| } |
| |
| return false; |
| } |
| |
| bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { |
| // Any implicit use of exec by VALU is not a real register read. |
| return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() && |
| isVALU(*MO.getParent()); |
| } |
| |
| bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, |
| int64_t &Offset0, |
| int64_t &Offset1) const { |
| if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode()) |
| return false; |
| |
| unsigned Opc0 = Load0->getMachineOpcode(); |
| unsigned Opc1 = Load1->getMachineOpcode(); |
| |
| // Make sure both are actually loads. |
| if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) |
| return false; |
| |
| if (isDS(Opc0) && isDS(Opc1)) { |
| |
| // FIXME: Handle this case: |
| if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1)) |
| return false; |
| |
| // Check base reg. |
| if (Load0->getOperand(0) != Load1->getOperand(0)) |
| return false; |
| |
| // Skip read2 / write2 variants for simplicity. |
| // TODO: We should report true if the used offsets are adjacent (excluding |
| // the st64 variants). |
| int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); |
| int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); |
| if (Offset0Idx == -1 || Offset1Idx == -1) |
| return false; |
| |
| // XXX - be careful of dataless loads. |
| // getNamedOperandIdx returns the index for MachineInstrs. Since they |
| // include the output in the operand list, but SDNodes don't, we need to |
| // subtract one from the index. |
| Offset0Idx -= get(Opc0).NumDefs; |
| Offset1Idx -= get(Opc1).NumDefs; |
| Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue(); |
| Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue(); |
| return true; |
| } |
| |
| if (isSMRD(Opc0) && isSMRD(Opc1)) { |
| // Skip time and cache invalidation instructions. |
| if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 || |
| AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1) |
| return false; |
| |
| assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1)); |
| |
| // Check base reg. |
| if (Load0->getOperand(0) != Load1->getOperand(0)) |
| return false; |
| |
| const ConstantSDNode *Load0Offset = |
| dyn_cast<ConstantSDNode>(Load0->getOperand(1)); |
| const ConstantSDNode *Load1Offset = |
| dyn_cast<ConstantSDNode>(Load1->getOperand(1)); |
| |
| if (!Load0Offset || !Load1Offset) |
| return false; |
| |
| Offset0 = Load0Offset->getZExtValue(); |
| Offset1 = Load1Offset->getZExtValue(); |
| return true; |
| } |
| |
| // MUBUF and MTBUF can access the same addresses. |
| if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) { |
| |
| // MUBUF and MTBUF have vaddr at different indices. |
| if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) || |
| !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) || |
| !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc)) |
| return false; |
| |
| int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset); |
| int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset); |
| |
| if (OffIdx0 == -1 || OffIdx1 == -1) |
| return false; |
| |
| // getNamedOperandIdx returns the index for MachineInstrs. Since they |
| // include the output in the operand list, but SDNodes don't, we need to |
| // subtract one from the index. |
| OffIdx0 -= get(Opc0).NumDefs; |
| OffIdx1 -= get(Opc1).NumDefs; |
| |
| SDValue Off0 = Load0->getOperand(OffIdx0); |
| SDValue Off1 = Load1->getOperand(OffIdx1); |
| |
| // The offset might be a FrameIndexSDNode. |
| if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1)) |
| return false; |
| |
| Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue(); |
| Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue(); |
| return true; |
| } |
| |
| return false; |
| } |
| |
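| /// Returns true for the DS read2st64 / write2st64 opcodes, whose two offset |
| /// operands are in units of 64 elements rather than single elements. |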
| static bool isStride64(unsigned Opc) { |
| switch (Opc) { |
| case AMDGPU::DS_READ2ST64_B32: |
| case AMDGPU::DS_READ2ST64_B64: |
| case AMDGPU::DS_WRITE2ST64_B32: |
| case AMDGPU::DS_WRITE2ST64_B64: |
| return true; |
| default: |
| return false; |
| } |
| } |
| |
| bool SIInstrInfo::getMemOperandsWithOffsetWidth( |
| const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, |
| int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, |
| const TargetRegisterInfo *TRI) const { |
| if (!LdSt.mayLoadOrStore()) |
| return false; |
| |
| unsigned Opc = LdSt.getOpcode(); |
| OffsetIsScalable = false; |
| const MachineOperand *BaseOp, *OffsetOp; |
| int DataOpIdx; |
| |
| if (isDS(LdSt)) { |
| BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); |
| OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); |
| if (OffsetOp) { |
| // Normal, single offset LDS instruction. |
| if (!BaseOp) { |
| // DS_CONSUME/DS_APPEND use M0 for the base address. |
| // TODO: find the implicit use operand for M0 and use that as BaseOp? |
| return false; |
| } |
| BaseOps.push_back(BaseOp); |
| Offset = OffsetOp->getImm(); |
| // Get appropriate operand, and compute width accordingly. |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); |
| if (DataOpIdx == -1) |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); |
| Width = getOpSize(LdSt, DataOpIdx); |
| } else { |
| // The two-offset instructions use offset0 and offset1 instead. We can treat |
| // these as a load with a single offset if the two offsets are consecutive. |
| // We will use this for some partially aligned loads. |
| const MachineOperand *Offset0Op = |
| getNamedOperand(LdSt, AMDGPU::OpName::offset0); |
| const MachineOperand *Offset1Op = |
| getNamedOperand(LdSt, AMDGPU::OpName::offset1); |
| |
| unsigned Offset0 = Offset0Op->getImm(); |
| unsigned Offset1 = Offset1Op->getImm(); |
| if (Offset0 + 1 != Offset1) |
| return false; |
| |
| // Each of these offsets is in element-sized units, so we need to convert |
| // them to bytes based on the size of the individual reads. |
| |
| unsigned EltSize; |
| if (LdSt.mayLoad()) |
| EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16; |
| else { |
| assert(LdSt.mayStore()); |
| int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); |
| EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8; |
| } |
| |
| if (isStride64(Opc)) |
| EltSize *= 64; |
| |
| BaseOps.push_back(BaseOp); |
| Offset = EltSize * Offset0; |
| // Get appropriate operand(s), and compute width accordingly. |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); |
| if (DataOpIdx == -1) { |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); |
| Width = getOpSize(LdSt, DataOpIdx); |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); |
| Width += getOpSize(LdSt, DataOpIdx); |
| } else { |
| Width = getOpSize(LdSt, DataOpIdx); |
| } |
| } |
| return true; |
| } |
| |
| if (isMUBUF(LdSt) || isMTBUF(LdSt)) { |
| const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); |
| if (!RSrc) // e.g. BUFFER_WBINVL1_VOL |
| return false; |
| BaseOps.push_back(RSrc); |
| BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); |
| if (BaseOp && !BaseOp->isFI()) |
| BaseOps.push_back(BaseOp); |
| const MachineOperand *OffsetImm = |
| getNamedOperand(LdSt, AMDGPU::OpName::offset); |
| Offset = OffsetImm->getImm(); |
| const MachineOperand *SOffset = |
| getNamedOperand(LdSt, AMDGPU::OpName::soffset); |
| if (SOffset) { |
| if (SOffset->isReg()) |
| BaseOps.push_back(SOffset); |
| else |
| Offset += SOffset->getImm(); |
| } |
| // Get appropriate operand, and compute width accordingly. |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); |
| if (DataOpIdx == -1) |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); |
| Width = getOpSize(LdSt, DataOpIdx); |
| return true; |
| } |
| |
| if (isMIMG(LdSt)) { |
| int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); |
| BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); |
| int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); |
| if (VAddr0Idx >= 0) { |
| // GFX10 possible NSA encoding. |
| for (int I = VAddr0Idx; I < SRsrcIdx; ++I) |
| BaseOps.push_back(&LdSt.getOperand(I)); |
| } else { |
| BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); |
| } |
| Offset = 0; |
| // Get appropriate operand, and compute width accordingly. |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); |
| Width = getOpSize(LdSt, DataOpIdx); |
| return true; |
| } |
| |
| if (isSMRD(LdSt)) { |
| BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); |
| if (!BaseOp) // e.g. S_MEMTIME |
| return false; |
| BaseOps.push_back(BaseOp); |
| OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); |
| Offset = OffsetOp ? OffsetOp->getImm() : 0; |
| // Get appropriate operand, and compute width accordingly. |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); |
| Width = getOpSize(LdSt, DataOpIdx); |
| return true; |
| } |
| |
| if (isFLAT(LdSt)) { |
| // Instructions have either vaddr or saddr or both or none. |
| BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); |
| if (BaseOp) |
| BaseOps.push_back(BaseOp); |
| BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); |
| if (BaseOp) |
| BaseOps.push_back(BaseOp); |
| Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); |
| // Get appropriate operand, and compute width accordingly. |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); |
| if (DataOpIdx == -1) |
| DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); |
| Width = getOpSize(LdSt, DataOpIdx); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, |
| ArrayRef<const MachineOperand *> BaseOps1, |
| const MachineInstr &MI2, |
| ArrayRef<const MachineOperand *> BaseOps2) { |
| // Only examine the first "base" operand of each instruction, on the |
| // assumption that it represents the real base address of the memory access. |
| // Other operands are typically offsets or indices from this base address. |
| if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front())) |
| return true; |
| |
| if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) |
| return false; |
| |
| auto MO1 = *MI1.memoperands_begin(); |
| auto MO2 = *MI2.memoperands_begin(); |
| if (MO1->getAddrSpace() != MO2->getAddrSpace()) |
| return false; |
| |
| auto Base1 = MO1->getValue(); |
| auto Base2 = MO2->getValue(); |
| if (!Base1 || !Base2) |
| return false; |
| Base1 = getUnderlyingObject(Base1); |
| Base2 = getUnderlyingObject(Base2); |
| |
| if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2)) |
| return false; |
| |
| return Base1 == Base2; |
| } |
| |
| bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, |
| ArrayRef<const MachineOperand *> BaseOps2, |
| unsigned NumLoads, |
| unsigned NumBytes) const { |
| // If the mem ops (to be clustered) do not have the same base ptr, then they |
| // should not be clustered. |
| if (!BaseOps1.empty() && !BaseOps2.empty()) { |
| const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); |
| const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); |
| if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) |
| return false; |
| } else if (!BaseOps1.empty() || !BaseOps2.empty()) { |
| // If only one base op is empty, they do not have the same base ptr. |
| return false; |
| } |
| |
| // In order to avoid register pressure, on average, the number of DWORDs |
| // loaded together by all clustered mem ops should not exceed 8. This is an |
| // empirical value based on certain observations and performance-related |
| // experiments. |
| // The good thing about this heuristic is that it avoids clustering too many |
| // sub-word loads and also avoids clustering wide loads. Below is a brief |
| // summary of how the heuristic behaves for various values of `LoadSize`: |
| // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops |
| // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops |
| // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops |
| // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops |
| // (5) LoadSize >= 17: do not cluster |
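| // For example, three 6-byte loads give LoadSize = 6, which rounds up to |
| // 2 DWORDs each, so NumDWORDs = 2 * 3 = 6 <= 8 and the ops are clustered. |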
| const unsigned LoadSize = NumBytes / NumLoads; |
| const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; |
| return NumDWORDs <= 8; |
| } |
| |
| // FIXME: This behaves strangely. If, for example, you have 32 loads + stores, |
| // the first 16 loads will be interleaved with the stores, and the next 16 will |
| // be clustered as expected. It should really split into two batches of 16 |
| // stores. |
| // |
| // Loads are clustered until this returns false, rather than trying to schedule |
| // groups of stores. This also means we have to deal with deciding whether |
| // loads from different address spaces should be clustered, and whether loads |
| // which might cause bank conflicts should be. |
| // |
| // This might be deprecated so it might not be worth that much effort to fix. |
| bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, |
| int64_t Offset0, int64_t Offset1, |
| unsigned NumLoads) const { |
| assert(Offset1 > Offset0 && |
| "Second offset should be larger than first offset!"); |
| // If we have fewer than 16 loads in a row, and the offsets are within 64 |
| // bytes, then schedule together. |
| |
| // A cacheline is 64 bytes (for global memory). |
| return (NumLoads <= 16 && (Offset1 - Offset0) < 64); |
| } |
| |
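| /// Emit an error diagnostic and lower the copy to an SI_ILLEGAL_COPY pseudo |
| /// so that codegen can continue after the problem has been reported. |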
| static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI, |
| const DebugLoc &DL, MCRegister DestReg, |
| MCRegister SrcReg, bool KillSrc, |
| const char *Msg = "illegal SGPR to VGPR copy") { |
| MachineFunction *MF = MBB.getParent(); |
| DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); |
| LLVMContext &C = MF->getFunction().getContext(); |
| C.diagnose(IllegalCopy); |
| |
| BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| } |
| |
| /// Handle copying from SGPR to AGPR, or from AGPR to AGPR. A direct copy is |
| /// not possible, so an intermediate VGPR needs to be used. |
| static void indirectCopyToAGPR(const SIInstrInfo &TII, |
| MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI, |
| const DebugLoc &DL, MCRegister DestReg, |
| MCRegister SrcReg, bool KillSrc, |
| RegScavenger &RS, |
| Register ImpDefSuperReg = Register(), |
| Register ImpUseSuperReg = Register()) { |
| const SIRegisterInfo &RI = TII.getRegisterInfo(); |
| |
| assert(AMDGPU::SReg_32RegClass.contains(SrcReg) || |
| AMDGPU::AGPR_32RegClass.contains(SrcReg)); |
| |
| // First try to find the defining accvgpr_write to avoid temporary registers. |
| for (auto Def = MI, E = MBB.begin(); Def != E; ) { |
| --Def; |
| if (!Def->definesRegister(SrcReg, &RI)) |
| continue; |
| if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64) |
| break; |
| |
| MachineOperand &DefOp = Def->getOperand(1); |
| assert(DefOp.isReg() || DefOp.isImm()); |
| |
| if (DefOp.isReg()) { |
| // Check that the register source operand is not clobbered before MI. |
| // Immediate operands are always safe to propagate. |
| bool SafeToPropagate = true; |
| for (auto I = Def; I != MI && SafeToPropagate; ++I) |
| if (I->modifiesRegister(DefOp.getReg(), &RI)) |
| SafeToPropagate = false; |
| |
| if (!SafeToPropagate) |
| break; |
| |
| DefOp.setIsKill(false); |
| } |
| |
| MachineInstrBuilder Builder = |
| BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) |
| .add(DefOp); |
| if (ImpDefSuperReg) |
| Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); |
| |
| if (ImpUseSuperReg) { |
| Builder.addReg(ImpUseSuperReg, |
| getKillRegState(KillSrc) | RegState::Implicit); |
| } |
| |
| return; |
| } |
| |
| RS.enterBasicBlock(MBB); |
| RS.forward(MI); |
| |
| // Ideally we want to have three registers for a long reg_sequence copy |
| // to hide 2 waitstates between v_mov_b32 and accvgpr_write. |
| unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass, |
| *MBB.getParent()); |
| |
| // Registers in the sequence are allocated contiguously so we can just |
| // use the register number to pick one of three round-robin temps. |
| unsigned RegNo = DestReg % 3; |
| Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); |
| if (!Tmp) |
| report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); |
| RS.setRegUsed(Tmp); |
| |
| if (!TII.getSubtarget().hasGFX90AInsts()) { |
| // Only loop through if there are any free registers left; otherwise the |
| // scavenger may report a fatal error when there is no emergency spill slot, |
| // or it may spill using the slot. |
| while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) { |
| Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); |
| if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs) |
| break; |
| Tmp = Tmp2; |
| RS.setRegUsed(Tmp); |
| } |
| } |
| |
| // Insert copy to temporary VGPR. |
| unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32; |
| if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) { |
| TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64; |
| } else { |
| assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); |
| } |
| |
| MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| if (ImpUseSuperReg) { |
| UseBuilder.addReg(ImpUseSuperReg, |
| getKillRegState(KillSrc) | RegState::Implicit); |
| } |
| |
| MachineInstrBuilder DefBuilder |
| = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) |
| .addReg(Tmp, RegState::Kill); |
| |
| if (ImpDefSuperReg) |
| DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); |
| } |
| |
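| /// Expand a copy between SGPR tuples into a sequence of S_MOV_B32 moves on |
| /// the 32-bit sub-registers, combining adjacent even-aligned sub-registers |
| /// into S_MOV_B64 where possible. |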
| static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI, const DebugLoc &DL, |
| MCRegister DestReg, MCRegister SrcReg, bool KillSrc, |
| const TargetRegisterClass *RC, bool Forward) { |
| const SIRegisterInfo &RI = TII.getRegisterInfo(); |
| ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4); |
| MachineBasicBlock::iterator I = MI; |
| MachineInstr *FirstMI = nullptr, *LastMI = nullptr; |
| |
| for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { |
| int16_t SubIdx = BaseIndices[Idx]; |
| Register Reg = RI.getSubReg(DestReg, SubIdx); |
| unsigned Opcode = AMDGPU::S_MOV_B32; |
| |
| // Is the SGPR aligned? If so, try to combine with the next sub-register. |
| Register Src = RI.getSubReg(SrcReg, SubIdx); |
| bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; |
| bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; |
| if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { |
| // Can use SGPR64 copy |
| unsigned Channel = RI.getChannelFromSubReg(SubIdx); |
| SubIdx = RI.getSubRegFromChannel(Channel, 2); |
| Opcode = AMDGPU::S_MOV_B64; |
| Idx++; |
| } |
| |
| LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) |
| .addReg(RI.getSubReg(SrcReg, SubIdx)) |
| .addReg(SrcReg, RegState::Implicit); |
| |
| if (!FirstMI) |
| FirstMI = LastMI; |
| |
| if (!Forward) |
| I--; |
| } |
| |
| assert(FirstMI && LastMI); |
| if (!Forward) |
| std::swap(FirstMI, LastMI); |
| |
| FirstMI->addOperand( |
| MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); |
| |
| if (KillSrc) |
| LastMI->addRegisterKilled(SrcReg, &RI); |
| } |
| |
| void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI, |
| const DebugLoc &DL, MCRegister DestReg, |
| MCRegister SrcReg, bool KillSrc) const { |
| const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); |
| |
| // FIXME: This is a hack to resolve copies between 16-bit and 32-bit |
| // registers until all patterns are fixed. |
| if (Fix16BitCopies && |
| ((RI.getRegSizeInBits(*RC) == 16) ^ |
| (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) { |
| MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; |
| MCRegister Super = RI.get32BitRegister(RegToFix); |
| assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); |
| RegToFix = Super; |
| |
| if (DestReg == SrcReg) { |
| // Insert empty bundle since ExpandPostRA expects an instruction here. |
| BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); |
| return; |
| } |
| |
| RC = RI.getPhysRegClass(DestReg); |
| } |
| |
| if (RC == &AMDGPU::VGPR_32RegClass) { |
| assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || |
| AMDGPU::SReg_32RegClass.contains(SrcReg) || |
| AMDGPU::AGPR_32RegClass.contains(SrcReg)); |
| unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ? |
| AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32; |
| BuildMI(MBB, MI, DL, get(Opc), DestReg) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| return; |
| } |
| |
| if (RC == &AMDGPU::SReg_32_XM0RegClass || |
| RC == &AMDGPU::SReg_32RegClass) { |
| if (SrcReg == AMDGPU::SCC) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg) |
| .addImm(1) |
| .addImm(0); |
| return; |
| } |
| |
| if (DestReg == AMDGPU::VCC_LO) { |
| if (AMDGPU::SReg_32RegClass.contains(SrcReg)) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| } else { |
| // FIXME: Hack until VReg_1 removed. |
| assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) |
| .addImm(0) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| } |
| |
| return; |
| } |
| |
| if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) { |
| reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
| return; |
| } |
| |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| return; |
| } |
| |
| if (RC == &AMDGPU::SReg_64RegClass) { |
| if (SrcReg == AMDGPU::SCC) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) |
| .addImm(1) |
| .addImm(0); |
| return; |
| } |
| |
| if (DestReg == AMDGPU::VCC) { |
| if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| } else { |
| // FIXME: Hack until VReg_1 removed. |
| assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32)) |
| .addImm(0) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| } |
| |
| return; |
| } |
| |
| if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) { |
| reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
| return; |
| } |
| |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| return; |
| } |
| |
| if (DestReg == AMDGPU::SCC) { |
| // Copying 64-bit or 32-bit sources to SCC barely makes sense, |
| // but SelectionDAG emits such copies for i1 sources. |
| if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { |
| // This copy can only be produced by patterns |
| // with explicit SCC, which are known to be enabled |
| // only for subtargets with S_CMP_LG_U64 present. |
| assert(ST.hasScalarCompareEq64()); |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64)) |
| .addReg(SrcReg, getKillRegState(KillSrc)) |
| .addImm(0); |
| } else { |
| assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) |
| .addReg(SrcReg, getKillRegState(KillSrc)) |
| .addImm(0); |
| } |
| |
| return; |
| } |
| |
| if (RC == &AMDGPU::AGPR_32RegClass) { |
| if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| return; |
| } |
| |
| if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg) |
| .addReg(SrcReg, getKillRegState(KillSrc)); |
| return; |
| } |
| |
| // FIXME: Pass should maintain a scavenger to avoid rescanning the block on |
| // every AGPR spill. |
| RegScavenger RS; |
| indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS); |
| return; |
| } |
| |
| const unsigned Size = RI.getRegSizeInBits(*RC); |
| if (Size == 16) { |
| assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || |
| AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || |
| AMDGPU::SReg_LO16RegClass.contains(SrcReg) || |
| AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); |
| |
| bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); |
| bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); |
| bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); |
| bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); |
| bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || |
| AMDGPU::SReg_LO16RegClass.contains(DestReg) || |
| AMDGPU::AGPR_LO16RegClass.contains(DestReg); |
| bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || |
| AMDGPU::SReg_LO16RegClass.contains(SrcReg) || |
| AMDGPU::AGPR_LO16RegClass.contains(SrcReg); |
| MCRegister NewDestReg = RI.get32BitRegister(DestReg); |
| MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); |
| |
| if (IsSGPRDst) { |
| if (!IsSGPRSrc) { |
| reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
| return; |
| } |
| |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) |
| .addReg(NewSrcReg, getKillRegState(KillSrc)); |
| return; |
| } |
| |
| if (IsAGPRDst || IsAGPRSrc) { |
| if (!DstLow || !SrcLow) { |
| reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, |
| "Cannot use hi16 subreg with an AGPR!"); |
| } |
| |
| copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); |
| return; |
| } |
| |
| if (IsSGPRSrc && !ST.hasSDWAScalar()) { |
| if (!DstLow || !SrcLow) { |
| reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, |
| "Cannot use hi16 subreg on VI!"); |
| } |
| |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) |
| .addReg(NewSrcReg, getKillRegState(KillSrc)); |
| return; |
| } |
| |
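| // Use an SDWA mov so that only the selected 16-bit half of the destination |
| // is written and the other half is preserved (UNUSED_PRESERVE). |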
| auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) |
| .addImm(0) // src0_modifiers |
| .addReg(NewSrcReg) |
| .addImm(0) // clamp |
| .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 |
| : AMDGPU::SDWA::SdwaSel::WORD_1) |
| .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) |
| .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 |
| : AMDGPU::SDWA::SdwaSel::WORD_1) |
| .addReg(NewDestReg, RegState::Implicit | RegState::Undef); |
| // First implicit operand is $exec. |
| MIB->tieOperands(0, MIB->getNumOperands() - 1); |
| return; |
| } |
| |
| const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg); |
| if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { |
| if (ST.hasPackedFP32Ops()) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) |
| .addImm(SISrcMods::OP_SEL_1) |
| .addReg(SrcReg) |
| .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) |
| .addReg(SrcReg) |
| .addImm(0) // op_sel_lo |
| .addImm(0) // op_sel_hi |
| .addImm(0) // neg_lo |
| .addImm(0) // neg_hi |
| .addImm(0) // clamp |
| .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); |
| return; |
| } |
| } |
| |
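| // Copy sub-registers low-to-high when the destination starts below the |
| // source, and high-to-low otherwise, so that an overlapping source is not |
| // clobbered before it is read. |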
| const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg); |
| if (RI.isSGPRClass(RC)) { |
| if (!RI.isSGPRClass(SrcRC)) { |
| reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); |
| return; |
| } |
| expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward); |
| return; |
| } |
| |
| unsigned EltSize = 4; |
| unsigned Opcode = AMDGPU::V_MOV_B32_e32; |
| if (RI.isAGPRClass(RC)) { |
| Opcode = (RI.hasVGPRs(SrcRC)) ? |
| AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; |
| } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { |
| Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; |
| } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && |
| (RI.isProperlyAlignedRC(*RC) && |
| (SrcRC == RC || RI.isSGPRClass(SrcRC)))) { |
| // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov. |
| if (ST.hasPackedFP32Ops()) { |
| Opcode = AMDGPU::V_PK_MOV_B32; |
| EltSize = 8; |
| } |
| } |
| |
| // For the cases where we need an intermediate instruction/temporary register |
| // (destination is an AGPR), we need a scavenger. |
| // |
| // FIXME: The pass should maintain this for us so we don't have to re-scan the |
| // whole block for every handled copy. |
| std::unique_ptr<RegScavenger> RS; |
| if (Opcode == AMDGPU::INSTRUCTION_LIST_END) |
| RS.reset(new RegScavenger()); |
| |
| ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize); |
| |
| // If there is an overlap, we can't kill the super-register on the last |
| // instruction, since it will also kill the components made live by this def. |
| const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); |
| |
| for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { |
| unsigned SubIdx; |
| if (Forward) |
| SubIdx = SubIndices[Idx]; |
| else |
| SubIdx = SubIndices[SubIndices.size() - Idx - 1]; |
| |
| bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; |
| |
| if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { |
| Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register(); |
| Register ImpUseSuper = SrcReg; |
| indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), |
| RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, |
| ImpDefSuper, ImpUseSuper); |
| } else if (Opcode == AMDGPU::V_PK_MOV_B32) { |
| Register DstSubReg = RI.getSubReg(DestReg, SubIdx); |
| Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); |
| MachineInstrBuilder MIB = |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) |
| .addImm(SISrcMods::OP_SEL_1) |
| .addReg(SrcSubReg) |
| .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) |
| .addReg(SrcSubReg) |
| .addImm(0) // op_sel_lo |
| .addImm(0) // op_sel_hi |
| .addImm(0) // neg_lo |
| .addImm(0) // neg_hi |
| .addImm(0) // clamp |
| .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); |
| if (Idx == 0) |
| MIB.addReg(DestReg, RegState::Define | RegState::Implicit); |
| } else { |
| MachineInstrBuilder Builder = |
| BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) |
| .addReg(RI.getSubReg(SrcReg, SubIdx)); |
| if (Idx == 0) |
| Builder.addReg(DestReg, RegState::Define | RegState::Implicit); |
| |
| Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); |
| } |
| } |
| } |
| |
| int SIInstrInfo::commuteOpcode(unsigned Opcode) const { |
| int NewOpc; |
| |
| // Try to map original to commuted opcode |
| NewOpc = AMDGPU::getCommuteRev(Opcode); |
| if (NewOpc != -1) |
| // Check if the commuted (REV) opcode exists on the target. |
| return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; |
| |
| // Try to map commuted to original opcode |
| NewOpc = AMDGPU::getCommuteOrig(Opcode); |
| if (NewOpc != -1) |
| // Check if the original (non-REV) opcode exists on the target. |
| return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1; |
| |
| return Opcode; |
| } |
| |
| void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI, |
| const DebugLoc &DL, unsigned DestReg, |
| int64_t Value) const { |
| MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
| const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); |
| if (RegClass == &AMDGPU::SReg_32RegClass || |
| RegClass == &AMDGPU::SGPR_32RegClass || |
| RegClass == &AMDGPU::SReg_32_XM0RegClass || |
| RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) |
| .addImm(Value); |
| return; |
| } |
| |
| if (RegClass == &AMDGPU::SReg_64RegClass || |
| RegClass == &AMDGPU::SGPR_64RegClass || |
| RegClass == &AMDGPU::SReg_64_XEXECRegClass) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) |
| .addImm(Value); |
| return; |
| } |
| |
| if (RegClass == &AMDGPU::VGPR_32RegClass) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) |
| .addImm(Value); |
| return; |
| } |
| if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) |
| .addImm(Value); |
| return; |
| } |
| |
| unsigned EltSize = 4; |
| unsigned Opcode = AMDGPU::V_MOV_B32_e32; |
| if (RI.isSGPRClass(RegClass)) { |
| if (RI.getRegSizeInBits(*RegClass) > 32) { |
| Opcode = AMDGPU::S_MOV_B64; |
| EltSize = 8; |
| } else { |
| Opcode = AMDGPU::S_MOV_B32; |
| EltSize = 4; |
| } |
| } |
| |
| ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize); |
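| // Write the immediate into the lowest sub-register and zero the remaining |
| // sub-registers. |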
| for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { |
| int64_t IdxValue = Idx == 0 ? Value : 0; |
| |
| MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, |
| get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); |
| Builder.addImm(IdxValue); |
| } |
| } |
| |
| const TargetRegisterClass * |
| SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { |
| return &AMDGPU::VGPR_32RegClass; |
| } |
| |
| void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator I, |
| const DebugLoc &DL, Register DstReg, |
| ArrayRef<MachineOperand> Cond, |
| Register TrueReg, |
| Register FalseReg) const { |
| MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); |
| const TargetRegisterClass *BoolXExecRC = |
| RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); |
| assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && |
| "Not a VGPR32 reg"); |
| |
| if (Cond.size() == 1) { |
| Register SReg = MRI.createVirtualRegister(BoolXExecRC); |
| BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) |
| .add(Cond[0]); |
| BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
| .addImm(0) |
| .addReg(FalseReg) |
| .addImm(0) |
| .addReg(TrueReg) |
| .addReg(SReg); |
| } else if (Cond.size() == 2) { |
| assert(Cond[0].isImm() && "Cond[0] is not an immediate"); |
| switch (Cond[0].getImm()) { |
| case SIInstrInfo::SCC_TRUE: { |
| Register SReg = MRI.createVirtualRegister(BoolXExecRC); |
| BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 |
| : AMDGPU::S_CSELECT_B64), SReg) |
| .addImm(1) |
| .addImm(0); |
| BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
| .addImm(0) |
| .addReg(FalseReg) |
| .addImm(0) |
| .addReg(TrueReg) |
| .addReg(SReg); |
| break; |
| } |
| case SIInstrInfo::SCC_FALSE: { |
| Register SReg = MRI.createVirtualRegister(BoolXExecRC); |
| BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 |
| : AMDGPU::S_CSELECT_B64), SReg) |
| .addImm(0) |
| .addImm(1); |
| BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
| .addImm(0) |
| .addReg(FalseReg) |
| .addImm(0) |
| .addReg(TrueReg) |
| .addReg(SReg); |
| break; |
| } |
| case SIInstrInfo::VCCNZ: { |
| MachineOperand RegOp = Cond[1]; |
| RegOp.setImplicit(false); |
| Register SReg = MRI.createVirtualRegister(BoolXExecRC); |
| BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) |
| .add(RegOp); |
| BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
| .addImm(0) |
| .addReg(FalseReg) |
| .addImm(0) |
| .addReg(TrueReg) |
| .addReg(SReg); |
| break; |
| } |
| case SIInstrInfo::VCCZ: { |
| MachineOperand RegOp = Cond[1]; |
| RegOp.setImplicit(false); |
| Register SReg = MRI.createVirtualRegister(BoolXExecRC); |
| BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg) |
| .add(RegOp); |
| BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
| .addImm(0) |
| .addReg(TrueReg) |
| .addImm(0) |
| .addReg(FalseReg) |
| .addReg(SReg); |
| break; |
| } |
| case SIInstrInfo::EXECNZ: { |
| Register SReg = MRI.createVirtualRegister(BoolXExecRC); |
| Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); |
| BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 |
| : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) |
| .addImm(0); |
| BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 |
| : AMDGPU::S_CSELECT_B64), SReg) |
| .addImm(1) |
| .addImm(0); |
| BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
| .addImm(0) |
| .addReg(FalseReg) |
| .addImm(0) |
| .addReg(TrueReg) |
| .addReg(SReg); |
| break; |
| } |
| case SIInstrInfo::EXECZ: { |
| Register SReg = MRI.createVirtualRegister(BoolXExecRC); |
| Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC()); |
| BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 |
| : AMDGPU::S_OR_SAVEEXEC_B64), SReg2) |
| .addImm(0); |
| BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32 |
| : AMDGPU::S_CSELECT_B64), SReg) |
| .addImm(0) |
| .addImm(1); |
| BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) |
| .addImm(0) |
| .addReg(FalseReg) |
| .addImm(0) |
| .addReg(TrueReg) |
| .addReg(SReg); |
| llvm_unreachable("Unhandled branch predicate EXECZ"); |
| break; |
| } |
| default: |
| llvm_unreachable("invalid branch predicate"); |
| } |
| } else { |
| llvm_unreachable("Can only handle Cond size 1 or 2"); |
| } |
| } |
| |
| Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, |
| MachineBasicBlock::iterator I, |
| const DebugLoc &DL, |
| Register SrcReg, int Value) const { |
| MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
| Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); |
| BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) |
| .addImm(Value) |
| .addReg(SrcReg); |
| |
| return Reg; |
| } |
| |
| Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, |
| MachineBasicBlock::iterator I, |
| const DebugLoc &DL, |
| Register SrcReg, int Value) const { |
| MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); |
| Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); |
| BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) |
| .addImm(Value) |
| .addReg(SrcReg); |
| |
| return Reg; |
| } |
| |
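| /// Returns the opcode to use for a move into \p DstRC: a 32- or 64-bit |
| /// scalar or vector MOV where one exists, and a generic COPY otherwise |
| /// (including for AGPR classes). |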
| unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { |
| if (RI.isAGPRClass(DstRC)) |
| return AMDGPU::COPY; |
| if (RI.getRegSizeInBits(*DstRC) == 32) { |
| return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; |
| } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { |
| return AMDGPU::S_MOV_B64; |
| } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) { |
| return AMDGPU::V_MOV_B64_PSEUDO; |
| } |
| return AMDGPU::COPY; |
| } |
| |
| const MCInstrDesc & |
| SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize, |
| bool IsIndirectSrc) const { |
| if (IsIndirectSrc) { |
| if (VecSize <= 32) // 4 bytes |
| return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1); |
| if (VecSize <= 64) // 8 bytes |
| return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2); |
| if (VecSize <= 96) // 12 bytes |
| return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3); |
| if (VecSize <= 128) // 16 bytes |
| return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4); |
| if (VecSize <= 160) // 20 bytes |
| return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5); |
| if (VecSize <= 256) // 32 bytes |
| return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8); |
| if (VecSize <= 512) // 64 bytes |
| return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16); |
| if (VecSize <= 1024) // 128 bytes |
| return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32); |
| |
| llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos"); |
| } |
| |
| if (VecSize <= 32) // 4 bytes |
| return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1); |
| if (VecSize <= 64) // 8 bytes |
| return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2); |
| if (VecSize <= 96) // 12 bytes |
| return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3); |
| if (VecSize <= 128) // 16 bytes |
| return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4); |
| if (VecSize <= 160) // 20 bytes |
| return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5); |
| if (VecSize <= 256) // 32 bytes |
| return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8); |
| if (VecSize <= 512) // 64 bytes |
| return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16); |
| if (VecSize <= 1024) // 128 bytes |
| return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32); |
| |
| llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos"); |
| } |
| |
| static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) { |
| if (VecSize <= 32) // 4 bytes |
| return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1; |
| if (VecSize <= 64) // 8 bytes |
| return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2; |
| if (VecSize <= 96) // 12 bytes |
| return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3; |
| if (VecSize <= 128) // 16 bytes |
| return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4; |
| if (VecSize <= 160) // 20 bytes |
| return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5; |
| if (VecSize <= 256) // 32 bytes |
| return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8; |
| if (VecSize <= 512) // 64 bytes |
| return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16; |
| if (VecSize <= 1024) // 128 bytes |
| return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32; |
| |
| llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); |
| } |
| |
| static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) { |
| if (VecSize <= 32) // 4 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1; |
| if (VecSize <= 64) // 8 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2; |
| if (VecSize <= 96) // 12 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3; |
| if (VecSize <= 128) // 16 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4; |
| if (VecSize <= 160) // 20 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5; |
| if (VecSize <= 256) // 32 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8; |
| if (VecSize <= 512) // 64 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16; |
| if (VecSize <= 1024) // 128 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32; |
| |
| llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); |
| } |
| |
| static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) { |
| if (VecSize <= 64) // 8 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1; |
| if (VecSize <= 128) // 16 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2; |
| if (VecSize <= 256) // 32 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4; |
| if (VecSize <= 512) // 64 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8; |
| if (VecSize <= 1024) // 128 bytes |
| return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16; |
| |
| llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); |
| } |
| |
| const MCInstrDesc & |
| SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, |
| bool IsSGPR) const { |
| if (IsSGPR) { |
| switch (EltSize) { |
| case 32: |
| return get(getIndirectSGPRWriteMovRelPseudo32(VecSize)); |
| case 64: |
| return get(getIndirectSGPRWriteMovRelPseudo64(VecSize)); |
| default: |
| llvm_unreachable("invalid reg indexing elt size"); |
| } |
| } |
| |
| assert(EltSize == 32 && "invalid reg indexing elt size"); |
| return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize)); |
| } |
| |
| static unsigned getSGPRSpillSaveOpcode(unsigned Size) { |
| switch (Size) { |
| case 4: |
| return AMDGPU::SI_SPILL_S32_SAVE; |
| case 8: |
| return AMDGPU::SI_SPILL_S64_SAVE; |
| case 12: |
| return AMDGPU::SI_SPILL_S96_SAVE; |
| case 16: |
| return AMDGPU::SI_SPILL_S128_SAVE; |
| case 20: |
| return AMDGPU::SI_SPILL_S160_SAVE; |
| case 24: |
| return AMDGPU::SI_SPILL_S192_SAVE; |
| case 28: |
| return AMDGPU::SI_SPILL_S224_SAVE; |
| case 32: |
| return AMDGPU::SI_SPILL_S256_SAVE; |
| case 64: |
| return AMDGPU::SI_SPILL_S512_SAVE; |
| case 128: |
| return AMDGPU::SI_SPILL_S1024_SAVE; |
| default: |
| llvm_unreachable("unknown register size"); |
| } |
| } |
| |
| static unsigned getVGPRSpillSaveOpcode(unsigned Size) { |
| switch (Size) { |
| case 4: |
| return AMDGPU::SI_SPILL_V32_SAVE; |
| case 8: |
| return AMDGPU::SI_SPILL_V64_SAVE; |
| case 12: |
| return AMDGPU::SI_SPILL_V96_SAVE; |
| case 16: |
| return AMDGPU::SI_SPILL_V128_SAVE; |
| case 20: |
| return AMDGPU::SI_SPILL_V160_SAVE; |
| case 24: |
| return AMDGPU::SI_SPILL_V192_SAVE; |
| case 28: |
| return AMDGPU::SI_SPILL_V224_SAVE; |
| case 32: |
| return AMDGPU::SI_SPILL_V256_SAVE; |
| case 64: |
| return AMDGPU::SI_SPILL_V512_SAVE; |
| case 128: |
| return AMDGPU::SI_SPILL_V1024_SAVE; |
| default: |
| llvm_unreachable("unknown register size"); |
| } |
| } |
| |
| static unsigned getAGPRSpillSaveOpcode(unsigned Size) { |
| switch (Size) { |
| case 4: |
| return AMDGPU::SI_SPILL_A32_SAVE; |
| case 8: |
| return AMDGPU::SI_SPILL_A64_SAVE; |
| case 12: |
| return AMDGPU::SI_SPILL_A96_SAVE; |
| case 16: |
| return AMDGPU::SI_SPILL_A128_SAVE; |
| case 20: |
| return AMDGPU::SI_SPILL_A160_SAVE; |
| case 24: |
| return AMDGPU::SI_SPILL_A192_SAVE; |
| case 28: |
| return AMDGPU::SI_SPILL_A224_SAVE; |
| case 32: |
| return AMDGPU::SI_SPILL_A256_SAVE; |
| case 64: |
| return AMDGPU::SI_SPILL_A512_SAVE; |
| case 128: |
| return AMDGPU::SI_SPILL_A1024_SAVE; |
| default: |
| llvm_unreachable("unknown register size"); |
| } |
| } |
| |
| void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI, |
| Register SrcReg, bool isKill, |
| int FrameIndex, |
| const TargetRegisterClass *RC, |
| const TargetRegisterInfo *TRI) const { |
| MachineFunction *MF = MBB.getParent(); |
| SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
| MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
| const DebugLoc &DL = MBB.findDebugLoc(MI); |
| |
| MachinePointerInfo PtrInfo |
| = MachinePointerInfo::getFixedStack(*MF, FrameIndex); |
| MachineMemOperand *MMO = MF->getMachineMemOperand( |
| PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), |
| FrameInfo.getObjectAlign(FrameIndex)); |
| unsigned SpillSize = TRI->getSpillSize(*RC); |
| |
| MachineRegisterInfo &MRI = MF->getRegInfo(); |
| if (RI.isSGPRClass(RC)) { |
| MFI->setHasSpilledSGPRs(); |
| assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); |
| assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && |
| SrcReg != AMDGPU::EXEC && "exec should not be spilled"); |
| |
| // We are only allowed to create one new instruction when spilling |
| // registers, so we need to use a pseudo instruction for spilling SGPRs. |
| const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize)); |
| |
| // The SGPR spill/restore instructions only work on numbered SGPRs, so we |
| // need to make sure we are using the correct register class. |
| if (SrcReg.isVirtual() && SpillSize == 4) { |
| MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); |
| } |
| |
| BuildMI(MBB, MI, DL, OpDesc) |
| .addReg(SrcReg, getKillRegState(isKill)) // data |
| .addFrameIndex(FrameIndex) // addr |
| .addMemOperand(MMO) |
| .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); |
| |
| if (RI.spillSGPRToVGPR()) |
| FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); |
| return; |
| } |
| |
| unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) |
| : getVGPRSpillSaveOpcode(SpillSize); |
| MFI->setHasSpilledVGPRs(); |
| |
| if (RI.isVectorSuperClass(RC)) { |
| // Convert an AV spill into a VGPR spill. Introduce a copy from the AV |
| // register to an equivalent VGPR register beforehand. Regalloc might want |
| // to introduce AV spills that are only relevant until the rewriter, at |
| // which point they become either VGPR or AGPR spills. |
| Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); |
| BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg) |
| .addReg(SrcReg, RegState::Kill); |
| SrcReg = TmpVReg; |
| } |
| |
| BuildMI(MBB, MI, DL, get(Opcode)) |
| .addReg(SrcReg, getKillRegState(isKill)) // data |
| .addFrameIndex(FrameIndex) // addr |
| .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset |
| .addImm(0) // offset |
| .addMemOperand(MMO); |
| } |
| |
| static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { |
| switch (Size) { |
| case 4: |
| return AMDGPU::SI_SPILL_S32_RESTORE; |
| case 8: |
| return AMDGPU::SI_SPILL_S64_RESTORE; |
| case 12: |
| return AMDGPU::SI_SPILL_S96_RESTORE; |
| case 16: |
| return AMDGPU::SI_SPILL_S128_RESTORE; |
| case 20: |
| return AMDGPU::SI_SPILL_S160_RESTORE; |
| case 24: |
| return AMDGPU::SI_SPILL_S192_RESTORE; |
| case 28: |
| return AMDGPU::SI_SPILL_S224_RESTORE; |
| case 32: |
| return AMDGPU::SI_SPILL_S256_RESTORE; |
| case 64: |
| return AMDGPU::SI_SPILL_S512_RESTORE; |
| case 128: |
| return AMDGPU::SI_SPILL_S1024_RESTORE; |
| default: |
| llvm_unreachable("unknown register size"); |
| } |
| } |
| |
| static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { |
| switch (Size) { |
| case 4: |
| return AMDGPU::SI_SPILL_V32_RESTORE; |
| case 8: |
| return AMDGPU::SI_SPILL_V64_RESTORE; |
| case 12: |
| return AMDGPU::SI_SPILL_V96_RESTORE; |
| case 16: |
| return AMDGPU::SI_SPILL_V128_RESTORE; |
| case 20: |
| return AMDGPU::SI_SPILL_V160_RESTORE; |
| case 24: |
| return AMDGPU::SI_SPILL_V192_RESTORE; |
| case 28: |
| return AMDGPU::SI_SPILL_V224_RESTORE; |
| case 32: |
| return AMDGPU::SI_SPILL_V256_RESTORE; |
| case 64: |
| return AMDGPU::SI_SPILL_V512_RESTORE; |
| case 128: |
| return AMDGPU::SI_SPILL_V1024_RESTORE; |
| default: |
| llvm_unreachable("unknown register size"); |
| } |
| } |
| |
| static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { |
| switch (Size) { |
| case 4: |
| return AMDGPU::SI_SPILL_A32_RESTORE; |
| case 8: |
| return AMDGPU::SI_SPILL_A64_RESTORE; |
| case 12: |
| return AMDGPU::SI_SPILL_A96_RESTORE; |
| case 16: |
| return AMDGPU::SI_SPILL_A128_RESTORE; |
| case 20: |
| return AMDGPU::SI_SPILL_A160_RESTORE; |
| case 24: |
| return AMDGPU::SI_SPILL_A192_RESTORE; |
| case 28: |
| return AMDGPU::SI_SPILL_A224_RESTORE; |
| case 32: |
| return AMDGPU::SI_SPILL_A256_RESTORE; |
| case 64: |
| return AMDGPU::SI_SPILL_A512_RESTORE; |
| case 128: |
| return AMDGPU::SI_SPILL_A1024_RESTORE; |
| default: |
| llvm_unreachable("unknown register size"); |
| } |
| } |
| |
| void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI, |
| Register DestReg, int FrameIndex, |
| const TargetRegisterClass *RC, |
| const TargetRegisterInfo *TRI) const { |
| MachineFunction *MF = MBB.getParent(); |
| SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); |
| MachineFrameInfo &FrameInfo = MF->getFrameInfo(); |
| const DebugLoc &DL = MBB.findDebugLoc(MI); |
| unsigned SpillSize = TRI->getSpillSize(*RC); |
| |
| MachinePointerInfo PtrInfo |
| = MachinePointerInfo::getFixedStack(*MF, FrameIndex); |
| |
| MachineMemOperand *MMO = MF->getMachineMemOperand( |
| PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), |
| FrameInfo.getObjectAlign(FrameIndex)); |
| |
| if (RI.isSGPRClass(RC)) { |
| MFI->setHasSpilledSGPRs(); |
| assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); |
| assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && |
| DestReg != AMDGPU::EXEC && "exec should not be spilled"); |
| |
| // FIXME: Maybe this should not include a memoperand because it will be |
| // lowered to non-memory instructions. |
| const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); |
| if (DestReg.isVirtual() && SpillSize == 4) { |
| MachineRegisterInfo &MRI = MF->getRegInfo(); |
| MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); |
| } |
| |
| if (RI.spillSGPRToVGPR()) |
| FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill); |
| BuildMI(MBB, MI, DL, OpDesc, DestReg) |
| .addFrameIndex(FrameIndex) // addr |
| .addMemOperand(MMO) |
| .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit); |
| |
| return; |
| } |
| |
| unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize) |
| : getVGPRSpillRestoreOpcode(SpillSize); |
| |
| bool IsVectorSuperClass = RI.isVectorSuperClass(RC); |
| Register TmpReg = DestReg; |
| if (IsVectorSuperClass) { |
| // For AV classes, insert the spill restore to a VGPR followed by a copy |
| // into an equivalent AV register. |
| MachineRegisterInfo &MRI = MF->getRegInfo(); |
| DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); |
| } |
| BuildMI(MBB, MI, DL, get(Opcode), DestReg) |
| .addFrameIndex(FrameIndex) // vaddr |
| .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset |
| .addImm(0) // offset |
| .addMemOperand(MMO); |
| |
| if (IsVectorSuperClass) |
| BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg) |
| .addReg(DestReg, RegState::Kill); |
| } |
| |
| void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI) const { |
| insertNoops(MBB, MI, 1); |
| } |
| |
| void SIInstrInfo::insertNoops(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI, |
| unsigned Quantity) const { |
| DebugLoc DL = MBB.findDebugLoc(MI); |
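| // A single S_NOP covers at most 8 wait states and its immediate encodes the |
| // count minus one, so larger requests are split across several nops. |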
| while (Quantity > 0) { |
| unsigned Arg = std::min(Quantity, 8u); |
| Quantity -= Arg; |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1); |
| } |
| } |
| |
| void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { |
| auto MF = MBB.getParent(); |
| SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); |
| |
| assert(Info->isEntryFunction()); |
| |
| if (MBB.succ_empty()) { |
| bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); |
| if (HasNoTerminator) { |
| if (Info->returnsVoid()) { |
| BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0); |
| } else { |
| BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG)); |
| } |
| } |
| } |
| } |
| |
| unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) { |
| switch (MI.getOpcode()) { |
| default: |
| if (MI.isMetaInstruction()) |
| return 0; |
| return 1; // FIXME: Do wait states equal cycles? |
| |
| case AMDGPU::S_NOP: |
| return MI.getOperand(0).getImm() + 1; |
| |
| // FIXME: Any other pseudo instruction? |
| // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The |
| // hazard, even if one exists, won't really be visible. Should we handle it? |
| case AMDGPU::SI_MASKED_UNREACHABLE: |
| case AMDGPU::WAVE_BARRIER: |
| return 0; |
| } |
| } |
| |
| bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { |
| const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
| MachineBasicBlock &MBB = *MI.getParent(); |
| DebugLoc DL = MBB.findDebugLoc(MI); |
| switch (MI.getOpcode()) { |
| default: return TargetInstrInfo::expandPostRAPseudo(MI); |
| case AMDGPU::S_MOV_B64_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_MOV_B64)); |
| break; |
| |
| case AMDGPU::S_MOV_B32_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_MOV_B32)); |
| break; |
| |
| case AMDGPU::S_XOR_B64_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_XOR_B64)); |
| break; |
| |
| case AMDGPU::S_XOR_B32_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_XOR_B32)); |
| break; |
| case AMDGPU::S_OR_B64_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_OR_B64)); |
| break; |
| case AMDGPU::S_OR_B32_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_OR_B32)); |
| break; |
| |
| case AMDGPU::S_ANDN2_B64_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_ANDN2_B64)); |
| break; |
| |
| case AMDGPU::S_ANDN2_B32_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_ANDN2_B32)); |
| break; |
| |
| case AMDGPU::S_AND_B64_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_AND_B64)); |
| break; |
| |
| case AMDGPU::S_AND_B32_term: |
| // This is only a terminator to get the correct spill code placement during |
| // register allocation. |
| MI.setDesc(get(AMDGPU::S_AND_B32)); |
| break; |
| |
| case AMDGPU::V_MOV_B64_PSEUDO: { |
| Register Dst = MI.getOperand(0).getReg(); |
| Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); |
| Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); |
| |
| const MachineOperand &SrcOp = MI.getOperand(1); |
| // FIXME: Will this work for 64-bit floating point immediates? |
| assert(!SrcOp.isFPImm()); |
| if (SrcOp.isImm()) { |
| APInt Imm(64, SrcOp.getImm()); |
| APInt Lo(32, Imm.getLoBits(32).getZExtValue()); |
| APInt Hi(32, Imm.getHiBits(32).getZExtValue()); |
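| // If both 32-bit halves are the same inline constant, a subtarget with packed |
| // FP32 ops can materialize the value with a single V_PK_MOV_B32; otherwise |
| // fall back to two 32-bit moves. |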
| if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) |
| .addImm(SISrcMods::OP_SEL_1) |
| .addImm(Lo.getSExtValue()) |
| .addImm(SISrcMods::OP_SEL_1) |
| .addImm(Lo.getSExtValue()) |
| .addImm(0) // op_sel_lo |
| .addImm(0) // op_sel_hi |
| .addImm(0) // neg_lo |
| .addImm(0) // neg_hi |
| .addImm(0); // clamp |
| } else { |
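| // Each half-write carries an implicit def of the full 64-bit register so |
| // that Dst is seen as fully defined by the pair of moves. |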
| BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) |
| .addImm(Lo.getSExtValue()) |
| .addReg(Dst, RegState::Implicit | RegState::Define); |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) |
| .addImm(Hi.getSExtValue()) |
| .addReg(Dst, RegState::Implicit | RegState::Define); |
| } |
| } else { |
| assert(SrcOp.isReg()); |
| if (ST.hasPackedFP32Ops() && |
| !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) |
| .addImm(SISrcMods::OP_SEL_1) // src0_mod |
| .addReg(SrcOp.getReg()) |
| .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod |
| .addReg(SrcOp.getReg()) |
| .addImm(0) // op_sel_lo |
| .addImm(0) // op_sel_hi |
| .addImm(0) // neg_lo |
| .addImm(0) // neg_hi |
| .addImm(0); // clamp |
| } else { |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) |
| .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) |
| .addReg(Dst, RegState::Implicit | RegState::Define); |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) |
| .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) |
| .addReg(Dst, RegState::Implicit | RegState::Define); |
| } |
| } |
| MI.eraseFromParent(); |
| break; |
| } |
| case AMDGPU::V_MOV_B64_DPP_PSEUDO: { |
| expandMovDPP64(MI); |
| break; |
| } |
| case AMDGPU::S_MOV_B64_IMM_PSEUDO: { |
| const MachineOperand &SrcOp = MI.getOperand(1); |
| assert(!SrcOp.isFPImm()); |
| APInt Imm(64, SrcOp.getImm()); |
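| // A value that fits in 32 bits or is an inline constant can still be encoded |
| // by a single S_MOV_B64; only genuine 64-bit literals need the two-instruction |
| // split below. |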
| if (Imm.isIntN(32) || isInlineConstant(Imm)) { |
| MI.setDesc(get(AMDGPU::S_MOV_B64)); |
| break; |
| } |
| |
| Register Dst = MI.getOperand(0).getReg(); |
| Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); |
| Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1); |
| |
| APInt Lo(32, Imm.getLoBits(32).getZExtValue()); |
| APInt Hi(32, Imm.getHiBits(32).getZExtValue()); |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) |
| .addImm(Lo.getSExtValue()) |
| .addReg(Dst, RegState::Implicit | RegState::Define); |
| BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) |
| .addImm(Hi.getSExtValue()) |
| .addReg(Dst, RegState::Implicit | RegState::Define); |
| MI.eraseFromParent(); |
| break; |
| } |
| case AMDGPU::V_SET_INACTIVE_B32: { |
| unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; |
| unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
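| // Invert exec so only the previously inactive lanes run, write the 'inactive' |
| // value (operand 2) into them, then invert exec back. |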
| auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); |
| FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg()) |
| .add(MI.getOperand(2)); |
| BuildMI(MBB, MI, DL, get(NotOpc), Exec) |
| .addReg(Exec); |
| MI.eraseFromParent(); |
| break; |
| } |
| case AMDGPU::V_SET_INACTIVE_B64: { |
| unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64; |
| unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
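| // Same idea as V_SET_INACTIVE_B32, but the 64-bit write goes through |
| // V_MOV_B64_PSEUDO, which is expanded immediately below. |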
| auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec); |
| FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten |
| MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), |
| MI.getOperand(0).getReg()) |
| .add(MI.getOperand(2)); |
| expandPostRAPseudo(*Copy); |
| BuildMI(MBB, MI, DL, get(NotOpc), Exec) |
| .addReg(Exec); |
| MI.eraseFromParent(); |
| break; |
| } |
| case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1: |
| case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2: |
| case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3: |
| case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4: |
| case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5: |
| case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8: |
| case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16: |
| case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8: |
| case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: { |
| const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); |
| |
| unsigned Opc; |
| if (RI.hasVGPRs(EltRC)) { |
| Opc = AMDGPU::V_MOVRELD_B32_e32; |
| } else { |
| Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64 |
| : AMDGPU::S_MOVRELD_B32; |
| } |
| |
| const MCInstrDesc &OpDesc = get(Opc); |
| Register VecReg = MI.getOperand(0).getReg(); |
| bool IsUndef = MI.getOperand(1).isUndef(); |
| unsigned SubReg = MI.getOperand(3).getImm(); |
| assert(VecReg == MI.getOperand(1).getReg()); |
| |
| MachineInstrBuilder MIB = |
| BuildMI(MBB, MI, DL, OpDesc) |
| .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) |
| .add(MI.getOperand(2)) |
| .addReg(VecReg, RegState::ImplicitDefine) |
| .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); |
| |
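| // Tie the implicit use of VecReg to its implicit def: the movrel writes only |
| // the selected element, so the rest of the vector is effectively read and |
| // preserved. |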
| const int ImpDefIdx = |
| OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); |
| const int ImpUseIdx = ImpDefIdx + 1; |
| MIB->tieOperands(ImpDefIdx, ImpUseIdx); |
| MI.eraseFromParent(); |
| break; |
| } |
| case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1: |
| case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2: |
| case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3: |
| case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4: |
| case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5: |
| case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8: |
| case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16: |
| case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: { |
| assert(ST.useVGPRIndexMode()); |
| Register VecReg = MI.getOperand(0).getReg(); |
| bool IsUndef = MI.getOperand(1).isUndef(); |
| Register Idx = MI.getOperand(3).getReg(); |
| unsigned SubReg = MI.getOperand(4).getImm(); |
| |
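| // Wrap the indirect write in S_SET_GPR_IDX_ON/OFF and bundle the sequence so |
| // nothing can be scheduled between the index-mode switch and the write. |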
| MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) |
| .addReg(Idx) |
| .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); |
| SetOn->getOperand(3).setIsUndef(); |
| |
| const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write); |
| MachineInstrBuilder MIB = |
| BuildMI(MBB, MI, DL, OpDesc) |
| .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) |
| .add(MI.getOperand(2)) |
| .addReg(VecReg, RegState::ImplicitDefine) |
| .addReg(VecReg, |
| RegState::Implicit | (IsUndef ? RegState::Undef : 0)); |
| |
| const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); |
| const int ImpUseIdx = ImpDefIdx + 1; |
| MIB->tieOperands(ImpDefIdx, ImpUseIdx); |
| |
| MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); |
| |
| finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); |
| |
| MI.eraseFromParent(); |
| break; |
| } |
| case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1: |
| case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2: |
| case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3: |
| case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4: |
| case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5: |
| case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8: |
| case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16: |
| case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: { |
| assert(ST.useVGPRIndexMode()); |
| Register Dst = MI.getOperand(0).getReg(); |
| Register VecReg = MI.getOperand(1).getReg(); |
| bool IsUndef = MI.getOperand(1).isUndef(); |
| Register Idx = MI.getOperand(2).getReg(); |
| unsigned SubReg = MI.getOperand(3).getImm(); |
| |
| MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON)) |
| .addReg(Idx) |
| .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE); |
| SetOn->getOperand(3).setIsUndef(); |
| |
| BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read)) |
| .addDef(Dst) |
| .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) |
| .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); |
| |
| MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF)); |
| |
| finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator())); |
| |
| MI.eraseFromParent(); |
| break; |
| } |
| case AMDGPU::SI_PC_ADD_REL_OFFSET: { |
| MachineFunction &MF = *MBB.getParent(); |
| Register Reg = MI.getOperand(0).getReg(); |
| Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); |
| Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); |
| |
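| // The expansion is roughly: |
| //   s_getpc_b64 reg |
| //   s_add_u32   reg.lo, reg.lo, <operand 1>  ; low 32 bits of the offset |
| //   s_addc_u32  reg.hi, reg.hi, <operand 2>  ; high 32 bits of the offset |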
| // Create a bundle so these instructions won't be re-ordered by the |
| // post-RA scheduler. |
| MIBundleBuilder Bundler(MBB, MI); |
| Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); |
| |
| // Add 32-bit offset from this instruction to the start of the |
| // constant data. |
| Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) |
| .addReg(RegLo) |
| .add(MI.getOperand(1))); |
| |
| MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) |
| .addReg(RegHi); |
| MIB.add(MI.getOperand(2)); |
| |
| Bundler.append(MIB); |
| finalizeBundle(MBB, Bundler.begin()); |
| |
| MI.eraseFromParent(); |
| break; |
| } |
| case AMDGPU::ENTER_STRICT_WWM: { |
| // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when |
| // Whole Wave Mode is entered. |
| MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 |
| : AMDGPU::S_OR_SAVEEXEC_B64)); |
| break; |
| } |
| case AMDGPU::ENTER_STRICT_WQM: { |
| // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when |
| // STRICT_WQM is entered. |
| const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
| const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64; |
| const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
| BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec); |
| BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec); |
| |
| MI.eraseFromParent(); |
| break; |
| } |
| case AMDGPU::EXIT_STRICT_WWM: |
| case AMDGPU::EXIT_STRICT_WQM: { |
| // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when |
| // WWM/STRICT_WQM is exited. |
| MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); |
| break; |
| } |
| } |
| return true; |
| } |
| |
| std::pair<MachineInstr*, MachineInstr*> |
| SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { |
| assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); |
| |
| MachineBasicBlock &MBB = *MI.getParent(); |
| DebugLoc DL = MBB.findDebugLoc(MI); |
| MachineFunction *MF = MBB.getParent(); |
| MachineRegisterInfo &MRI = MF->getRegInfo(); |
| Register Dst = MI.getOperand(0).getReg(); |
| unsigned Part = 0; |
| MachineInstr *Split[2]; |
| |
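| // Split the 64-bit DPP move into two V_MOV_B32_dpp on sub0 and sub1; for a |
| // virtual destination the halves are recombined with a REG_SEQUENCE below. |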
| for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) { |
| auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp)); |
| if (Dst.isPhysical()) { |
| MovDPP.addDef(RI.getSubReg(Dst, Sub)); |
| } else { |
| assert(MRI.isSSA()); |
| auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
| MovDPP.addDef(Tmp); |
| } |
| |
| for (unsigned I = 1; I <= 2; ++I) { // old and src operands. |
| const MachineOperand &SrcOp = MI.getOperand(I); |
| assert(!SrcOp.isFPImm()); |
| if (SrcOp.isImm()) { |
| APInt Imm(64, SrcOp.getImm()); |
| Imm.ashrInPlace(Part * 32); |
| MovDPP.addImm(Imm.getLoBits(32).getZExtValue()); |
| } else { |
| assert(SrcOp.isReg()); |
| Register Src = SrcOp.getReg(); |
| if (Src.isPhysical()) |
| MovDPP.addReg(RI.getSubReg(Src, Sub)); |
| else |
| MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub); |
| } |
| } |
| |
| for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I) |
| MovDPP.addImm(MI.getOperand(I).getImm()); |
| |
| Split[Part] = MovDPP; |
| ++Part; |
| } |
| |
| if (Dst.isVirtual()) |
| BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst) |
| .addReg(Split[0]->getOperand(0).getReg()) |
| .addImm(AMDGPU::sub0) |
| .addReg(Split[1]->getOperand(0).getReg()) |
| .addImm(AMDGPU::sub1); |
| |
| MI.eraseFromParent(); |
| return std::make_pair(Split[0], Split[1]); |
| } |
| |
| bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, |
| MachineOperand &Src0, |
| unsigned Src0OpName, |
| MachineOperand &Src1, |
| unsigned Src1OpName) const { |
| MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName); |
| if (!Src0Mods) |
| return false; |
| |
| MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName); |
| assert(Src1Mods && |
| "All commutable instructions have both src0 and src1 modifiers"); |
| |
| int Src0ModsVal = Src0Mods->getImm(); |
| int Src1ModsVal = Src1Mods->getImm(); |
| |
| Src1Mods->setImm(Src0ModsVal); |
| Src0Mods->setImm(Src1ModsVal); |
| return true; |
| } |
| |
| static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, |
| MachineOperand &RegOp, |
| MachineOperand &NonRegOp) { |
| Register Reg = RegOp.getReg(); |
| unsigned SubReg = RegOp.getSubReg(); |
| bool IsKill = RegOp.isKill(); |
| bool IsDead = RegOp.isDead(); |
| bool IsUndef = RegOp.isUndef(); |
| bool IsDebug = RegOp.isDebug(); |
| |
| if (NonRegOp.isImm()) |
| RegOp.ChangeToImmediate(NonRegOp.getImm()); |
| else if (NonRegOp.isFI()) |
| RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); |
| else if (NonRegOp.isGlobal()) { |
| RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), |
| NonRegOp.getTargetFlags()); |
| } else |
| return nullptr; |
| |
| // Make sure we don't reinterpret a subreg index in the target flags. |
| RegOp.setTargetFlags(NonRegOp.getTargetFlags()); |
| |
| NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); |
| NonRegOp.setSubReg(SubReg); |
| |
| return &MI; |
| } |
| |
| MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, |
| unsigned Src0Idx, |
| unsigned Src1Idx) const { |
| assert(!NewMI && "this should never be used"); |
| |
| unsigned Opc = MI.getOpcode(); |
| int CommutedOpcode = commuteOpcode(Opc); |
| if (CommutedOpcode == -1) |
| return nullptr; |
| |
| assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == |
| static_cast<int>(Src0Idx) && |
| AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == |
| static_cast<int>(Src1Idx) && |
| "inconsistency with findCommutedOpIndices"); |
| |
| MachineOperand &Src0 = MI.getOperand(Src0Idx); |
| MachineOperand &Src1 = MI.getOperand(Src1Idx); |
| |
| MachineInstr *CommutedMI = nullptr; |
| if (Src0.isReg() && Src1.isReg()) { |
| if (isOperandLegal(MI, Src1Idx, &Src0)) { |
| // Be sure to copy the source modifiers to the right place. |
| CommutedMI |
| = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx); |
| } |
| |
| } else if (Src0.isReg() && !Src1.isReg()) { |
| // src0 should always be able to support any operand type, so no need to |
| // check operand legality. |
| CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1); |
| } else if (!Src0.isReg() && Src1.isReg()) { |
| if (isOperandLegal(MI, Src1Idx, &Src0)) |
| CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0); |
| } else { |
| // FIXME: Found two non registers to commute. This does happen. |
| return nullptr; |
| } |
| |
| if (CommutedMI) { |
| swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers, |
| Src1, AMDGPU::OpName::src1_modifiers); |
| |
| CommutedMI->setDesc(get(CommutedOpcode)); |
| } |
| |
| return CommutedMI; |
| } |
| |
| // This needs to be overridden because source modifier operands may be |
| // inserted between the true commutable operands, and the base |
| // TargetInstrInfo::commuteInstruction relies on this hook to locate them. |
| bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI, |
| unsigned &SrcOpIdx0, |
| unsigned &SrcOpIdx1) const { |
| return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1); |
| } |
| |
| bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0, |
| unsigned &SrcOpIdx1) const { |
| if (!Desc.isCommutable()) |
| return false; |
| |
| unsigned Opc = Desc.getOpcode(); |
| int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); |
| if (Src0Idx == -1) |
| return false; |
| |
| int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); |
| if (Src1Idx == -1) |
| return false; |
| |
| return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); |
| } |
| |
| bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, |
| int64_t BrOffset) const { |
| // BranchRelaxation should never have to check s_setpc_b64 because its dest |
| // block is unanalyzable. |
| assert(BranchOp != AMDGPU::S_SETPC_B64); |
| |
| // Convert to dwords. |
| BrOffset /= 4; |
| |
| // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is |
| // from the next instruction. |
| BrOffset -= 1; |
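| // E.g. with the default 16-bit field, a forward branch can reach at most |
| // (0x7fff + 1) * 4 = 128 KiB beyond the branch instruction itself. |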
| |
| return isIntN(BranchOffsetBits, BrOffset); |
| } |
| |
| MachineBasicBlock *SIInstrInfo::getBranchDestBlock( |
| const MachineInstr &MI) const { |
| if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { |
| // This would be a difficult analysis to perform, but it can always be legal, |
| // so there is no need to analyze it. |
| return nullptr; |
| } |
| |
| return MI.getOperand(0).getMBB(); |
| } |
| |
| void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, |
| MachineBasicBlock &DestBB, |
| MachineBasicBlock &RestoreBB, |
| const DebugLoc &DL, int64_t BrOffset, |
| RegScavenger *RS) const { |
| assert(RS && "RegScavenger required for long branching"); |
| assert(MBB.empty() && |
| "new block should be inserted for expanding unconditional branch"); |
| assert(MBB.pred_size() == 1); |
| assert(RestoreBB.empty() && |
| "restore block should be inserted for restoring clobbered registers"); |
| |
| MachineFunction *MF = MBB.getParent(); |
| MachineRegisterInfo &MRI = MF->getRegInfo(); |
| |
| // FIXME: Virtual register workaround for RegScavenger not working with empty |
| // blocks. |
| Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); |
| |
| auto I = MBB.end(); |
| |
| // We need to compute the offset relative to the instruction immediately after |
| // s_getpc_b64. Insert the PC arithmetic code before the last terminator. |
| MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); |
| |
| auto &MCCtx = MF->getContext(); |
| MCSymbol *PostGetPCLabel = |
| MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); |
| GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); |
| |
| MCSymbol *OffsetLo = |
| MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); |
| MCSymbol *OffsetHi = |
| MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); |
| BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) |
| .addReg(PCReg, RegState::Define, AMDGPU::sub0) |
| .addReg(PCReg, 0, AMDGPU::sub0) |
| .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); |
| BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) |
| .addReg(PCReg, RegState::Define, AMDGPU::sub1) |
| .addReg(PCReg, 0, AMDGPU::sub1) |
| .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); |
| |
| // Insert the indirect branch after the other terminator. |
| BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) |
| .addReg(PCReg); |
| |
| // FIXME: If spilling is necessary, this will fail because this scavenger has |
| // no emergency stack slots. It is non-trivial to spill in this situation, |
| // because the restore code needs to be specially placed after the |
| // jump. BranchRelaxation then needs to be made aware of the newly inserted |
| // block. |
| // |
| // If a spill is needed for the pc register pair, we need to insert a spill |
| // restore block right before the destination block, and insert a short branch |
| // into the old destination block's fallthrough predecessor. |
| // e.g.: |
| // |
| // s_cbranch_scc0 skip_long_branch: |
| // |
| // long_branch_bb: |
| // spill s[8:9] |
| // s_getpc_b64 s[8:9] |
| // s_add_u32 s8, s8, restore_bb |
| // s_addc_u32 s9, s9, 0 |
| // s_setpc_b64 s[8:9] |
| // |
| // skip_long_branch: |
| // foo; |
| // |
| // ..... |
| // |
| // dest_bb_fallthrough_predecessor: |
| // bar; |
| // s_branch dest_bb |
| // |
| // restore_bb: |
| // restore s[8:9] |
| // fallthrough dest_bb |
| // |
| // dest_bb: |
| // buzz; |
| |
| RS->enterBasicBlockEnd(MBB); |
| Register Scav = RS->scavengeRegisterBackwards( |
| AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC), |
| /* RestoreAfter */ false, 0, /* AllowSpill */ false); |
| if (Scav) { |
| RS->setRegUsed(Scav); |
| MRI.replaceRegWith(PCReg, Scav); |
| MRI.clearVirtRegs(); |
| } else { |
| // An SGPR can only be spilled through a VGPR, so reuse the temporary VGPR's |
| // spill slot for the SGPR spill. |
| const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
| const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
| TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS); |
| MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1); |
| MRI.clearVirtRegs(); |
| } |
| |
| MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol(); |
| // Now that the destination label is known, the offset can be defined. |
| auto *Offset = MCBinaryExpr::createSub( |
| MCSymbolRefExpr::create(DestLabel, MCCtx), |
| MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); |
| // Assign the low and high 32 bits of the offset to the OffsetLo and OffsetHi |
| // symbols. |
| auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); |
| OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); |
| auto *ShAmt = MCConstantExpr::create(32, MCCtx); |
| OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); |
| |
| return; |
| } |
| |
| unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { |
| switch (Cond) { |
| case SIInstrInfo::SCC_TRUE: |
| return AMDGPU::S_CBRANCH_SCC1; |
| case SIInstrInfo::SCC_FALSE: |
| return AMDGPU::S_CBRANCH_SCC0; |
| case SIInstrInfo::VCCNZ: |
| return AMDGPU::S_CBRANCH_VCCNZ; |
| case SIInstrInfo::VCCZ: |
| return AMDGPU::S_CBRANCH_VCCZ; |
| case SIInstrInfo::EXECNZ: |
| return AMDGPU::S_CBRANCH_EXECNZ; |
| case SIInstrInfo::EXECZ: |
| return AMDGPU::S_CBRANCH_EXECZ; |
| default: |
| llvm_unreachable("invalid branch predicate"); |
| } |
| } |
| |
| SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) { |
| switch (Opcode) { |
| case AMDGPU::S_CBRANCH_SCC0: |
| return SCC_FALSE; |
| case AMDGPU::S_CBRANCH_SCC1: |
| return SCC_TRUE; |
| case AMDGPU::S_CBRANCH_VCCNZ: |
| return VCCNZ; |
| case AMDGPU::S_CBRANCH_VCCZ: |
| return VCCZ; |
| case AMDGPU::S_CBRANCH_EXECNZ: |
| return EXECNZ; |
| case AMDGPU::S_CBRANCH_EXECZ: |
| return EXECZ; |
| default: |
| return INVALID_BR; |
| } |
| } |
| |
| bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator I, |
| MachineBasicBlock *&TBB, |
| MachineBasicBlock *&FBB, |
| SmallVectorImpl<MachineOperand> &Cond, |
| bool AllowModify) const { |
| if (I->getOpcode() == AMDGPU::S_BRANCH) { |
| // Unconditional Branch |
| TBB = I->getOperand(0).getMBB(); |
| return false; |
| } |
| |
| MachineBasicBlock *CondBB = nullptr; |
| |
| if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { |
| CondBB = I->getOperand(1).getMBB(); |
| Cond.push_back(I->getOperand(0)); |
| } else { |
| BranchPredicate Pred = getBranchPredicate(I->getOpcode()); |
| if (Pred == INVALID_BR) |
| return true; |
| |
| CondBB = I->getOperand(0).getMBB(); |
| Cond.push_back(MachineOperand::CreateImm(Pred)); |
| Cond.push_back(I->getOperand(1)); // Save the branch register. |
| } |
| ++I; |
| |
| if (I == MBB.end()) { |
| // Conditional branch followed by fall-through. |
| TBB = CondBB; |
| return false; |
| } |
| |
| if (I->getOpcode() == AMDGPU::S_BRANCH) { |
| TBB = CondBB; |
| FBB = I->getOperand(0).getMBB(); |
| return false; |
| } |
| |
| return true; |
| } |
| |
| bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, |
| MachineBasicBlock *&FBB, |
| SmallVectorImpl<MachineOperand> &Cond, |
| bool AllowModify) const { |
| MachineBasicBlock::iterator I = MBB.getFirstTerminator(); |
| auto E = MBB.end(); |
| if (I == E) |
| return false; |
| |
| // Skip over the instructions that are artificially terminators for special |
| // exec management. |
| while (I != E && !I->isBranch() && !I->isReturn()) { |
| switch (I->getOpcode()) { |
| case AMDGPU::S_MOV_B64_term: |
| case AMDGPU::S_XOR_B64_term: |
| case AMDGPU::S_OR_B64_term: |
| case AMDGPU::S_ANDN2_B64_term: |
| case AMDGPU::S_AND_B64_term: |
| case AMDGPU::S_MOV_B32_term: |
| case AMDGPU::S_XOR_B32_term: |
| case AMDGPU::S_OR_B32_term: |
| case AMDGPU::S_ANDN2_B32_term: |
| case AMDGPU::S_AND_B32_term: |
| break; |
| case AMDGPU::SI_IF: |
| case AMDGPU::SI_ELSE: |
| case AMDGPU::SI_KILL_I1_TERMINATOR: |
| case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: |
| // FIXME: It's messy that these need to be considered here at all. |
| return true; |
| default: |
| llvm_unreachable("unexpected non-branch terminator inst"); |
| } |
| |
| ++I; |
| } |
| |
| if (I == E) |
| return false; |
| |
| return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify); |
| } |
| |
| unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB, |
| int *BytesRemoved) const { |
| unsigned Count = 0; |
| unsigned RemovedSize = 0; |
| for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) { |
| // Skip over artificial terminators when removing instructions. |
| if (MI.isBranch() || MI.isReturn()) { |
| RemovedSize += getInstSizeInBytes(MI); |
| MI.eraseFromParent(); |
| ++Count; |
| } |
| } |
| |
| if (BytesRemoved) |
| *BytesRemoved = RemovedSize; |
| |
| return Count; |
| } |
| |
| // Copy the flags onto the implicit condition register operand. |
| static void preserveCondRegFlags(MachineOperand &CondReg, |
| const MachineOperand &OrigCond) { |
| CondReg.setIsUndef(OrigCond.isUndef()); |
| CondReg.setIsKill(OrigCond.isKill()); |
| } |
| |
| unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, |
| MachineBasicBlock *TBB, |
| MachineBasicBlock *FBB, |
| ArrayRef<MachineOperand> Cond, |
| const DebugLoc &DL, |
| int *BytesAdded) const { |
| if (!FBB && Cond.empty()) { |
| BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) |
| .addMBB(TBB); |
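| // On subtargets with the offset-0x3f hardware bug the workaround may need an |
| // extra instruction, so count each branch as 8 bytes there. |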
| if (BytesAdded) |
| *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; |
| return 1; |
| } |
| |
| if (Cond.size() == 1 && Cond[0].isReg()) { |
| BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) |
| .add(Cond[0]) |
| .addMBB(TBB); |
| return 1; |
| } |
| |
| assert(TBB && Cond[0].isImm()); |
| |
| unsigned Opcode |
| = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); |
| |
| if (!FBB) { |
| MachineInstr *CondBr = |
| BuildMI(&MBB, DL, get(Opcode)) |
| .addMBB(TBB); |
| |
| // Copy the flags onto the implicit condition register operand. |
| preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); |
| fixImplicitOperands(*CondBr); |
| |
| if (BytesAdded) |
| *BytesAdded = ST.hasOffset3fBug() ? 8 : 4; |
| return 1; |
| } |
| |
| assert(TBB && FBB); |
| |
| MachineInstr *CondBr = |
| BuildMI(&MBB, DL, get(Opcode)) |
| .addMBB(TBB); |
| fixImplicitOperands(*CondBr); |
| BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH)) |
| .addMBB(FBB); |
| |
| // Copy the flags onto the implicit condition register operand. |
| preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); |
| |
| if (BytesAdded) |
| *BytesAdded = ST.hasOffset3fBug() ? 16 : 8; |
| |
| return 2; |
| } |
| |
| bool SIInstrInfo::reverseBranchCondition( |
| SmallVectorImpl<MachineOperand> &Cond) const { |
| if (Cond.size() != 2) { |
| return true; |
| } |
| |
|