//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

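// Dispatch every recognized form of s_waitcnt to processWaitCnt() so that the
// counter immediates survive lowering to mca::Instruction.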
void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering.
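// For example, "s_waitcnt vmcnt(0) lgkmcnt(0)" packs all of its counters into
// a single immediate; keeping that raw operand on the mca::Instruction is what
// allows computeWaitCnt() to decode it later.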
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

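// Returns the number of cycles the instruction at IR must stall because of an
// unsatisfied s_waitcnt, or 0 if no custom hazard applies to it.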
unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so pseudo
  // instructions should not normally appear here. However, there are plans to
  // make mca usable from within backend passes, so the pseudo forms of
  // s_waitcnt are kept in this switch as well.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This instruction
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit s_waitcnt 0, but it is
    // unclear whether modelling that here would be appropriate given how
    // llvm-mca repeatedly simulates the same instruction sequence.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

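// Compares the counter thresholds requested by the s_waitcnt at IR against the
// instructions that are still executing, and returns how many cycles the wait
// has to stall (0 if it can proceed now).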
unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // Currently, every s_waitcnt variant is handled here except
  // s_waitcnt_depctr, which is not modelled.
  // Start each counter at its maximum value.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) {
    const InstRef &PrevIR = *I;
    const Instruction &PrevInst = *PrevIR.getInstruction();
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.
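  // For example, with "s_waitcnt vmcnt(0)" and two outstanding VMEM loads that
  // have 3 and 7 cycles left, we return 3; three cycles later this hazard is
  // re-checked and the remaining load forces another stall.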

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

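// Extracts the counter thresholds requested by the s_waitcnt instruction at IR
// into Vmcnt, Expcnt, Lgkmcnt and Vscnt. Counters the instruction does not
// mention keep the caller-provided maximum values.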
void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // The asserts below assume these operands are present and well formed;
    // there is currently no graceful fallback if one of them is null.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored. So the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // A second switch keeps the operand handling above from being repeated in
    // every case; this could be restructured to avoid the extra switch.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic of this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass the instructions
  // being looked at are MachineInstrs, whereas here we only have access to the
  // MCInst format. The side effect is that we cannot use the
  // mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst) helpers,
  // so we conservatively assume that they would return true. This may cause a
  // few instructions to be tagged with an extra CNT, but since those
  // instructions already interact with at least one CNT, the extra tag should
  // not cause issues in most scenarios.
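  // For example, a FLAT load is conservatively tagged with both LgkmCnt and
  // VmCnt below, even when it would only ever access VMEM.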
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  int Index = 0;
  for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
    const std::unique_ptr<Instruction> &Inst = *I;
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
         Opcode == AMDGPU::DS_GWS_SEMA_P ||
         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
         Opcode == AMDGPU::DS_GWS_BARRIER;
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

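// Factory functions handed to the TargetRegistry below; llvm-mca calls them to
// construct the AMDGPU-specific CustomBehaviour and InstrPostProcess objects.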
static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to initialize the MCA components for the AMDGPU targets.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}
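
// With these hooks registered, llvm-mca can model s_waitcnt stalls through
// AMDGPUCustomBehaviour, e.g. with an invocation such as
//   llvm-mca -mtriple=amdgcn -mcpu=gfx1010 input.s
// (the triple and CPU above are only an illustrative example of a supported
// subtarget).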