| //===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| /// \file |
| /// This contains a MachineSchedStrategy implementation for maximizing wave |
| /// occupancy on GCN hardware. |
| /// |
| /// This pass will apply multiple scheduling stages to the same function. |
| /// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual |
| /// entry point for the scheduling of those regions is |
| /// GCNScheduleDAGMILive::runSchedStages. |
| /// |
| /// Generally, the reason for having multiple scheduling stages is to account |
| /// for the kernel-wide effect of register usage on occupancy. Usually, only a |
| /// few scheduling regions will have register pressure high enough to limit |
| /// occupancy for the kernel, so constraints can be relaxed to improve ILP in |
| /// other regions. |
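| /// For example, GCNMaxOccupancySchedStrategy first runs OccInitialSchedule |
| /// over every region, then revisits regions with the |
| /// UnclusteredHighRPReschedule, ClusteredLowOccupancyReschedule, and |
| /// PreRARematerialize stages defined below. |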
| /// |
| //===----------------------------------------------------------------------===// |
| |
| #include "GCNSchedStrategy.h" |
| #include "AMDGPUIGroupLP.h" |
| #include "GCNRegPressure.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "Utils/AMDGPUBaseInfo.h" |
| #include "llvm/ADT/BitVector.h" |
| #include "llvm/ADT/STLExtras.h" |
| #include "llvm/CodeGen/CalcSpillWeights.h" |
| #include "llvm/CodeGen/MachineBasicBlock.h" |
| #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" |
| #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" |
| #include "llvm/CodeGen/MachineCycleAnalysis.h" |
| #include "llvm/CodeGen/MachineOperand.h" |
| #include "llvm/CodeGen/RegisterClassInfo.h" |
| #include "llvm/MC/LaneBitmask.h" |
| #include "llvm/MC/MCInstrItineraries.h" |
| #include "llvm/MC/MCSchedule.h" |
| #include "llvm/MC/TargetRegistry.h" |
| #include "llvm/Support/ErrorHandling.h" |
| |
| #define DEBUG_TYPE "machine-scheduler" |
| |
| using namespace llvm; |
| |
| static cl::opt<bool> DisableUnclusterHighRP( |
| "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, |
| cl::desc("Disable unclustered high register pressure " |
| "reduction scheduling stage."), |
| cl::init(false)); |
| |
| static cl::opt<bool> DisableClusteredLowOccupancy( |
| "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, |
| cl::desc("Disable clustered low occupancy " |
| "rescheduling for ILP scheduling stage."), |
| cl::init(false)); |
| |
| static cl::opt<unsigned> ScheduleMetricBias( |
| "amdgpu-schedule-metric-bias", cl::Hidden, |
| cl::desc( |
| "Sets the bias which adds weight to occupancy vs latency. Set it to " |
| "100 to chase the occupancy only."), |
| cl::init(10)); |
| |
| static cl::opt<bool> |
| RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, |
| cl::desc("Relax occupancy targets for kernels which are memory " |
| "bound (amdgpu-membound-threshold), or " |
| "Wave Limited (amdgpu-limit-wave-threshold)."), |
| cl::init(false)); |
| |
| static cl::opt<bool> GCNTrackers( |
| "amdgpu-use-amdgpu-trackers", cl::Hidden, |
| cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), |
| cl::init(false)); |
| |
| static cl::opt<unsigned> PendingQueueLimit( |
| "amdgpu-scheduler-pending-queue-limit", cl::Hidden, |
| cl::desc( |
| "Max (Available+Pending) size to inspect pending queue (0 disables)"), |
| cl::init(256)); |
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| #define DUMP_MAX_REG_PRESSURE |
| static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler( |
| "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, |
| cl::desc("Print a list of live registers along with their def/uses at the " |
| "point of maximum register pressure before scheduling."), |
| cl::init(false)); |
| |
| static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler( |
| "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, |
| cl::desc("Print a list of live registers along with their def/uses at the " |
| "point of maximum register pressure after scheduling."), |
| cl::init(false)); |
| #endif |
| |
| static cl::opt<bool> DisableRewriteMFMAFormSchedStage( |
| "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, |
| cl::desc("Disable the MFMA form rewrite scheduling stage"), cl::init(true)); |
| |
| const unsigned ScheduleMetrics::ScaleFactor = 100; |
| |
| GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C) |
| : GenericScheduler(C), TargetOccupancy(0), MF(nullptr), |
| DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) { |
| } |
| |
| void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) { |
| GenericScheduler::initialize(DAG); |
| |
| MF = &DAG->MF; |
| |
| const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); |
| |
| SGPRExcessLimit = |
| Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass); |
| VGPRExcessLimit = |
| Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass); |
| |
| SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>(); |
| // Set the initial TargetOccupancy to the maximum occupancy that we can |
| // achieve for this function. This effectively sets a lower bound on the |
| // 'Critical' register limits in the scheduler. |
| // Allow for lower occupancy targets if kernel is wave limited or memory |
| // bound, and using the relaxed occupancy feature. |
| TargetOccupancy = |
| RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy(); |
| SGPRCriticalLimit = |
| std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit); |
| |
| if (!KnownExcessRP) { |
| VGPRCriticalLimit = std::min( |
| ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()), |
| VGPRExcessLimit); |
| } else { |
| // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except |
| // that it returns a reasonably small number for targets with lots of VGPRs, |
| // such as GFX10 and GFX11. |
| LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative " |
| "VGPRCriticalLimit calculation method.\n"); |
| unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize(); |
| unsigned Granule = |
| AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize); |
| unsigned Addressable = |
| AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize); |
| unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule); |
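| // Worked example with hypothetical numbers: Addressable = 256, |
| // TargetOccupancy = 4, Granule = 8 gives alignDown(256 / 4, 8) = 64. The |
| // std::max below keeps the budget from degenerating to zero when the |
| // occupancy target is very high. |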
| VGPRBudget = std::max(VGPRBudget, Granule); |
| VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit); |
| } |
| |
| // Subtract error margin and bias from register limits and avoid overflow. |
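| // E.g., a critical limit of 104 with a combined bias and margin of 6 becomes |
| // 98 (hypothetical numbers); taking std::min first avoids unsigned |
| // wraparound when the combined value exceeds the limit itself. |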
| SGPRCriticalLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRCriticalLimit); |
| VGPRCriticalLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRCriticalLimit); |
| SGPRExcessLimit -= std::min(SGPRLimitBias + ErrorMargin, SGPRExcessLimit); |
| VGPRExcessLimit -= std::min(VGPRLimitBias + ErrorMargin, VGPRExcessLimit); |
| LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit |
| << ", VGPRExcessLimit = " << VGPRExcessLimit |
| << ", SGPRCriticalLimit = " << SGPRCriticalLimit |
| << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n"); |
| } |
| |
| /// Checks whether \p SU can use the cached DAG pressure diffs to compute the |
| /// current register pressure. |
| /// |
| /// This works for the common case, but it has a few exceptions that have been |
| /// observed through trial and error: |
| /// - Explicit physical register operands |
| /// - Subregister definitions |
| /// |
| /// In both of those cases, PressureDiff doesn't represent the actual pressure, |
| /// and querying LiveIntervals through the RegPressureTracker is needed to get |
| /// an accurate value. |
| /// |
| /// We should eventually only use PressureDiff for maximum performance, but this |
| /// already allows 80% of SUs to take the fast path without changing scheduling |
| /// at all. Further changes would either change scheduling, or require a lot |
| /// more logic to recover an accurate pressure estimate from the PressureDiffs. |
| static bool canUsePressureDiffs(const SUnit &SU) { |
| if (!SU.isInstr()) |
| return false; |
| |
| // Cannot use pressure diffs for subregister defs or with physregs, it's |
| // imprecise in both cases. |
| for (const auto &Op : SU.getInstr()->operands()) { |
| if (!Op.isReg() || Op.isImplicit()) |
| continue; |
| if (Op.getReg().isPhysical() || |
| (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister)) |
| return false; |
| } |
| return true; |
| } |
| |
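| /// Computes the register pressure resulting from scheduling \p SU at the top |
| /// (\p AtTop) or bottom of the current region, storing the result in |
| /// \p Pressure and \p MaxPressure. Queries the generic \p RPTracker unless |
| /// the AMDGPU-specific trackers are enabled via -amdgpu-use-amdgpu-trackers. |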
| static void getRegisterPressures( |
| bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, |
| std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure, |
| GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, |
| ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) { |
| // getDownwardPressure() and getUpwardPressure() make temporary changes to |
| // the tracker, so we need to pass those functions a non-const reference. |
| RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker); |
| if (!GCNTrackers) { |
| AtTop |
| ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure) |
| : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure); |
| |
| return; |
| } |
| |
| // GCNTrackers |
| Pressure.resize(4, 0); |
| MachineInstr *MI = SU->getInstr(); |
| GCNRegPressure NewPressure; |
| if (AtTop) { |
| GCNDownwardRPTracker TempDownwardTracker(DownwardTracker); |
| NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI); |
| } else { |
| GCNUpwardRPTracker TempUpwardTracker(UpwardTracker); |
| TempUpwardTracker.recede(*MI); |
| NewPressure = TempUpwardTracker.getPressure(); |
| } |
| Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum(); |
| Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = |
| NewPressure.getArchVGPRNum(); |
| Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum(); |
| } |
| |
| void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU, |
| bool AtTop, |
| const RegPressureTracker &RPTracker, |
| const SIRegisterInfo *SRI, |
| unsigned SGPRPressure, |
| unsigned VGPRPressure, bool IsBottomUp) { |
| Cand.SU = SU; |
| Cand.AtTop = AtTop; |
| |
| if (!DAG->isTrackingPressure()) |
| return; |
| |
| Pressure.clear(); |
| MaxPressure.clear(); |
| |
| // We try to use the cached PressureDiffs in the ScheduleDAG whenever |
| // possible over querying the RegPressureTracker. |
| // |
| // RegPressureTracker will make a lot of LIS queries, which are very |
| // expensive; it is considered a slow function in this context. |
| // |
| // PressureDiffs are precomputed and cached, and getPressureDiff is just a |
| // trivial lookup into an array. It is pretty much free. |
| // |
| // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of |
| // PressureDiffs. |
| if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) { |
| getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure, |
| DownwardTracker, UpwardTracker, DAG, SRI); |
| } else { |
| // Reserve 4 slots. |
| Pressure.resize(4, 0); |
| Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure; |
| Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure; |
| |
| for (const auto &Diff : DAG->getPressureDiff(SU)) { |
| if (!Diff.isValid()) |
| continue; |
| // PressureDiffs is always bottom-up so if we're working top-down we need |
| // to invert its sign. |
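| // E.g., a cached unit increment of +2 for VGPR_32 (hypothetical value) |
| // becomes -2 when this SU is scheduled top-down. |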
| Pressure[Diff.getPSet()] += |
| (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc()); |
| } |
| |
| #ifdef EXPENSIVE_CHECKS |
| std::vector<unsigned> CheckPressure, CheckMaxPressure; |
| getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure, |
| DownwardTracker, UpwardTracker, DAG, SRI); |
| if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] != |
| CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] || |
| Pressure[AMDGPU::RegisterPressureSets::VGPR_32] != |
| CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) { |
| errs() << "Register Pressure is inaccurate when calculated through " |
| "PressureDiff\n" |
| << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32] |
| << ", expected " |
| << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n" |
| << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32] |
| << ", expected " |
| << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n"; |
| report_fatal_error("inaccurate register pressure calculation"); |
| } |
| #endif |
| } |
| |
| unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
| unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
| |
| // If two instructions increase the pressure of different register sets |
| // by the same amount, the generic scheduler will prefer to schedule the |
| // instruction that increases the set with the fewest registers, which in |
| // our case would be SGPRs. This is rarely what we want, so when we report |
| // excess/critical register pressure, we do it either only for VGPRs or |
| // only for SGPRs. |
| |
| // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs. |
| const unsigned MaxVGPRPressureInc = 16; |
| bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit; |
| bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit; |
| |
| // FIXME: We have to enter REG-EXCESS before we reach the actual threshold |
| // to increase the likelihood we don't go over the limits. We should improve |
| // the analysis to look through dependencies to find the path with the least |
| // register pressure. |
| |
| // We only need to update the RPDelta for instructions that increase register |
| // pressure. Instructions that decrease or keep reg pressure the same will be |
| // marked as RegExcess in tryCandidate() when they are compared with |
| // instructions that increase the register pressure. |
| if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) { |
| HasHighPressure = true; |
| Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
| Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit); |
| } |
| |
| if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) { |
| HasHighPressure = true; |
| Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
| Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit); |
| } |
| |
| // Register pressure is considered 'CRITICAL' if it is approaching a value |
| // that would reduce the wave occupancy for the execution unit. When |
| // register pressure is 'CRITICAL', increases to SGPR and VGPR pressure |
| // carry the same cost, so we don't need to prefer one over the other. |
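| // E.g., NewSGPRPressure = 50 against SGPRCriticalLimit = 48 (hypothetical) |
| // yields SGPRDelta = 2, which is reported through CriticalMax below when it |
| // exceeds VGPRDelta. |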
| |
| int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit; |
| int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit; |
| |
| if (SGPRDelta >= 0 || VGPRDelta >= 0) { |
| HasHighPressure = true; |
| if (SGPRDelta > VGPRDelta) { |
| Cand.RPDelta.CriticalMax = |
| PressureChange(AMDGPU::RegisterPressureSets::SReg_32); |
| Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta); |
| } else { |
| Cand.RPDelta.CriticalMax = |
| PressureChange(AMDGPU::RegisterPressureSets::VGPR_32); |
| Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta); |
| } |
| } |
| } |
| |
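| // Inspecting the pending queue is only worthwhile when the scheduling model |
| // can tell us when instructions become ready (i.e. it has a buffered |
| // instruction-level model) and the combined queue size is small enough that |
| // the extra candidate comparisons stay affordable (see |
| // -amdgpu-scheduler-pending-queue-limit). |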
| static bool shouldCheckPending(SchedBoundary &Zone, |
| const TargetSchedModel *SchedModel) { |
| bool HasBufferedModel = |
| SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize(); |
| unsigned Combined = Zone.Available.size() + Zone.Pending.size(); |
| return Combined <= PendingQueueLimit && HasBufferedModel; |
| } |
| |
| static SUnit *pickOnlyChoice(SchedBoundary &Zone, |
| const TargetSchedModel *SchedModel) { |
| // pickOnlyChoice() releases pending instructions and checks for new hazards. |
| SUnit *OnlyChoice = Zone.pickOnlyChoice(); |
| if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty()) |
| return OnlyChoice; |
| |
| return nullptr; |
| } |
| |
| void GCNSchedStrategy::printCandidateDecision(const SchedCandidate &Current, |
| const SchedCandidate &Preferred) { |
| LLVM_DEBUG({ |
| dbgs() << "Prefer:\t\t"; |
| DAG->dumpNode(*Preferred.SU); |
| |
| if (Current.SU) { |
| dbgs() << "Not:\t"; |
| DAG->dumpNode(*Current.SU); |
| } |
| |
| dbgs() << "Reason:\t\t"; |
| traceCandidate(Preferred); |
| }); |
| } |
| |
| // This function is mostly cut and pasted from |
| // GenericScheduler::pickNodeFromQueue() |
| void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone, |
| const CandPolicy &ZonePolicy, |
| const RegPressureTracker &RPTracker, |
| SchedCandidate &Cand, bool &IsPending, |
| bool IsBottomUp) { |
| const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI); |
| ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos(); |
| unsigned SGPRPressure = 0; |
| unsigned VGPRPressure = 0; |
| IsPending = false; |
| if (DAG->isTrackingPressure()) { |
| if (!GCNTrackers) { |
| SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32]; |
| VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32]; |
| } else { |
| GCNRPTracker *T = IsBottomUp |
| ? static_cast<GCNRPTracker *>(&UpwardTracker) |
| : static_cast<GCNRPTracker *>(&DownwardTracker); |
| SGPRPressure = T->getPressure().getSGPRNum(); |
| VGPRPressure = T->getPressure().getArchVGPRNum(); |
| } |
| } |
| LLVM_DEBUG(dbgs() << "Available Q:\n"); |
| ReadyQueue &AQ = Zone.Available; |
| for (SUnit *SU : AQ) { |
| |
| SchedCandidate TryCand(ZonePolicy); |
| initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, |
| VGPRPressure, IsBottomUp); |
| // Pass SchedBoundary only when comparing nodes from the same boundary. |
| SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
| tryCandidate(Cand, TryCand, ZoneArg); |
| if (TryCand.Reason != NoCand) { |
| // Initialize resource delta if needed in case future heuristics query it. |
| if (TryCand.ResDelta == SchedResourceDelta()) |
| TryCand.initResourceDelta(Zone.DAG, SchedModel); |
| LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); |
| Cand.setBest(TryCand); |
| } else { |
| printCandidateDecision(TryCand, Cand); |
| } |
| } |
| |
| if (!shouldCheckPending(Zone, SchedModel)) |
| return; |
| |
| LLVM_DEBUG(dbgs() << "Pending Q:\n"); |
| ReadyQueue &PQ = Zone.Pending; |
| for (SUnit *SU : PQ) { |
| |
| SchedCandidate TryCand(ZonePolicy); |
| initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure, |
| VGPRPressure, IsBottomUp); |
| // Pass SchedBoundary only when comparing nodes from the same boundary. |
| SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr; |
| tryPendingCandidate(Cand, TryCand, ZoneArg); |
| if (TryCand.Reason != NoCand) { |
| // Initialize resource delta if needed in case future heuristics query it. |
| if (TryCand.ResDelta == SchedResourceDelta()) |
| TryCand.initResourceDelta(Zone.DAG, SchedModel); |
| LLVM_DEBUG(printCandidateDecision(Cand, TryCand)); |
| IsPending = true; |
| Cand.setBest(TryCand); |
| } else { |
| printCandidateDecision(TryCand, Cand); |
| } |
| } |
| } |
| |
| // This function is mostly cut and pasted from |
| // GenericScheduler::pickNodeBidirectional() |
| SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode, |
| bool &PickedPending) { |
| // Schedule as far as possible in the direction of no choice. This is most |
| // efficient, but also provides the best heuristics for CriticalPSets. |
| if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) { |
| IsTopNode = false; |
| return SU; |
| } |
| if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) { |
| IsTopNode = true; |
| return SU; |
| } |
| // Set the bottom-up policy based on the state of the current bottom zone |
| // and the instructions outside the zone, including the top zone. |
| CandPolicy BotPolicy; |
| setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top); |
| // Set the top-down policy based on the state of the current top zone and |
| // the instructions outside the zone, including the bottom zone. |
| CandPolicy TopPolicy; |
| setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot); |
| |
| bool BotPending = false; |
| // See if BotCand is still valid (because we previously scheduled from Top). |
| LLVM_DEBUG(dbgs() << "Picking from Bot:\n"); |
| if (!BotCand.isValid() || BotCand.SU->isScheduled || |
| BotCand.Policy != BotPolicy) { |
| BotCand.reset(CandPolicy()); |
| pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand, |
| BotPending, |
| /*IsBottomUp=*/true); |
| assert(BotCand.Reason != NoCand && "failed to find the first candidate"); |
| } else { |
| LLVM_DEBUG(traceCandidate(BotCand)); |
| #ifndef NDEBUG |
| if (VerifyScheduling) { |
| SchedCandidate TCand; |
| TCand.reset(CandPolicy()); |
| pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand, |
| BotPending, |
| /*IsBottomUp=*/true); |
| assert(TCand.SU == BotCand.SU && |
| "Last pick result should correspond to re-picking right now"); |
| } |
| #endif |
| } |
| |
| bool TopPending = false; |
| // Check if the top Q has a better candidate. |
| LLVM_DEBUG(dbgs() << "Picking from Top:\n"); |
| if (!TopCand.isValid() || TopCand.SU->isScheduled || |
| TopCand.Policy != TopPolicy) { |
| TopCand.reset(CandPolicy()); |
| pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand, |
| TopPending, |
| /*IsBottomUp=*/false); |
| assert(TopCand.Reason != NoCand && "failed to find the first candidate"); |
| } else { |
| LLVM_DEBUG(traceCandidate(TopCand)); |
| #ifndef NDEBUG |
| if (VerifyScheduling) { |
| SchedCandidate TCand; |
| TCand.reset(CandPolicy()); |
| pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand, |
| TopPending, |
| /*IsBottomUp=*/false); |
| assert(TCand.SU == TopCand.SU && |
| "Last pick result should correspond to re-picking right now"); |
| } |
| #endif |
| } |
| |
| // Pick best from BotCand and TopCand. |
| LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); |
| dbgs() << "Bot Cand: "; traceCandidate(BotCand);); |
| SchedCandidate Cand = BotPending ? TopCand : BotCand; |
| SchedCandidate TryCand = BotPending ? BotCand : TopCand; |
| PickedPending = BotPending && TopPending; |
| |
| TryCand.Reason = NoCand; |
| if (BotPending || TopPending) { |
| PickedPending |= tryPendingCandidate(Cand, TryCand, nullptr); |
| } else { |
| tryCandidate(Cand, TryCand, nullptr); |
| } |
| |
| if (TryCand.Reason != NoCand) { |
| Cand.setBest(TryCand); |
| } |
| |
| LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); |
| |
| IsTopNode = Cand.AtTop; |
| return Cand.SU; |
| } |
| |
| // This function is mostly cut and pasted from |
| // GenericScheduler::pickNode() |
| SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) { |
| if (DAG->top() == DAG->bottom()) { |
| assert(Top.Available.empty() && Top.Pending.empty() && |
| Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage"); |
| return nullptr; |
| } |
| bool PickedPending; |
| SUnit *SU; |
| do { |
| PickedPending = false; |
| if (RegionPolicy.OnlyTopDown) { |
| SU = pickOnlyChoice(Top, SchedModel); |
| if (!SU) { |
| CandPolicy NoPolicy; |
| TopCand.reset(NoPolicy); |
| pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand, |
| PickedPending, |
| /*IsBottomUp=*/false); |
| assert(TopCand.Reason != NoCand && "failed to find a candidate"); |
| SU = TopCand.SU; |
| } |
| IsTopNode = true; |
| } else if (RegionPolicy.OnlyBottomUp) { |
| SU = pickOnlyChoice(Bot, SchedModel); |
| if (!SU) { |
| CandPolicy NoPolicy; |
| BotCand.reset(NoPolicy); |
| pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand, |
| PickedPending, |
| /*IsBottomUp=*/true); |
| assert(BotCand.Reason != NoCand && "failed to find a candidate"); |
| SU = BotCand.SU; |
| } |
| IsTopNode = false; |
| } else { |
| SU = pickNodeBidirectional(IsTopNode, PickedPending); |
| } |
| } while (SU->isScheduled); |
| |
| if (PickedPending) { |
| unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle; |
| SchedBoundary &Zone = IsTopNode ? Top : Bot; |
| unsigned CurrentCycle = Zone.getCurrCycle(); |
| if (ReadyCycle > CurrentCycle) |
| Zone.bumpCycle(ReadyCycle); |
| |
| // FIXME: checkHazard() doesn't give information about which cycle the |
| // hazard will resolve so just keep bumping the cycle by 1. This could be |
| // made more efficient if checkHazard() returned more details. |
| while (Zone.checkHazard(SU)) |
| Zone.bumpCycle(Zone.getCurrCycle() + 1); |
| |
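| // With the cycle advanced past the ready cycle and any hazards, move the |
| // SUs that have now become ready from the pending queue to the available |
| // queue. |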
| Zone.releasePending(); |
| } |
| |
| if (SU->isTopReady()) |
| Top.removeReady(SU); |
| if (SU->isBottomReady()) |
| Bot.removeReady(SU); |
| |
| LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " |
| << *SU->getInstr()); |
| return SU; |
| } |
| |
| void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { |
| if (GCNTrackers) { |
| MachineInstr *MI = SU->getInstr(); |
| IsTopNode ? (void)DownwardTracker.advance(MI, false) |
| : UpwardTracker.recede(*MI); |
| } |
| |
| return GenericScheduler::schedNode(SU, IsTopNode); |
| } |
| |
| GCNSchedStageID GCNSchedStrategy::getCurrentStage() { |
| assert(CurrentStage && CurrentStage != SchedStages.end()); |
| return *CurrentStage; |
| } |
| |
| bool GCNSchedStrategy::advanceStage() { |
| assert(CurrentStage != SchedStages.end()); |
| if (!CurrentStage) |
| CurrentStage = SchedStages.begin(); |
| else |
| CurrentStage++; |
| |
| return CurrentStage != SchedStages.end(); |
| } |
| |
| bool GCNSchedStrategy::hasNextStage() const { |
| assert(CurrentStage); |
| return std::next(CurrentStage) != SchedStages.end(); |
| } |
| |
| GCNSchedStageID GCNSchedStrategy::getNextStage() const { |
| assert(CurrentStage && std::next(CurrentStage) != SchedStages.end()); |
| return *std::next(CurrentStage); |
| } |
| |
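| /// Compares \p TryCand against \p Cand when both are drawn from the pending |
| /// queue. This mirrors tryCandidate() but applies only a subset of its |
| /// heuristics (physreg bias, excess/critical pressure, and resource |
| /// balance), omitting the latency and node-order tie-breakers used for |
| /// instructions that are already ready. |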
| bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand, |
| SchedCandidate &TryCand, |
| SchedBoundary *Zone) const { |
| // Initialize the candidate if needed. |
| if (!Cand.isValid()) { |
| TryCand.Reason = NodeOrder; |
| return true; |
| } |
| |
| // Bias PhysReg defs and copies toward their uses and definitions, respectively. |
| if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), |
| biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid exceeding the target's limit. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, |
| RegExcess, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid increasing the max critical pressure in the scheduled region. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, |
| TryCand, Cand, RegCritical, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| bool SameBoundary = Zone != nullptr; |
| if (SameBoundary) { |
| TryCand.initResourceDelta(DAG, SchedModel); |
| if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, |
| TryCand, Cand, ResourceReduce)) |
| return TryCand.Reason != NoCand; |
| if (tryGreater(TryCand.ResDelta.DemandedResources, |
| Cand.ResDelta.DemandedResources, TryCand, Cand, |
| ResourceDemand)) |
| return TryCand.Reason != NoCand; |
| } |
| |
| return false; |
| } |
| |
| GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy( |
| const MachineSchedContext *C, bool IsLegacyScheduler) |
| : GCNSchedStrategy(C) { |
| SchedStages.push_back(GCNSchedStageID::OccInitialSchedule); |
| if (!DisableRewriteMFMAFormSchedStage) |
| SchedStages.push_back(GCNSchedStageID::RewriteMFMAForm); |
| SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule); |
| SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule); |
| SchedStages.push_back(GCNSchedStageID::PreRARematerialize); |
| GCNTrackers = GCNTrackers & !IsLegacyScheduler; |
| } |
| |
| GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C) |
| : GCNSchedStrategy(C) { |
| SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule); |
| } |
| |
| bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand, |
| SchedCandidate &TryCand, |
| SchedBoundary *Zone) const { |
| // Initialize the candidate if needed. |
| if (!Cand.isValid()) { |
| TryCand.Reason = NodeOrder; |
| return true; |
| } |
| |
| // Avoid spilling by exceeding the register limit. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, |
| RegExcess, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| // Bias PhysReg defs and copies toward their uses and definitions, respectively. |
| if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), |
| biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) |
| return TryCand.Reason != NoCand; |
| |
| bool SameBoundary = Zone != nullptr; |
| if (SameBoundary) { |
| // Prioritize instructions that read unbuffered resources by stall cycles. |
| if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), |
| Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid critical resource consumption and balance the schedule. |
| TryCand.initResourceDelta(DAG, SchedModel); |
| if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, |
| TryCand, Cand, ResourceReduce)) |
| return TryCand.Reason != NoCand; |
| if (tryGreater(TryCand.ResDelta.DemandedResources, |
| Cand.ResDelta.DemandedResources, TryCand, Cand, |
| ResourceDemand)) |
| return TryCand.Reason != NoCand; |
| |
| // Unconditionally try to reduce latency. |
| if (tryLatency(TryCand, Cand, *Zone)) |
| return TryCand.Reason != NoCand; |
| |
| // Weak edges are for clustering and other constraints. |
| if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), |
| getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak)) |
| return TryCand.Reason != NoCand; |
| } |
| |
| // Keep clustered nodes together to encourage downstream peephole |
| // optimizations which may reduce resource requirements. |
| // |
| // This is a best effort to set things up for a post-RA pass. Optimizations |
| // like generating loads of multiple registers should ideally be done within |
| // the scheduler pass by combining the loads during DAG postprocessing. |
| unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; |
| unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; |
| bool CandIsClusterSucc = |
| isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); |
| bool TryCandIsClusterSucc = |
| isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); |
| if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, |
| Cluster)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid increasing the max critical pressure in the scheduled region. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, |
| TryCand, Cand, RegCritical, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid increasing the max pressure of the entire region. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, |
| Cand, RegMax, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| if (SameBoundary) { |
| // Fall through to original instruction order. |
| if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) || |
| (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) { |
| TryCand.Reason = NodeOrder; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy( |
| const MachineSchedContext *C) |
| : GCNSchedStrategy(C) { |
| SchedStages.push_back(GCNSchedStageID::MemoryClauseInitialSchedule); |
| } |
| |
| /// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory |
| /// instructions as much as possible. This is achieved by: |
| /// 1. Prioritizing clustered operations before the stall latency heuristic. |
| /// 2. Prioritizing long-latency loads before the stall latency heuristic. |
| /// |
| /// \param Cand provides the policy and current best candidate. |
| /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized. |
| /// \param Zone describes the scheduled zone that we are extending, or nullptr |
| /// if Cand is from a different zone than TryCand. |
| /// \return \c true if TryCand is better than Cand (Reason is NOT NoCand) |
| bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand, |
| SchedCandidate &TryCand, |
| SchedBoundary *Zone) const { |
| // Initialize the candidate if needed. |
| if (!Cand.isValid()) { |
| TryCand.Reason = NodeOrder; |
| return true; |
| } |
| |
| // Bias PhysReg defs and copies toward their uses and definitions, respectively. |
| if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop), |
| biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg)) |
| return TryCand.Reason != NoCand; |
| |
| if (DAG->isTrackingPressure()) { |
| // Avoid exceeding the target's limit. |
| if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand, |
| RegExcess, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid increasing the max critical pressure in the scheduled region. |
| if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax, |
| TryCand, Cand, RegCritical, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| } |
| |
| // MaxMemoryClause-specific: We prioritize clustered instructions as we would |
| // get more benefit from clausing these memory instructions. |
| unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID; |
| unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID; |
| bool CandIsClusterSucc = |
| isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx); |
| bool TryCandIsClusterSucc = |
| isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx); |
| if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand, |
| Cluster)) |
| return TryCand.Reason != NoCand; |
| |
| // We only compare a subset of features when comparing nodes between |
| // Top and Bottom boundary. Some properties are simply incomparable, in many |
| // other instances we should only override the other boundary if something |
| // is a clear good pick on one boundary. Skip heuristics that are more |
| // "tie-breaking" in nature. |
| bool SameBoundary = Zone != nullptr; |
| if (SameBoundary) { |
| // For loops that are acyclic path limited, aggressively schedule for |
| // latency. Within a single cycle, whenever CurrMOps > 0, allow normal |
| // heuristics to take precedence. |
| if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() && |
| tryLatency(TryCand, Cand, *Zone)) |
| return TryCand.Reason != NoCand; |
| |
| // MaxMemoryClause-specific: Prioritize long latency memory load |
| // instructions in top-bottom order to hide more latency. The mayLoad check |
| // is used to exclude store-like instructions, which we do not want to |
| // schedule too early. |
| bool TryMayLoad = |
| TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad(); |
| bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad(); |
| |
| if (TryMayLoad || CandMayLoad) { |
| bool TryLongLatency = |
| TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad; |
| bool CandLongLatency = |
| 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad; |
| |
| if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency, |
| Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand, |
| Cand, Stall)) |
| return TryCand.Reason != NoCand; |
| } |
| // Prioritize instructions that read unbuffered resources by stall cycles. |
| if (tryLess(Zone->getLatencyStallCycles(TryCand.SU), |
| Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) |
| return TryCand.Reason != NoCand; |
| } |
| |
| if (SameBoundary) { |
| // Weak edges are for clustering and other constraints. |
| if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop), |
| getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak)) |
| return TryCand.Reason != NoCand; |
| } |
| |
| // Avoid increasing the max pressure of the entire region. |
| if (DAG->isTrackingPressure() && |
| tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand, |
| Cand, RegMax, TRI, DAG->MF)) |
| return TryCand.Reason != NoCand; |
| |
| if (SameBoundary) { |
| // Avoid critical resource consumption and balance the schedule. |
| TryCand.initResourceDelta(DAG, SchedModel); |
| if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, |
| TryCand, Cand, ResourceReduce)) |
| return TryCand.Reason != NoCand; |
| if (tryGreater(TryCand.ResDelta.DemandedResources, |
| Cand.ResDelta.DemandedResources, TryCand, Cand, |
| ResourceDemand)) |
| return TryCand.Reason != NoCand; |
| |
| // Avoid serializing long latency dependence chains. |
| // For acyclic path limited loops, latency was already checked above. |
| if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency && |
| !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone)) |
| return TryCand.Reason != NoCand; |
| |
| // Fall through to original instruction order. |
| if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) { |
| assert(TryCand.SU->NodeNum != Cand.SU->NodeNum); |
| TryCand.Reason = NodeOrder; |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| GCNScheduleDAGMILive::GCNScheduleDAGMILive( |
| MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S) |
| : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()), |
| MFI(*MF.getInfo<SIMachineFunctionInfo>()), |
| StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy), |
| RegionLiveOuts(this, /*IsLiveOut=*/true) { |
| |
| // We want regions with a single MI to be scheduled so that we can reason |
| // about them correctly during scheduling stages that move MIs between regions |
| // (e.g., rematerialization). |
| ScheduleSingleMIRegions = true; |
| LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); |
| if (RelaxedOcc) { |
| MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy); |
| if (MinOccupancy != StartingOccupancy) |
| LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy |
| << ".\n"); |
| } |
| } |
| |
| std::unique_ptr<GCNSchedStage> |
| GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) { |
| switch (SchedStageID) { |
| case GCNSchedStageID::OccInitialSchedule: |
| return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this); |
| case GCNSchedStageID::RewriteMFMAForm: |
| return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this); |
| case GCNSchedStageID::UnclusteredHighRPReschedule: |
| return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this); |
| case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
| return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this); |
| case GCNSchedStageID::PreRARematerialize: |
| return std::make_unique<PreRARematStage>(SchedStageID, *this); |
| case GCNSchedStageID::ILPInitialSchedule: |
| return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this); |
| case GCNSchedStageID::MemoryClauseInitialSchedule: |
| return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID, |
| *this); |
| } |
| |
| llvm_unreachable("Unknown SchedStageID."); |
| } |
| |
| void GCNScheduleDAGMILive::schedule() { |
| // Collect all scheduling regions. The actual scheduling is performed in |
| // GCNScheduleDAGMILive::finalizeSchedule. |
| Regions.push_back(std::pair(RegionBegin, RegionEnd)); |
| } |
| |
| GCNRegPressure |
| GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { |
| if (Regions[RegionIdx].first == Regions[RegionIdx].second) |
| return llvm::getRegPressure(MRI, LiveIns[RegionIdx]); |
| GCNDownwardRPTracker RPTracker(*LIS); |
| RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, |
| &LiveIns[RegionIdx]); |
| return RPTracker.moveMaxPressure(); |
| } |
| |
| static MachineInstr *getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, |
| MachineBasicBlock::iterator RegionEnd) { |
| assert(RegionBegin != RegionEnd && "Region must not be empty"); |
| return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin); |
| } |
| |
| void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx, |
| const MachineBasicBlock *MBB) { |
| GCNDownwardRPTracker RPTracker(*LIS); |
| |
| // If the block has a single successor, then the live-ins of that successor |
| // are the live-outs of the current block. We can reuse the calculated live |
| // set if the successor will be sent to scheduling past the current block. |
| |
| // However, due to a bug in LiveInterval analysis it may happen that two |
| // predecessors of the same successor block have different lane bitmasks for |
| // a live-out register. Work around that by sticking to a one-to-one |
| // relationship, i.e. one predecessor with one successor block. |
| const MachineBasicBlock *OnlySucc = nullptr; |
| if (MBB->succ_size() == 1) { |
| auto *Candidate = *MBB->succ_begin(); |
| if (!Candidate->empty() && Candidate->pred_size() == 1) { |
| SlotIndexes *Ind = LIS->getSlotIndexes(); |
| if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate)) |
| OnlySucc = Candidate; |
| } |
| } |
| |
| // Scheduler sends regions from the end of the block upwards. |
| size_t CurRegion = RegionIdx; |
| for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) |
| if (Regions[CurRegion].first->getParent() != MBB) |
| break; |
| --CurRegion; |
| |
| auto I = MBB->begin(); |
| auto LiveInIt = MBBLiveIns.find(MBB); |
| auto &Rgn = Regions[CurRegion]; |
| auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); |
| if (LiveInIt != MBBLiveIns.end()) { |
| auto LiveIn = std::move(LiveInIt->second); |
| RPTracker.reset(*MBB->begin(), &LiveIn); |
| MBBLiveIns.erase(LiveInIt); |
| } else { |
| I = Rgn.first; |
| auto LRS = BBLiveInMap.lookup(NonDbgMI); |
| #ifdef EXPENSIVE_CHECKS |
| assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS)); |
| #endif |
| RPTracker.reset(*I, &LRS); |
| } |
| |
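| // Walk the block forward from the tracker's starting position. Each time we |
| // reach the first non-debug instruction of the current region, snapshot its |
| // live-in set; each time we reach the region's end, record its max pressure |
| // and move to the next region in the block (regions are stored in reverse |
| // program order, hence the decrement). |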
| for (;;) { |
| I = RPTracker.getNext(); |
| |
| if (Regions[CurRegion].first == I || NonDbgMI == I) { |
| LiveIns[CurRegion] = RPTracker.getLiveRegs(); |
| RPTracker.clearMaxPressure(); |
| } |
| |
| if (Regions[CurRegion].second == I) { |
| Pressure[CurRegion] = RPTracker.moveMaxPressure(); |
| if (CurRegion-- == RegionIdx) |
| break; |
| auto &Rgn = Regions[CurRegion]; |
| NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second); |
| } |
| RPTracker.advanceBeforeNext(); |
| RPTracker.advanceToNext(); |
| } |
| |
| if (OnlySucc) { |
| if (I != MBB->end()) { |
| RPTracker.advanceBeforeNext(); |
| RPTracker.advanceToNext(); |
| RPTracker.advance(MBB->end()); |
| } |
| MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); |
| } |
| } |
| |
| DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
| GCNScheduleDAGMILive::getRegionLiveInMap() const { |
| assert(!Regions.empty()); |
| std::vector<MachineInstr *> RegionFirstMIs; |
| RegionFirstMIs.reserve(Regions.size()); |
| for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) |
| RegionFirstMIs.push_back( |
| &*skipDebugInstructionsForward(RegionBegin, RegionEnd)); |
| |
| return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS); |
| } |
| |
| DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> |
| GCNScheduleDAGMILive::getRegionLiveOutMap() const { |
| assert(!Regions.empty()); |
| std::vector<MachineInstr *> RegionLastMIs; |
| RegionLastMIs.reserve(Regions.size()); |
| for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) { |
| // Skip empty regions. |
| if (RegionBegin == RegionEnd) |
| continue; |
| RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd)); |
| } |
| return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS); |
| } |
| |
| void RegionPressureMap::buildLiveRegMap() { |
| IdxToInstruction.clear(); |
| |
| RegionLiveRegMap = |
| IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap(); |
| for (unsigned I = 0; I < DAG->Regions.size(); I++) { |
| auto &[RegionBegin, RegionEnd] = DAG->Regions[I]; |
| // Skip empty regions. |
| if (RegionBegin == RegionEnd) |
| continue; |
| MachineInstr *RegionKey = |
| IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin; |
| IdxToInstruction[I] = RegionKey; |
| } |
| } |
| |
| void GCNScheduleDAGMILive::finalizeSchedule() { |
| // Start actual scheduling here. This function is called by the base |
| // MachineScheduler after all regions have been recorded by |
| // GCNScheduleDAGMILive::schedule(). |
| LiveIns.resize(Regions.size()); |
| Pressure.resize(Regions.size()); |
| RegionsWithHighRP.resize(Regions.size()); |
| RegionsWithExcessRP.resize(Regions.size()); |
| RegionsWithIGLPInstrs.resize(Regions.size()); |
| RegionsWithHighRP.reset(); |
| RegionsWithExcessRP.reset(); |
| RegionsWithIGLPInstrs.reset(); |
| |
| runSchedStages(); |
| } |
| |
| void GCNScheduleDAGMILive::runSchedStages() { |
| LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); |
| |
| if (!Regions.empty()) { |
| BBLiveInMap = getRegionLiveInMap(); |
| if (GCNTrackers) |
| RegionLiveOuts.buildLiveRegMap(); |
| } |
| |
| #ifdef DUMP_MAX_REG_PRESSURE |
| if (PrintMaxRPRegUsageBeforeScheduler) { |
| dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); |
| dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); |
| LIS->dump(); |
| } |
| #endif |
| |
| GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl); |
| while (S.advanceStage()) { |
| auto Stage = createSchedStage(S.getCurrentStage()); |
| if (!Stage->initGCNSchedStage()) |
| continue; |
| |
| for (auto Region : Regions) { |
| RegionBegin = Region.first; |
| RegionEnd = Region.second; |
| // Set up scheduling for the region and check whether it should be skipped. |
| if (!Stage->initGCNRegion()) { |
| Stage->advanceRegion(); |
| exitRegion(); |
| continue; |
| } |
| |
| if (GCNTrackers) { |
| GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker(); |
| GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker(); |
| GCNRPTracker::LiveRegSet *RegionLiveIns = |
| &LiveIns[Stage->getRegionIdx()]; |
| |
| reinterpret_cast<GCNRPTracker *>(DownwardTracker) |
| ->reset(MRI, *RegionLiveIns); |
| reinterpret_cast<GCNRPTracker *>(UpwardTracker) |
| ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx( |
| Stage->getRegionIdx())); |
| } |
| |
| ScheduleDAGMILive::schedule(); |
| Stage->finalizeGCNRegion(); |
| Stage->advanceRegion(); |
| exitRegion(); |
| } |
| |
| Stage->finalizeGCNSchedStage(); |
| } |
| |
| #ifdef DUMP_MAX_REG_PRESSURE |
| if (PrintMaxRPRegUsageAfterScheduler) { |
| dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI); |
| dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI); |
| LIS->dump(); |
| } |
| #endif |
| } |
| |
| #ifndef NDEBUG |
| raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) { |
| switch (StageID) { |
| case GCNSchedStageID::OccInitialSchedule: |
| OS << "Max Occupancy Initial Schedule"; |
| break; |
| case GCNSchedStageID::RewriteMFMAForm: |
| OS << "Instruction Rewriting Reschedule"; |
| break; |
| case GCNSchedStageID::UnclusteredHighRPReschedule: |
| OS << "Unclustered High Register Pressure Reschedule"; |
| break; |
| case GCNSchedStageID::ClusteredLowOccupancyReschedule: |
| OS << "Clustered Low Occupancy Reschedule"; |
| break; |
| case GCNSchedStageID::PreRARematerialize: |
| OS << "Pre-RA Rematerialize"; |
| break; |
| case GCNSchedStageID::ILPInitialSchedule: |
| OS << "Max ILP Initial Schedule"; |
| break; |
| case GCNSchedStageID::MemoryClauseInitialSchedule: |
| OS << "Max memory clause Initial Schedule"; |
| break; |
| } |
| |
| return OS; |
| } |
| #endif |
| |
| GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG) |
| : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF), |
| MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {} |
| |
| bool GCNSchedStage::initGCNSchedStage() { |
| if (!DAG.LIS) |
| return false; |
| |
| LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n"); |
| return true; |
| } |
| |
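| /// Collects in \p DefIdxs the slot indexes of all definitions of \p UseMO's |
| /// register that can reach \p UseMO, walking backward through PHI-joined |
| /// predecessor blocks when the reaching value is a PHI def. |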
| void RewriteMFMAFormStage::findReachingDefs( |
| MachineOperand &UseMO, LiveIntervals *LIS, |
| SmallVectorImpl<SlotIndex> &DefIdxs) { |
| MachineInstr *UseMI = UseMO.getParent(); |
| LiveInterval &UseLI = LIS->getInterval(UseMO.getReg()); |
| VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI)); |
| |
| // If the def is not a PHI, then it must be the only reaching def. |
| if (!VNI->isPHIDef()) { |
| DefIdxs.push_back(VNI->def); |
| return; |
| } |
| |
| SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()}; |
| SmallVector<MachineBasicBlock *, 8> Worklist; |
| |
| // Mark the predecessor blocks for traversal |
| for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) { |
| Worklist.push_back(PredMBB); |
| Visited.insert(PredMBB); |
| } |
| |
| while (!Worklist.empty()) { |
| MachineBasicBlock *CurrMBB = Worklist.pop_back_val(); |
| |
| SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB); |
| VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot()); |
| |
| MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def); |
| |
| // If there is a def in this block, then add it to the list. This is the |
| // reaching def of this path. |
| if (!VNI->isPHIDef()) { |
| DefIdxs.push_back(VNI->def); |
| continue; |
| } |
| |
| for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) { |
| if (Visited.insert(PredMBB).second) |
| Worklist.push_back(PredMBB); |
| } |
| } |
| } |
| |
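| /// Collects in \p ReachingUses every use of the register defined by \p DefMI |
| /// for which \p DefMI is among the reaching definitions. |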
| void RewriteMFMAFormStage::findReachingUses( |
| MachineInstr *DefMI, LiveIntervals *LIS, |
| SmallVectorImpl<MachineOperand *> &ReachingUses) { |
| SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI); |
| for (MachineOperand &UseMO : |
| DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) { |
| SmallVector<SlotIndex, 8> ReachingDefIndexes; |
| findReachingDefs(UseMO, LIS, ReachingDefIndexes); |
| |
| // If we find a use that contains this DefMI in its reachingDefs, then it is |
| // a reaching use. |
| if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) { |
| return SlotIndex::isSameInstr(RDIdx, DefIdx); |
| })) |
| ReachingUses.push_back(&UseMO); |
| } |
| } |
| |
| bool RewriteMFMAFormStage::initGCNSchedStage() { |
| // We only need to run this pass if the architecture supports AGPRs. |
| // Additionally, we don't use AGPRs at occupancy levels above 1, so there |
| // is no need for this pass in that case, either. |
| const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1) |
| return false; |
| |
| RegionsWithExcessArchVGPR.resize(DAG.Regions.size()); |
| RegionsWithExcessArchVGPR.reset(); |
| for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { |
| GCNRegPressure PressureBefore = DAG.Pressure[Region]; |
| if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs()) |
| RegionsWithExcessArchVGPR[Region] = true; |
| } |
| |
| if (RegionsWithExcessArchVGPR.none()) |
| return false; |
| |
| TII = ST.getInstrInfo(); |
| SRI = ST.getRegisterInfo(); |
| |
| std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands; |
| DenseMap<MachineBasicBlock *, std::set<Register>> CopyForUse; |
| SmallPtrSet<MachineInstr *, 8> CopyForDef; |
| |
| if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef)) |
| return false; |
| |
| int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef); |
| |
| // If we haven't found the beneficial conditions, prefer the VGPR form, which |
| // may result in fewer cross-RC copies. |
| if (Cost > 0) |
| return false; |
| |
| return rewrite(RewriteCands); |
| } |
| |
| bool UnclusteredHighRPStage::initGCNSchedStage() { |
| if (DisableUnclusterHighRP) |
| return false; |
| |
| if (!GCNSchedStage::initGCNSchedStage()) |
| return false; |
| |
| if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none()) |
| return false; |
| |
| SavedMutations.swap(DAG.Mutations); |
| DAG.addMutation( |
| createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); |
| |
| InitialOccupancy = DAG.MinOccupancy; |
| // Aggressively try to reduce register pressure in the unclustered high RP |
| // stage. Temporarily increase the occupancy target in the region. |
| TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy |
| ? InitialOccupancy + 1 |
| : InitialOccupancy; |
| IsAnyRegionScheduled = false; |
| S.SGPRLimitBias = S.HighRPSGPRBias; |
| S.VGPRLimitBias = S.HighRPVGPRBias; |
| |
| LLVM_DEBUG( |
| dbgs() |
| << "Retrying function scheduling without clustering. " |
| "Aggressively try to reduce register pressure to achieve occupancy " |
| << TempTargetOccupancy << ".\n"); |
| |
| return true; |
| } |
| |
| bool ClusteredLowOccStage::initGCNSchedStage() { |
| if (DisableClusteredLowOccupancy) |
| return false; |
| |
| if (!GCNSchedStage::initGCNSchedStage()) |
| return false; |
| |
| // Don't bother trying to improve ILP in lower RP regions if occupancy has not |
| // been dropped. All regions will have already been scheduled with the ideal |
| // occupancy targets. |
| if (DAG.StartingOccupancy <= DAG.MinOccupancy) |
| return false; |
| |
| LLVM_DEBUG( |
| dbgs() << "Retrying function scheduling with lowest recorded occupancy " |
| << DAG.MinOccupancy << ".\n"); |
| return true; |
| } |
| |
| /// Makes it easy to filter for this stage's debug output. |
| #define REMAT_PREFIX "[PreRARemat] " |
| #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;) |
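| // For example, REMAT_DEBUG(dbgs() << "sinking\n"); prints |
| // "[PreRARemat] sinking" under -debug-only=machine-scheduler. |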
| |
| #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| Printable PreRARematStage::ScoredRemat::print() const { |
| return Printable([&](raw_ostream &OS) { |
| OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')'; |
| }); |
| } |
| #endif |
| |
| bool PreRARematStage::initGCNSchedStage() { |
| // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for |
| // regions in between the defs and the regions we sank the defs to. This will |
| // need to be fixed if there is another pass after this one. |
| assert(!S.hasNextStage()); |
| |
| if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1) |
| return false; |
| |
| // Maps all MIs (except lone terminators, which are not part of any region) |
| // to their parent region. Non-lone terminators are considered part of the |
| // region they delimit. |
| DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount()); |
| |
| // Before performing any IR modification record the parent region of each MI |
| // and the parent MBB of each region. |
| const unsigned NumRegions = DAG.Regions.size(); |
| for (unsigned I = 0; I < NumRegions; ++I) { |
| RegionBoundaries Region = DAG.Regions[I]; |
| for (auto MI = Region.first; MI != Region.second; ++MI) |
| MIRegion.insert({&*MI, I}); |
| MachineBasicBlock *ParentMBB = Region.first->getParent(); |
| if (Region.second != ParentMBB->end()) |
| MIRegion.insert({&*Region.second, I}); |
| RegionBB.push_back(ParentMBB); |
| } |
| |
| #ifndef NDEBUG |
| auto PrintTargetRegions = [&]() -> void { |
| if (TargetRegions.none()) { |
| dbgs() << REMAT_PREFIX << "No target regions\n"; |
| return; |
| } |
| dbgs() << REMAT_PREFIX << "Target regions:\n"; |
| for (unsigned I : TargetRegions.set_bits()) |
| dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n'; |
| }; |
| auto PrintRematReg = [&](const RematReg &Remat) -> Printable { |
| return Printable([&, Remat](raw_ostream &OS) { |
| // Concatenate all region numbers in which the register is unused and |
| // live-through. |
| bool HasLiveThroughRegion = false; |
| OS << '[' << Remat.DefRegion << " -"; |
| for (unsigned I = 0; I < NumRegions; ++I) { |
| if (Remat.isUnusedLiveThrough(I)) { |
| if (HasLiveThroughRegion) { |
| OS << ','; |
| } else { |
| OS << "- "; |
| HasLiveThroughRegion = true; |
| } |
| OS << I; |
| } |
| } |
| if (HasLiveThroughRegion) |
| OS << " -"; |
| OS << "-> " << Remat.UseRegion << "] "; |
| Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false, |
| /*SkipDebugLoc=*/false, /*AddNewLine=*/false); |
| }); |
| }; |
| #endif |
| |
| // Set an objective for the stage based on current RP in each region. |
| REMAT_DEBUG({ |
| dbgs() << "Analyzing "; |
| MF.getFunction().printAsOperand(dbgs(), false); |
| dbgs() << ": "; |
| }); |
| if (!setObjective()) { |
| LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at " |
| << MFI.getMaxWavesPerEU() << '\n'); |
| return false; |
| } |
| LLVM_DEBUG({ |
| if (TargetOcc) { |
| dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n'; |
| } else { |
| dbgs() << "reduce spilling (minimum target occupancy is " |
| << MFI.getMinWavesPerEU() << ")\n"; |
| } |
| PrintTargetRegions(); |
| }); |
| |
| if (!collectRematRegs(MIRegion)) { |
| REMAT_DEBUG(dbgs() << "No rematerializable registers\n"); |
| return false; |
| } |
| const ScoredRemat::FreqInfo FreqInfo(MF, DAG); |
| REMAT_DEBUG({ |
| dbgs() << "Rematerializable registers:\n"; |
| for (const RematReg &Remat : RematRegs) |
| dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n'; |
| dbgs() << REMAT_PREFIX << "Region frequencies\n"; |
| for (auto [I, Freq] : enumerate(FreqInfo.Regions)) { |
| dbgs() << REMAT_PREFIX << " [" << I << "] "; |
| if (Freq) |
| dbgs() << Freq; |
| else |
| dbgs() << "unknown "; |
| dbgs() << " | " << *DAG.Regions[I].first; |
| } |
| }); |
| |
| SmallVector<ScoredRemat> ScoredRemats; |
| for (RematReg &Remat : RematRegs) |
| ScoredRemats.emplace_back(&Remat, FreqInfo, DAG); |
| |
  // Rematerialize registers in successive rounds until all RP targets are
  // satisfied or until we run out of rematerialization candidates. Candidates
  // are re-scored at the start of each round, since rematerializations
  // performed in earlier rounds change the RP picture they were scored
  // against.
| #ifndef NDEBUG |
| unsigned RoundNum = 0; |
| #endif |
| BitVector RecomputeRP(NumRegions); |
| do { |
| assert(!ScoredRemats.empty() && "no more remat candidates"); |
| |
| // (Re-)Score and (re-)sort all remats in increasing score order. |
| for (ScoredRemat &Remat : ScoredRemats) |
| Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc); |
| sort(ScoredRemats); |
| |
| REMAT_DEBUG({ |
| dbgs() << "==== ROUND " << RoundNum++ << " ====\n" |
| << REMAT_PREFIX |
| << "Candidates with non-null score, in rematerialization order:\n"; |
| for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) { |
| if (RematDecision.hasNullScore()) |
| break; |
| dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | " |
| << *RematDecision.Remat->DefMI; |
| } |
| PrintTargetRegions(); |
| }); |
| |
| RecomputeRP.reset(); |
| unsigned RematIdx = ScoredRemats.size(); |
| |
| // Rematerialize registers in decreasing score order until we estimate |
| // that all RP targets are satisfied or until rematerialization candidates |
| // are no longer useful to decrease RP. |
| for (; RematIdx && TargetRegions.any(); --RematIdx) { |
| const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1]; |
| // Stop rematerializing on encountering a null score. Since scores |
| // monotonically decrease as we rematerialize, we know there is nothing |
| // useful left to do in such cases, even if we were to re-score. |
| if (Candidate.hasNullScore()) { |
| RematIdx = 0; |
| break; |
| } |
| |
| RematReg &Remat = *Candidate.Remat; |
| // When previous rematerializations in this round have already satisfied |
| // RP targets in all regions this rematerialization can impact, we have a |
| // good indication that our scores have diverged significantly from |
| // reality, in which case we interrupt this round and re-score. This also |
| // ensures that every rematerialization we perform is possibly impactful |
| // in at least one target region. |
| if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) |
| break; |
| |
| REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';); |
| MachineInstr *RematMI = |
| Candidate.rematerialize(RecomputeRP, RPTargets, DAG); |
| RescheduleRegions |= Remat.Live; |
| |
      // Every rematerialization we do here is likely to move the instruction
      // into a higher frequency region, increasing the instruction's total
      // latency contribution. This is acceptable if we are eliminating a spill
      // in the process, but when the goal is increasing occupancy we get
      // nothing out of rematerialization if occupancy does not increase in the
      // end; in such cases we want to roll back the rematerialization.
| if (TargetOcc) { |
| RollbackInfo &Rollback = Rollbacks.emplace_back(&Remat); |
| Rollback.RematMI = RematMI; |
        // Make the original MI a debug value so that it does not influence
        // scheduling, and replace all read registers with a sentinel register
        // to prevent operands from appearing in the use-lists of other MIs
        // during LIS updates. Store mappings between operand indices and
        // original registers for potential rollback.
| Remat.DefMI->setDesc(DAG.TII->get(TargetOpcode::DBG_VALUE)); |
| for (auto [Idx, MO] : enumerate(Remat.DefMI->operands())) { |
| if (MO.isReg() && MO.readsReg()) { |
| Rollback.RegMap.insert({Idx, MO.getReg()}); |
| MO.setReg(Register()); |
| } |
| } |
| } else { |
        // Rematerializations are never rolled back in spill-reduction mode, so
        // just delete the original instruction.
| DAG.deleteMI(Remat.DefRegion, Remat.DefMI); |
| } |
| |
| unsetSatisfiedRPTargets(Remat.Live); |
| } |
| |
| REMAT_DEBUG({ |
| if (!TargetRegions.any()) { |
| dbgs() << "** Interrupt round on all targets achieved\n"; |
| } else if (RematIdx) { |
| dbgs() << "** Interrupt round on stale score for " |
| << *ScoredRemats[RematIdx - 1].Remat->DefMI; |
| } else { |
| dbgs() << "** Stop on exhausted rematerialization candidates\n"; |
| } |
| }); |
| |
| // Peel off registers we already rematerialized from the vector's tail. |
| ScoredRemats.truncate(RematIdx); |
| } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) && |
| !ScoredRemats.empty()); |
| if (RescheduleRegions.none()) |
| return false; |
| |
| // Commit all pressure changes to the DAG and compute minimum achieved |
| // occupancy in impacted regions. |
| REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n"); |
| unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize(); |
| for (unsigned I : RescheduleRegions.set_bits()) { |
| DAG.Pressure[I] = RPTargets[I].getCurrentRP(); |
| REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy " |
| << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize) |
| << " (" << RPTargets[I] << ")\n"); |
| } |
| AchievedOcc = MFI.getMaxWavesPerEU(); |
| for (const GCNRegPressure &RP : DAG.Pressure) { |
| AchievedOcc = |
| std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize)); |
| } |
| |
| REMAT_DEBUG({ |
| dbgs() << "Retrying function scheduling with new min. occupancy of " |
| << AchievedOcc << " from rematerializing (original was " |
| << DAG.MinOccupancy; |
| if (TargetOcc) |
| dbgs() << ", target was " << *TargetOcc; |
| dbgs() << ")\n"; |
| }); |
| |
| DAG.setTargetOccupancy(getStageTargetOccupancy()); |
| return true; |
| } |
| |
| void GCNSchedStage::finalizeGCNSchedStage() { |
| DAG.finishBlock(); |
| LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n"); |
| } |
| |
| void UnclusteredHighRPStage::finalizeGCNSchedStage() { |
| SavedMutations.swap(DAG.Mutations); |
| S.SGPRLimitBias = S.VGPRLimitBias = 0; |
| if (DAG.MinOccupancy > InitialOccupancy) { |
| assert(IsAnyRegionScheduled); |
| LLVM_DEBUG(dbgs() << StageID |
| << " stage successfully increased occupancy to " |
| << DAG.MinOccupancy << '\n'); |
| } else if (!IsAnyRegionScheduled) { |
| assert(DAG.MinOccupancy == InitialOccupancy); |
| LLVM_DEBUG(dbgs() << StageID |
| << ": No regions scheduled, min occupancy stays at " |
| << DAG.MinOccupancy << ", MFI occupancy stays at " |
| << MFI.getOccupancy() << ".\n"); |
| } |
| |
| GCNSchedStage::finalizeGCNSchedStage(); |
| } |
| |
| bool GCNSchedStage::initGCNRegion() { |
| // Skip empty scheduling region. |
| if (DAG.begin() == DAG.end()) |
| return false; |
| |
| // Check whether this new region is also a new block. |
| if (DAG.RegionBegin->getParent() != CurrentMBB) |
| setupNewBlock(); |
| |
| unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end()); |
| DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs); |
| |
| // Skip regions with 1 schedulable instruction. |
| if (DAG.begin() == std::prev(DAG.end())) |
| return false; |
| |
| LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n"); |
| LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB) |
| << " " << CurrentMBB->getName() |
| << "\n From: " << *DAG.begin() << " To: "; |
| if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd; |
| else dbgs() << "End"; |
| dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n'); |
| |
| // Save original instruction order before scheduling for possible revert. |
| Unsched.clear(); |
| Unsched.reserve(DAG.NumRegionInstrs); |
| if (StageID == GCNSchedStageID::OccInitialSchedule || |
| StageID == GCNSchedStageID::ILPInitialSchedule) { |
| const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII); |
| for (auto &I : DAG) { |
| Unsched.push_back(&I); |
| if (SII->isIGLPMutationOnly(I.getOpcode())) |
| DAG.RegionsWithIGLPInstrs[RegionIdx] = true; |
| } |
| } else { |
| for (auto &I : DAG) |
| Unsched.push_back(&I); |
| } |
| |
| PressureBefore = DAG.Pressure[RegionIdx]; |
| |
| LLVM_DEBUG( |
| dbgs() << "Pressure before scheduling:\nRegion live-ins:" |
| << print(DAG.LiveIns[RegionIdx], DAG.MRI) |
| << "Region live-in pressure: " |
| << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx])) |
| << "Region register pressure: " << print(PressureBefore)); |
| |
| S.HasHighPressure = false; |
| S.KnownExcessRP = isRegionWithExcessRP(); |
| |
| if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
| StageID != GCNSchedStageID::UnclusteredHighRPReschedule) { |
| SavedMutations.clear(); |
| SavedMutations.swap(DAG.Mutations); |
| bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule || |
| StageID == GCNSchedStageID::ILPInitialSchedule; |
| DAG.addMutation(createIGroupLPDAGMutation( |
| IsInitialStage ? AMDGPU::SchedulingPhase::Initial |
| : AMDGPU::SchedulingPhase::PreRAReentry)); |
| } |
| |
| return true; |
| } |
| |
| bool UnclusteredHighRPStage::initGCNRegion() { |
| // Only reschedule regions that have excess register pressure (i.e. spilling) |
| // or had minimum occupancy at the beginning of the stage (as long as |
| // rescheduling of previous regions did not make occupancy drop back down to |
| // the initial minimum). |
| unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); |
| // If no region has been scheduled yet, the DAG has not yet been updated with |
| // the occupancy target. So retrieve it from the temporary. |
| unsigned CurrentTargetOccupancy = |
| IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy; |
| if (!DAG.RegionsWithExcessRP[RegionIdx] && |
| (CurrentTargetOccupancy <= InitialOccupancy || |
| DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != |
| InitialOccupancy)) |
| return false; |
| |
| bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion(); |
| // If this is the first region scheduled during this stage, make the target |
| // occupancy changes in the DAG and MFI. |
| if (!IsAnyRegionScheduled && IsSchedulingThisRegion) { |
| IsAnyRegionScheduled = true; |
| if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) |
| DAG.setTargetOccupancy(TempTargetOccupancy); |
| } |
| return IsSchedulingThisRegion; |
| } |
| |
| bool ClusteredLowOccStage::initGCNRegion() { |
| // We may need to reschedule this region if it wasn't rescheduled in the last |
| // stage, or if we found it was testing critical register pressure limits in |
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage, so the region
  // may be overly constrained even if it was already rescheduled.
| if (!DAG.RegionsWithHighRP[RegionIdx]) |
| return false; |
| |
| return GCNSchedStage::initGCNRegion(); |
| } |
| |
| bool PreRARematStage::initGCNRegion() { |
| return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion(); |
| } |
| |
| void GCNSchedStage::setupNewBlock() { |
| if (CurrentMBB) |
| DAG.finishBlock(); |
| |
| CurrentMBB = DAG.RegionBegin->getParent(); |
| DAG.startBlock(CurrentMBB); |
  // Get real RP for the region if it hasn't been calculated before. After the
  // initial schedule stage, real RP will be collected after scheduling.
| if (StageID == GCNSchedStageID::OccInitialSchedule || |
| StageID == GCNSchedStageID::ILPInitialSchedule || |
| StageID == GCNSchedStageID::MemoryClauseInitialSchedule) |
| DAG.computeBlockPressure(RegionIdx, CurrentMBB); |
| } |
| |
| void GCNSchedStage::finalizeGCNRegion() { |
| DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd); |
| if (S.HasHighPressure) |
| DAG.RegionsWithHighRP[RegionIdx] = true; |
| |
| // Revert scheduling if we have dropped occupancy or there is some other |
| // reason that the original schedule is better. |
| checkScheduling(); |
| |
| if (DAG.RegionsWithIGLPInstrs[RegionIdx] && |
| StageID != GCNSchedStageID::UnclusteredHighRPReschedule) |
| SavedMutations.swap(DAG.Mutations); |
| } |
| |
| void PreRARematStage::finalizeGCNRegion() { |
| GCNSchedStage::finalizeGCNRegion(); |
  // When the goal is to increase occupancy, all regions must reach the target
  // occupancy for rematerializations to be possibly useful; otherwise we will
  // just hurt latency for no benefit. If minimum occupancy drops below the
  // target, there is no point in trying to re-schedule further regions.
| if (!TargetOcc) |
| return; |
| RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore); |
| if (DAG.MinOccupancy < *TargetOcc) { |
| REMAT_DEBUG(dbgs() << "Region " << RegionIdx |
| << " cannot meet occupancy target, interrupting " |
| "re-scheduling in all regions\n"); |
| RescheduleRegions.reset(); |
| } |
| } |
| |
| void GCNSchedStage::checkScheduling() { |
| // Check the results of scheduling. |
| PressureAfter = DAG.getRealRegPressure(RegionIdx); |
| |
| LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter)); |
| LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n"); |
| |
| unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); |
| |
| if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && |
| PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) { |
| DAG.Pressure[RegionIdx] = PressureAfter; |
| |
| // Early out if we have achieved the occupancy target. |
| LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n"); |
| return; |
| } |
| |
| unsigned TargetOccupancy = std::min( |
| S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second); |
| unsigned WavesAfter = std::min( |
| TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize)); |
| unsigned WavesBefore = std::min( |
| TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize)); |
| LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore |
| << ", after " << WavesAfter << ".\n"); |
| |
  // We may not be able to keep the current target occupancy because of the
  // region we just scheduled. We might still be able to revert scheduling if
  // the occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits, which could lead to more spilling.
| unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); |
| |
| // Allow memory bound functions to drop to 4 waves if not limited by an |
| // attribute. |
| if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy && |
| WavesAfter >= MFI.getMinAllowedOccupancy()) { |
| LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " |
| << MFI.getMinAllowedOccupancy() << " waves\n"); |
| NewOccupancy = WavesAfter; |
| } |
| |
| if (NewOccupancy < DAG.MinOccupancy) { |
| DAG.MinOccupancy = NewOccupancy; |
| MFI.limitOccupancy(DAG.MinOccupancy); |
| LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " |
| << DAG.MinOccupancy << ".\n"); |
| } |
| // The maximum number of arch VGPR on non-unified register file, or the |
| // maximum VGPR + AGPR in the unified register file case. |
| unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF); |
| // The maximum number of arch VGPR for both unified and non-unified register |
| // file. |
| unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs()); |
| unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF); |
| |
| if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs || |
| PressureAfter.getArchVGPRNum() > MaxArchVGPRs || |
| PressureAfter.getAGPRNum() > MaxArchVGPRs || |
| PressureAfter.getSGPRNum() > MaxSGPRs) { |
| DAG.RegionsWithHighRP[RegionIdx] = true; |
| DAG.RegionsWithExcessRP[RegionIdx] = true; |
| } |
| |
| // Revert if this region's schedule would cause a drop in occupancy or |
| // spilling. |
| if (shouldRevertScheduling(WavesAfter)) { |
| modifyRegionSchedule(RegionIdx, DAG.BB, Unsched); |
| std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx]; |
| } else { |
| DAG.Pressure[RegionIdx] = PressureAfter; |
| } |
| } |
| |
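/// Computes the cycle at which \p SU becomes ready to issue: the maximum of
/// \p CurrCycle and, over all register-dependence predecessors, each
/// predecessor's recorded ready cycle plus its latency. The result is memoized
/// in \p ReadyCycles.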
| unsigned |
| GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, |
| DenseMap<unsigned, unsigned> &ReadyCycles, |
| const TargetSchedModel &SM) { |
| unsigned ReadyCycle = CurrCycle; |
| for (auto &D : SU.Preds) { |
| if (D.isAssignedRegDep()) { |
| MachineInstr *DefMI = D.getSUnit()->getInstr(); |
| unsigned Latency = SM.computeInstrLatency(DefMI); |
| unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum]; |
| ReadyCycle = std::max(ReadyCycle, DefReady + Latency); |
| } |
| } |
| ReadyCycles[SU.NodeNum] = ReadyCycle; |
| return ReadyCycle; |
| } |
| |
| #ifndef NDEBUG |
| struct EarlierIssuingCycle { |
| bool operator()(std::pair<MachineInstr *, unsigned> A, |
| std::pair<MachineInstr *, unsigned> B) const { |
| return A.second < B.second; |
| } |
| }; |
| |
| static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>, |
| EarlierIssuingCycle> &ReadyCycles) { |
| if (ReadyCycles.empty()) |
| return; |
| unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber(); |
| dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum |
| << " ##################\n# Cycle #\t\t\tInstruction " |
| " " |
| " \n"; |
| unsigned IPrev = 1; |
| for (auto &I : ReadyCycles) { |
| if (I.second > IPrev + 1) |
| dbgs() << "****************************** BUBBLE OF " << I.second - IPrev |
| << " CYCLES DETECTED ******************************\n\n"; |
| dbgs() << "[ " << I.second << " ] : " << *I.first << "\n"; |
| IPrev = I.second; |
| } |
| } |
| #endif |
| |
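/// Walks \p InputSchedule in order, accumulating stall ("bubble") cycles
/// whenever an instruction's ready cycle exceeds the current cycle. As an
/// illustration, assuming ScheduleMetrics::ScaleFactor is 100, a schedule with
/// 12 bubble cycles over a 60-cycle length yields a metric of 20; lower means
/// a denser schedule.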
| ScheduleMetrics |
| GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) { |
| #ifndef NDEBUG |
| std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
| ReadyCyclesSorted; |
| #endif |
| const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
| unsigned SumBubbles = 0; |
| DenseMap<unsigned, unsigned> ReadyCycles; |
| unsigned CurrCycle = 0; |
| for (auto &SU : InputSchedule) { |
| unsigned ReadyCycle = |
| computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM); |
| SumBubbles += ReadyCycle - CurrCycle; |
| #ifndef NDEBUG |
| ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle)); |
| #endif |
| CurrCycle = ++ReadyCycle; |
| } |
| #ifndef NDEBUG |
| LLVM_DEBUG( |
| printScheduleModel(ReadyCyclesSorted); |
| dbgs() << "\n\t" |
| << "Metric: " |
| << (SumBubbles |
| ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
| : 1) |
| << "\n\n"); |
| #endif |
| |
| return ScheduleMetrics(CurrCycle, SumBubbles); |
| } |
| |
| ScheduleMetrics |
| GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) { |
| #ifndef NDEBUG |
| std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle> |
| ReadyCyclesSorted; |
| #endif |
| const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel(); |
| unsigned SumBubbles = 0; |
| DenseMap<unsigned, unsigned> ReadyCycles; |
| unsigned CurrCycle = 0; |
| for (auto &MI : DAG) { |
| SUnit *SU = DAG.getSUnit(&MI); |
| if (!SU) |
| continue; |
| unsigned ReadyCycle = |
| computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM); |
| SumBubbles += ReadyCycle - CurrCycle; |
| #ifndef NDEBUG |
| ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle)); |
| #endif |
| CurrCycle = ++ReadyCycle; |
| } |
| #ifndef NDEBUG |
| LLVM_DEBUG( |
| printScheduleModel(ReadyCyclesSorted); |
| dbgs() << "\n\t" |
| << "Metric: " |
| << (SumBubbles |
| ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle |
| : 1) |
| << "\n\n"); |
| #endif |
| |
| return ScheduleMetrics(CurrCycle, SumBubbles); |
| } |
| |
| bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) { |
| if (WavesAfter < DAG.MinOccupancy) |
| return true; |
| |
| // For dynamic VGPR mode, we don't want to waste any VGPR blocks. |
| if (DAG.MFI.isDynamicVGPREnabled()) { |
| unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
| &ST, DAG.MFI.getDynamicVGPRBlockSize(), |
| PressureBefore.getVGPRNum(false)); |
| unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks( |
| &ST, DAG.MFI.getDynamicVGPRBlockSize(), |
| PressureAfter.getVGPRNum(false)); |
| if (BlocksAfter > BlocksBefore) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
| if (PressureAfter == PressureBefore) |
| return false; |
| |
| if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
| return true; |
| |
| if (mayCauseSpilling(WavesAfter)) |
| return true; |
| |
| return false; |
| } |
| |
| bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) { |
| // If RP is not reduced in the unclustered reschedule stage, revert to the |
| // old schedule. |
| if ((WavesAfter <= |
| PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) && |
| mayCauseSpilling(WavesAfter)) || |
| GCNSchedStage::shouldRevertScheduling(WavesAfter)) { |
| LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n"); |
| return true; |
| } |
| |
  // Do not attempt to relax the schedule even more if we are already spilling.
| if (isRegionWithExcessRP()) |
| return false; |
| |
| LLVM_DEBUG( |
| dbgs() |
| << "\n\t *** In shouldRevertScheduling ***\n" |
| << " *********** BEFORE UnclusteredHighRPStage ***********\n"); |
| ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits); |
| LLVM_DEBUG( |
| dbgs() |
| << "\n *********** AFTER UnclusteredHighRPStage ***********\n"); |
| ScheduleMetrics MAfter = getScheduleMetrics(DAG); |
| unsigned OldMetric = MBefore.getMetric(); |
| unsigned NewMetric = MAfter.getMetric(); |
| unsigned WavesBefore = std::min( |
| S.getTargetOccupancy(), |
| PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize())); |
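  // Fixed-point computation of (WavesAfter / WavesBefore) *
  // ((OldMetric + ScheduleMetricBias) / NewMetric); revert unless the combined
  // occupancy and latency gain is at least 1.0, i.e., Profit >= ScaleFactor.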
| unsigned Profit = |
| ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore * |
| ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) / |
| NewMetric) / |
| ScheduleMetrics::ScaleFactor; |
| LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after " |
| << MAfter << "Profit: " << Profit << "\n"); |
| return Profit < ScheduleMetrics::ScaleFactor; |
| } |
| |
| bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) { |
| if (PressureAfter == PressureBefore) |
| return false; |
| |
| if (GCNSchedStage::shouldRevertScheduling(WavesAfter)) |
| return true; |
| |
| if (mayCauseSpilling(WavesAfter)) |
| return true; |
| |
| return false; |
| } |
| |
| bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) { |
  // When trying to increase occupancy (TargetOcc is set), the stage manages
  // region reverts globally (all or none), so we always return false here.
| return !TargetOcc && mayCauseSpilling(WavesAfter); |
| } |
| |
| bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) { |
| if (mayCauseSpilling(WavesAfter)) |
| return true; |
| |
| return false; |
| } |
| |
| bool MemoryClauseInitialScheduleStage::shouldRevertScheduling( |
| unsigned WavesAfter) { |
| return mayCauseSpilling(WavesAfter); |
| } |
| |
| bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) { |
| if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() && |
| !PressureAfter.less(MF, PressureBefore)) { |
| LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n"); |
| return true; |
| } |
| |
| return false; |
| } |
| |
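/// Reorders the instructions of region \p RegionIdx in \p MBB to match
/// \p MIOrder by splicing each instruction before the moving region end.
/// LiveIntervals are only updated when a non-debug instruction moves past
/// another non-debug instruction, so that debug values cannot affect code
/// generation.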
| void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx, |
| MachineBasicBlock *MBB, |
| ArrayRef<MachineInstr *> MIOrder) { |
| assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first, |
| DAG.Regions[RegionIdx].second)) == |
| MIOrder.size() && |
| "instruction number mismatch"); |
| if (MIOrder.empty()) |
| return; |
| |
| LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n'); |
| |
  // Reconstruct the MI sequence by moving instructions in the desired order
  // before the current region's start.
| MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first; |
| for (MachineInstr *MI : MIOrder) { |
| // Either move the next MI in order before the end of the region or move the |
| // region end past the MI if it is at the correct position. |
| MachineBasicBlock::iterator MII = MI->getIterator(); |
| if (MII != RegionEnd) { |
| // Will subsequent splice move MI up past a non-debug instruction? |
| bool NonDebugReordered = |
| !MI->isDebugInstr() && |
| skipDebugInstructionsForward(RegionEnd, MII) != MII; |
| MBB->splice(RegionEnd, MBB, MI); |
| // Only update LiveIntervals information if non-debug instructions are |
| // reordered. Otherwise debug instructions could cause code generation to |
| // change. |
| if (NonDebugReordered) |
| DAG.LIS->handleMove(*MI, true); |
| } else { |
| ++RegionEnd; |
| } |
| if (MI->isDebugInstr()) { |
| LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
| continue; |
| } |
| |
| // Reset read-undef flags and update them later. |
| for (MachineOperand &Op : MI->all_defs()) |
| Op.setIsUndef(false); |
| RegisterOperands RegOpers; |
| RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false); |
| if (DAG.ShouldTrackLaneMasks) { |
| // Adjust liveness and add missing dead+read-undef flags. |
| SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot(); |
| RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI); |
| } else { |
| // Adjust for missing dead-def flags. |
| RegOpers.detectDeadDefs(*MI, *DAG.LIS); |
| } |
| LLVM_DEBUG(dbgs() << "Scheduling " << *MI); |
| } |
| |
  // The region end doesn't change throughout scheduling since it is itself
  // outside the region (whether that is an MBB end or a terminator MI).
| assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch"); |
| DAG.Regions[RegionIdx].first = MIOrder.front(); |
| } |
| |
bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
  if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
    return false;
| return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1; |
| } |
| |
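/// Tentatively switches every rewrite candidate to its AGPR-dst MFMA form and
/// records where cross-class copies would be needed: after each non-MFMA def
/// reaching src2 (\p CopyForDef) and, per use block, for each non-MFMA user of
/// the dst (\p CopyForUse). Register classes are flipped to AGPR so that the
/// follow-up cost computation sees post-rewrite pressure; getRewriteCost
/// restores them.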
| bool RewriteMFMAFormStage::initHeuristics( |
| std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, |
| DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, |
| SmallPtrSetImpl<MachineInstr *> &CopyForDef) { |
| bool Changed = false; |
| |
| // Prepare for the heuristics |
| for (MachineBasicBlock &MBB : MF) { |
| for (MachineInstr &MI : MBB) { |
| if (!isRewriteCandidate(&MI)) |
| continue; |
| |
| int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()); |
| assert(ReplacementOp != -1); |
| |
| RewriteCands.push_back({&MI, MI.getOpcode()}); |
| MI.setDesc(TII->get(ReplacementOp)); |
| |
| MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); |
| if (Src2->isReg()) { |
| SmallVector<SlotIndex, 8> Src2ReachingDefs; |
| findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs); |
| |
| // For any definition of the src2 register which is non-MFMA, we |
| // insert a copy. |
| for (SlotIndex RDIdx : Src2ReachingDefs) { |
| MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx); |
| if (!TII->isMAI(*RD)) |
| CopyForDef.insert(RD); |
| } |
| } |
| |
| MachineOperand &Dst = MI.getOperand(0); |
| SmallVector<MachineOperand *, 8> DstReachingUses; |
| |
| findReachingUses(&MI, DAG.LIS, DstReachingUses); |
| |
| for (MachineOperand *RUOp : DstReachingUses) { |
| if (TII->isMAI(*RUOp->getParent())) |
| continue; |
| |
| // For any user of the result of the MFMA which is not an MFMA, we |
| // insert a copy. For a given register, we will only insert one copy |
| // per user block. |
| CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg()); |
| |
| SmallVector<SlotIndex, 8> DstUsesReachingDefs; |
| findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs); |
| |
| for (SlotIndex RDIndex : DstUsesReachingDefs) { |
| MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex); |
| if (TII->isMAI(*RD)) |
| continue; |
| |
| // For any definition of the user of the MFMA which is not an MFMA, |
| // we insert a copy. We do this to transform all the reaching defs |
| // of this use to AGPR. By doing this, we can insert a copy from |
| // AGPR to VGPR at the user rather than after the MFMA. |
| CopyForDef.insert(RD); |
| } |
| } |
| |
| // Do the rewrite to allow for updated RP calculation. |
| const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Dst.getReg()); |
| const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(VDefRC); |
| DAG.MRI.setRegClass(Dst.getReg(), ADefRC); |
| if (Src2->isReg()) { |
| // Have to get src types separately since subregs may cause C and D |
| // registers to be different types even though the actual operand is |
| // the same size. |
| const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Src2->getReg()); |
| const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(VUseRC); |
| DAG.MRI.setRegClass(Src2->getReg(), AUseRC); |
| } |
| Changed = true; |
| } |
| } |
| |
| return Changed; |
| } |
| |
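/// Estimates the net cost of the tentative AGPR rewrite: the best
/// frequency-weighted reduction in spill cost across regions with excess
/// ArchVGPR pressure, plus the frequency-weighted cost of all cross-class
/// copies that would be inserted. Bails out with a positive cost as soon as
/// any region would spill more. Also undoes the opcode and register-class
/// changes made by initHeuristics.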
| int64_t RewriteMFMAFormStage::getRewriteCost( |
| const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands, |
| const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse, |
| const SmallPtrSetImpl<MachineInstr *> &CopyForDef) { |
| MachineBlockFrequencyInfo *MBFI = DAG.MBFI; |
| |
| int64_t BestSpillCost = 0; |
| int64_t Cost = 0; |
| uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency(); |
| |
| std::pair<unsigned, unsigned> MaxVectorRegs = |
| ST.getMaxNumVectorRegs(MF.getFunction()); |
| unsigned ArchVGPRThreshold = MaxVectorRegs.first; |
| unsigned AGPRThreshold = MaxVectorRegs.second; |
| unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF); |
| |
| for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { |
| if (!RegionsWithExcessArchVGPR[Region]) |
| continue; |
| |
| GCNRegPressure &PressureBefore = DAG.Pressure[Region]; |
| unsigned SpillCostBefore = PressureBefore.getVGPRSpills( |
| MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold); |
| |
    // For the cases we care about (i.e., ArchVGPR usage is greater than the
    // addressable limit), rewriting alone should bring pressure to a
    // manageable level. If we find any such region, then the rewrite is
    // potentially beneficial.
| GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region); |
| unsigned SpillCostAfter = PressureAfter.getVGPRSpills( |
| MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold); |
| |
| uint64_t BlockFreq = |
| MBFI->getBlockFreq(DAG.Regions[Region].first->getParent()) |
| .getFrequency(); |
| |
| bool RelativeFreqIsDenom = EntryFreq > BlockFreq; |
| uint64_t RelativeFreq = EntryFreq && BlockFreq |
| ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq |
| : BlockFreq / EntryFreq) |
| : 1; |
| |
    // This assumes perfect spilling / splitting -- using one spill / copy
    // instruction and one restoreFrom / copy for each excess register.
| int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2; |
| |
| // Also account for the block frequency. |
| if (RelativeFreqIsDenom) |
| SpillCost /= (int64_t)RelativeFreq; |
| else |
| SpillCost *= (int64_t)RelativeFreq; |
| |
| // If we have increased spilling in any block, just bail. |
| if (SpillCost > 0) |
| return SpillCost; |
| |
| if (SpillCost < BestSpillCost) |
| BestSpillCost = SpillCost; |
| } |
| |
| // Set the cost to the largest decrease in spill cost in order to not double |
| // count spill reductions. |
| Cost = BestSpillCost; |
| assert(Cost <= 0); |
| |
| unsigned CopyCost = 0; |
| |
| // For each CopyForDef, increase the cost by the register size while |
| // accounting for block frequency. |
| for (MachineInstr *DefMI : CopyForDef) { |
| Register DefReg = DefMI->getOperand(0).getReg(); |
| uint64_t DefFreq = |
| EntryFreq |
| ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq |
| : 1; |
| |
| const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg); |
| CopyCost += RC->getCopyCost() * DefFreq; |
| } |
| |
| // Account for CopyForUse copies in each block that the register is used. |
| for (auto &[UseBlock, UseRegs] : CopyForUse) { |
| uint64_t UseFreq = |
| EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1; |
| |
| for (Register UseReg : UseRegs) { |
| const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg); |
| CopyCost += RC->getCopyCost() * UseFreq; |
| } |
| } |
| |
| // Reset the classes that were changed to AGPR for better RB analysis. |
| // We must do rewriting after copy-insertion, as some defs of the register |
| // may require VGPR. Additionally, if we bail out and don't perform the |
| // rewrite then these need to be restored anyway. |
| for (auto &[MI, OriginalOpcode] : RewriteCands) { |
| assert(TII->isMAI(*MI)); |
| const TargetRegisterClass *ADefRC = |
| DAG.MRI.getRegClass(MI->getOperand(0).getReg()); |
| const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(ADefRC); |
| DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VDefRC); |
| MI->setDesc(TII->get(OriginalOpcode)); |
| |
| MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2); |
| assert(Src2); |
| if (!Src2->isReg()) |
| continue; |
| |
| // Have to get src types separately since subregs may cause C and D |
| // registers to be different types even though the actual operand is |
| // the same size. |
| const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Src2->getReg()); |
| const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(AUseRC); |
| DAG.MRI.setRegClass(Src2->getReg(), VUseRC); |
| } |
| |
| return Cost + CopyCost; |
| } |
| |
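/// Applies the AGPR rewrite for \p RewriteCands: inserts the cross-class
/// copies identified during cost analysis, remaps MFMA operands to the copy
/// results, reclassifies the affected registers as AGPR, and updates region
/// boundaries, LiveIntervals, live-ins, and pressure. Returns false if a
/// candidate's dst or src2 register is not virtual.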
| bool RewriteMFMAFormStage::rewrite( |
| const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) { |
| DenseMap<MachineInstr *, unsigned> FirstMIToRegion; |
| DenseMap<MachineInstr *, unsigned> LastMIToRegion; |
| |
| for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) { |
| RegionBoundaries Entry = DAG.Regions[Region]; |
| if (Entry.first == Entry.second) |
| continue; |
| |
| FirstMIToRegion[&*Entry.first] = Region; |
| if (Entry.second != Entry.first->getParent()->end()) |
| LastMIToRegion[&*Entry.second] = Region; |
| } |
| |
| // Rewrite the MFMAs to AGPR, and insert any copies as needed. |
| // The general assumption of the algorithm (and the previous cost calculation) |
| // is that it is better to insert the copies in the MBB of the def of the src2 |
| // operands, and in the MBB of the user of the dest operands. This is based on |
| // the assumption that the MFMAs are likely to appear in loop bodies, while |
| // the src2 and dest operands are live-in / live-out of the loop. Due to this |
| // design, the algorithm for finding copy insertion points is more |
| // complicated. |
| // |
| // There are three main cases to handle: 1. the reaching defs of the src2 |
| // operands, 2. the reaching uses of the dst operands, and 3. the reaching |
| // defs of the reaching uses of the dst operand. |
| // |
| // In the first case, we simply insert copies after each of the reaching |
| // definitions. In the second case, we collect all the uses of a given dest |
| // and organize them by MBB. Then, we insert 1 copy for each MBB before the |
| // earliest use. Since the use may have multiple reaching defs, and since we |
| // want to replace the register it is using with the result of the copy, we |
| // must handle case 3. In the third case, we simply insert a copy after each |
| // of the reaching defs to connect to the copy of the reaching uses of the dst |
| // reg. This allows us to avoid inserting copies next to the MFMAs. |
| // |
| // While inserting the copies, we maintain a map of operands which will use |
| // different regs (i.e. the result of the copies). For example, a case 1 src2 |
| // operand will use the register result of the copies after the reaching defs, |
| // as opposed to the original register. Now that we have completed our copy |
  // analysis and placement, we can bulk update the registers. We do this
  // separately so as to avoid complicating the reachingDef and reachingUse
  // queries.
| // |
  // While inserting the copies, we also maintain a list of registers which we
  // will want to reclassify as AGPR. After doing the copy insertion and the
  // register replacement, we can finally do the reclassification. This uses
  // the redef map, as the registers we are interested in reclassifying may be
  // replaced by the result of a copy. We must do this after the copy analysis
  // and placement, as we must have an accurate redef map -- otherwise we may
  // end up creating illegal instructions.
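  //
  // Illustrative sketch (simplified), assuming the MFMA sits in a loop:
  //
  //   bb.0: %0:vgpr = <non-MFMA def>      ; case 1: %1:agpr = COPY %0 follows
  //   loop: %2:agpr = MFMA ..., %1 (src2) ; MFMA rewritten to the AGPR form
  //   bb.2: <non-MFMA use of %2>          ; case 2: %3:vgpr = COPY %2 before
  //                                       ;         the earliest use per MBB
  //
  // Case 3 additionally inserts copies after any non-MFMA def reaching the use
  // in bb.2, so the use can be redirected to the copied VGPR value uniformly.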
| |
| // The original registers of the MFMA that need to be reclassified as AGPR. |
| DenseSet<Register> RewriteRegs; |
| // The map of an original register in the MFMA to a new register (result of a |
| // copy) that it should be replaced with. |
| DenseMap<Register, Register> RedefMap; |
| // The map of the original MFMA registers to the relevant MFMA operands. |
| DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap; |
| // The map of reaching defs for a given register -- to avoid duplicate copies. |
| DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap; |
  // The map of reaching uses for a given register by basic block -- to avoid
  // duplicate copies and to calculate per-MBB insertion points.
| DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>> |
| ReachingUseTracker; |
| |
| for (auto &[MI, OriginalOpcode] : RewriteCands) { |
| int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()); |
| if (ReplacementOp == -1) |
| continue; |
| MI->setDesc(TII->get(ReplacementOp)); |
| |
| // Case 1: insert copies for the reaching defs of the Src2Reg. |
| MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2); |
| if (Src2->isReg()) { |
| Register Src2Reg = Src2->getReg(); |
| if (!Src2Reg.isVirtual()) |
| return false; |
| |
| Register MappedReg = Src2->getReg(); |
| SmallVector<SlotIndex, 8> Src2ReachingDefs; |
| findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs); |
| SmallSetVector<MachineInstr *, 8> Src2DefsReplace; |
| |
| for (SlotIndex RDIndex : Src2ReachingDefs) { |
| MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex); |
| if (TII->isMAI(*RD)) |
| continue; |
| |
        // If there is a non-MFMA reaching def, then we need a copy.
| Src2DefsReplace.insert(RD); |
| } |
| |
| if (!Src2DefsReplace.empty()) { |
| DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg); |
| if (RI != RedefMap.end()) { |
| MappedReg = RI->second; |
| } else { |
| assert(!ReachingDefCopyMap.contains(Src2Reg)); |
| const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg); |
| const TargetRegisterClass *VGPRRC = |
| SRI->getEquivalentVGPRClass(Src2RC); |
| |
| // Track the mapping of the original register to the new register. |
| MappedReg = DAG.MRI.createVirtualRegister(VGPRRC); |
| RedefMap[Src2Reg] = MappedReg; |
| } |
| |
        // Create a copy from this reaching def if none exists; we may already
        // have inserted one in an earlier iteration.
| for (MachineInstr *RD : Src2DefsReplace) { |
| // Do not create redundant copies. |
| if (ReachingDefCopyMap[Src2Reg].insert(RD).second) { |
| MachineInstrBuilder VGPRCopy = |
| BuildMI(*RD->getParent(), std::next(RD->getIterator()), |
| RD->getDebugLoc(), TII->get(TargetOpcode::COPY)) |
| .addDef(MappedReg, {}, 0) |
| .addUse(Src2Reg, {}, 0); |
| DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); |
| |
| // If this reaching def was the last MI in the region, update the |
| // region boundaries. |
| if (LastMIToRegion.contains(RD)) { |
| unsigned UpdateRegion = LastMIToRegion[RD]; |
| DAG.Regions[UpdateRegion].second = VGPRCopy; |
| LastMIToRegion.erase(RD); |
| } |
| } |
| } |
| } |
| |
      // Track the register for reclassification.
| RewriteRegs.insert(Src2Reg); |
| |
      // Always insert the operand for replacement. If this corresponds with a
      // chain of tied-defs, we may not see the VGPR requirement until later.
| ReplaceMap[Src2Reg].insert(Src2); |
| } |
| |
| // Case 2 and Case 3: insert copies before the reaching uses of the dsts, |
| // and after the reaching defs of the reaching uses of the dsts. |
| |
| MachineOperand *Dst = &MI->getOperand(0); |
| Register DstReg = Dst->getReg(); |
| if (!DstReg.isVirtual()) |
| return false; |
| |
| Register MappedReg = DstReg; |
| SmallVector<MachineOperand *, 8> DstReachingUses; |
| |
| SmallVector<MachineOperand *, 8> DstReachingUseCopies; |
| SmallVector<MachineInstr *, 8> DstUseDefsReplace; |
| |
| findReachingUses(MI, DAG.LIS, DstReachingUses); |
| |
| for (MachineOperand *RUOp : DstReachingUses) { |
| if (TII->isMAI(*RUOp->getParent())) |
| continue; |
| |
      // If there is a non-MFMA reaching use, then we need a copy.
| if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end()) |
| DstReachingUseCopies.push_back(RUOp); |
| SmallVector<SlotIndex, 8> DstUsesReachingDefs; |
| findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs); |
| |
| for (SlotIndex RDIndex : DstUsesReachingDefs) { |
| MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex); |
| if (TII->isMAI(*RD)) |
| continue; |
| |
        // If there is a non-MFMA reaching def of this reaching use, then we
        // will need a copy.
| if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end()) |
| DstUseDefsReplace.push_back(RD); |
| } |
| } |
| |
| if (!DstUseDefsReplace.empty()) { |
| DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg); |
| if (RI != RedefMap.end()) { |
| MappedReg = RI->second; |
| } else { |
| assert(!ReachingDefCopyMap.contains(DstReg)); |
| const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg); |
| const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC); |
| |
| // Track the mapping of the original register to the new register. |
| MappedReg = DAG.MRI.createVirtualRegister(VGPRRC); |
| RedefMap[DstReg] = MappedReg; |
| } |
| |
      // Create a copy from this reaching def if none exists; we may already
      // have inserted one in an earlier iteration.
| for (MachineInstr *RD : DstUseDefsReplace) { |
        // Do not create redundant copies.
| if (ReachingDefCopyMap[DstReg].insert(RD).second) { |
| MachineInstrBuilder VGPRCopy = |
| BuildMI(*RD->getParent(), std::next(RD->getIterator()), |
| RD->getDebugLoc(), TII->get(TargetOpcode::COPY)) |
| .addDef(MappedReg, {}, 0) |
| .addUse(DstReg, {}, 0); |
| DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); |
| |
| // If this reaching def was the last MI in the region, update the |
| // region boundaries. |
| DenseMap<MachineInstr *, unsigned>::iterator LMI = |
| LastMIToRegion.find(RD); |
| if (LMI != LastMIToRegion.end()) { |
| unsigned UpdateRegion = LMI->second; |
| DAG.Regions[UpdateRegion].second = VGPRCopy; |
| LastMIToRegion.erase(RD); |
| } |
| } |
| } |
| } |
| |
| DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg]; |
| for (MachineOperand *RU : DstReachingUseCopies) { |
| MachineBasicBlock *RUBlock = RU->getParent()->getParent(); |
      // Just keep track of the reaching uses of this register by block. After
      // we have scanned all the MFMAs, we can find optimal insertion points.
| if (RUBlock != MI->getParent()) { |
| ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU); |
| continue; |
| } |
| |
      // Special case: the use is in the same block as the MFMA. Insert the
      // copy just before the use.
| const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg); |
| const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC); |
| Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC); |
| MachineInstr *UseInst = RU->getParent(); |
| MachineInstrBuilder VGPRCopy = |
| BuildMI(*UseInst->getParent(), UseInst->getIterator(), |
| UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY)) |
| .addDef(NewUseReg, {}, 0) |
| .addUse(DstReg, {}, 0); |
| DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); |
| // Since we know this use has only one reaching def, we can replace the |
| // use reg. |
| RU->setReg(NewUseReg); |
      // Track the copy source operand for replacement.
| DstRegSet.insert(&VGPRCopy->getOperand(1)); |
| } |
| |
    // Track the register for reclassification.
| RewriteRegs.insert(DstReg); |
| |
    // Insert the dst operand for replacement. If this dst is in a chain of
    // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
    // all the corresponding operands need to be replaced.
| DstRegSet.insert(Dst); |
| } |
| |
  // Handle the copies for dst uses.
  for (auto &RUBlockEntry : ReachingUseTracker) {
    for (auto &RUDst : RUBlockEntry.second) {
| MachineOperand *OpBegin = *RUDst.second.begin(); |
| SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent()); |
| |
| // Find the earliest use in this block. |
| for (MachineOperand *User : RUDst.second) { |
| SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent()); |
| if (SlotIndex::isEarlierInstr(NewInstPt, InstPt)) |
| InstPt = NewInstPt; |
| } |
| |
| const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first); |
| const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC); |
| Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC); |
| MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt); |
| |
| MachineInstrBuilder VGPRCopy = |
| BuildMI(*UseInst->getParent(), UseInst->getIterator(), |
| UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY)) |
| .addDef(NewUseReg, {}, 0) |
| .addUse(RUDst.first, {}, 0); |
| DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy); |
| |
| // If this UseInst was the first MI in the region, update the region |
| // boundaries. |
| DenseMap<MachineInstr *, unsigned>::iterator FI = |
| FirstMIToRegion.find(UseInst); |
| if (FI != FirstMIToRegion.end()) { |
| unsigned UpdateRegion = FI->second; |
| DAG.Regions[UpdateRegion].first = VGPRCopy; |
| FirstMIToRegion.erase(UseInst); |
| } |
| |
| // Replace the operand for all users. |
| for (MachineOperand *User : RUDst.second) { |
| User->setReg(NewUseReg); |
| } |
| |
| // Track the copy source operand for replacement. |
| ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1)); |
| } |
| } |
| |
| // We may have needed to insert copies after the reaching defs of the MFMAs. |
| // Replace the original register with the result of the copy for all relevant |
| // operands. |
| for (std::pair<Register, Register> NewDef : RedefMap) { |
| Register OldReg = NewDef.first; |
| Register NewReg = NewDef.second; |
| |
| // Replace the register for any associated operand in the MFMA chain. |
| for (MachineOperand *ReplaceOp : ReplaceMap[OldReg]) |
| ReplaceOp->setReg(NewReg); |
| } |
| |
| // Finally, do the reclassification of the MFMA registers. |
| for (Register RewriteReg : RewriteRegs) { |
| Register RegToRewrite = RewriteReg; |
| |
| // Be sure to update the replacement register and not the original. |
| DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg); |
| if (RI != RedefMap.end()) |
| RegToRewrite = RI->second; |
| |
| const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite); |
| const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC); |
| |
| DAG.MRI.setRegClass(RegToRewrite, AGPRRC); |
| } |
| |
| // Bulk update the LIS. |
| DAG.LIS->reanalyze(DAG.MF); |
  // Live-ins may have been modified for cross-RC copies.
| RegionPressureMap LiveInUpdater(&DAG, false); |
| LiveInUpdater.buildLiveRegMap(); |
| |
| for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) |
| DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region); |
| |
| DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx); |
| |
| return true; |
| } |
| |
| unsigned PreRARematStage::getStageTargetOccupancy() const { |
| return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU(); |
| } |
| |
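/// Decides the stage's objective from current per-region RP: reduce spilling
/// when some region exceeds addressable vector register limits or occupancy is
/// already maximal, otherwise try to increase occupancy by one wave. Fills
/// RPTargets accordingly and marks unsatisfied regions in TargetRegions.
/// Returns false when every target is already satisfied.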
| bool PreRARematStage::setObjective() { |
| const Function &F = MF.getFunction(); |
| |
| // Set up "spilling targets" for all regions. |
| unsigned MaxSGPRs = ST.getMaxNumSGPRs(F); |
| unsigned MaxVGPRs = ST.getMaxNumVGPRs(F); |
| bool HasVectorRegisterExcess = false; |
| for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| const GCNRegPressure &RP = DAG.Pressure[I]; |
| GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP); |
| if (!Target.satisfied()) |
| TargetRegions.set(I); |
| HasVectorRegisterExcess |= Target.hasVectorRegisterExcess(); |
| } |
| |
| if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) { |
| // In addition to register usage being above addressable limits, occupancy |
| // below the minimum is considered like "spilling" as well. |
| TargetOcc = std::nullopt; |
| } else { |
| // There is no spilling and room to improve occupancy; set up "increased |
| // occupancy targets" for all regions. |
| TargetOcc = DAG.MinOccupancy + 1; |
| const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize(); |
| MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false); |
| MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize); |
| for (auto [I, Target] : enumerate(RPTargets)) { |
| Target.setTarget(MaxSGPRs, MaxVGPRs); |
| if (!Target.satisfied()) |
| TargetRegions.set(I); |
| } |
| } |
| |
| return TargetRegions.any(); |
| } |
| |
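/// Collects rematerialization candidates: single-def virtual registers whose
/// defining instruction is rematerializable, whose lone non-debug user lives
/// in a different region, and whose register operands are all available at the
/// use point. Chains of rematerializable instructions are conservatively
/// skipped.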
| bool PreRARematStage::collectRematRegs( |
| const DenseMap<MachineInstr *, unsigned> &MIRegion) { |
  // We need up-to-date live-out information to query live-out register masks
  // in regions containing rematerializable instructions.
| DAG.RegionLiveOuts.buildLiveRegMap(); |
| |
  // Set of registers already marked for potential rematerialization; used to
  // avoid rematerialization chains.
| SmallSet<Register, 4> MarkedRegs; |
| auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool { |
| return MO.isReg() && MarkedRegs.contains(MO.getReg()); |
| }; |
| |
| // Identify rematerializable instructions in the function. |
| for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| RegionBoundaries Bounds = DAG.Regions[I]; |
| for (auto MI = Bounds.first; MI != Bounds.second; ++MI) { |
| // The instruction must be rematerializable. |
| MachineInstr &DefMI = *MI; |
| if (!isReMaterializable(DefMI)) |
| continue; |
| |
| // We only support rematerializing virtual registers with one |
| // definition. |
| Register Reg = DefMI.getOperand(0).getReg(); |
| if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg)) |
| continue; |
| |
| // We only care to rematerialize the instruction if it has a single |
| // non-debug user in a different region. |
| // FIXME: Allow rematerializations with multiple uses. This should be |
| // relatively easy to support using the current cost model. |
| MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg); |
| if (!UseMI) |
| continue; |
| auto UseRegion = MIRegion.find(UseMI); |
| if (UseRegion == MIRegion.end() || UseRegion->second == I) |
| continue; |
| |
| // Do not rematerialize an instruction if it uses or is used by an |
| // instruction that we have designated for rematerialization. |
| // FIXME: Allow for rematerialization chains: this requires 1. updating |
| // remat points to account for uses that are rematerialized, and 2. |
| // either rematerializing the candidates in careful ordering, or |
| // deferring the MBB RP walk until the entire chain has been |
| // rematerialized. |
| const MachineOperand &UseMO = UseMI->getOperand(0); |
| if (IsMarkedForRemat(UseMO) || |
| llvm::any_of(DefMI.operands(), IsMarkedForRemat)) |
| continue; |
| |
      // Do not rematerialize an instruction if it uses registers that aren't
      // available at its use. This ensures that we are not extending any live
      // range while rematerializing.
| SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true); |
| if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI, |
| *DAG.TII)) |
| continue; |
| |
| // Add the instruction to the rematerializable list. |
| MarkedRegs.insert(Reg); |
| RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion); |
| } |
| } |
| |
| return !RematRegs.empty(); |
| } |
| |
| PreRARematStage::RematReg::RematReg( |
| MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG, |
| const DenseMap<MachineInstr *, unsigned> &MIRegion) |
| : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()), |
| LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()), |
| DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) { |
| |
| // Mark regions in which the rematerializable register is live. |
| Register Reg = getReg(); |
| for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { |
| auto LiveInIt = DAG.LiveIns[I].find(Reg); |
| if (LiveInIt != DAG.LiveIns[I].end()) |
| LiveIn.set(I); |
| const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I); |
| if (auto LiveOutIt = LiveOuts.find(Reg); LiveOutIt != LiveOuts.end()) |
| LiveOut.set(I); |
| } |
| Live |= LiveIn; |
| Live |= LiveOut; |
| Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(DefRegion).at(Reg); |
| } |
| |
| bool PreRARematStage::RematReg::maybeBeneficial( |
| const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const { |
| Register Reg = getReg(); |
| for (unsigned I : TargetRegions.set_bits()) { |
| if (Live[I] && RPTargets[I].isSaveBeneficial(Reg)) |
| return true; |
| } |
| return false; |
| } |
| |
| void PreRARematStage::RematReg::insertMI(unsigned RegionIdx, |
| MachineInstr *RematMI, |
| GCNScheduleDAGMILive &DAG) const { |
| RegionBoundaries &Bounds = DAG.Regions[RegionIdx]; |
| if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI))) |
| Bounds.first = RematMI; |
| DAG.LIS->InsertMachineInstrInMaps(*RematMI); |
| DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg()); |
| } |
| |
| PreRARematStage::ScoredRemat::FreqInfo::FreqInfo( |
| MachineFunction &MF, const GCNScheduleDAGMILive &DAG) { |
| assert(DAG.MLI && "MLI not defined in DAG"); |
| MachineBranchProbabilityInfo MBPI; |
| MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI); |
| |
| const unsigned NumRegions = DAG.Regions.size(); |
| MinFreq = MBFI.getEntryFreq().getFrequency(); |
| MaxFreq = 0; |
| Regions.reserve(NumRegions); |
| for (unsigned I = 0; I < NumRegions; ++I) { |
| MachineBasicBlock *MBB = DAG.Regions[I].first->getParent(); |
| uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency(); |
| Regions.push_back(BlockFreq); |
| if (BlockFreq && BlockFreq < MinFreq) |
| MinFreq = BlockFreq; |
| else if (BlockFreq > MaxFreq) |
| MaxFreq = BlockFreq; |
| } |
| if (!MinFreq) |
| return; |
| |
| // Scale everything down if frequencies are high. |
| if (MinFreq >= ScaleFactor * ScaleFactor) { |
| for (uint64_t &Freq : Regions) |
| Freq /= ScaleFactor; |
| MinFreq /= ScaleFactor; |
| MaxFreq /= ScaleFactor; |
| } |
| } |
| |
| PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq, |
| const GCNScheduleDAGMILive &DAG) |
| : Remat(Remat), FreqDiff(getFreqDiff(Freq)) { |
| RPSave.inc(Remat->getReg(), LaneBitmask::getNone(), Remat->Mask, DAG.MRI); |
| } |
| |
| int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const { |
  // Get frequencies of the defining and using regions. A rematerialization
  // from the least frequent region to the most frequent region will yield the
  // greatest latency penalty and therefore should get the minimum score.
  // Conversely, a rematerialization in the other direction should get the
  // maximum score. Default to values that will yield the worst possible score
  // given known frequencies, in order to penalize rematerializations from or
  // into regions whose frequency is unknown.
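  // For example, a remat from a def region of frequency 8 into a use region of
  // frequency 2 scores +6, while one in the opposite direction scores -6.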
| int64_t DefOrMin = std::max(Freq.Regions[Remat->DefRegion], Freq.MinFreq); |
| int64_t UseOrMax = Freq.Regions[Remat->UseRegion]; |
| if (!UseOrMax) |
| UseOrMax = Freq.MaxFreq; |
| return DefOrMin - UseOrMax; |
| } |
| |
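/// Recomputes the candidate's score components against the current targets.
/// RegionImpact accumulates the per-region register benefit, doubled where the
/// register is an unused live-through since the saving there is guaranteed.
/// When reducing spilling, MaxFreq tracks the highest frequency among impacted
/// regions, halved for regions where the saving is only optimistic.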
| void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions, |
| ArrayRef<GCNRPTarget> RPTargets, |
| const FreqInfo &FreqInfo, |
| bool ReduceSpill) { |
| MaxFreq = 0; |
| RegionImpact = 0; |
| for (unsigned I : TargetRegions.set_bits()) { |
| if (!Remat->Live[I]) |
| continue; |
| |
    // The rematerialization must help in at least one register class whose
    // usage is above the RP target in order for this region to contribute to
    // the score.
| const GCNRPTarget &RegionTarget = RPTargets[I]; |
| const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(RPSave); |
| if (!NumRegsBenefit) |
| continue; |
| |
| bool UnusedLT = Remat->isUnusedLiveThrough(I); |
| |
| // Regions in which RP is guaranteed to decrease have more weight. |
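|     // (E.g., a benefit of 3 registers counts as 6 when the register is an |
|     // unused live-through in the region, and as 3 otherwise.) |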
| RegionImpact += (UnusedLT ? 2 : 1) * NumRegsBenefit; |
| |
| if (ReduceSpill) { |
| uint64_t Freq = FreqInfo.Regions[I]; |
| if (!UnusedLT) { |
| // Apply a frequency penalty in regions in which we are not sure that RP |
| // will decrease. |
| Freq /= 2; |
| } |
| MaxFreq = std::max(MaxFreq, Freq); |
| } |
| } |
| } |
| |
| MachineInstr *PreRARematStage::ScoredRemat::rematerialize( |
| BitVector &RecomputeRP, SmallVectorImpl<GCNRPTarget> &RPTargets, |
| GCNScheduleDAGMILive &DAG) const { |
| const SIInstrInfo *TII = DAG.MF.getSubtarget<GCNSubtarget>().getInstrInfo(); |
| MachineInstr &DefMI = *Remat->DefMI; |
| Register Reg = DefMI.getOperand(0).getReg(); |
| Register NewReg = DAG.MRI.cloneVirtualRegister(Reg); |
| |
| // Rematerialize the register in the region where it is used. |
| MachineBasicBlock::iterator InsertPos = Remat->UseMI; |
| TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI); |
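|   // reMaterialize inserts the clone immediately before InsertPos. |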
| MachineInstr *RematMI = &*std::prev(InsertPos); |
| Remat->UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI); |
| Remat->insertMI(Remat->UseRegion, RematMI, DAG); |
| |
| #ifdef EXPENSIVE_CHECKS |
|   // All uses are known to be available / live at the remat point. Thus, the |
|   // uses should already be live-in to the using region. |
| for (MachineOperand &MO : DefMI.operands()) { |
| if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) |
| continue; |
| |
| Register UseReg = MO.getReg(); |
| if (!UseReg.isVirtual()) |
| continue; |
| |
| LiveInterval &LI = DAG.LIS->getInterval(UseReg); |
|     LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(UseReg); |
| if (LI.hasSubRanges() && MO.getSubReg()) |
| LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg()); |
| |
| LaneBitmask LiveInMask = DAG.LiveIns[Remat->UseRegion].at(UseReg); |
| LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM); |
| // If this register has lanes not covered by the LiveIns, be sure they |
| // do not map to any subrange. ref: |
| // machine-scheduler-sink-trivial-remats.mir::omitted_subrange |
| if (UncoveredLanes.any()) { |
| assert(LI.hasSubRanges()); |
| for (LiveInterval::SubRange &SR : LI.subranges()) |
| assert((SR.LaneMask & UncoveredLanes).none()); |
| } |
| } |
| #endif |
| |
|   // Remove the register from the live-ins/live-outs of all regions where it |
|   // appears and adjust RP targets. The RP save is guaranteed in regions |
|   // where the register is an unused live-through, but only optimistic in |
|   // all other regions where it is live. |
| for (unsigned I : Remat->Live.set_bits()) { |
| RPTargets[I].saveRP(RPSave); |
| DAG.LiveIns[I].erase(Reg); |
| DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg); |
| if (!Remat->isUnusedLiveThrough(I)) |
| RecomputeRP.set(I); |
| } |
| |
| return RematMI; |
| } |
| |
| void PreRARematStage::commitRematerializations() const { |
|   REMAT_DEBUG(dbgs() << "Committing all rematerializations\n"); |
| for (const RollbackInfo &Rollback : Rollbacks) |
| DAG.deleteMI(Rollback.Remat->DefRegion, Rollback.Remat->DefMI); |
| } |
| |
| void PreRARematStage::unsetSatisfiedRPTargets(const BitVector &Regions) { |
| for (unsigned I : Regions.set_bits()) { |
| if (TargetRegions[I] && RPTargets[I].satisfied()) { |
| REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n"); |
| TargetRegions.reset(I); |
| } |
| } |
| } |
| |
| bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) { |
| bool TooOptimistic = false; |
| for (unsigned I : Regions.set_bits()) { |
| GCNRPTarget &Target = RPTargets[I]; |
| Target.setRP(DAG.getRealRegPressure(I)); |
| |
|     // Since we were optimistic in assessing RP decreases in these regions, |
|     // we may need to re-mark a region as a target region if RP did not |
|     // decrease as expected. |
| if (!TargetRegions[I] && !Target.satisfied()) { |
| REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n"); |
| TooOptimistic = true; |
| TargetRegions.set(I); |
| } |
| } |
| return TooOptimistic; |
| } |
| |
| // Copied from MachineLICM |
| bool PreRARematStage::isReMaterializable(const MachineInstr &MI) { |
| if (!DAG.TII->isReMaterializable(MI)) |
| return false; |
| |
| for (const MachineOperand &MO : MI.all_uses()) { |
|     // We can't remat physreg uses, unless the register is a constant or the |
|     // use is ignorable (e.g., the implicit exec use on VALU instructions). |
| if (MO.getReg().isPhysical()) { |
| if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO)) |
| continue; |
| return false; |
| } |
| } |
| |
| return true; |
| } |
| |
| void PreRARematStage::finalizeGCNSchedStage() { |
|   // We consider reducing spilling to always be beneficial, so we never roll |
|   // back rematerializations or revert scheduling in such cases. |
| if (!TargetOcc) |
| return; |
| |
|   // When increasing occupancy, re-scheduling may fail to achieve the target |
|   // occupancy in all regions. If the target was achieved everywhere, commit |
|   // the rematerializations and keep all new schedules. |
| if (DAG.MinOccupancy >= *TargetOcc) { |
| commitRematerializations(); |
| return; |
| } |
| |
|   // It is possible that re-scheduling lowers occupancy below the one |
|   // achieved through rematerializations alone, in which case we revert |
|   // re-scheduling in all regions but do not roll back rematerializations. |
| const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc; |
| |
|   // When we need to both revert re-scheduling and roll back |
|   // rematerializations, restore the rematerialized MIs' original state |
|   // before reverting so that the revert logic treats them as non-debug |
|   // instructions. |
| if (ShouldRollbackRemats) { |
| for (const RollbackInfo &Rollback : Rollbacks) { |
| const auto &[Remat, RematMI, RegMap] = Rollback; |
| Remat->DefMI->setDesc(DAG.TII->get(RematMI->getOpcode())); |
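|       // Restore the register operands recorded at rematerialization time. |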
| for (const auto &[MOIdx, Reg] : RegMap) |
| Remat->DefMI->getOperand(MOIdx).setReg(Reg); |
| } |
| } |
| |
| // Revert re-scheduling in all affected regions. |
| for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) { |
| REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx |
| << '\n'); |
| DAG.Pressure[RegionIdx] = MaxPressure; |
| modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder); |
| } |
| |
| if (!ShouldRollbackRemats) { |
| commitRematerializations(); |
| DAG.setTargetOccupancy(AchievedOcc); |
| return; |
| } |
| |
|   // Reset the target occupancy to its pre-rematerialization value; the |
|   // stage targeted one wave above it. |
| |
| // Finish rolling back rematerializations, then recompute pressure in all |
| // affected regions. |
| REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n"); |
| BitVector RecomputeRP(DAG.Regions.size()); |
| DenseSet<Register> RecomputeLI; |
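|   // For each rollback: reconnect the use to the original register, delete |
|   // the rematerialized MI, and restore the register's liveness in all |
|   // regions where it used to be live. |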
| for (const RollbackInfo &Rollback : Rollbacks) { |
| const auto &[Remat, RematMI, RegMap] = Rollback; |
| |
| // Switch back to using the original register and delete the |
| // rematerialization. |
| Register Reg = RematMI->getOperand(0).getReg(); |
| Register OriginalReg = Remat->DefMI->getOperand(0).getReg(); |
| Remat->UseMI->substituteRegister(Reg, OriginalReg, 0, *DAG.TRI); |
| REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion |
| << "] Deleting rematerialization " << *RematMI); |
| DAG.deleteMI(Remat->UseRegion, RematMI); |
| |
|     // Re-add the defined register to the live-ins/live-outs of all regions |
|     // in which it used to appear. |
| std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask); |
| for (unsigned I : Remat->LiveIn.set_bits()) |
| DAG.LiveIns[I].insert(LiveReg); |
| for (unsigned I : Remat->LiveOut.set_bits()) |
| DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg); |
| |
|     RecomputeRP |= Remat->Live; |
|     // Regenerate intervals for all virtual register operands of the |
|     // restored MIs, as slot indices may have changed slightly from before |
|     // re-scheduling. |
|     for (MachineOperand &MO : Remat->DefMI->operands()) { |
|       if (MO.isReg() && MO.getReg().isVirtual()) |
|         RecomputeLI.insert(MO.getReg()); |
|     } |
| } |
| for (Register Reg : RecomputeLI) { |
| DAG.LIS->removeInterval(Reg); |
| DAG.LIS->createAndComputeVirtRegInterval(Reg); |
| } |
| #ifdef EXPENSIVE_CHECKS |
| // In particular, we want to check for coherent MI/slot order in regions in |
| // which reverts and/or rollbacks may have happened. |
| MF.verify(); |
| #endif |
| for (unsigned I : RecomputeRP.set_bits()) |
| DAG.Pressure[I] = DAG.getRealRegPressure(I); |
| |
| GCNSchedStage::finalizeGCNSchedStage(); |
| } |
| |
| void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) { |
|   // It's not possible for the deleted instruction to be the upper region |
|   // boundary, since we never delete region terminators. |
| if (Regions[RegionIdx].first == MI) |
| Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI)); |
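|   // Deleted MIs are rematerializations or their original definitions; in |
|   // both cases the single register def is operand 0. |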
| LIS->removeInterval(MI->getOperand(0).getReg()); |
| LIS->RemoveMachineInstrFromMaps(*MI); |
| MI->eraseFromParent(); |
| } |
| |
| void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) { |
| MinOccupancy = TargetOccupancy; |
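|   // Propagate the new occupancy to the function info, raising or lowering |
|   // its current value as needed. |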
| if (MFI.getOccupancy() < TargetOccupancy) |
| MFI.increaseOccupancy(MF, MinOccupancy); |
| else |
| MFI.limitOccupancy(MinOccupancy); |
| } |
| |
| static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) { |
| const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII); |
| return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) { |
| return SII->isIGLPMutationOnly(MI->getOpcode()); |
| }); |
| } |
| |
| GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive( |
| MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S, |
| bool RemoveKillFlags) |
| : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {} |
| |
| void GCNPostScheduleDAGMILive::schedule() { |
| HasIGLPInstrs = hasIGLPInstrs(this); |
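|   // Schedule with only the IGroupLP mutation active; stash the current |
|   // mutation set so finalizeSchedule can restore it afterwards. |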
| if (HasIGLPInstrs) { |
| SavedMutations.clear(); |
| SavedMutations.swap(Mutations); |
| addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); |
| } |
| |
| ScheduleDAGMI::schedule(); |
| } |
| |
| void GCNPostScheduleDAGMILive::finalizeSchedule() { |
| if (HasIGLPInstrs) |
| SavedMutations.swap(Mutations); |
| |
| ScheduleDAGMI::finalizeSchedule(); |
| } |