//=== AArch64PostSelectOptimize.cpp ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does post-instruction-selection optimizations in the GlobalISel
// pipeline, before the rest of codegen runs.
//
//===----------------------------------------------------------------------===//

#include "AArch64.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"

#define DEBUG_TYPE "aarch64-post-select-optimize"

using namespace llvm;

namespace {
class AArch64PostSelectOptimize : public MachineFunctionPass {
public:
  static char ID;

  AArch64PostSelectOptimize();

  StringRef getPassName() const override {
    return "AArch64 Post Select Optimizer";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool optimizeNZCVDefs(MachineBasicBlock &MBB);
  bool doPeepholeOpts(MachineBasicBlock &MBB);
  /// Look for cross regclass copies that can be trivially eliminated.
  bool foldSimpleCrossClassCopies(MachineInstr &MI);
  bool foldCopyDup(MachineInstr &MI);
};
} // end anonymous namespace

void AArch64PostSelectOptimize::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  MachineFunctionPass::getAnalysisUsage(AU);
}

AArch64PostSelectOptimize::AArch64PostSelectOptimize()
    : MachineFunctionPass(ID) {
  initializeAArch64PostSelectOptimizePass(*PassRegistry::getPassRegistry());
}

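/// Return the opcode of the non-flag-setting variant of the flag-setting
/// opcode \p Opc (e.g. ADDSWrr -> ADDWrr), or 0 if no such variant exists.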
static unsigned getNonFlagSettingVariant(unsigned Opc) {
  switch (Opc) {
  default:
    return 0;
  case AArch64::SUBSXrr:
    return AArch64::SUBXrr;
  case AArch64::SUBSWrr:
    return AArch64::SUBWrr;
  case AArch64::SUBSXrs:
    return AArch64::SUBXrs;
  case AArch64::SUBSWrs:
    return AArch64::SUBWrs;
  case AArch64::SUBSXri:
    return AArch64::SUBXri;
  case AArch64::SUBSWri:
    return AArch64::SUBWri;
  case AArch64::ADDSXrr:
    return AArch64::ADDXrr;
  case AArch64::ADDSWrr:
    return AArch64::ADDWrr;
  case AArch64::ADDSXrs:
    return AArch64::ADDXrs;
  case AArch64::ADDSWrs:
    return AArch64::ADDWrs;
  case AArch64::ADDSXri:
    return AArch64::ADDXri;
  case AArch64::ADDSWri:
    return AArch64::ADDWri;
  case AArch64::SBCSXr:
    return AArch64::SBCXr;
  case AArch64::SBCSWr:
    return AArch64::SBCWr;
  case AArch64::ADCSXr:
    return AArch64::ADCXr;
  case AArch64::ADCSWr:
    return AArch64::ADCWr;
  }
}

bool AArch64PostSelectOptimize::doPeepholeOpts(MachineBasicBlock &MBB) {
  bool Changed = false;
  for (auto &MI : make_early_inc_range(MBB)) {
    bool CurrentIterChanged = foldSimpleCrossClassCopies(MI);
    if (!CurrentIterChanged)
      CurrentIterChanged |= foldCopyDup(MI);
    Changed |= CurrentIterChanged;
  }
  return Changed;
}

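// A sketch of the kind of fold done below (register classes chosen purely
// for illustration):
//   %src:gpr64 = LDRXui %ptr, 0
//   %dst:gpr64common = COPY %src
// If the COPY is the only use of %src, constraining %src to gpr64common lets
// us erase the COPY and use %src directly everywhere %dst was used.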
bool AArch64PostSelectOptimize::foldSimpleCrossClassCopies(MachineInstr &MI) {
  auto *MF = MI.getMF();
  auto &MRI = MF->getRegInfo();

  if (!MI.isCopy())
    return false;

  if (MI.getOperand(1).getSubReg())
    return false; // Don't deal with subreg copies.

  Register Src = MI.getOperand(1).getReg();
  Register Dst = MI.getOperand(0).getReg();

  if (Src.isPhysical() || Dst.isPhysical())
    return false;

  const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
  const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);

  if (SrcRC == DstRC)
    return false;

  if (SrcRC->hasSubClass(DstRC)) {
    // This is the case where the source class is a superclass of the dest, so
    // if the copy is the only user of the source, we can just constrain the
    // source reg to the dest class.

    if (!MRI.hasOneNonDBGUse(Src))
      return false; // Only constrain single uses of the source.

    // Constrain to dst reg class as long as it's not a weird class that only
    // has a few registers.
    if (!MRI.constrainRegClass(Src, DstRC, /* MinNumRegs */ 25))
      return false;
  } else if (DstRC->hasSubClass(SrcRC)) {
    // This is the inverse case, where the destination class is a superclass
    // of the source. Every register in the source class is also valid for the
    // destination class, so all users of the copy can use the source register
    // directly.
  } else {
    return false;
  }

  MRI.replaceRegWith(Dst, Src);
  MI.eraseFromParent();
  return true;
}

bool AArch64PostSelectOptimize::foldCopyDup(MachineInstr &MI) {
  if (!MI.isCopy())
    return false;

  auto *MF = MI.getMF();
  auto &MRI = MF->getRegInfo();
  auto *TII = MF->getSubtarget().getInstrInfo();

  // Optimize COPY(y:GPR, DUP(x:FPR, i)) -> UMOV(y:GPR, x:FPR, i).
  // Here Dst is y and Src is the result of DUP.
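  // For example (a sketch; %vec is the FPR128 vector the DUP reads from):
  //   %src:fpr32 = DUPi32 %vec:fpr128, 1
  //   %dst:gpr32 = COPY %src
  // becomes:
  //   %dst:gpr32 = UMOVvi32 %vec:fpr128, 1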
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  if (!Dst.isVirtual() || !Src.isVirtual())
    return false;

  auto TryMatchDUP = [&](const TargetRegisterClass *GPRRegClass,
                         const TargetRegisterClass *FPRRegClass, unsigned DUP,
                         unsigned UMOV) {
    if (MRI.getRegClassOrNull(Dst) != GPRRegClass ||
        MRI.getRegClassOrNull(Src) != FPRRegClass)
      return false;

    // There is a special case when one of the uses is COPY(z:FPR, y:GPR).
    // In this case, we get COPY(z:FPR, COPY(y:GPR, DUP(x:FPR, i))), which can
    // be folded by peephole-opt into just DUP(z:FPR, i), so this transform is
    // not worthwhile in that case.
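    // A sketch of that pattern, which is better left to peephole-opt:
    //   %y:gpr32 = COPY %dup:fpr32  (this COPY)
    //   %z:fpr32 = COPY %y          (round-trips back to FPR)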
    for (auto &Use : MRI.use_nodbg_instructions(Dst)) {
      if (!Use.isCopy())
        continue;

      Register UseOp0 = Use.getOperand(0).getReg();
      Register UseOp1 = Use.getOperand(1).getReg();
      if (UseOp0.isPhysical() || UseOp1.isPhysical())
        return false;

      if (MRI.getRegClassOrNull(UseOp0) == FPRRegClass &&
          MRI.getRegClassOrNull(UseOp1) == GPRRegClass)
        return false;
    }

    MachineInstr *SrcMI = MRI.getUniqueVRegDef(Src);
    if (!SrcMI || SrcMI->getOpcode() != DUP || !MRI.hasOneNonDBGUse(Src))
      return false;

    Register DupSrc = SrcMI->getOperand(1).getReg();
    int64_t DupImm = SrcMI->getOperand(2).getImm();

    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(UMOV), Dst)
        .addReg(DupSrc)
        .addImm(DupImm);
    SrcMI->eraseFromParent();
    MI.eraseFromParent();
    return true;
  };

  return TryMatchDUP(&AArch64::GPR32RegClass, &AArch64::FPR32RegClass,
                     AArch64::DUPi32, AArch64::UMOVvi32) ||
         TryMatchDUP(&AArch64::GPR64RegClass, &AArch64::FPR64RegClass,
                     AArch64::DUPi64, AArch64::UMOVvi64);
}

bool AArch64PostSelectOptimize::optimizeNZCVDefs(MachineBasicBlock &MBB) {
  // If we find a dead NZCV implicit-def, we
  // - try to convert the operation to a non-flag-setting equivalent, or
  // - mark the def as dead to aid later peephole optimizations.

  // Use cases:
  // 1)
  // Consider the following code:
  //   FCMPSrr %0, %1, implicit-def $nzcv
  //   %sel1:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  //   %sub:gpr32 = SUBSWrr %_, %_, implicit-def $nzcv
  //   FCMPSrr %0, %1, implicit-def $nzcv
  //   %sel2:gpr32 = CSELWr %_, %_, 12, implicit $nzcv
  // This kind of code, where two FCMPs each feed a CSEL, can happen when a
  // single IR fcmp is used by two selects. During selection, to ensure that
  // there can be no clobbering of NZCV between the fcmp and the csel, we have
  // to generate an fcmp immediately before each csel is selected.
  // MachineCSE can often CSE the two FCMPs together later, but not if
  // unrelated flag-setting instructions sit between them. Here, the SUBS
  // defines NZCV, but that def has no users, being overwritten by the second
  // FCMP.
  //
  // 2)
  // The instruction selector always emits the flag-setting variant of ADC/SBC
  // while selecting G_UADDE/G_SADDE/G_USUBE/G_SSUBE. If the carry-out of
  // these instructions is never used, we can switch to the non-flag-setting
  // variant.
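  //
  // For example, if the implicit NZCV def of this ADCS is dead:
  //   %r:gpr64 = ADCSXr %a, %b, implicit-def $nzcv, implicit $nzcv
  // it can be rewritten as:
  //   %r:gpr64 = ADCXr %a, %b, implicit $nzcv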

  bool Changed = false;
  auto &MF = *MBB.getParent();
  auto &Subtarget = MF.getSubtarget();
  const auto &TII = Subtarget.getInstrInfo();
  auto TRI = Subtarget.getRegisterInfo();
  auto RBI = Subtarget.getRegBankInfo();
  auto &MRI = MF.getRegInfo();

  LiveRegUnits LRU(*TRI);
  LRU.addLiveOuts(MBB);

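  // Walk the block bottom-up, tracking liveness with LiveRegUnits so that at
  // each instruction we know whether NZCV is live after it.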
  for (auto &II : instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
    bool NZCVDead = LRU.available(AArch64::NZCV);
    if (NZCVDead && II.definesRegister(AArch64::NZCV, /*TRI=*/nullptr)) {
      // The instruction defines NZCV, but NZCV is dead.
      unsigned NewOpc = getNonFlagSettingVariant(II.getOpcode());
      int DeadNZCVIdx =
          II.findRegisterDefOperandIdx(AArch64::NZCV, /*TRI=*/nullptr);
      if (DeadNZCVIdx != -1) {
        if (NewOpc) {
          // If there is an equivalent non-flag-setting op, we convert.
          LLVM_DEBUG(dbgs() << "Post-select optimizer: converting flag-setting "
                               "op: "
                            << II);
          II.setDesc(TII->get(NewOpc));
          II.removeOperand(DeadNZCVIdx);
          // Changing the opcode can result in differing regclass requirements,
          // e.g. SUBSWri uses gpr32 for the dest, whereas SUBWri uses gpr32sp.
          // Constrain the regclasses, possibly introducing a copy.
          constrainOperandRegClass(MF, *TRI, MRI, *TII, *RBI, II, II.getDesc(),
                                   II.getOperand(0), 0);
          Changed = true;
        } else {
          // Otherwise, we just set the nzcv imp-def operand to be dead, so
          // later peephole optimizations can optimize it further.
          II.getOperand(DeadNZCVIdx).setIsDead();
        }
      }
    }
    LRU.stepBackward(II);
  }
  return Changed;
}

bool AArch64PostSelectOptimize::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  assert(MF.getProperties().hasProperty(
             MachineFunctionProperties::Property::Selected) &&
         "Expected a selected MF");

  bool Changed = false;
  for (auto &BB : MF) {
    Changed |= optimizeNZCVDefs(BB);
    Changed |= doPeepholeOpts(BB);
  }
  return Changed;
}

char AArch64PostSelectOptimize::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64PostSelectOptimize, DEBUG_TYPE,
                      "Optimize AArch64 selected instructions", false, false)
INITIALIZE_PASS_END(AArch64PostSelectOptimize, DEBUG_TYPE,
                    "Optimize AArch64 selected instructions", false, false)

namespace llvm {
FunctionPass *createAArch64PostSelectOptimize() {
  return new AArch64PostSelectOptimize();
}
} // end namespace llvm