//===-- MVETPAndVPTOptimisationsPass.cpp ----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass does a few optimisations related to Tail predicated loops
/// and MVE VPT blocks before register allocation is performed. For VPT blocks
/// the goal is to maximize the sizes of the blocks that will be created by the
/// MVE VPT Block Insertion pass (which runs after register allocation). For
/// tail predicated loops we transform the loop into something that will
/// hopefully make the backend ARMLowOverheadLoops pass's job easier.
///
//===----------------------------------------------------------------------===//

|  | #include "ARM.h" | 
|  | #include "ARMSubtarget.h" | 
|  | #include "MCTargetDesc/ARMBaseInfo.h" | 
|  | #include "MVETailPredUtils.h" | 
|  | #include "Thumb2InstrInfo.h" | 
|  | #include "llvm/ADT/SmallVector.h" | 
|  | #include "llvm/CodeGen/MachineBasicBlock.h" | 
|  | #include "llvm/CodeGen/MachineDominators.h" | 
|  | #include "llvm/CodeGen/MachineFunction.h" | 
|  | #include "llvm/CodeGen/MachineFunctionPass.h" | 
|  | #include "llvm/CodeGen/MachineInstr.h" | 
|  | #include "llvm/CodeGen/MachineLoopInfo.h" | 
|  | #include "llvm/InitializePasses.h" | 
|  | #include "llvm/Support/Debug.h" | 
|  | #include <cassert> | 
|  |  | 
|  | using namespace llvm; | 
|  |  | 
|  | #define DEBUG_TYPE "arm-mve-vpt-opts" | 
|  |  | 
|  | static cl::opt<bool> | 
|  | MergeEndDec("arm-enable-merge-loopenddec", cl::Hidden, | 
|  | cl::desc("Enable merging Loop End and Dec instructions."), | 
|  | cl::init(true)); | 
|  |  | 
|  | static cl::opt<bool> | 
|  | SetLRPredicate("arm-set-lr-predicate", cl::Hidden, | 
|  | cl::desc("Enable setting lr as a predicate in tail predication regions."), | 
|  | cl::init(true)); | 
|  |  | 
namespace {
class MVETPAndVPTOptimisations : public MachineFunctionPass {
public:
  static char ID;
  const Thumb2InstrInfo *TII;
  MachineRegisterInfo *MRI;

  MVETPAndVPTOptimisations() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &Fn) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfo>();
    AU.addPreserved<MachineLoopInfo>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return "ARM MVE TailPred and VPT Optimisation Pass";
  }

private:
  bool LowerWhileLoopStart(MachineLoop *ML);
  bool MergeLoopEnd(MachineLoop *ML);
  bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT);
  MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
                                            MachineInstr &Instr,
                                            MachineOperand &User,
                                            Register Target);
  bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
  bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
  bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
  bool ConvertVPSEL(MachineBasicBlock &MBB);
  bool HintDoLoopStartReg(MachineBasicBlock &MBB);
  MachineInstr *CheckForLRUseInPredecessors(MachineBasicBlock *PreHeader,
                                            MachineInstr *LoopStart);
};

char MVETPAndVPTOptimisations::ID = 0;

} // end anonymous namespace

INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
                      "ARM MVE TailPred and VPT Optimisations pass", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
                    "ARM MVE TailPred and VPT Optimisations pass", false, false)

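// Walk through any COPYs of virtual registers to find the instruction that
// originally defined the value, so the loop components below can be matched
// even when register copies sit between them.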
static MachineInstr *LookThroughCOPY(MachineInstr *MI,
                                     MachineRegisterInfo *MRI) {
  while (MI && MI->getOpcode() == TargetOpcode::COPY &&
         MI->getOperand(1).getReg().isVirtual())
    MI = MRI->getVRegDef(MI->getOperand(1).getReg());
  return MI;
}

// Given a loop ML, this attempts to find the t2LoopEnd, t2LoopDec and
// corresponding PHI that make up a low overhead loop, returning the loop
// start instruction (a t2DoLoopStart, t2WhileLoopSetup or t2WhileLoopStartLR)
// in LoopStart.
static bool findLoopComponents(MachineLoop *ML, MachineRegisterInfo *MRI,
                               MachineInstr *&LoopStart, MachineInstr *&LoopPhi,
                               MachineInstr *&LoopDec, MachineInstr *&LoopEnd) {
  MachineBasicBlock *Header = ML->getHeader();
  MachineBasicBlock *Latch = ML->getLoopLatch();
  if (!Header || !Latch) {
    LLVM_DEBUG(dbgs() << "  no Loop Latch or Header\n");
    return false;
  }

  // Find the loop end from the terminators.
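  // (For t2LoopEnd the target block is operand 1; t2LoopEndDec also defines
  // the decremented counter, so its target block is operand 2.)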
  LoopEnd = nullptr;
  for (auto &T : Latch->terminators()) {
    if (T.getOpcode() == ARM::t2LoopEnd && T.getOperand(1).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
    if (T.getOpcode() == ARM::t2LoopEndDec &&
        T.getOperand(2).getMBB() == Header) {
      LoopEnd = &T;
      break;
    }
  }
  if (!LoopEnd) {
    LLVM_DEBUG(dbgs() << "  no LoopEnd\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop end: " << *LoopEnd);

  // Find the dec from the use of the end. There may be copies between
  // instructions. We expect the loop to look like:
  //   $vs = t2DoLoopStart ...
  // loop:
  //   $vp = phi [ $vs ], [ $vd ]
  //   ...
  //   $vd = t2LoopDec $vp
  //   ...
  //   t2LoopEnd $vd, loop
  if (LoopEnd->getOpcode() == ARM::t2LoopEndDec)
    LoopDec = LoopEnd;
  else {
    LoopDec =
        LookThroughCOPY(MRI->getVRegDef(LoopEnd->getOperand(0).getReg()), MRI);
    if (!LoopDec || LoopDec->getOpcode() != ARM::t2LoopDec) {
      LLVM_DEBUG(dbgs() << "  didn't find LoopDec where we expected!\n");
      return false;
    }
  }
  LLVM_DEBUG(dbgs() << "  found loop dec: " << *LoopDec);

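  // A PHI here has five operands: the result, then (value, block) pairs for
  // each predecessor. The loop-carried value (the LoopDec result) comes in
  // from the latch; the other incoming value should trace back to LoopStart.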
  LoopPhi =
      LookThroughCOPY(MRI->getVRegDef(LoopDec->getOperand(1).getReg()), MRI);
  if (!LoopPhi || LoopPhi->getOpcode() != TargetOpcode::PHI ||
      LoopPhi->getNumOperands() != 5 ||
      (LoopPhi->getOperand(2).getMBB() != Latch &&
       LoopPhi->getOperand(4).getMBB() != Latch)) {
    LLVM_DEBUG(dbgs() << "  didn't find PHI where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop phi: " << *LoopPhi);

  Register StartReg = LoopPhi->getOperand(2).getMBB() == Latch
                          ? LoopPhi->getOperand(3).getReg()
                          : LoopPhi->getOperand(1).getReg();
  LoopStart = LookThroughCOPY(MRI->getVRegDef(StartReg), MRI);
  if (!LoopStart || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
                     LoopStart->getOpcode() != ARM::t2WhileLoopSetup &&
                     LoopStart->getOpcode() != ARM::t2WhileLoopStartLR)) {
    LLVM_DEBUG(dbgs() << "  didn't find Start where we expected!\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "  found loop start: " << *LoopStart);

  return true;
}

static void RevertWhileLoopSetup(MachineInstr *MI, const TargetInstrInfo *TII) {
  MachineBasicBlock *MBB = MI->getParent();
  assert(MI->getOpcode() == ARM::t2WhileLoopSetup &&
         "Only expected a t2WhileLoopSetup in RevertWhileLoopSetup!");

  // Replace the t2WhileLoopSetup with a flag-setting subtract of zero, so the
  // branch below can test whether the count is zero.
  MachineInstrBuilder MIB =
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri));
  MIB.add(MI->getOperand(0));
  MIB.add(MI->getOperand(1));
  MIB.addImm(0);
  MIB.addImm(ARMCC::AL);
  MIB.addReg(ARM::NoRegister);
  MIB.addReg(ARM::CPSR, RegState::Define);

  // Attempt to find a t2WhileLoopStart and revert to a t2Bcc.
  for (MachineInstr &I : MBB->terminators()) {
    if (I.getOpcode() == ARM::t2WhileLoopStart) {
      MachineInstrBuilder MIB =
          BuildMI(*MBB, &I, I.getDebugLoc(), TII->get(ARM::t2Bcc));
      MIB.add(I.getOperand(1)); // branch target
      MIB.addImm(ARMCC::EQ);
      MIB.addReg(ARM::CPSR);
      I.eraseFromParent();
      break;
    }
  }

  MI->eraseFromParent();
}

// The Hardware Loop insertion and ISel Lowering produce the pseudos for the
// start of a while loop:
//   %a:gprlr = t2WhileLoopSetup %Cnt
//   t2WhileLoopStart %a, %BB
// We want to convert those to a single instruction which, like t2LoopEndDec
// and t2DoLoopStartTP, is both a terminator and produces a value:
//   %a:gprlr = t2WhileLoopStartLR %Cnt, %BB
//
// Otherwise if we can't, we revert the loop. t2WhileLoopSetup and
// t2WhileLoopStart are not valid past regalloc.
bool MVETPAndVPTOptimisations::LowerWhileLoopStart(MachineLoop *ML) {
  LLVM_DEBUG(dbgs() << "LowerWhileLoopStart on loop "
                    << ML->getHeader()->getName() << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  if (LoopStart->getOpcode() != ARM::t2WhileLoopSetup)
    return false;

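  // Find the t2WhileLoopStart that branches on the count produced by the
  // setup. If there is none, or merging is disabled, fall back to a plain
  // compare-and-branch loop.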
  Register LR = LoopStart->getOperand(0).getReg();
  auto WLSIt = find_if(MRI->use_nodbg_instructions(LR), [](auto &MI) {
    return MI.getOpcode() == ARM::t2WhileLoopStart;
  });
  if (!MergeEndDec || WLSIt == MRI->use_instr_nodbg_end()) {
    RevertWhileLoopSetup(LoopStart, TII);
    RevertLoopDec(LoopDec, TII);
    RevertLoopEnd(LoopEnd, TII);
    return true;
  }

  MachineInstrBuilder MI =
      BuildMI(*WLSIt->getParent(), *WLSIt, WLSIt->getDebugLoc(),
              TII->get(ARM::t2WhileLoopStartLR), LR)
          .add(LoopStart->getOperand(1))
          .add(WLSIt->getOperand(1));
  (void)MI;
  LLVM_DEBUG(dbgs() << "Lowered WhileLoopStart into: " << *MI.getInstr());

  WLSIt->eraseFromParent();
  LoopStart->eraseFromParent();
  return true;
}

// Return true if this instruction is invalid in a low overhead loop, usually
// because it clobbers LR.
static bool IsInvalidTPInstruction(MachineInstr &MI) {
  return MI.isCall() || isLoopStart(MI);
}

// Starting from PreHeader, search for invalid instructions back until the
// LoopStart block is reached. If invalid instructions are found, the loop
// start is reverted from a WhileLoopStart to a DoLoopStart on the same loop.
// Will return the new DLS LoopStart if updated.
MachineInstr *MVETPAndVPTOptimisations::CheckForLRUseInPredecessors(
    MachineBasicBlock *PreHeader, MachineInstr *LoopStart) {
  SmallVector<MachineBasicBlock *> Worklist;
  SmallPtrSet<MachineBasicBlock *, 4> Visited;
  Worklist.push_back(PreHeader);
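  // Pre-mark the block containing the LoopStart as visited so the backwards
  // walk stops once it reaches it.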
  Visited.insert(LoopStart->getParent());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();
    if (Visited.count(MBB))
      continue;

    for (MachineInstr &MI : *MBB) {
      if (!IsInvalidTPInstruction(MI))
        continue;

      LLVM_DEBUG(dbgs() << "Found LR use in predecessors, reverting: " << MI);

      // Create a t2DoLoopStart at the end of the preheader.
      MachineInstrBuilder MIB =
          BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
                  LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
      MIB.add(LoopStart->getOperand(0));
      MIB.add(LoopStart->getOperand(1));

      // Make sure to remove the kill flags, to prevent them from being invalid.
      LoopStart->getOperand(1).setIsKill(false);

      // Revert the t2WhileLoopStartLR to a CMP and Br.
      RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
      return MIB;
    }

    Visited.insert(MBB);
    for (auto *Pred : MBB->predecessors())
      Worklist.push_back(Pred);
  }
  return LoopStart;
}

// This function converts loops with t2LoopDec and t2LoopEnd instructions into
// a single t2LoopEndDec instruction. To do that it needs to make sure that LR
// will be valid to be used for the low overhead loop, which means nothing else
// is using LR (especially calls) and there are no superfluous copies in the
// loop. The t2LoopEndDec is a branching terminator that produces a value (the
// decrement) around the loop edge, which means we need to be careful that the
// values involved will be valid to allocate without any spilling.
bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) {
  if (!MergeEndDec)
    return false;

  LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName()
                    << "\n");

  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;

  // Check if there is an illegal instruction (a call) in the low overhead loop
  // and if so revert it now before we get any further. While loops also need
  // to check the preheaders, but can be reverted to a DLS loop if needed.
  auto *PreHeader = ML->getLoopPreheader();
  if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader)
    LoopStart = CheckForLRUseInPredecessors(PreHeader, LoopStart);

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (IsInvalidTPInstruction(MI)) {
        LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
        if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
          RevertDoLoopStart(LoopStart, TII);
        else
          RevertWhileLoopStartLR(LoopStart, TII);
        RevertLoopDec(LoopDec, TII);
        RevertLoopEnd(LoopEnd, TII);
        return true;
      }
    }
  }

  // Remove any copies from the loop, to ensure the phi that remains is both
  // simpler and contains no extra uses. Because t2LoopEndDec is a terminator
  // that cannot spill, we need to be careful what remains in the loop.
  Register PhiReg = LoopPhi->getOperand(0).getReg();
  Register DecReg = LoopDec->getOperand(0).getReg();
  Register StartReg = LoopStart->getOperand(0).getReg();
  // Ensure the uses are expected, and collect any copies we want to remove.
  SmallVector<MachineInstr *, 4> Copies;
  auto CheckUsers = [&Copies](Register BaseReg,
                              ArrayRef<MachineInstr *> ExpectedUsers,
                              MachineRegisterInfo *MRI) {
    SmallVector<Register, 4> Worklist;
    Worklist.push_back(BaseReg);
    while (!Worklist.empty()) {
      Register Reg = Worklist.pop_back_val();
      for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
        if (count(ExpectedUsers, &MI))
          continue;
        if (MI.getOpcode() != TargetOpcode::COPY ||
            !MI.getOperand(0).getReg().isVirtual()) {
          LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI);
          return false;
        }
        Worklist.push_back(MI.getOperand(0).getReg());
        Copies.push_back(&MI);
      }
    }
    return true;
  };
  if (!CheckUsers(PhiReg, {LoopDec}, MRI) ||
      !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) ||
      !CheckUsers(StartReg, {LoopPhi}, MRI)) {
    // Don't leave a t2WhileLoopStartLR without the t2LoopEndDec.
    if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR) {
      RevertWhileLoopStartLR(LoopStart, TII);
      RevertLoopDec(LoopDec, TII);
      RevertLoopEnd(LoopEnd, TII);
      return true;
    }
    return false;
  }

  MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass);
  MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass);

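  // Rewire the PHI to take StartReg and DecReg directly (both now constrained
  // to the LR-only register class), whichever operand order the PHI uses.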
  if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) {
    LoopPhi->getOperand(3).setReg(StartReg);
    LoopPhi->getOperand(1).setReg(DecReg);
  } else {
    LoopPhi->getOperand(1).setReg(StartReg);
    LoopPhi->getOperand(3).setReg(DecReg);
  }

  // Replace the loop dec and loop end with a single instruction.
  MachineInstrBuilder MI =
      BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
              TII->get(ARM::t2LoopEndDec), DecReg)
          .addReg(PhiReg)
          .add(LoopEnd->getOperand(1));
  (void)MI;
  LLVM_DEBUG(dbgs() << "Merged LoopDec and End into: " << *MI.getInstr());

  LoopDec->eraseFromParent();
  LoopEnd->eraseFromParent();
  for (auto *MI : Copies)
    MI->eraseFromParent();
  return true;
}

// Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP
// instructions. This keeps the VCTP count reg operand on the t2DoLoopStartTP
// instruction, making the backend ARMLowOverheadLoops pass's job of finding
// the VCTP operand much simpler.
bool MVETPAndVPTOptimisations::ConvertTailPredLoop(MachineLoop *ML,
                                                   MachineDominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "ConvertTailPredLoop on loop "
                    << ML->getHeader()->getName() << "\n");

  // Find some loop components including the LoopEnd/Dec/Start, and any VCTPs
  // in the loop.
  MachineInstr *LoopEnd, *LoopPhi, *LoopStart, *LoopDec;
  if (!findLoopComponents(ML, MRI, LoopStart, LoopPhi, LoopDec, LoopEnd))
    return false;
  if (LoopDec != LoopEnd || (LoopStart->getOpcode() != ARM::t2DoLoopStart &&
                             LoopStart->getOpcode() != ARM::t2WhileLoopStartLR))
    return false;

  SmallVector<MachineInstr *, 4> VCTPs;
  SmallVector<MachineInstr *, 4> MVEInstrs;
  for (MachineBasicBlock *BB : ML->blocks()) {
    for (MachineInstr &MI : *BB)
      if (isVCTP(&MI))
        VCTPs.push_back(&MI);
      else if (findFirstVPTPredOperandIdx(MI) != -1)
        MVEInstrs.push_back(&MI);
  }

  if (VCTPs.empty()) {
    LLVM_DEBUG(dbgs() << "  no VCTPs\n");
    return false;
  }

  // Check all VCTPs are the same.
  MachineInstr *FirstVCTP = *VCTPs.begin();
  for (MachineInstr *VCTP : VCTPs) {
    LLVM_DEBUG(dbgs() << "  with VCTP " << *VCTP);
    if (VCTP->getOpcode() != FirstVCTP->getOpcode() ||
        VCTP->getOperand(0).getReg() != FirstVCTP->getOperand(0).getReg()) {
      LLVM_DEBUG(dbgs() << "  VCTPs are not identical\n");
      return false;
    }
  }

  // Check that the register being used by the VCTP can be set up before the
  // loop. We expect this to be:
  //   $vx = ...
  // loop:
  //   $vp = PHI [ $vx ], [ $vd ]
  //   ..
  //   $vpr = VCTP $vp
  //   ..
  //   $vd = t2SUBri $vp, #n
  //   ..
  Register CountReg = FirstVCTP->getOperand(1).getReg();
  if (!CountReg.isVirtual()) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP PHI\n");
    return false;
  }
  MachineInstr *Phi = LookThroughCOPY(MRI->getVRegDef(CountReg), MRI);
  if (!Phi || Phi->getOpcode() != TargetOpcode::PHI ||
      Phi->getNumOperands() != 5 ||
      (Phi->getOperand(2).getMBB() != ML->getLoopLatch() &&
       Phi->getOperand(4).getMBB() != ML->getLoopLatch())) {
    LLVM_DEBUG(dbgs() << "  cannot determine VCTP Count\n");
    return false;
  }
  CountReg = Phi->getOperand(2).getMBB() == ML->getLoopLatch()
                 ? Phi->getOperand(3).getReg()
                 : Phi->getOperand(1).getReg();

  // Replace the t2DoLoopStart with the t2DoLoopStartTP, move it to the end of
  // the preheader and add the new CountReg to it. We attempt to place it late
  // in the preheader, but may need to move that earlier based on uses.
  MachineBasicBlock *MBB = LoopStart->getParent();
  MachineBasicBlock::iterator InsertPt = MBB->getFirstTerminator();
  for (MachineInstr &Use :
       MRI->use_instructions(LoopStart->getOperand(0).getReg()))
    if ((InsertPt != MBB->end() && !DT->dominates(&*InsertPt, &Use)) ||
        !DT->dominates(ML->getHeader(), Use.getParent())) {
      LLVM_DEBUG(dbgs() << "  InsertPt could not be a terminator!\n");
      return false;
    }

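  // Build the new start instruction at the chosen insert point, copying the
  // original operands and appending the element count register; the while-loop
  // form also carries the branch target as a third operand.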
  unsigned NewOpc = LoopStart->getOpcode() == ARM::t2DoLoopStart
                        ? ARM::t2DoLoopStartTP
                        : ARM::t2WhileLoopStartTP;
  MachineInstrBuilder MI =
      BuildMI(*MBB, InsertPt, LoopStart->getDebugLoc(), TII->get(NewOpc))
          .add(LoopStart->getOperand(0))
          .add(LoopStart->getOperand(1))
          .addReg(CountReg);
  if (NewOpc == ARM::t2WhileLoopStartTP)
    MI.add(LoopStart->getOperand(2));
  LLVM_DEBUG(dbgs() << "Replacing " << *LoopStart << "  with "
                    << *MI.getInstr());
  MRI->constrainRegClass(CountReg, &ARM::rGPRRegClass);
  LoopStart->eraseFromParent();

  if (SetLRPredicate) {
    // Each predicated MVE instruction in the loop needs to use LR, the count
    // value produced by the Phi, as its predicate operand.
    Register LR = LoopPhi->getOperand(0).getReg();
    for (MachineInstr *MI : MVEInstrs) {
      int Idx = findFirstVPTPredOperandIdx(*MI);
      MI->getOperand(Idx + 2).setReg(LR);
    }
  }

  return true;
}

// Returns true if Opcode is any VCMP Opcode.
static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }

// Returns true if a VCMP with this Opcode can have its operands swapped.
// There are two kinds of VCMP that can't have their operands swapped: float
// VCMPs, and VCMPr instructions (since the r is always on the right).
static bool CanHaveSwappedOperands(unsigned Opcode) {
  switch (Opcode) {
  default:
    return true;
  case ARM::MVE_VCMPf32:
  case ARM::MVE_VCMPf16:
  case ARM::MVE_VCMPf32r:
  case ARM::MVE_VCMPf16r:
  case ARM::MVE_VCMPi8r:
  case ARM::MVE_VCMPi16r:
  case ARM::MVE_VCMPi32r:
  case ARM::MVE_VCMPu8r:
  case ARM::MVE_VCMPu16r:
  case ARM::MVE_VCMPu32r:
  case ARM::MVE_VCMPs8r:
  case ARM::MVE_VCMPs16r:
  case ARM::MVE_VCMPs32r:
    return false;
  }
}

// Returns the CondCode of a VCMP Instruction.
static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
  assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
  return ARMCC::CondCodes(Instr.getOperand(3).getImm());
}

// Returns true if Cond is equivalent to a VPNOT instruction on the result of
// Prev. Cond and Prev must be VCMPs.
static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
  assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));

  // Opcodes must match.
  if (Cond.getOpcode() != Prev.getOpcode())
    return false;

  MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
  MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);

  // If the VCMP has the opposite condition with the same operands, we can
  // replace it with a VPNOT.
  ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
  ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
  if (ExpectedCode == GetCondCode(Prev))
    if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
      return true;
  // Check again with operands swapped if possible.
  if (!CanHaveSwappedOperands(Cond.getOpcode()))
    return false;
  ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
  return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) &&
         CondOP2.isIdenticalTo(PrevOP1);
}

// Returns true if Instr writes to VCCR.
static bool IsWritingToVCCR(MachineInstr &Instr) {
  if (Instr.getNumOperands() == 0)
    return false;
  MachineOperand &Dst = Instr.getOperand(0);
  if (!Dst.isReg())
    return false;
  Register DstReg = Dst.getReg();
  if (!DstReg.isVirtual())
    return false;
  MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
  const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
  return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
}

// Transforms
//    <Instr that uses %A ('User' Operand)>
// Into
//    %K = VPNOT %Target
//    <Instr that uses %K ('User' Operand)>
// And returns the newly inserted VPNOT.
// This optimization is done in the hopes of preventing spills/reloads of VPR by
// reducing the number of VCCR values with overlapping lifetimes.
MachineInstr &MVETPAndVPTOptimisations::ReplaceRegisterUseWithVPNOT(
    MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
    Register Target) {
  Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));

  MachineInstrBuilder MIBuilder =
      BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
          .addDef(NewResult)
          .addReg(Target);
  addUnpredicatedMveVpredNOp(MIBuilder);

  // Make the user use NewResult instead, and clear its kill flag.
  User.setReg(NewResult);
  User.setIsKill(false);

  LLVM_DEBUG(dbgs() << "  Inserting VPNOT (for spill prevention): ";
             MIBuilder.getInstr()->dump());

  return *MIBuilder.getInstr();
}

// Moves a VPNOT before its first user if an instruction that uses Reg is found
// in-between the VPNOT and its user.
// Returns true if there is at least one user of the VPNOT in the block.
static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator Iter,
                                     Register Reg) {
  assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
  assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
         "The VPNOT cannot be predicated");

  MachineInstr &VPNOT = *Iter;
  Register VPNOTResult = VPNOT.getOperand(0).getReg();
  Register VPNOTOperand = VPNOT.getOperand(1).getReg();

  // Whether the VPNOT will need to be moved, and whether we found a user of
  // the VPNOT.
  bool MustMove = false, HasUser = false;
  MachineOperand *VPNOTOperandKiller = nullptr;
  for (; Iter != MBB.end(); ++Iter) {
    if (MachineOperand *MO =
            Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
      // If we find the operand that kills the VPNOT's input register, save it.
      VPNOTOperandKiller = MO;
    }

    if (Iter->findRegisterUseOperandIdx(Reg) != -1) {
      MustMove = true;
      continue;
    }

    if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1)
      continue;

    HasUser = true;
    if (!MustMove)
      break;

    // Move the VPNOT right before Iter.
    LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << "  Before: ";
               Iter->dump());
    MBB.splice(Iter, &MBB, VPNOT.getIterator());
    // If we move the instr, and its operand was killed earlier, remove the kill
    // flag.
    if (VPNOTOperandKiller)
      VPNOTOperandKiller->setIsKill(false);

    break;
  }
  return HasUser;
}

// This optimisation attempts to reduce the number of overlapping lifetimes of
// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
// this replaces
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %Bar = (some op that uses %A)
// With
//    %A:vccr = (something)
//    %B:vccr = VPNOT %A
//    %Foo = (some op that uses %B)
//    %TMP2:vccr = VPNOT %B
//    %Bar = (some op that uses %TMP2)
bool MVETPAndVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
  MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
  SmallVector<MachineInstr *, 4> DeadInstructions;
  bool Modified = false;

  while (Iter != End) {
    Register VCCRValue, OppositeVCCRValue;
    // The first loop looks for 2 unpredicated instructions:
    //    %A:vccr = (instr)     ; A is stored in VCCRValue
    //    %B:vccr = VPNOT %A    ; B is stored in OppositeVCCRValue
    for (; Iter != End; ++Iter) {
      // We're only interested in unpredicated instructions that write to VCCR.
      if (!IsWritingToVCCR(*Iter) ||
          getVPTInstrPredicate(*Iter) != ARMVCC::None)
        continue;
      Register Dst = Iter->getOperand(0).getReg();

      // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
      // found what we were looking for.
      if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
          Iter->findRegisterUseOperandIdx(VCCRValue) != -1) {
        // Move the VPNOT closer to its first user if needed, and ignore it if
        // it has no users.
        if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
          continue;

        OppositeVCCRValue = Dst;
        ++Iter;
        break;
      }

      // Else, just set VCCRValue.
      VCCRValue = Dst;
    }

    // If the first inner loop didn't find anything, stop here.
    if (Iter == End)
      break;

    assert(VCCRValue && OppositeVCCRValue &&
           "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
           "stopped before the end of the block!");
    assert(VCCRValue != OppositeVCCRValue &&
           "VCCRValue should not be equal to OppositeVCCRValue!");

    // LastVPNOTResult always contains the same value as OppositeVCCRValue.
    Register LastVPNOTResult = OppositeVCCRValue;

    // This second loop tries to optimize the remaining instructions.
    for (; Iter != End; ++Iter) {
      bool IsInteresting = false;

      if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) {
        IsInteresting = true;

        // - If the instruction is a VPNOT, it can be removed, and we can just
        //   replace its uses with LastVPNOTResult.
        // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
        if (Iter->getOpcode() == ARM::MVE_VPNOT) {
          Register Result = Iter->getOperand(0).getReg();

          MRI->replaceRegWith(Result, LastVPNOTResult);
          DeadInstructions.push_back(&*Iter);
          Modified = true;

          LLVM_DEBUG(dbgs()
                     << "Replacing all uses of '" << printReg(Result)
                     << "' with '" << printReg(LastVPNOTResult) << "'\n");
        } else {
          MachineInstr &VPNOT =
              ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
          Modified = true;

          LastVPNOTResult = VPNOT.getOperand(0).getReg();
          std::swap(VCCRValue, OppositeVCCRValue);

          LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
                            << "' with '" << printReg(LastVPNOTResult)
                            << "' in instr: " << *Iter);
        }
      } else {
        // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
        // instead as they contain the same value.
        if (MachineOperand *MO =
                Iter->findRegisterUseOperand(OppositeVCCRValue)) {
          IsInteresting = true;

          // This is pointless if LastVPNOTResult == OppositeVCCRValue.
          if (LastVPNOTResult != OppositeVCCRValue) {
            LLVM_DEBUG(dbgs() << "Replacing usage of '"
                              << printReg(OppositeVCCRValue) << "' with '"
                              << printReg(LastVPNOTResult) << " for instr: ";
                       Iter->dump());
            MO->setReg(LastVPNOTResult);
            Modified = true;
          }

          MO->setIsKill(false);
        }

        // If this is an unpredicated VPNOT on
        // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
        if (Iter->getOpcode() == ARM::MVE_VPNOT &&
            getVPTInstrPredicate(*Iter) == ARMVCC::None) {
          Register VPNOTOperand = Iter->getOperand(1).getReg();
          if (VPNOTOperand == LastVPNOTResult ||
              VPNOTOperand == OppositeVCCRValue) {
            IsInteresting = true;

            std::swap(VCCRValue, OppositeVCCRValue);
            LastVPNOTResult = Iter->getOperand(0).getReg();
          }
        }
      }

      // If this instruction was not interesting, and it writes to VCCR, stop.
      if (!IsInteresting && IsWritingToVCCR(*Iter))
        break;
    }
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return Modified;
}

// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
bool MVETPAndVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
  SmallVector<MachineInstr *, 4> DeadInstructions;

  // The last VCMP that we have seen and that couldn't be replaced.
  // This is reset when an instruction that writes to VCCR/VPR is found, or when
  // a VCMP is replaced with a VPNOT.
  // We'll only replace VCMPs with VPNOTs when this is not null, and when the
  // current VCMP is the opposite of PrevVCMP.
  MachineInstr *PrevVCMP = nullptr;
  // If we find an instruction that kills the result of PrevVCMP, we save the
  // operand here to remove the kill flag in case we need to use PrevVCMP's
  // result.
  MachineOperand *PrevVCMPResultKiller = nullptr;

  for (MachineInstr &Instr : MBB.instrs()) {
    if (PrevVCMP) {
      if (MachineOperand *MO = Instr.findRegisterUseOperand(
              PrevVCMP->getOperand(0).getReg(), /*isKill*/ true)) {
        // If we come across the instr that kills PrevVCMP's result, record it
        // so we can remove the kill flag later if we need to.
        PrevVCMPResultKiller = MO;
      }
    }

    // Ignore predicated instructions.
    if (getVPTInstrPredicate(Instr) != ARMVCC::None)
      continue;

    // Only look at VCMPs.
    if (!IsVCMP(Instr.getOpcode())) {
      // If the instruction writes to VCCR, forget the previous VCMP.
      if (IsWritingToVCCR(Instr))
        PrevVCMP = nullptr;
      continue;
    }

    if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) {
      PrevVCMP = &Instr;
      continue;
    }

    // The register containing the result of the VCMP that we're going to
    // replace.
    Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg();

    // Build a VPNOT to replace the VCMP, reusing its operands.
    MachineInstrBuilder MIBuilder =
        BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
            .add(Instr.getOperand(0))
            .addReg(PrevVCMPResultReg);
    addUnpredicatedMveVpredNOp(MIBuilder);
    LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
               MIBuilder.getInstr()->dump(); dbgs() << "  Removed VCMP: ";
               Instr.dump());

    // If we found an instruction that uses, and kills PrevVCMP's result,
    // remove the kill flag.
    if (PrevVCMPResultKiller)
      PrevVCMPResultKiller->setIsKill(false);

    // Finally, mark the old VCMP for removal and reset
    // PrevVCMP/PrevVCMPResultKiller.
    DeadInstructions.push_back(&Instr);
    PrevVCMP = nullptr;
    PrevVCMPResultKiller = nullptr;
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}

bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB,
                                                    MachineDominatorTree *DT) {
  // Scan through the block, looking for instructions that use constant masks
  // moved into VPR that are the negation of one another. These are expected to
  // be COPYs to VCCRRegClass, from a t2MOVi or t2MOVi16. The last seen constant
  // mask is kept, and VPNOTs of it are added or reused as we scan through the
  // function.
  unsigned LastVPTImm = 0;
  Register LastVPTReg = 0;
  SmallSet<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &Instr : MBB.instrs()) {
    // Look for predicated MVE instructions.
    int PIdx = llvm::findFirstVPTPredOperandIdx(Instr);
    if (PIdx == -1)
      continue;
    Register VPR = Instr.getOperand(PIdx + 1).getReg();
    if (!VPR.isVirtual())
      continue;

    // From that we are looking for an instruction like %11:vccr = COPY %9:rgpr.
    MachineInstr *Copy = MRI->getVRegDef(VPR);
    if (!Copy || Copy->getOpcode() != TargetOpcode::COPY ||
        !Copy->getOperand(1).getReg().isVirtual() ||
        MRI->getRegClass(Copy->getOperand(1).getReg()) == &ARM::VCCRRegClass) {
      LastVPTReg = 0;
      continue;
    }
    Register GPR = Copy->getOperand(1).getReg();

    // Find the Immediate used by the copy.
    auto getImm = [&](Register GPR) -> unsigned {
      MachineInstr *Def = MRI->getVRegDef(GPR);
      if (Def && (Def->getOpcode() == ARM::t2MOVi ||
                  Def->getOpcode() == ARM::t2MOVi16))
        return Def->getOperand(1).getImm();
      return -1U;
    };
    unsigned Imm = getImm(GPR);
    if (Imm == -1U) {
      LastVPTReg = 0;
      continue;
    }

    unsigned NotImm = ~Imm & 0xffff;
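    // If the mask matches the last one seen, reuse that predicate register
    // directly; the copy (and its mov) can then be removed if nothing else
    // uses them.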
    if (LastVPTReg != 0 && LastVPTReg != VPR && LastVPTImm == Imm) {
      Instr.getOperand(PIdx + 1).setReg(LastVPTReg);
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Reusing predicate: in  " << Instr);
    } else if (LastVPTReg != 0 && LastVPTImm == NotImm) {
      // We have found the not of a previous constant. Create a VPNot of the
      // earlier predicate reg and use it instead of the copy.
      Register NewVPR = MRI->createVirtualRegister(&ARM::VCCRRegClass);
      auto VPNot = BuildMI(MBB, &Instr, Instr.getDebugLoc(),
                           TII->get(ARM::MVE_VPNOT), NewVPR)
                       .addReg(LastVPTReg);
      addUnpredicatedMveVpredNOp(VPNot);

      // Use the new register and check if the def is now dead.
      Instr.getOperand(PIdx + 1).setReg(NewVPR);
      if (MRI->use_empty(VPR)) {
        DeadInstructions.insert(Copy);
        if (MRI->hasOneUse(GPR))
          DeadInstructions.insert(MRI->getVRegDef(GPR));
      }
      LLVM_DEBUG(dbgs() << "Adding VPNot: " << *VPNot << "  to replace use at "
                        << Instr);
      VPR = NewVPR;
    }

    LastVPTImm = Imm;
    LastVPTReg = VPR;
  }

  for (MachineInstr *DI : DeadInstructions)
    DI->eraseFromParent();

  return !DeadInstructions.empty();
}

// Replace VPSEL with a predicated VMOV in blocks with a VCTP. This is a
// somewhat blunt approximation to allow tail predicated loops with vpsel
// instructions. We turn a vselect into a VPSEL in ISEL, but they have slightly
// different semantics under tail predication. Until that is modelled we just
// convert to a VMOVT (via a predicated VORR) instead.
bool MVETPAndVPTOptimisations::ConvertVPSEL(MachineBasicBlock &MBB) {
  bool HasVCTP = false;
  SmallVector<MachineInstr *, 4> DeadInstructions;

  for (MachineInstr &MI : MBB.instrs()) {
    if (isVCTP(&MI)) {
      HasVCTP = true;
      continue;
    }

    if (!HasVCTP || MI.getOpcode() != ARM::MVE_VPSEL)
      continue;

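    // Build a predicated VORR of operand 1 with itself (a VMOVT), keeping the
    // VPSEL's predicate operands and using its other input as the inactive
    // value.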
    MachineInstrBuilder MIBuilder =
        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(ARM::MVE_VORR))
            .add(MI.getOperand(0))
            .add(MI.getOperand(1))
            .add(MI.getOperand(1))
            .addImm(ARMVCC::Then)
            .add(MI.getOperand(4))
            .add(MI.getOperand(5))
            .add(MI.getOperand(2));
    // Silence unused variable warning in release builds.
    (void)MIBuilder;
    LLVM_DEBUG(dbgs() << "Replacing VPSEL: "; MI.dump();
               dbgs() << "     with VMOVT: "; MIBuilder.getInstr()->dump());
    DeadInstructions.push_back(&MI);
  }

  for (MachineInstr *DeadInstruction : DeadInstructions)
    DeadInstruction->eraseFromParent();

  return !DeadInstructions.empty();
}

// Add a register allocation hint for t2DoLoopStart to hint it towards LR, as
// the instruction may be removable as a noop.
bool MVETPAndVPTOptimisations::HintDoLoopStartReg(MachineBasicBlock &MBB) {
  bool Changed = false;
  for (MachineInstr &MI : MBB.instrs()) {
    if (MI.getOpcode() != ARM::t2DoLoopStart)
      continue;
    Register R = MI.getOperand(1).getReg();
    MachineFunction *MF = MI.getParent()->getParent();
    MF->getRegInfo().setRegAllocationHint(R, ARMRI::RegLR, 0);
    Changed = true;
  }
  return Changed;
}

bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
  const ARMSubtarget &STI =
      static_cast<const ARMSubtarget &>(Fn.getSubtarget());

  if (!STI.isThumb2() || !STI.hasLOB())
    return false;

  TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
  MRI = &Fn.getRegInfo();
  MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>();
  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();

  LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
                    << "********** Function: " << Fn.getName() << '\n');

  bool Modified = false;
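  // Run the loop-level transforms first, visiting loops in preorder so outer
  // loops are handled before the loops nested inside them, then do the
  // per-block VPT clean-ups.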
  for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) {
    Modified |= LowerWhileLoopStart(ML);
    Modified |= MergeLoopEnd(ML);
    Modified |= ConvertTailPredLoop(ML, DT);
  }

  for (MachineBasicBlock &MBB : Fn) {
    Modified |= HintDoLoopStartReg(MBB);
    Modified |= ReplaceConstByVPNOTs(MBB, DT);
    Modified |= ReplaceVCMPsByVPNOTs(MBB);
    Modified |= ReduceOldVCCRValueUses(MBB);
    Modified |= ConvertVPSEL(MBB);
  }

  LLVM_DEBUG(dbgs() << "**************************************\n");
  return Modified;
}

/// createMVETPAndVPTOptimisationsPass
FunctionPass *llvm::createMVETPAndVPTOptimisationsPass() {
  return new MVETPAndVPTOptimisations();
}