[ARM] Transform memcpy into a tail-predicated loop

This patch converts the llvm.memcpy intrinsic into a tail-predicated
hardware loop for targets that support the Arm M-profile Vector
Extension (MVE).

From an implementation point of view, the patch:

- adds an ARM-specific SDAG node (to which the llvm.memcpy intrinsic is
  lowered during the first phase of ISel),
- adds a corresponding TableGen entry that generates a pseudo instruction,
  with a custom inserter, on matching the above node, and
- adds a custom inserter function that expands the pseudo instruction into
  MIR suitable to be transformed (by later passes) into a WLSTP loop, as
  sketched below.
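
As a rough sketch of the pipeline (the value names here are illustrative;
the exact forms appear in the tests added by this patch):

  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %dst, i8* align 4 %src, i32 %n, i1 false)
    -> ARMISD::MEMCPYLOOP dst, src, n        (SDAG node, first ISel phase)
    -> MVE_MEMCPYLOOPINST %dst, %src, %n     (pseudo instruction with custom inserter)
    -> WLSTP/LETP loop copying 16 bytes per iteration (formed by later MVE passes)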

Note: A CLI option (arm-memtransfer-tploop) controls the conversion of
memcpy to a TP loop. It is currently disabled by default and may be enabled
in the future after further downstream testing.
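
For reference, the new tests enable the option explicitly, e.g.:

  llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve %s -o -

and, for a 4-byte-aligned memcpy of a runtime size, check for a loop of the
following form (lifted from the new mve-tp-loop.ll test):

    wlstp.8  lr, r2, .LBB1_2
  .LBB1_1:
    vldrb.u8 q0, [r1], #16
    vstrb.8  q0, [r0], #16
    letp     lr, .LBB1_1
  .LBB1_2: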

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D99723
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9b4e1bc..9931d2c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1802,6 +1802,7 @@
     MAKE_CASE(ARMISD::CSINV)
     MAKE_CASE(ARMISD::CSNEG)
     MAKE_CASE(ARMISD::CSINC)
+    MAKE_CASE(ARMISD::MEMCPYLOOP)
 #undef MAKE_CASE
   }
   return nullptr;
@@ -11097,6 +11098,141 @@
   return true;
 }
 
+/// Adds logic in the loop entry MBB to calculate the loop iteration count and
+/// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
+static Register genTPEntry(MachineBasicBlock *TpEntry,
+                           MachineBasicBlock *TpLoopBody,
+                           MachineBasicBlock *TpExit, Register OpSizeReg,
+                           const TargetInstrInfo *TII, DebugLoc Dl,
+                           MachineRegisterInfo &MRI) {
+
+  // Calculates the loop iteration count: ceil(n/16) = ((n + 15) & -16) / 16.
+  Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+  BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
+      .addUse(OpSizeReg)
+      .addImm(15)
+      .add(predOps(ARMCC::AL))
+      .addReg(0);
+
+  Register BicDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+  BuildMI(TpEntry, Dl, TII->get(ARM::t2BICri), BicDestReg)
+      .addUse(AddDestReg, RegState::Kill)
+      .addImm(16)
+      .add(predOps(ARMCC::AL))
+      .addReg(0);
+
+  Register LsrDestReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
+  BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
+      .addUse(BicDestReg, RegState::Kill)
+      .addImm(4)
+      .add(predOps(ARMCC::AL))
+      .addReg(0);
+
+  Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
+  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
+      .addUse(LsrDestReg, RegState::Kill);
+
+  BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
+      .addUse(TotalIterationsReg)
+      .addMBB(TpExit);
+
+  return TotalIterationsReg;
+}
+
+/// Adds logic in the loop body MBB to generate MVE_VCTP8, t2LoopDec and
+/// t2LoopEnd. These are used by later passes to generate tail-predicated
+/// loops.
+static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
+                          MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
+                          const TargetInstrInfo *TII, DebugLoc Dl,
+                          MachineRegisterInfo &MRI, Register OpSrcReg,
+                          Register OpDestReg, Register ElementCountReg,
+                          Register TotalIterationsReg) {
+
+  // First insert 4 PHI nodes: the current position in the src array, the
+  // current position in the dest array, the loop iteration counter, and the
+  // predication counter.
+
+  // Current position in the src array
+  Register SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+  Register CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
+      .addUse(OpSrcReg)
+      .addMBB(TpEntry)
+      .addUse(CurrSrcReg)
+      .addMBB(TpLoopBody);
+
+  // Current position in the dest array
+  Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+  Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
+      .addUse(OpDestReg)
+      .addMBB(TpEntry)
+      .addUse(CurrDestReg)
+      .addMBB(TpLoopBody);
+
+  // Current loop counter
+  Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
+  Register RemainingLoopIterationsReg =
+      MRI.createVirtualRegister(&ARM::GPRlrRegClass);
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
+      .addUse(TotalIterationsReg)
+      .addMBB(TpEntry)
+      .addUse(RemainingLoopIterationsReg)
+      .addMBB(TpLoopBody);
+
+  // Predication counter
+  Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+  Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
+      .addUse(ElementCountReg)
+      .addMBB(TpEntry)
+      .addUse(RemainingElementsReg)
+      .addMBB(TpLoopBody);
+
+  // Pass predication counter to VCTP
+  Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
+      .addUse(PredCounterPhiReg)
+      .addImm(ARMVCC::None)
+      .addReg(0);
+
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
+      .addUse(PredCounterPhiReg)
+      .addImm(16)
+      .add(predOps(ARMCC::AL))
+      .addReg(0);
+
+  // VLDRB and VSTRB instructions, predicated using VPR
+  Register LoadedValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
+      .addDef(CurrSrcReg)
+      .addDef(LoadedValueReg)
+      .addReg(SrcPhiReg)
+      .addImm(16)
+      .addImm(ARMVCC::Then)
+      .addUse(VccrReg);
+
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
+      .addDef(CurrDestReg)
+      .addUse(LoadedValueReg, RegState::Kill)
+      .addReg(DestPhiReg)
+      .addImm(16)
+      .addImm(ARMVCC::Then)
+      .addUse(VccrReg);
+
+  // Add the pseudo instructions for decrementing the loop counter and marking
+  // the end: t2LoopDec and t2LoopEnd.
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
+      .addUse(LoopCounterPhiReg)
+      .addImm(1);
+
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
+      .addUse(RemainingLoopIterationsReg)
+      .addMBB(TpLoopBody);
+
+  BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
+      .addMBB(TpExit)
+      .add(predOps(ARMCC::AL));
+}
+
 MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
@@ -11123,6 +11259,91 @@
     return BB;
   }
 
+  case ARM::MVE_MEMCPYLOOPINST: {
+
+    // The transformation below expands the MVE_MEMCPYLOOPINST pseudo
+    // instruction into a Tail Predicated (TP) loop. It adds the instructions
+    // to calculate the iteration count (ceil(size_in_bytes / 16)) in the TP
+    // entry block and adds the relevant instructions in the TP loop body for
+    // generation of a WLSTP loop.
+
+    // Below is the relevant portion of the CFG after the transformation.
+    // The machine basic blocks are shown along with the branch conditions (in
+    // parentheses). Note that the TP entry/exit MBBs depict the entry/exit of
+    // this portion of the CFG and may not necessarily be the entry/exit of
+    // the function.
+
+    //             (Relevant) CFG after transformation:
+    //               TP entry MBB
+    //                   |
+    //          |-----------------|
+    //       (n <= 0)          (n > 0)
+    //          |                 |
+    //          |         TP loop Body MBB<--|
+    //          |                |           |
+    //           \               |___________|
+    //            \             /
+    //              TP exit MBB
+
+    MachineFunction *MF = BB->getParent();
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+
+    Register OpDestReg = MI.getOperand(0).getReg();
+    Register OpSrcReg = MI.getOperand(1).getReg();
+    Register OpSizeReg = MI.getOperand(2).getReg();
+
+    // Allocate the required MBBs and add to parent function.
+    MachineBasicBlock *TpEntry = BB;
+    MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
+    MachineBasicBlock *TpExit;
+
+    MF->push_back(TpLoopBody);
+
+    // If any instructions are present in the current block after
+    // MVE_MEMCPYLOOPINST, split the current block and move the instructions
+    // into the newly created exit block. If there are no instructions
+    // add an explicit branch to the FallThrough block and then split.
+    //
+    // The split is required for two reasons:
+    // 1) A terminator (t2WhileLoopStart) will be placed at that site.
+    // 2) Since a TPLoopBody will be added later, any phis in successive blocks
+    //    need to be updated. splitAt() already handles this.
+    TpExit = BB->splitAt(MI, false);
+    if (TpExit == BB) {
+      assert(BB->canFallThrough() &&
+             "Exit block must be FallThrough of the block containing memcpy");
+      TpExit = BB->getFallThrough();
+      BuildMI(BB, dl, TII->get(ARM::t2B))
+          .addMBB(TpExit)
+          .add(predOps(ARMCC::AL));
+      TpExit = BB->splitAt(MI, false);
+    }
+
+    // Add logic for iteration count
+    Register TotalIterationsReg =
+        genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
+
+    // Add the vectorized (and predicated) loads/store instructions
+    genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
+                  OpDestReg, OpSizeReg, TotalIterationsReg);
+
+    // Connect the blocks
+    TpEntry->addSuccessor(TpLoopBody);
+    TpLoopBody->addSuccessor(TpLoopBody);
+    TpLoopBody->addSuccessor(TpExit);
+
+    // Reorder for a more natural layout
+    TpLoopBody->moveAfter(TpEntry);
+    TpExit->moveAfter(TpLoopBody);
+
+    // Finally, remove the memcpy pseudo instruction.
+    MI.eraseFromParent();
+
+    // Return the exit block as it may contain other instructions requiring a
+    // custom inserter
+    return TpExit;
+  }
+
   // The Thumb2 pre-indexed stores have the same MI operands, they just
   // define them differently in the .td files from the isel patterns, so
   // they need pseudos.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 41bebee..b604fae 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -300,6 +300,10 @@
     // instructions.
     MEMCPY,
 
+    // Pseudo-instruction representing a memory copy using a tail predicated
+    // loop
+    MEMCPYLOOP,
+
     // V8.1MMainline condition select
     CSINV, // Conditional select invert.
     CSNEG, // Conditional select negate.
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 0356f42..bf9e52f 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -6865,6 +6865,18 @@
   let isTerminator = 1;
 }
 
+def SDT_MVEMEMCPYLOOPNODE
+    : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
+def MVE_MEMCPYLOOPNODE : SDNode<"ARMISD::MEMCPYLOOP", SDT_MVEMEMCPYLOOPNODE,
+                                [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
+  def MVE_MEMCPYLOOPINST : PseudoInst<(outs),
+        (ins rGPR:$dst, rGPR:$src, rGPR:$sz),
+        NoItinerary,
+        [(MVE_MEMCPYLOOPNODE rGPR:$dst, rGPR:$src, rGPR:$sz)]>;
+}
+
 def MVE_DLSTP_8  : MVE_DLSTP<"dlstp.8",  0b00>;
 def MVE_DLSTP_16 : MVE_DLSTP<"dlstp.16", 0b01>;
 def MVE_DLSTP_32 : MVE_DLSTP<"dlstp.32", 0b10>;
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 7e06229..a0c8285 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -11,12 +11,27 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMTargetMachine.h"
+#include "ARMTargetTransformInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "arm-selectiondag-info"
 
+cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
+    "arm-memtransfer-tploop", cl::Hidden,
+    cl::desc("Control conversion of memcpy to "
+             "Tail predicated loops (WLSTP)"),
+    cl::init(TPLoop::ForceDisabled),
+    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
+                          "Don't convert memcpy to TP loop."),
+               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
+                          "Always convert memcpy to TP loop."),
+               clEnumValN(TPLoop::Allow, "allow",
+                          "Allow (may be subject to certain conditions) "
+                          "conversion of memcpy to TP loop.")));
+
 // Emit, if possible, a specialized version of the given Libcall. Typically this
 // means selecting the appropriately aligned version, but we also convert memset
 // of 0 into memclr.
@@ -130,13 +145,40 @@
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
   const ARMSubtarget &Subtarget =
       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+  auto GenInlineTP = [&](const ARMSubtarget &Subtarget,
+                         const SelectionDAG &DAG) {
+    auto &F = DAG.getMachineFunction().getFunction();
+    if (!EnableMemtransferTPLoop)
+      return false;
+    if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
+      return true;
+    // Do not generate an inline TP loop if optimizations are disabled,
+    // or if optimizing for size (-Os or -Oz).
+    if (F.hasOptNone() || F.hasOptSize())
+      return false;
+    // At this point the option is TPLoop::Allow; apply the heuristics below.
+    if (!ConstantSize && Alignment >= Align(4))
+      return true;
+    if (ConstantSize &&
+        ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
+        ConstantSize->getZExtValue() <
+            Subtarget.getMaxTPLoopInlineSizeThreshold())
+      return true;
+    return false;
+  };
+
+  if (Subtarget.hasMVEIntegerOps() && GenInlineTP(Subtarget, DAG))
+    return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
+                       DAG.getZExtOrTrunc(Size, dl, MVT::i32));
+
   // Do repeated 4-byte loads and stores. To be improved.
   // This requires 4-byte alignment.
   if (Alignment < Align(4))
     return SDValue();
   // This requires the copy size to be a constant, preferably
   // within a subtarget-specific limit.
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   if (!ConstantSize)
     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
                                   Alignment.value(), RTLIB::MEMCPY);
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 44ef859..91c7b7c 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -538,6 +538,11 @@
     return 64;
   }
 
+  /// getMaxTPLoopInlineSizeThreshold - Returns the maximum memcpy size
+  /// that still makes it profitable to inline the call as a Tail
+  /// Predicated loop.
+  unsigned getMaxTPLoopInlineSizeThreshold() const { return 128; }
+
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 6ad6a16..c44689d 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -48,6 +48,11 @@
   };
 }
 
+// For controlling conversion of memcpy into Tail Predicated loop.
+namespace TPLoop {
+enum MemTransfer { ForceDisabled = 0, ForceEnabled, Allow };
+}
+
 class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
   using BaseT = BasicTTIImplBase<ARMTTIImpl>;
   using TTI = TargetTransformInfo;
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
index 8a4665a..a489a02 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -1,34 +1,39 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
+; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
 
 define void @test_memcpy(i32* nocapture %x, i32* nocapture readonly %y, i32 %n, i32 %m) {
 ; CHECK-LABEL: test_memcpy:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    cmp r2, #1
-; CHECK-NEXT:    blt .LBB0_3
+; CHECK-NEXT:    blt .LBB0_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    mov r8, r3
-; CHECK-NEXT:    mov r5, r2
-; CHECK-NEXT:    mov r9, r1
-; CHECK-NEXT:    mov r7, r0
-; CHECK-NEXT:    lsls r4, r3, #2
-; CHECK-NEXT:    movs r6, #0
+; CHECK-NEXT:    lsl.w r12, r3, #2
+; CHECK-NEXT:    movs r7, #0
+; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_2: @ %for.body
-; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    adds r0, r7, r6
-; CHECK-NEXT:    add.w r1, r9, r6
-; CHECK-NEXT:    mov r2, r8
-; CHECK-NEXT:    bl __aeabi_memcpy4
-; CHECK-NEXT:    add r6, r4
-; CHECK-NEXT:    subs r5, #1
-; CHECK-NEXT:    bne .LBB0_2
-; CHECK-NEXT:  .LBB0_3: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
+; CHECK-NEXT:    adds r4, r1, r7
+; CHECK-NEXT:    adds r5, r0, r7
+; CHECK-NEXT:    mov r6, r3
+; CHECK-NEXT:    wlstp.8 lr, r6, .LBB0_3
+; CHECK-NEXT:    b .LBB0_4
+; CHECK-NEXT:  .LBB0_3: @ %for.body
+; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    add r7, r12
+; CHECK-NEXT:    subs r2, #1
+; CHECK-NEXT:    beq .LBB0_5
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:  .LBB0_4: @ Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vldrb.u8 q0, [r4], #16
+; CHECK-NEXT:    vstrb.8 q0, [r5], #16
+; CHECK-NEXT:    letp lr, .LBB0_4
+; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
 entry:
   %cmp8 = icmp sgt i32 %n, 0
   br i1 %cmp8, label %for.body, label %for.cond.cleanup
diff --git a/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll b/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
new file mode 100644
index 0000000..a87fff9
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-tp-loop.ll
@@ -0,0 +1,285 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --arm-memtransfer-tploop=allow -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs %s -o - | FileCheck %s
+
+; Check that WLSTP loop is not generated for alignment < 4
+; void test1(char* dest, char* src, int n){
+;    memcpy(dest, src, n);
+; }
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg) #1
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
+
+define void @test1(i8* noalias nocapture %X, i8* noalias nocapture readonly %Y, i32 %n){
+; CHECK-LABEL: test1:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    bl __aeabi_memcpy
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %X, i8* align 1 %Y, i32 %n, i1 false)
+  ret void
+}
+
+
+; Check that WLSTP loop is generated for alignment >= 4
+; void test2(int* restrict X, int* restrict Y, int n){
+;     memcpy(X, Y, n);
+; }
+
+define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n){
+; CHECK-LABEL: test2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB1_2
+; CHECK-NEXT:  .LBB1_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB1_1
+; CHECK-NEXT:  .LBB1_2: @ %entry
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %0 = bitcast i32* %X to i8*
+  %1 = bitcast i32* %Y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
+  ret void
+}
+
+
+; Checks that the transform handles some arithmetic on the input arguments.
+; void test3(int* restrict X, int* restrict Y, int n)
+; {
+;     memcpy(X+2, Y+3, (n*2)+10);
+; }
+
+define void @test3(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
+; CHECK-LABEL: test3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r3, #10
+; CHECK-NEXT:    add.w r2, r3, r2, lsl #1
+; CHECK-NEXT:    adds r1, #12
+; CHECK-NEXT:    adds r0, #8
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB2_2
+; CHECK-NEXT:  .LBB2_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB2_1
+; CHECK-NEXT:  .LBB2_2: @ %entry
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %add.ptr = getelementptr inbounds i32, i32* %X, i32 2
+  %0 = bitcast i32* %add.ptr to i8*
+  %add.ptr1 = getelementptr inbounds i32, i32* %Y, i32 3
+  %1 = bitcast i32* %add.ptr1 to i8*
+  %mul = shl nsw i32 %n, 1
+  %add = add nsw i32 %mul, 10
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* nonnull align 4 %0, i8* nonnull align 4 %1, i32 %add, i1 false)
+  ret void
+}
+
+
+; Checks that the transform handles for-loops that are implicitly converted to memcpy
+; void test4(int* restrict X, int* restrict Y, int n){
+;     for(int i = 0; i < n; ++i){
+;         X[i] = Y[i];
+;     }
+; }
+
+define void @test4(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
+; CHECK-LABEL: test4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    bxlt lr
+; CHECK-NEXT:  .LBB3_1: @ %for.body.preheader
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB3_3
+; CHECK-NEXT:  .LBB3_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB3_2
+; CHECK-NEXT:  .LBB3_3: @ %for.body.preheader
+; CHECK-NEXT:    pop.w {r7, lr}
+; CHECK-NEXT:    bx lr
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %X.bits = bitcast i32* %X to i8*
+  %Y.bits = bitcast i32* %Y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body.preheader, %entry
+  ret void
+}
+
+; Checks that the transform can handle size inputs wider than i32 (i64 here)
+define void @test5(i8* noalias %X, i8* noalias %Y, i64 %n){
+; CHECK-LABEL: test5:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB4_2
+; CHECK-NEXT:  .LBB4_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB4_1
+; CHECK-NEXT:  .LBB4_2:
+; CHECK-NEXT:    pop {r7, pc}
+    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %X, i8* align 4 %Y, i64 %n, i1 false)
+    ret void
+}
+
+; Checks the transform is applied for constant size inputs below a certain threshold (128 in this case)
+define void @test6(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
+; CHECK-LABEL: test6:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #127
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB5_2
+; CHECK-NEXT:  .LBB5_1: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB5_1
+; CHECK-NEXT:  .LBB5_2: @ %entry
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %0 = bitcast i32* %X to i8*
+  %1 = bitcast i32* %Y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* noundef nonnull align 4 dereferenceable(127) %0, i8* noundef nonnull align 4 dereferenceable(127) %1, i32 127, i1 false)
+  ret void
+}
+
+; Checks the transform is NOT applied for constant size inputs at or above a certain threshold (128 in this case)
+define void @test7(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
+; CHECK-LABEL: test7:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    movs r2, #128
+; CHECK-NEXT:    bl __aeabi_memcpy4
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %0 = bitcast i32* %X to i8*
+  %1 = bitcast i32* %Y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 128, i1 false)
+  ret void
+}
+
+; Checks the transform is NOT applied for constant size inputs below a certain threshold (64 in this case)
+define void @test8(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) {
+; CHECK-LABEL: test8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ldm.w r1!, {r2, r3, r4, r12, lr}
+; CHECK-NEXT:    stm.w r0!, {r2, r3, r4, r12, lr}
+; CHECK-NEXT:    ldm.w r1!, {r2, r3, r4, r12, lr}
+; CHECK-NEXT:    stm.w r0!, {r2, r3, r4, r12, lr}
+; CHECK-NEXT:    ldm.w r1, {r2, r3, r4, r12, lr}
+; CHECK-NEXT:    stm.w r0, {r2, r3, r4, r12, lr}
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %0 = bitcast i32* %X to i8*
+  %1 = bitcast i32* %Y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 60, i1 false)
+  ret void
+}
+
+; Checks the transform is NOT applied (regardless of alignment) when optimizations are disabled
+define void @test9(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) #0 {
+; CHECK-LABEL: test9:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    bl __aeabi_memcpy4
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %0 = bitcast i32* %X to i8*
+  %1 = bitcast i32* %Y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
+  ret void
+}
+
+; Checks the transform is NOT applied (regardless of alignment) when optimization for size is on (-Os or -Oz)
+define void @test10(i32* noalias nocapture %X, i32* noalias nocapture readonly %Y, i32 %n) #1 {
+; CHECK-LABEL: test10:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    bl __aeabi_memcpy4
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %0 = bitcast i32* %X to i8*
+  %1 = bitcast i32* %Y to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
+  ret void
+}
+
+define void @test11(i8* nocapture %x, i8* nocapture %y, i32 %n) {
+; CHECK-LABEL: test11:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    cmp.w r2, #-1
+; CHECK-NEXT:    it gt
+; CHECK-NEXT:    popgt {r4, pc}
+; CHECK-NEXT:  .LBB10_1: @ %prehead
+; CHECK-NEXT:    add.w r3, r2, #15
+; CHECK-NEXT:    mov r12, r1
+; CHECK-NEXT:    bic r3, r3, #16
+; CHECK-NEXT:    mov r4, r0
+; CHECK-NEXT:    lsr.w lr, r3, #4
+; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    subs.w lr, lr, #0
+; CHECK-NEXT:    beq .LBB10_3
+; CHECK-NEXT:  .LBB10_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.8 r3
+; CHECK-NEXT:    subs r3, #16
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vldrbt.u8 q0, [r12], #16
+; CHECK-NEXT:    vstrbt.8 q0, [r4], #16
+; CHECK-NEXT:    subs.w lr, lr, #1
+; CHECK-NEXT:    bne .LBB10_2
+; CHECK-NEXT:    b .LBB10_3
+; CHECK-NEXT:  .LBB10_3: @ %for.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrb r3, [r0], #1
+; CHECK-NEXT:    subs r2, #2
+; CHECK-NEXT:    strb r3, [r1], #1
+; CHECK-NEXT:    bne .LBB10_3
+; CHECK-NEXT:  @ %bb.4: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r4, pc}
+entry:
+  %cmp6 = icmp slt i32 %n, 0
+  br i1 %cmp6, label %prehead, label %for.cond.cleanup
+
+prehead:                                          ; preds = %entry
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %x, i8* align 4 %y, i32 %n, i1 false)
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %prehead
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %prehead ]
+  %x.addr.08 = phi i8* [ %add.ptr, %for.body ], [ %x, %prehead ]
+  %y.addr.07 = phi i8* [ %add.ptr1, %for.body ], [ %y, %prehead ]
+  %add.ptr = getelementptr inbounds i8, i8* %x.addr.08, i32 1
+  %add.ptr1 = getelementptr inbounds i8, i8* %y.addr.07, i32 1
+  %l = load i8, i8* %x.addr.08, align 1
+  store i8 %l, i8* %y.addr.07, align 1
+  %inc = add nuw nsw i32 %i.09, 2
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %entry
+  ret void
+}
+
+attributes #0 = { noinline  optnone }
+attributes #1 = { optsize }
diff --git a/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir b/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir
new file mode 100644
index 0000000..4dfba16
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-tp-loop.mir
@@ -0,0 +1,127 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve -simplify-mir -run-pass=finalize-isel %s -o - | FileCheck %s
+--- |
+  target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "arm-arm-none-eabi"
+
+  ; Function Attrs: argmemonly nofree nosync nounwind willreturn
+  declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
+
+  define void @test1(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
+  entry:
+    %0 = bitcast i32* %X to i8*
+    %1 = bitcast i32* %Y to i8*
+    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 %1, i32 %n, i1 false)
+    ret void
+  }
+
+  define void @test2(i32* noalias %X, i32* noalias readonly %Y, i32 %n) {
+  entry:
+    %cmp6 = icmp sgt i32 %n, 0
+    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:                               ; preds = %entry
+    %X.bits = bitcast i32* %X to i8*
+    %Y.bits = bitcast i32* %Y to i8*
+    call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %X.bits, i8* align 4 %Y.bits, i32 %n, i1 false)
+    br label %for.cond.cleanup
+
+  for.cond.cleanup:                                 ; preds = %for.body.preheader, %entry
+    ret void
+  }
+
+...
+---
+name:            test1
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $r0, $r1, $r2
+
+    ; CHECK-LABEL: name: test1
+    ; CHECK: liveins: $r0, $r1, $r2
+    ; CHECK: [[COPY:%[0-9]+]]:rgpr = COPY $r2
+    ; CHECK: [[COPY1:%[0-9]+]]:rgpr = COPY $r1
+    ; CHECK: [[COPY2:%[0-9]+]]:rgpr = COPY $r0
+    ; CHECK: [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK: [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK: [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK: [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
+    ; CHECK: t2WhileLoopStart [[t2WhileLoopSetup]], %bb.2, implicit-def $cpsr
+    ; CHECK: .1:
+    ; CHECK: [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.0, %8, %bb.1
+    ; CHECK: [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.0, %10, %bb.1
+    ; CHECK: [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.0, %12, %bb.1
+    ; CHECK: [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.0, %14, %bb.1
+    ; CHECK: [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
+    ; CHECK: [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
+    ; CHECK: [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
+    ; CHECK: [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
+    ; CHECK: [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
+    ; CHECK: t2LoopEnd [[t2LoopDec]], %bb.1, implicit-def $cpsr
+    ; CHECK: t2B %bb.2, 14 /* CC::al */, $noreg
+    ; CHECK: .2.entry:
+    ; CHECK: tBX_RET 14 /* CC::al */, $noreg
+    %2:rgpr = COPY $r2
+    %1:rgpr = COPY $r1
+    %0:rgpr = COPY $r0
+    MVE_MEMCPYLOOPINST %0, %1, %2
+    tBX_RET 14 /* CC::al */, $noreg
+
+...
+---
+name:            test2
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: test2
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x50000000), %bb.2(0x30000000)
+  ; CHECK:   liveins: $r0, $r1, $r2
+  ; CHECK:   [[COPY:%[0-9]+]]:rgpr = COPY $r2
+  ; CHECK:   [[COPY1:%[0-9]+]]:rgpr = COPY $r1
+  ; CHECK:   [[COPY2:%[0-9]+]]:rgpr = COPY $r0
+  ; CHECK:   t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK:   t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
+  ; CHECK:   t2B %bb.1, 14 /* CC::al */, $noreg
+  ; CHECK: bb.1.for.body.preheader:
+  ; CHECK:   [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY]], 15, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2BICri:%[0-9]+]]:rgpr = t2BICri killed [[t2ADDri]], 16, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2LSRri:%[0-9]+]]:gprlr = t2LSRri killed [[t2BICri]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[t2WhileLoopSetup:%[0-9]+]]:gprlr = t2WhileLoopSetup killed [[t2LSRri]]
+  ; CHECK:   t2WhileLoopStart [[t2WhileLoopSetup]], %bb.4, implicit-def $cpsr
+  ; CHECK: bb.3:
+  ; CHECK:   [[PHI:%[0-9]+]]:rgpr = PHI [[COPY1]], %bb.1, %8, %bb.3
+  ; CHECK:   [[PHI1:%[0-9]+]]:rgpr = PHI [[COPY2]], %bb.1, %10, %bb.3
+  ; CHECK:   [[PHI2:%[0-9]+]]:gprlr = PHI [[t2WhileLoopSetup]], %bb.1, %12, %bb.3
+  ; CHECK:   [[PHI3:%[0-9]+]]:rgpr = PHI [[COPY]], %bb.1, %14, %bb.3
+  ; CHECK:   [[MVE_VCTP8_:%[0-9]+]]:vccr = MVE_VCTP8 [[PHI3]], 0, $noreg
+  ; CHECK:   [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[PHI3]], 16, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   [[MVE_VLDRBU8_post:%[0-9]+]]:rgpr, [[MVE_VLDRBU8_post1:%[0-9]+]]:mqpr = MVE_VLDRBU8_post [[PHI]], 16, 1, [[MVE_VCTP8_]]
+  ; CHECK:   [[MVE_VSTRBU8_post:%[0-9]+]]:rgpr = MVE_VSTRBU8_post killed [[MVE_VLDRBU8_post1]], [[PHI1]], 16, 1, [[MVE_VCTP8_]]
+  ; CHECK:   [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[PHI2]], 1
+  ; CHECK:   t2LoopEnd [[t2LoopDec]], %bb.3, implicit-def $cpsr
+  ; CHECK:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK: bb.4.for.body.preheader:
+  ; CHECK:   t2B %bb.2, 14 /* CC::al */, $noreg
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   tBX_RET 14 /* CC::al */, $noreg
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.2(0x30000000)
+    liveins: $r0, $r1, $r2
+
+    %2:rgpr = COPY $r2
+    %1:rgpr = COPY $r1
+    %0:rgpr = COPY $r0
+    t2CMPri %2, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.2, 11 /* CC::lt */, $cpsr
+    t2B %bb.1, 14 /* CC::al */, $noreg
+
+  bb.1.for.body.preheader:
+    successors: %bb.2(0x80000000)
+
+    MVE_MEMCPYLOOPINST %0, %1, %2
+
+  bb.2.for.cond.cleanup:
+    tBX_RET 14 /* CC::al */, $noreg
+
+...