[ARM] Run ARMParallelDSP in the IRPasses phase

Run EarlyCSE immediately before ParallelDSP, and run both in the backend
IR opt phase (addIRPasses) rather than at pre-ISel.
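
For reference, a sketch of the relevant part of ARMPassConfig::addIRPasses
after this change (it just restates the hunk below; only the neighbouring
passes are shown):

  TargetPassConfig::addIRPasses();

  // Run the parallel DSP pass and its helpers, only at CodeGenOpt::Aggressive (-O3).
  if (getOptLevel() == CodeGenOpt::Aggressive) {
    addPass(createEarlyCSEPass());
    addPass(createARMParallelDSPPass());
  }

  // Match interleaved memory accesses to ldN/stN intrinsics.
  if (TM->getOptLevel() != CodeGenOpt::None)
    addPass(createInterleavedAccessPass());

As the new O3-pipeline.ll test shows, Early CSE and the "Transform loops to
use DSP intrinsics" pass now run before CodeGen Prepare instead of at
pre-ISel.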

Differential Revision: https://reviews.llvm.org/D59257


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356130 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 401843c..bd075cd 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -403,6 +403,12 @@
 
   TargetPassConfig::addIRPasses();
 
+  // Run the parallel DSP pass and its helpers.
+  if (getOptLevel() == CodeGenOpt::Aggressive) {
+    addPass(createEarlyCSEPass());
+    addPass(createARMParallelDSPPass());
+  }
+
   // Match interleaved memory accesses to ldN/stN intrinsics.
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createInterleavedAccessPass());
@@ -415,9 +421,6 @@
 }
 
 bool ARMPassConfig::addPreISel() {
-  if (getOptLevel() != CodeGenOpt::None)
-    addPass(createARMParallelDSPPass());
-
   if ((TM->getOptLevel() != CodeGenOpt::None &&
        EnableGlobalMerge == cl::BOU_UNSET) ||
       EnableGlobalMerge == cl::BOU_TRUE) {
diff --git a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
index 777eccb..730f305 100644
--- a/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
+++ b/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
@@ -103,12 +103,6 @@
   %35 = add i8 %33, 87
   %iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35
   store volatile i8 %iftmp.5.0.7, i8* %p8, align 1
-  ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
-  ; CHECK-NOT: [[REGISTER]],
-  ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
-  ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
-  ; CHECK-NOT: [[REGISTER]],
-  ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
   %36 = udiv i32 %2, 100000000
   %37 = urem i32 %36, 10
   %38 = icmp ult i32 %37, 10
diff --git a/test/CodeGen/ARM/O3-pipeline.ll b/test/CodeGen/ARM/O3-pipeline.ll
new file mode 100644
index 0000000..d9ec9c8
--- /dev/null
+++ b/test/CodeGen/ARM/O3-pipeline.ll
@@ -0,0 +1,150 @@
+; RUN: llc -mtriple=arm -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; CHECK:  ModulePass Manager
+; CHECK:    Pre-ISel Intrinsic Lowering
+; CHECK:    FunctionPass Manager
+; CHECK:      Expand Atomic instructions
+; CHECK:      Simplify the CFG
+; CHECK:      Dominator Tree Construction
+; CHECK:      Basic Alias Analysis (stateless AA impl)
+; CHECK:      Module Verifier
+; CHECK:      Natural Loop Information
+; CHECK:      Canonicalize natural loops
+; CHECK:      Scalar Evolution Analysis
+; CHECK:      Loop Pass Manager
+; CHECK:        Induction Variable Users
+; CHECK:        Loop Strength Reduction
+; CHECK:      Basic Alias Analysis (stateless AA impl)
+; CHECK:      Function Alias Analysis Results
+; CHECK:      Merge contiguous icmps into a memcmp
+; CHECK:      Expand memcmp() to load/stores
+; CHECK:      Lower Garbage Collection Instructions
+; CHECK:      Shadow Stack GC Lowering
+; CHECK:      Remove unreachable blocks from the CFG
+; CHECK:      Dominator Tree Construction
+; CHECK:      Natural Loop Information
+; CHECK:      Branch Probability Analysis
+; CHECK:      Block Frequency Analysis
+; CHECK:      Constant Hoisting
+; CHECK:      Partially inline calls to library functions
+; CHECK:      Instrument function entry/exit with calls to e.g. mcount() (post inlining)
+; CHECK:      Scalarize Masked Memory Intrinsics
+; CHECK:      Expand reduction intrinsics
+; CHECK:      Dominator Tree Construction
+; CHECK:      Early CSE
+; CHECK:      Natural Loop Information
+; CHECK:      Scalar Evolution Analysis
+; CHECK:      Basic Alias Analysis (stateless AA impl)
+; CHECK:      Function Alias Analysis Results
+; CHECK:      Loop Pass Manager
+; CHECK:        Transform loops to use DSP intrinsics
+; CHECK:      Interleaved Access Pass
+; CHECK:      ARM IR optimizations
+; CHECK:      Dominator Tree Construction
+; CHECK:      Natural Loop Information
+; CHECK:      CodeGen Prepare
+; CHECK:    Rewrite Symbols
+; CHECK:    FunctionPass Manager
+; CHECK:      Dominator Tree Construction
+; CHECK:      Exception handling preparation
+; CHECK:      Merge internal globals
+; CHECK:      Safe Stack instrumentation pass
+; CHECK:      Insert stack protectors
+; CHECK:      Module Verifier
+; CHECK:      Dominator Tree Construction
+; CHECK:      Basic Alias Analysis (stateless AA impl)
+; CHECK:      Function Alias Analysis Results
+; CHECK:      Natural Loop Information
+; CHECK:      Branch Probability Analysis
+; CHECK:      ARM Instruction Selection
+; CHECK:      Expand ISel Pseudo-instructions
+; CHECK:      Early Tail Duplication
+; CHECK:      Optimize machine instruction PHIs
+; CHECK:      Slot index numbering
+; CHECK:      Merge disjoint stack slots
+; CHECK:      Local Stack Slot Allocation
+; CHECK:      Remove dead machine instructions
+; CHECK:      MachineDominator Tree Construction
+; CHECK:      Machine Natural Loop Construction
+; CHECK:      Early Machine Loop Invariant Code Motion
+; CHECK:      Machine Common Subexpression Elimination
+; CHECK:      MachinePostDominator Tree Construction
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      Machine code sinking
+; CHECK:      Peephole Optimizations
+; CHECK:      Remove dead machine instructions
+; CHECK:      ARM MLA / MLS expansion pass
+; CHECK:      ARM pre- register allocation load / store optimization pass
+; CHECK:      ARM A15 S->D optimizer
+; CHECK:      Detect Dead Lanes
+; CHECK:      Process Implicit Definitions
+; CHECK:      Remove unreachable machine basic blocks
+; CHECK:      Live Variable Analysis
+; CHECK:      MachineDominator Tree Construction
+; CHECK:      Machine Natural Loop Construction
+; CHECK:      Eliminate PHI nodes for register allocation
+; CHECK:      Two-Address instruction pass
+; CHECK:      Slot index numbering
+; CHECK:      Live Interval Analysis
+; CHECK:      Simple Register Coalescing
+; CHECK:      Rename Disconnected Subregister Components
+; CHECK:      Machine Instruction Scheduler
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      Debug Variable Analysis
+; CHECK:      Live Stack Slot Analysis
+; CHECK:      Virtual Register Map
+; CHECK:      Live Register Matrix
+; CHECK:      Bundle Machine CFG Edges
+; CHECK:      Spill Code Placement Analysis
+; CHECK:      Lazy Machine Block Frequency Analysis
+; CHECK:      Machine Optimization Remark Emitter
+; CHECK:      Greedy Register Allocator
+; CHECK:      Virtual Register Rewriter
+; CHECK:      Stack Slot Coloring
+; CHECK:      Machine Copy Propagation Pass
+; CHECK:      Machine Loop Invariant Code Motion
+; CHECK:      PostRA Machine Sink
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      MachinePostDominator Tree Construction
+; CHECK:      Lazy Machine Block Frequency Analysis
+; CHECK:      Machine Optimization Remark Emitter
+; CHECK:      Shrink Wrapping analysis
+; CHECK:      Prologue/Epilogue Insertion & Frame Finalization
+; CHECK:      Control Flow Optimizer
+; CHECK:      Tail Duplication
+; CHECK:      Machine Copy Propagation Pass
+; CHECK:      Post-RA pseudo instruction expansion pass
+; CHECK:      ARM load / store optimization pass
+; CHECK:      ReachingDefAnalysis
+; CHECK:      ARM Execution Domain Fix
+; CHECK:      BreakFalseDeps
+; CHECK:      ARM pseudo instruction expansion pass
+; CHECK:      Thumb2 instruction size reduce pass
+; CHECK:      MachineDominator Tree Construction
+; CHECK:      Machine Natural Loop Construction
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      If Converter
+; CHECK:      Thumb IT blocks insertion pass
+; CHECK:      MachineDominator Tree Construction
+; CHECK:      Machine Natural Loop Construction
+; CHECK:      Post RA top-down list latency scheduler
+; CHECK:      Analyze Machine Code For Garbage Collection
+; CHECK:      Machine Block Frequency Analysis
+; CHECK:      MachinePostDominator Tree Construction
+; CHECK:      Branch Probability Basic Block Placement
+; CHECK:      Thumb2 instruction size reduce pass
+; CHECK:      Unpack machine instruction bundles
+; CHECK:      optimise barriers pass
+; CHECK:      ARM constant island placement and branch shortening pass
+; CHECK:      Contiguously Lay Out Funclets
+; CHECK:      StackMap Liveness Analysis
+; CHECK:      Live DEBUG_VALUE analysis
+; CHECK:      Insert fentry calls
+; CHECK:      Insert XRay ops
+; CHECK:      Implement the 'patchable-function' attribute
+; CHECK:      Lazy Machine Block Frequency Analysis
+; CHECK:      Machine Optimization Remark Emitter
+; CHECK:      ARM Assembly Printer
+; CHECK:      Free MachineFunction
diff --git a/test/CodeGen/ARM/loop-indexing.ll b/test/CodeGen/ARM/loop-indexing.ll
index d8a8c3d..00d5bdc 100644
--- a/test/CodeGen/ARM/loop-indexing.ll
+++ b/test/CodeGen/ARM/loop-indexing.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
-; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
+; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
+; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
 ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
-; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
+; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
 
 ; Tests to check that post increment addressing modes are used instead of
 ; updating base pointers with add instructions.
diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll
index 57387d5..4a4b880 100644
--- a/test/CodeGen/ARM/vldm-sched-a9.ll
+++ b/test/CodeGen/ARM/vldm-sched-a9.ll
@@ -5,67 +5,133 @@
 ; This test used to test vector spilling using vstmia/vldmia instructions, but
 ; the changes for PR:18825 prevent that spilling.
 
+; VST1 and VLD1 are now used for spilling/restoring.
+;
+; TODO:
+; I think more vldm instructions should be generated; the initial ones load
+; some elements and then a sequence of vldr instructions is used:
+; vldr  d15, [r1, #104]
+; vldr  d13, [r2, #96]
+; vldr  d9, [r1, #120]
+; vldr  d11, [r2, #112]
+; vldr  d14, [r1, #96]
+; vldr  d12, [r2, #88]
+; vldr  d8, [r1, #112]
+; vldr  d10, [r2, #104]
+
+; Also, this pattern repeats several times, which suggests that a vld1.64
+; should be used to load the data:
+; vldr  d16, [r1, #16]
+; vldr  d17, [r1, #24]
+; vst1.64 {d16, d17}, [lr:128]    @ 16-byte Spill
+
 ; CHECK: test:
-; CHECK: vstmia
-; CHECK: vldmia
-define void @test(i64* %src) #0 {
+; CHECK: vldmia r{{.*}}, {d{{.*}}, d{{.*}}}
+; CHECK: vldmia r{{.*}}, {d{{.*}}, d{{.*}}}
+define <16 x i64> @test(i64* %src0, i64* %src1) #0 {
 entry:
-  %arrayidx39 = getelementptr inbounds i64, i64* %src, i32 13
-  %vecinit285 = shufflevector <16 x i64> undef, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit285, <16 x i64>* undef, align 128
-  %0 = load i64, i64* undef, align 8
-  %vecinit379 = insertelement <16 x i64> undef, i64 %0, i32 9
-  %1 = load i64, i64* undef, align 8
-  %vecinit419 = insertelement <16 x i64> undef, i64 %1, i32 15
-  store <16 x i64> %vecinit419, <16 x i64>* undef, align 128
-  %vecinit579 = insertelement <16 x i64> undef, i64 0, i32 4
-  %vecinit582 = shufflevector <16 x i64> %vecinit579, <16 x i64> <i64 6, i64 7, i64 8, i64 9, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit584 = insertelement <16 x i64> %vecinit582, i64 undef, i32 9
-  %vecinit586 = insertelement <16 x i64> %vecinit584, i64 0, i32 10
-  %vecinit589 = shufflevector <16 x i64> %vecinit586, <16 x i64> <i64 12, i64 13, i64 14, i64 15, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 16, i32 17, i32 18, i32 19, i32 undef>
-  %2 = load i64, i64* undef, align 8
-  %vecinit591 = insertelement <16 x i64> %vecinit589, i64 %2, i32 15
-  store <16 x i64> %vecinit591, <16 x i64>* undef, align 128
-  %vecinit694 = shufflevector <16 x i64> undef, <16 x i64> <i64 13, i64 14, i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-  store <16 x i64> %vecinit694, <16 x i64>* undef, align 128
-  %3 = load i64, i64* undef, align 8
-  %vecinit1331 = insertelement <16 x i64> undef, i64 %3, i32 14
-  %4 = load i64, i64* undef, align 8
-  %vecinit1468 = insertelement <16 x i64> undef, i64 %4, i32 11
-  %vecinit1471 = shufflevector <16 x i64> %vecinit1468, <16 x i64> <i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 undef, i32 undef>
-  %vecinit1474 = shufflevector <16 x i64> %vecinit1471, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit1474, <16 x i64>* undef, align 128
-  %vecinit1552 = shufflevector <16 x i64> undef, <16 x i64> <i64 10, i64 11, i64 12, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit1555 = shufflevector <16 x i64> %vecinit1552, <16 x i64> <i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 undef, i32 undef>
-  %vecinit1558 = shufflevector <16 x i64> %vecinit1555, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit1558, <16 x i64>* undef, align 128
-  %vecinit1591 = shufflevector <16 x i64> undef, <16 x i64> <i64 3, i64 4, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit1594 = shufflevector <16 x i64> %vecinit1591, <16 x i64> <i64 5, i64 6, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit1597 = shufflevector <16 x i64> %vecinit1594, <16 x i64> <i64 7, i64 8, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit1599 = insertelement <16 x i64> %vecinit1597, i64 undef, i32 8
-  %vecinit1601 = insertelement <16 x i64> %vecinit1599, i64 undef, i32 9
-  %vecinit1603 = insertelement <16 x i64> %vecinit1601, i64 undef, i32 10
-  %5 = load i64, i64* undef, align 8
-  %vecinit1605 = insertelement <16 x i64> %vecinit1603, i64 %5, i32 11
-  %vecinit1608 = shufflevector <16 x i64> %vecinit1605, <16 x i64> <i64 13, i64 14, i64 15, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 undef>
-  %6 = load i64, i64* undef, align 8
-  %vecinit1610 = insertelement <16 x i64> %vecinit1608, i64 %6, i32 15
-  store <16 x i64> %vecinit1610, <16 x i64>* undef, align 128
-  %vecinit2226 = shufflevector <16 x i64> undef, <16 x i64> <i64 6, i64 7, i64 8, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %7 = load i64, i64* undef, align 8
-  %vecinit2228 = insertelement <16 x i64> %vecinit2226, i64 %7, i32 8
-  %vecinit2230 = insertelement <16 x i64> %vecinit2228, i64 undef, i32 9
-  %vecinit2233 = shufflevector <16 x i64> %vecinit2230, <16 x i64> <i64 11, i64 12, i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef>
-  %vecinit2236 = shufflevector <16 x i64> %vecinit2233, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit2236, <16 x i64>* undef, align 128
-  %vecinit2246 = shufflevector <16 x i64> undef, <16 x i64> <i64 4, i64 5, i64 6, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit2249 = shufflevector <16 x i64> %vecinit2246, <16 x i64> <i64 7, i64 8, i64 9, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit2252 = shufflevector <16 x i64> %vecinit2249, <16 x i64> <i64 10, i64 11, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-  %vecinit2255 = shufflevector <16 x i64> %vecinit2252, <16 x i64> <i64 12, i64 13, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 16, i32 17, i32 undef, i32 undef, i32 undef>
-  %8 = load i64, i64* %arrayidx39, align 8
-  %vecinit2257 = insertelement <16 x i64> %vecinit2255, i64 %8, i32 13
-  %vecinit2260 = shufflevector <16 x i64> %vecinit2257, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
-  store <16 x i64> %vecinit2260, <16 x i64>* null, align 128
-  ret void
+  %addr.0 = getelementptr inbounds i64, i64* %src0, i32 0
+  %el.0 = load i64, i64* %addr.0, align 8
+  %addr.1 = getelementptr inbounds i64, i64* %src0, i32 1
+  %el.1 = load i64, i64* %addr.1, align 8
+  %addr.2 = getelementptr inbounds i64, i64* %src0, i32 2
+  %el.2 = load i64, i64* %addr.2, align 8
+  %addr.3 = getelementptr inbounds i64, i64* %src0, i32 3
+  %el.3 = load i64, i64* %addr.3, align 8
+  %addr.4 = getelementptr inbounds i64, i64* %src0, i32 4
+  %el.4 = load i64, i64* %addr.4, align 8
+  %addr.5 = getelementptr inbounds i64, i64* %src0, i32 5
+  %el.5 = load i64, i64* %addr.5, align 8
+  %addr.6 = getelementptr inbounds i64, i64* %src0, i32 6
+  %el.6 = load i64, i64* %addr.6, align 8
+  %addr.7 = getelementptr inbounds i64, i64* %src0, i32 7
+  %el.7 = load i64, i64* %addr.7, align 8
+  %addr.8 = getelementptr inbounds i64, i64* %src0, i32 8
+  %el.8 = load i64, i64* %addr.8, align 8
+  %addr.9 = getelementptr inbounds i64, i64* %src0, i32 9
+  %el.9 = load i64, i64* %addr.9, align 8
+  %addr.10 = getelementptr inbounds i64, i64* %src0, i32 10
+  %el.10 = load i64, i64* %addr.10, align 8
+  %addr.11 = getelementptr inbounds i64, i64* %src0, i32 11
+  %el.11 = load i64, i64* %addr.11, align 8
+  %addr.12 = getelementptr inbounds i64, i64* %src0, i32 12
+  %el.12 = load i64, i64* %addr.12, align 8
+  %addr.13 = getelementptr inbounds i64, i64* %src0, i32 13
+  %el.13 = load i64, i64* %addr.13, align 8
+  %addr.14 = getelementptr inbounds i64, i64* %src0, i32 14
+  %el.14 = load i64, i64* %addr.14, align 8
+  %addr.15 = getelementptr inbounds i64, i64* %src0, i32 15
+  %el.15 = load i64, i64* %addr.15, align 8
+
+  %addr.0.1 = getelementptr inbounds i64, i64* %src1, i32 0
+  %el.0.1 = load i64, i64* %addr.0.1, align 8
+  %addr.1.1 = getelementptr inbounds i64, i64* %src1, i32 1
+  %el.1.1 = load i64, i64* %addr.1.1, align 8
+  %addr.2.1 = getelementptr inbounds i64, i64* %src1, i32 2
+  %el.2.1 = load i64, i64* %addr.2.1, align 8
+  %addr.3.1 = getelementptr inbounds i64, i64* %src1, i32 3
+  %el.3.1 = load i64, i64* %addr.3.1, align 8
+  %addr.4.1 = getelementptr inbounds i64, i64* %src1, i32 4
+  %el.4.1 = load i64, i64* %addr.4.1, align 8
+  %addr.5.1 = getelementptr inbounds i64, i64* %src1, i32 5
+  %el.5.1 = load i64, i64* %addr.5.1, align 8
+  %addr.6.1 = getelementptr inbounds i64, i64* %src1, i32 6
+  %el.6.1 = load i64, i64* %addr.6.1, align 8
+  %addr.7.1 = getelementptr inbounds i64, i64* %src1, i32 7
+  %el.7.1 = load i64, i64* %addr.7.1, align 8
+  %addr.8.1 = getelementptr inbounds i64, i64* %src1, i32 8
+  %el.8.1 = load i64, i64* %addr.8.1, align 8
+  %addr.9.1 = getelementptr inbounds i64, i64* %src1, i32 9
+  %el.9.1 = load i64, i64* %addr.9.1, align 8
+  %addr.10.1 = getelementptr inbounds i64, i64* %src1, i32 10
+  %el.10.1 = load i64, i64* %addr.10.1, align 8
+  %addr.11.1 = getelementptr inbounds i64, i64* %src1, i32 11
+  %el.11.1 = load i64, i64* %addr.11.1, align 8
+  %addr.12.1 = getelementptr inbounds i64, i64* %src1, i32 12
+  %el.12.1 = load i64, i64* %addr.12.1, align 8
+  %addr.13.1 = getelementptr inbounds i64, i64* %src1, i32 13
+  %el.13.1 = load i64, i64* %addr.13.1, align 8
+  %addr.14.1 = getelementptr inbounds i64, i64* %src1, i32 14
+  %el.14.1 = load i64, i64* %addr.14.1, align 8
+  %addr.15.1 = getelementptr inbounds i64, i64* %src1, i32 15
+  %el.15.1 = load i64, i64* %addr.15.1, align 8
+  %vec.0 = insertelement <16 x i64> undef, i64 %el.0, i32 0
+  %vec.1 = insertelement <16 x i64> %vec.0, i64 %el.1, i32 1
+  %vec.2 = insertelement <16 x i64> %vec.1, i64 %el.2, i32 2
+  %vec.3 = insertelement <16 x i64> %vec.2, i64 %el.3, i32 3
+  %vec.4 = insertelement <16 x i64> %vec.3, i64 %el.4, i32 4
+  %vec.5 = insertelement <16 x i64> %vec.4, i64 %el.5, i32 5
+  %vec.6 = insertelement <16 x i64> %vec.5, i64 %el.6, i32 6
+  %vec.7 = insertelement <16 x i64> %vec.6, i64 %el.7, i32 7
+  %vec.8 = insertelement <16 x i64> %vec.7, i64 %el.8, i32 8
+  %vec.9 = insertelement <16 x i64> %vec.8, i64 %el.9, i32 9
+  %vec.10 = insertelement <16 x i64> %vec.9, i64 %el.10, i32 10
+  %vec.11 = insertelement <16 x i64> %vec.10, i64 %el.11, i32 11
+  %vec.12 = insertelement <16 x i64> %vec.11, i64 %el.12, i32 12
+  %vec.13 = insertelement <16 x i64> %vec.12, i64 %el.13, i32 13
+  %vec.14 = insertelement <16 x i64> %vec.13, i64 %el.14, i32 14
+  %vec.15 = insertelement <16 x i64> %vec.14, i64 %el.15, i32 15
+  call void @capture(i64* %src0, i64* %src1)
+  %vec.0.1 = insertelement <16 x i64> undef, i64 %el.0.1, i32 0
+  %vec.1.1 = insertelement <16 x i64> %vec.0.1, i64 %el.1.1, i32 1
+  %vec.2.1 = insertelement <16 x i64> %vec.1.1, i64 %el.2.1, i32 2
+  %vec.3.1 = insertelement <16 x i64> %vec.2.1, i64 %el.3.1, i32 3
+  %vec.4.1 = insertelement <16 x i64> %vec.3.1, i64 %el.4.1, i32 4
+  %vec.5.1 = insertelement <16 x i64> %vec.4.1, i64 %el.5.1, i32 5
+  %vec.6.1 = insertelement <16 x i64> %vec.5.1, i64 %el.6.1, i32 6
+  %vec.7.1 = insertelement <16 x i64> %vec.6.1, i64 %el.7.1, i32 7
+  %vec.8.1 = insertelement <16 x i64> %vec.7.1, i64 %el.7.1, i32 8
+  %vec.9.1 = insertelement <16 x i64> %vec.8.1, i64 %el.8.1, i32 9
+  %vec.10.1 = insertelement <16 x i64> %vec.9.1, i64 %el.9.1, i32 10
+  %vec.11.1 = insertelement <16 x i64> %vec.10.1, i64 %el.10.1, i32 11
+  %vec.12.1 = insertelement <16 x i64> %vec.11.1, i64 %el.11.1, i32 12
+  %vec.13.1 = insertelement <16 x i64> %vec.12.1, i64 %el.12.1, i32 13
+  %vec.14.1 = insertelement <16 x i64> %vec.13.1, i64 %el.13.1, i32 14
+  %vec.15.1 = insertelement <16 x i64> %vec.14.1, i64 %el.14.1, i32 15
+  %res = add <16 x i64> %vec.15, %vec.15.1
+  ret <16 x i64> %res
 }
+
+declare void @capture(i64*, i64*)
+
 attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll b/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
index 8beb5b1..03292e9 100644
--- a/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
+++ b/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O3 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O2 | FileCheck %s
 ; rdar://7493908
 
 ; Make sure the result of the first dynamic_alloc isn't copied back to sp more
diff --git a/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll b/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
index 34569e9..e043773 100644
--- a/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
+++ b/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
@@ -32,13 +32,12 @@
 
 define fastcc i32 @parse_percent_token() nounwind {
 entry:
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
-; CHECK: pop
+; CHECK: bx lr
+; CHECK: bx lr
+; CHECK: bx lr
+; CHECK: bx lr
+; CHECK: bx lr
+; CHECK: bx lr
 ; Do not convert into single stream code. BranchProbability Analysis assumes
 ; that branches which goes to "ret" instruction have lower probabilities.
   switch i32 undef, label %bb7 [
diff --git a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
index 728f5dc..dcb27d5 100644
--- a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -302,7 +302,6 @@
 ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
 ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
 ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
-; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], {{r[0-9]}}
 ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
 ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
 ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]