| ; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s |
| |
| ; Test cases for extending the vectorization factor, if small memory operations |
| ; are not profitable. |
| |
| ; Test with a loop that contains memory accesses of i8 and i32 types. The |
| ; default maximum VF for NEON is 4, but vectorizing 4 x i8 is not |
| ; profitable. But we can extend to VF to 8 or 16, at which point the |
| ; i8 memory accesses become profitable. |
| define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) { |
| ; CHECK-LABEL: @test_load_i8_store_i32( |
| ; CHECK-NOT: x i8> |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] |
| %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv |
| %lv = load i8, i8* %gep.src, align 1 |
| %lv.ext = zext i8 %lv to i32 |
| %add = add i32 %lv.ext, %off |
| %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv |
| store i32 %add, i32* %gep.dst |
| %iv.next = add nuw nsw i64 %iv, 1 |
| %exitcond.not = icmp eq i64 %iv.next, %N |
| br i1 %exitcond.not, label %exit, label %loop |
| |
| exit: |
| ret void |
| } |
| |
| ; Same as test_load_i8_store_i32, but with types flipped for load and store. |
| define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) { |
| ; CHECK-LABEL: @test_load_i32_store_i8( |
| ; CHECK: <4 x i8> |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] |
| %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv |
| %lv = load i32, i32* %gep.src, align 1 |
| %add = add i32 %lv, %off |
| %add.trunc = trunc i32 %add to i8 |
| %gep.dst = getelementptr inbounds i8, i8* %dst, i64 %iv |
| store i8 %add.trunc, i8* %gep.dst |
| %iv.next = add nuw nsw i64 %iv, 1 |
| %exitcond.not = icmp eq i64 %iv.next, %N |
| br i1 %exitcond.not, label %exit, label %loop |
| |
| exit: |
| ret void |
| } |
| |
| ; All memory operations use i32, all memory operations are profitable with VF 4. |
| define void @test_load_i32_store_i32(i32* noalias %src, i32* noalias %dst, i8 %off, i64 %N) { |
| ; CHECK-LABEL: @test_load_i32_store_i32( |
| ; CHECK: vector.body: |
| ; CHECK: <4 x i32> |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] |
| %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv |
| %lv = load i32, i32* %gep.src, align 1 |
| %lv.trunc = trunc i32 %lv to i8 |
| %add = add i8 %lv.trunc, %off |
| %add.ext = zext i8 %add to i32 |
| %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv |
| store i32 %add.ext, i32* %gep.dst |
| %iv.next = add nuw nsw i64 %iv, 1 |
| %exitcond.not = icmp eq i64 %iv.next, %N |
| br i1 %exitcond.not, label %exit, label %loop |
| |
| exit: |
| ret void |
| } |
| |
| ; Test with loop body that requires a large number of vector registers if the |
| ; vectorization factor is large. Make sure the register estimates limit the |
| ; vectorization factor. |
| define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) { |
| ; CHECK-LABEL: @test_load_i8_store_i64_large |
| ; CHECK: <2 x i64> |
| ; |
| entry: |
| br label %loop |
| |
| loop: |
| %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] |
| %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv |
| %gep.dst.3 = getelementptr inbounds i64, i64* %dst.3, i64 %iv |
| %lv.dst.3 = load i64, i64* %gep.dst.3, align 1 |
| %gep.dst.5 = getelementptr inbounds i64, i64* %dst.5, i64 %iv |
| %lv.dst.5 = load i64, i64* %gep.dst.3, align 1 |
| |
| %lv = load i8, i8* %gep.src, align 1 |
| %lv.ext = zext i8 %lv to i64 |
| %add = add i64 %lv.ext, %off |
| %add.2 = add i64 %add, %off.2 |
| %gep.dst = getelementptr inbounds i64, i64* %dst, i64 %iv |
| %gep.dst.2 = getelementptr inbounds i64, i64* %dst.2, i64 %iv |
| |
| %add.3 = add i64 %add.2, %lv.dst.3 |
| %add.4 = add i64 %add.3, %add |
| %gep.dst.4 = getelementptr inbounds i64, i64* %dst.4, i64 %iv |
| %add.5 = add i64 %add.2, %lv.dst.5 |
| store i64 %add.2, i64* %gep.dst.2 |
| store i64 %add, i64* %gep.dst |
| store i64 %add.3, i64* %gep.dst.3 |
| store i64 %add.4, i64* %gep.dst.4 |
| store i64 %add.5, i64* %gep.dst.5 |
| |
| %iv.next = add nuw nsw i64 %iv, 1 |
| %exitcond.not = icmp eq i64 %iv.next, %N |
| br i1 %exitcond.not, label %exit, label %loop |
| |
| exit: |
| ret void |
| } |