; blob: 21be567fdf64a8683740db4f7e32dba79a48a794 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -passes=loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
; Data layout with 32-bit pointers (p:32:32:32); the target triple itself is
; supplied by -mtriple on the RUN line.
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
; Four 512-element float arrays; @_Z4testmm indexes each of them directly with
; its loop induction variable.
@kernel = global [512 x float] zeroinitializer, align 4
@kernel2 = global [512 x float] zeroinitializer, align 4
@kernel3 = global [512 x float] zeroinitializer, align 4
@kernel4 = global [512 x float] zeroinitializer, align 4
; 1536-element (3 x 512) float array; @_Z4testmm reads it at indices
; 3 * (iv + offset) + {0, 1, 2}.
@src_data = global [1536 x float] zeroinitializer, align 4
; We don't want to vectorize most loops containing gathers because they are
; expensive.
; Make sure the loop below, which reads @src_data with a stride of 3, is not
; vectorized.
; Computes three float reductions in one loop and returns their sum. On each
; iteration the loop loads kernel[iv], kernel2[iv], kernel3[iv] and kernel4[iv]
; (consecutive accesses) and src_data[3 * (iv + offset) + k] for k = 0, 1, 2
; (stride-3 accesses), and adds the product of the five values to reduction k.
; The assertions below expect the scalar loop to come out of the pass unchanged.
define float @_Z4testmm(i64 %size, i64 %offset) {
; CHECK-LABEL: define float @_Z4testmm(
; CHECK-SAME: i64 [[SIZE:%.*]], i64 [[OFFSET:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[RDX_0:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[RDX_2:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV]], [[OFFSET]]
; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3
; CHECK-NEXT: [[GEP_SRC_DATA:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL]]
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP_SRC_DATA]], align 4
; CHECK-NEXT: [[GEP_KERNEL:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[IV]]
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[GEP_KERNEL]], align 4
; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[GEP_KERNEL2:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[IV]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[GEP_KERNEL2]], align 4
; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]]
; CHECK-NEXT: [[GEP_KERNEL3:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[IV]]
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[GEP_KERNEL3]], align 4
; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]]
; CHECK-NEXT: [[GEP_KERNEL4:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[IV]]
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[GEP_KERNEL4]], align 4
; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]]
; CHECK-NEXT: [[RDX_0_NEXT]] = fadd fast float [[RDX_0]], [[MUL9]]
; CHECK-NEXT: [[GEP_SRC_DATA_SUM:%.*]] = add i64 [[MUL]], 1
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM]]
; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]]
; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]]
; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]]
; CHECK-NEXT: [[RDX_1_NEXT]] = fadd fast float [[RDX_1]], [[MUL19]]
; CHECK-NEXT: [[GEP_SRC_DATA_SUM52:%.*]] = add i64 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM52]]
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX21]], align 4
; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]]
; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]]
; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]]
; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]]
; CHECK-NEXT: [[RDX_2_NEXT]] = fadd fast float [[RDX_2]], [[MUL29]]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]]
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ]
; CHECK-NEXT: [[RES_0:%.*]] = fadd float [[RDX_0_NEXT_LCSSA]], [[RDX_1_NEXT_LCSSA]]
; CHECK-NEXT: [[RES_1:%.*]] = fadd float [[RES_0]], [[RDX_2_NEXT_LCSSA]]
; CHECK-NEXT: ret float [[RES_1]]
;
entry:
  br label %loop

loop:
  ; Induction variable and the three running sums, all starting at zero.
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %rdx.0 = phi float [ 0.000000e+00, %entry ], [ %rdx.0.next, %loop ]
  %rdx.1 = phi float [ 0.000000e+00, %entry ], [ %rdx.1.next, %loop ]
  %rdx.2 = phi float [ 0.000000e+00, %entry ], [ %rdx.2.next, %loop ]

  ; %mul = 3 * (iv + offset): base index into @src_data, advancing by three
  ; elements per iteration.
  %add = add i64 %iv, %offset
  %mul = mul i64 %add, 3

  ; Reduction 0: src_data[%mul] times the four kernel values. The kernel loads
  ; (%1 .. %4) are reused by the other two reductions.
  %gep.src_data = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %mul
  %0 = load float, ptr %gep.src_data, align 4
  %gep.kernel = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 %iv
  %1 = load float, ptr %gep.kernel, align 4
  %mul3 = fmul fast float %0, %1
  %gep.kernel2 = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 %iv
  %2 = load float, ptr %gep.kernel2, align 4
  %mul5 = fmul fast float %mul3, %2
  %gep.kernel3 = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 %iv
  %3 = load float, ptr %gep.kernel3, align 4
  %mul7 = fmul fast float %mul5, %3
  %gep.kernel4 = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 %iv
  %4 = load float, ptr %gep.kernel4, align 4
  %mul9 = fmul fast float %mul7, %4
  %rdx.0.next = fadd fast float %rdx.0, %mul9

  ; Reduction 1: src_data[%mul + 1] times the same four kernel values.
  %gep.src_data.sum = add i64 %mul, 1
  %arrayidx11 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum
  %5 = load float, ptr %arrayidx11, align 4
  %mul13 = fmul fast float %1, %5
  %mul15 = fmul fast float %2, %mul13
  %mul17 = fmul fast float %3, %mul15
  %mul19 = fmul fast float %4, %mul17
  %rdx.1.next = fadd fast float %rdx.1, %mul19

  ; Reduction 2: src_data[%mul + 2] times the same four kernel values.
  %gep.src_data.sum52 = add i64 %mul, 2
  %arrayidx21 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum52
  %6 = load float, ptr %arrayidx21, align 4
  %mul23 = fmul fast float %1, %6
  %mul25 = fmul fast float %2, %mul23
  %mul27 = fmul fast float %3, %mul25
  %mul29 = fmul fast float %4, %mul27
  %rdx.2.next = fadd fast float %rdx.2, %mul29

  ; Keep looping until the incremented induction variable equals %size.
  %iv.next = add i64 %iv, 1
  %exitcond = icmp ne i64 %iv.next, %size
  br i1 %exitcond, label %loop, label %exit

exit:
  ; Combine the three partial sums (these adds carry no fast-math flags).
  %res.0 = fadd float %rdx.0.next, %rdx.1.next
  %res.1 = fadd float %res.0, %rdx.2.next
  ret float %res.1
}