blob: 57b4703073e81bd72bf1782e239af3ddc1bb2335 [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt < %s -passes=loop-vectorize -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a320 2>&1 | FileCheck %s --check-prefix=A320
; The loop below is a small counted loop with a scalar reduction.
; When AggressiveInterleaving is enabled for this subtarget, LoopVectorize
; should choose an interleave count > 1 (VF == 1), which manifests as
; multiple loads / multiplies / adds in the vector body.
; Outer/inner loop nest with a fast-math fadd reduction in the inner loop.
; The inner loop walks two alloca-derived pointers in lockstep (an i32 index
; stream and an f64 value stream), gathers from %arg1's pointee, and folds
; the products into a scalar accumulator. The trip count is loop-invariant
; but not a compile-time constant, so the vectorizer emits a runtime
; min-iters check; the CHECK lines below assert VF = 1 / IC = 2 (duplicated
; scalar loads, fmuls, fadds in vector.body).
define void @test_interleave_reduction(ptr %arg, ptr %arg1) {
; A320-LABEL: define void @test_interleave_reduction(
; A320-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
; A320-NEXT: [[ENTRY:.*:]]
; A320-NEXT: [[TPM15:%.*]] = load ptr, ptr [[ARG]], align 8
; A320-NEXT: [[TPM19:%.*]] = load ptr, ptr [[ARG1]], align 8
; A320-NEXT: br label %[[OUTER:.*]]
; A320: [[OUTER]]:
; A320-NEXT: [[TPM26:%.*]] = add i64 0, 1
; A320-NEXT: [[TPM10:%.*]] = alloca i32, align 8
; A320-NEXT: [[TPM102:%.*]] = ptrtoint ptr [[TPM10]] to i64
; A320-NEXT: [[TPM27:%.*]] = getelementptr inbounds i32, ptr [[TPM10]], i64 [[TPM26]]
; A320-NEXT: [[TPM28:%.*]] = getelementptr inbounds ptr, ptr [[TPM15]], i64 0
; A320-NEXT: [[TPM29:%.*]] = load ptr, ptr [[TPM28]], align 8
; A320-NEXT: [[TPM291:%.*]] = ptrtoint ptr [[TPM29]] to i64
; A320-NEXT: [[TPM17:%.*]] = alloca double, align 8
; A320-NEXT: [[TPM32:%.*]] = getelementptr inbounds double, ptr [[TPM17]], i64 [[TPM26]]
; A320-NEXT: [[TMP0:%.*]] = add i64 [[TPM291]], -8
; A320-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[TPM102]]
; A320-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
; A320-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 2
; A320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; A320: [[VECTOR_PH]]:
; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 2
; A320-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; A320-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 4
; A320-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[TPM27]], i64 [[TMP4]]
; A320-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 8
; A320-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[TPM32]], i64 [[TMP5]]
; A320-NEXT: br label %[[VECTOR_BODY:.*]]
; A320: [[VECTOR_BODY]]:
; A320-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[VEC_PHI5:%.*]] = phi double [ 0.000000e+00, %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
; A320-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 4
; A320-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[TPM27]], i64 [[OFFSET_IDX]]
; A320-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[TPM27]], i64 [[TMP7]]
; A320-NEXT: [[OFFSET_IDX7:%.*]] = mul i64 [[INDEX]], 8
; A320-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX7]], 8
; A320-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[TPM32]], i64 [[OFFSET_IDX7]]
; A320-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[TPM32]], i64 [[TMP9]]
; A320-NEXT: [[TMP10:%.*]] = load double, ptr [[NEXT_GEP8]], align 8
; A320-NEXT: [[TMP11:%.*]] = load double, ptr [[NEXT_GEP9]], align 8
; A320-NEXT: [[TMP12:%.*]] = load i32, ptr [[NEXT_GEP]], align 4
; A320-NEXT: [[TMP13:%.*]] = load i32, ptr [[NEXT_GEP6]], align 4
; A320-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
; A320-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; A320-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TPM19]], i64 [[TMP14]]
; A320-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TPM19]], i64 [[TMP15]]
; A320-NEXT: [[TMP18:%.*]] = load double, ptr [[TMP16]], align 8
; A320-NEXT: [[TMP19:%.*]] = load double, ptr [[TMP17]], align 8
; A320-NEXT: [[TMP20:%.*]] = fmul fast double [[TMP18]], [[TMP10]]
; A320-NEXT: [[TMP21:%.*]] = fmul fast double [[TMP19]], [[TMP11]]
; A320-NEXT: [[TMP22]] = fadd fast double [[TMP20]], [[VEC_PHI]]
; A320-NEXT: [[TMP23]] = fadd fast double [[TMP21]], [[VEC_PHI5]]
; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; A320-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; A320-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; A320: [[MIDDLE_BLOCK]]:
; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast double [[TMP23]], [[TMP22]]
; A320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; A320-NEXT: br i1 [[CMP_N]], label %[[EXIT_INNER:.*]], label %[[SCALAR_PH]]
; A320: [[SCALAR_PH]]:
; A320-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TPM27]], %[[OUTER]] ]
; A320-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ [[IND_END3]], %[[MIDDLE_BLOCK]] ], [ [[TPM32]], %[[OUTER]] ]
; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[OUTER]] ]
; A320-NEXT: br label %[[INNER:.*]]
; A320: [[INNER]]:
; A320-NEXT: [[PHI_PTR_I32:%.*]] = phi ptr [ [[NEXT_I32:%.*]], %[[INNER]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; A320-NEXT: [[PHI_PTR_F64:%.*]] = phi ptr [ [[NEXT_F64:%.*]], %[[INNER]] ], [ [[BC_RESUME_VAL8]], %[[SCALAR_PH]] ]
; A320-NEXT: [[PHI_ACC:%.*]] = phi double [ [[TPM50:%.*]], %[[INNER]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
; A320-NEXT: [[TPM44:%.*]] = load double, ptr [[PHI_PTR_F64]], align 8
; A320-NEXT: [[TPM45:%.*]] = load i32, ptr [[PHI_PTR_I32]], align 4
; A320-NEXT: [[TPM46:%.*]] = zext i32 [[TPM45]] to i64
; A320-NEXT: [[TPM47:%.*]] = getelementptr inbounds double, ptr [[TPM19]], i64 [[TPM46]]
; A320-NEXT: [[TPM48:%.*]] = load double, ptr [[TPM47]], align 8
; A320-NEXT: [[TPM49:%.*]] = fmul fast double [[TPM48]], [[TPM44]]
; A320-NEXT: [[TPM50]] = fadd fast double [[TPM49]], [[PHI_ACC]]
; A320-NEXT: [[NEXT_I32]] = getelementptr inbounds i32, ptr [[PHI_PTR_I32]], i64 1
; A320-NEXT: [[NEXT_F64]] = getelementptr inbounds double, ptr [[PHI_PTR_F64]], i64 1
; A320-NEXT: [[DONE:%.*]] = icmp eq ptr [[NEXT_I32]], [[TPM29]]
; A320-NEXT: br i1 [[DONE]], label %[[EXIT_INNER]], label %[[INNER]], !llvm.loop [[LOOP3:![0-9]+]]
; A320: [[EXIT_INNER]]:
; A320-NEXT: [[TPM50_LCSSA:%.*]] = phi double [ [[TPM50]], %[[INNER]] ], [ [[BIN_RDX]], %[[MIDDLE_BLOCK]] ]
; A320-NEXT: [[TPM35:%.*]] = getelementptr inbounds double, ptr [[TPM19]], i64 0
; A320-NEXT: [[TPM37:%.*]] = fsub fast double 0.000000e+00, [[TPM50_LCSSA]]
; A320-NEXT: br label %[[OUTER]]
;
entry:
; %arg and %arg1 are indirections to the real data pointers.
%tpm15 = load ptr, ptr %arg, align 8
%tpm19 = load ptr, ptr %arg1, align 8
br label %outer
outer: ; preds = %inner, %entry
; Per-outer-iteration setup: fresh allocas, plus start pointers one element
; past each alloca base. %tpm29 (loaded from %tpm15) is the inner loop's end
; pointer, so the trip count is runtime-only.
%tpm26 = add i64 0, 1
%tpm10 = alloca i32, align 8
%tpm27 = getelementptr inbounds i32, ptr %tpm10, i64 %tpm26
%tpm28 = getelementptr inbounds ptr, ptr %tpm15, i64 0
%tpm29 = load ptr, ptr %tpm28, align 8
%tpm17 = alloca double, align 8
%tpm32 = getelementptr inbounds double, ptr %tpm17, i64 %tpm26
br label %inner
inner: ; preds = %inner, %outer
; Pointer-induction loop: acc += tpm19[*i32ptr] * *f64ptr; both pointers
; advance by one element per iteration.
%phi.ptr.i32 = phi ptr [ %next.i32, %inner ], [ %tpm27, %outer ]
%phi.ptr.f64 = phi ptr [ %next.f64, %inner ], [ %tpm32, %outer ]
%phi.acc = phi double [ %tpm50, %inner ], [ 0.0, %outer ]
%tpm44 = load double, ptr %phi.ptr.f64, align 8
%tpm45 = load i32, ptr %phi.ptr.i32, align 4
%tpm46 = zext i32 %tpm45 to i64
%tpm47 = getelementptr inbounds double, ptr %tpm19, i64 %tpm46
%tpm48 = load double, ptr %tpm47, align 8
%tpm49 = fmul fast double %tpm48, %tpm44
%tpm50 = fadd fast double %tpm49, %phi.acc
%next.i32 = getelementptr inbounds i32, ptr %phi.ptr.i32, i64 1
%next.f64 = getelementptr inbounds double, ptr %phi.ptr.f64, i64 1
%done = icmp eq ptr %next.i32, %tpm29
br i1 %done, label %exit.inner, label %inner
exit.inner: ; preds = %inner
; Consume the reduction so it is live-out of the inner loop.
%tpm35 = getelementptr inbounds double, ptr %tpm19, i64 0
%tpm37 = fsub fast double 0.0, %tpm50
store double %tpm37, ptr %tpm35, align 8
br label %outer
}
;===---------------------------------------------------------------------===;
; 1) Simple sum-reduction over one array
; Expect: vectorization with VF = 2 and interleave count = 2, so vector.body
; contains duplicated <2 x double> wide loads and fadds.
;===---------------------------------------------------------------------===;
; Sum-reduce %a[0..%n) with fast-math; returns 0.0 when %n == 0.
; The CHECK lines below assert that LoopVectorize produces two parallel
; <2 x double> accumulators (VF = 2, IC = 2) combined by a final
; llvm.vector.reduce.fadd in middle.block.
define double @sum_reduction(ptr nocapture readonly %a, i64 %n) {
; A320-LABEL: define double @sum_reduction(
; A320-SAME: ptr readonly captures(none) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; A320-NEXT: [[ENTRY:.*]]:
; A320-NEXT: [[CMP0:%.*]] = icmp eq i64 [[N]], 0
; A320-NEXT: br i1 [[CMP0]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
; A320: [[LOOP_PREHEADER]]:
; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; A320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; A320: [[VECTOR_PH]]:
; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; A320-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; A320-NEXT: br label %[[VECTOR_BODY:.*]]
; A320: [[VECTOR_BODY]]:
; A320-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP2:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP1]]
; A320-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i64 2
; A320-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
; A320-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP6]], align 8
; A320-NEXT: [[TMP2]] = fadd fast <2 x double> [[VEC_PHI]], [[WIDE_LOAD]]
; A320-NEXT: [[TMP4]] = fadd fast <2 x double> [[VEC_PHI1]], [[WIDE_LOAD2]]
; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 4
; A320-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; A320-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; A320: [[MIDDLE_BLOCK]]:
; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP2]]
; A320-NEXT: [[TMP5:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]])
; A320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; A320-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; A320: [[SCALAR_PH]]:
; A320-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[LOOP_PREHEADER]] ]
; A320-NEXT: br label %[[LOOP:.*]]
; A320: [[LOOP]]:
; A320-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; A320-NEXT: [[SUM:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], %[[LOOP]] ]
; A320-NEXT: [[GEP:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]]
; A320-NEXT: [[VAL:%.*]] = load double, ptr [[GEP]], align 8
; A320-NEXT: [[SUM_NEXT]] = fadd fast double [[SUM]], [[VAL]]
; A320-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; A320-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
; A320-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
; A320: [[EXIT_LOOPEXIT]]:
; A320-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi double [ [[SUM_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
; A320-NEXT: br label %[[EXIT]]
; A320: [[EXIT]]:
; A320-NEXT: [[RES:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[SUM_NEXT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
; A320-NEXT: ret double [[RES]]
;
entry:
; Guard against a zero trip count.
%cmp0 = icmp eq i64 %n, 0
br i1 %cmp0, label %exit, label %loop.preheader
loop.preheader:
br label %loop
loop:
; sum += a[iv], counted up to %n (unsigned compare).
%iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
%sum = phi double [ 0.0, %loop.preheader ], [ %sum.next, %loop ]
%gep = getelementptr inbounds double, ptr %a, i64 %iv
%val = load double, ptr %gep, align 8
%sum.next = fadd fast double %sum, %val
%iv.next = add nuw nsw i64 %iv, 1
%cond = icmp ult i64 %iv.next, %n
br i1 %cond, label %loop, label %exit
exit:
; 0.0 on the early-exit path, otherwise the reduction result.
%res = phi double [ 0.0, %entry ], [ %sum.next, %loop ]
ret double %res
}
;===---------------------------------------------------------------------===;
; 2) Dot-product of two arrays
; Expect: again VF = 2 with interleave count = 2. The vector body should
; have two pairs of <2 x double> wide loads and fmuls/fadds.
;===---------------------------------------------------------------------===;
; Fast-math dot product of %a[0..%n) and %b[0..%n); returns 0.0 when %n == 0.
; The CHECK lines below assert VF = 2 with interleave count = 2: two
; <2 x double> accumulators, two wide loads per input array, and a final
; llvm.vector.reduce.fadd in middle.block.
define double @dot_product(ptr nocapture readonly %a, ptr nocapture readonly %b, i64 %n) {
; A320-LABEL: define double @dot_product(
; A320-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; A320-NEXT: [[ENTRY:.*]]:
; A320-NEXT: [[CMP0:%.*]] = icmp eq i64 [[N]], 0
; A320-NEXT: br i1 [[CMP0]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
; A320: [[LOOP_PREHEADER]]:
; A320-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; A320-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; A320: [[VECTOR_PH]]:
; A320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; A320-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; A320-NEXT: br label %[[VECTOR_BODY:.*]]
; A320: [[VECTOR_BODY]]:
; A320-NEXT: [[TMP1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
; A320-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP1]]
; A320-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP1]]
; A320-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i64 2
; A320-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
; A320-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP2]], align 8
; A320-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i64 2
; A320-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP5]], align 8
; A320-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP8]], align 8
; A320-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD3]]
; A320-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
; A320-NEXT: [[TMP6]] = fadd fast <2 x double> [[VEC_PHI]], [[TMP4]]
; A320-NEXT: [[TMP7]] = fadd fast <2 x double> [[VEC_PHI1]], [[TMP10]]
; A320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 4
; A320-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; A320-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; A320: [[MIDDLE_BLOCK]]:
; A320-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP6]]
; A320-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]])
; A320-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; A320-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
; A320: [[SCALAR_PH]]:
; A320-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ]
; A320-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[LOOP_PREHEADER]] ]
; A320-NEXT: br label %[[LOOP:.*]]
; A320: [[LOOP]]:
; A320-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; A320-NEXT: [[ACC:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ACC_NEXT:%.*]], %[[LOOP]] ]
; A320-NEXT: [[GEP_A:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IV]]
; A320-NEXT: [[GEP_B:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[IV]]
; A320-NEXT: [[VA:%.*]] = load double, ptr [[GEP_A]], align 8
; A320-NEXT: [[VB:%.*]] = load double, ptr [[GEP_B]], align 8
; A320-NEXT: [[PROD:%.*]] = fmul fast double [[VA]], [[VB]]
; A320-NEXT: [[ACC_NEXT]] = fadd fast double [[ACC]], [[PROD]]
; A320-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; A320-NEXT: [[COND:%.*]] = icmp ult i64 [[IV_NEXT]], [[N]]
; A320-NEXT: br i1 [[COND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; A320: [[EXIT_LOOPEXIT]]:
; A320-NEXT: [[ACC_NEXT_LCSSA:%.*]] = phi double [ [[ACC_NEXT]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; A320-NEXT: br label %[[EXIT]]
; A320: [[EXIT]]:
; A320-NEXT: [[RES:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[ACC_NEXT_LCSSA]], %[[EXIT_LOOPEXIT]] ]
; A320-NEXT: ret double [[RES]]
;
entry:
; Guard against a zero trip count.
%cmp0 = icmp eq i64 %n, 0
br i1 %cmp0, label %exit, label %loop.preheader
loop.preheader:
br label %loop
loop:
; acc += a[iv] * b[iv], counted up to %n (unsigned compare).
%iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
%acc = phi double [ 0.0, %loop.preheader ], [ %acc.next, %loop ]
%gep.a = getelementptr inbounds double, ptr %a, i64 %iv
%gep.b = getelementptr inbounds double, ptr %b, i64 %iv
%va = load double, ptr %gep.a, align 8
%vb = load double, ptr %gep.b, align 8
%prod = fmul fast double %va, %vb
%acc.next = fadd fast double %acc, %prod
%iv.next = add nuw nsw i64 %iv, 1
%cond = icmp ult i64 %iv.next, %n
br i1 %cond, label %loop, label %exit
exit:
; 0.0 on the early-exit path, otherwise the reduction result.
%res = phi double [ 0.0, %entry ], [ %acc.next, %loop ]
ret double %res
}
;.
; A320: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; A320: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; A320: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; A320: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
; A320: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; A320: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
; A320: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
; A320: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
;.