; blob: f63e506f3adb1a5763e95a64d51992b31ffebce8 [file] [log] [blame] [edit]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s
; Data layout for this test: little-endian ("e") with 64-bit pointers ("p:64:64:64").
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
; Four 512-element float arrays; the loop in @_Z4testmm reads each of them at
; index %iv (consecutive accesses).
@kernel = global [512 x float] zeroinitializer, align 16
@kernel2 = global [512 x float] zeroinitializer, align 16
@kernel3 = global [512 x float] zeroinitializer, align 16
@kernel4 = global [512 x float] zeroinitializer, align 16
; 1536-element (3 * 512) float array; the loop in @_Z4testmm reads it at
; 3 * (%iv + %offset) + {0, 1, 2}, i.e. with a stride of three elements.
@src_data = global [1536 x float] zeroinitializer, align 16
; The cost of gathers in the loop gets offset by the vector math.
;
; Scalar input: a single loop that runs %iv from 0 until %iv.next equals %size
; and carries three independent fast-math float sums. On every iteration, sum k
; (k = 0, 1, 2) accumulates
;   src_data[3 * (%iv + %offset) + k] * kernel[%iv] * kernel2[%iv] * kernel3[%iv] * kernel4[%iv]
; so the @src_data reads are stride-3 while the four kernel reads are
; consecutive. The function returns the sum of the three accumulators.
;
; The assertions below expect the loop to be vectorized with <4 x float> values
; and two vector parts per iteration (the vector index advances by 8): the
; stride-3 @src_data elements are loaded as scalars and assembled with
; insertelement, while the kernel arrays are read with wide vector loads.
define float @_Z4testmm(i64 %size, i64 %offset) {
; CHECK-LABEL: define float @_Z4testmm(
; CHECK-SAME: i64 [[SIZE:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP64:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP106:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP107:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP148:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP149:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 3
; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP48:%.*]] = add i64 [[INDEX]], 5
; CHECK-NEXT: [[TMP155:%.*]] = add i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[INDEX]], [[OFFSET]]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], [[OFFSET]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP16]], [[OFFSET]]
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP24]], [[OFFSET]]
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP32]], [[OFFSET]]
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP48]], [[OFFSET]]
; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP155]], [[OFFSET]]
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP7]], [[OFFSET]]
; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP9]], 3
; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP10]], 3
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP11]], 3
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP12]], 3
; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP13]], 3
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP14]], 3
; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP15]], 3
; CHECK-NEXT: [[GEP_SRC_DATA:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP17]]
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP18]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP19]]
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP20]]
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP21]]
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP22]]
; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP23]]
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP_SRC_DATA]], align 4
; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP25]], align 4
; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP26]], align 4
; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP27]], align 4
; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP33]], i32 1
; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i32 2
; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP35]], i32 3
; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP28]], align 4
; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP29]], align 4
; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP30]], align 4
; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP31]], align 4
; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP40]], i32 0
; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP41]], i32 1
; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP42]], i32 2
; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP43]], i32 3
; CHECK-NEXT: [[GEP_KERNEL:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[GEP_KERNEL]], i64 4
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_KERNEL]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP49]], align 4
; CHECK-NEXT: [[TMP50:%.*]] = fmul fast <4 x float> [[TMP39]], [[WIDE_LOAD]]
; CHECK-NEXT: [[TMP51:%.*]] = fmul fast <4 x float> [[TMP47]], [[WIDE_LOAD6]]
; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP52]], i64 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP52]], align 4
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP53]], align 4
; CHECK-NEXT: [[TMP54:%.*]] = fmul fast <4 x float> [[TMP50]], [[WIDE_LOAD7]]
; CHECK-NEXT: [[TMP55:%.*]] = fmul fast <4 x float> [[TMP51]], [[WIDE_LOAD8]]
; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP56]], i64 4
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP56]], align 4
; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP57]], align 4
; CHECK-NEXT: [[TMP58:%.*]] = fmul fast <4 x float> [[TMP54]], [[WIDE_LOAD9]]
; CHECK-NEXT: [[TMP59:%.*]] = fmul fast <4 x float> [[TMP55]], [[WIDE_LOAD10]]
; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP60]], i64 4
; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP60]], align 4
; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP61]], align 4
; CHECK-NEXT: [[TMP62:%.*]] = fmul fast <4 x float> [[TMP58]], [[WIDE_LOAD11]]
; CHECK-NEXT: [[TMP63:%.*]] = fmul fast <4 x float> [[TMP59]], [[WIDE_LOAD12]]
; CHECK-NEXT: [[TMP64]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP62]]
; CHECK-NEXT: [[TMP65]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP63]]
; CHECK-NEXT: [[TMP66:%.*]] = add i64 [[MUL]], 1
; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[TMP17]], 1
; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[TMP18]], 1
; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[TMP19]], 1
; CHECK-NEXT: [[TMP70:%.*]] = add i64 [[TMP20]], 1
; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[TMP21]], 1
; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[TMP22]], 1
; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[TMP23]], 1
; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP66]]
; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP67]]
; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP68]]
; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP69]]
; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP70]]
; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP71]]
; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP72]]
; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP73]]
; CHECK-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP74]], align 4
; CHECK-NEXT: [[TMP83:%.*]] = load float, ptr [[TMP75]], align 4
; CHECK-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP76]], align 4
; CHECK-NEXT: [[TMP85:%.*]] = load float, ptr [[TMP77]], align 4
; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x float> poison, float [[TMP82]], i32 0
; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x float> [[TMP86]], float [[TMP83]], i32 1
; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x float> [[TMP87]], float [[TMP84]], i32 2
; CHECK-NEXT: [[TMP89:%.*]] = insertelement <4 x float> [[TMP88]], float [[TMP85]], i32 3
; CHECK-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP78]], align 4
; CHECK-NEXT: [[TMP91:%.*]] = load float, ptr [[TMP79]], align 4
; CHECK-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP80]], align 4
; CHECK-NEXT: [[TMP93:%.*]] = load float, ptr [[TMP81]], align 4
; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x float> poison, float [[TMP90]], i32 0
; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x float> [[TMP94]], float [[TMP91]], i32 1
; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x float> [[TMP95]], float [[TMP92]], i32 2
; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x float> [[TMP96]], float [[TMP93]], i32 3
; CHECK-NEXT: [[TMP98:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP89]]
; CHECK-NEXT: [[TMP99:%.*]] = fmul fast <4 x float> [[WIDE_LOAD6]], [[TMP97]]
; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <4 x float> [[WIDE_LOAD7]], [[TMP98]]
; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <4 x float> [[WIDE_LOAD8]], [[TMP99]]
; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <4 x float> [[WIDE_LOAD9]], [[TMP100]]
; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <4 x float> [[WIDE_LOAD10]], [[TMP101]]
; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <4 x float> [[WIDE_LOAD11]], [[TMP102]]
; CHECK-NEXT: [[TMP105:%.*]] = fmul fast <4 x float> [[WIDE_LOAD12]], [[TMP103]]
; CHECK-NEXT: [[TMP106]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP104]]
; CHECK-NEXT: [[TMP107]] = fadd fast <4 x float> [[VEC_PHI3]], [[TMP105]]
; CHECK-NEXT: [[TMP108:%.*]] = add i64 [[MUL]], 2
; CHECK-NEXT: [[TMP109:%.*]] = add i64 [[TMP17]], 2
; CHECK-NEXT: [[TMP110:%.*]] = add i64 [[TMP18]], 2
; CHECK-NEXT: [[TMP111:%.*]] = add i64 [[TMP19]], 2
; CHECK-NEXT: [[TMP112:%.*]] = add i64 [[TMP20]], 2
; CHECK-NEXT: [[TMP113:%.*]] = add i64 [[TMP21]], 2
; CHECK-NEXT: [[TMP114:%.*]] = add i64 [[TMP22]], 2
; CHECK-NEXT: [[TMP115:%.*]] = add i64 [[TMP23]], 2
; CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP108]]
; CHECK-NEXT: [[TMP117:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP109]]
; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP110]]
; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP111]]
; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP112]]
; CHECK-NEXT: [[TMP121:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP113]]
; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP114]]
; CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP115]]
; CHECK-NEXT: [[TMP124:%.*]] = load float, ptr [[TMP116]], align 4
; CHECK-NEXT: [[TMP125:%.*]] = load float, ptr [[TMP117]], align 4
; CHECK-NEXT: [[TMP126:%.*]] = load float, ptr [[TMP118]], align 4
; CHECK-NEXT: [[TMP127:%.*]] = load float, ptr [[TMP119]], align 4
; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x float> poison, float [[TMP124]], i32 0
; CHECK-NEXT: [[TMP129:%.*]] = insertelement <4 x float> [[TMP128]], float [[TMP125]], i32 1
; CHECK-NEXT: [[TMP130:%.*]] = insertelement <4 x float> [[TMP129]], float [[TMP126]], i32 2
; CHECK-NEXT: [[TMP131:%.*]] = insertelement <4 x float> [[TMP130]], float [[TMP127]], i32 3
; CHECK-NEXT: [[TMP132:%.*]] = load float, ptr [[TMP120]], align 4
; CHECK-NEXT: [[TMP133:%.*]] = load float, ptr [[TMP121]], align 4
; CHECK-NEXT: [[TMP134:%.*]] = load float, ptr [[TMP122]], align 4
; CHECK-NEXT: [[TMP135:%.*]] = load float, ptr [[TMP123]], align 4
; CHECK-NEXT: [[TMP136:%.*]] = insertelement <4 x float> poison, float [[TMP132]], i32 0
; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x float> [[TMP136]], float [[TMP133]], i32 1
; CHECK-NEXT: [[TMP138:%.*]] = insertelement <4 x float> [[TMP137]], float [[TMP134]], i32 2
; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x float> [[TMP138]], float [[TMP135]], i32 3
; CHECK-NEXT: [[TMP140:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP131]]
; CHECK-NEXT: [[TMP141:%.*]] = fmul fast <4 x float> [[WIDE_LOAD6]], [[TMP139]]
; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <4 x float> [[WIDE_LOAD7]], [[TMP140]]
; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <4 x float> [[WIDE_LOAD8]], [[TMP141]]
; CHECK-NEXT: [[TMP144:%.*]] = fmul fast <4 x float> [[WIDE_LOAD9]], [[TMP142]]
; CHECK-NEXT: [[TMP145:%.*]] = fmul fast <4 x float> [[WIDE_LOAD10]], [[TMP143]]
; CHECK-NEXT: [[TMP146:%.*]] = fmul fast <4 x float> [[WIDE_LOAD11]], [[TMP144]]
; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <4 x float> [[WIDE_LOAD12]], [[TMP145]]
; CHECK-NEXT: [[TMP148]] = fadd fast <4 x float> [[VEC_PHI4]], [[TMP146]]
; CHECK-NEXT: [[TMP149]] = fadd fast <4 x float> [[VEC_PHI5]], [[TMP147]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP150:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP150]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP65]], [[TMP64]]
; CHECK-NEXT: [[TMP151:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]])
; CHECK-NEXT: [[BIN_RDX13:%.*]] = fadd fast <4 x float> [[TMP107]], [[TMP106]]
; CHECK-NEXT: [[TMP152:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX13]])
; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP149]], [[TMP148]]
; CHECK-NEXT: [[TMP153:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX14]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP151]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX15:%.*]] = phi float [ [[TMP152]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi float [ [[TMP153]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[RDX_0:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ [[BC_MERGE_RDX15]], %[[SCALAR_PH]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[RED_2:%.*]] = phi float [ [[BC_MERGE_RDX16]], %[[SCALAR_PH]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IV1]], [[OFFSET]]
; CHECK-NEXT: [[MUL1:%.*]] = mul i64 [[ADD1]], 3
; CHECK-NEXT: [[GEP_SRC_DATA1:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL1]]
; CHECK-NEXT: [[TMP154:%.*]] = load float, ptr [[GEP_SRC_DATA1]], align 4
; CHECK-NEXT: [[GEP_KERNEL1:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[IV1]]
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[GEP_KERNEL1]], align 4
; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP154]], [[TMP1]]
; CHECK-NEXT: [[GEP_KERNEL2:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[IV1]]
; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[GEP_KERNEL2]], align 4
; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]]
; CHECK-NEXT: [[GEP_KERNEL3:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[IV1]]
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[GEP_KERNEL3]], align 4
; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]]
; CHECK-NEXT: [[GEP_KERNEL4:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[IV1]]
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[GEP_KERNEL4]], align 4
; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]]
; CHECK-NEXT: [[RDX_0_NEXT]] = fadd fast float [[RDX_0]], [[MUL9]]
; CHECK-NEXT: [[GEP_SRC_DATA_SUM:%.*]] = add i64 [[MUL1]], 1
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM]]
; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]]
; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]]
; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]]
; CHECK-NEXT: [[RDX_1_NEXT]] = fadd fast float [[RDX_1]], [[MUL19]]
; CHECK-NEXT: [[GEP_SRC_DATA_SUM52:%.*]] = add i64 [[MUL1]], 2
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM52]]
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX21]], align 4
; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]]
; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]]
; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]]
; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]]
; CHECK-NEXT: [[RDX_2_NEXT]] = fadd fast float [[RED_2]], [[MUL29]]
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]]
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ], [ [[TMP151]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ], [ [[TMP152]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ], [ [[TMP153]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[RES_0:%.*]] = fadd float [[RDX_0_NEXT_LCSSA]], [[RDX_1_NEXT_LCSSA]]
; CHECK-NEXT: [[RES_1:%.*]] = fadd float [[RES_0]], [[RDX_2_NEXT_LCSSA]]
; CHECK-NEXT: ret float [[RES_1]]
;
entry:
br label %loop
loop:
; Induction variable and the three reduction accumulators; all start at zero.
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%rdx.0 = phi float [ 0.000000e+00, %entry ], [ %rdx.0.next, %loop ]
%rdx.1 = phi float [ 0.000000e+00, %entry ], [ %rdx.1.next, %loop ]
%red.2 = phi float [ 0.000000e+00, %entry ], [ %rdx.2.next, %loop ]
; Base index into @src_data for this iteration: 3 * (%iv + %offset).
%add = add i64 %iv, %offset
%mul = mul i64 %add, 3
; First sum: src_data[%mul] times the four kernel values at %iv. The kernel
; loads %1..%4 made here are reused by the other two sums below.
%gep.src_data = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %mul
%0 = load float, ptr %gep.src_data, align 4
%gep.kernel = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 %iv
%1 = load float, ptr %gep.kernel, align 4
%mul3 = fmul fast float %0, %1
%gep.kernel2 = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 %iv
%2 = load float, ptr %gep.kernel2, align 4
%mul5 = fmul fast float %mul3, %2
%gep.kernel3 = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 %iv
%3 = load float, ptr %gep.kernel3, align 4
%mul7 = fmul fast float %mul5, %3
%gep.kernel4 = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 %iv
%4 = load float, ptr %gep.kernel4, align 4
%mul9 = fmul fast float %mul7, %4
%rdx.0.next = fadd fast float %rdx.0, %mul9
; Second sum: src_data[%mul + 1] times the same four kernel values.
%gep.src_data.sum = add i64 %mul, 1
%arrayidx11 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum
%5 = load float, ptr %arrayidx11, align 4
%mul13 = fmul fast float %1, %5
%mul15 = fmul fast float %2, %mul13
%mul17 = fmul fast float %3, %mul15
%mul19 = fmul fast float %4, %mul17
%rdx.1.next = fadd fast float %rdx.1, %mul19
; Third sum: src_data[%mul + 2] times the same four kernel values.
%gep.src_data.sum52 = add i64 %mul, 2
%arrayidx21 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum52
%6 = load float, ptr %arrayidx21, align 4
%mul23 = fmul fast float %1, %6
%mul25 = fmul fast float %2, %mul23
%mul27 = fmul fast float %3, %mul25
%mul29 = fmul fast float %4, %mul27
%rdx.2.next = fadd fast float %red.2, %mul29
; Advance %iv by one and keep looping until %iv.next equals %size.
%iv.next = add i64 %iv, 1
%exitcond = icmp ne i64 %iv.next, %size
br i1 %exitcond, label %loop, label %exit
exit:
; Combine the three sums into the return value; unlike the in-loop arithmetic,
; these two fadds carry no fast-math flags.
%res.0 = fadd float %rdx.0.next, %rdx.1.next
%res.1 = fadd float %res.0, %rdx.2.next
ret float %res.1
}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
;.