| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s |
| target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" |
| |
| @kernel = global [512 x float] zeroinitializer, align 16 |
| @kernel2 = global [512 x float] zeroinitializer, align 16 |
| @kernel3 = global [512 x float] zeroinitializer, align 16 |
| @kernel4 = global [512 x float] zeroinitializer, align 16 |
| ; The four @kernel* arrays above are read at index %iv (consecutive elements). |
| ; @src_data is read at (%iv + %offset) * 3 plus 0, 1 and 2, i.e. with a stride of three floats. |
| @src_data = global [1536 x float] zeroinitializer, align 16 |
| |
| ; The cost of the gathers in the loop (the stride-3 loads from @src_data) gets offset by the vector math, so the loop is still expected to be vectorized. |
| |
| define float @_Z4testmm(i64 %size, i64 %offset) { |
| ; CHECK-LABEL: define float @_Z4testmm( |
| ; CHECK-SAME: i64 [[SIZE:%.*]], i64 [[OFFSET:%.*]]) #[[ATTR0:[0-9]+]] { |
| ; CHECK-NEXT: [[ENTRY:.*]]: |
| ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 8 |
| ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] |
| ; CHECK: [[VECTOR_PH]]: |
| ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SIZE]], 8 |
| ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SIZE]], [[N_MOD_VF]] |
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] |
| ; CHECK: [[VECTOR_BODY]]: |
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP64:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP106:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP107:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP148:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x float> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP149:%.*]], %[[VECTOR_BODY]] ] |
| ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1 |
| ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 2 |
| ; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[INDEX]], 3 |
| ; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[INDEX]], 4 |
| ; CHECK-NEXT: [[TMP48:%.*]] = add i64 [[INDEX]], 5 |
| ; CHECK-NEXT: [[TMP155:%.*]] = add i64 [[INDEX]], 6 |
| ; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7 |
| ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[INDEX]], [[OFFSET]] |
| ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], [[OFFSET]] |
| ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP16]], [[OFFSET]] |
| ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP24]], [[OFFSET]] |
| ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP32]], [[OFFSET]] |
| ; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP48]], [[OFFSET]] |
| ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP155]], [[OFFSET]] |
| ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP7]], [[OFFSET]] |
| ; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3 |
| ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP9]], 3 |
| ; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP10]], 3 |
| ; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP11]], 3 |
| ; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP12]], 3 |
| ; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP13]], 3 |
| ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP14]], 3 |
| ; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP15]], 3 |
| ; CHECK-NEXT: [[GEP_SRC_DATA:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL]] |
| ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP17]] |
| ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP18]] |
| ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP19]] |
| ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP20]] |
| ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP21]] |
| ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP22]] |
| ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP23]] |
| ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[GEP_SRC_DATA]], align 4 |
| ; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP25]], align 4 |
| ; CHECK-NEXT: [[TMP34:%.*]] = load float, ptr [[TMP26]], align 4 |
| ; CHECK-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP27]], align 4 |
| ; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 |
| ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x float> [[TMP36]], float [[TMP33]], i32 1 |
| ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP34]], i32 2 |
| ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x float> [[TMP38]], float [[TMP35]], i32 3 |
| ; CHECK-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP28]], align 4 |
| ; CHECK-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP29]], align 4 |
| ; CHECK-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP30]], align 4 |
| ; CHECK-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP31]], align 4 |
| ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP40]], i32 0 |
| ; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP41]], i32 1 |
| ; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP42]], i32 2 |
| ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP43]], i32 3 |
| ; CHECK-NEXT: [[GEP_KERNEL:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[INDEX]] |
| ; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[GEP_KERNEL]], i64 4 |
| ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[GEP_KERNEL]], align 4 |
| ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP49]], align 4 |
| ; CHECK-NEXT: [[TMP50:%.*]] = fmul fast <4 x float> [[TMP39]], [[WIDE_LOAD]] |
| ; CHECK-NEXT: [[TMP51:%.*]] = fmul fast <4 x float> [[TMP47]], [[WIDE_LOAD6]] |
| ; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[INDEX]] |
| ; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[TMP52]], i64 4 |
| ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP52]], align 4 |
| ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP53]], align 4 |
| ; CHECK-NEXT: [[TMP54:%.*]] = fmul fast <4 x float> [[TMP50]], [[WIDE_LOAD7]] |
| ; CHECK-NEXT: [[TMP55:%.*]] = fmul fast <4 x float> [[TMP51]], [[WIDE_LOAD8]] |
| ; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[INDEX]] |
| ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP56]], i64 4 |
| ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP56]], align 4 |
| ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP57]], align 4 |
| ; CHECK-NEXT: [[TMP58:%.*]] = fmul fast <4 x float> [[TMP54]], [[WIDE_LOAD9]] |
| ; CHECK-NEXT: [[TMP59:%.*]] = fmul fast <4 x float> [[TMP55]], [[WIDE_LOAD10]] |
| ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[INDEX]] |
| ; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[TMP60]], i64 4 |
| ; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 |
| ; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP61]], align 4 |
| ; CHECK-NEXT: [[TMP62:%.*]] = fmul fast <4 x float> [[TMP58]], [[WIDE_LOAD11]] |
| ; CHECK-NEXT: [[TMP63:%.*]] = fmul fast <4 x float> [[TMP59]], [[WIDE_LOAD12]] |
| ; CHECK-NEXT: [[TMP64]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP62]] |
| ; CHECK-NEXT: [[TMP65]] = fadd fast <4 x float> [[VEC_PHI1]], [[TMP63]] |
| ; CHECK-NEXT: [[TMP66:%.*]] = add i64 [[MUL]], 1 |
| ; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[TMP17]], 1 |
| ; CHECK-NEXT: [[TMP68:%.*]] = add i64 [[TMP18]], 1 |
| ; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[TMP19]], 1 |
| ; CHECK-NEXT: [[TMP70:%.*]] = add i64 [[TMP20]], 1 |
| ; CHECK-NEXT: [[TMP71:%.*]] = add i64 [[TMP21]], 1 |
| ; CHECK-NEXT: [[TMP72:%.*]] = add i64 [[TMP22]], 1 |
| ; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[TMP23]], 1 |
| ; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP66]] |
| ; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP67]] |
| ; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP68]] |
| ; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP69]] |
| ; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP70]] |
| ; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP71]] |
| ; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP72]] |
| ; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP73]] |
| ; CHECK-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP74]], align 4 |
| ; CHECK-NEXT: [[TMP83:%.*]] = load float, ptr [[TMP75]], align 4 |
| ; CHECK-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP76]], align 4 |
| ; CHECK-NEXT: [[TMP85:%.*]] = load float, ptr [[TMP77]], align 4 |
| ; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x float> poison, float [[TMP82]], i32 0 |
| ; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x float> [[TMP86]], float [[TMP83]], i32 1 |
| ; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x float> [[TMP87]], float [[TMP84]], i32 2 |
| ; CHECK-NEXT: [[TMP89:%.*]] = insertelement <4 x float> [[TMP88]], float [[TMP85]], i32 3 |
| ; CHECK-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP78]], align 4 |
| ; CHECK-NEXT: [[TMP91:%.*]] = load float, ptr [[TMP79]], align 4 |
| ; CHECK-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP80]], align 4 |
| ; CHECK-NEXT: [[TMP93:%.*]] = load float, ptr [[TMP81]], align 4 |
| ; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x float> poison, float [[TMP90]], i32 0 |
| ; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x float> [[TMP94]], float [[TMP91]], i32 1 |
| ; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x float> [[TMP95]], float [[TMP92]], i32 2 |
| ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x float> [[TMP96]], float [[TMP93]], i32 3 |
| ; CHECK-NEXT: [[TMP98:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP89]] |
| ; CHECK-NEXT: [[TMP99:%.*]] = fmul fast <4 x float> [[WIDE_LOAD6]], [[TMP97]] |
| ; CHECK-NEXT: [[TMP100:%.*]] = fmul fast <4 x float> [[WIDE_LOAD7]], [[TMP98]] |
| ; CHECK-NEXT: [[TMP101:%.*]] = fmul fast <4 x float> [[WIDE_LOAD8]], [[TMP99]] |
| ; CHECK-NEXT: [[TMP102:%.*]] = fmul fast <4 x float> [[WIDE_LOAD9]], [[TMP100]] |
| ; CHECK-NEXT: [[TMP103:%.*]] = fmul fast <4 x float> [[WIDE_LOAD10]], [[TMP101]] |
| ; CHECK-NEXT: [[TMP104:%.*]] = fmul fast <4 x float> [[WIDE_LOAD11]], [[TMP102]] |
| ; CHECK-NEXT: [[TMP105:%.*]] = fmul fast <4 x float> [[WIDE_LOAD12]], [[TMP103]] |
| ; CHECK-NEXT: [[TMP106]] = fadd fast <4 x float> [[VEC_PHI2]], [[TMP104]] |
| ; CHECK-NEXT: [[TMP107]] = fadd fast <4 x float> [[VEC_PHI3]], [[TMP105]] |
| ; CHECK-NEXT: [[TMP108:%.*]] = add i64 [[MUL]], 2 |
| ; CHECK-NEXT: [[TMP109:%.*]] = add i64 [[TMP17]], 2 |
| ; CHECK-NEXT: [[TMP110:%.*]] = add i64 [[TMP18]], 2 |
| ; CHECK-NEXT: [[TMP111:%.*]] = add i64 [[TMP19]], 2 |
| ; CHECK-NEXT: [[TMP112:%.*]] = add i64 [[TMP20]], 2 |
| ; CHECK-NEXT: [[TMP113:%.*]] = add i64 [[TMP21]], 2 |
| ; CHECK-NEXT: [[TMP114:%.*]] = add i64 [[TMP22]], 2 |
| ; CHECK-NEXT: [[TMP115:%.*]] = add i64 [[TMP23]], 2 |
| ; CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP108]] |
| ; CHECK-NEXT: [[TMP117:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP109]] |
| ; CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP110]] |
| ; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP111]] |
| ; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP112]] |
| ; CHECK-NEXT: [[TMP121:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP113]] |
| ; CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP114]] |
| ; CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[TMP115]] |
| ; CHECK-NEXT: [[TMP124:%.*]] = load float, ptr [[TMP116]], align 4 |
| ; CHECK-NEXT: [[TMP125:%.*]] = load float, ptr [[TMP117]], align 4 |
| ; CHECK-NEXT: [[TMP126:%.*]] = load float, ptr [[TMP118]], align 4 |
| ; CHECK-NEXT: [[TMP127:%.*]] = load float, ptr [[TMP119]], align 4 |
| ; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x float> poison, float [[TMP124]], i32 0 |
| ; CHECK-NEXT: [[TMP129:%.*]] = insertelement <4 x float> [[TMP128]], float [[TMP125]], i32 1 |
| ; CHECK-NEXT: [[TMP130:%.*]] = insertelement <4 x float> [[TMP129]], float [[TMP126]], i32 2 |
| ; CHECK-NEXT: [[TMP131:%.*]] = insertelement <4 x float> [[TMP130]], float [[TMP127]], i32 3 |
| ; CHECK-NEXT: [[TMP132:%.*]] = load float, ptr [[TMP120]], align 4 |
| ; CHECK-NEXT: [[TMP133:%.*]] = load float, ptr [[TMP121]], align 4 |
| ; CHECK-NEXT: [[TMP134:%.*]] = load float, ptr [[TMP122]], align 4 |
| ; CHECK-NEXT: [[TMP135:%.*]] = load float, ptr [[TMP123]], align 4 |
| ; CHECK-NEXT: [[TMP136:%.*]] = insertelement <4 x float> poison, float [[TMP132]], i32 0 |
| ; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x float> [[TMP136]], float [[TMP133]], i32 1 |
| ; CHECK-NEXT: [[TMP138:%.*]] = insertelement <4 x float> [[TMP137]], float [[TMP134]], i32 2 |
| ; CHECK-NEXT: [[TMP139:%.*]] = insertelement <4 x float> [[TMP138]], float [[TMP135]], i32 3 |
| ; CHECK-NEXT: [[TMP140:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], [[TMP131]] |
| ; CHECK-NEXT: [[TMP141:%.*]] = fmul fast <4 x float> [[WIDE_LOAD6]], [[TMP139]] |
| ; CHECK-NEXT: [[TMP142:%.*]] = fmul fast <4 x float> [[WIDE_LOAD7]], [[TMP140]] |
| ; CHECK-NEXT: [[TMP143:%.*]] = fmul fast <4 x float> [[WIDE_LOAD8]], [[TMP141]] |
| ; CHECK-NEXT: [[TMP144:%.*]] = fmul fast <4 x float> [[WIDE_LOAD9]], [[TMP142]] |
| ; CHECK-NEXT: [[TMP145:%.*]] = fmul fast <4 x float> [[WIDE_LOAD10]], [[TMP143]] |
| ; CHECK-NEXT: [[TMP146:%.*]] = fmul fast <4 x float> [[WIDE_LOAD11]], [[TMP144]] |
| ; CHECK-NEXT: [[TMP147:%.*]] = fmul fast <4 x float> [[WIDE_LOAD12]], [[TMP145]] |
| ; CHECK-NEXT: [[TMP148]] = fadd fast <4 x float> [[VEC_PHI4]], [[TMP146]] |
| ; CHECK-NEXT: [[TMP149]] = fadd fast <4 x float> [[VEC_PHI5]], [[TMP147]] |
| ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 |
| ; CHECK-NEXT: [[TMP150:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] |
| ; CHECK-NEXT: br i1 [[TMP150]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] |
| ; CHECK: [[MIDDLE_BLOCK]]: |
| ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP65]], [[TMP64]] |
| ; CHECK-NEXT: [[TMP151:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX]]) |
| ; CHECK-NEXT: [[BIN_RDX13:%.*]] = fadd fast <4 x float> [[TMP107]], [[TMP106]] |
| ; CHECK-NEXT: [[TMP152:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX13]]) |
| ; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP149]], [[TMP148]] |
| ; CHECK-NEXT: [[TMP153:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[BIN_RDX14]]) |
| ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SIZE]], [[N_VEC]] |
| ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] |
| ; CHECK: [[SCALAR_PH]]: |
| ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] |
| ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP151]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] |
| ; CHECK-NEXT: [[BC_MERGE_RDX15:%.*]] = phi float [ [[TMP152]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] |
| ; CHECK-NEXT: [[BC_MERGE_RDX16:%.*]] = phi float [ [[TMP153]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] |
| ; CHECK-NEXT: br label %[[LOOP:.*]] |
| ; CHECK: [[LOOP]]: |
| ; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] |
| ; CHECK-NEXT: [[RDX_0:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RDX_0_NEXT:%.*]], %[[LOOP]] ] |
| ; CHECK-NEXT: [[RDX_1:%.*]] = phi float [ [[BC_MERGE_RDX15]], %[[SCALAR_PH]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ] |
| ; CHECK-NEXT: [[RED_2:%.*]] = phi float [ [[BC_MERGE_RDX16]], %[[SCALAR_PH]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ] |
| ; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[IV1]], [[OFFSET]] |
| ; CHECK-NEXT: [[MUL1:%.*]] = mul i64 [[ADD1]], 3 |
| ; CHECK-NEXT: [[GEP_SRC_DATA1:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[MUL1]] |
| ; CHECK-NEXT: [[TMP154:%.*]] = load float, ptr [[GEP_SRC_DATA1]], align 4 |
| ; CHECK-NEXT: [[GEP_KERNEL1:%.*]] = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 [[IV1]] |
| ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[GEP_KERNEL1]], align 4 |
| ; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP154]], [[TMP1]] |
| ; CHECK-NEXT: [[GEP_KERNEL2:%.*]] = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 [[IV1]] |
| ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[GEP_KERNEL2]], align 4 |
| ; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]] |
| ; CHECK-NEXT: [[GEP_KERNEL3:%.*]] = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 [[IV1]] |
| ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[GEP_KERNEL3]], align 4 |
| ; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]] |
| ; CHECK-NEXT: [[GEP_KERNEL4:%.*]] = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 [[IV1]] |
| ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[GEP_KERNEL4]], align 4 |
| ; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]] |
| ; CHECK-NEXT: [[RDX_0_NEXT]] = fadd fast float [[RDX_0]], [[MUL9]] |
| ; CHECK-NEXT: [[GEP_SRC_DATA_SUM:%.*]] = add i64 [[MUL1]], 1 |
| ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM]] |
| ; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX11]], align 4 |
| ; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]] |
| ; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]] |
| ; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]] |
| ; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]] |
| ; CHECK-NEXT: [[RDX_1_NEXT]] = fadd fast float [[RDX_1]], [[MUL19]] |
| ; CHECK-NEXT: [[GEP_SRC_DATA_SUM52:%.*]] = add i64 [[MUL1]], 2 |
| ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 [[GEP_SRC_DATA_SUM52]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX21]], align 4 |
| ; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]] |
| ; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]] |
| ; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]] |
| ; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]] |
| ; CHECK-NEXT: [[RDX_2_NEXT]] = fadd fast float [[RED_2]], [[MUL29]] |
| ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1 |
| ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[SIZE]] |
| ; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] |
| ; CHECK: [[EXIT]]: |
| ; CHECK-NEXT: [[RDX_0_NEXT_LCSSA:%.*]] = phi float [ [[RDX_0_NEXT]], %[[LOOP]] ], [ [[TMP151]], %[[MIDDLE_BLOCK]] ] |
| ; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi float [ [[RDX_1_NEXT]], %[[LOOP]] ], [ [[TMP152]], %[[MIDDLE_BLOCK]] ] |
| ; CHECK-NEXT: [[RDX_2_NEXT_LCSSA:%.*]] = phi float [ [[RDX_2_NEXT]], %[[LOOP]] ], [ [[TMP153]], %[[MIDDLE_BLOCK]] ] |
| ; CHECK-NEXT: [[RES_0:%.*]] = fadd float [[RDX_0_NEXT_LCSSA]], [[RDX_1_NEXT_LCSSA]] |
| ; CHECK-NEXT: [[RES_1:%.*]] = fadd float [[RES_0]], [[RDX_2_NEXT_LCSSA]] |
| ; CHECK-NEXT: ret float [[RES_1]] |
| ; |
| ; Input IR: a single scalar loop that accumulates three fast-math float sums |
| ; and returns their total. Each sum multiplies one @src_data element by the |
| ; matching elements of @kernel, @kernel2, @kernel3 and @kernel4. |
| entry: |
| br label %loop |
| |
| loop: |
| ; Induction variable plus the three running sums, all starting at zero. |
| ; NOTE(review): the third accumulator is named %red.2 while its siblings use |
| ; the %rdx.N pattern; the autogenerated [[RED_2]] check name derives from it. |
| %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] |
| %rdx.0 = phi float [ 0.000000e+00, %entry ], [ %rdx.0.next, %loop ] |
| %rdx.1 = phi float [ 0.000000e+00, %entry ], [ %rdx.1.next, %loop ] |
| %red.2 = phi float [ 0.000000e+00, %entry ], [ %rdx.2.next, %loop ] |
| ; Base index into @src_data is (%iv + %offset) * 3, so consecutive iterations |
| ; touch elements three floats apart (non-consecutive access). |
| %add = add i64 %iv, %offset |
| %mul = mul i64 %add, 3 |
| %gep.src_data = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %mul |
| %0 = load float, ptr %gep.src_data, align 4 |
| ; First sum: src_data[%mul] * kernel[%iv] * kernel2[%iv] * kernel3[%iv] * kernel4[%iv]. |
| ; The kernel loads are indexed by %iv directly (consecutive access). |
| %gep.kernel = getelementptr inbounds [512 x float], ptr @kernel, i64 0, i64 %iv |
| %1 = load float, ptr %gep.kernel, align 4 |
| %mul3 = fmul fast float %0, %1 |
| %gep.kernel2 = getelementptr inbounds [512 x float], ptr @kernel2, i64 0, i64 %iv |
| %2 = load float, ptr %gep.kernel2, align 4 |
| %mul5 = fmul fast float %mul3, %2 |
| %gep.kernel3 = getelementptr inbounds [512 x float], ptr @kernel3, i64 0, i64 %iv |
| %3 = load float, ptr %gep.kernel3, align 4 |
| %mul7 = fmul fast float %mul5, %3 |
| %gep.kernel4 = getelementptr inbounds [512 x float], ptr @kernel4, i64 0, i64 %iv |
| %4 = load float, ptr %gep.kernel4, align 4 |
| %mul9 = fmul fast float %mul7, %4 |
| %rdx.0.next = fadd fast float %rdx.0, %mul9 |
| ; Second sum: same product chain using src_data[%mul + 1]; the kernel values |
| ; %1..%4 loaded above are reused rather than reloaded. |
| %gep.src_data.sum = add i64 %mul, 1 |
| %arrayidx11 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum |
| %5 = load float, ptr %arrayidx11, align 4 |
| %mul13 = fmul fast float %1, %5 |
| %mul15 = fmul fast float %2, %mul13 |
| %mul17 = fmul fast float %3, %mul15 |
| %mul19 = fmul fast float %4, %mul17 |
| %rdx.1.next = fadd fast float %rdx.1, %mul19 |
| ; Third sum: same product chain using src_data[%mul + 2]. |
| %gep.src_data.sum52 = add i64 %mul, 2 |
| %arrayidx21 = getelementptr inbounds [1536 x float], ptr @src_data, i64 0, i64 %gep.src_data.sum52 |
| %6 = load float, ptr %arrayidx21, align 4 |
| %mul23 = fmul fast float %1, %6 |
| %mul25 = fmul fast float %2, %mul23 |
| %mul27 = fmul fast float %3, %mul25 |
| %mul29 = fmul fast float %4, %mul27 |
| %rdx.2.next = fadd fast float %red.2, %mul29 |
| ; Bottom-tested loop: the body runs before the exit test, and the loop |
| ; continues until %iv.next equals %size. |
| %iv.next = add i64 %iv, 1 |
| %exitcond = icmp ne i64 %iv.next, %size |
| br i1 %exitcond, label %loop, label %exit |
| |
| exit: |
| ; Combine the three sums; unlike the in-loop adds, these fadds carry no |
| ; fast-math flags. |
| %res.0 = fadd float %rdx.0.next, %rdx.1.next |
| %res.1 = fadd float %res.0, %rdx.2.next |
| ret float %res.1 |
| } |
| ;. |
| ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} |
| ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} |
| ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} |
| ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} |
| ;. |