| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s |
| |
| %struct.DCT_InstanceTypeDef = type { float*, i32, i32 } |
| |
; DCT_mve1: one dot product per outer-loop iteration.
; For k2 = 1, 2, ... the inner loop accumulates pIn[i] * %2[k2 * NumInputs + i]
; four lanes at a time under an active-lane mask (lanes with i >= NumInputs are
; disabled); the reduced sum is stored to pOut[k2].
; The CHECK lines expect the inner loop to be emitted as a dlstp.32/letp loop
; containing two post-incrementing vldrw.u32 loads and one vfma.f32.
| define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { |
| ; CHECK-LABEL: DCT_mve1: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} |
| ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} |
| ; CHECK-NEXT: ldr r3, [r0, #4] |
| ; CHECK-NEXT: sub.w r12, r3, #1 |
| ; CHECK-NEXT: cmp.w r12, #2 |
| ; CHECK-NEXT: blo .LBB0_5 |
| ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader |
| ; CHECK-NEXT: ldr r5, [r0, #8] |
| ; CHECK-NEXT: ldr r3, [r0] |
| ; CHECK-NEXT: add.w r3, r3, r5, lsl #2 |
| ; CHECK-NEXT: movs r0, #1 |
| ; CHECK-NEXT: lsl.w r9, r5, #2 |
| ; CHECK-NEXT: .LBB0_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 |
| ; CHECK-NEXT: vmov.i32 q0, #0x0 |
| ; CHECK-NEXT: mov r6, r1 |
| ; CHECK-NEXT: mov r7, r3 |
| ; CHECK-NEXT: dlstp.32 lr, r5 |
| ; CHECK-NEXT: .LBB0_3: @ %vector.body |
| ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 |
| ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: vldrw.u32 q1, [r6], #16 |
| ; CHECK-NEXT: vldrw.u32 q2, [r7], #16 |
| ; CHECK-NEXT: vfma.f32 q0, q2, q1 |
| ; CHECK-NEXT: letp lr, .LBB0_3 |
| ; CHECK-NEXT: @ %bb.4: @ %middle.block |
| ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 |
| ; CHECK-NEXT: vadd.f32 s2, s2, s3 |
| ; CHECK-NEXT: add.w r7, r2, r0, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s1 |
| ; CHECK-NEXT: adds r0, #1 |
| ; CHECK-NEXT: add r3, r9 |
| ; CHECK-NEXT: cmp r0, r12 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s2 |
| ; CHECK-NEXT: vstr s0, [r7] |
| ; CHECK-NEXT: bne .LBB0_2 |
| ; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup |
| ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} |
; Load struct field 2 (%0, NumInputs), field 1 (%1, NumFilters) and field 0
; (%2, pDCTCoefs); tell the optimizer that NumInputs > 1 (unsigned); enter the
; loops only when %sub = NumFilters - 1 is unsigned-greater than 1.
| entry: |
| %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 |
| %0 = load i32, i32* %NumInputs, align 4 |
| %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 |
| %1 = load i32, i32* %NumFilters, align 4 |
| %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 |
| %2 = load float*, float** %pDCTCoefs, align 4 |
| %cmp = icmp ugt i32 %0, 1 |
| tail call void @llvm.assume(i1 %cmp) |
| %sub = add i32 %1, -1 |
| %cmp350 = icmp ugt i32 %sub, 1 |
| br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup |
| |
; %n.vec = NumInputs rounded up to a multiple of 4; it is the inner loop's
; exit bound.
| for.body.preheader: ; preds = %entry |
| %n.rnd.up = add i32 %0, 3 |
| %n.vec = and i32 %n.rnd.up, -4 |
| br label %for.body |
| |
| for.cond.cleanup: ; preds = %middle.block, %entry |
| ret void |
| |
; Outer loop: %k2.051 starts at 1; %mul4 = k2 * NumInputs is the element
; offset into %2 used by this iteration.
| for.body: ; preds = %for.body.preheader, %middle.block |
| %k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ] |
| %mul4 = mul i32 %k2.051, %0 |
| br label %vector.body |
| |
; Inner loop: %index advances by 4 until it equals %n.vec. Both loads use the
; same lane mask, and the select keeps the previous accumulator value in
; disabled lanes.
| vector.body: ; preds = %vector.body, %for.body |
| %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] |
| %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %10, %vector.body ] |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) |
| %3 = getelementptr inbounds float, float* %pIn, i32 %index |
| %4 = bitcast float* %3 to <4 x float>* |
| %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %5 = add i32 %index, %mul4 |
| %6 = getelementptr inbounds float, float* %2, i32 %5 |
| %7 = bitcast float* %6 to <4 x float>* |
| %wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load |
| %9 = fadd fast <4 x float> %8, %vec.phi |
| %10 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi |
| %index.next = add i32 %index, 4 |
| %11 = icmp eq i32 %index.next, %n.vec |
| br i1 %11, label %middle.block, label %vector.body |
| |
; Reduce the four accumulator lanes to a scalar, store it to pOut[k2], then
; advance k2 by 1; the outer loop exits when the incremented k2 equals %sub.
| middle.block: ; preds = %vector.body |
| %12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %10) |
| %arrayidx14 = getelementptr inbounds float, float* %pOut, i32 %k2.051 |
| store float %12, float* %arrayidx14, align 4 |
| %add16 = add nuw i32 %k2.051, 1 |
| %exitcond52.not = icmp eq i32 %add16, %sub |
| br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body |
| } |
| |
; DCT_mve2: two dot products per outer-loop iteration.
; For k2 = 1, 3, ... the inner loop loads each pIn vector once and multiplies
; it with the coefficient vectors at offsets k2 * NumInputs and
; (k2 + 1) * NumInputs in %2, keeping two separate accumulators; the reduced
; sums are stored to pOut[k2] and pOut[k2 + 1].
; The CHECK lines expect a single dlstp.32/letp inner loop containing three
; post-incrementing vldrw.u32 loads and two vfma.f32.
| define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { |
| ; CHECK-LABEL: DCT_mve2: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: .pad #4 |
| ; CHECK-NEXT: sub sp, #4 |
| ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill |
| ; CHECK-NEXT: ldr r1, [r0, #4] |
| ; CHECK-NEXT: subs r1, #2 |
| ; CHECK-NEXT: cmp r1, #2 |
| ; CHECK-NEXT: blo .LBB1_5 |
| ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader |
| ; CHECK-NEXT: ldr.w r12, [r0, #8] |
| ; CHECK-NEXT: movs r4, #1 |
| ; CHECK-NEXT: ldr r3, [r0] |
| ; CHECK-NEXT: add.w r11, r3, r12, lsl #2 |
| ; CHECK-NEXT: add.w r7, r3, r12, lsl #3 |
| ; CHECK-NEXT: lsl.w r9, r12, #3 |
| ; CHECK-NEXT: .LBB1_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 |
| ; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload |
| ; CHECK-NEXT: vmov.i32 q0, #0x0 |
| ; CHECK-NEXT: add.w r10, r4, #1 |
| ; CHECK-NEXT: mov r3, r11 |
| ; CHECK-NEXT: mov r0, r7 |
| ; CHECK-NEXT: vmov q1, q0 |
| ; CHECK-NEXT: dlstp.32 lr, r12 |
| ; CHECK-NEXT: .LBB1_3: @ %vector.body |
| ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1 |
| ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: vldrw.u32 q2, [r5], #16 |
| ; CHECK-NEXT: vldrw.u32 q3, [r3], #16 |
| ; CHECK-NEXT: vfma.f32 q1, q3, q2 |
| ; CHECK-NEXT: vldrw.u32 q3, [r0], #16 |
| ; CHECK-NEXT: vfma.f32 q0, q3, q2 |
| ; CHECK-NEXT: letp lr, .LBB1_3 |
| ; CHECK-NEXT: @ %bb.4: @ %middle.block |
| ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 |
| ; CHECK-NEXT: vadd.f32 s2, s2, s3 |
| ; CHECK-NEXT: add.w r0, r2, r10, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s1 |
| ; CHECK-NEXT: add r11, r9 |
| ; CHECK-NEXT: vadd.f32 s6, s6, s7 |
| ; CHECK-NEXT: add r7, r9 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s5 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s2 |
| ; CHECK-NEXT: vadd.f32 s2, s4, s6 |
| ; CHECK-NEXT: vstr s0, [r0] |
| ; CHECK-NEXT: add.w r0, r2, r4, lsl #2 |
| ; CHECK-NEXT: adds r4, #2 |
| ; CHECK-NEXT: cmp r4, r1 |
| ; CHECK-NEXT: vstr s2, [r0] |
| ; CHECK-NEXT: blo .LBB1_2 |
| ; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup |
| ; CHECK-NEXT: add sp, #4 |
| ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} |
; Load struct field 2 (%0, NumInputs), field 1 (%1, NumFilters) and field 0
; (%2, pDCTCoefs); tell the optimizer that NumInputs > 1 (unsigned); enter the
; loops only when %sub = NumFilters - 2 is unsigned-greater than 1.
| entry: |
| %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 |
| %0 = load i32, i32* %NumInputs, align 4 |
| %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 |
| %1 = load i32, i32* %NumFilters, align 4 |
| %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 |
| %2 = load float*, float** %pDCTCoefs, align 4 |
| %cmp = icmp ugt i32 %0, 1 |
| tail call void @llvm.assume(i1 %cmp) |
| %sub = add i32 %1, -2 |
| %cmp371 = icmp ugt i32 %sub, 1 |
| br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup |
| |
; %n.vec = NumInputs rounded up to a multiple of 4; it is the inner loop's
; exit bound.
| for.body.preheader: ; preds = %entry |
| %n.rnd.up = add i32 %0, 3 |
| %n.vec = and i32 %n.rnd.up, -4 |
| br label %for.body |
| |
| for.cond.cleanup: ; preds = %middle.block, %entry |
| ret void |
| |
; Outer loop: %k2.072 starts at 1. %mul4 and %mul5 are the element offsets
; k2 * NumInputs and (k2 + 1) * NumInputs into %2.
| for.body: ; preds = %for.body.preheader, %middle.block |
| %k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ] |
| %mul4 = mul i32 %k2.072, %0 |
| %add = add nuw i32 %k2.072, 1 |
| %mul5 = mul i32 %add, %0 |
| br label %vector.body |
| |
; Inner loop: %index advances by 4 until it equals %n.vec. %vec.phi73
; accumulates the products for offset %mul4 and %vec.phi those for %mul5; all
; loads share one lane mask and the selects keep the previous accumulator
; value in disabled lanes.
| vector.body: ; preds = %vector.body, %for.body |
| %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] |
| %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %15, %vector.body ] |
| %vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %16, %vector.body ] |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) |
| %3 = getelementptr inbounds float, float* %pIn, i32 %index |
| %4 = bitcast float* %3 to <4 x float>* |
| %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %5 = add i32 %index, %mul4 |
| %6 = getelementptr inbounds float, float* %2, i32 %5 |
| %7 = bitcast float* %6 to <4 x float>* |
| %wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load |
| %9 = fadd fast <4 x float> %8, %vec.phi73 |
| %10 = add i32 %index, %mul5 |
| %11 = getelementptr inbounds float, float* %2, i32 %10 |
| %12 = bitcast float* %11 to <4 x float>* |
| %wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load |
| %14 = fadd fast <4 x float> %13, %vec.phi |
| %15 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi |
| %16 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi73 |
| %index.next = add i32 %index, 4 |
| %17 = icmp eq i32 %index.next, %n.vec |
| br i1 %17, label %middle.block, label %vector.body |
| |
; Reduce each accumulator to a scalar: %18 (from %16) goes to pOut[k2] and %19
; (from %15) to pOut[k2 + 1]. k2 then advances by 2 and the outer loop
; continues while the new k2 is unsigned-less than %sub.
| middle.block: ; preds = %vector.body |
| %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %16) |
| %19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %15) |
| %arrayidx21 = getelementptr inbounds float, float* %pOut, i32 %k2.072 |
| store float %18, float* %arrayidx21, align 4 |
| %arrayidx23 = getelementptr inbounds float, float* %pOut, i32 %add |
| store float %19, float* %arrayidx23, align 4 |
| %add25 = add i32 %k2.072, 2 |
| %cmp3 = icmp ult i32 %add25, %sub |
| br i1 %cmp3, label %for.body, label %for.cond.cleanup |
| } |
| |
; DCT_mve3: three dot products per outer-loop iteration.
; For k2 = 1, 4, ... the inner loop loads each pIn vector once and multiplies
; it with the coefficient vectors at offsets k2 * NumInputs,
; (k2 + 1) * NumInputs and (k2 + 2) * NumInputs in %2, keeping three separate
; accumulators; the reduced sums are stored to pOut[k2], pOut[k2 + 1] and
; pOut[k2 + 2].
; The CHECK lines expect a single dlstp.32/letp inner loop containing four
; post-incrementing vldrw.u32 loads and three vfma.f32.
| define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { |
| ; CHECK-LABEL: DCT_mve3: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: .pad #4 |
| ; CHECK-NEXT: sub sp, #4 |
| ; CHECK-NEXT: .vsave {d8, d9} |
| ; CHECK-NEXT: vpush {d8, d9} |
| ; CHECK-NEXT: .pad #24 |
| ; CHECK-NEXT: sub sp, #24 |
| ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill |
| ; CHECK-NEXT: ldr r1, [r0, #4] |
| ; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill |
| ; CHECK-NEXT: subs r1, #3 |
| ; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill |
| ; CHECK-NEXT: cmp r1, #2 |
| ; CHECK-NEXT: blo .LBB2_5 |
| ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader |
| ; CHECK-NEXT: ldr r3, [r0, #8] |
| ; CHECK-NEXT: movs r5, #1 |
| ; CHECK-NEXT: ldr r1, [r0] |
| ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill |
| ; CHECK-NEXT: add.w r0, r3, r3, lsl #1 |
| ; CHECK-NEXT: add.w r9, r1, r3, lsl #2 |
| ; CHECK-NEXT: add.w r12, r1, r3, lsl #3 |
| ; CHECK-NEXT: adds r3, #3 |
| ; CHECK-NEXT: bic r3, r3, #3 |
| ; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r10, r1, r0, lsl #2 |
| ; CHECK-NEXT: subs r3, #4 |
| ; CHECK-NEXT: lsl.w r11, r0, #2 |
| ; CHECK-NEXT: add.w r1, r5, r3, lsr #2 |
| ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill |
| ; CHECK-NEXT: .LBB2_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 |
| ; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload |
| ; CHECK-NEXT: vmov.i32 q0, #0x0 |
| ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload |
| ; CHECK-NEXT: adds r0, r5, #2 |
| ; CHECK-NEXT: adds r2, r5, #1 |
| ; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill |
| ; CHECK-NEXT: mov r3, r9 |
| ; CHECK-NEXT: mov r0, r12 |
| ; CHECK-NEXT: mov r4, r10 |
| ; CHECK-NEXT: vmov q2, q0 |
| ; CHECK-NEXT: vmov q1, q0 |
| ; CHECK-NEXT: dlstp.32 lr, r7 |
| ; CHECK-NEXT: .LBB2_3: @ %vector.body |
| ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 |
| ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: vldrw.u32 q3, [r6], #16 |
| ; CHECK-NEXT: vldrw.u32 q4, [r3], #16 |
| ; CHECK-NEXT: vfma.f32 q1, q4, q3 |
| ; CHECK-NEXT: vldrw.u32 q4, [r0], #16 |
| ; CHECK-NEXT: vfma.f32 q2, q4, q3 |
| ; CHECK-NEXT: vldrw.u32 q4, [r4], #16 |
| ; CHECK-NEXT: vfma.f32 q0, q4, q3 |
| ; CHECK-NEXT: letp lr, .LBB2_3 |
| ; CHECK-NEXT: @ %bb.4: @ %middle.block |
| ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 |
| ; CHECK-NEXT: vadd.f32 s10, s10, s11 |
| ; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload |
| ; CHECK-NEXT: vadd.f32 s8, s8, s9 |
| ; CHECK-NEXT: add r9, r11 |
| ; CHECK-NEXT: vadd.f32 s6, s6, s7 |
| ; CHECK-NEXT: add.w r0, r1, r2, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s5 |
| ; CHECK-NEXT: add r12, r11 |
| ; CHECK-NEXT: vadd.f32 s2, s2, s3 |
| ; CHECK-NEXT: add r10, r11 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s1 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s10 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s6 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s2 |
| ; CHECK-NEXT: vstr s8, [r0] |
| ; CHECK-NEXT: add.w r0, r1, r5, lsl #2 |
| ; CHECK-NEXT: adds r5, #3 |
| ; CHECK-NEXT: vstr s4, [r0] |
| ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 |
| ; CHECK-NEXT: vstr s0, [r0] |
| ; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload |
| ; CHECK-NEXT: cmp r5, r0 |
| ; CHECK-NEXT: blo .LBB2_2 |
| ; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup |
| ; CHECK-NEXT: add sp, #24 |
| ; CHECK-NEXT: vpop {d8, d9} |
| ; CHECK-NEXT: add sp, #4 |
| ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} |
; Load struct field 2 (%0, NumInputs), field 1 (%1, NumFilters) and field 0
; (%2, pDCTCoefs); tell the optimizer that NumInputs > 1 (unsigned); enter the
; loops only when %sub = NumFilters - 3 is unsigned-greater than 1.
| entry: |
| %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 |
| %0 = load i32, i32* %NumInputs, align 4 |
| %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 |
| %1 = load i32, i32* %NumFilters, align 4 |
| %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 |
| %2 = load float*, float** %pDCTCoefs, align 4 |
| %cmp = icmp ugt i32 %0, 1 |
| tail call void @llvm.assume(i1 %cmp) |
| %sub = add i32 %1, -3 |
| %cmp392 = icmp ugt i32 %sub, 1 |
| br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup |
| |
; %n.vec = NumInputs rounded up to a multiple of 4; it is the inner loop's
; exit bound.
| for.body.preheader: ; preds = %entry |
| %n.rnd.up = add i32 %0, 3 |
| %n.vec = and i32 %n.rnd.up, -4 |
| br label %for.body |
| |
| for.cond.cleanup: ; preds = %middle.block, %entry |
| ret void |
| |
; Outer loop: %k2.093 starts at 1. %mul4, %mul5 and %mul7 are the element
; offsets (k2 + 0..2) * NumInputs into %2.
| for.body: ; preds = %for.body.preheader, %middle.block |
| %k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ] |
| %mul4 = mul i32 %k2.093, %0 |
| %add = add nuw i32 %k2.093, 1 |
| %mul5 = mul i32 %add, %0 |
| %add6 = add i32 %k2.093, 2 |
| %mul7 = mul i32 %add6, %0 |
| br label %vector.body |
| |
; Inner loop: %index advances by 4 until it equals %n.vec. Accumulators:
; %vec.phi95 for offset %mul4, %vec.phi94 for %mul5, %vec.phi for %mul7. All
; loads share one lane mask and the selects keep the previous accumulator
; value in disabled lanes.
| vector.body: ; preds = %vector.body, %for.body |
| %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] |
| %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %20, %vector.body ] |
| %vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %21, %vector.body ] |
| %vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %22, %vector.body ] |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) |
| %3 = getelementptr inbounds float, float* %pIn, i32 %index |
| %4 = bitcast float* %3 to <4 x float>* |
| %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %5 = add i32 %index, %mul4 |
| %6 = getelementptr inbounds float, float* %2, i32 %5 |
| %7 = bitcast float* %6 to <4 x float>* |
| %wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load |
| %9 = fadd fast <4 x float> %8, %vec.phi95 |
| %10 = add i32 %index, %mul5 |
| %11 = getelementptr inbounds float, float* %2, i32 %10 |
| %12 = bitcast float* %11 to <4 x float>* |
| %wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load |
| %14 = fadd fast <4 x float> %13, %vec.phi94 |
| %15 = add i32 %index, %mul7 |
| %16 = getelementptr inbounds float, float* %2, i32 %15 |
| %17 = bitcast float* %16 to <4 x float>* |
| %wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load |
| %19 = fadd fast <4 x float> %18, %vec.phi |
| %20 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi |
| %21 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi94 |
| %22 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi95 |
| %index.next = add i32 %index, 4 |
| %23 = icmp eq i32 %index.next, %n.vec |
| br i1 %23, label %middle.block, label %vector.body |
| |
; Reduce each accumulator to a scalar: %24 goes to pOut[k2], %25 to
; pOut[k2 + 1] and %26 to pOut[k2 + 2]. k2 then advances by 3 and the outer
; loop continues while the new k2 is unsigned-less than %sub.
| middle.block: ; preds = %vector.body |
| %24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %22) |
| %25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %21) |
| %26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %20) |
| %arrayidx28 = getelementptr inbounds float, float* %pOut, i32 %k2.093 |
| store float %24, float* %arrayidx28, align 4 |
| %arrayidx30 = getelementptr inbounds float, float* %pOut, i32 %add |
| store float %25, float* %arrayidx30, align 4 |
| %arrayidx32 = getelementptr inbounds float, float* %pOut, i32 %add6 |
| store float %26, float* %arrayidx32, align 4 |
| %add34 = add i32 %k2.093, 3 |
| %cmp3 = icmp ult i32 %add34, %sub |
| br i1 %cmp3, label %for.body, label %for.cond.cleanup |
| } |
| |
; DCT_mve4: four dot products per outer-loop iteration.
; For k2 = 1, 5, ... the inner loop loads each pIn vector once and multiplies
; it with the coefficient vectors at offsets (k2 + 0..3) * NumInputs in %2,
; keeping four separate accumulators; the reduced sums are stored to
; pOut[k2] .. pOut[k2 + 3].
; The CHECK lines expect a single dlstp.32/letp inner loop containing five
; post-incrementing vldrw.u32 loads and four vfma.f32.
| define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { |
| ; CHECK-LABEL: DCT_mve4: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: .pad #4 |
| ; CHECK-NEXT: sub sp, #4 |
| ; CHECK-NEXT: .vsave {d8, d9, d10, d11} |
| ; CHECK-NEXT: vpush {d8, d9, d10, d11} |
| ; CHECK-NEXT: .pad #40 |
| ; CHECK-NEXT: sub sp, #40 |
| ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill |
| ; CHECK-NEXT: ldr r1, [r0, #4] |
| ; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill |
| ; CHECK-NEXT: subs r1, #4 |
| ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill |
| ; CHECK-NEXT: cmp r1, #2 |
| ; CHECK-NEXT: blo.w .LBB3_5 |
| ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader |
| ; CHECK-NEXT: ldr r2, [r0, #8] |
| ; CHECK-NEXT: movs r6, #1 |
| ; CHECK-NEXT: ldr r1, [r0] |
| ; CHECK-NEXT: add.w r0, r2, r2, lsl #1 |
| ; CHECK-NEXT: add.w r12, r1, r2, lsl #2 |
| ; CHECK-NEXT: add.w r8, r1, r2, lsl #3 |
| ; CHECK-NEXT: add.w r9, r1, r2, lsl #4 |
| ; CHECK-NEXT: add.w r11, r1, r0, lsl #2 |
| ; CHECK-NEXT: adds r0, r2, #3 |
| ; CHECK-NEXT: bic r0, r0, #3 |
| ; CHECK-NEXT: subs r0, #4 |
| ; CHECK-NEXT: add.w r0, r6, r0, lsr #2 |
| ; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill |
| ; CHECK-NEXT: lsls r0, r2, #4 |
| ; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload |
| ; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill |
| ; CHECK-NEXT: .LBB3_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 |
| ; CHECK-NEXT: adds r0, r6, #3 |
| ; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill |
| ; CHECK-NEXT: adds r0, r6, #2 |
| ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload |
| ; CHECK-NEXT: vmov.i32 q0, #0x0 |
| ; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill |
| ; CHECK-NEXT: adds r0, r6, #1 |
| ; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill |
| ; CHECK-NEXT: mov r3, r12 |
| ; CHECK-NEXT: mov r0, r8 |
| ; CHECK-NEXT: mov r5, r11 |
| ; CHECK-NEXT: mov r4, r9 |
| ; CHECK-NEXT: vmov q1, q0 |
| ; CHECK-NEXT: vmov q2, q0 |
| ; CHECK-NEXT: vmov q3, q0 |
| ; CHECK-NEXT: dlstp.32 lr, r7 |
| ; CHECK-NEXT: .LBB3_3: @ %vector.body |
| ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1 |
| ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: vldrw.u32 q4, [r1], #16 |
| ; CHECK-NEXT: vldrw.u32 q5, [r0], #16 |
| ; CHECK-NEXT: vfma.f32 q3, q5, q4 |
| ; CHECK-NEXT: vldrw.u32 q5, [r3], #16 |
| ; CHECK-NEXT: vfma.f32 q2, q5, q4 |
| ; CHECK-NEXT: vldrw.u32 q5, [r5], #16 |
| ; CHECK-NEXT: vfma.f32 q1, q5, q4 |
| ; CHECK-NEXT: vldrw.u32 q5, [r4], #16 |
| ; CHECK-NEXT: vfma.f32 q0, q5, q4 |
| ; CHECK-NEXT: letp lr, .LBB3_3 |
| ; CHECK-NEXT: @ %bb.4: @ %middle.block |
| ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 |
| ; CHECK-NEXT: vadd.f32 s14, s14, s15 |
| ; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload |
| ; CHECK-NEXT: vadd.f32 s12, s12, s13 |
| ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload |
| ; CHECK-NEXT: vadd.f32 s10, s10, s11 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s9 |
| ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s6, s6, s7 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s5 |
| ; CHECK-NEXT: vadd.f32 s2, s2, s3 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s1 |
| ; CHECK-NEXT: vadd.f32 s12, s12, s14 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s10 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s6 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s2 |
| ; CHECK-NEXT: vstr s12, [r0] |
| ; CHECK-NEXT: add.w r0, r1, r6, lsl #2 |
| ; CHECK-NEXT: adds r6, #4 |
| ; CHECK-NEXT: vstr s8, [r0] |
| ; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 |
| ; CHECK-NEXT: vstr s4, [r0] |
| ; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 |
| ; CHECK-NEXT: vstr s0, [r0] |
| ; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload |
| ; CHECK-NEXT: add r12, r0 |
| ; CHECK-NEXT: add r8, r0 |
| ; CHECK-NEXT: add r11, r0 |
| ; CHECK-NEXT: add r9, r0 |
| ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload |
| ; CHECK-NEXT: cmp r6, r0 |
| ; CHECK-NEXT: blo .LBB3_2 |
| ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup |
| ; CHECK-NEXT: add sp, #40 |
| ; CHECK-NEXT: vpop {d8, d9, d10, d11} |
| ; CHECK-NEXT: add sp, #4 |
| ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} |
; Load struct field 2 (%0, NumInputs), field 1 (%1, NumFilters) and field 0
; (%2, pDCTCoefs); tell the optimizer that NumInputs > 1 (unsigned); enter the
; loops only when %sub = NumFilters - 4 is unsigned-greater than 1.
| entry: |
| %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 |
| %0 = load i32, i32* %NumInputs, align 4 |
| %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 |
| %1 = load i32, i32* %NumFilters, align 4 |
| %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 |
| %2 = load float*, float** %pDCTCoefs, align 4 |
| %cmp = icmp ugt i32 %0, 1 |
| tail call void @llvm.assume(i1 %cmp) |
| %sub = add i32 %1, -4 |
| %cmp3113 = icmp ugt i32 %sub, 1 |
| br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup |
| |
; %n.vec = NumInputs rounded up to a multiple of 4; it is the inner loop's
; exit bound.
| for.body.preheader: ; preds = %entry |
| %n.rnd.up = add i32 %0, 3 |
| %n.vec = and i32 %n.rnd.up, -4 |
| br label %for.body |
| |
| for.cond.cleanup: ; preds = %middle.block, %entry |
| ret void |
| |
; Outer loop: %k2.0114 starts at 1. %mul4, %mul5, %mul7 and %mul9 are the
; element offsets (k2 + 0..3) * NumInputs into %2.
| for.body: ; preds = %for.body.preheader, %middle.block |
| %k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ] |
| %mul4 = mul i32 %k2.0114, %0 |
| %add = add nuw nsw i32 %k2.0114, 1 |
| %mul5 = mul i32 %add, %0 |
| %add6 = add nuw nsw i32 %k2.0114, 2 |
| %mul7 = mul i32 %add6, %0 |
| %add8 = add i32 %k2.0114, 3 |
| %mul9 = mul i32 %add8, %0 |
| br label %vector.body |
| |
; Inner loop: %index advances by 4 until it equals %n.vec. Accumulators:
; %vec.phi116 for offset %mul4, %vec.phi117 for %mul5, %vec.phi115 for %mul7,
; %vec.phi for %mul9. All loads share one lane mask and the selects keep the
; previous accumulator value in disabled lanes.
| vector.body: ; preds = %vector.body, %for.body |
| %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] |
| %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %25, %vector.body ] |
| %vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %26, %vector.body ] |
| %vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %27, %vector.body ] |
| %vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %28, %vector.body ] |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) |
| %3 = getelementptr inbounds float, float* %pIn, i32 %index |
| %4 = bitcast float* %3 to <4 x float>* |
| %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %5 = add i32 %index, %mul4 |
| %6 = getelementptr inbounds float, float* %2, i32 %5 |
| %7 = bitcast float* %6 to <4 x float>* |
| %wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load |
| %9 = fadd fast <4 x float> %8, %vec.phi116 |
| %10 = add i32 %index, %mul5 |
| %11 = getelementptr inbounds float, float* %2, i32 %10 |
| %12 = bitcast float* %11 to <4 x float>* |
| %wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load |
| %14 = fadd fast <4 x float> %13, %vec.phi117 |
| %15 = add i32 %index, %mul7 |
| %16 = getelementptr inbounds float, float* %2, i32 %15 |
| %17 = bitcast float* %16 to <4 x float>* |
| %wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load |
| %19 = fadd fast <4 x float> %18, %vec.phi115 |
| %20 = add i32 %index, %mul9 |
| %21 = getelementptr inbounds float, float* %2, i32 %20 |
| %22 = bitcast float* %21 to <4 x float>* |
| %wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load |
| %24 = fadd fast <4 x float> %23, %vec.phi |
| %25 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi |
| %26 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi115 |
| %27 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi116 |
| %28 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi117 |
| %index.next = add i32 %index, 4 |
| %29 = icmp eq i32 %index.next, %n.vec |
| br i1 %29, label %middle.block, label %vector.body |
| |
; Reduce each accumulator to a scalar. Note the numbering is not in output
; order: %31 (from %27) goes to pOut[k2], %30 (from %28) to pOut[k2 + 1],
; %32 to pOut[k2 + 2] and %33 to pOut[k2 + 3]. k2 then advances by 4 and the
; outer loop continues while the new k2 is unsigned-less than %sub.
| middle.block: ; preds = %vector.body |
| %30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %28) |
| %31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %27) |
| %32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %26) |
| %33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %25) |
| %arrayidx35 = getelementptr inbounds float, float* %pOut, i32 %k2.0114 |
| store float %31, float* %arrayidx35, align 4 |
| %arrayidx37 = getelementptr inbounds float, float* %pOut, i32 %add |
| store float %30, float* %arrayidx37, align 4 |
| %arrayidx39 = getelementptr inbounds float, float* %pOut, i32 %add6 |
| store float %32, float* %arrayidx39, align 4 |
| %arrayidx41 = getelementptr inbounds float, float* %pOut, i32 %add8 |
| store float %33, float* %arrayidx41, align 4 |
| %add43 = add i32 %k2.0114, 4 |
| %cmp3 = icmp ult i32 %add43, %sub |
| br i1 %cmp3, label %for.body, label %for.cond.cleanup |
| } |
| |
| define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { |
| ; CHECK-LABEL: DCT_mve5: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: .pad #4 |
| ; CHECK-NEXT: sub sp, #4 |
| ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} |
| ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} |
| ; CHECK-NEXT: .pad #32 |
| ; CHECK-NEXT: sub sp, #32 |
| ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill |
| ; CHECK-NEXT: ldr r1, [r0, #4] |
| ; CHECK-NEXT: subs r1, #5 |
| ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill |
| ; CHECK-NEXT: cmp r1, #2 |
| ; CHECK-NEXT: blo.w .LBB4_5 |
| ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader |
| ; CHECK-NEXT: ldr r3, [r0, #8] |
| ; CHECK-NEXT: ldr r1, [r0] |
| ; CHECK-NEXT: adds r0, r3, #3 |
| ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill |
| ; CHECK-NEXT: bic r0, r0, #3 |
| ; CHECK-NEXT: add.w r8, r1, r3, lsl #2 |
| ; CHECK-NEXT: subs r1, r0, #4 |
| ; CHECK-NEXT: movs r0, #1 |
| ; CHECK-NEXT: lsls r5, r3, #2 |
| ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 |
| ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill |
| ; CHECK-NEXT: add.w r1, r3, r3, lsl #2 |
| ; CHECK-NEXT: lsls r1, r1, #2 |
| ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill |
| ; CHECK-NEXT: .LBB4_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2 |
| ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload |
| ; CHECK-NEXT: adds r1, r0, #4 |
| ; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload |
| ; CHECK-NEXT: vmov.i32 q1, #0x0 |
| ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r10, r0, #2 |
| ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill |
| ; CHECK-NEXT: adds r1, r0, #3 |
| ; CHECK-NEXT: add.w r11, r0, #1 |
| ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill |
| ; CHECK-NEXT: mov r3, r8 |
| ; CHECK-NEXT: vmov q0, q1 |
| ; CHECK-NEXT: vmov q3, q1 |
| ; CHECK-NEXT: vmov q2, q1 |
| ; CHECK-NEXT: vmov q4, q1 |
| ; CHECK-NEXT: dlstp.32 lr, r7 |
| ; CHECK-NEXT: .LBB4_3: @ %vector.body |
| ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 |
| ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: add.w r9, r3, r5 |
| ; CHECK-NEXT: vldrw.u32 q5, [r4], #16 |
| ; CHECK-NEXT: vldrw.u32 q6, [r3], #16 |
| ; CHECK-NEXT: add.w r12, r9, r5 |
| ; CHECK-NEXT: vfma.f32 q3, q6, q5 |
| ; CHECK-NEXT: vldrw.u32 q6, [r9] |
| ; CHECK-NEXT: add.w r6, r12, r5 |
| ; CHECK-NEXT: vfma.f32 q4, q6, q5 |
| ; CHECK-NEXT: vldrw.u32 q6, [r12] |
| ; CHECK-NEXT: adds r7, r6, r5 |
| ; CHECK-NEXT: vfma.f32 q2, q6, q5 |
| ; CHECK-NEXT: vldrw.u32 q6, [r6] |
| ; CHECK-NEXT: vfma.f32 q0, q6, q5 |
| ; CHECK-NEXT: vldrw.u32 q6, [r7] |
| ; CHECK-NEXT: vfma.f32 q1, q6, q5 |
| ; CHECK-NEXT: letp lr, .LBB4_3 |
| ; CHECK-NEXT: @ %bb.4: @ %middle.block |
| ; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 |
| ; CHECK-NEXT: vadd.f32 s18, s18, s19 |
| ; CHECK-NEXT: add.w r1, r2, r11, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s16, s16, s17 |
| ; CHECK-NEXT: vadd.f32 s14, s14, s15 |
| ; CHECK-NEXT: vadd.f32 s12, s12, s13 |
| ; CHECK-NEXT: vadd.f32 s6, s6, s7 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s5 |
| ; CHECK-NEXT: vadd.f32 s10, s10, s11 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s9 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s1 |
| ; CHECK-NEXT: vadd.f32 s1, s16, s18 |
| ; CHECK-NEXT: vadd.f32 s2, s2, s3 |
| ; CHECK-NEXT: vadd.f32 s12, s12, s14 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s6 |
| ; CHECK-NEXT: vadd.f32 s6, s8, s10 |
| ; CHECK-NEXT: vstr s1, [r1] |
| ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s2 |
| ; CHECK-NEXT: adds r0, #5 |
| ; CHECK-NEXT: vstr s12, [r1] |
| ; CHECK-NEXT: add.w r1, r2, r10, lsl #2 |
| ; CHECK-NEXT: vstr s6, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s0, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s4, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload |
| ; CHECK-NEXT: add r8, r1 |
| ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload |
| ; CHECK-NEXT: cmp r0, r1 |
| ; CHECK-NEXT: blo.w .LBB4_2 |
| ; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup |
| ; CHECK-NEXT: add sp, #32 |
| ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} |
| ; CHECK-NEXT: add sp, #4 |
| ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} |
| entry: |
| %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 |
| %0 = load i32, i32* %NumInputs, align 4 |
| %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 |
| %1 = load i32, i32* %NumFilters, align 4 |
| %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 |
| %2 = load float*, float** %pDCTCoefs, align 4 |
| %cmp = icmp ugt i32 %0, 1 |
| tail call void @llvm.assume(i1 %cmp) |
| %sub = add i32 %1, -5 |
| %cmp3134 = icmp ugt i32 %sub, 1 |
| br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup |
| |
| for.body.preheader: ; preds = %entry |
| %n.rnd.up = add i32 %0, 3 |
| %n.vec = and i32 %n.rnd.up, -4 |
| br label %for.body |
| |
| for.cond.cleanup: ; preds = %middle.block, %entry |
| ret void |
| |
| for.body: ; preds = %for.body.preheader, %middle.block |
| %k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ] |
| %mul4 = mul i32 %k2.0135, %0 |
| %add = add nuw i32 %k2.0135, 1 |
| %mul5 = mul i32 %add, %0 |
| %add6 = add i32 %k2.0135, 2 |
| %mul7 = mul i32 %add6, %0 |
| %add8 = add i32 %k2.0135, 3 |
| %mul9 = mul i32 %add8, %0 |
| %add10 = add i32 %k2.0135, 4 |
| %mul11 = mul i32 %add10, %0 |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %for.body |
| %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] |
| %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %30, %vector.body ] |
| %vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %31, %vector.body ] |
| %vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %32, %vector.body ] |
| %vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %33, %vector.body ] |
| %vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %34, %vector.body ] |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) |
| %3 = getelementptr inbounds float, float* %pIn, i32 %index |
| %4 = bitcast float* %3 to <4 x float>* |
| %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %5 = add i32 %index, %mul4 |
| %6 = getelementptr inbounds float, float* %2, i32 %5 |
| %7 = bitcast float* %6 to <4 x float>* |
| %wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load |
| %9 = fadd fast <4 x float> %8, %vec.phi137 |
| %10 = add i32 %index, %mul5 |
| %11 = getelementptr inbounds float, float* %2, i32 %10 |
| %12 = bitcast float* %11 to <4 x float>* |
| %wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load |
| %14 = fadd fast <4 x float> %13, %vec.phi139 |
| %15 = add i32 %index, %mul7 |
| %16 = getelementptr inbounds float, float* %2, i32 %15 |
| %17 = bitcast float* %16 to <4 x float>* |
| %wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load |
| %19 = fadd fast <4 x float> %18, %vec.phi138 |
| %20 = add i32 %index, %mul9 |
| %21 = getelementptr inbounds float, float* %2, i32 %20 |
| %22 = bitcast float* %21 to <4 x float>* |
| %wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load |
| %24 = fadd fast <4 x float> %23, %vec.phi136 |
| %25 = add i32 %index, %mul11 |
| %26 = getelementptr inbounds float, float* %2, i32 %25 |
| %27 = bitcast float* %26 to <4 x float>* |
| %wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load |
| %29 = fadd fast <4 x float> %28, %vec.phi |
| %30 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi |
| %31 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi136 |
| %32 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi137 |
| %33 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi138 |
| %34 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi139 |
| %index.next = add i32 %index, 4 |
| %35 = icmp eq i32 %index.next, %n.vec |
| br i1 %35, label %middle.block, label %vector.body |
| |
| middle.block: ; preds = %vector.body |
| %36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %34) |
| %37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %33) |
| %38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %32) |
| %39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %31) |
| %40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %30) |
| %arrayidx42 = getelementptr inbounds float, float* %pOut, i32 %k2.0135 |
| store float %38, float* %arrayidx42, align 4 |
| %arrayidx44 = getelementptr inbounds float, float* %pOut, i32 %add |
| store float %36, float* %arrayidx44, align 4 |
| %arrayidx46 = getelementptr inbounds float, float* %pOut, i32 %add6 |
| store float %37, float* %arrayidx46, align 4 |
| %arrayidx48 = getelementptr inbounds float, float* %pOut, i32 %add8 |
| store float %39, float* %arrayidx48, align 4 |
| %arrayidx50 = getelementptr inbounds float, float* %pOut, i32 %add10 |
| store float %40, float* %arrayidx50, align 4 |
| %add52 = add i32 %k2.0135, 5 |
| %cmp3 = icmp ult i32 %add52, %sub |
| br i1 %cmp3, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; DCT_mve6: computes six pOut entries per outer-loop iteration. |
| ; The outer loop (%for.body) starts %k2 at 1, steps it by 6, and is entered only when |
| ; NumFilters - 6 is unsigned-greater-than 1; it repeats while %k2 + 6 is unsigned-less-than |
| ; NumFilters - 6. For each of the rows %k2 .. %k2+5 of pDCTCoefs (row stride = NumInputs |
| ; floats) the inner loop (%vector.body) accumulates the lane-wise products with pIn into a |
| ; separate <4 x float> accumulator; %middle.block reduces each accumulator to a scalar and |
| ; stores it to pOut at the matching row index. |
| ; The CHECK lines below expect the inner loop to be emitted as a dlstp.32/letp loop. |
| define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { |
| ; CHECK-LABEL: DCT_mve6: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: .pad #4 |
| ; CHECK-NEXT: sub sp, #4 |
| ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} |
| ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} |
| ; CHECK-NEXT: .pad #32 |
| ; CHECK-NEXT: sub sp, #32 |
| ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill |
| ; CHECK-NEXT: ldr r1, [r0, #4] |
| ; CHECK-NEXT: subs r1, #6 |
| ; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill |
| ; CHECK-NEXT: cmp r1, #2 |
| ; CHECK-NEXT: blo.w .LBB5_5 |
| ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader |
| ; CHECK-NEXT: ldr r3, [r0, #8] |
| ; CHECK-NEXT: ldr r1, [r0] |
| ; CHECK-NEXT: adds r0, r3, #3 |
| ; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill |
| ; CHECK-NEXT: bic r0, r0, #3 |
| ; CHECK-NEXT: add.w r8, r1, r3, lsl #2 |
| ; CHECK-NEXT: subs r1, r0, #4 |
| ; CHECK-NEXT: movs r0, #1 |
| ; CHECK-NEXT: lsls r5, r3, #2 |
| ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 |
| ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill |
| ; CHECK-NEXT: add.w r1, r3, r3, lsl #1 |
| ; CHECK-NEXT: lsls r1, r1, #3 |
| ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill |
| ; CHECK-NEXT: .LBB5_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: @ Child Loop BB5_3 Depth 2 |
| ; CHECK-NEXT: adds r1, r0, #5 |
| ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill |
| ; CHECK-NEXT: adds r1, r0, #4 |
| ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill |
| ; CHECK-NEXT: adds r1, r0, #3 |
| ; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload |
| ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill |
| ; CHECK-NEXT: vmov.i32 q1, #0x0 |
| ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r11, r0, #2 |
| ; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload |
| ; CHECK-NEXT: adds r4, r0, #1 |
| ; CHECK-NEXT: mov r3, r8 |
| ; CHECK-NEXT: vmov q3, q1 |
| ; CHECK-NEXT: vmov q4, q1 |
| ; CHECK-NEXT: vmov q0, q1 |
| ; CHECK-NEXT: vmov q5, q1 |
| ; CHECK-NEXT: vmov q2, q1 |
| ; CHECK-NEXT: dlstp.32 lr, r7 |
| ; CHECK-NEXT: .LBB5_3: @ %vector.body |
| ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 |
| ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: add.w r12, r3, r5 |
| ; CHECK-NEXT: vldrw.u32 q6, [r1], #16 |
| ; CHECK-NEXT: vldrw.u32 q7, [r3], #16 |
| ; CHECK-NEXT: add.w r10, r12, r5 |
| ; CHECK-NEXT: vfma.f32 q4, q7, q6 |
| ; CHECK-NEXT: vldrw.u32 q7, [r12] |
| ; CHECK-NEXT: add.w r6, r10, r5 |
| ; CHECK-NEXT: vfma.f32 q5, q7, q6 |
| ; CHECK-NEXT: vldrw.u32 q7, [r10] |
| ; CHECK-NEXT: adds r7, r6, r5 |
| ; CHECK-NEXT: vfma.f32 q2, q7, q6 |
| ; CHECK-NEXT: vldrw.u32 q7, [r6] |
| ; CHECK-NEXT: adds r6, r7, r5 |
| ; CHECK-NEXT: vfma.f32 q0, q7, q6 |
| ; CHECK-NEXT: vldrw.u32 q7, [r7] |
| ; CHECK-NEXT: vfma.f32 q3, q7, q6 |
| ; CHECK-NEXT: vldrw.u32 q7, [r6] |
| ; CHECK-NEXT: vfma.f32 q1, q7, q6 |
| ; CHECK-NEXT: letp lr, .LBB5_3 |
| ; CHECK-NEXT: @ %bb.4: @ %middle.block |
| ; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1 |
| ; CHECK-NEXT: vadd.f32 s22, s22, s23 |
| ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s20, s20, s21 |
| ; CHECK-NEXT: vadd.f32 s18, s18, s19 |
| ; CHECK-NEXT: vadd.f32 s16, s16, s17 |
| ; CHECK-NEXT: vadd.f32 s10, s10, s11 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s9 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s1 |
| ; CHECK-NEXT: vadd.f32 s2, s2, s3 |
| ; CHECK-NEXT: vadd.f32 s1, s20, s22 |
| ; CHECK-NEXT: vadd.f32 s6, s6, s7 |
| ; CHECK-NEXT: vadd.f32 s3, s16, s18 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s5 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s10 |
| ; CHECK-NEXT: vadd.f32 s14, s14, s15 |
| ; CHECK-NEXT: vadd.f32 s12, s12, s13 |
| ; CHECK-NEXT: vstr s1, [r1] |
| ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s0, s0, s2 |
| ; CHECK-NEXT: adds r0, #6 |
| ; CHECK-NEXT: vstr s3, [r1] |
| ; CHECK-NEXT: add.w r1, r2, r11, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s6 |
| ; CHECK-NEXT: vstr s8, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload |
| ; CHECK-NEXT: vadd.f32 s6, s12, s14 |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s0, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s6, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s4, [r1] |
| ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload |
| ; CHECK-NEXT: add r8, r1 |
| ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload |
| ; CHECK-NEXT: cmp r0, r1 |
| ; CHECK-NEXT: blo.w .LBB5_2 |
| ; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup |
| ; CHECK-NEXT: add sp, #32 |
| ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} |
| ; CHECK-NEXT: add sp, #4 |
| ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} |
| entry: |
| %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 |
| %0 = load i32, i32* %NumInputs, align 4 |
| %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 |
| %1 = load i32, i32* %NumFilters, align 4 |
| %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 |
| %2 = load float*, float** %pDCTCoefs, align 4 |
| %cmp = icmp ugt i32 %0, 1 |
| tail call void @llvm.assume(i1 %cmp) |
| %sub = add i32 %1, -6 |
| %cmp3155 = icmp ugt i32 %sub, 1 |
| br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup |
| |
| for.body.preheader: ; preds = %entry |
| %n.rnd.up = add i32 %0, 3 |
| %n.vec = and i32 %n.rnd.up, -4 |
| br label %for.body |
| |
| for.cond.cleanup: ; preds = %middle.block, %entry |
| ret void |
| |
| for.body: ; preds = %for.body.preheader, %middle.block |
| %k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ] |
| %mul4 = mul i32 %k2.0156, %0 |
| %add = add nuw i32 %k2.0156, 1 |
| %mul5 = mul i32 %add, %0 |
| %add6 = add i32 %k2.0156, 2 |
| %mul7 = mul i32 %add6, %0 |
| %add8 = add i32 %k2.0156, 3 |
| %mul9 = mul i32 %add8, %0 |
| %add10 = add i32 %k2.0156, 4 |
| %mul11 = mul i32 %add10, %0 |
| %add12 = add i32 %k2.0156, 5 |
| %mul13 = mul i32 %add12, %0 |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %for.body |
| %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] |
| %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %35, %vector.body ] |
| %vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %36, %vector.body ] |
| %vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %37, %vector.body ] |
| %vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %38, %vector.body ] |
| %vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %39, %vector.body ] |
| %vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ] |
| ; One mask, built from %index and NumInputs (%0), predicates every load in this iteration. |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) |
| %3 = getelementptr inbounds float, float* %pIn, i32 %index |
| %4 = bitcast float* %3 to <4 x float>* |
| %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %5 = add i32 %index, %mul4 |
| %6 = getelementptr inbounds float, float* %2, i32 %5 |
| %7 = bitcast float* %6 to <4 x float>* |
| %wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load |
| %9 = fadd fast <4 x float> %8, %vec.phi158 |
| %10 = add i32 %index, %mul5 |
| %11 = getelementptr inbounds float, float* %2, i32 %10 |
| %12 = bitcast float* %11 to <4 x float>* |
| %wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load |
| %14 = fadd fast <4 x float> %13, %vec.phi160 |
| %15 = add i32 %index, %mul7 |
| %16 = getelementptr inbounds float, float* %2, i32 %15 |
| %17 = bitcast float* %16 to <4 x float>* |
| %wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load |
| %19 = fadd fast <4 x float> %18, %vec.phi161 |
| %20 = add i32 %index, %mul9 |
| %21 = getelementptr inbounds float, float* %2, i32 %20 |
| %22 = bitcast float* %21 to <4 x float>* |
| %wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load |
| %24 = fadd fast <4 x float> %23, %vec.phi159 |
| %25 = add i32 %index, %mul11 |
| %26 = getelementptr inbounds float, float* %2, i32 %25 |
| %27 = bitcast float* %26 to <4 x float>* |
| %wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load |
| %29 = fadd fast <4 x float> %28, %vec.phi157 |
| %30 = add i32 %index, %mul13 |
| %31 = getelementptr inbounds float, float* %2, i32 %30 |
| %32 = bitcast float* %31 to <4 x float>* |
| %wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load |
| %34 = fadd fast <4 x float> %33, %vec.phi |
| ; Lanes disabled by the mask keep their previous accumulator value. |
| %35 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi |
| %36 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi157 |
| %37 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi158 |
| %38 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi159 |
| %39 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi160 |
| %40 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi161 |
| %index.next = add i32 %index, 4 |
| %41 = icmp eq i32 %index.next, %n.vec |
| br i1 %41, label %middle.block, label %vector.body |
| |
| middle.block: ; preds = %vector.body |
| ; Reduction results by row: %45 -> %k2, %43 -> %add, %42 -> %add6, %44 -> %add8, |
| ; %46 -> %add10, %47 -> %add12. |
| %42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40) |
| %43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %39) |
| %44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %38) |
| %45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %37) |
| %46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %36) |
| %47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %35) |
| %arrayidx49 = getelementptr inbounds float, float* %pOut, i32 %k2.0156 |
| store float %45, float* %arrayidx49, align 4 |
| %arrayidx51 = getelementptr inbounds float, float* %pOut, i32 %add |
| store float %43, float* %arrayidx51, align 4 |
| %arrayidx53 = getelementptr inbounds float, float* %pOut, i32 %add6 |
| store float %42, float* %arrayidx53, align 4 |
| %arrayidx55 = getelementptr inbounds float, float* %pOut, i32 %add8 |
| store float %44, float* %arrayidx55, align 4 |
| %arrayidx57 = getelementptr inbounds float, float* %pOut, i32 %add10 |
| store float %46, float* %arrayidx57, align 4 |
| %arrayidx59 = getelementptr inbounds float, float* %pOut, i32 %add12 |
| store float %47, float* %arrayidx59, align 4 |
| %add61 = add i32 %k2.0156, 6 |
| %cmp3 = icmp ult i32 %add61, %sub |
| br i1 %cmp3, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; DCT_mve7: computes seven pOut entries per outer-loop iteration. |
| ; The outer loop (%for.body) starts %k2 at 1, steps it by 7, and is entered only when |
| ; NumFilters - 7 is unsigned-greater-than 1; it repeats while %k2 + 7 is unsigned-less-than |
| ; NumFilters - 7. For each of the rows %k2 .. %k2+6 of pDCTCoefs (row stride = NumInputs |
| ; floats) the inner loop (%vector.body) accumulates the lane-wise products with pIn into a |
| ; separate <4 x float> accumulator; %middle.block reduces each accumulator to a scalar and |
| ; stores it to pOut at the matching row index. |
| ; The CHECK lines below expect a dls/le loop with an explicit vctp.32 and vpst/vpstt |
| ; predicated loads and vfma, plus 16-byte q-register spills and reloads inside the loop. |
| ; NOTE(review): presumably the seven accumulators plus the two load temporaries exceed the |
| ; available q registers, which would explain the spills - confirm against the backend. |
| define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { |
| ; CHECK-LABEL: DCT_mve7: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: .pad #4 |
| ; CHECK-NEXT: sub sp, #4 |
| ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} |
| ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} |
| ; CHECK-NEXT: .pad #72 |
| ; CHECK-NEXT: sub sp, #72 |
| ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill |
| ; CHECK-NEXT: ldr r1, [r0, #4] |
| ; CHECK-NEXT: subs r1, #7 |
| ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill |
| ; CHECK-NEXT: cmp r1, #2 |
| ; CHECK-NEXT: blo.w .LBB6_5 |
| ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader |
| ; CHECK-NEXT: ldr r3, [r0, #8] |
| ; CHECK-NEXT: ldr r1, [r0] |
| ; CHECK-NEXT: adds r0, r3, #3 |
| ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill |
| ; CHECK-NEXT: bic r0, r0, #3 |
| ; CHECK-NEXT: add.w r9, r1, r3, lsl #2 |
| ; CHECK-NEXT: subs r1, r0, #4 |
| ; CHECK-NEXT: movs r0, #1 |
| ; CHECK-NEXT: lsls r5, r3, #2 |
| ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 |
| ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill |
| ; CHECK-NEXT: rsb r1, r3, r3, lsl #3 |
| ; CHECK-NEXT: lsls r1, r1, #2 |
| ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill |
| ; CHECK-NEXT: .LBB6_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2 |
| ; CHECK-NEXT: adds r1, r0, #6 |
| ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill |
| ; CHECK-NEXT: adds r1, r0, #5 |
| ; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill |
| ; CHECK-NEXT: adds r1, r0, #4 |
| ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill |
| ; CHECK-NEXT: adds r1, r0, #3 |
| ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload |
| ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill |
| ; CHECK-NEXT: vmov.i32 q2, #0x0 |
| ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload |
| ; CHECK-NEXT: adds r4, r0, #2 |
| ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r8, r0, #1 |
| ; CHECK-NEXT: mov r3, r9 |
| ; CHECK-NEXT: vmov q4, q2 |
| ; CHECK-NEXT: vmov q5, q2 |
| ; CHECK-NEXT: vmov q3, q2 |
| ; CHECK-NEXT: vmov q6, q2 |
| ; CHECK-NEXT: vmov q1, q2 |
| ; CHECK-NEXT: mov r12, r7 |
| ; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill |
| ; CHECK-NEXT: dls lr, r6 |
| ; CHECK-NEXT: .LBB6_3: @ %vector.body |
| ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 |
| ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: vctp.32 r12 |
| ; CHECK-NEXT: add.w r10, r3, r5 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16 |
| ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 |
| ; CHECK-NEXT: add.w r11, r10, r5 |
| ; CHECK-NEXT: sub.w r12, r12, #4 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q5, q0, q7 |
| ; CHECK-NEXT: vldrwt.u32 q0, [r10] |
| ; CHECK-NEXT: add.w r6, r11, r5 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q6, q0, q7 |
| ; CHECK-NEXT: vldrwt.u32 q0, [r11] |
| ; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill |
| ; CHECK-NEXT: vmov q6, q5 |
| ; CHECK-NEXT: vpst |
| ; CHECK-NEXT: vfmat.f32 q1, q0, q7 |
| ; CHECK-NEXT: vmov q5, q4 |
| ; CHECK-NEXT: vmov q4, q3 |
| ; CHECK-NEXT: vmov q3, q1 |
| ; CHECK-NEXT: vpst |
| ; CHECK-NEXT: vldrwt.u32 q0, [r6] |
| ; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload |
| ; CHECK-NEXT: adds r7, r6, r5 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q1, q0, q7 |
| ; CHECK-NEXT: vldrwt.u32 q0, [r7] |
| ; CHECK-NEXT: adds r6, r7, r5 |
| ; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill |
| ; CHECK-NEXT: vmov q1, q3 |
| ; CHECK-NEXT: vmov q3, q4 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q3, q0, q7 |
| ; CHECK-NEXT: vldrwt.u32 q0, [r6] |
| ; CHECK-NEXT: vmov q4, q5 |
| ; CHECK-NEXT: adds r7, r6, r5 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q4, q0, q7 |
| ; CHECK-NEXT: vldrwt.u32 q0, [r7] |
| ; CHECK-NEXT: vmov q5, q6 |
| ; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload |
| ; CHECK-NEXT: vpst |
| ; CHECK-NEXT: vfmat.f32 q2, q0, q7 |
| ; CHECK-NEXT: le lr, .LBB6_3 |
| ; CHECK-NEXT: @ %bb.4: @ %middle.block |
| ; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1 |
| ; CHECK-NEXT: vadd.f32 s0, s26, s27 |
| ; CHECK-NEXT: add.w r1, r2, r8, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s2, s24, s25 |
| ; CHECK-NEXT: vadd.f32 s1, s22, s23 |
| ; CHECK-NEXT: vadd.f32 s3, s20, s21 |
| ; CHECK-NEXT: vadd.f32 s6, s6, s7 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s5 |
| ; CHECK-NEXT: vadd.f32 s10, s10, s11 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s9 |
| ; CHECK-NEXT: vadd.f32 s0, s2, s0 |
| ; CHECK-NEXT: vadd.f32 s9, s18, s19 |
| ; CHECK-NEXT: vadd.f32 s11, s16, s17 |
| ; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload |
| ; CHECK-NEXT: vadd.f32 s2, s3, s1 |
| ; CHECK-NEXT: vadd.f32 s5, s18, s19 |
| ; CHECK-NEXT: vadd.f32 s7, s16, s17 |
| ; CHECK-NEXT: vadd.f32 s4, s4, s6 |
| ; CHECK-NEXT: vstr s0, [r1] |
| ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s14, s14, s15 |
| ; CHECK-NEXT: adds r0, #7 |
| ; CHECK-NEXT: vadd.f32 s12, s12, s13 |
| ; CHECK-NEXT: vstr s2, [r1] |
| ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s10 |
| ; CHECK-NEXT: vadd.f32 s6, s7, s5 |
| ; CHECK-NEXT: vstr s4, [r1] |
| ; CHECK-NEXT: vadd.f32 s10, s11, s9 |
| ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload |
| ; CHECK-NEXT: vadd.f32 s12, s12, s14 |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s6, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s12, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s10, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s8, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload |
| ; CHECK-NEXT: add r9, r1 |
| ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload |
| ; CHECK-NEXT: cmp r0, r1 |
| ; CHECK-NEXT: blo.w .LBB6_2 |
| ; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup |
| ; CHECK-NEXT: add sp, #72 |
| ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} |
| ; CHECK-NEXT: add sp, #4 |
| ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} |
| entry: |
| %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 |
| %0 = load i32, i32* %NumInputs, align 4 |
| %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 |
| %1 = load i32, i32* %NumFilters, align 4 |
| %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 |
| %2 = load float*, float** %pDCTCoefs, align 4 |
| %cmp = icmp ugt i32 %0, 1 |
| tail call void @llvm.assume(i1 %cmp) |
| %sub = add i32 %1, -7 |
| %cmp3176 = icmp ugt i32 %sub, 1 |
| br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup |
| |
| for.body.preheader: ; preds = %entry |
| %n.rnd.up = add i32 %0, 3 |
| %n.vec = and i32 %n.rnd.up, -4 |
| br label %for.body |
| |
| for.cond.cleanup: ; preds = %middle.block, %entry |
| ret void |
| |
| for.body: ; preds = %for.body.preheader, %middle.block |
| %k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ] |
| %mul4 = mul i32 %k2.0177, %0 |
| %add = add nuw i32 %k2.0177, 1 |
| %mul5 = mul i32 %add, %0 |
| %add6 = add i32 %k2.0177, 2 |
| %mul7 = mul i32 %add6, %0 |
| %add8 = add i32 %k2.0177, 3 |
| %mul9 = mul i32 %add8, %0 |
| %add10 = add i32 %k2.0177, 4 |
| %mul11 = mul i32 %add10, %0 |
| %add12 = add i32 %k2.0177, 5 |
| %mul13 = mul i32 %add12, %0 |
| %add14 = add i32 %k2.0177, 6 |
| %mul15 = mul i32 %add14, %0 |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %for.body |
| %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] |
| %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ] |
| %vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %41, %vector.body ] |
| %vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %42, %vector.body ] |
| %vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %43, %vector.body ] |
| %vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %44, %vector.body ] |
| %vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ] |
| %vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ] |
| ; One mask, built from %index and NumInputs (%0), predicates every load in this iteration. |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) |
| %3 = getelementptr inbounds float, float* %pIn, i32 %index |
| %4 = bitcast float* %3 to <4 x float>* |
| %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %5 = add i32 %index, %mul4 |
| %6 = getelementptr inbounds float, float* %2, i32 %5 |
| %7 = bitcast float* %6 to <4 x float>* |
| %wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load |
| %9 = fadd fast <4 x float> %8, %vec.phi179 |
| %10 = add i32 %index, %mul5 |
| %11 = getelementptr inbounds float, float* %2, i32 %10 |
| %12 = bitcast float* %11 to <4 x float>* |
| %wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load |
| %14 = fadd fast <4 x float> %13, %vec.phi181 |
| %15 = add i32 %index, %mul7 |
| %16 = getelementptr inbounds float, float* %2, i32 %15 |
| %17 = bitcast float* %16 to <4 x float>* |
| %wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load |
| %19 = fadd fast <4 x float> %18, %vec.phi183 |
| %20 = add i32 %index, %mul9 |
| %21 = getelementptr inbounds float, float* %2, i32 %20 |
| %22 = bitcast float* %21 to <4 x float>* |
| %wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load |
| %24 = fadd fast <4 x float> %23, %vec.phi182 |
| %25 = add i32 %index, %mul11 |
| %26 = getelementptr inbounds float, float* %2, i32 %25 |
| %27 = bitcast float* %26 to <4 x float>* |
| %wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load |
| %29 = fadd fast <4 x float> %28, %vec.phi180 |
| %30 = add i32 %index, %mul13 |
| %31 = getelementptr inbounds float, float* %2, i32 %30 |
| %32 = bitcast float* %31 to <4 x float>* |
| %wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load |
| %34 = fadd fast <4 x float> %33, %vec.phi178 |
| %35 = add i32 %index, %mul15 |
| %36 = getelementptr inbounds float, float* %2, i32 %35 |
| %37 = bitcast float* %36 to <4 x float>* |
| %wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load |
| %39 = fadd fast <4 x float> %38, %vec.phi |
| ; Lanes disabled by the mask keep their previous accumulator value. |
| %40 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi |
| %41 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi178 |
| %42 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi179 |
| %43 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi180 |
| %44 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi181 |
| %45 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi182 |
| %46 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi183 |
| %index.next = add i32 %index, 4 |
| %47 = icmp eq i32 %index.next, %n.vec |
| br i1 %47, label %middle.block, label %vector.body |
| |
| middle.block: ; preds = %vector.body |
| ; Reduction results by row: %52 -> %k2, %50 -> %add, %48 -> %add6, %49 -> %add8, |
| ; %51 -> %add10, %53 -> %add12, %54 -> %add14. |
| %48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46) |
| %49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45) |
| %50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %44) |
| %51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %43) |
| %52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %42) |
| %53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %41) |
| %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40) |
| %arrayidx56 = getelementptr inbounds float, float* %pOut, i32 %k2.0177 |
| store float %52, float* %arrayidx56, align 4 |
| %arrayidx58 = getelementptr inbounds float, float* %pOut, i32 %add |
| store float %50, float* %arrayidx58, align 4 |
| %arrayidx60 = getelementptr inbounds float, float* %pOut, i32 %add6 |
| store float %48, float* %arrayidx60, align 4 |
| %arrayidx62 = getelementptr inbounds float, float* %pOut, i32 %add8 |
| store float %49, float* %arrayidx62, align 4 |
| %arrayidx64 = getelementptr inbounds float, float* %pOut, i32 %add10 |
| store float %51, float* %arrayidx64, align 4 |
| %arrayidx66 = getelementptr inbounds float, float* %pOut, i32 %add12 |
| store float %53, float* %arrayidx66, align 4 |
| %arrayidx68 = getelementptr inbounds float, float* %pOut, i32 %add14 |
| store float %54, float* %arrayidx68, align 4 |
| %add70 = add i32 %k2.0177, 7 |
| %cmp3 = icmp ult i32 %add70, %sub |
| br i1 %cmp3, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; DCT_mve8: codegen test for a loop nest that produces eight outputs per outer iteration. |
| ; Each output pOut[k2+i] (i = 0..7) is the sum over the inner loop of |
| ; pIn[j] * pDCTCoefs[(k2+i)*NumInputs + j], accumulated in its own <4 x float> vector. |
| ; The inner loop is tail-predicated with llvm.get.active.lane.mask. The expected |
| ; assembly below uses a dls/le low-overhead loop with vctp.32 and vpst-predicated |
| ; loads and vfma, and keeps some of the accumulators in stack spill slots. |
| define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { |
| ; CHECK-LABEL: DCT_mve8: |
| ; CHECK: @ %bb.0: @ %entry |
| ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} |
| ; CHECK-NEXT: .pad #4 |
| ; CHECK-NEXT: sub sp, #4 |
| ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} |
| ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} |
| ; CHECK-NEXT: .pad #88 |
| ; CHECK-NEXT: sub sp, #88 |
| ; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill |
| ; CHECK-NEXT: ldr r1, [r0, #4] |
| ; CHECK-NEXT: subs r1, #8 |
| ; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill |
| ; CHECK-NEXT: cmp r1, #2 |
| ; CHECK-NEXT: blo.w .LBB7_5 |
| ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader |
| ; CHECK-NEXT: ldr r3, [r0, #8] |
| ; CHECK-NEXT: ldr r1, [r0] |
| ; CHECK-NEXT: adds r0, r3, #3 |
| ; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill |
| ; CHECK-NEXT: bic r0, r0, #3 |
| ; CHECK-NEXT: add.w r12, r1, r3, lsl #2 |
| ; CHECK-NEXT: subs r1, r0, #4 |
| ; CHECK-NEXT: movs r0, #1 |
| ; CHECK-NEXT: lsls r5, r3, #2 |
| ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 |
| ; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill |
| ; CHECK-NEXT: lsls r1, r3, #5 |
| ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill |
| ; CHECK-NEXT: .LBB7_2: @ %for.body |
| ; CHECK-NEXT: @ =>This Loop Header: Depth=1 |
| ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2 |
| ; CHECK-NEXT: adds r1, r0, #7 |
| ; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill |
| ; CHECK-NEXT: adds r1, r0, #6 |
| ; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill |
| ; CHECK-NEXT: adds r1, r0, #5 |
| ; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload |
| ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill |
| ; CHECK-NEXT: adds r1, r0, #4 |
| ; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload |
| ; CHECK-NEXT: vmov.i32 q3, #0x0 |
| ; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload |
| ; CHECK-NEXT: adds r4, r0, #3 |
| ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill |
| ; CHECK-NEXT: add.w r8, r0, #2 |
| ; CHECK-NEXT: adds r1, r0, #1 |
| ; CHECK-NEXT: mov r3, r12 |
| ; CHECK-NEXT: vmov q5, q3 |
| ; CHECK-NEXT: vmov q6, q3 |
| ; CHECK-NEXT: vmov q4, q3 |
| ; CHECK-NEXT: vmov q7, q3 |
| ; CHECK-NEXT: vmov q2, q3 |
| ; CHECK-NEXT: mov r10, r7 |
| ; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill |
| ; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill |
| ; CHECK-NEXT: dls lr, r6 |
| ; CHECK-NEXT: .LBB7_3: @ %vector.body |
| ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 |
| ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 |
| ; CHECK-NEXT: vctp.32 r10 |
| ; CHECK-NEXT: add.w r11, r3, r5 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 |
| ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 |
| ; CHECK-NEXT: add.w r6, r11, r5 |
| ; CHECK-NEXT: sub.w r10, r10, #4 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q6, q1, q0 |
| ; CHECK-NEXT: vldrwt.u32 q1, [r11] |
| ; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill |
| ; CHECK-NEXT: vmov q6, q5 |
| ; CHECK-NEXT: vpst |
| ; CHECK-NEXT: vfmat.f32 q7, q1, q0 |
| ; CHECK-NEXT: vmov q5, q3 |
| ; CHECK-NEXT: vmov q3, q4 |
| ; CHECK-NEXT: vmov q4, q2 |
| ; CHECK-NEXT: vpst |
| ; CHECK-NEXT: vldrwt.u32 q1, [r6] |
| ; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload |
| ; CHECK-NEXT: adds r7, r6, r5 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q2, q1, q0 |
| ; CHECK-NEXT: vldrwt.u32 q1, [r7] |
| ; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill |
| ; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload |
| ; CHECK-NEXT: adds r6, r7, r5 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q2, q1, q0 |
| ; CHECK-NEXT: vldrwt.u32 q1, [r6] |
| ; CHECK-NEXT: adds r7, r6, r5 |
| ; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill |
| ; CHECK-NEXT: vmov q2, q4 |
| ; CHECK-NEXT: vmov q4, q3 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q2, q1, q0 |
| ; CHECK-NEXT: vldrwt.u32 q1, [r7] |
| ; CHECK-NEXT: adds r6, r7, r5 |
| ; CHECK-NEXT: vmov q3, q5 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q4, q1, q0 |
| ; CHECK-NEXT: vldrwt.u32 q1, [r6] |
| ; CHECK-NEXT: vmov q5, q6 |
| ; CHECK-NEXT: add r6, r5 |
| ; CHECK-NEXT: vpstt |
| ; CHECK-NEXT: vfmat.f32 q5, q1, q0 |
| ; CHECK-NEXT: vldrwt.u32 q1, [r6] |
| ; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload |
| ; CHECK-NEXT: vpst |
| ; CHECK-NEXT: vfmat.f32 q3, q1, q0 |
| ; CHECK-NEXT: le lr, .LBB7_3 |
| ; CHECK-NEXT: @ %bb.4: @ %middle.block |
| ; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1 |
| ; CHECK-NEXT: vadd.f32 s0, s30, s31 |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s2, s28, s29 |
| ; CHECK-NEXT: vadd.f32 s4, s26, s27 |
| ; CHECK-NEXT: vadd.f32 s6, s24, s25 |
| ; CHECK-NEXT: vadd.f32 s5, s18, s19 |
| ; CHECK-NEXT: vadd.f32 s7, s16, s17 |
| ; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload |
| ; CHECK-NEXT: vadd.f32 s10, s10, s11 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s9 |
| ; CHECK-NEXT: vadd.f32 s9, s18, s19 |
| ; CHECK-NEXT: vadd.f32 s11, s16, s17 |
| ; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload |
| ; CHECK-NEXT: vadd.f32 s14, s14, s15 |
| ; CHECK-NEXT: vadd.f32 s12, s12, s13 |
| ; CHECK-NEXT: vadd.f32 s13, s18, s19 |
| ; CHECK-NEXT: vadd.f32 s15, s16, s17 |
| ; CHECK-NEXT: vadd.f32 s0, s2, s0 |
| ; CHECK-NEXT: vadd.f32 s2, s6, s4 |
| ; CHECK-NEXT: vadd.f32 s8, s8, s10 |
| ; CHECK-NEXT: vadd.f32 s10, s11, s9 |
| ; CHECK-NEXT: vadd.f32 s6, s12, s14 |
| ; CHECK-NEXT: vadd.f32 s1, s22, s23 |
| ; CHECK-NEXT: vadd.f32 s14, s15, s13 |
| ; CHECK-NEXT: vstr s0, [r1] |
| ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s3, s20, s21 |
| ; CHECK-NEXT: adds r0, #8 |
| ; CHECK-NEXT: vstr s2, [r1] |
| ; CHECK-NEXT: add.w r1, r2, r8, lsl #2 |
| ; CHECK-NEXT: vadd.f32 s12, s7, s5 |
| ; CHECK-NEXT: vstr s10, [r1] |
| ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 |
| ; CHECK-NEXT: vstr s14, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload |
| ; CHECK-NEXT: vadd.f32 s4, s3, s1 |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s8, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s12, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s4, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload |
| ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 |
| ; CHECK-NEXT: vstr s6, [r1] |
| ; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload |
| ; CHECK-NEXT: add r12, r1 |
| ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload |
| ; CHECK-NEXT: cmp r0, r1 |
| ; CHECK-NEXT: blo.w .LBB7_2 |
| ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup |
| ; CHECK-NEXT: add sp, #88 |
| ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} |
| ; CHECK-NEXT: add sp, #4 |
| ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} |
| entry: |
| ; Load the three struct fields: %0 = NumInputs (field 2), %1 = NumFilters (field 1), |
| ; %2 = pDCTCoefs (field 0). |
| %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 |
| %0 = load i32, i32* %NumInputs, align 4 |
| %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 |
| %1 = load i32, i32* %NumFilters, align 4 |
| %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 |
| %2 = load float*, float** %pDCTCoefs, align 4 |
| ; Tell the optimizer that NumInputs > 1 (unsigned). |
| %cmp = icmp ugt i32 %0, 1 |
| tail call void @llvm.assume(i1 %cmp) |
| ; Outer-loop bound %sub = NumFilters - 8; the loop nest is skipped unless %sub > 1 (unsigned). |
| %sub = add i32 %1, -8 |
| %cmp3197 = icmp ugt i32 %sub, 1 |
| br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup |
| |
| for.body.preheader: ; preds = %entry |
| ; %n.vec = NumInputs rounded up to a multiple of 4; it is the inner-loop exit value for %index. |
| %n.rnd.up = add i32 %0, 3 |
| %n.vec = and i32 %n.rnd.up, -4 |
| br label %for.body |
| |
| for.cond.cleanup: ; preds = %middle.block, %entry |
| ret void |
| |
| for.body: ; preds = %for.body.preheader, %middle.block |
| ; Outer loop over %k2, starting at 1 and advancing by 8 per iteration. |
| ; %mul4 .. %mul17 are the element offsets (k2+i)*NumInputs of the eight coefficient rows. |
| %k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ] |
| %mul4 = mul i32 %k2.0198, %0 |
| %add = add nuw nsw i32 %k2.0198, 1 |
| %mul5 = mul i32 %add, %0 |
| %add6 = add nuw nsw i32 %k2.0198, 2 |
| %mul7 = mul i32 %add6, %0 |
| %add8 = add nuw nsw i32 %k2.0198, 3 |
| %mul9 = mul i32 %add8, %0 |
| %add10 = add nuw nsw i32 %k2.0198, 4 |
| %mul11 = mul i32 %add10, %0 |
| %add12 = add nuw nsw i32 %k2.0198, 5 |
| %mul13 = mul i32 %add12, %0 |
| %add14 = add nuw nsw i32 %k2.0198, 6 |
| %mul15 = mul i32 %add14, %0 |
| %add16 = add i32 %k2.0198, 7 |
| %mul17 = mul i32 %add16, %0 |
| br label %vector.body |
| |
| vector.body: ; preds = %vector.body, %for.body |
| ; Inner loop: %index advances by 4 elements per iteration. The eight phis below are the |
| ; per-row accumulators, all starting at zero on entry from %for.body. |
| %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] |
| %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ] |
| %vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ] |
| %vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %47, %vector.body ] |
| %vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %48, %vector.body ] |
| %vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %49, %vector.body ] |
| %vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %50, %vector.body ] |
| %vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %51, %vector.body ] |
| %vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %52, %vector.body ] |
| ; Lane predicate for this iteration, derived from %index and NumInputs; it guards every |
| ; masked load and every accumulator select below. |
| %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) |
| ; Shared input vector: pIn[index .. index+3]. |
| %3 = getelementptr inbounds float, float* %pIn, i32 %index |
| %4 = bitcast float* %3 to <4 x float>* |
| %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| ; Row k2: coefficients at offset index + %mul4, accumulated into %vec.phi200. |
| %5 = add i32 %index, %mul4 |
| %6 = getelementptr inbounds float, float* %2, i32 %5 |
| %7 = bitcast float* %6 to <4 x float>* |
| %wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load |
| %9 = fadd fast <4 x float> %8, %vec.phi200 |
| ; Row k2+1: offset index + %mul5, accumulated into %vec.phi202. |
| %10 = add i32 %index, %mul5 |
| %11 = getelementptr inbounds float, float* %2, i32 %10 |
| %12 = bitcast float* %11 to <4 x float>* |
| %wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load |
| %14 = fadd fast <4 x float> %13, %vec.phi202 |
| ; Row k2+2: offset index + %mul7, accumulated into %vec.phi204. |
| %15 = add i32 %index, %mul7 |
| %16 = getelementptr inbounds float, float* %2, i32 %15 |
| %17 = bitcast float* %16 to <4 x float>* |
| %wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load |
| %19 = fadd fast <4 x float> %18, %vec.phi204 |
| ; Row k2+3: offset index + %mul9, accumulated into %vec.phi205. |
| %20 = add i32 %index, %mul9 |
| %21 = getelementptr inbounds float, float* %2, i32 %20 |
| %22 = bitcast float* %21 to <4 x float>* |
| %wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load |
| %24 = fadd fast <4 x float> %23, %vec.phi205 |
| ; Row k2+4: offset index + %mul11, accumulated into %vec.phi203. |
| %25 = add i32 %index, %mul11 |
| %26 = getelementptr inbounds float, float* %2, i32 %25 |
| %27 = bitcast float* %26 to <4 x float>* |
| %wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load |
| %29 = fadd fast <4 x float> %28, %vec.phi203 |
| ; Row k2+5: offset index + %mul13, accumulated into %vec.phi201. |
| %30 = add i32 %index, %mul13 |
| %31 = getelementptr inbounds float, float* %2, i32 %30 |
| %32 = bitcast float* %31 to <4 x float>* |
| %wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load |
| %34 = fadd fast <4 x float> %33, %vec.phi201 |
| ; Row k2+6: offset index + %mul15, accumulated into %vec.phi199. |
| %35 = add i32 %index, %mul15 |
| %36 = getelementptr inbounds float, float* %2, i32 %35 |
| %37 = bitcast float* %36 to <4 x float>* |
| %wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load |
| %39 = fadd fast <4 x float> %38, %vec.phi199 |
| ; Row k2+7: offset index + %mul17, accumulated into %vec.phi. |
| %40 = add i32 %index, %mul17 |
| %41 = getelementptr inbounds float, float* %2, i32 %40 |
| %42 = bitcast float* %41 to <4 x float>* |
| %wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %42, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) |
| %43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load |
| %44 = fadd fast <4 x float> %43, %vec.phi |
| ; Commit the updated sums only in active lanes; inactive lanes keep the previous accumulator. |
| %45 = select <4 x i1> %active.lane.mask, <4 x float> %44, <4 x float> %vec.phi |
| %46 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi199 |
| %47 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi200 |
| %48 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi201 |
| %49 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi202 |
| %50 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi203 |
| %51 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi204 |
| %52 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi205 |
| %index.next = add i32 %index, 4 |
| %53 = icmp eq i32 %index.next, %n.vec |
| br i1 %53, label %middle.block, label %vector.body |
| |
| middle.block: ; preds = %vector.body |
| ; Reduce each vector accumulator to a scalar (fast-math fadd reduction, start value 0.0). |
| %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %52) |
| %55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %51) |
| %56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %50) |
| %57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %49) |
| %58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %48) |
| %59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %47) |
| %60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46) |
| %61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45) |
| ; Store the eight results to pOut[k2] .. pOut[k2+7], each row's sum at its own index. |
| %arrayidx63 = getelementptr inbounds float, float* %pOut, i32 %k2.0198 |
| store float %59, float* %arrayidx63, align 4 |
| %arrayidx65 = getelementptr inbounds float, float* %pOut, i32 %add |
| store float %57, float* %arrayidx65, align 4 |
| %arrayidx67 = getelementptr inbounds float, float* %pOut, i32 %add6 |
| store float %55, float* %arrayidx67, align 4 |
| %arrayidx69 = getelementptr inbounds float, float* %pOut, i32 %add8 |
| store float %54, float* %arrayidx69, align 4 |
| %arrayidx71 = getelementptr inbounds float, float* %pOut, i32 %add10 |
| store float %56, float* %arrayidx71, align 4 |
| %arrayidx73 = getelementptr inbounds float, float* %pOut, i32 %add12 |
| store float %58, float* %arrayidx73, align 4 |
| %arrayidx75 = getelementptr inbounds float, float* %pOut, i32 %add14 |
| store float %60, float* %arrayidx75, align 4 |
| %arrayidx77 = getelementptr inbounds float, float* %pOut, i32 %add16 |
| store float %61, float* %arrayidx77, align 4 |
| ; Advance k2 by 8 and repeat while the new k2 is below %sub (unsigned compare). |
| %add79 = add i32 %k2.0198, 8 |
| %cmp3 = icmp ult i32 %add79, %sub |
| br i1 %cmp3, label %for.body, label %for.cond.cleanup |
| } |
| |
| ; Intrinsics called by the test functions in this file. |
| ; llvm.assume: passes a known-true i1 condition to the optimizer. |
| declare void @llvm.assume(i1 noundef) |
| ; llvm.get.active.lane.mask: builds the <4 x i1> lane predicate from (index, element count). |
| declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) |
| ; llvm.masked.load: predicated <4 x float> load (pointer, alignment, mask, passthru). |
| declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) |
| ; llvm.vector.reduce.fadd: sums the lanes of a <4 x float> onto a scalar start value. |
| declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) |