| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -O3 -tail-predication=enabled -mtriple=thumbv8.1m.main -mattr=+mve,+mve.fp %s -o - | FileCheck %s |
| |
| target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" |
| target triple = "thumbv8.1m-arm-none-eabi" |
| |
| ; Tests that LSR will not interfere with the VCTP intrinsic, |
| ; and that this loop will correctly become tail-predicated. |
| |
; vctpi32: float reduction loop written with MVE intrinsics (vctp32 predicate,
; predicated write-back gather load, predicated vector add).
;
; Arguments: %0 is the float pointer the gather addresses are built from;
;            %1 is an i32 from which the initial loop counter (%1 - 1) is derived.
; Returns:   fabs(sitofp(@vecAddAcrossF32Mve(accumulated <4 x float>))).
;
; The autogenerated assertions directly below expect the loop body to consist
; of only the write-back gather (vldrw.u32 ..., [q1, #32]!) and vadd.f32,
; bracketed by dlstp.32 / letp, with the counter (subs r3, r1, #1) feeding
; dlstp.32. Regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
| define arm_aapcs_vfpcc float @vctpi32(float* %0, i32 %1) { |
| ; CHECK-LABEL: vctpi32: |
| ; CHECK: @ %bb.0: |
| ; CHECK-NEXT: push {r7, lr} |
| ; CHECK-NEXT: vmvn.i32 q1, #0x1f |
| ; CHECK-NEXT: vmov.32 q3[0], r0 |
| ; CHECK-NEXT: movs r2, #0 |
| ; CHECK-NEXT: vadd.i32 q1, q3, q1 |
| ; CHECK-NEXT: subs r3, r1, #1 |
| ; CHECK-NEXT: vidup.u32 q2, r2, #8 |
| ; CHECK-NEXT: vmov r0, s4 |
| ; CHECK-NEXT: vadd.i32 q1, q2, r0 |
| ; CHECK-NEXT: vmov.i32 q0, #0x0 |
| ; CHECK-NEXT: dlstp.32 lr, r3 |
| ; CHECK-NEXT: .LBB0_1: @ =>This Inner Loop Header: Depth=1 |
| ; CHECK-NEXT: vldrw.u32 q2, [q1, #32]! |
| ; CHECK-NEXT: vadd.f32 q0, q0, q2 |
| ; CHECK-NEXT: letp lr, .LBB0_1 |
| ; CHECK-NEXT: @ %bb.2: |
| ; CHECK-NEXT: bl vecAddAcrossF32Mve |
| ; CHECK-NEXT: vmov s0, r0 |
| ; CHECK-NEXT: vcvt.f32.s32 s0, s0 |
| ; CHECK-NEXT: vabs.f32 s0, s0 |
| ; CHECK-NEXT: pop {r7, pc} |
; Entry block (%2): build the initial address vector and the loop counter.
; %4 is the vector half of vidup called with (0, 8).
| %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8) |
| %4 = extractvalue { <4 x i32>, i32 } %3, 0 |
; Initial counter value handed to vctp32 on the first iteration: %1 - 1.
| %5 = add nsw i32 %1, -1 |
; Put (pointer - 32) in lane 0, splat lane 0 to all lanes (zero shuffle mask),
; then add the vidup vector to obtain the per-lane start addresses in %10.
; NOTE(review): the -32 bias presumably cancels the +32 applied by the
; write-back gather before its first access - confirm against the intrinsic.
| %6 = ptrtoint float* %0 to i32 |
| %7 = insertelement <4 x i32> undef, i32 %6, i32 0 |
| %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef> |
| %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer |
| %10 = add <4 x i32> %4, %9 |
| br label %11 |
| |
| 11: |
; Loop-carried values:
;   %12 - counter passed to vctp32, decremented by 4 per iteration
;   %13 - <4 x float> accumulator, starts at zero
;   %14 - <4 x i32> address vector, replaced by the gather's write-back result
| %12 = phi i32 [ %5, %2 ], [ %20, %11 ] |
| %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ] |
| %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ] |
; One predicate (%15) derived from the counter guards both the load and the add.
| %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12) |
; Predicated gather from the address vector with immediate 32 and write-back:
; field 1 is the updated address vector, field 0 the loaded floats.
| %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15) |
| %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1 |
| %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0 |
; Accumulate under the same predicate; the trailing %13 operand is presumably
; the value kept in inactive lanes - confirm against the MVE intrinsic definition.
| %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13) |
; Step the counter by 4; the exit test uses the pre-decrement value %12
; (signed > 4), not %20.
| %20 = add nsw i32 %12, -4 |
| %21 = icmp sgt i32 %12, 4 |
| br i1 %21, label %11, label %22 |
| |
| 22: |
; Exit block: @vecAddAcrossF32Mve is declared variadic, so it is called through
; a bitcast to i32 (<4 x float>)*. Its i32 result is converted to float with
; sitofp and the absolute value is returned.
| %23 = tail call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19) |
| %24 = sitofp i32 %23 to float |
| %25 = tail call float @llvm.fabs.f32(float %24) |
| ret float %25 |
| } |
| |
; Arm MVE intrinsics used by @vctpi32: vidup seeds the address vector in the
; entry block; vctp32, the predicated write-back gather and the predicated add
; form the loop body.
| declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) |
| declare <4 x i1> @llvm.arm.mve.vctp32(i32) |
| declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) |
| declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) |
; External helper, declared variadic here; @vctpi32 calls it through a bitcast
; to i32 (<4 x float>)* with the accumulated vector.
| declare arm_aapcs_vfpcc i32 @vecAddAcrossF32Mve(...) |
; Floating-point absolute value, applied to the converted helper result.
| declare float @llvm.fabs.f32(float) |