| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 |
| ; RUN: opt -S -O3 < %s | FileCheck %s |
| |
| target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" |
| target triple = "aarch64" |
| |
| ; This function (a 16-iteration sum reduction of a[i] * b[i]) should be fully vectorized by the -O3 pipeline. |
| |
| ; @vmlaq(%0, %1): returns the sum over i in [0, 16) of %0[i] * %1[i], |
| ; using fast-math fmul/fadd. The input keeps every local in an alloca and |
| ; reloads it on each use (presumably unoptimized clang output - TODO confirm). |
| ; The autogenerated assertions expect the loop to collapse into two |
| ; <16 x float> loads, one vector fmul and one llvm.vector.reduce.fadd call. |
| define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1) { |
| ; CHECK-LABEL: define dso_local nofpclass(nan inf) float @vmlaq |
| ; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { |
| ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]] |
| ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA4]] |
| ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <16 x float> [[TMP4]], [[TMP3]] |
| ; CHECK-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP5]]) |
| ; CHECK-NEXT: ret float [[TMP6]] |
| ; |
| ; Stack slots: %3 and %4 hold the two pointer arguments, %5 is the float |
| ; accumulator, %6 is the i32 loop counter. |
| %3 = alloca ptr, align 8 |
| %4 = alloca ptr, align 8 |
| %5 = alloca float, align 4 |
| %6 = alloca i32, align 4 |
| store ptr %0, ptr %3, align 8, !tbaa !4 |
| store ptr %1, ptr %4, align 8, !tbaa !4 |
| ; Accumulator starts at 0.0 and the counter at 0. |
| call void @llvm.lifetime.start.p0(ptr %5) |
| store float 0.000000e+00, ptr %5, align 4, !tbaa !9 |
| call void @llvm.lifetime.start.p0(ptr %6) |
| store i32 0, ptr %6, align 4, !tbaa !11 |
| br label %7 |
| |
| ; Loop header: continue into the body while counter < 16 (signed compare). |
| 7: ; preds = %25, %2 |
| %8 = load i32, ptr %6, align 4, !tbaa !11 |
| %9 = icmp slt i32 %8, 16 |
| br i1 %9, label %11, label %10 |
| |
| ; Loop exit: the counter slot is dead from here on. |
| 10: ; preds = %7 |
| call void @llvm.lifetime.end.p0(ptr %6) |
| br label %28 |
| |
| ; Loop body: accumulator += %0[counter] * %1[counter]. |
| 11: ; preds = %7 |
| ; %16 = element of the first array at the current counter value. |
| %12 = load ptr, ptr %3, align 8, !tbaa !4 |
| %13 = load i32, ptr %6, align 4, !tbaa !11 |
| %14 = sext i32 %13 to i64 |
| %15 = getelementptr inbounds float, ptr %12, i64 %14 |
| %16 = load float, ptr %15, align 4, !tbaa !9 |
| ; %21 = element of the second array at the same index (counter reloaded). |
| %17 = load ptr, ptr %4, align 8, !tbaa !4 |
| %18 = load i32, ptr %6, align 4, !tbaa !11 |
| %19 = sext i32 %18 to i64 |
| %20 = getelementptr inbounds float, ptr %17, i64 %19 |
| %21 = load float, ptr %20, align 4, !tbaa !9 |
| ; Multiply, then add into the accumulator slot; both ops carry the fast flag. |
| %22 = fmul fast float %16, %21 |
| %23 = load float, ptr %5, align 4, !tbaa !9 |
| %24 = fadd fast float %23, %22 |
| store float %24, ptr %5, align 4, !tbaa !9 |
| br label %25 |
| |
| ; Loop latch: counter += 1 (nsw); the back-edge carries loop metadata !13. |
| 25: ; preds = %11 |
| %26 = load i32, ptr %6, align 4, !tbaa !11 |
| %27 = add nsw i32 %26, 1 |
| store i32 %27, ptr %6, align 4, !tbaa !11 |
| br label %7, !llvm.loop !13 |
| |
| ; Return block: read the final accumulator value before ending its lifetime. |
| 28: ; preds = %10 |
| %29 = load float, ptr %5, align 4, !tbaa !9 |
| call void @llvm.lifetime.end.p0(ptr %5) |
| ret float %29 |
| } |
| |
| ; Lifetime marker intrinsics; @vmlaq uses them to bracket its accumulator |
| ; (%5) and loop counter (%6) stack slots. |
| declare void @llvm.lifetime.start.p0(ptr captures(none)) |
| declare void @llvm.lifetime.end.p0(ptr captures(none)) |
| |
| ; Module-level flags (!0-!2) and the producer identification string (!3). |
| !llvm.module.flags = !{!0, !1, !2} |
| !llvm.ident = !{!3} |
| |
| !0 = !{i32 1, !"wchar_size", i32 4} |
| !1 = !{i32 7, !"uwtable", i32 2} |
| !2 = !{i32 7, !"frame-pointer", i32 1} |
| !3 = !{!"clang version 22.0.0git"} |
| ; TBAA access tag for "p1 float"; attached to the loads/stores of the pointer |
| ; slots %3 and %4 in @vmlaq. |
| !4 = !{!5, !5, i64 0} |
| ; TBAA type chain: "p1 float" -> "any pointer" -> "omnipotent char" -> root. |
| !5 = !{!"p1 float", !6, i64 0} |
| !6 = !{!"any pointer", !7, i64 0} |
| !7 = !{!"omnipotent char", !8, i64 0} |
| !8 = !{!"Simple C/C++ TBAA"} |
| ; TBAA access tag for "float"; attached to array element and accumulator accesses. |
| !9 = !{!10, !10, i64 0} |
| !10 = !{!"float", !7, i64 0} |
| ; TBAA access tag for "int"; attached to loop counter accesses. |
| !11 = !{!12, !12, i64 0} |
| !12 = !{!"int", !7, i64 0} |
| ; Loop metadata referenced by the back-edge branch in @vmlaq; its only |
| ; property is llvm.loop.mustprogress. |
| !13 = distinct !{!13, !14} |
| !14 = !{!"llvm.loop.mustprogress"} |