; blob: 3b617502ed9554b1e7615e7457970dbc9cee3faa [file] [edit]
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -S -O3 < %s | FileCheck %s
; Target configuration for this test: an AArch64 triple with an explicit data
; layout string (leading "e" = little-endian, "m:e" = ELF name mangling per the
; LLVM LangRef data-layout grammar).
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "aarch64"
; This function (a 16x reduction of a[i] * b[i]) should be vectorized successfully.
; @vmlaq(ptr %0, ptr %1) -> float
;
; Input IR: a scalar loop that keeps every local in a stack slot (alloca) and
; accumulates %0[i] * %1[i] into a float for i = 0 .. 15, using 'fast'
; floating-point math. Presumably unoptimized clang output (see !llvm.ident).
;
; Expected result after -O3 (per the autogenerated assertions below): the
; allocas and the loop disappear, leaving two <16 x float> loads, one vector
; fmul, and a single llvm.vector.reduce.fadd call whose result is returned.
define dso_local nofpclass(nan inf) float @vmlaq(ptr noundef %0, ptr noundef %1) {
; CHECK-LABEL: define dso_local nofpclass(nan inf) float @vmlaq
; CHECK-SAME: (ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x float>, ptr [[TMP0]], align 4, !tbaa [[TBAA4:![0-9]+]]
; CHECK-NEXT: [[TMP4:%.*]] = load <16 x float>, ptr [[TMP1]], align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <16 x float> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP5]])
; CHECK-NEXT: ret float [[TMP6]]
;
; Entry block: stack slots for the two pointer arguments (%3, %4), the float
; accumulator (%5) and the i32 loop counter (%6).
%3 = alloca ptr, align 8
%4 = alloca ptr, align 8
%5 = alloca float, align 4
%6 = alloca i32, align 4
; Spill both incoming pointers to their slots.
store ptr %0, ptr %3, align 8, !tbaa !4
store ptr %1, ptr %4, align 8, !tbaa !4
; accumulator = 0.0
call void @llvm.lifetime.start.p0(ptr %5)
store float 0.000000e+00, ptr %5, align 4, !tbaa !9
; counter = 0
call void @llvm.lifetime.start.p0(ptr %6)
store i32 0, ptr %6, align 4, !tbaa !11
br label %7
; Loop header: continue into the body (%11) while counter < 16 (signed
; compare), otherwise leave through %10.
7: ; preds = %25, %2
%8 = load i32, ptr %6, align 4, !tbaa !11
%9 = icmp slt i32 %8, 16
br i1 %9, label %11, label %10
; Loop exit: the counter slot is dead from here on.
10: ; preds = %7
call void @llvm.lifetime.end.p0(ptr %6)
br label %28
; Loop body: accumulator += first[counter] * second[counter].
11: ; preds = %7
; %16 = element 'counter' of the array behind the first argument; the i32
; counter is sign-extended to i64 before indexing.
%12 = load ptr, ptr %3, align 8, !tbaa !4
%13 = load i32, ptr %6, align 4, !tbaa !11
%14 = sext i32 %13 to i64
%15 = getelementptr inbounds float, ptr %12, i64 %14
%16 = load float, ptr %15, align 4, !tbaa !9
; %21 = element 'counter' of the array behind the second argument.
%17 = load ptr, ptr %4, align 8, !tbaa !4
%18 = load i32, ptr %6, align 4, !tbaa !11
%19 = sext i32 %18 to i64
%20 = getelementptr inbounds float, ptr %17, i64 %19
%21 = load float, ptr %20, align 4, !tbaa !9
; Multiply the two elements and add the product to the accumulator slot.
%22 = fmul fast float %16, %21
%23 = load float, ptr %5, align 4, !tbaa !9
%24 = fadd fast float %23, %22
store float %24, ptr %5, align 4, !tbaa !9
br label %25
; Loop latch: counter += 1 (nsw), then back to the header. The back edge
; carries loop metadata !13.
25: ; preds = %11
%26 = load i32, ptr %6, align 4, !tbaa !11
%27 = add nsw i32 %26, 1
store i32 %27, ptr %6, align 4, !tbaa !11
br label %7, !llvm.loop !13
; Function exit: read the final accumulator value, end its lifetime, return it.
28: ; preds = %10
%29 = load float, ptr %5, align 4, !tbaa !9
call void @llvm.lifetime.end.p0(ptr %5)
ret float %29
}
; Lifetime marker intrinsics called by @vmlaq on its accumulator and counter
; stack slots. Both take a single pointer operand marked captures(none).
declare void @llvm.lifetime.start.p0(ptr captures(none))
declare void @llvm.lifetime.end.p0(ptr captures(none))
; Module-level named metadata: module flags (!0-!2) and the producer
; identification string (!3).
!llvm.module.flags = !{!0, !1, !2}
!llvm.ident = !{!3}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"uwtable", i32 2}
!2 = !{i32 7, !"frame-pointer", i32 1}
!3 = !{!"clang version 22.0.0git"}
; TBAA nodes. !4, !9 and !11 are the access tags attached to the loads and
; stores in @vmlaq:
;   !4  -> "p1 float" (used on accesses to the pointer slots)
;   !9  -> "float"    (used on accesses to array elements and the accumulator)
;   !11 -> "int"      (used on accesses to the loop counter)
; The type nodes chain up through "any pointer" / "omnipotent char" to the
; root "Simple C/C++ TBAA".
!4 = !{!5, !5, i64 0}
!5 = !{!"p1 float", !6, i64 0}
!6 = !{!"any pointer", !7, i64 0}
!7 = !{!"omnipotent char", !8, i64 0}
!8 = !{!"Simple C/C++ TBAA"}
!9 = !{!10, !10, i64 0}
!10 = !{!"float", !7, i64 0}
!11 = !{!12, !12, i64 0}
!12 = !{!"int", !7, i64 0}
; Loop metadata attached to the back edge of the loop in @vmlaq; the only
; property it carries is llvm.loop.mustprogress.
!13 = distinct !{!13, !14}
!14 = !{!"llvm.loop.mustprogress"}