; blob: 9dbe096ebdb5794635cfbb32ad46cc0c6bcd4a0c
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
target triple = "aarch64-linux-gnu"
; Partial fadd reduction of a widened half-precision product.
; Both <vscale x 8 x half> inputs are extended to float, multiplied, and the
; <vscale x 8 x float> product is reduced into the <vscale x 4 x float>
; accumulator %acc via llvm.vector.partial.reduce.fadd.
; Expected codegen, per the assertions below:
;   with +sve2   - each input is unpacked (lo/hi) and converted with fcvt,
;                  followed by two fmul and two fadd into z0.
;   with +sve2p1 - a single "fdot z0.s, z1.h, z2.h".
define <vscale x 4 x float> @fdot_wide_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
; SVE2-LABEL: fdot_wide_nxv4f32:
; SVE2: // %bb.0: // %entry
; SVE2-NEXT: uunpklo z3.s, z1.h
; SVE2-NEXT: uunpklo z4.s, z2.h
; SVE2-NEXT: ptrue p0.s
; SVE2-NEXT: uunpkhi z1.s, z1.h
; SVE2-NEXT: uunpkhi z2.s, z2.h
; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
; SVE2-NEXT: fcvt z4.s, p0/m, z4.h
; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
; SVE2-NEXT: fmul z3.s, z3.s, z4.s
; SVE2-NEXT: fmul z1.s, z1.s, z2.s
; SVE2-NEXT: fadd z0.s, z0.s, z3.s
; SVE2-NEXT: fadd z0.s, z0.s, z1.s
; SVE2-NEXT: ret
;
; SVE2P1-LABEL: fdot_wide_nxv4f32:
; SVE2P1: // %bb.0: // %entry
; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
; SVE2P1-NEXT: ret
entry:
%a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
%b.wide = fpext <vscale x 8 x half> %b to <vscale x 8 x float>
%mult = fmul <vscale x 8 x float> %a.wide, %b.wide
%partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %mult)
ret <vscale x 4 x float> %partial.reduce
}
; Partial fadd reduction of an extended half vector with no multiply.
; %a is extended from <vscale x 8 x half> to <vscale x 8 x float> and reduced
; directly into the <vscale x 4 x float> accumulator %acc.
; Expected codegen, per the assertions below:
;   with +sve2   - %a is unpacked (lo/hi), converted with fcvt, and added to
;                  z0 with two fadd.
;   with +sve2p1 - z2.h is filled with the constant 1.0 by fmov and then used
;                  as the second operand of "fdot z0.s, z1.h, z2.h".
define <vscale x 4 x float> @fdot_splat_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x half> %a) {
; SVE2-LABEL: fdot_splat_nxv4f32:
; SVE2: // %bb.0: // %entry
; SVE2-NEXT: uunpklo z2.s, z1.h
; SVE2-NEXT: ptrue p0.s
; SVE2-NEXT: uunpkhi z1.s, z1.h
; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
; SVE2-NEXT: fadd z0.s, z0.s, z2.s
; SVE2-NEXT: fadd z0.s, z0.s, z1.s
; SVE2-NEXT: ret
;
; SVE2P1-LABEL: fdot_splat_nxv4f32:
; SVE2P1: // %bb.0: // %entry
; SVE2P1-NEXT: fmov z2.h, #1.00000000
; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
; SVE2P1-NEXT: ret
entry:
%a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
%partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a.wide)
ret <vscale x 4 x float> %partial.reduce
}
; Plain partial fadd reduction on half elements, with no extension or multiply:
; <vscale x 16 x half> %a is reduced into the <vscale x 8 x half> accumulator.
; The assertions use the prefix shared by every llc invocation in this file and
; expect two .h fadd instructions adding z1 and then z2 into z0.
define <vscale x 8 x half> @partial_reduce_nxv8f16(<vscale x 8 x half> %acc, <vscale x 16 x half> %a) {
; CHECK-LABEL: partial_reduce_nxv8f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fadd z0.h, z0.h, z1.h
; CHECK-NEXT: fadd z0.h, z0.h, z2.h
; CHECK-NEXT: ret
entry:
%partial.reduce = call <vscale x 8 x half> @llvm.vector.partial.reduce.fadd(<vscale x 8 x half> %acc, <vscale x 16 x half> %a)
ret <vscale x 8 x half> %partial.reduce
}
; Plain partial fadd reduction on float elements, with no extension or
; multiply: <vscale x 8 x float> %a is reduced into the <vscale x 4 x float>
; accumulator.
; The assertions use the prefix shared by every llc invocation in this file and
; expect two .s fadd instructions adding z1 and then z2 into z0.
define <vscale x 4 x float> @partial_reduce_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x float> %a) {
; CHECK-LABEL: partial_reduce_nxv4f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fadd z0.s, z0.s, z1.s
; CHECK-NEXT: fadd z0.s, z0.s, z2.s
; CHECK-NEXT: ret
entry:
%partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a)
ret <vscale x 4 x float> %partial.reduce
}
; Plain partial fadd reduction on double elements, with no extension or
; multiply: <vscale x 4 x double> %a is reduced into the <vscale x 2 x double>
; accumulator.
; The assertions use the prefix shared by every llc invocation in this file and
; expect two .d fadd instructions adding z1 and then z2 into z0.
define <vscale x 2 x double> @partial_reduce_nxv2f64(<vscale x 2 x double> %acc, <vscale x 4 x double> %a) {
; CHECK-LABEL: partial_reduce_nxv2f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fadd z0.d, z0.d, z1.d
; CHECK-NEXT: fadd z0.d, z0.d, z2.d
; CHECK-NEXT: ret
entry:
%partial.reduce = call <vscale x 2 x double> @llvm.vector.partial.reduce.fadd(<vscale x 2 x double> %acc, <vscale x 4 x double> %a)
ret <vscale x 2 x double> %partial.reduce
}