; blob: 7f79c9c5431eabe1bcd89d04751da29bd712ff9c [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s -check-prefixes=CHECK,SVE
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,SME
target triple = "aarch64-unknown-linux-gnu"
; FADDV
; Fast-math fadd reduction of <vscale x 2 x bfloat> with a zero start value.
; Expected code: shift every .s lane left by 16, run FADDV on the .s lanes
; under a .d-granule all-true predicate, then BFCVT the scalar back into h0.
define bfloat @faddv_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: faddv_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: faddv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call fast bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat zeroinitializer, <vscale x 2 x bfloat> %a)
ret bfloat %res
}
; Fast-math fadd reduction of <vscale x 4 x bfloat> with a zero start value.
; Expected code: shift every .s lane left by 16, run FADDV under a .s-granule
; all-true predicate, then BFCVT the scalar back into h0.
define bfloat @faddv_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: faddv_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: faddv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat zeroinitializer, <vscale x 4 x bfloat> %a)
ret bfloat %res
}
; Fast-math fadd reduction of <vscale x 8 x bfloat> with a zero start value.
; Expected code: unpack the high and low halves of the .h lanes into .s lanes
; (uunpkhi/uunpklo), shift both halves left by 16, combine them with one
; unpredicated vector fadd, then FADDV the result and BFCVT it back into h0.
define bfloat @faddv_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: faddv_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: fadd z0.s, z0.s, z1.s
; CHECK-NEXT: faddv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call fast bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat zeroinitializer, <vscale x 8 x bfloat> %a)
ret bfloat %res
}
; FMAXNMV
; llvm.vector.reduce.fmax over <vscale x 2 x bfloat> (no fast-math flags).
; Expected code: shift every .s lane left by 16, run FMAXNMV on the .s lanes
; under a .d-granule all-true predicate, then BFCVT the scalar back into h0.
define bfloat @fmaxv_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: fmaxv_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: fmaxnmv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fmax.nxv2bf16(<vscale x 2 x bfloat> %a)
ret bfloat %res
}
; llvm.vector.reduce.fmax over <vscale x 4 x bfloat> (no fast-math flags).
; Expected code: shift every .s lane left by 16, run FMAXNMV under a
; .s-granule all-true predicate, then BFCVT the scalar back into h0.
define bfloat @fmaxv_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: fmaxv_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fmaxnmv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat> %a)
ret bfloat %res
}
; llvm.vector.reduce.fmax over <vscale x 8 x bfloat> (no fast-math flags).
; Expected code: unpack the high and low halves of the .h lanes into .s lanes
; (uunpkhi/uunpklo), shift both halves left by 16, combine them with a
; predicated (merging) fmaxnm, then FMAXNMV the result and BFCVT it into h0.
define bfloat @fmaxv_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: fmaxv_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: fmaxnmv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat> %a)
ret bfloat %res
}
; FMINNMV
; llvm.vector.reduce.fmin over <vscale x 2 x bfloat> (no fast-math flags).
; Expected code: shift every .s lane left by 16, run FMINNMV on the .s lanes
; under a .d-granule all-true predicate, then BFCVT the scalar back into h0.
define bfloat @fminv_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: fminv_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: fminnmv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fmin.nxv2bf16(<vscale x 2 x bfloat> %a)
ret bfloat %res
}
; llvm.vector.reduce.fmin over <vscale x 4 x bfloat> (no fast-math flags).
; Expected code: shift every .s lane left by 16, run FMINNMV under a
; .s-granule all-true predicate, then BFCVT the scalar back into h0.
define bfloat @fminv_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: fminv_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fminnmv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fmin.nxv4bf16(<vscale x 4 x bfloat> %a)
ret bfloat %res
}
; llvm.vector.reduce.fmin over <vscale x 8 x bfloat> (no fast-math flags).
; Expected code: unpack the high and low halves of the .h lanes into .s lanes
; (uunpkhi/uunpklo), shift both halves left by 16, combine them with a
; predicated (merging) fminnm, then FMINNMV the result and BFCVT it into h0.
define bfloat @fminv_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: fminv_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: fminnmv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> %a)
ret bfloat %res
}
; FMAXV
; llvm.vector.reduce.fmaximum over <vscale x 2 x bfloat>.
; Expected code: shift every .s lane left by 16, run FMAXV on the .s lanes
; under a .d-granule all-true predicate, then BFCVT the scalar back into h0.
define bfloat @fmaximumv_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: fmaximumv_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: fmaxv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fmaximum.nxv2bf16(<vscale x 2 x bfloat> %a)
ret bfloat %res
}
; llvm.vector.reduce.fmaximum over <vscale x 4 x bfloat>.
; Expected code: shift every .s lane left by 16, run FMAXV under a .s-granule
; all-true predicate, then BFCVT the scalar back into h0.
define bfloat @fmaximumv_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: fmaximumv_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fmaxv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fmaximum.nxv4bf16(<vscale x 4 x bfloat> %a)
ret bfloat %res
}
; llvm.vector.reduce.fmaximum over <vscale x 8 x bfloat>.
; Expected code: unpack the high and low halves of the .h lanes into .s lanes
; (uunpkhi/uunpklo), shift both halves left by 16, combine them with a
; predicated (merging) fmax, then FMAXV the result and BFCVT it into h0.
define bfloat @fmaximumv_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: fmaximumv_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: fmaxv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fmaximum.nxv8bf16(<vscale x 8 x bfloat> %a)
ret bfloat %res
}
; FMINV
; llvm.vector.reduce.fminimum over <vscale x 2 x bfloat>.
; Expected code: shift every .s lane left by 16, run FMINV on the .s lanes
; under a .d-granule all-true predicate, then BFCVT the scalar back into h0.
define bfloat @fminimumv_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: fminimumv_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: fminv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fminimum.nxv2bf16(<vscale x 2 x bfloat> %a)
ret bfloat %res
}
; llvm.vector.reduce.fminimum over <vscale x 4 x bfloat>.
; Expected code: shift every .s lane left by 16, run FMINV under a .s-granule
; all-true predicate, then BFCVT the scalar back into h0.
define bfloat @fminimumv_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: fminimumv_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fminv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fminimum.nxv4bf16(<vscale x 4 x bfloat> %a)
ret bfloat %res
}
; llvm.vector.reduce.fminimum over <vscale x 8 x bfloat>.
; Expected code: unpack the high and low halves of the .h lanes into .s lanes
; (uunpkhi/uunpklo), shift both halves left by 16, combine them with a
; predicated (merging) fmin, then FMINV the result and BFCVT it into h0.
define bfloat @fminimumv_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: fminimumv_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: fminv s0, p0, z0.s
; CHECK-NEXT: bfcvt h0, s0
; CHECK-NEXT: ret
%res = call bfloat @llvm.vector.reduce.fminimum.nxv8bf16(<vscale x 8 x bfloat> %a)
ret bfloat %res
}
; The reduction is performed at a higher precision than bfloat. Because add
; operations can make use of that extra precision, the reduction's result must
; still be rounded to bfloat, even if it is then extended back to float.
define float @promoted_fadd(<vscale x 4 x bfloat> %a) {
; SVE-LABEL: promoted_fadd:
; SVE: // %bb.0:
; SVE-NEXT: lsl z0.s, z0.s, #16
; SVE-NEXT: ptrue p0.s
; SVE-NEXT: faddv s0, p0, z0.s
; SVE-NEXT: bfcvt h0, s0
; SVE-NEXT: shll v0.4s, v0.4h, #16
; SVE-NEXT: // kill: def $s0 killed $s0 killed $q0
; SVE-NEXT: ret
;
; SME-LABEL: promoted_fadd:
; SME: // %bb.0:
; SME-NEXT: lsl z0.s, z0.s, #16
; SME-NEXT: ptrue p0.s
; SME-NEXT: faddv s0, p0, z0.s
; SME-NEXT: bfcvt h0, s0
; SME-NEXT: fmov w8, s0
; SME-NEXT: lsl w8, w8, #16
; SME-NEXT: fmov s0, w8
; SME-NEXT: ret
; Both configurations are expected to keep the bfcvt after the faddv, so the
; sum is narrowed to bfloat before the fpext below widens it again. They differ
; only in how that widening is done: the +sve run expects a vector shll, while
; the streaming +sme run expects fmov/lsl/fmov through w8.
%rdx = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat zeroinitializer, <vscale x 4 x bfloat> %a)
%res = fpext bfloat %rdx to float
ret float %res
}
; The reduction is performed at a higher precision. Because min/max operations
; don't utilise that precision, its result can be used directly.
define float @promoted_fmax(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: promoted_fmax:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fmaxnmv s0, p0, z0.s
; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
; CHECK-NEXT: ret
; No bfcvt is expected here: the fmaxnmv result in s0 is returned as the float
; value directly, with no round trip through bfloat for the fpext below.
%rdx = call bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat> %a)
%res = fpext bfloat %rdx to float
ret float %res
}
; Declarations of the vector-reduction intrinsics called by the functions above.
; The fadd form takes a bfloat start value followed by the vector; the
; fmax/fmin/fmaximum/fminimum forms take only the vector. Each is declared for
; the nxv2, nxv4 and nxv8 bfloat vector types.
declare bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat, <vscale x 2 x bfloat>)
declare bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat, <vscale x 4 x bfloat>)
declare bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat, <vscale x 8 x bfloat>)
declare bfloat @llvm.vector.reduce.fmax.nxv2bf16(<vscale x 2 x bfloat>)
declare bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat>)
declare bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat>)
declare bfloat @llvm.vector.reduce.fmin.nxv2bf16(<vscale x 2 x bfloat>)
declare bfloat @llvm.vector.reduce.fmin.nxv4bf16(<vscale x 4 x bfloat>)
declare bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat>)
declare bfloat @llvm.vector.reduce.fmaximum.nxv2bf16(<vscale x 2 x bfloat>)
declare bfloat @llvm.vector.reduce.fmaximum.nxv4bf16(<vscale x 4 x bfloat>)
declare bfloat @llvm.vector.reduce.fmaximum.nxv8bf16(<vscale x 8 x bfloat>)
declare bfloat @llvm.vector.reduce.fminimum.nxv2bf16(<vscale x 2 x bfloat>)
declare bfloat @llvm.vector.reduce.fminimum.nxv4bf16(<vscale x 4 x bfloat>)
declare bfloat @llvm.vector.reduce.fminimum.nxv8bf16(<vscale x 8 x bfloat>)