| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
| ; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s -check-prefixes=CHECK,SVE |
| ; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefixes=CHECK,SME |
| |
| ; The test is compiled twice: once with SVE and BF16 enabled, and once with |
| ; SME enabled and streaming mode forced. Expectations common to both runs use |
| ; the shared CHECK prefix; the per-run prefixes are used only where the two |
| ; runs are expected to produce different code. |
| target triple = "aarch64-unknown-linux-gnu" |
| |
| ; FADDV |
| |
| ; Fast-math fadd reduction of a <vscale x 2 x bfloat> vector with a zero |
| ; start value. Expected code: shift every .s lane left by 16 so the bfloat |
| ; bits sit in the top half of a 32-bit lane, reduce with FADDV, then convert |
| ; the 32-bit scalar back to bfloat with BFCVT. |
| ; NOTE(review): the governing predicate is built with .d elements while the |
| ; data operations use .s elements; presumably this reflects one bfloat per |
| ; 64-bit container for the 2-element type -- confirm. |
| define bfloat @faddv_nxv2bf16(<vscale x 2 x bfloat> %a) { |
| ; CHECK-LABEL: faddv_nxv2bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.d |
| ; CHECK-NEXT: faddv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call fast bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat zeroinitializer, <vscale x 2 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; Fast-math fadd reduction of a <vscale x 4 x bfloat> vector with a zero |
| ; start value. Expected code: shift every .s lane left by 16 so the bfloat |
| ; bits sit in the top half of a 32-bit lane, reduce with FADDV under an |
| ; all-true .s predicate, then convert the scalar back to bfloat with BFCVT. |
| define bfloat @faddv_nxv4bf16(<vscale x 4 x bfloat> %a) { |
| ; CHECK-LABEL: faddv_nxv4bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: faddv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat zeroinitializer, <vscale x 4 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; Fast-math fadd reduction of a <vscale x 8 x bfloat> vector with a zero |
| ; start value. Expected code: unpack the high and low halves of the .h input |
| ; into two .s vectors, shift each left by 16, add the two halves with an |
| ; unpredicated vector FADD, reduce the sum with a single FADDV, and convert |
| ; the scalar back to bfloat with BFCVT. |
| define bfloat @faddv_nxv8bf16(<vscale x 8 x bfloat> %a) { |
| ; CHECK-LABEL: faddv_nxv8bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: uunpkhi z1.s, z0.h |
| ; CHECK-NEXT: uunpklo z0.s, z0.h |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: lsl z1.s, z1.s, #16 |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: fadd z0.s, z0.s, z1.s |
| ; CHECK-NEXT: faddv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call fast bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat zeroinitializer, <vscale x 8 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; FMAXNMV |
| |
| ; llvm.vector.reduce.fmax over a <vscale x 2 x bfloat> vector (no fast-math |
| ; flags on the call). Expected code: shift every .s lane left by 16, reduce |
| ; with FMAXNMV, then convert the scalar back to bfloat with BFCVT. |
| ; NOTE(review): the governing predicate is built with .d elements while the |
| ; data operations use .s elements; presumably this reflects one bfloat per |
| ; 64-bit container for the 2-element type -- confirm. |
| define bfloat @fmaxv_nxv2bf16(<vscale x 2 x bfloat> %a) { |
| ; CHECK-LABEL: fmaxv_nxv2bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.d |
| ; CHECK-NEXT: fmaxnmv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fmax.nxv2bf16(<vscale x 2 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; llvm.vector.reduce.fmax over a <vscale x 4 x bfloat> vector (no fast-math |
| ; flags on the call). Expected code: shift every .s lane left by 16, reduce |
| ; with FMAXNMV under an all-true .s predicate, then convert the scalar back |
| ; to bfloat with BFCVT. |
| define bfloat @fmaxv_nxv4bf16(<vscale x 4 x bfloat> %a) { |
| ; CHECK-LABEL: fmaxv_nxv4bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: fmaxnmv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; llvm.vector.reduce.fmax over a <vscale x 8 x bfloat> vector (no fast-math |
| ; flags on the call). Expected code: unpack the high and low halves of the |
| ; .h input into two .s vectors, shift each left by 16, combine the halves |
| ; with a predicated (merging) FMAXNM, reduce with a single FMAXNMV, and |
| ; convert the scalar back to bfloat with BFCVT. |
| define bfloat @fmaxv_nxv8bf16(<vscale x 8 x bfloat> %a) { |
| ; CHECK-LABEL: fmaxv_nxv8bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: uunpkhi z1.s, z0.h |
| ; CHECK-NEXT: uunpklo z0.s, z0.h |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: lsl z1.s, z1.s, #16 |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s |
| ; CHECK-NEXT: fmaxnmv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; FMINNMV |
| |
| ; llvm.vector.reduce.fmin over a <vscale x 2 x bfloat> vector (no fast-math |
| ; flags on the call). Expected code: shift every .s lane left by 16, reduce |
| ; with FMINNMV, then convert the scalar back to bfloat with BFCVT. |
| ; NOTE(review): the governing predicate is built with .d elements while the |
| ; data operations use .s elements; presumably this reflects one bfloat per |
| ; 64-bit container for the 2-element type -- confirm. |
| define bfloat @fminv_nxv2bf16(<vscale x 2 x bfloat> %a) { |
| ; CHECK-LABEL: fminv_nxv2bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.d |
| ; CHECK-NEXT: fminnmv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fmin.nxv2bf16(<vscale x 2 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; llvm.vector.reduce.fmin over a <vscale x 4 x bfloat> vector (no fast-math |
| ; flags on the call). Expected code: shift every .s lane left by 16, reduce |
| ; with FMINNMV under an all-true .s predicate, then convert the scalar back |
| ; to bfloat with BFCVT. |
| define bfloat @fminv_nxv4bf16(<vscale x 4 x bfloat> %a) { |
| ; CHECK-LABEL: fminv_nxv4bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: fminnmv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fmin.nxv4bf16(<vscale x 4 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; llvm.vector.reduce.fmin over a <vscale x 8 x bfloat> vector (no fast-math |
| ; flags on the call). Expected code: unpack the high and low halves of the |
| ; .h input into two .s vectors, shift each left by 16, combine the halves |
| ; with a predicated (merging) FMINNM, reduce with a single FMINNMV, and |
| ; convert the scalar back to bfloat with BFCVT. |
| define bfloat @fminv_nxv8bf16(<vscale x 8 x bfloat> %a) { |
| ; CHECK-LABEL: fminv_nxv8bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: uunpkhi z1.s, z0.h |
| ; CHECK-NEXT: uunpklo z0.s, z0.h |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: lsl z1.s, z1.s, #16 |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s |
| ; CHECK-NEXT: fminnmv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; FMAXV |
| |
| ; llvm.vector.reduce.fmaximum over a <vscale x 2 x bfloat> vector. Expected |
| ; code: shift every .s lane left by 16, reduce with FMAXV (not the FMAXNMV |
| ; form), then convert the scalar back to bfloat with BFCVT. |
| ; NOTE(review): the governing predicate is built with .d elements while the |
| ; data operations use .s elements; presumably this reflects one bfloat per |
| ; 64-bit container for the 2-element type -- confirm. |
| define bfloat @fmaximumv_nxv2bf16(<vscale x 2 x bfloat> %a) { |
| ; CHECK-LABEL: fmaximumv_nxv2bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.d |
| ; CHECK-NEXT: fmaxv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fmaximum.nxv2bf16(<vscale x 2 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; llvm.vector.reduce.fmaximum over a <vscale x 4 x bfloat> vector. Expected |
| ; code: shift every .s lane left by 16, reduce with FMAXV (not the FMAXNMV |
| ; form) under an all-true .s predicate, then convert the scalar back to |
| ; bfloat with BFCVT. |
| define bfloat @fmaximumv_nxv4bf16(<vscale x 4 x bfloat> %a) { |
| ; CHECK-LABEL: fmaximumv_nxv4bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: fmaxv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fmaximum.nxv4bf16(<vscale x 4 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; llvm.vector.reduce.fmaximum over a <vscale x 8 x bfloat> vector. Expected |
| ; code: unpack the high and low halves of the .h input into two .s vectors, |
| ; shift each left by 16, combine the halves with a predicated (merging) |
| ; FMAX, reduce with a single FMAXV, and convert the scalar back to bfloat |
| ; with BFCVT. |
| define bfloat @fmaximumv_nxv8bf16(<vscale x 8 x bfloat> %a) { |
| ; CHECK-LABEL: fmaximumv_nxv8bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: uunpkhi z1.s, z0.h |
| ; CHECK-NEXT: uunpklo z0.s, z0.h |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: lsl z1.s, z1.s, #16 |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s |
| ; CHECK-NEXT: fmaxv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fmaximum.nxv8bf16(<vscale x 8 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; FMINV |
| |
| ; llvm.vector.reduce.fminimum over a <vscale x 2 x bfloat> vector. Expected |
| ; code: shift every .s lane left by 16, reduce with FMINV (not the FMINNMV |
| ; form), then convert the scalar back to bfloat with BFCVT. |
| ; NOTE(review): the governing predicate is built with .d elements while the |
| ; data operations use .s elements; presumably this reflects one bfloat per |
| ; 64-bit container for the 2-element type -- confirm. |
| define bfloat @fminimumv_nxv2bf16(<vscale x 2 x bfloat> %a) { |
| ; CHECK-LABEL: fminimumv_nxv2bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.d |
| ; CHECK-NEXT: fminv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fminimum.nxv2bf16(<vscale x 2 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; llvm.vector.reduce.fminimum over a <vscale x 4 x bfloat> vector. Expected |
| ; code: shift every .s lane left by 16, reduce with FMINV (not the FMINNMV |
| ; form) under an all-true .s predicate, then convert the scalar back to |
| ; bfloat with BFCVT. |
| define bfloat @fminimumv_nxv4bf16(<vscale x 4 x bfloat> %a) { |
| ; CHECK-LABEL: fminimumv_nxv4bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: fminv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fminimum.nxv4bf16(<vscale x 4 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; llvm.vector.reduce.fminimum over a <vscale x 8 x bfloat> vector. Expected |
| ; code: unpack the high and low halves of the .h input into two .s vectors, |
| ; shift each left by 16, combine the halves with a predicated (merging) |
| ; FMIN, reduce with a single FMINV, and convert the scalar back to bfloat |
| ; with BFCVT. |
| define bfloat @fminimumv_nxv8bf16(<vscale x 8 x bfloat> %a) { |
| ; CHECK-LABEL: fminimumv_nxv8bf16: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: uunpkhi z1.s, z0.h |
| ; CHECK-NEXT: uunpklo z0.s, z0.h |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: lsl z1.s, z1.s, #16 |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s |
| ; CHECK-NEXT: fminv s0, p0, z0.s |
| ; CHECK-NEXT: bfcvt h0, s0 |
| ; CHECK-NEXT: ret |
| %res = call bfloat @llvm.vector.reduce.fminimum.nxv8bf16(<vscale x 8 x bfloat> %a) |
| ret bfloat %res |
| } |
| |
| ; The reduction is performed at a higher precision. Because add operations |
| ; can utilise that precision, its result must be rounded even if it's then |
| ; promoted. |
| ; |
| ; The IR reduces to bfloat and then extends the result to float with fpext. |
| ; Both runs are expected to keep the BFCVT after FADDV. They differ only in |
| ; how the rounded bfloat is extended back to float: the SVE run expects a |
| ; vector SHLL by 16, while the streaming SME run expects the shift to be done |
| ; in a general-purpose register (fmov / lsl / fmov). |
| ; NOTE(review): presumably the vector SHLL form is not usable in streaming |
| ; mode, hence the separate expectations -- confirm. |
| define float @promoted_fadd(<vscale x 4 x bfloat> %a) { |
| ; SVE-LABEL: promoted_fadd: |
| ; SVE: // %bb.0: |
| ; SVE-NEXT: lsl z0.s, z0.s, #16 |
| ; SVE-NEXT: ptrue p0.s |
| ; SVE-NEXT: faddv s0, p0, z0.s |
| ; SVE-NEXT: bfcvt h0, s0 |
| ; SVE-NEXT: shll v0.4s, v0.4h, #16 |
| ; SVE-NEXT: // kill: def $s0 killed $s0 killed $q0 |
| ; SVE-NEXT: ret |
| ; |
| ; SME-LABEL: promoted_fadd: |
| ; SME: // %bb.0: |
| ; SME-NEXT: lsl z0.s, z0.s, #16 |
| ; SME-NEXT: ptrue p0.s |
| ; SME-NEXT: faddv s0, p0, z0.s |
| ; SME-NEXT: bfcvt h0, s0 |
| ; SME-NEXT: fmov w8, s0 |
| ; SME-NEXT: lsl w8, w8, #16 |
| ; SME-NEXT: fmov s0, w8 |
| ; SME-NEXT: ret |
| %rdx = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat zeroinitializer, <vscale x 4 x bfloat> %a) |
| %res = fpext bfloat %rdx to float |
| ret float %res |
| } |
| |
| ; The reduction is performed at a higher precision. Because min/max operations |
| ; don't utilise that precision, its result can be used directly. |
| ; |
| ; The IR reduces to bfloat and then extends the result to float with fpext. |
| ; The expected code contains no BFCVT and no re-extension: the FMAXNMV result |
| ; in s0 is returned as the float value, and both runs share one expectation. |
| define float @promoted_fmax(<vscale x 4 x bfloat> %a) { |
| ; CHECK-LABEL: promoted_fmax: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: lsl z0.s, z0.s, #16 |
| ; CHECK-NEXT: ptrue p0.s |
| ; CHECK-NEXT: fmaxnmv s0, p0, z0.s |
| ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 |
| ; CHECK-NEXT: ret |
| %rdx = call bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat> %a) |
| %res = fpext bfloat %rdx to float |
| ret float %res |
| } |
| |
| ; Declarations of the reduction intrinsics called by the tests in this file. |
| ; Each operation is declared for the 2-, 4- and 8-element scalable bfloat |
| ; vector types. The fadd form takes a scalar start value as its first |
| ; operand; the min/max forms take only the vector. |
| declare bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat, <vscale x 2 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat, <vscale x 4 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat, <vscale x 8 x bfloat>) |
| |
| declare bfloat @llvm.vector.reduce.fmax.nxv2bf16(<vscale x 2 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fmax.nxv4bf16(<vscale x 4 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fmax.nxv8bf16(<vscale x 8 x bfloat>) |
| |
| declare bfloat @llvm.vector.reduce.fmin.nxv2bf16(<vscale x 2 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fmin.nxv4bf16(<vscale x 4 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fmin.nxv8bf16(<vscale x 8 x bfloat>) |
| |
| declare bfloat @llvm.vector.reduce.fmaximum.nxv2bf16(<vscale x 2 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fmaximum.nxv4bf16(<vscale x 4 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fmaximum.nxv8bf16(<vscale x 8 x bfloat>) |
| |
| declare bfloat @llvm.vector.reduce.fminimum.nxv2bf16(<vscale x 2 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fminimum.nxv4bf16(<vscale x 4 x bfloat>) |
| declare bfloat @llvm.vector.reduce.fminimum.nxv8bf16(<vscale x 8 x bfloat>) |