| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 |
| ; RUN: llc -mattr=+sve2,+dotprod < %s | FileCheck %s |
| |
| target triple = "aarch64-unknown-linux-gnu" |
| |
| ; Signed absolute difference on nxv16i8, written as smax(a, b) - smin(a, b). |
| ; The expected output is a single SABD governed by an all-true byte predicate. |
| define <vscale x 16 x i8> @sabs_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-LABEL: sabs_nxv16i8: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ptrue p0.b |
| ; CHECK-NEXT: sabd z0.b, p0/m, z0.b, z1.b |
| ; CHECK-NEXT: ret |
| %hi = tail call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) |
| %lo = tail call <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) |
| %diff = sub <vscale x 16 x i8> %hi, %lo |
| ret <vscale x 16 x i8> %diff |
| } |
| |
| ; Unsigned absolute difference on nxv16i8, written as umax(a, b) - umin(a, b). |
| ; The expected output is a single UABD governed by an all-true byte predicate. |
| define <vscale x 16 x i8> @uabs_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-LABEL: uabs_nxv16i8: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ptrue p0.b |
| ; CHECK-NEXT: uabd z0.b, p0/m, z0.b, z1.b |
| ; CHECK-NEXT: ret |
| %hi = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) |
| %lo = tail call <vscale x 16 x i8> @llvm.umin.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) |
| %diff = sub <vscale x 16 x i8> %hi, %lo |
| ret <vscale x 16 x i8> %diff |
| } |
| |
| ; TODO: This case could be lowered to a sabal[bt] pair. |
| ; Signed absolute difference of two nxv16i8 vectors, zero-extended to i16 and |
| ; folded into the nxv8i16 accumulator %acc by a partial add reduction. |
| ; The current expected output is SABD followed by a UADDWB/UADDWT pair. |
| define <vscale x 8 x i16> @sabs_nxv16i8_wide_add(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-LABEL: sabs_nxv16i8_wide_add: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ptrue p0.b |
| ; CHECK-NEXT: sabd z1.b, p0/m, z1.b, z2.b |
| ; CHECK-NEXT: uaddwb z0.h, z0.h, z1.b |
| ; CHECK-NEXT: uaddwt z0.h, z0.h, z1.b |
| ; CHECK-NEXT: ret |
| %smax = tail call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) |
| %smin = tail call <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) |
| %sabs = sub <vscale x 16 x i8> %smax, %smin |
| %ext = zext <vscale x 16 x i8> %sabs to <vscale x 16 x i16> |
| %reduce = call <vscale x 8 x i16> @llvm.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %ext) |
| ret <vscale x 8 x i16> %reduce |
| } |
| |
| ; Signed absolute difference of two nxv16i8 vectors, zero-extended to i32 and |
| ; folded into the nxv4i32 accumulator %acc by a partial add reduction. |
| ; The expected output is SABD followed by UDOT against a splat of 1. |
| define <vscale x 4 x i32> @sabs_nxv16i8_dot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) { |
| ; CHECK-LABEL: sabs_nxv16i8_dot: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ptrue p0.b |
| ; CHECK-NEXT: sabd z1.b, p0/m, z1.b, z2.b |
| ; CHECK-NEXT: mov z2.b, #1 // =0x1 |
| ; CHECK-NEXT: udot z0.s, z1.b, z2.b |
| ; CHECK-NEXT: ret |
| %smax = tail call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) |
| %smin = tail call <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) |
| %sabs = sub <vscale x 16 x i8> %smax, %smin |
| %ext = zext <vscale x 16 x i8> %sabs to <vscale x 16 x i32> |
| %dot = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %ext) |
| ret <vscale x 4 x i32> %dot |
| } |
| |
| ; Fixed-width (v16i8) signed absolute difference, written as |
| ; smax(a, b) - smin(a, b). The expected output is a single NEON SABD. |
| define <16 x i8> @sabs_v16i8(<16 x i8> %a, <16 x i8> %b) { |
| ; CHECK-LABEL: sabs_v16i8: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: sabd v0.16b, v0.16b, v1.16b |
| ; CHECK-NEXT: ret |
| %hi = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> %a, <16 x i8> %b) |
| %lo = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> %a, <16 x i8> %b) |
| %diff = sub <16 x i8> %hi, %lo |
| ret <16 x i8> %diff |
| } |
| |
| ; Fixed-width (v16i8) unsigned absolute difference, written as |
| ; umax(a, b) - umin(a, b). The expected output is a single NEON UABD. |
| define <16 x i8> @uabs_v16i8(<16 x i8> %a, <16 x i8> %b) { |
| ; CHECK-LABEL: uabs_v16i8: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: uabd v0.16b, v0.16b, v1.16b |
| ; CHECK-NEXT: ret |
| %hi = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> %b) |
| %lo = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a, <16 x i8> %b) |
| %diff = sub <16 x i8> %hi, %lo |
| ret <16 x i8> %diff |
| } |
| |
| ; TODO: This case could be lowered to a uabal[bt] pair. |
| ; Unsigned absolute difference of two nxv8i16 vectors, zero-extended to i32 and |
| ; folded into the nxv4i32 accumulator %acc by a partial add reduction. |
| ; Despite the "nxv16i8" in the function name, the operands here are nxv8i16; |
| ; the name is kept so the label stays stable. |
| ; The current expected output is UABD followed by a UADDWB/UADDWT pair. |
| define <vscale x 4 x i32> @uabs_nxv16i8_wide_add(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) { |
| ; CHECK-LABEL: uabs_nxv16i8_wide_add: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: ptrue p0.h |
| ; CHECK-NEXT: uabd z1.h, p0/m, z1.h, z2.h |
| ; CHECK-NEXT: uaddwb z0.s, z0.s, z1.h |
| ; CHECK-NEXT: uaddwt z0.s, z0.s, z1.h |
| ; CHECK-NEXT: ret |
| %umax = tail call <vscale x 8 x i16> @llvm.umax.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) |
| %umin = tail call <vscale x 8 x i16> @llvm.umin.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) |
| %uabs = sub <vscale x 8 x i16> %umax, %umin |
| %ext = zext <vscale x 8 x i16> %uabs to <vscale x 8 x i32> |
| %reduce = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %ext) |
| ret <vscale x 4 x i32> %reduce |
| } |
| |
| ; Fixed-width unsigned absolute difference of two v16i8 vectors, zero-extended |
| ; to i32 and folded into the v4i32 accumulator %acc by a partial add reduction. |
| ; The expected output is a NEON UABD feeding an SVE UDOT against a splat of 1, |
| ; with the Q registers accessed through their Z-register views. |
| define <4 x i32> @uabs_v16i8_dot(<4 x i32> %acc, <16 x i8> %a, <16 x i8> %b) { |
| ; CHECK-LABEL: uabs_v16i8_dot: |
| ; CHECK: // %bb.0: |
| ; CHECK-NEXT: mov z3.b, #1 // =0x1 |
| ; CHECK-NEXT: uabd v1.16b, v1.16b, v2.16b |
| ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 |
| ; CHECK-NEXT: udot z0.s, z1.b, z3.b |
| ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 |
| ; CHECK-NEXT: ret |
| %hi = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> %b) |
| %lo = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a, <16 x i8> %b) |
| %diff = sub <16 x i8> %hi, %lo |
| %wide = zext <16 x i8> %diff to <16 x i32> |
| %sum = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %wide) |
| ret <4 x i32> %sum |
| } |