blob: bebf1cb00e6a923d892a4bb32193230d688ae8ac [file]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mattr=+sve2,+dotprod < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
; Signed absolute difference on scalable i8 vectors, written as
; smax(a, b) - smin(a, b). The checks expect this idiom to select a single
; predicated SABD.
define <vscale x 16 x i8> @sabs_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sabs_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: sabd z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
  %hi = tail call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %lo = tail call <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %diff = sub <vscale x 16 x i8> %hi, %lo
  ret <vscale x 16 x i8> %diff
}
; Unsigned absolute difference on scalable i8 vectors, written as
; umax(a, b) - umin(a, b). The checks expect this idiom to select a single
; predicated UABD.
define <vscale x 16 x i8> @uabs_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: uabs_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: uabd z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: ret
  %hi = tail call <vscale x 16 x i8> @llvm.umax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %lo = tail call <vscale x 16 x i8> @llvm.umin.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %diff = sub <vscale x 16 x i8> %hi, %lo
  ret <vscale x 16 x i8> %diff
}
; Signed absolute difference of i8 vectors, zero-extended to i16 and
; accumulated into %acc through a partial reduction (16 x i16 -> 8 x i16 per
; vscale unit). The checks currently expect SABD followed by a
; UADDWB/UADDWT pair.
; TODO: This case could be lowered to a sabal[bt] pair.
define <vscale x 8 x i16> @sabs_nxv16i8_wide_add(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sabs_nxv16i8_wide_add:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: sabd z1.b, p0/m, z1.b, z2.b
; CHECK-NEXT: uaddwb z0.h, z0.h, z1.b
; CHECK-NEXT: uaddwt z0.h, z0.h, z1.b
; CHECK-NEXT: ret
  %smax = tail call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %smin = tail call <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %sabs = sub <vscale x 16 x i8> %smax, %smin
  ; The difference is non-negative when read as unsigned, hence zext.
  %ext = zext <vscale x 16 x i8> %sabs to <vscale x 16 x i16>
  ; The intrinsic suffix matches the actual result and operand types.
  %reduce = call <vscale x 8 x i16> @llvm.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %ext)
  ret <vscale x 8 x i16> %reduce
}
; Signed absolute difference of i8 vectors, zero-extended to i32 and
; accumulated into %acc through a partial reduction (16 x i32 -> 4 x i32 per
; vscale unit). The checks expect SABD followed by a UDOT against a splat
; of 1.
define <vscale x 4 x i32> @sabs_nxv16i8_dot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sabs_nxv16i8_dot:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: sabd z1.b, p0/m, z1.b, z2.b
; CHECK-NEXT: mov z2.b, #1 // =0x1
; CHECK-NEXT: udot z0.s, z1.b, z2.b
; CHECK-NEXT: ret
  %smax = tail call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %smin = tail call <vscale x 16 x i8> @llvm.smin.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %sabs = sub <vscale x 16 x i8> %smax, %smin
  %ext = zext <vscale x 16 x i8> %sabs to <vscale x 16 x i32>
  ; The intrinsic suffix matches the actual result and operand types.
  %dot = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %ext)
  ret <vscale x 4 x i32> %dot
}
; Fixed-length (128-bit) variant of the signed smax - smin idiom. The checks
; expect a single unpredicated SABD on v-registers.
define <16 x i8> @sabs_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sabs_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sabd v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
  %hi = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> %a, <16 x i8> %b)
  %lo = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> %a, <16 x i8> %b)
  %diff = sub <16 x i8> %hi, %lo
  ret <16 x i8> %diff
}
; Fixed-length (128-bit) variant of the unsigned umax - umin idiom. The checks
; expect a single unpredicated UABD on v-registers.
define <16 x i8> @uabs_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: uabs_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: uabd v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
  %hi = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> %b)
  %lo = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a, <16 x i8> %b)
  %diff = sub <16 x i8> %hi, %lo
  ret <16 x i8> %diff
}
; Unsigned absolute difference of i16 vectors, zero-extended to i32 and
; accumulated into %acc through a partial reduction (8 x i32 -> 4 x i32 per
; vscale unit). The checks currently expect UABD followed by a
; UADDWB/UADDWT pair.
; NOTE: despite "nxv16i8" in the function name, the operands here are
; <vscale x 8 x i16>.
; TODO: This case could be lowered to a uabal[bt] pair.
define <vscale x 4 x i32> @uabs_nxv16i8_wide_add(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: uabs_nxv16i8_wide_add:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: uabd z1.h, p0/m, z1.h, z2.h
; CHECK-NEXT: uaddwb z0.s, z0.s, z1.h
; CHECK-NEXT: uaddwt z0.s, z0.s, z1.h
; CHECK-NEXT: ret
  %umax = tail call <vscale x 8 x i16> @llvm.umax.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
  %umin = tail call <vscale x 8 x i16> @llvm.umin.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
  %uabs = sub <vscale x 8 x i16> %umax, %umin
  %ext = zext <vscale x 8 x i16> %uabs to <vscale x 8 x i32>
  ; The intrinsic suffix matches the actual result and operand types.
  %reduce = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %ext)
  ret <vscale x 4 x i32> %reduce
}
; Fixed-length variant of the absolute-difference dot-product pattern:
; umax - umin on <16 x i8>, zero-extended to i32 and partially reduced into a
; <4 x i32> accumulator. The checks expect a NEON UABD feeding an SVE UDOT
; against a splat of 1 (hence the q0/z0 "kill" annotations).
define <4 x i32> @uabs_v16i8_dot(<4 x i32> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: uabs_v16i8_dot:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z3.b, #1 // =0x1
; CHECK-NEXT: uabd v1.16b, v1.16b, v2.16b
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: udot z0.s, z1.b, z3.b
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
  %hi = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> %b)
  %lo = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a, <16 x i8> %b)
  %diff = sub <16 x i8> %hi, %lo
  %wide = zext <16 x i8> %diff to <16 x i32>
  %sum = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %wide)
  ret <4 x i32> %sum
}