; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefix=CHECK-SVE2
; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefix=CHECK-SVE2-I8MM
; RUN: llc -mtriple=aarch64 -mattr=+sve2,+sme,+i8mm -force-streaming %s -o - | FileCheck %s --check-prefix=CHECK-SME
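
; This file tests lowering of llvm.experimental.vector.partial.reduce.add, fed
; by (optionally extended and multiplied) operands, into the SVE2 dot-product
; instructions UDOT/SDOT and, when +i8mm is available, USDOT. Patterns that do
; not match a dot product are expanded with unpacks and multiply-accumulates.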
define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-SVE2-LABEL: udot:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-SVE2-LABEL: udot_wide:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: udot z0.d, z1.h, z2.h
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot_wide:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: udot z0.d, z1.h, z2.h
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot_wide:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: udot z0.d, z1.h, z2.h
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-SVE2-LABEL: sdot:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-SVE2-LABEL: sdot_wide:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot_wide:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot_wide:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}
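
; USDOT requires the +i8mm feature; without it the mixed zero/sign-extended
; multiply is expanded with unpacks and predicated MLAs.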
define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-SVE2-LABEL: usdot:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: uunpklo z3.h, z1.b
; CHECK-SVE2-NEXT: sunpklo z4.h, z2.b
; CHECK-SVE2-NEXT: ptrue p0.s
; CHECK-SVE2-NEXT: uunpkhi z1.h, z1.b
; CHECK-SVE2-NEXT: sunpkhi z2.h, z2.b
; CHECK-SVE2-NEXT: uunpklo z5.s, z3.h
; CHECK-SVE2-NEXT: sunpklo z6.s, z4.h
; CHECK-SVE2-NEXT: uunpkhi z3.s, z3.h
; CHECK-SVE2-NEXT: sunpkhi z4.s, z4.h
; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s
; CHECK-SVE2-NEXT: uunpklo z5.s, z1.h
; CHECK-SVE2-NEXT: sunpklo z6.s, z2.h
; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s
; CHECK-SVE2-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: usdot:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: usdot z0.s, z1.b, z2.b
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: usdot:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: usdot z0.s, z1.b, z2.b
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}
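
; There is no vector-form SUDOT instruction; with +i8mm the operands are
; commuted and USDOT is used instead.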
define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-SVE2-LABEL: sudot:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: sunpklo z3.h, z1.b
; CHECK-SVE2-NEXT: uunpklo z4.h, z2.b
; CHECK-SVE2-NEXT: ptrue p0.s
; CHECK-SVE2-NEXT: sunpkhi z1.h, z1.b
; CHECK-SVE2-NEXT: uunpkhi z2.h, z2.b
; CHECK-SVE2-NEXT: sunpklo z5.s, z3.h
; CHECK-SVE2-NEXT: uunpklo z6.s, z4.h
; CHECK-SVE2-NEXT: sunpkhi z3.s, z3.h
; CHECK-SVE2-NEXT: uunpkhi z4.s, z4.h
; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s
; CHECK-SVE2-NEXT: sunpklo z5.s, z1.h
; CHECK-SVE2-NEXT: uunpklo z6.s, z2.h
; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-SVE2-NEXT: mla z0.s, p0/m, z5.s, z6.s
; CHECK-SVE2-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sudot:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: usdot z0.s, z2.b, z1.b
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sudot:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: usdot z0.s, z2.b, z1.b
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}
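
; Partial reductions from i8 to i64 are lowered as a 32-bit dot product into a
; zeroed accumulator followed by widening adds.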
define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-SVE2-LABEL: udot_8to64:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: movi v4.2d, #0000000000000000
; CHECK-SVE2-NEXT: udot z4.s, z2.b, z3.b
; CHECK-SVE2-NEXT: uaddwb z0.d, z0.d, z4.s
; CHECK-SVE2-NEXT: uaddwt z0.d, z0.d, z4.s
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot_8to64:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000
; CHECK-SVE2-I8MM-NEXT: udot z4.s, z2.b, z3.b
; CHECK-SVE2-I8MM-NEXT: uaddwb z0.d, z0.d, z4.s
; CHECK-SVE2-I8MM-NEXT: uaddwt z0.d, z0.d, z4.s
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot_8to64:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: mov z4.s, #0 // =0x0
; CHECK-SME-NEXT: udot z4.s, z2.b, z3.b
; CHECK-SME-NEXT: uaddwb z0.d, z0.d, z4.s
; CHECK-SME-NEXT: uaddwt z0.d, z0.d, z4.s
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
%mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
<vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-SVE2-LABEL: sdot_8to64:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: movi v4.2d, #0000000000000000
; CHECK-SVE2-NEXT: sdot z4.s, z2.b, z3.b
; CHECK-SVE2-NEXT: saddwb z0.d, z0.d, z4.s
; CHECK-SVE2-NEXT: saddwt z0.d, z0.d, z4.s
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot_8to64:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000
; CHECK-SVE2-I8MM-NEXT: sdot z4.s, z2.b, z3.b
; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z4.s
; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z4.s
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot_8to64:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: mov z4.s, #0 // =0x0
; CHECK-SME-NEXT: sdot z4.s, z2.b, z3.b
; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s
; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
%mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
<vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-SVE2-LABEL: usdot_8to64:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: uunpkhi z4.h, z2.b
; CHECK-SVE2-NEXT: uunpklo z2.h, z2.b
; CHECK-SVE2-NEXT: sunpkhi z5.h, z3.b
; CHECK-SVE2-NEXT: sunpklo z3.h, z3.b
; CHECK-SVE2-NEXT: ptrue p0.d
; CHECK-SVE2-NEXT: uunpklo z6.s, z4.h
; CHECK-SVE2-NEXT: uunpklo z7.s, z2.h
; CHECK-SVE2-NEXT: sunpklo z24.s, z5.h
; CHECK-SVE2-NEXT: sunpklo z25.s, z3.h
; CHECK-SVE2-NEXT: uunpkhi z4.s, z4.h
; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: sunpkhi z5.s, z5.h
; CHECK-SVE2-NEXT: sunpkhi z3.s, z3.h
; CHECK-SVE2-NEXT: uunpklo z26.d, z6.s
; CHECK-SVE2-NEXT: uunpklo z27.d, z7.s
; CHECK-SVE2-NEXT: sunpklo z28.d, z24.s
; CHECK-SVE2-NEXT: sunpklo z29.d, z25.s
; CHECK-SVE2-NEXT: uunpkhi z6.d, z6.s
; CHECK-SVE2-NEXT: uunpkhi z7.d, z7.s
; CHECK-SVE2-NEXT: sunpkhi z24.d, z24.s
; CHECK-SVE2-NEXT: sunpkhi z25.d, z25.s
; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d
; CHECK-SVE2-NEXT: uunpklo z26.d, z4.s
; CHECK-SVE2-NEXT: sunpklo z28.d, z5.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d
; CHECK-SVE2-NEXT: uunpklo z27.d, z2.s
; CHECK-SVE2-NEXT: sunpklo z29.d, z3.s
; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s
; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s
; CHECK-SVE2-NEXT: sunpkhi z5.d, z5.s
; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s
; CHECK-SVE2-NEXT: mla z1.d, p0/m, z6.d, z24.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z7.d, z25.d
; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d
; CHECK-SVE2-NEXT: mla z1.d, p0/m, z4.d, z5.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: usdot_8to64:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000
; CHECK-SVE2-I8MM-NEXT: usdot z4.s, z2.b, z3.b
; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z4.s
; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z4.s
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: usdot_8to64:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: mov z4.s, #0 // =0x0
; CHECK-SME-NEXT: usdot z4.s, z2.b, z3.b
; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s
; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
%mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
<vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-SVE2-LABEL: sudot_8to64:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: sunpkhi z4.h, z2.b
; CHECK-SVE2-NEXT: sunpklo z2.h, z2.b
; CHECK-SVE2-NEXT: uunpkhi z5.h, z3.b
; CHECK-SVE2-NEXT: uunpklo z3.h, z3.b
; CHECK-SVE2-NEXT: ptrue p0.d
; CHECK-SVE2-NEXT: sunpklo z6.s, z4.h
; CHECK-SVE2-NEXT: sunpklo z7.s, z2.h
; CHECK-SVE2-NEXT: uunpklo z24.s, z5.h
; CHECK-SVE2-NEXT: uunpklo z25.s, z3.h
; CHECK-SVE2-NEXT: sunpkhi z4.s, z4.h
; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: uunpkhi z5.s, z5.h
; CHECK-SVE2-NEXT: uunpkhi z3.s, z3.h
; CHECK-SVE2-NEXT: sunpklo z26.d, z6.s
; CHECK-SVE2-NEXT: sunpklo z27.d, z7.s
; CHECK-SVE2-NEXT: uunpklo z28.d, z24.s
; CHECK-SVE2-NEXT: uunpklo z29.d, z25.s
; CHECK-SVE2-NEXT: sunpkhi z6.d, z6.s
; CHECK-SVE2-NEXT: sunpkhi z7.d, z7.s
; CHECK-SVE2-NEXT: uunpkhi z24.d, z24.s
; CHECK-SVE2-NEXT: uunpkhi z25.d, z25.s
; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d
; CHECK-SVE2-NEXT: sunpklo z26.d, z4.s
; CHECK-SVE2-NEXT: uunpklo z28.d, z5.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d
; CHECK-SVE2-NEXT: sunpklo z27.d, z2.s
; CHECK-SVE2-NEXT: uunpklo z29.d, z3.s
; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s
; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s
; CHECK-SVE2-NEXT: uunpkhi z5.d, z5.s
; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s
; CHECK-SVE2-NEXT: mla z1.d, p0/m, z6.d, z24.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z7.d, z25.d
; CHECK-SVE2-NEXT: mla z1.d, p0/m, z26.d, z28.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z27.d, z29.d
; CHECK-SVE2-NEXT: mla z1.d, p0/m, z4.d, z5.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sudot_8to64:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: movi v4.2d, #0000000000000000
; CHECK-SVE2-I8MM-NEXT: usdot z4.s, z3.b, z2.b
; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z4.s
; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z4.s
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sudot_8to64:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: mov z4.s, #0 // =0x0
; CHECK-SME-NEXT: usdot z4.s, z3.b, z2.b
; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s
; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
%mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
<vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
ret <vscale x 4 x i64> %partial.reduce
}
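
; A partial reduction of a plain extend (no binary op) is lowered as a dot
; product against a splat of 1.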
define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
; CHECK-SVE2-LABEL: udot_no_bin_op:
; CHECK-SVE2: // %bb.0:
; CHECK-SVE2-NEXT: mov z2.b, #1 // =0x1
; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op:
; CHECK-SVE2-I8MM: // %bb.0:
; CHECK-SVE2-I8MM-NEXT: mov z2.b, #1 // =0x1
; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot_no_bin_op:
; CHECK-SME: // %bb.0:
; CHECK-SME-NEXT: mov z2.b, #1 // =0x1
; CHECK-SME-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SME-NEXT: ret
%a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
; CHECK-SVE2-LABEL: sdot_no_bin_op:
; CHECK-SVE2: // %bb.0:
; CHECK-SVE2-NEXT: mov z2.b, #1 // =0x1
; CHECK-SVE2-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op:
; CHECK-SVE2-I8MM: // %bb.0:
; CHECK-SVE2-I8MM-NEXT: mov z2.b, #1 // =0x1
; CHECK-SVE2-I8MM-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot_no_bin_op:
; CHECK-SME: // %bb.0:
; CHECK-SME-NEXT: mov z2.b, #1 // =0x1
; CHECK-SME-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-SME-NEXT: ret
%a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-SVE2-LABEL: udot_no_bin_op_wide:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: mov z2.h, #1 // =0x1
; CHECK-SVE2-NEXT: udot z0.d, z1.h, z2.h
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_wide:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: mov z2.h, #1 // =0x1
; CHECK-SVE2-I8MM-NEXT: udot z0.d, z1.h, z2.h
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot_no_bin_op_wide:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: mov z2.h, #1 // =0x1
; CHECK-SME-NEXT: udot z0.d, z1.h, z2.h
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-SVE2-LABEL: sdot_no_bin_op_wide:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: mov z2.h, #1 // =0x1
; CHECK-SVE2-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_wide:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: mov z2.h, #1 // =0x1
; CHECK-SVE2-I8MM-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot_no_bin_op_wide:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: mov z2.h, #1 // =0x1
; CHECK-SME-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a) {
; CHECK-SVE2-LABEL: udot_no_bin_op_8to64:
; CHECK-SVE2: // %bb.0:
; CHECK-SVE2-NEXT: movi v3.2d, #0000000000000000
; CHECK-SVE2-NEXT: mov z4.b, #1 // =0x1
; CHECK-SVE2-NEXT: udot z3.s, z2.b, z4.b
; CHECK-SVE2-NEXT: uaddwb z0.d, z0.d, z3.s
; CHECK-SVE2-NEXT: uaddwt z0.d, z0.d, z3.s
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot_no_bin_op_8to64:
; CHECK-SVE2-I8MM: // %bb.0:
; CHECK-SVE2-I8MM-NEXT: movi v3.2d, #0000000000000000
; CHECK-SVE2-I8MM-NEXT: mov z4.b, #1 // =0x1
; CHECK-SVE2-I8MM-NEXT: udot z3.s, z2.b, z4.b
; CHECK-SVE2-I8MM-NEXT: uaddwb z0.d, z0.d, z3.s
; CHECK-SVE2-I8MM-NEXT: uaddwt z0.d, z0.d, z3.s
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot_no_bin_op_8to64:
; CHECK-SME: // %bb.0:
; CHECK-SME-NEXT: mov z3.b, #1 // =0x1
; CHECK-SME-NEXT: mov z4.s, #0 // =0x0
; CHECK-SME-NEXT: udot z4.s, z2.b, z3.b
; CHECK-SME-NEXT: uaddwb z0.d, z0.d, z4.s
; CHECK-SME-NEXT: uaddwt z0.d, z0.d, z4.s
; CHECK-SME-NEXT: ret
%a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a) {
; CHECK-SVE2-LABEL: sdot_no_bin_op_8to64:
; CHECK-SVE2: // %bb.0:
; CHECK-SVE2-NEXT: movi v3.2d, #0000000000000000
; CHECK-SVE2-NEXT: mov z4.b, #1 // =0x1
; CHECK-SVE2-NEXT: sdot z3.s, z2.b, z4.b
; CHECK-SVE2-NEXT: saddwb z0.d, z0.d, z3.s
; CHECK-SVE2-NEXT: saddwt z0.d, z0.d, z3.s
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot_no_bin_op_8to64:
; CHECK-SVE2-I8MM: // %bb.0:
; CHECK-SVE2-I8MM-NEXT: movi v3.2d, #0000000000000000
; CHECK-SVE2-I8MM-NEXT: mov z4.b, #1 // =0x1
; CHECK-SVE2-I8MM-NEXT: sdot z3.s, z2.b, z4.b
; CHECK-SVE2-I8MM-NEXT: saddwb z0.d, z0.d, z3.s
; CHECK-SVE2-I8MM-NEXT: saddwt z0.d, z0.d, z3.s
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot_no_bin_op_8to64:
; CHECK-SME: // %bb.0:
; CHECK-SME-NEXT: mov z3.b, #1 // =0x1
; CHECK-SME-NEXT: mov z4.s, #0 // =0x0
; CHECK-SME-NEXT: sdot z4.s, z2.b, z3.b
; CHECK-SME-NEXT: saddwb z0.d, z0.d, z4.s
; CHECK-SME-NEXT: saddwt z0.d, z0.d, z4.s
; CHECK-SME-NEXT: ret
%a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
ret <vscale x 4 x i64> %partial.reduce
}
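
; Negative tests: the input-to-result element ratio or the operand types do
; not match any dot-product instruction, so the reduction is expanded.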
define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-SVE2-LABEL: not_udot:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-NEXT: and z1.h, z1.h, #0xff
; CHECK-SVE2-NEXT: umlalb z0.s, z1.h, z2.h
; CHECK-SVE2-NEXT: umlalt z0.s, z1.h, z2.h
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: not_udot:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-I8MM-NEXT: and z1.h, z1.h, #0xff
; CHECK-SVE2-I8MM-NEXT: umlalb z0.s, z1.h, z2.h
; CHECK-SVE2-I8MM-NEXT: umlalt z0.s, z1.h, z2.h
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: not_udot:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: and z2.h, z2.h, #0xff
; CHECK-SME-NEXT: and z1.h, z1.h, #0xff
; CHECK-SME-NEXT: umlalb z0.s, z1.h, z2.h
; CHECK-SME-NEXT: umlalt z0.s, z1.h, z2.h
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
%b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
%mult = mul nuw nsw <vscale x 8 x i32> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
; CHECK-SVE2-LABEL: not_udot_wide:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: and z2.s, z2.s, #0xffff
; CHECK-SVE2-NEXT: and z1.s, z1.s, #0xffff
; CHECK-SVE2-NEXT: umlalb z0.d, z1.s, z2.s
; CHECK-SVE2-NEXT: umlalt z0.d, z1.s, z2.s
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: not_udot_wide:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: and z2.s, z2.s, #0xffff
; CHECK-SVE2-I8MM-NEXT: and z1.s, z1.s, #0xffff
; CHECK-SVE2-I8MM-NEXT: umlalb z0.d, z1.s, z2.s
; CHECK-SVE2-I8MM-NEXT: umlalt z0.d, z1.s, z2.s
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: not_udot_wide:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: and z2.s, z2.s, #0xffff
; CHECK-SME-NEXT: and z1.s, z1.s, #0xffff
; CHECK-SME-NEXT: umlalb z0.d, z1.s, z2.s
; CHECK-SME-NEXT: umlalt z0.d, z1.s, z2.s
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
%b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
%mult = mul nuw nsw <vscale x 4 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}
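
; Mixed-extension i16 to i64 reductions have no dot-product instruction
; (USDOT only has a 32-bit form), so they are fully expanded.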
define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-SVE2-LABEL: not_usdot:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: uunpklo z3.s, z1.h
; CHECK-SVE2-NEXT: sunpklo z4.s, z2.h
; CHECK-SVE2-NEXT: ptrue p0.d
; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: uunpklo z5.d, z3.s
; CHECK-SVE2-NEXT: sunpklo z6.d, z4.s
; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s
; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: uunpklo z5.d, z1.s
; CHECK-SVE2-NEXT: sunpklo z6.d, z2.s
; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s
; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: not_usdot:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z1.h
; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z2.h
; CHECK-SVE2-I8MM-NEXT: ptrue p0.d
; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z3.s
; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z4.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z3.d, z3.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z4.d, z4.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z1.s
; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z2.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.d, z1.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.d, z2.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: not_usdot:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: uunpklo z3.s, z1.h
; CHECK-SME-NEXT: sunpklo z4.s, z2.h
; CHECK-SME-NEXT: ptrue p0.d
; CHECK-SME-NEXT: uunpkhi z1.s, z1.h
; CHECK-SME-NEXT: sunpkhi z2.s, z2.h
; CHECK-SME-NEXT: uunpklo z5.d, z3.s
; CHECK-SME-NEXT: sunpklo z6.d, z4.s
; CHECK-SME-NEXT: uunpkhi z3.d, z3.s
; CHECK-SME-NEXT: sunpkhi z4.d, z4.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: uunpklo z5.d, z1.s
; CHECK-SME-NEXT: sunpklo z6.d, z2.s
; CHECK-SME-NEXT: uunpkhi z1.d, z1.s
; CHECK-SME-NEXT: sunpkhi z2.d, z2.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-SVE2-LABEL: not_sudot:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: sunpklo z3.s, z1.h
; CHECK-SVE2-NEXT: uunpklo z4.s, z2.h
; CHECK-SVE2-NEXT: ptrue p0.d
; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: sunpklo z5.d, z3.s
; CHECK-SVE2-NEXT: uunpklo z6.d, z4.s
; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s
; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: sunpklo z5.d, z1.s
; CHECK-SVE2-NEXT: uunpklo z6.d, z2.s
; CHECK-SVE2-NEXT: sunpkhi z1.d, z1.s
; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: not_sudot:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z1.h
; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z2.h
; CHECK-SVE2-I8MM-NEXT: ptrue p0.d
; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z3.s
; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z4.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z3.d, z3.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z4.d, z4.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z1.s
; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z2.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.d, z1.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.d, z2.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: not_sudot:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: sunpklo z3.s, z1.h
; CHECK-SME-NEXT: uunpklo z4.s, z2.h
; CHECK-SME-NEXT: ptrue p0.d
; CHECK-SME-NEXT: sunpkhi z1.s, z1.h
; CHECK-SME-NEXT: uunpkhi z2.s, z2.h
; CHECK-SME-NEXT: sunpklo z5.d, z3.s
; CHECK-SME-NEXT: uunpklo z6.d, z4.s
; CHECK-SME-NEXT: sunpkhi z3.d, z3.s
; CHECK-SME-NEXT: uunpkhi z4.d, z4.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: sunpklo z5.d, z1.s
; CHECK-SME-NEXT: uunpklo z6.d, z2.s
; CHECK-SME-NEXT: sunpkhi z1.d, z1.s
; CHECK-SME-NEXT: uunpkhi z2.d, z2.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}
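
; Operands extended from different source element types cannot be matched to
; a single dot product and are expanded.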
define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b) {
; CHECK-SVE2-LABEL: udot_different_types:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-NEXT: uunpklo z3.s, z1.h
; CHECK-SVE2-NEXT: ptrue p0.d
; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: uunpklo z4.s, z2.h
; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: uunpklo z5.d, z3.s
; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s
; CHECK-SVE2-NEXT: uunpklo z6.d, z4.s
; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: uunpklo z5.d, z1.s
; CHECK-SVE2-NEXT: uunpklo z6.d, z2.s
; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s
; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot_different_types:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z1.h
; CHECK-SVE2-I8MM-NEXT: ptrue p0.d
; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z2.h
; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z3.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z3.d, z3.s
; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z4.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z4.d, z4.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z1.s
; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z2.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.d, z1.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.d, z2.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot_different_types:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: and z2.h, z2.h, #0xff
; CHECK-SME-NEXT: uunpklo z3.s, z1.h
; CHECK-SME-NEXT: ptrue p0.d
; CHECK-SME-NEXT: uunpkhi z1.s, z1.h
; CHECK-SME-NEXT: uunpklo z4.s, z2.h
; CHECK-SME-NEXT: uunpkhi z2.s, z2.h
; CHECK-SME-NEXT: uunpklo z5.d, z3.s
; CHECK-SME-NEXT: uunpkhi z3.d, z3.s
; CHECK-SME-NEXT: uunpklo z6.d, z4.s
; CHECK-SME-NEXT: uunpkhi z4.d, z4.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: uunpklo z5.d, z1.s
; CHECK-SME-NEXT: uunpklo z6.d, z2.s
; CHECK-SME-NEXT: uunpkhi z1.d, z1.s
; CHECK-SME-NEXT: uunpkhi z2.d, z2.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b) {
; CHECK-SVE2-LABEL: sdot_different_types:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: ptrue p0.h
; CHECK-SVE2-NEXT: sunpklo z3.s, z1.h
; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: sxtb z2.h, p0/m, z2.h
; CHECK-SVE2-NEXT: ptrue p0.d
; CHECK-SVE2-NEXT: sunpklo z5.d, z3.s
; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s
; CHECK-SVE2-NEXT: sunpklo z4.s, z2.h
; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: sunpklo z6.d, z4.s
; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: sunpklo z5.d, z1.s
; CHECK-SVE2-NEXT: sunpklo z6.d, z2.s
; CHECK-SVE2-NEXT: sunpkhi z1.d, z1.s
; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot_different_types:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: ptrue p0.h
; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z1.h
; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE2-I8MM-NEXT: sxtb z2.h, p0/m, z2.h
; CHECK-SVE2-I8MM-NEXT: ptrue p0.d
; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z3.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z3.d, z3.s
; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z2.h
; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z4.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z4.d, z4.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z1.s
; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z2.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.d, z1.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.d, z2.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot_different_types:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: ptrue p0.h
; CHECK-SME-NEXT: sunpklo z3.s, z1.h
; CHECK-SME-NEXT: sunpkhi z1.s, z1.h
; CHECK-SME-NEXT: sxtb z2.h, p0/m, z2.h
; CHECK-SME-NEXT: ptrue p0.d
; CHECK-SME-NEXT: sunpklo z5.d, z3.s
; CHECK-SME-NEXT: sunpkhi z3.d, z3.s
; CHECK-SME-NEXT: sunpklo z4.s, z2.h
; CHECK-SME-NEXT: sunpkhi z2.s, z2.h
; CHECK-SME-NEXT: sunpklo z6.d, z4.s
; CHECK-SME-NEXT: sunpkhi z4.d, z4.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: sunpklo z5.d, z1.s
; CHECK-SME-NEXT: sunpklo z6.d, z2.s
; CHECK-SME-NEXT: sunpkhi z1.d, z1.s
; CHECK-SME-NEXT: sunpkhi z2.d, z2.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b) {
; CHECK-SVE2-LABEL: usdot_different_types:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: ptrue p0.h
; CHECK-SVE2-NEXT: uunpklo z3.s, z1.h
; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: sxtb z2.h, p0/m, z2.h
; CHECK-SVE2-NEXT: ptrue p0.d
; CHECK-SVE2-NEXT: uunpklo z5.d, z3.s
; CHECK-SVE2-NEXT: uunpkhi z3.d, z3.s
; CHECK-SVE2-NEXT: sunpklo z4.s, z2.h
; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: sunpklo z6.d, z4.s
; CHECK-SVE2-NEXT: sunpkhi z4.d, z4.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: uunpklo z5.d, z1.s
; CHECK-SVE2-NEXT: sunpklo z6.d, z2.s
; CHECK-SVE2-NEXT: uunpkhi z1.d, z1.s
; CHECK-SVE2-NEXT: sunpkhi z2.d, z2.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: usdot_different_types:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: ptrue p0.h
; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z1.h
; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE2-I8MM-NEXT: sxtb z2.h, p0/m, z2.h
; CHECK-SVE2-I8MM-NEXT: ptrue p0.d
; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z3.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z3.d, z3.s
; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z2.h
; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z4.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z4.d, z4.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: uunpklo z5.d, z1.s
; CHECK-SVE2-I8MM-NEXT: sunpklo z6.d, z2.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.d, z1.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.d, z2.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: usdot_different_types:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: ptrue p0.h
; CHECK-SME-NEXT: uunpklo z3.s, z1.h
; CHECK-SME-NEXT: uunpkhi z1.s, z1.h
; CHECK-SME-NEXT: sxtb z2.h, p0/m, z2.h
; CHECK-SME-NEXT: ptrue p0.d
; CHECK-SME-NEXT: uunpklo z5.d, z3.s
; CHECK-SME-NEXT: uunpkhi z3.d, z3.s
; CHECK-SME-NEXT: sunpklo z4.s, z2.h
; CHECK-SME-NEXT: sunpkhi z2.s, z2.h
; CHECK-SME-NEXT: sunpklo z6.d, z4.s
; CHECK-SME-NEXT: sunpkhi z4.d, z4.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: uunpklo z5.d, z1.s
; CHECK-SME-NEXT: sunpklo z6.d, z2.s
; CHECK-SME-NEXT: uunpkhi z1.d, z1.s
; CHECK-SME-NEXT: sunpkhi z2.d, z2.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b) {
; CHECK-SVE2-LABEL: sudot_different_types:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-NEXT: sunpklo z3.s, z1.h
; CHECK-SVE2-NEXT: ptrue p0.d
; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: uunpklo z4.s, z2.h
; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: sunpklo z5.d, z3.s
; CHECK-SVE2-NEXT: sunpkhi z3.d, z3.s
; CHECK-SVE2-NEXT: uunpklo z6.d, z4.s
; CHECK-SVE2-NEXT: uunpkhi z4.d, z4.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: sunpklo z5.d, z1.s
; CHECK-SVE2-NEXT: uunpklo z6.d, z2.s
; CHECK-SVE2-NEXT: sunpkhi z1.d, z1.s
; CHECK-SVE2-NEXT: uunpkhi z2.d, z2.s
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sudot_different_types:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z1.h
; CHECK-SVE2-I8MM-NEXT: ptrue p0.d
; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z2.h
; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z3.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z3.d, z3.s
; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z4.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z4.d, z4.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: sunpklo z5.d, z1.s
; CHECK-SVE2-I8MM-NEXT: uunpklo z6.d, z2.s
; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.d, z1.s
; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.d, z2.s
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SVE2-I8MM-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sudot_different_types:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: and z2.h, z2.h, #0xff
; CHECK-SME-NEXT: sunpklo z3.s, z1.h
; CHECK-SME-NEXT: ptrue p0.d
; CHECK-SME-NEXT: sunpkhi z1.s, z1.h
; CHECK-SME-NEXT: uunpklo z4.s, z2.h
; CHECK-SME-NEXT: uunpkhi z2.s, z2.h
; CHECK-SME-NEXT: sunpklo z5.d, z3.s
; CHECK-SME-NEXT: sunpkhi z3.d, z3.s
; CHECK-SME-NEXT: uunpklo z6.d, z4.s
; CHECK-SME-NEXT: uunpkhi z4.d, z4.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: sunpklo z5.d, z1.s
; CHECK-SME-NEXT: uunpklo z6.d, z2.s
; CHECK-SME-NEXT: sunpkhi z1.d, z1.s
; CHECK-SME-NEXT: uunpkhi z2.d, z2.s
; CHECK-SME-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-SME-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
%b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 2 x i64> %partial.reduce
}
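
; The illegal nxv2i16 result type is promoted; the i8 operands are
; re-extended in the i16 domain and a wide dot product is still formed.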
define <vscale x 2 x i16> @udot_nxv8i8_promote(<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-SVE2-LABEL: udot_nxv8i8_promote:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-NEXT: and z1.h, z1.h, #0xff
; CHECK-SVE2-NEXT: udot z0.d, z1.h, z2.h
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot_nxv8i8_promote:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-I8MM-NEXT: and z1.h, z1.h, #0xff
; CHECK-SVE2-I8MM-NEXT: udot z0.d, z1.h, z2.h
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot_nxv8i8_promote:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: and z2.h, z2.h, #0xff
; CHECK-SME-NEXT: and z1.h, z1.h, #0xff
; CHECK-SME-NEXT: udot z0.d, z1.h, z2.h
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
%b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i16>
%mult = mul nuw nsw <vscale x 8 x i16> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i16> @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16(<vscale x 2 x i16> %acc, <vscale x 8 x i16> %mult)
ret <vscale x 2 x i16> %partial.reduce
}

define <vscale x 2 x i16> @sdot_nxv8i8_promote(<vscale x 2 x i16> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-SVE2-LABEL: sdot_nxv8i8_promote:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: ptrue p0.h
; CHECK-SVE2-NEXT: sxtb z2.h, p0/m, z2.h
; CHECK-SVE2-NEXT: sxtb z1.h, p0/m, z1.h
; CHECK-SVE2-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot_nxv8i8_promote:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: ptrue p0.h
; CHECK-SVE2-I8MM-NEXT: sxtb z2.h, p0/m, z2.h
; CHECK-SVE2-I8MM-NEXT: sxtb z1.h, p0/m, z1.h
; CHECK-SVE2-I8MM-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot_nxv8i8_promote:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: ptrue p0.h
; CHECK-SME-NEXT: sxtb z2.h, p0/m, z2.h
; CHECK-SME-NEXT: sxtb z1.h, p0/m, z1.h
; CHECK-SME-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>
%b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i16>
%mult = mul nuw nsw <vscale x 8 x i16> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 2 x i16> @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16(<vscale x 2 x i16> %acc, <vscale x 8 x i16> %mult)
ret <vscale x 2 x i16> %partial.reduce
}
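
; Only the nxv4i64 accumulator needs splitting; the reduction still maps onto
; a single wide UDOT into the first half of the split accumulator.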
define <vscale x 4 x i64> @partial_reduce_only_split_acc(<vscale x 4 x i64> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-SVE2-LABEL: partial_reduce_only_split_acc:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: and z3.h, z3.h, #0xff
; CHECK-SVE2-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-NEXT: udot z0.d, z2.h, z3.h
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: partial_reduce_only_split_acc:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: and z3.h, z3.h, #0xff
; CHECK-SVE2-I8MM-NEXT: and z2.h, z2.h, #0xff
; CHECK-SVE2-I8MM-NEXT: udot z0.d, z2.h, z3.h
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: partial_reduce_only_split_acc:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: and z3.h, z3.h, #0xff
; CHECK-SME-NEXT: and z2.h, z2.h, #0xff
; CHECK-SME-NEXT: udot z0.d, z2.h, z3.h
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i64>
%b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
%mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(
<vscale x 4 x i64> %acc, <vscale x 8 x i64> %mult)
ret <vscale x 4 x i64> %partial.reduce
}
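
; Constant splat multiplicands are materialized in a register when the value
; fits the (extended) element type: -1 for sdot, 255 for udot. Values that do
; not fit, such as 256, fall back to shifts and adds.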
define <vscale x 4 x i32> @sdot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
; CHECK-SVE2-LABEL: sdot_imm:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: mov z2.b, #-1 // =0xffffffffffffffff
; CHECK-SVE2-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot_imm:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: mov z2.b, #-1 // =0xffffffffffffffff
; CHECK-SVE2-I8MM-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot_imm:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: mov z2.b, #-1 // =0xffffffffffffffff
; CHECK-SME-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 -1)
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @sdot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
; CHECK-SVE2-LABEL: sdot_imm_does_not_fit:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: sunpklo z2.h, z1.b
; CHECK-SVE2-NEXT: sunpkhi z1.h, z1.b
; CHECK-SVE2-NEXT: sunpklo z3.s, z2.h
; CHECK-SVE2-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: sunpklo z4.s, z1.h
; CHECK-SVE2-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: lsl z4.s, z4.s, #8
; CHECK-SVE2-NEXT: lsl z2.s, z2.s, #8
; CHECK-SVE2-NEXT: lsl z3.s, z3.s, #8
; CHECK-SVE2-NEXT: lsl z1.s, z1.s, #8
; CHECK-SVE2-NEXT: add z0.s, z0.s, z3.s
; CHECK-SVE2-NEXT: add z2.s, z2.s, z4.s
; CHECK-SVE2-NEXT: add z0.s, z0.s, z2.s
; CHECK-SVE2-NEXT: add z0.s, z0.s, z1.s
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: sdot_imm_does_not_fit:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: sunpklo z2.h, z1.b
; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.h, z1.b
; CHECK-SVE2-I8MM-NEXT: sunpklo z3.s, z2.h
; CHECK-SVE2-I8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-SVE2-I8MM-NEXT: sunpklo z4.s, z1.h
; CHECK-SVE2-I8MM-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE2-I8MM-NEXT: lsl z4.s, z4.s, #8
; CHECK-SVE2-I8MM-NEXT: lsl z2.s, z2.s, #8
; CHECK-SVE2-I8MM-NEXT: lsl z3.s, z3.s, #8
; CHECK-SVE2-I8MM-NEXT: lsl z1.s, z1.s, #8
; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z3.s
; CHECK-SVE2-I8MM-NEXT: add z2.s, z2.s, z4.s
; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z2.s
; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z1.s
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: sdot_imm_does_not_fit:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: sunpklo z2.h, z1.b
; CHECK-SME-NEXT: sunpkhi z1.h, z1.b
; CHECK-SME-NEXT: sunpklo z3.s, z2.h
; CHECK-SME-NEXT: sunpkhi z2.s, z2.h
; CHECK-SME-NEXT: sunpklo z4.s, z1.h
; CHECK-SME-NEXT: sunpkhi z1.s, z1.h
; CHECK-SME-NEXT: lsl z4.s, z4.s, #8
; CHECK-SME-NEXT: lsl z2.s, z2.s, #8
; CHECK-SME-NEXT: lsl z3.s, z3.s, #8
; CHECK-SME-NEXT: lsl z1.s, z1.s, #8
; CHECK-SME-NEXT: add z0.s, z0.s, z3.s
; CHECK-SME-NEXT: add z2.s, z2.s, z4.s
; CHECK-SME-NEXT: add z0.s, z0.s, z2.s
; CHECK-SME-NEXT: add z0.s, z0.s, z1.s
; CHECK-SME-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256)
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @udot_imm(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
; CHECK-SVE2-LABEL: udot_imm:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: mov z2.b, #-1 // =0xffffffffffffffff
; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot_imm:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: mov z2.b, #-1 // =0xffffffffffffffff
; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot_imm:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: mov z2.b, #-1 // =0xffffffffffffffff
; CHECK-SME-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 255)
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @udot_imm_does_not_fit(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a) {
; CHECK-SVE2-LABEL: udot_imm_does_not_fit:
; CHECK-SVE2: // %bb.0: // %entry
; CHECK-SVE2-NEXT: uunpklo z2.h, z1.b
; CHECK-SVE2-NEXT: uunpkhi z1.h, z1.b
; CHECK-SVE2-NEXT: uunpklo z3.s, z2.h
; CHECK-SVE2-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-NEXT: uunpklo z4.s, z1.h
; CHECK-SVE2-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE2-NEXT: lsl z4.s, z4.s, #8
; CHECK-SVE2-NEXT: lsl z2.s, z2.s, #8
; CHECK-SVE2-NEXT: lsl z3.s, z3.s, #8
; CHECK-SVE2-NEXT: lsl z1.s, z1.s, #8
; CHECK-SVE2-NEXT: add z0.s, z0.s, z3.s
; CHECK-SVE2-NEXT: add z2.s, z2.s, z4.s
; CHECK-SVE2-NEXT: add z0.s, z0.s, z2.s
; CHECK-SVE2-NEXT: add z0.s, z0.s, z1.s
; CHECK-SVE2-NEXT: ret
;
; CHECK-SVE2-I8MM-LABEL: udot_imm_does_not_fit:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
; CHECK-SVE2-I8MM-NEXT: uunpklo z2.h, z1.b
; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.h, z1.b
; CHECK-SVE2-I8MM-NEXT: uunpklo z3.s, z2.h
; CHECK-SVE2-I8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-SVE2-I8MM-NEXT: uunpklo z4.s, z1.h
; CHECK-SVE2-I8MM-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE2-I8MM-NEXT: lsl z4.s, z4.s, #8
; CHECK-SVE2-I8MM-NEXT: lsl z2.s, z2.s, #8
; CHECK-SVE2-I8MM-NEXT: lsl z3.s, z3.s, #8
; CHECK-SVE2-I8MM-NEXT: lsl z1.s, z1.s, #8
; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z3.s
; CHECK-SVE2-I8MM-NEXT: add z2.s, z2.s, z4.s
; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z2.s
; CHECK-SVE2-I8MM-NEXT: add z0.s, z0.s, z1.s
; CHECK-SVE2-I8MM-NEXT: ret
;
; CHECK-SME-LABEL: udot_imm_does_not_fit:
; CHECK-SME: // %bb.0: // %entry
; CHECK-SME-NEXT: uunpklo z2.h, z1.b
; CHECK-SME-NEXT: uunpkhi z1.h, z1.b
; CHECK-SME-NEXT: uunpklo z3.s, z2.h
; CHECK-SME-NEXT: uunpkhi z2.s, z2.h
; CHECK-SME-NEXT: uunpklo z4.s, z1.h
; CHECK-SME-NEXT: uunpkhi z1.s, z1.h
; CHECK-SME-NEXT: lsl z4.s, z4.s, #8
; CHECK-SME-NEXT: lsl z2.s, z2.s, #8
; CHECK-SME-NEXT: lsl z3.s, z3.s, #8
; CHECK-SME-NEXT: lsl z1.s, z1.s, #8
; CHECK-SME-NEXT: add z0.s, z0.s, z3.s
; CHECK-SME-NEXT: add z2.s, z2.s, z4.s
; CHECK-SME-NEXT: add z0.s, z0.s, z2.s
; CHECK-SME-NEXT: add z0.s, z0.s, z1.s
; CHECK-SME-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%mult = mul nuw nsw <vscale x 16 x i32> %a.wide, splat(i32 256)
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
ret <vscale x 4 x i32> %partial.reduce
}